1 /* 2 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2012, 2023 SAP SE. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.inline.hpp" 28 #include "code/compiledIC.hpp" 29 #include "compiler/disassembler.hpp" 30 #include "gc/shared/collectedHeap.inline.hpp" 31 #include "gc/shared/barrierSet.hpp" 32 #include "gc/shared/barrierSetAssembler.hpp" 33 #include "interpreter/interpreter.hpp" 34 #include "memory/resourceArea.hpp" 35 #include "nativeInst_ppc.hpp" 36 #include "oops/compressedKlass.inline.hpp" 37 #include "oops/compressedOops.inline.hpp" 38 #include "oops/klass.inline.hpp" 39 #include "oops/methodData.hpp" 40 #include "prims/methodHandles.hpp" 41 #include "register_ppc.hpp" 42 #include "runtime/icache.hpp" 43 #include "runtime/interfaceSupport.inline.hpp" 44 #include "runtime/objectMonitor.hpp" 45 #include "runtime/os.hpp" 46 #include "runtime/safepoint.hpp" 47 #include "runtime/safepointMechanism.hpp" 48 #include "runtime/sharedRuntime.hpp" 49 #include "runtime/stubRoutines.hpp" 50 #include "runtime/vm_version.hpp" 51 #include "utilities/macros.hpp" 52 #include "utilities/powerOfTwo.hpp" 53 54 #ifdef PRODUCT 55 #define BLOCK_COMMENT(str) // nothing 56 #else 57 #define BLOCK_COMMENT(str) block_comment(str) 58 #endif 59 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 60 61 #ifdef ASSERT 62 // On RISC, there's no benefit to verifying instruction boundaries. 63 bool AbstractAssembler::pd_check_instruction_mark() { return false; } 64 #endif 65 66 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) { 67 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range"); 68 if (Assembler::is_simm(si31, 16)) { 69 ld(d, si31, a); 70 if (emit_filler_nop) nop(); 71 } else { 72 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31); 73 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31); 74 addis(d, a, hi); 75 ld(d, lo, d); 76 } 77 } 78 79 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) { 80 assert_different_registers(d, a); 81 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop); 82 } 83 84 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base, 85 size_t size_in_bytes, bool is_signed) { 86 switch (size_in_bytes) { 87 case 8: ld(dst, offs, base); break; 88 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break; 89 case 2: is_signed ? 
lha(dst, offs, base) : lhz(dst, offs, base); break; 90 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :( 91 default: ShouldNotReachHere(); 92 } 93 } 94 95 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base, 96 size_t size_in_bytes) { 97 switch (size_in_bytes) { 98 case 8: std(dst, offs, base); break; 99 case 4: stw(dst, offs, base); break; 100 case 2: sth(dst, offs, base); break; 101 case 1: stb(dst, offs, base); break; 102 default: ShouldNotReachHere(); 103 } 104 } 105 106 void MacroAssembler::align(int modulus, int max, int rem) { 107 int padding = (rem + modulus - (offset() % modulus)) % modulus; 108 if (padding > max) return; 109 for (int c = (padding >> 2); c > 0; --c) { nop(); } 110 } 111 112 void MacroAssembler::align_prefix() { 113 if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); } 114 } 115 116 // Issue instructions that calculate given TOC from global TOC. 117 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16, 118 bool add_relocation, bool emit_dummy_addr) { 119 int offset = -1; 120 if (emit_dummy_addr) { 121 offset = -128; // dummy address 122 } else if (addr != (address)(intptr_t)-1) { 123 offset = MacroAssembler::offset_to_global_toc(addr); 124 } 125 126 if (hi16) { 127 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset)); 128 } 129 if (lo16) { 130 if (add_relocation) { 131 // Relocate at the addi to avoid confusion with a load from the method's TOC. 132 relocate(internal_word_Relocation::spec(addr)); 133 } 134 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset)); 135 } 136 } 137 138 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) { 139 const int offset = MacroAssembler::offset_to_global_toc(addr); 140 141 const address inst2_addr = a; 142 const int inst2 = *(int *)inst2_addr; 143 144 // The relocation points to the second instruction, the addi, 145 // and the addi reads and writes the same register dst. 146 const int dst = inv_rt_field(inst2); 147 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 148 149 // Now, find the preceding addis which writes to dst. 150 int inst1 = 0; 151 address inst1_addr = inst2_addr - BytesPerInstWord; 152 while (inst1_addr >= bound) { 153 inst1 = *(int *) inst1_addr; 154 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 155 // Stop, found the addis which writes dst. 156 break; 157 } 158 inst1_addr -= BytesPerInstWord; 159 } 160 161 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 162 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset)); 163 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset)); 164 return inst1_addr; 165 } 166 167 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) { 168 const address inst2_addr = a; 169 const int inst2 = *(int *)inst2_addr; 170 171 // The relocation points to the second instruction, the addi, 172 // and the addi reads and writes the same register dst. 173 const int dst = inv_rt_field(inst2); 174 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 175 176 // Now, find the preceding addis which writes to dst. 
177 int inst1 = 0; 178 address inst1_addr = inst2_addr - BytesPerInstWord; 179 while (inst1_addr >= bound) { 180 inst1 = *(int *) inst1_addr; 181 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 182 // stop, found the addis which writes dst 183 break; 184 } 185 inst1_addr -= BytesPerInstWord; 186 } 187 188 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 189 190 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0); 191 // -1 is a special case 192 if (offset == -1) { 193 return (address)(intptr_t)-1; 194 } else { 195 return global_toc() + offset; 196 } 197 } 198 199 #ifdef _LP64 200 // Patch compressed oops or klass constants. 201 // Assembler sequence is 202 // 1) compressed oops: 203 // lis rx = const.hi 204 // ori rx = rx | const.lo 205 // 2) compressed klass: 206 // lis rx = const.hi 207 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional 208 // ori rx = rx | const.lo 209 // Clrldi will be passed by. 210 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) { 211 assert(UseCompressedOops, "Should only patch compressed oops"); 212 213 const address inst2_addr = a; 214 const int inst2 = *(int *)inst2_addr; 215 216 // The relocation points to the second instruction, the ori, 217 // and the ori reads and writes the same register dst. 218 const int dst = inv_rta_field(inst2); 219 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 220 // Now, find the preceding addis which writes to dst. 221 int inst1 = 0; 222 address inst1_addr = inst2_addr - BytesPerInstWord; 223 bool inst1_found = false; 224 while (inst1_addr >= bound) { 225 inst1 = *(int *)inst1_addr; 226 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; } 227 inst1_addr -= BytesPerInstWord; 228 } 229 assert(inst1_found, "inst is not lis"); 230 231 uint32_t data_value = CompressedOops::narrow_oop_value(data); 232 int xc = (data_value >> 16) & 0xffff; 233 int xd = (data_value >> 0) & 0xffff; 234 235 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo 236 set_imm((int *)inst2_addr, (xd)); // unsigned int 237 return inst1_addr; 238 } 239 240 // Get compressed oop constant. 241 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) { 242 assert(UseCompressedOops, "Should only patch compressed oops"); 243 244 const address inst2_addr = a; 245 const int inst2 = *(int *)inst2_addr; 246 247 // The relocation points to the second instruction, the ori, 248 // and the ori reads and writes the same register dst. 249 const int dst = inv_rta_field(inst2); 250 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 251 // Now, find the preceding lis which writes to dst. 252 int inst1 = 0; 253 address inst1_addr = inst2_addr - BytesPerInstWord; 254 bool inst1_found = false; 255 256 while (inst1_addr >= bound) { 257 inst1 = *(int *) inst1_addr; 258 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;} 259 inst1_addr -= BytesPerInstWord; 260 } 261 assert(inst1_found, "inst is not lis"); 262 263 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff)); 264 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16); 265 266 return CompressedOops::narrow_oop_cast(xl | xh); 267 } 268 #endif // _LP64 269 270 // Returns true if successful. 
271 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, 272 Register toc, bool fixed_size) { 273 int toc_offset = 0; 274 // Use RelocationHolder::none for the constant pool entry, otherwise 275 // we will end up with a failing NativeCall::verify(x) where x is 276 // the address of the constant pool entry. 277 // FIXME: We should insert relocation information for oops at the constant 278 // pool entries instead of inserting it at the loads; patching of a constant 279 // pool entry should be less expensive. 280 address const_address = address_constant((address)a.value(), RelocationHolder::none); 281 if (const_address == nullptr) { return false; } // allocation failure 282 // Relocate at the pc of the load. 283 relocate(a.rspec()); 284 toc_offset = (int)(const_address - code()->consts()->start()); 285 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size); 286 return true; 287 } 288 289 bool MacroAssembler::is_load_const_from_method_toc_at(address a) { 290 const address inst1_addr = a; 291 const int inst1 = *(int *)inst1_addr; 292 293 // The relocation points to the ld or the addis. 294 return (is_ld(inst1)) || 295 (is_addis(inst1) && inv_ra_field(inst1) != 0); 296 } 297 298 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) { 299 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc"); 300 301 const address inst1_addr = a; 302 const int inst1 = *(int *)inst1_addr; 303 304 if (is_ld(inst1)) { 305 return inv_d1_field(inst1); 306 } else if (is_addis(inst1)) { 307 const int dst = inv_rt_field(inst1); 308 309 // Now, find the succeeding ld which reads and writes to dst. 310 address inst2_addr = inst1_addr + BytesPerInstWord; 311 int inst2 = 0; 312 while (true) { 313 inst2 = *(int *) inst2_addr; 314 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) { 315 // Stop, found the ld which reads and writes dst. 316 break; 317 } 318 inst2_addr += BytesPerInstWord; 319 } 320 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2); 321 } 322 ShouldNotReachHere(); 323 return 0; 324 } 325 326 // Get the constant from a `load_const' sequence. 327 long MacroAssembler::get_const(address a) { 328 assert(is_load_const_at(a), "not a load of a constant"); 329 const int *p = (const int*) a; 330 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48); 331 if (is_ori(*(p+1))) { 332 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32); 333 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16); 334 x |= (((unsigned long) (get_imm(a,4) & 0xffff))); 335 } else if (is_lis(*(p+1))) { 336 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32); 337 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16); 338 x |= (((unsigned long) (get_imm(a,3) & 0xffff))); 339 } else { 340 ShouldNotReachHere(); 341 return (long) 0; 342 } 343 return (long) x; 344 } 345 346 // Patch the 64 bit constant of a `load_const' sequence. This is a low 347 // level procedure. It neither flushes the instruction cache nor is it 348 // mt safe. 
349 void MacroAssembler::patch_const(address a, long x) { 350 assert(is_load_const_at(a), "not a load of a constant"); 351 int *p = (int*) a; 352 if (is_ori(*(p+1))) { 353 set_imm(0 + p, (x >> 48) & 0xffff); 354 set_imm(1 + p, (x >> 32) & 0xffff); 355 set_imm(3 + p, (x >> 16) & 0xffff); 356 set_imm(4 + p, x & 0xffff); 357 } else if (is_lis(*(p+1))) { 358 set_imm(0 + p, (x >> 48) & 0xffff); 359 set_imm(2 + p, (x >> 32) & 0xffff); 360 set_imm(1 + p, (x >> 16) & 0xffff); 361 set_imm(3 + p, x & 0xffff); 362 } else { 363 ShouldNotReachHere(); 364 } 365 } 366 367 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 368 assert(oop_recorder() != nullptr, "this assembler needs a Recorder"); 369 int index = oop_recorder()->allocate_metadata_index(obj); 370 RelocationHolder rspec = metadata_Relocation::spec(index); 371 return AddressLiteral((address)obj, rspec); 372 } 373 374 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 375 assert(oop_recorder() != nullptr, "this assembler needs a Recorder"); 376 int index = oop_recorder()->find_index(obj); 377 RelocationHolder rspec = metadata_Relocation::spec(index); 378 return AddressLiteral((address)obj, rspec); 379 } 380 381 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 382 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 383 int oop_index = oop_recorder()->allocate_oop_index(obj); 384 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 385 } 386 387 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 388 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 389 int oop_index = oop_recorder()->find_index(obj); 390 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 391 } 392 393 #ifndef PRODUCT 394 void MacroAssembler::pd_print_patched_instruction(address branch) { 395 Unimplemented(); // TODO: PPC port 396 } 397 #endif // ndef PRODUCT 398 399 // Conditional far branch for destinations encodable in 24+2 bits. 400 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 401 402 // If requested by flag optimize, relocate the bc_far as a 403 // runtime_call and prepare for optimizing it when the code gets 404 // relocated. 405 if (optimize == bc_far_optimize_on_relocate) { 406 relocate(relocInfo::runtime_call_type); 407 } 408 409 // variant 2: 410 // 411 // b!cxx SKIP 412 // bxx DEST 413 // SKIP: 414 // 415 416 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 417 opposite_bcond(inv_boint_bcond(boint))); 418 419 // We emit two branches. 420 // First, a conditional branch which jumps around the far branch. 421 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 422 const address bc_pc = pc(); 423 bc(opposite_boint, biint, not_taken_pc); 424 425 const int bc_instr = *(int*)bc_pc; 426 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 427 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 428 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 429 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 430 "postcondition"); 431 assert(biint == inv_bi_field(bc_instr), "postcondition"); 432 433 // Second, an unconditional far branch which jumps to dest. 
434 // Note: target(dest) remembers the current pc (see CodeSection::target) 435 // and returns the current pc if the label is not bound yet; when 436 // the label gets bound, the unconditional far branch will be patched. 437 const address target_pc = target(dest); 438 const address b_pc = pc(); 439 b(target_pc); 440 441 assert(not_taken_pc == pc(), "postcondition"); 442 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 443 } 444 445 // 1 or 2 instructions 446 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) { 447 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) { 448 bc(boint, biint, dest); 449 } else { 450 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate); 451 } 452 } 453 454 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 455 return is_bc_far_variant1_at(instruction_addr) || 456 is_bc_far_variant2_at(instruction_addr) || 457 is_bc_far_variant3_at(instruction_addr); 458 } 459 460 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 461 if (is_bc_far_variant1_at(instruction_addr)) { 462 const address instruction_1_addr = instruction_addr; 463 const int instruction_1 = *(int*)instruction_1_addr; 464 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 465 } else if (is_bc_far_variant2_at(instruction_addr)) { 466 const address instruction_2_addr = instruction_addr + 4; 467 return bxx_destination(instruction_2_addr); 468 } else if (is_bc_far_variant3_at(instruction_addr)) { 469 return instruction_addr + 8; 470 } 471 // variant 4 ??? 472 ShouldNotReachHere(); 473 return nullptr; 474 } 475 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 476 477 if (is_bc_far_variant3_at(instruction_addr)) { 478 // variant 3, far cond branch to the next instruction, already patched to nops: 479 // 480 // nop 481 // endgroup 482 // SKIP/DEST: 483 // 484 return; 485 } 486 487 // first, extract boint and biint from the current branch 488 int boint = 0; 489 int biint = 0; 490 491 ResourceMark rm; 492 const int code_size = 2 * BytesPerInstWord; 493 CodeBuffer buf(instruction_addr, code_size); 494 MacroAssembler masm(&buf); 495 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 496 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 497 masm.nop(); 498 masm.endgroup(); 499 } else { 500 if (is_bc_far_variant1_at(instruction_addr)) { 501 // variant 1, the 1st instruction contains the destination address: 502 // 503 // bcxx DEST 504 // nop 505 // 506 const int instruction_1 = *(int*)(instruction_addr); 507 boint = inv_bo_field(instruction_1); 508 biint = inv_bi_field(instruction_1); 509 } else if (is_bc_far_variant2_at(instruction_addr)) { 510 // variant 2, the 2nd instruction contains the destination address: 511 // 512 // b!cxx SKIP 513 // bxx DEST 514 // SKIP: 515 // 516 const int instruction_1 = *(int*)(instruction_addr); 517 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 518 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 519 biint = inv_bi_field(instruction_1); 520 } else { 521 // variant 4??? 522 ShouldNotReachHere(); 523 } 524 525 // second, set the new branch destination and optimize the code 526 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 
527 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 528 // variant 1: 529 // 530 // bcxx DEST 531 // nop 532 // 533 masm.bc(boint, biint, dest); 534 masm.nop(); 535 } else { 536 // variant 2: 537 // 538 // b!cxx SKIP 539 // bxx DEST 540 // SKIP: 541 // 542 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 543 opposite_bcond(inv_boint_bcond(boint))); 544 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 545 masm.bc(opposite_boint, biint, not_taken_pc); 546 masm.b(dest); 547 } 548 } 549 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 550 } 551 552 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 553 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 554 // get current pc 555 uint64_t start_pc = (uint64_t) pc(); 556 557 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 558 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 559 560 // relocate here 561 if (rt != relocInfo::none) { 562 relocate(rt); 563 } 564 565 if ( ReoptimizeCallSequences && 566 (( link && is_within_range_of_b(dest, pc_of_bl)) || 567 (!link && is_within_range_of_b(dest, pc_of_b)))) { 568 // variant 2: 569 // Emit an optimized, pc-relative call/jump. 570 571 if (link) { 572 // some padding 573 nop(); 574 nop(); 575 nop(); 576 nop(); 577 nop(); 578 nop(); 579 580 // do the call 581 assert(pc() == pc_of_bl, "just checking"); 582 bl(dest, relocInfo::none); 583 } else { 584 // do the jump 585 assert(pc() == pc_of_b, "just checking"); 586 b(dest, relocInfo::none); 587 588 // some padding 589 nop(); 590 nop(); 591 nop(); 592 nop(); 593 nop(); 594 nop(); 595 } 596 597 // Assert that we can identify the emitted call/jump. 598 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 599 "can't identify emitted call"); 600 } else { 601 // variant 1: 602 mr(R0, R11); // spill R11 -> R0. 603 604 // Load the destination address into CTR, 605 // calculate destination relative to global toc. 606 calculate_address_from_global_toc(R11, dest, true, true, false); 607 608 mtctr(R11); 609 mr(R11, R0); // spill R11 <- R0. 610 nop(); 611 612 // do the call/jump 613 if (link) { 614 bctrl(); 615 } else{ 616 bctr(); 617 } 618 // Assert that we can identify the emitted call/jump. 619 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 620 "can't identify emitted call"); 621 } 622 623 // Assert that we can identify the emitted call/jump. 624 assert(is_bxx64_patchable_at((address)start_pc, link), 625 "can't identify emitted call"); 626 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 627 "wrong encoding of dest address"); 628 } 629 630 // Identify a bxx64_patchable instruction. 631 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 632 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 633 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 634 || is_bxx64_patchable_variant2_at(instruction_addr, link); 635 } 636 637 // Does the call64_patchable instruction use a pc-relative encoding of 638 // the call destination? 639 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 640 // variant 2 is pc-relative 641 return is_bxx64_patchable_variant2_at(instruction_addr, link); 642 } 643 644 // Identify variant 1. 
645 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 646 unsigned int* instr = (unsigned int*) instruction_addr; 647 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 648 && is_mtctr(instr[5]) // mtctr 649 && is_load_const_at(instruction_addr); 650 } 651 652 // Identify variant 1b: load destination relative to global toc. 653 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 654 unsigned int* instr = (unsigned int*) instruction_addr; 655 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 656 && is_mtctr(instr[3]) // mtctr 657 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 658 } 659 660 // Identify variant 2. 661 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 662 unsigned int* instr = (unsigned int*) instruction_addr; 663 if (link) { 664 return is_bl (instr[6]) // bl dest is last 665 && is_nop(instr[0]) // nop 666 && is_nop(instr[1]) // nop 667 && is_nop(instr[2]) // nop 668 && is_nop(instr[3]) // nop 669 && is_nop(instr[4]) // nop 670 && is_nop(instr[5]); // nop 671 } else { 672 return is_b (instr[0]) // b dest is first 673 && is_nop(instr[1]) // nop 674 && is_nop(instr[2]) // nop 675 && is_nop(instr[3]) // nop 676 && is_nop(instr[4]) // nop 677 && is_nop(instr[5]) // nop 678 && is_nop(instr[6]); // nop 679 } 680 } 681 682 // Set dest address of a bxx64_patchable instruction. 683 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 684 ResourceMark rm; 685 int code_size = MacroAssembler::bxx64_patchable_size; 686 CodeBuffer buf(instruction_addr, code_size); 687 MacroAssembler masm(&buf); 688 masm.bxx64_patchable(dest, relocInfo::none, link); 689 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 690 } 691 692 // Get dest address of a bxx64_patchable instruction. 693 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 694 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 695 return (address) (unsigned long) get_const(instruction_addr); 696 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 697 unsigned int* instr = (unsigned int*) instruction_addr; 698 if (link) { 699 const int instr_idx = 6; // bl is last 700 int branchoffset = branch_destination(instr[instr_idx], 0); 701 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 702 } else { 703 const int instr_idx = 0; // b is first 704 int branchoffset = branch_destination(instr[instr_idx], 0); 705 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 706 } 707 // Load dest relative to global toc. 
708 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 709 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 710 instruction_addr); 711 } else { 712 ShouldNotReachHere(); 713 return nullptr; 714 } 715 } 716 717 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) { 718 const int magic_number = 0x42; 719 720 // Preserve stack pointer register (R1_SP) and system thread id register (R13); 721 // although they're technically volatile 722 for (int i = 2; i < 13; i++) { 723 Register reg = as_Register(i); 724 if (reg == excluded_register) { 725 continue; 726 } 727 728 li(reg, magic_number); 729 } 730 } 731 732 void MacroAssembler::clobber_carg_stack_slots(Register tmp) { 733 const int magic_number = 0x43; 734 735 li(tmp, magic_number); 736 for (int m = 0; m <= 7; m++) { 737 std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP); 738 } 739 } 740 741 // Uses ordering which corresponds to ABI: 742 // _savegpr0_14: std r14,-144(r1) 743 // _savegpr0_15: std r15,-136(r1) 744 // _savegpr0_16: std r16,-128(r1) 745 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 746 std(R14, offset, dst); offset += 8; 747 std(R15, offset, dst); offset += 8; 748 std(R16, offset, dst); offset += 8; 749 std(R17, offset, dst); offset += 8; 750 std(R18, offset, dst); offset += 8; 751 std(R19, offset, dst); offset += 8; 752 std(R20, offset, dst); offset += 8; 753 std(R21, offset, dst); offset += 8; 754 std(R22, offset, dst); offset += 8; 755 std(R23, offset, dst); offset += 8; 756 std(R24, offset, dst); offset += 8; 757 std(R25, offset, dst); offset += 8; 758 std(R26, offset, dst); offset += 8; 759 std(R27, offset, dst); offset += 8; 760 std(R28, offset, dst); offset += 8; 761 std(R29, offset, dst); offset += 8; 762 std(R30, offset, dst); offset += 8; 763 std(R31, offset, dst); offset += 8; 764 765 stfd(F14, offset, dst); offset += 8; 766 stfd(F15, offset, dst); offset += 8; 767 stfd(F16, offset, dst); offset += 8; 768 stfd(F17, offset, dst); offset += 8; 769 stfd(F18, offset, dst); offset += 8; 770 stfd(F19, offset, dst); offset += 8; 771 stfd(F20, offset, dst); offset += 8; 772 stfd(F21, offset, dst); offset += 8; 773 stfd(F22, offset, dst); offset += 8; 774 stfd(F23, offset, dst); offset += 8; 775 stfd(F24, offset, dst); offset += 8; 776 stfd(F25, offset, dst); offset += 8; 777 stfd(F26, offset, dst); offset += 8; 778 stfd(F27, offset, dst); offset += 8; 779 stfd(F28, offset, dst); offset += 8; 780 stfd(F29, offset, dst); offset += 8; 781 stfd(F30, offset, dst); offset += 8; 782 stfd(F31, offset, dst); 783 } 784 785 // Uses ordering which corresponds to ABI: 786 // _restgpr0_14: ld r14,-144(r1) 787 // _restgpr0_15: ld r15,-136(r1) 788 // _restgpr0_16: ld r16,-128(r1) 789 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 790 ld(R14, offset, src); offset += 8; 791 ld(R15, offset, src); offset += 8; 792 ld(R16, offset, src); offset += 8; 793 ld(R17, offset, src); offset += 8; 794 ld(R18, offset, src); offset += 8; 795 ld(R19, offset, src); offset += 8; 796 ld(R20, offset, src); offset += 8; 797 ld(R21, offset, src); offset += 8; 798 ld(R22, offset, src); offset += 8; 799 ld(R23, offset, src); offset += 8; 800 ld(R24, offset, src); offset += 8; 801 ld(R25, offset, src); offset += 8; 802 ld(R26, offset, src); offset += 8; 803 ld(R27, offset, src); offset += 8; 804 ld(R28, offset, src); offset += 8; 805 ld(R29, offset, src); offset += 8; 806 ld(R30, offset, src); offset += 8; 807 ld(R31, offset, 
src); offset += 8; 808 809 // FP registers 810 lfd(F14, offset, src); offset += 8; 811 lfd(F15, offset, src); offset += 8; 812 lfd(F16, offset, src); offset += 8; 813 lfd(F17, offset, src); offset += 8; 814 lfd(F18, offset, src); offset += 8; 815 lfd(F19, offset, src); offset += 8; 816 lfd(F20, offset, src); offset += 8; 817 lfd(F21, offset, src); offset += 8; 818 lfd(F22, offset, src); offset += 8; 819 lfd(F23, offset, src); offset += 8; 820 lfd(F24, offset, src); offset += 8; 821 lfd(F25, offset, src); offset += 8; 822 lfd(F26, offset, src); offset += 8; 823 lfd(F27, offset, src); offset += 8; 824 lfd(F28, offset, src); offset += 8; 825 lfd(F29, offset, src); offset += 8; 826 lfd(F30, offset, src); offset += 8; 827 lfd(F31, offset, src); 828 } 829 830 // For verify_oops. 831 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 832 std(R2, offset, dst); offset += 8; 833 if (include_R3_RET_reg) { 834 std(R3, offset, dst); offset += 8; 835 } 836 std(R4, offset, dst); offset += 8; 837 std(R5, offset, dst); offset += 8; 838 std(R6, offset, dst); offset += 8; 839 std(R7, offset, dst); offset += 8; 840 std(R8, offset, dst); offset += 8; 841 std(R9, offset, dst); offset += 8; 842 std(R10, offset, dst); offset += 8; 843 std(R11, offset, dst); offset += 8; 844 std(R12, offset, dst); offset += 8; 845 846 if (include_fp_regs) { 847 stfd(F0, offset, dst); offset += 8; 848 stfd(F1, offset, dst); offset += 8; 849 stfd(F2, offset, dst); offset += 8; 850 stfd(F3, offset, dst); offset += 8; 851 stfd(F4, offset, dst); offset += 8; 852 stfd(F5, offset, dst); offset += 8; 853 stfd(F6, offset, dst); offset += 8; 854 stfd(F7, offset, dst); offset += 8; 855 stfd(F8, offset, dst); offset += 8; 856 stfd(F9, offset, dst); offset += 8; 857 stfd(F10, offset, dst); offset += 8; 858 stfd(F11, offset, dst); offset += 8; 859 stfd(F12, offset, dst); offset += 8; 860 stfd(F13, offset, dst); 861 } 862 } 863 864 // For verify_oops. 865 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 866 ld(R2, offset, src); offset += 8; 867 if (include_R3_RET_reg) { 868 ld(R3, offset, src); offset += 8; 869 } 870 ld(R4, offset, src); offset += 8; 871 ld(R5, offset, src); offset += 8; 872 ld(R6, offset, src); offset += 8; 873 ld(R7, offset, src); offset += 8; 874 ld(R8, offset, src); offset += 8; 875 ld(R9, offset, src); offset += 8; 876 ld(R10, offset, src); offset += 8; 877 ld(R11, offset, src); offset += 8; 878 ld(R12, offset, src); offset += 8; 879 880 if (include_fp_regs) { 881 lfd(F0, offset, src); offset += 8; 882 lfd(F1, offset, src); offset += 8; 883 lfd(F2, offset, src); offset += 8; 884 lfd(F3, offset, src); offset += 8; 885 lfd(F4, offset, src); offset += 8; 886 lfd(F5, offset, src); offset += 8; 887 lfd(F6, offset, src); offset += 8; 888 lfd(F7, offset, src); offset += 8; 889 lfd(F8, offset, src); offset += 8; 890 lfd(F9, offset, src); offset += 8; 891 lfd(F10, offset, src); offset += 8; 892 lfd(F11, offset, src); offset += 8; 893 lfd(F12, offset, src); offset += 8; 894 lfd(F13, offset, src); 895 } 896 } 897 898 void MacroAssembler::save_LR_CR(Register tmp) { 899 mfcr(tmp); 900 std(tmp, _abi0(cr), R1_SP); 901 mflr(tmp); 902 std(tmp, _abi0(lr), R1_SP); 903 // Tmp must contain lr on exit! 
(see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi0(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi0(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus native_abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
1009 if (and_link) { 1010 bctrl(); 1011 } else { 1012 bctr(); 1013 } 1014 _last_calls_return_pc = pc(); 1015 1016 return _last_calls_return_pc; 1017 } 1018 1019 // Call a C function via a function descriptor and use full C 1020 // calling conventions. Updates and returns _last_calls_return_pc. 1021 address MacroAssembler::call_c(Register r_function_entry) { 1022 return branch_to(r_function_entry, /*and_link=*/true); 1023 } 1024 1025 // For tail calls: only branch, don't link, so callee returns to caller of this function. 1026 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) { 1027 return branch_to(r_function_entry, /*and_link=*/false); 1028 } 1029 1030 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) { 1031 load_const(R12, function_entry, R0); 1032 return branch_to(R12, /*and_link=*/true); 1033 } 1034 1035 #else 1036 // Generic version of a call to C function via a function descriptor 1037 // with variable support for C calling conventions (TOC, ENV, etc.). 1038 // Updates and returns _last_calls_return_pc. 1039 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call, 1040 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) { 1041 // we emit standard ptrgl glue code here 1042 assert((function_descriptor != R0), "function_descriptor cannot be R0"); 1043 1044 // retrieve necessary entries from the function descriptor 1045 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor); 1046 mtctr(R0); 1047 1048 if (load_toc_of_callee) { 1049 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor); 1050 } 1051 if (load_env_of_callee) { 1052 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor); 1053 } else if (load_toc_of_callee) { 1054 li(R11, 0); 1055 } 1056 1057 // do a call or a branch 1058 if (and_link) { 1059 bctrl(); 1060 } else { 1061 bctr(); 1062 } 1063 _last_calls_return_pc = pc(); 1064 1065 return _last_calls_return_pc; 1066 } 1067 1068 // Call a C function via a function descriptor and use full C calling 1069 // conventions. 1070 // We don't use the TOC in generated code, so there is no need to save 1071 // and restore its value. 1072 address MacroAssembler::call_c(Register fd) { 1073 return branch_to(fd, /*and_link=*/true, 1074 /*save toc=*/false, 1075 /*restore toc=*/false, 1076 /*load toc=*/true, 1077 /*load env=*/true); 1078 } 1079 1080 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1081 return branch_to(fd, /*and_link=*/false, 1082 /*save toc=*/false, 1083 /*restore toc=*/false, 1084 /*load toc=*/true, 1085 /*load env=*/true); 1086 } 1087 1088 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1089 if (rt != relocInfo::none) { 1090 // this call needs to be relocatable 1091 if (!ReoptimizeCallSequences 1092 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1093 || fd == nullptr // support code-size estimation 1094 || !fd->is_friend_function() 1095 || fd->entry() == nullptr) { 1096 // it's not a friend function as defined by class FunctionDescriptor, 1097 // so do a full call-c here. 1098 load_const(R11, (address)fd, R0); 1099 1100 bool has_env = (fd != nullptr && fd->env() != nullptr); 1101 return branch_to(R11, /*and_link=*/true, 1102 /*save toc=*/false, 1103 /*restore toc=*/false, 1104 /*load toc=*/true, 1105 /*load env=*/has_env); 1106 } else { 1107 // It's a friend function. 
Load the entry point and don't care about 1108 // toc and env. Use an optimizable call instruction, but ensure the 1109 // same code-size as in the case of a non-friend function. 1110 nop(); 1111 nop(); 1112 nop(); 1113 bl64_patchable(fd->entry(), rt); 1114 _last_calls_return_pc = pc(); 1115 return _last_calls_return_pc; 1116 } 1117 } else { 1118 // This call does not need to be relocatable, do more aggressive 1119 // optimizations. 1120 if (!ReoptimizeCallSequences 1121 || !fd->is_friend_function()) { 1122 // It's not a friend function as defined by class FunctionDescriptor, 1123 // so do a full call-c here. 1124 load_const(R11, (address)fd, R0); 1125 return branch_to(R11, /*and_link=*/true, 1126 /*save toc=*/false, 1127 /*restore toc=*/false, 1128 /*load toc=*/true, 1129 /*load env=*/true); 1130 } else { 1131 // it's a friend function, load the entry point and don't care about 1132 // toc and env. 1133 address dest = fd->entry(); 1134 if (is_within_range_of_b(dest, pc())) { 1135 bl(dest); 1136 } else { 1137 bl64_patchable(dest, rt); 1138 } 1139 _last_calls_return_pc = pc(); 1140 return _last_calls_return_pc; 1141 } 1142 } 1143 } 1144 1145 // Call a C function. All constants needed reside in TOC. 1146 // 1147 // Read the address to call from the TOC. 1148 // Read env from TOC, if fd specifies an env. 1149 // Read new TOC from TOC. 1150 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1151 relocInfo::relocType rt, Register toc) { 1152 if (!ReoptimizeCallSequences 1153 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1154 || !fd->is_friend_function()) { 1155 // It's not a friend function as defined by class FunctionDescriptor, 1156 // so do a full call-c here. 1157 assert(fd->entry() != nullptr, "function must be linked"); 1158 1159 AddressLiteral fd_entry(fd->entry()); 1160 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true); 1161 mtctr(R11); 1162 if (fd->env() == nullptr) { 1163 li(R11, 0); 1164 nop(); 1165 } else { 1166 AddressLiteral fd_env(fd->env()); 1167 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true); 1168 } 1169 AddressLiteral fd_toc(fd->toc()); 1170 // Set R2_TOC (load from toc) 1171 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true); 1172 bctrl(); 1173 _last_calls_return_pc = pc(); 1174 if (!success) { return nullptr; } 1175 } else { 1176 // It's a friend function, load the entry point and don't care about 1177 // toc and env. Use an optimizable call instruction, but ensure the 1178 // same code-size as in the case of a non-friend function. 1179 nop(); 1180 bl64_patchable(fd->entry(), rt); 1181 _last_calls_return_pc = pc(); 1182 } 1183 return _last_calls_return_pc; 1184 } 1185 #endif // ABI_ELFv2 1186 1187 void MacroAssembler::post_call_nop() { 1188 // Make inline again when loom is always enabled. 1189 if (!Continuations::enabled()) { 1190 return; 1191 } 1192 // We use CMPI/CMPLI instructions to encode post call nops. 1193 // Refer to NativePostCallNop for details. 
  relocate(post_call_nop_Relocation::spec());
  InlineSkippedInstructionsCounter skipCounter(this);
  Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
  assert(is_post_call_nop(*(int*)(pc() - 4)), "post call not found");
}

int MacroAssembler::ic_check_size() {
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;

  int num_ins;
  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    num_ins = 3;
    if (use_trap_based_null_check) num_ins += 1;
  } else {
    num_ins = 7;
    if (!implicit_null_checks_available) num_ins += 2;
  }
  return num_ins * BytesPerInstWord;
}

int MacroAssembler::ic_check(int end_alignment) {
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;

  Register receiver = R3_ARG1;
  Register data = R19_inline_cache_reg;
  Register tmp1 = R11_scratch1;
  Register tmp2 = R12_scratch2;

  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
  // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after.
  align(end_alignment, end_alignment, end_alignment - ic_check_size());

  int uep_offset = offset();

  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    // Fast version which uses SIGTRAP

    if (use_trap_based_null_check) {
      trap_null_check(receiver);
    }
    if (UseCompressedClassPointers) {
      lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    } else {
      ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    }
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    trap_ic_miss_check(tmp1, tmp2);

  } else {
    // Slower version which doesn't use SIGTRAP

    // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
    calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
                                      true, true, false); // 2 instructions
    mtctr(tmp1);

    if (!implicit_null_checks_available) {
      cmpdi(CCR0, receiver, 0);
      beqctr(CCR0);
    }
    if (UseCompressedClassPointers) {
      lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    } else {
      ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    }
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    cmpd(CCR0, tmp1, tmp2);
    bnectr(CCR0);
  }

  assert((offset() % end_alignment) == 0, "Misaligned verified entry point");

  return uep_offset;
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
1281 if (!last_java_sp->is_valid()) { 1282 last_java_sp = R1_SP; 1283 } 1284 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1285 1286 // ARG1 must hold thread address. 1287 mr(R3_ARG1, R16_thread); 1288 #if defined(ABI_ELFv2) 1289 address return_pc = call_c(entry_point, relocInfo::none); 1290 #else 1291 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none); 1292 #endif 1293 1294 reset_last_Java_frame(); 1295 1296 // Check for pending exceptions. 1297 if (check_exceptions) { 1298 // We don't check for exceptions here. 1299 ShouldNotReachHere(); 1300 } 1301 1302 // Get oop result if there is one and reset the value in the thread. 1303 if (oop_result->is_valid()) { 1304 get_vm_result(oop_result); 1305 } 1306 1307 _last_calls_return_pc = return_pc; 1308 BLOCK_COMMENT("} call_VM"); 1309 } 1310 1311 void MacroAssembler::call_VM_leaf_base(address entry_point) { 1312 BLOCK_COMMENT("call_VM_leaf {"); 1313 #if defined(ABI_ELFv2) 1314 call_c(entry_point, relocInfo::none); 1315 #else 1316 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none); 1317 #endif 1318 BLOCK_COMMENT("} call_VM_leaf"); 1319 } 1320 1321 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { 1322 call_VM_base(oop_result, noreg, entry_point, check_exceptions); 1323 } 1324 1325 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, 1326 bool check_exceptions) { 1327 // R3_ARG1 is reserved for the thread. 1328 mr_if_needed(R4_ARG2, arg_1); 1329 call_VM(oop_result, entry_point, check_exceptions); 1330 } 1331 1332 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, 1333 bool check_exceptions) { 1334 // R3_ARG1 is reserved for the thread 1335 assert_different_registers(arg_2, R4_ARG2); 1336 mr_if_needed(R4_ARG2, arg_1); 1337 mr_if_needed(R5_ARG3, arg_2); 1338 call_VM(oop_result, entry_point, check_exceptions); 1339 } 1340 1341 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, 1342 bool check_exceptions) { 1343 // R3_ARG1 is reserved for the thread 1344 assert_different_registers(arg_2, R4_ARG2); 1345 assert_different_registers(arg_3, R4_ARG2, R5_ARG3); 1346 mr_if_needed(R4_ARG2, arg_1); 1347 mr_if_needed(R5_ARG3, arg_2); 1348 mr_if_needed(R6_ARG4, arg_3); 1349 call_VM(oop_result, entry_point, check_exceptions); 1350 } 1351 1352 void MacroAssembler::call_VM_leaf(address entry_point) { 1353 call_VM_leaf_base(entry_point); 1354 } 1355 1356 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { 1357 mr_if_needed(R3_ARG1, arg_1); 1358 call_VM_leaf(entry_point); 1359 } 1360 1361 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { 1362 assert_different_registers(arg_2, R3_ARG1); 1363 mr_if_needed(R3_ARG1, arg_1); 1364 mr_if_needed(R4_ARG2, arg_2); 1365 call_VM_leaf(entry_point); 1366 } 1367 1368 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { 1369 assert_different_registers(arg_2, R3_ARG1); 1370 assert_different_registers(arg_3, R3_ARG1, R4_ARG2); 1371 mr_if_needed(R3_ARG1, arg_1); 1372 mr_if_needed(R4_ARG2, arg_2); 1373 mr_if_needed(R5_ARG3, arg_3); 1374 call_VM_leaf(entry_point); 1375 } 1376 1377 // Check whether instruction is a read access to the polling page 1378 // which was emitted by load_from_polling_page(..). 
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != nullptr) {
      *polling_address_ptr = nullptr;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != nullptr) {
    *polling_address_ptr = addr;
  }
  return SafepointMechanism::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be null.
  ShouldNotReachHere();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16-bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//   std   R0,    x(Ry),      (see bang_stack_with_offset())
//   stdu  R1_SP, x(R1_SP),   (see push_frame(), resize_frame())
// or stdux R1_SP, Rx, R1_SP  (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
1457 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) { 1458 #ifdef LINUX 1459 ucontext_t* uc = (ucontext_t*) ucontext; 1460 int rs = inv_rs_field(instruction); 1461 int ra = inv_ra_field(instruction); 1462 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64) 1463 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64) 1464 || (is_stdu(instruction) && rs == 1)) { 1465 int ds = inv_ds_field(instruction); 1466 // return banged address 1467 return ds+(address)uc->uc_mcontext.regs->gpr[ra]; 1468 } else if (is_stdux(instruction) && rs == 1) { 1469 int rb = inv_rb_field(instruction); 1470 address sp = (address)uc->uc_mcontext.regs->gpr[1]; 1471 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb]; 1472 return ra != 1 || rb_val >= 0 ? nullptr // not a stack bang 1473 : sp + rb_val; // banged address 1474 } 1475 return nullptr; // not a stack bang 1476 #else 1477 // workaround not needed on !LINUX :-) 1478 ShouldNotCallThis(); 1479 return nullptr; 1480 #endif 1481 } 1482 1483 void MacroAssembler::reserved_stack_check(Register return_pc) { 1484 // Test if reserved zone needs to be enabled. 1485 Label no_reserved_zone_enabling; 1486 1487 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread); 1488 cmpld(CCR0, R1_SP, R0); 1489 blt_predict_taken(CCR0, no_reserved_zone_enabling); 1490 1491 // Enable reserved zone again, throw stack overflow exception. 1492 push_frame_reg_args(0, R0); 1493 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread); 1494 pop_frame(); 1495 mtlr(return_pc); 1496 load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry()); 1497 mtctr(R0); 1498 bctr(); 1499 1500 should_not_reach_here(); 1501 1502 bind(no_reserved_zone_enabling); 1503 } 1504 1505 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base, 1506 bool cmpxchgx_hint) { 1507 Label retry; 1508 bind(retry); 1509 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1510 stdcx_(exchange_value, addr_base); 1511 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1512 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1513 } else { 1514 bne( CCR0, retry); // StXcx_ sets CCR0. 1515 } 1516 } 1517 1518 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base, 1519 Register tmp, bool cmpxchgx_hint) { 1520 Label retry; 1521 bind(retry); 1522 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1523 add(tmp, dest_current_value, inc_value); 1524 stdcx_(tmp, addr_base); 1525 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1526 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1527 } else { 1528 bne( CCR0, retry); // StXcx_ sets CCR0. 1529 } 1530 } 1531 1532 // Word/sub-word atomic helper functions 1533 1534 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions. 1535 // Only signed types are supported with size < 4. 1536 // Atomic add always kills tmp1. 1537 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value, 1538 Register addr_base, Register tmp1, Register tmp2, Register tmp3, 1539 bool cmpxchgx_hint, bool is_add, int size) { 1540 // Sub-word instructions are available since Power 8. 1541 // For older processors, instruction_type != size holds, and we 1542 // emulate the sub-word instructions by constructing a 4-byte value 1543 // that leaves the other bytes unchanged. 
1544 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1545 1546 Label retry; 1547 Register shift_amount = noreg, 1548 val32 = dest_current_value, 1549 modval = is_add ? tmp1 : exchange_value; 1550 1551 if (instruction_type != size) { 1552 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base); 1553 modval = tmp1; 1554 shift_amount = tmp2; 1555 val32 = tmp3; 1556 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1557 #ifdef VM_LITTLE_ENDIAN 1558 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1559 clrrdi(addr_base, addr_base, 2); 1560 #else 1561 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1562 clrrdi(addr_base, addr_base, 2); 1563 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1564 #endif 1565 } 1566 1567 // atomic emulation loop 1568 bind(retry); 1569 1570 switch (instruction_type) { 1571 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1572 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1573 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1574 default: ShouldNotReachHere(); 1575 } 1576 1577 if (instruction_type != size) { 1578 srw(dest_current_value, val32, shift_amount); 1579 } 1580 1581 if (is_add) { add(modval, dest_current_value, exchange_value); } 1582 1583 if (instruction_type != size) { 1584 // Transform exchange value such that the replacement can be done by one xor instruction. 1585 xorr(modval, dest_current_value, is_add ? modval : exchange_value); 1586 clrldi(modval, modval, (size == 1) ? 56 : 48); 1587 slw(modval, modval, shift_amount); 1588 xorr(modval, val32, modval); 1589 } 1590 1591 switch (instruction_type) { 1592 case 4: stwcx_(modval, addr_base); break; 1593 case 2: sthcx_(modval, addr_base); break; 1594 case 1: stbcx_(modval, addr_base); break; 1595 default: ShouldNotReachHere(); 1596 } 1597 1598 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1599 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1600 } else { 1601 bne( CCR0, retry); // StXcx_ sets CCR0. 1602 } 1603 1604 // l?arx zero-extends, but Java wants byte/short values sign-extended. 1605 if (size == 1) { 1606 extsb(dest_current_value, dest_current_value); 1607 } else if (size == 2) { 1608 extsh(dest_current_value, dest_current_value); 1609 }; 1610 } 1611 1612 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1613 // Only signed types are supported with size < 4. 1614 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1615 Register compare_value, Register exchange_value, 1616 Register addr_base, Register tmp1, Register tmp2, 1617 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1618 // Sub-word instructions are available since Power 8. 1619 // For older processors, instruction_type != size holds, and we 1620 // emulate the sub-word instructions by constructing a 4-byte value 1621 // that leaves the other bytes unchanged. 1622 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1623 1624 Register shift_amount = noreg, 1625 val32 = dest_current_value, 1626 modval = exchange_value; 1627 1628 if (instruction_type != size) { 1629 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1630 shift_amount = tmp1; 1631 val32 = tmp2; 1632 modval = tmp2; 1633 // Need some preparation: Compute shift amount, align address. 
Note: shorts must be 2 byte aligned. 1634 #ifdef VM_LITTLE_ENDIAN 1635 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1636 clrrdi(addr_base, addr_base, 2); 1637 #else 1638 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1639 clrrdi(addr_base, addr_base, 2); 1640 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1641 #endif 1642 // Transform exchange value such that the replacement can be done by one xor instruction. 1643 xorr(exchange_value, compare_value, exchange_value); 1644 clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48); 1645 slw(exchange_value, exchange_value, shift_amount); 1646 } 1647 1648 // atomic emulation loop 1649 bind(retry); 1650 1651 switch (instruction_type) { 1652 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1653 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1654 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1655 default: ShouldNotReachHere(); 1656 } 1657 1658 if (instruction_type != size) { 1659 srw(dest_current_value, val32, shift_amount); 1660 } 1661 if (size == 1) { 1662 extsb(dest_current_value, dest_current_value); 1663 } else if (size == 2) { 1664 extsh(dest_current_value, dest_current_value); 1665 }; 1666 1667 cmpw(flag, dest_current_value, compare_value); 1668 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1669 bne_predict_not_taken(flag, failed); 1670 } else { 1671 bne( flag, failed); 1672 } 1673 // branch to done => (flag == ne), (dest_current_value != compare_value) 1674 // fall through => (flag == eq), (dest_current_value == compare_value) 1675 1676 if (instruction_type != size) { 1677 xorr(modval, val32, exchange_value); 1678 } 1679 1680 switch (instruction_type) { 1681 case 4: stwcx_(modval, addr_base); break; 1682 case 2: sthcx_(modval, addr_base); break; 1683 case 1: stbcx_(modval, addr_base); break; 1684 default: ShouldNotReachHere(); 1685 } 1686 } 1687 1688 // CmpxchgX sets condition register to cmpX(current, compare). 1689 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1690 Register compare_value, Register exchange_value, 1691 Register addr_base, Register tmp1, Register tmp2, 1692 int semantics, bool cmpxchgx_hint, 1693 Register int_flag_success, bool contention_hint, bool weak, int size) { 1694 Label retry; 1695 Label failed; 1696 Label done; 1697 1698 // Save one branch if result is returned via register and 1699 // result register is different from the other ones. 1700 bool use_result_reg = (int_flag_success != noreg); 1701 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1702 int_flag_success != exchange_value && int_flag_success != addr_base && 1703 int_flag_success != tmp1 && int_flag_success != tmp2); 1704 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1705 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1706 1707 if (use_result_reg && preset_result_reg) { 1708 li(int_flag_success, 0); // preset (assume cas failed) 1709 } 1710 1711 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1712 if (contention_hint) { // Don't try to reserve if cmp fails. 
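// Conceptually the guard is just (illustrative, not emitted code):
//
//   if (*addr_base != compare_value) goto failed;   // plain load, no reservation
//
// i.e. a cheap, non-reserving pre-check so that threads which would lose the
// CAS anyway do not keep stealing reservations from the eventual winner.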
1713 switch (size) { 1714 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1715 case 2: lha(dest_current_value, 0, addr_base); break; 1716 case 4: lwz(dest_current_value, 0, addr_base); break; 1717 default: ShouldNotReachHere(); 1718 } 1719 cmpw(flag, dest_current_value, compare_value); 1720 bne(flag, failed); 1721 } 1722 1723 // release/fence semantics 1724 if (semantics & MemBarRel) { 1725 release(); 1726 } 1727 1728 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1729 retry, failed, cmpxchgx_hint, size); 1730 if (!weak || use_result_reg) { 1731 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1732 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1733 } else { 1734 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1735 } 1736 } 1737 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1738 1739 // Result in register (must do this at the end because int_flag_success can be the 1740 // same register as one above). 1741 if (use_result_reg) { 1742 li(int_flag_success, 1); 1743 } 1744 1745 if (semantics & MemBarFenceAfter) { 1746 fence(); 1747 } else if (semantics & MemBarAcq) { 1748 isync(); 1749 } 1750 1751 if (use_result_reg && !preset_result_reg) { 1752 b(done); 1753 } 1754 1755 bind(failed); 1756 if (use_result_reg && !preset_result_reg) { 1757 li(int_flag_success, 0); 1758 } 1759 1760 bind(done); 1761 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1762 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1763 } 1764 1765 // Performs atomic compare exchange: 1766 // if (compare_value == *addr_base) 1767 // *addr_base = exchange_value 1768 // int_flag_success = 1; 1769 // else 1770 // int_flag_success = 0; 1771 // 1772 // ConditionRegister flag = cmp(compare_value, *addr_base) 1773 // Register dest_current_value = *addr_base 1774 // Register compare_value Used to compare with value in memory 1775 // Register exchange_value Written to memory if compare_value == *addr_base 1776 // Register addr_base The memory location to compareXChange 1777 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1778 // 1779 // To avoid the costly compare exchange the value is tested beforehand. 1780 // Several special cases exist to avoid that unnecessary information is generated. 1781 // 1782 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1783 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1784 Register addr_base, int semantics, bool cmpxchgx_hint, 1785 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1786 Label retry; 1787 Label failed_int; 1788 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int; 1789 Label done; 1790 1791 // Save one branch if result is returned via register and result register is different from the other ones. 
1792 bool use_result_reg = (int_flag_success!=noreg); 1793 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1794 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1795 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1796 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both"); 1797 1798 if (use_result_reg && preset_result_reg) { 1799 li(int_flag_success, 0); // preset (assume cas failed) 1800 } 1801 1802 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1803 if (contention_hint) { // Don't try to reserve if cmp fails. 1804 ld(dest_current_value, 0, addr_base); 1805 cmpd(flag, compare_value, dest_current_value); 1806 bne(flag, failed); 1807 } 1808 1809 // release/fence semantics 1810 if (semantics & MemBarRel) { 1811 release(); 1812 } 1813 1814 // atomic emulation loop 1815 bind(retry); 1816 1817 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1818 cmpd(flag, compare_value, dest_current_value); 1819 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1820 bne_predict_not_taken(flag, failed); 1821 } else { 1822 bne( flag, failed); 1823 } 1824 1825 stdcx_(exchange_value, addr_base); 1826 if (!weak || use_result_reg || failed_ext) { 1827 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1828 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1829 } else { 1830 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1831 } 1832 } 1833 1834 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1835 if (use_result_reg) { 1836 li(int_flag_success, 1); 1837 } 1838 1839 if (semantics & MemBarFenceAfter) { 1840 fence(); 1841 } else if (semantics & MemBarAcq) { 1842 isync(); 1843 } 1844 1845 if (use_result_reg && !preset_result_reg) { 1846 b(done); 1847 } 1848 1849 bind(failed_int); 1850 if (use_result_reg && !preset_result_reg) { 1851 li(int_flag_success, 0); 1852 } 1853 1854 bind(done); 1855 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1856 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1857 } 1858 1859 // Look up the method for a megamorphic invokeinterface call. 1860 // The target method is determined by <intf_klass, itable_index>. 1861 // The receiver klass is in recv_klass. 1862 // On success, the result will be in method_result, and execution falls through. 1863 // On failure, execution transfers to the given label. 1864 void MacroAssembler::lookup_interface_method(Register recv_klass, 1865 Register intf_klass, 1866 RegisterOrConstant itable_index, 1867 Register method_result, 1868 Register scan_temp, 1869 Register temp2, 1870 Label& L_no_such_interface, 1871 bool return_method) { 1872 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1873 1874 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1875 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1876 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 1877 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1878 int scan_step = itableOffsetEntry::size() * wordSize; 1879 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1880 1881 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1882 // We should store the aligned, prescaled offset in the klass. 1883 // Then the next several instructions would fold away. 
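// Layout sketch (illustrative; see klassItable for the authoritative layout):
// the itable is embedded in the Klass right behind the vtable, so the first
// itableOffsetEntry sits at
//
//   scan = recv_klass + vtable_start_offset + vtable_length * vtableEntry_size
//
// which is what the next three instructions compute into scan_temp. The scan
// further down walks these offset entries; a matching entry's offset field
// then locates the itableMethodEntry array of that interface.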
1884 1885 sldi(scan_temp, scan_temp, log_vte_size); 1886 addi(scan_temp, scan_temp, vtable_base); 1887 add(scan_temp, recv_klass, scan_temp); 1888 1889 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1890 if (return_method) { 1891 if (itable_index.is_register()) { 1892 Register itable_offset = itable_index.as_register(); 1893 sldi(method_result, itable_offset, logMEsize); 1894 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1895 add(method_result, method_result, recv_klass); 1896 } else { 1897 long itable_offset = (long)itable_index.as_constant(); 1898 // static address, no relocation 1899 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1900 } 1901 } 1902 1903 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) { 1904 // if (scan->interface() == intf) { 1905 // result = (klass + scan->offset() + itable_index); 1906 // } 1907 // } 1908 Label search, found_method; 1909 1910 for (int peel = 1; peel >= 0; peel--) { 1911 // %%%% Could load both offset and interface in one ldx, if they were 1912 // in the opposite order. This would save a load. 1913 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp); 1914 1915 // Check that this entry is non-null. A null entry means that 1916 // the receiver class doesn't implement the interface, and wasn't the 1917 // same as when the caller was compiled. 1918 cmpd(CCR0, temp2, intf_klass); 1919 1920 if (peel) { 1921 beq(CCR0, found_method); 1922 } else { 1923 bne(CCR0, search); 1924 // (invert the test to fall through to found_method...) 1925 } 1926 1927 if (!peel) break; 1928 1929 bind(search); 1930 1931 cmpdi(CCR0, temp2, 0); 1932 beq(CCR0, L_no_such_interface); 1933 addi(scan_temp, scan_temp, scan_step); 1934 } 1935 1936 bind(found_method); 1937 1938 // Got a hit. 
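// In C-like terms (illustrative only): method_result was preset above to
// recv_klass + itable_index * itableMethodEntry_size + method_offset, so the
// two loads below finish the lookup as
//
//   int off       = ((itableOffsetEntry*)scan_temp)->offset();
//   method_result = *(Method**)(method_result + off);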
1939 if (return_method) {
1940 int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1941 lwz(scan_temp, ito_offset, scan_temp);
1942 ldx(method_result, scan_temp, method_result);
1943 }
1944 }
1945
1946 // virtual method calling
1947 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1948 RegisterOrConstant vtable_index,
1949 Register method_result) {
1950
1951 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1952
1953 const ByteSize base = Klass::vtable_start_offset();
1954 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1955
1956 if (vtable_index.is_register()) {
1957 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1958 add(recv_klass, vtable_index.as_register(), recv_klass);
1959 } else {
1960 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1961 }
1962 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1963 }
1964
1965 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1966 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1967 Register super_klass,
1968 Register temp1_reg,
1969 Register temp2_reg,
1970 Label* L_success,
1971 Label* L_failure,
1972 Label* L_slow_path,
1973 RegisterOrConstant super_check_offset) {
1974
1975 const Register check_cache_offset = temp1_reg;
1976 const Register cached_super = temp2_reg;
1977
1978 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1979
1980 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1981 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1982
1983 bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1984 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sc_offset);
1985
1986 Label L_fallthrough;
1987 int label_nulls = 0;
1988 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
1989 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
1990 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
1991 assert(label_nulls <= 1 ||
1992 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1993 "at most one null in the batch, usually");
1994
1995 // If the pointers are equal, we are done (e.g., String[] elements).
1996 // This self-check enables sharing of secondary supertype arrays among
1997 // non-primary types such as array-of-interface. Otherwise, each such
1998 // type would need its own customized SSA.
1999 // We move this check to the front of the fast path because many
2000 // type checks are in fact trivially successful in this manner,
2001 // so we get a nicely predicted branch right at the start of the check.
2002 cmpd(CCR0, sub_klass, super_klass);
2003 beq(CCR0, *L_success);
2004
2005 // Check the supertype display:
2006 if (must_load_sco) {
2007 // The super check offset is always positive...
2008 lwz(check_cache_offset, sco_offset, super_klass);
2009 super_check_offset = RegisterOrConstant(check_cache_offset);
2010 // super_check_offset is register.
2011 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
2012 }
2013 // The loaded value is the offset from Klass.
2014
2015 ld(cached_super, super_check_offset, sub_klass);
2016 cmpd(CCR0, cached_super, super_klass);
2017
2018 // This check has worked decisively for primary supers.
2019 // Secondary supers are sought in the super_cache ('super_cache_addr'). 2020 // (Secondary supers are interfaces and very deeply nested subtypes.) 2021 // This works in the same check above because of a tricky aliasing 2022 // between the super_cache and the primary super display elements. 2023 // (The 'super_check_addr' can address either, as the case requires.) 2024 // Note that the cache is updated below if it does not help us find 2025 // what we need immediately. 2026 // So if it was a primary super, we can just fail immediately. 2027 // Otherwise, it's the slow path for us (no success at this point). 2028 2029 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 2030 2031 if (super_check_offset.is_register()) { 2032 beq(CCR0, *L_success); 2033 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 2034 if (L_failure == &L_fallthrough) { 2035 beq(CCR0, *L_slow_path); 2036 } else { 2037 bne(CCR0, *L_failure); 2038 FINAL_JUMP(*L_slow_path); 2039 } 2040 } else { 2041 if (super_check_offset.as_constant() == sc_offset) { 2042 // Need a slow path; fast failure is impossible. 2043 if (L_slow_path == &L_fallthrough) { 2044 beq(CCR0, *L_success); 2045 } else { 2046 bne(CCR0, *L_slow_path); 2047 FINAL_JUMP(*L_success); 2048 } 2049 } else { 2050 // No slow path; it's a fast decision. 2051 if (L_failure == &L_fallthrough) { 2052 beq(CCR0, *L_success); 2053 } else { 2054 bne(CCR0, *L_failure); 2055 FINAL_JUMP(*L_success); 2056 } 2057 } 2058 } 2059 2060 bind(L_fallthrough); 2061 #undef FINAL_JUMP 2062 } 2063 2064 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 2065 Register super_klass, 2066 Register temp1_reg, 2067 Register temp2_reg, 2068 Label* L_success, 2069 Register result_reg) { 2070 const Register array_ptr = temp1_reg; // current value from cache array 2071 const Register temp = temp2_reg; 2072 2073 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 2074 2075 int source_offset = in_bytes(Klass::secondary_supers_offset()); 2076 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 2077 2078 int length_offset = Array<Klass*>::length_offset_in_bytes(); 2079 int base_offset = Array<Klass*>::base_offset_in_bytes(); 2080 2081 Label hit, loop, failure, fallthru; 2082 2083 ld(array_ptr, source_offset, sub_klass); 2084 2085 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2086 lwz(temp, length_offset, array_ptr); 2087 cmpwi(CCR0, temp, 0); 2088 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2089 2090 mtctr(temp); // load ctr 2091 2092 bind(loop); 2093 // Oops in table are NO MORE compressed. 
2094 ld(temp, base_offset, array_ptr); 2095 cmpd(CCR0, temp, super_klass); 2096 beq(CCR0, hit); 2097 addi(array_ptr, array_ptr, BytesPerWord); 2098 bdnz(loop); 2099 2100 bind(failure); 2101 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2102 b(fallthru); 2103 2104 bind(hit); 2105 std(super_klass, target_offset, sub_klass); // save result to cache 2106 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2107 if (L_success != nullptr) { b(*L_success); } 2108 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2109 2110 bind(fallthru); 2111 } 2112 2113 // Try fast path, then go to slow one if not successful 2114 void MacroAssembler::check_klass_subtype(Register sub_klass, 2115 Register super_klass, 2116 Register temp1_reg, 2117 Register temp2_reg, 2118 Label& L_success) { 2119 Label L_failure; 2120 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2121 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2122 bind(L_failure); // Fallthru if not successful. 2123 } 2124 2125 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2126 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required"); 2127 2128 Label L_fallthrough; 2129 if (L_fast_path == nullptr) { 2130 L_fast_path = &L_fallthrough; 2131 } else if (L_slow_path == nullptr) { 2132 L_slow_path = &L_fallthrough; 2133 } 2134 2135 // Fast path check: class is fully initialized 2136 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2137 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2138 beq(CCR0, *L_fast_path); 2139 2140 // Fast path check: current thread is initializer thread 2141 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2142 cmpd(CCR0, thread, R0); 2143 if (L_slow_path == &L_fallthrough) { 2144 beq(CCR0, *L_fast_path); 2145 } else if (L_fast_path == &L_fallthrough) { 2146 bne(CCR0, *L_slow_path); 2147 } else { 2148 Unimplemented(); 2149 } 2150 2151 bind(L_fallthrough); 2152 } 2153 2154 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2155 Register temp_reg, 2156 int extra_slot_offset) { 2157 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
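// Worked example (illustrative; Interpreter::stackElementSize is 8 on this
// 64-bit port): arg_slot = 2 (constant) and extra_slot_offset = 1 yield the
// constant offset (2 + 1) * 8 = 24, while a register arg_slot gets the same
// scaling via the sldi/addi sequence below.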
2158 int stackElementSize = Interpreter::stackElementSize; 2159 int offset = extra_slot_offset * stackElementSize; 2160 if (arg_slot.is_constant()) { 2161 offset += arg_slot.as_constant() * stackElementSize; 2162 return offset; 2163 } else { 2164 assert(temp_reg != noreg, "must specify"); 2165 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2166 if (offset != 0) 2167 addi(temp_reg, temp_reg, offset); 2168 return temp_reg; 2169 } 2170 } 2171 2172 void MacroAssembler::tlab_allocate( 2173 Register obj, // result: pointer to object after successful allocation 2174 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2175 int con_size_in_bytes, // object size in bytes if known at compile time 2176 Register t1, // temp register 2177 Label& slow_case // continuation point if fast allocation fails 2178 ) { 2179 // make sure arguments make sense 2180 assert_different_registers(obj, var_size_in_bytes, t1); 2181 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2182 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2183 2184 const Register new_top = t1; 2185 //verify_tlab(); not implemented 2186 2187 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2188 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2189 if (var_size_in_bytes == noreg) { 2190 addi(new_top, obj, con_size_in_bytes); 2191 } else { 2192 add(new_top, obj, var_size_in_bytes); 2193 } 2194 cmpld(CCR0, new_top, R0); 2195 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2196 2197 #ifdef ASSERT 2198 // make sure new free pointer is properly aligned 2199 { 2200 Label L; 2201 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2202 beq(CCR0, L); 2203 stop("updated TLAB free is not properly aligned"); 2204 bind(L); 2205 } 2206 #endif // ASSERT 2207 2208 // update the tlab top pointer 2209 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2210 //verify_tlab(); not implemented 2211 } 2212 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2213 unimplemented("incr_allocated_bytes"); 2214 } 2215 2216 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2217 int insts_call_instruction_offset, Register Rtoc) { 2218 // Start the stub. 2219 address stub = start_a_stub(64); 2220 if (stub == nullptr) { return nullptr; } // CodeCache full: bail out 2221 2222 // Create a trampoline stub relocation which relates this trampoline stub 2223 // with the call instruction at insts_call_instruction_offset in the 2224 // instructions code-section. 2225 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2226 const int stub_start_offset = offset(); 2227 2228 // For java_to_interp stubs we use R11_scratch1 as scratch register 2229 // and in call trampoline stubs we use R12_scratch2. This way we 2230 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
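// The resulting stub is tiny; roughly (sketch only, the exact sequence depends
// on the offsets and on whether Rtoc is passed in):
//
//   addis/addi R12, R29_TOC, <offset of method TOC>   // only if Rtoc == noreg
//   ld         R12, <destination_toc_offset>(Rtoc)    // load call target
//   mtctr      R12
//   bctr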
2231 Register reg_scratch = R12_scratch2; 2232 2233 // Now, create the trampoline stub's code: 2234 // - load the TOC 2235 // - load the call target from the constant pool 2236 // - call 2237 if (Rtoc == noreg) { 2238 calculate_address_from_global_toc(reg_scratch, method_toc()); 2239 Rtoc = reg_scratch; 2240 } 2241 2242 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2243 mtctr(reg_scratch); 2244 bctr(); 2245 2246 const address stub_start_addr = addr_at(stub_start_offset); 2247 2248 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2249 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2250 "encoded offset into the constant pool must match"); 2251 // Trampoline_stub_size should be good. 2252 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2253 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2254 2255 // End the stub. 2256 end_a_stub(); 2257 return stub; 2258 } 2259 2260 // "The box" is the space on the stack where we copy the object mark. 2261 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2262 Register temp, Register displaced_header, Register current_header) { 2263 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight"); 2264 assert_different_registers(oop, box, temp, displaced_header, current_header); 2265 Label object_has_monitor; 2266 Label cas_failed; 2267 Label success, failure; 2268 2269 // Load markWord from object into displaced_header. 2270 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2271 2272 if (DiagnoseSyncOnValueBasedClasses != 0) { 2273 load_klass(temp, oop); 2274 lwz(temp, in_bytes(Klass::access_flags_offset()), temp); 2275 testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2276 bne(flag, failure); 2277 } 2278 2279 // Handle existing monitor. 2280 // The object has an existing monitor iff (mark & monitor_value) != 0. 2281 andi_(temp, displaced_header, markWord::monitor_value); 2282 bne(CCR0, object_has_monitor); 2283 2284 if (LockingMode == LM_MONITOR) { 2285 // Set NE to indicate 'failure' -> take slow-path. 2286 crandc(flag, Assembler::equal, flag, Assembler::equal); 2287 b(failure); 2288 } else { 2289 assert(LockingMode == LM_LEGACY, "must be"); 2290 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2291 ori(displaced_header, displaced_header, markWord::unlocked_value); 2292 2293 // Load Compare Value application register. 2294 2295 // Initialize the box. (Must happen before we update the object mark!) 2296 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2297 2298 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2299 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2300 cmpxchgd(/*flag=*/flag, 2301 /*current_value=*/current_header, 2302 /*compare_value=*/displaced_header, 2303 /*exchange_value=*/box, 2304 /*where=*/oop, 2305 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2306 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2307 noreg, 2308 &cas_failed, 2309 /*check without membar and ldarx first*/true); 2310 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2311 // If the compare-and-exchange succeeded, then we found an unlocked 2312 // object and we have now locked it. 
2313 b(success); 2314 2315 bind(cas_failed); 2316 // We did not see an unlocked object so try the fast recursive case. 2317 2318 // Check if the owner is self by comparing the value in the markWord of object 2319 // (current_header) with the stack pointer. 2320 sub(current_header, current_header, R1_SP); 2321 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2322 2323 and_(R0/*==0?*/, current_header, temp); 2324 // If condition is true we are cont and hence we can store 0 as the 2325 // displaced header in the box, which indicates that it is a recursive lock. 2326 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2327 2328 if (flag != CCR0) { 2329 mcrf(flag, CCR0); 2330 } 2331 beq(CCR0, success); 2332 b(failure); 2333 } 2334 2335 // Handle existing monitor. 2336 bind(object_has_monitor); 2337 // The object's monitor m is unlocked iff m->owner is null, 2338 // otherwise m->owner may contain a thread or a stack address. 2339 2340 // Try to CAS m->owner from null to current thread. 2341 addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value); 2342 Register thread_id = displaced_header; 2343 ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread); 2344 cmpxchgd(/*flag=*/flag, 2345 /*current_value=*/current_header, 2346 /*compare_value=*/(intptr_t)0, 2347 /*exchange_value=*/thread_id, 2348 /*where=*/temp, 2349 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2350 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2351 2352 // Store a non-null value into the box. 2353 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2354 beq(flag, success); 2355 2356 // Check for recursive locking. 2357 cmpd(flag, current_header, thread_id); 2358 bne(flag, failure); 2359 2360 // Current thread already owns the lock. Just increment recursions. 2361 Register recursions = displaced_header; 2362 ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp); 2363 addi(recursions, recursions, 1); 2364 std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp); 2365 2366 // flag == EQ indicates success, increment held monitor count 2367 // flag == NE indicates failure 2368 bind(success); 2369 inc_held_monitor_count(temp); 2370 bind(failure); 2371 } 2372 2373 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2374 Register temp, Register displaced_header, Register current_header) { 2375 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight"); 2376 assert_different_registers(oop, box, temp, displaced_header, current_header); 2377 Label success, failure, object_has_monitor, notRecursive; 2378 2379 if (LockingMode == LM_LEGACY) { 2380 // Find the lock address and load the displaced header from the stack. 2381 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2382 2383 // If the displaced header is 0, we have a recursive unlock. 2384 cmpdi(flag, displaced_header, 0); 2385 beq(flag, success); 2386 } 2387 2388 // Handle existing monitor. 2389 // The object has an existing monitor iff (mark & monitor_value) != 0. 2390 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2391 andi_(R0, current_header, markWord::monitor_value); 2392 bne(CCR0, object_has_monitor); 2393 2394 if (LockingMode == LM_MONITOR) { 2395 // Set NE to indicate 'failure' -> take slow-path. 
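// Note on the condition-register trick used here and in the lock variant above:
// crandc on the flag's 'equal' bit computes eq & ~eq == 0, i.e. it forces the
// flag to NE without needing a scratch GPR; the crorc used further down
// (eq | ~eq == 1) is the counterpart that forces EQ.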
2396 crandc(flag, Assembler::equal, flag, Assembler::equal);
2397 b(failure);
2398 } else {
2399 assert(LockingMode == LM_LEGACY, "must be");
2400 // Check if it is still a light weight lock, this is true if we see
2401 // the stack address of the basicLock in the markWord of the object.
2402 // Cmpxchg sets flag to cmpd(current_header, box).
2403 cmpxchgd(/*flag=*/flag,
2404 /*current_value=*/current_header,
2405 /*compare_value=*/box,
2406 /*exchange_value=*/displaced_header,
2407 /*where=*/oop,
2408 MacroAssembler::MemBarRel,
2409 MacroAssembler::cmpxchgx_hint_release_lock(),
2410 noreg,
2411 &failure);
2412 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2413 b(success);
2414 }
2415
2416 // Handle existing monitor.
2417 bind(object_has_monitor);
2418 STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2419 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2420 ld(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2421
2422 // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0.
2423 // This is handled like owner thread mismatches: We take the slow path.
2424 Register thread_id = displaced_header;
2425 ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread);
2426 cmpd(flag, temp, thread_id);
2427 bne(flag, failure);
2428
2429 ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2430
2431 addic_(displaced_header, displaced_header, -1);
2432 blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2433 std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2434 if (flag == CCR0) { // Otherwise, flag is already EQ, here.
2435 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ
2436 }
2437 b(success);
2438
2439 bind(notRecursive);
2440 ld(temp, in_bytes(ObjectMonitor::EntryList_offset()), current_header);
2441 ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header);
2442 orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2443 cmpdi(flag, temp, 0);
2444 bne(flag, failure);
2445 release();
2446 std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2447
2448 // flag == EQ indicates success, decrement held monitor count
2449 // flag == NE indicates failure
2450 bind(success);
2451 dec_held_monitor_count(temp);
2452 bind(failure);
2453 }
2454
2455 void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1,
2456 Register tmp2, Register tmp3) {
2457 assert_different_registers(obj, tmp1, tmp2, tmp3);
2458 assert(flag == CCR0, "bad condition register");
2459
2460 // Handle inflated monitor.
2461 Label inflated;
2462 // Finish fast lock successfully. MUST reach to with flag == EQ
2463 Label locked;
2464 // Finish fast lock unsuccessfully. MUST branch to with flag == NE
2465 Label slow_path;
2466
2467 if (DiagnoseSyncOnValueBasedClasses != 0) {
2468 load_klass(tmp1, obj);
2469 lwz(tmp1, in_bytes(Klass::access_flags_offset()), tmp1);
2470 testbitdi(flag, R0, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2471 bne(flag, slow_path);
2472 }
2473
2474 const Register mark = tmp1;
2475 const Register t = tmp3; // Usage of R0 allowed!
2476
2477 { // Lightweight locking
2478
2479 // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ
2480 Label push;
2481
2482 const Register top = tmp2;
2483
2484 // Check if lock-stack is full.
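// Lock-stack sketch (illustrative): the lock stack is a small fixed-size array
// of oops embedded in JavaThread, and 'top' is kept as a byte offset into the
// thread. It is considered full once top > LockStack::end_offset() - 1, i.e.
// top >= end_offset, in which case we defer to the slow path.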
2485 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2486 cmplwi(flag, top, LockStack::end_offset() - 1); 2487 bgt(flag, slow_path); 2488 2489 // The underflow check is elided. The recursive check will always fail 2490 // when the lock stack is empty because of the _bad_oop_sentinel field. 2491 2492 // Check if recursive. 2493 subi(t, top, oopSize); 2494 ldx(t, R16_thread, t); 2495 cmpd(flag, obj, t); 2496 beq(flag, push); 2497 2498 // Check for monitor (0b10) or locked (0b00). 2499 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2500 andi_(t, mark, markWord::lock_mask_in_place); 2501 cmpldi(flag, t, markWord::unlocked_value); 2502 bgt(flag, inflated); 2503 bne(flag, slow_path); 2504 2505 // Not inflated. 2506 2507 // Try to lock. Transition lock bits 0b00 => 0b01 2508 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 2509 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq); 2510 2511 bind(push); 2512 // After successful lock, push object on lock-stack. 2513 stdx(obj, R16_thread, top); 2514 addi(top, top, oopSize); 2515 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2516 b(locked); 2517 } 2518 2519 { // Handle inflated monitor. 2520 bind(inflated); 2521 2522 // mark contains the tagged ObjectMonitor*. 2523 const Register tagged_monitor = mark; 2524 const uintptr_t monitor_tag = markWord::monitor_value; 2525 const Register owner_addr = tmp2; 2526 2527 // Compute owner address. 2528 addi(owner_addr, tagged_monitor, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag); 2529 2530 // CAS owner (null => current thread id). 2531 Register thread_id = tmp1; 2532 ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread); 2533 cmpxchgd(/*flag=*/flag, 2534 /*current_value=*/t, 2535 /*compare_value=*/(intptr_t)0, 2536 /*exchange_value=*/thread_id, 2537 /*where=*/owner_addr, 2538 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2539 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2540 beq(flag, locked); 2541 2542 // Check if recursive. 2543 cmpd(flag, t, thread_id); 2544 bne(flag, slow_path); 2545 2546 // Recursive. 2547 ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2548 addi(tmp1, tmp1, 1); 2549 std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2550 } 2551 2552 bind(locked); 2553 inc_held_monitor_count(tmp1); 2554 2555 #ifdef ASSERT 2556 // Check that locked label is reached with flag == EQ. 2557 Label flag_correct; 2558 beq(flag, flag_correct); 2559 stop("Fast Lock Flag != EQ"); 2560 #endif 2561 bind(slow_path); 2562 #ifdef ASSERT 2563 // Check that slow_path label is reached with flag == NE. 2564 bne(flag, flag_correct); 2565 stop("Fast Lock Flag != NE"); 2566 bind(flag_correct); 2567 #endif 2568 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 2569 } 2570 2571 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1, 2572 Register tmp2, Register tmp3) { 2573 assert_different_registers(obj, tmp1, tmp2, tmp3); 2574 assert(flag == CCR0, "bad condition register"); 2575 2576 // Handle inflated monitor. 2577 Label inflated, inflated_load_monitor; 2578 // Finish fast unlock successfully. MUST reach to with flag == EQ. 2579 Label unlocked; 2580 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE. 
2581 Label slow_path; 2582 2583 const Register mark = tmp1; 2584 const Register top = tmp2; 2585 const Register t = tmp3; 2586 2587 { // Lightweight unlock 2588 Label push_and_slow; 2589 2590 // Check if obj is top of lock-stack. 2591 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2592 subi(top, top, oopSize); 2593 ldx(t, R16_thread, top); 2594 cmpd(flag, obj, t); 2595 // Top of lock stack was not obj. Must be monitor. 2596 bne(flag, inflated_load_monitor); 2597 2598 // Pop lock-stack. 2599 DEBUG_ONLY(li(t, 0);) 2600 DEBUG_ONLY(stdx(t, R16_thread, top);) 2601 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2602 2603 // The underflow check is elided. The recursive check will always fail 2604 // when the lock stack is empty because of the _bad_oop_sentinel field. 2605 2606 // Check if recursive. 2607 subi(t, top, oopSize); 2608 ldx(t, R16_thread, t); 2609 cmpd(flag, obj, t); 2610 beq(flag, unlocked); 2611 2612 // Not recursive. 2613 2614 // Check for monitor (0b10). 2615 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2616 andi_(t, mark, markWord::monitor_value); 2617 bne(CCR0, inflated); 2618 2619 #ifdef ASSERT 2620 // Check header not unlocked (0b01). 2621 Label not_unlocked; 2622 andi_(t, mark, markWord::unlocked_value); 2623 beq(CCR0, not_unlocked); 2624 stop("lightweight_unlock already unlocked"); 2625 bind(not_unlocked); 2626 #endif 2627 2628 // Try to unlock. Transition lock bits 0b00 => 0b01 2629 atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel); 2630 b(unlocked); 2631 2632 bind(push_and_slow); 2633 // Restore lock-stack and handle the unlock in runtime. 2634 DEBUG_ONLY(stdx(obj, R16_thread, top);) 2635 addi(top, top, oopSize); 2636 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2637 b(slow_path); 2638 } 2639 2640 { // Handle inflated monitor. 2641 bind(inflated_load_monitor); 2642 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2643 #ifdef ASSERT 2644 andi_(t, mark, markWord::monitor_value); 2645 bne(CCR0, inflated); 2646 stop("Fast Unlock not monitor"); 2647 #endif 2648 2649 bind(inflated); 2650 2651 #ifdef ASSERT 2652 Label check_done; 2653 subi(top, top, oopSize); 2654 cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset())); 2655 blt(CCR0, check_done); 2656 ldx(t, R16_thread, top); 2657 cmpd(flag, obj, t); 2658 bne(flag, inflated); 2659 stop("Fast Unlock lock on stack"); 2660 bind(check_done); 2661 #endif 2662 2663 // mark contains the tagged ObjectMonitor*. 2664 const Register monitor = mark; 2665 const uintptr_t monitor_tag = markWord::monitor_value; 2666 2667 // Untag the monitor. 2668 subi(monitor, mark, monitor_tag); 2669 2670 const Register recursions = tmp2; 2671 Label not_recursive; 2672 2673 // Check if recursive. 2674 ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2675 addic_(recursions, recursions, -1); 2676 blt(CCR0, not_recursive); 2677 2678 // Recursive unlock. 2679 std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2680 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); 2681 b(unlocked); 2682 2683 bind(not_recursive); 2684 2685 Label release_; 2686 const Register t2 = tmp2; 2687 2688 // Check if the entry lists are empty. 
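// Conceptually (illustrative): if monitor->EntryList == nullptr and
// monitor->cxq == nullptr there are no queued waiters, so the owner field can
// simply be cleared with release semantics below; otherwise the runtime has to
// be involved to wake a successor, hence the branch to slow_path.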
2689 ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor); 2690 ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor); 2691 orr(t, t, t2); 2692 cmpdi(flag, t, 0); 2693 beq(flag, release_); 2694 2695 // The owner may be anonymous and we removed the last obj entry in 2696 // the lock-stack. This loses the information about the owner. 2697 // Write the thread to the owner field so the runtime knows the owner. 2698 Register thread_id = tmp2; 2699 ld(thread_id, in_bytes(JavaThread::lock_id_offset()), R16_thread); 2700 std(thread_id, in_bytes(ObjectMonitor::owner_offset()), monitor); 2701 b(slow_path); 2702 2703 bind(release_); 2704 // Set owner to null. 2705 release(); 2706 // t contains 0 2707 std(t, in_bytes(ObjectMonitor::owner_offset()), monitor); 2708 } 2709 2710 bind(unlocked); 2711 dec_held_monitor_count(t); 2712 2713 #ifdef ASSERT 2714 // Check that unlocked label is reached with flag == EQ. 2715 Label flag_correct; 2716 beq(flag, flag_correct); 2717 stop("Fast Lock Flag != EQ"); 2718 #endif 2719 bind(slow_path); 2720 #ifdef ASSERT 2721 // Check that slow_path label is reached with flag == NE. 2722 bne(flag, flag_correct); 2723 stop("Fast Lock Flag != NE"); 2724 bind(flag_correct); 2725 #endif 2726 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 2727 } 2728 2729 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) { 2730 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread); 2731 2732 if (at_return) { 2733 if (in_nmethod) { 2734 if (UseSIGTRAP) { 2735 // Use Signal Handler. 2736 relocate(relocInfo::poll_return_type); 2737 td(traptoGreaterThanUnsigned, R1_SP, temp); 2738 } else { 2739 cmpld(CCR0, R1_SP, temp); 2740 // Stub may be out of range for short conditional branch. 2741 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path); 2742 } 2743 } else { // Not in nmethod. 2744 // Frame still on stack, need to get fp. 2745 Register fp = R0; 2746 ld(fp, _abi0(callers_sp), R1_SP); 2747 cmpld(CCR0, fp, temp); 2748 bgt(CCR0, slow_path); 2749 } 2750 } else { // Normal safepoint poll. Not at return. 2751 assert(!in_nmethod, "should use load_from_polling_page"); 2752 andi_(temp, temp, SafepointMechanism::poll_bit()); 2753 bne(CCR0, slow_path); 2754 } 2755 } 2756 2757 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, 2758 MacroAssembler::PreservationLevel preservation_level) { 2759 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2760 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level); 2761 } 2762 2763 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2, 2764 MacroAssembler::PreservationLevel preservation_level) { 2765 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2766 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level); 2767 } 2768 2769 // Values for last_Java_pc, and last_Java_sp must comply to the rules 2770 // in frame_ppc.hpp. 2771 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 2772 // Always set last_Java_pc and flags first because once last_Java_sp 2773 // is visible has_last_Java_frame is true and users will look at the 2774 // rest of the fields. (Note: flags should always be zero before we 2775 // get here so doesn't need to be set.) 
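// In effect the anchor is published in this order (illustrative sketch):
//
//   thread->_anchor._last_Java_pc = pc;   // optional, skipped if pc == noreg
//   thread->_anchor._last_Java_sp = sp;   // stored last: once sp is visible,
//                                         // has_last_Java_frame() is true
//
// so a stack walker that observes last_Java_sp also sees a consistent pc.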
2776 2777 // Verify that last_Java_pc was zeroed on return to Java 2778 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 2779 "last_Java_pc not zeroed before leaving Java"); 2780 2781 // When returning from calling out from Java mode the frame anchor's 2782 // last_Java_pc will always be set to null. It is set here so that 2783 // if we are doing a call to native (not VM) that we capture the 2784 // known pc and don't have to rely on the native call having a 2785 // standard frame linkage where we can find the pc. 2786 if (last_Java_pc != noreg) 2787 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2788 2789 // Set last_Java_sp last. 2790 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2791 } 2792 2793 void MacroAssembler::reset_last_Java_frame(void) { 2794 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 2795 R16_thread, "SP was not set, still zero"); 2796 2797 BLOCK_COMMENT("reset_last_Java_frame {"); 2798 li(R0, 0); 2799 2800 // _last_Java_sp = 0 2801 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2802 2803 // _last_Java_pc = 0 2804 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2805 BLOCK_COMMENT("} reset_last_Java_frame"); 2806 } 2807 2808 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 2809 assert_different_registers(sp, tmp1); 2810 2811 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 2812 // TOP_IJAVA_FRAME_ABI. 2813 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 2814 address entry = pc(); 2815 load_const_optimized(tmp1, entry); 2816 2817 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 2818 } 2819 2820 void MacroAssembler::get_vm_result(Register oop_result) { 2821 // Read: 2822 // R16_thread 2823 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2824 // 2825 // Updated: 2826 // oop_result 2827 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2828 2829 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2830 li(R0, 0); 2831 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2832 2833 verify_oop(oop_result, FILE_AND_LINE); 2834 } 2835 2836 void MacroAssembler::get_vm_result_2(Register metadata_result) { 2837 // Read: 2838 // R16_thread 2839 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2840 // 2841 // Updated: 2842 // metadata_result 2843 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2844 2845 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2846 li(R0, 0); 2847 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2848 } 2849 2850 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 2851 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 2852 if (CompressedKlassPointers::base() != 0) { 2853 // Use dst as temp if it is free. 
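// Encoding sketch (illustrative): the narrow klass value is formed as
//
//   narrow = (klass - CompressedKlassPointers::base()) >> CompressedKlassPointers::shift()
//
// with either step dropped when base or shift is zero; decode_klass_not_null
// further down performs the inverse (shift left, then add the base back).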
2854 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 2855 current = dst; 2856 } 2857 if (CompressedKlassPointers::shift() != 0) { 2858 srdi(dst, current, CompressedKlassPointers::shift()); 2859 current = dst; 2860 } 2861 return current; 2862 } 2863 2864 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 2865 if (UseCompressedClassPointers) { 2866 Register compressedKlass = encode_klass_not_null(ck, klass); 2867 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 2868 } else { 2869 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 2870 } 2871 } 2872 2873 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 2874 if (UseCompressedClassPointers) { 2875 if (val == noreg) { 2876 val = R0; 2877 li(val, 0); 2878 } 2879 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 2880 } 2881 } 2882 2883 int MacroAssembler::instr_size_for_decode_klass_not_null() { 2884 static int computed_size = -1; 2885 2886 // Not yet computed? 2887 if (computed_size == -1) { 2888 2889 if (!UseCompressedClassPointers) { 2890 computed_size = 0; 2891 } else { 2892 // Determine by scratch emit. 2893 ResourceMark rm; 2894 int code_size = 8 * BytesPerInstWord; 2895 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0); 2896 MacroAssembler* a = new MacroAssembler(&cb); 2897 a->decode_klass_not_null(R11_scratch1); 2898 computed_size = a->offset(); 2899 } 2900 } 2901 2902 return computed_size; 2903 } 2904 2905 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 2906 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 2907 if (src == noreg) src = dst; 2908 Register shifted_src = src; 2909 if (CompressedKlassPointers::shift() != 0 || 2910 (CompressedKlassPointers::base() == 0 && src != dst)) { // Move required. 2911 shifted_src = dst; 2912 sldi(shifted_src, src, CompressedKlassPointers::shift()); 2913 } 2914 if (CompressedKlassPointers::base() != 0) { 2915 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 2916 } 2917 } 2918 2919 void MacroAssembler::load_klass(Register dst, Register src) { 2920 if (UseCompressedClassPointers) { 2921 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 2922 // Attention: no null check here! 2923 decode_klass_not_null(dst, dst); 2924 } else { 2925 ld(dst, oopDesc::klass_offset_in_bytes(), src); 2926 } 2927 } 2928 2929 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) { 2930 null_check(src, oopDesc::klass_offset_in_bytes(), is_null); 2931 load_klass(dst, src); 2932 } 2933 2934 // ((OopHandle)result).resolve(); 2935 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2, 2936 MacroAssembler::PreservationLevel preservation_level) { 2937 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level); 2938 } 2939 2940 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2, 2941 MacroAssembler::PreservationLevel preservation_level) { 2942 Label resolved; 2943 2944 // A null weak handle resolves to null. 
2945 cmpdi(CCR0, result, 0); 2946 beq(CCR0, resolved); 2947 2948 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2, 2949 preservation_level); 2950 bind(resolved); 2951 } 2952 2953 void MacroAssembler::load_method_holder(Register holder, Register method) { 2954 ld(holder, in_bytes(Method::const_offset()), method); 2955 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 2956 ld(holder, ConstantPool::pool_holder_offset(), holder); 2957 } 2958 2959 // Clear Array 2960 // For very short arrays. tmp == R0 is allowed. 2961 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 2962 if (cnt_dwords > 0) { li(tmp, 0); } 2963 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 2964 } 2965 2966 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 2967 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 2968 if (cnt_dwords < 8) { 2969 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 2970 return; 2971 } 2972 2973 Label loop; 2974 const long loopcnt = cnt_dwords >> 1, 2975 remainder = cnt_dwords & 1; 2976 2977 li(tmp, loopcnt); 2978 mtctr(tmp); 2979 li(tmp, 0); 2980 bind(loop); 2981 std(tmp, 0, base_ptr); 2982 std(tmp, 8, base_ptr); 2983 addi(base_ptr, base_ptr, 16); 2984 bdnz(loop); 2985 if (remainder) { std(tmp, 0, base_ptr); } 2986 } 2987 2988 // Kills both input registers. tmp == R0 is allowed. 2989 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 2990 // Procedure for large arrays (uses data cache block zero instruction). 2991 Label startloop, fast, fastloop, small_rest, restloop, done; 2992 const int cl_size = VM_Version::L1_data_cache_line_size(), 2993 cl_dwords = cl_size >> 3, 2994 cl_dw_addr_bits = exact_log2(cl_dwords), 2995 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 2996 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 2997 2998 if (const_cnt >= 0) { 2999 // Constant case. 3000 if (const_cnt < min_cnt) { 3001 clear_memory_constlen(base_ptr, const_cnt, tmp); 3002 return; 3003 } 3004 load_const_optimized(cnt_dwords, const_cnt, tmp); 3005 } else { 3006 // cnt_dwords already loaded in register. Need to check size. 3007 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3008 blt(CCR1, small_rest); 3009 } 3010 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3011 beq(CCR0, fast); // Already 128byte aligned. 3012 3013 subfic(tmp, tmp, cl_dwords); 3014 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3015 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3016 li(tmp, 0); 3017 3018 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3019 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3020 addi(base_ptr, base_ptr, 8); 3021 bdnz(startloop); 3022 3023 bind(fast); // Clear 128byte blocks. 3024 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3025 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3026 mtctr(tmp); // Load counter. 3027 3028 bind(fastloop); 3029 dcbz(base_ptr); // Clear 128byte aligned block. 3030 addi(base_ptr, base_ptr, cl_size); 3031 bdnz(fastloop); 3032 3033 bind(small_rest); 3034 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3035 beq(CCR0, done); // rest == 0 3036 li(tmp, 0); 3037 mtctr(cnt_dwords); // Load counter. 
3038 3039 bind(restloop); // Clear rest. 3040 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3041 addi(base_ptr, base_ptr, 8); 3042 bdnz(restloop); 3043 3044 bind(done); 3045 } 3046 3047 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3048 3049 // Helpers for Intrinsic Emitters 3050 // 3051 // Revert the byte order of a 32bit value in a register 3052 // src: 0x44556677 3053 // dst: 0x77665544 3054 // Three steps to obtain the result: 3055 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3056 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3057 // This value initializes dst. 3058 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3059 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3060 // This value is mask inserted into dst with a [0..23] mask of 1s. 3061 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3062 // This value is mask inserted into dst with a [8..15] mask of 1s. 3063 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3064 assert_different_registers(dst, src); 3065 3066 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3067 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 3068 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3069 } 3070 3071 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3072 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3073 // body size from 20 to 16 instructions. 3074 // Returns the offset that was used to calculate the address of column tc3. 3075 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3076 // at hand, the original table address can be easily reconstructed. 3077 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3078 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3079 3080 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3081 // Layout: See StubRoutines::ppc::generate_crc_constants. 3082 #ifdef VM_LITTLE_ENDIAN 3083 const int ix0 = 3 * CRC32_TABLE_SIZE; 3084 const int ix1 = 2 * CRC32_TABLE_SIZE; 3085 const int ix2 = 1 * CRC32_TABLE_SIZE; 3086 const int ix3 = 0 * CRC32_TABLE_SIZE; 3087 #else 3088 const int ix0 = 1 * CRC32_TABLE_SIZE; 3089 const int ix1 = 2 * CRC32_TABLE_SIZE; 3090 const int ix2 = 3 * CRC32_TABLE_SIZE; 3091 const int ix3 = 4 * CRC32_TABLE_SIZE; 3092 #endif 3093 assert_different_registers(table, tc0, tc1, tc2); 3094 assert(table == tc3, "must be!"); 3095 3096 addi(tc0, table, ix0); 3097 addi(tc1, table, ix1); 3098 addi(tc2, table, ix2); 3099 if (ix3 != 0) addi(tc3, table, ix3); 3100 3101 return ix3; 3102 } 3103 3104 /** 3105 * uint32_t crc; 3106 * table[crc & 0xFF] ^ (crc >> 8); 3107 */ 3108 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3109 assert_different_registers(crc, table, tmp); 3110 assert_different_registers(val, table); 3111 3112 if (crc == val) { // Must rotate first to use the unmodified value. 
3113 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3114 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3115 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3116 } else { 3117 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3118 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3119 } 3120 lwzx(tmp, table, tmp); 3121 xorr(crc, crc, tmp); 3122 } 3123 3124 /** 3125 * Emits code to update CRC-32 with a byte value according to constants in table. 3126 * 3127 * @param [in,out]crc Register containing the crc. 3128 * @param [in]val Register containing the byte to fold into the CRC. 3129 * @param [in]table Register containing the table of crc constants. 3130 * 3131 * uint32_t crc; 3132 * val = crc_table[(val ^ crc) & 0xFF]; 3133 * crc = val ^ (crc >> 8); 3134 */ 3135 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3136 BLOCK_COMMENT("update_byte_crc32:"); 3137 xorr(val, val, crc); 3138 fold_byte_crc32(crc, val, table, val); 3139 } 3140 3141 /** 3142 * @param crc register containing existing CRC (32-bit) 3143 * @param buf register pointing to input byte buffer (byte*) 3144 * @param len register containing number of bytes 3145 * @param table register pointing to CRC table 3146 */ 3147 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3148 Register data, bool loopAlignment) { 3149 assert_different_registers(crc, buf, len, table, data); 3150 3151 Label L_mainLoop, L_done; 3152 const int mainLoop_stepping = 1; 3153 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3154 3155 // Process all bytes in a single-byte loop. 3156 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3157 beq(CCR0, L_done); 3158 3159 mtctr(len); 3160 align(mainLoop_alignment); 3161 BIND(L_mainLoop); 3162 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3163 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3164 update_byte_crc32(crc, data, table); 3165 bdnz(L_mainLoop); // Iterate. 3166 3167 bind(L_done); 3168 } 3169 3170 /** 3171 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3172 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3173 */ 3174 // A note on the lookup table address(es): 3175 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3176 // To save the effort of adding the column offset to the table address each time 3177 // a table element is looked up, it is possible to pass the pre-calculated 3178 // column addresses. 3179 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3180 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3181 Register t0, Register t1, Register t2, Register t3, 3182 Register tc0, Register tc1, Register tc2, Register tc3) { 3183 assert_different_registers(crc, t3); 3184 3185 // XOR crc with next four bytes of buffer. 3186 lwz(t3, bufDisp, buf); 3187 if (bufInc != 0) { 3188 addi(buf, buf, bufInc); 3189 } 3190 xorr(t3, t3, crc); 3191 3192 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 
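  // Illustrative C equivalent of this 4-byte step (slicing-by-4 in the style of zlib's
  // crc32.c; tc0..tc3 are the four table columns set up by crc32_table_columns, whose
  // byte-reversed variants compensate for byte order on big-endian):
  //   t3  = crc ^ next_4_buffer_bytes;
  //   crc = tc0[(t3 >>  0) & 0xff] ^ tc1[(t3 >>  8) & 0xff]
  //       ^ tc2[(t3 >> 16) & 0xff] ^ tc3[(t3 >> 24) & 0xff];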
3193 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3194 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3195 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3196 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3197 3198 // Use the pre-calculated column addresses. 3199 // Load pre-calculated table values. 3200 lwzx(t0, tc0, t0); 3201 lwzx(t1, tc1, t1); 3202 lwzx(t2, tc2, t2); 3203 lwzx(t3, tc3, t3); 3204 3205 // Calculate new crc from table values. 3206 xorr(t0, t0, t1); 3207 xorr(t2, t2, t3); 3208 xorr(crc, t0, t2); // Now crc contains the final checksum value. 3209 } 3210 3211 /** 3212 * @param crc register containing existing CRC (32-bit) 3213 * @param buf register pointing to input byte buffer (byte*) 3214 * @param len register containing number of bytes 3215 * @param table register pointing to CRC table 3216 * 3217 * uses R9..R12 as work register. Must be saved/restored by caller! 3218 */ 3219 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 3220 Register t0, Register t1, Register t2, Register t3, 3221 Register tc0, Register tc1, Register tc2, Register tc3, 3222 bool invertCRC) { 3223 assert_different_registers(crc, buf, len, table); 3224 3225 Label L_mainLoop, L_tail; 3226 Register tmp = t0; 3227 Register data = t0; 3228 Register tmp2 = t1; 3229 const int mainLoop_stepping = 4; 3230 const int tailLoop_stepping = 1; 3231 const int log_stepping = exact_log2(mainLoop_stepping); 3232 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 3233 const int complexThreshold = 2*mainLoop_stepping; 3234 3235 // Don't test for len <= 0 here. This pathological case should not occur anyway. 3236 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 3237 // for all well-behaved cases. The situation itself is detected and handled correctly 3238 // within update_byteLoop_crc32. 3239 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 3240 3241 BLOCK_COMMENT("kernel_crc32_1word {"); 3242 3243 if (invertCRC) { 3244 nand(crc, crc, crc); // 1s complement of crc 3245 } 3246 3247 // Check for short (<mainLoop_stepping) buffer. 3248 cmpdi(CCR0, len, complexThreshold); 3249 blt(CCR0, L_tail); 3250 3251 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 3252 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 3253 { 3254 // Align buf addr to mainLoop_stepping boundary. 3255 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 3256 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 3257 3258 if (complexThreshold > mainLoop_stepping) { 3259 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3260 } else { 3261 sub(tmp, len, tmp2); // Remaining bytes for main loop. 3262 cmpdi(CCR0, tmp, mainLoop_stepping); 3263 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 3264 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 
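      // At this point tmp2 holds the number of bytes needed to align buf to a
      // mainLoop_stepping boundary; the byte loop right below consumes exactly those.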
3265 } 3266 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 3267 } 3268 3269 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 3270 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 3271 mtctr(tmp2); 3272 3273 #ifdef VM_LITTLE_ENDIAN 3274 Register crc_rv = crc; 3275 #else 3276 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 3277 // Occupies tmp, but frees up crc. 3278 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 3279 tmp = crc; 3280 #endif 3281 3282 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 3283 3284 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 3285 BIND(L_mainLoop); 3286 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 3287 bdnz(L_mainLoop); 3288 3289 #ifndef VM_LITTLE_ENDIAN 3290 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 3291 tmp = crc_rv; // Tmp uses it's original register again. 3292 #endif 3293 3294 // Restore original table address for tailLoop. 3295 if (reconstructTableOffset != 0) { 3296 addi(table, table, -reconstructTableOffset); 3297 } 3298 3299 // Process last few (<complexThreshold) bytes of buffer. 3300 BIND(L_tail); 3301 update_byteLoop_crc32(crc, buf, len, table, data, false); 3302 3303 if (invertCRC) { 3304 nand(crc, crc, crc); // 1s complement of crc 3305 } 3306 BLOCK_COMMENT("} kernel_crc32_1word"); 3307 } 3308 3309 /** 3310 * @param crc register containing existing CRC (32-bit) 3311 * @param buf register pointing to input byte buffer (byte*) 3312 * @param len register containing number of bytes 3313 * @param constants register pointing to precomputed constants 3314 * @param t0-t6 temp registers 3315 */ 3316 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 3317 Register t0, Register t1, Register t2, Register t3, 3318 Register t4, Register t5, Register t6, bool invertCRC) { 3319 assert_different_registers(crc, buf, len, constants); 3320 3321 Label L_tail; 3322 3323 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 3324 3325 if (invertCRC) { 3326 nand(crc, crc, crc); // 1s complement of crc 3327 } 3328 3329 // Enforce 32 bit. 3330 clrldi(len, len, 32); 3331 3332 // Align if we have enough bytes for the fast version. 3333 const int alignment = 16, 3334 threshold = 32; 3335 Register prealign = t0; 3336 3337 neg(prealign, buf); 3338 addi(t1, len, -threshold); 3339 andi(prealign, prealign, alignment - 1); 3340 cmpw(CCR0, t1, prealign); 3341 blt(CCR0, L_tail); // len - prealign < threshold? 3342 3343 subf(len, prealign, len); 3344 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 3345 3346 // Calculate from first aligned address as far as possible. 3347 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 3348 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 3349 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 3350 3351 // Remaining bytes. 
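  // (Fewer than 16 of them, or the whole buffer if it was below the threshold;
  // they are folded in one at a time by the table-driven byte loop.)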
3352 BIND(L_tail); 3353 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 3354 3355 if (invertCRC) { 3356 nand(crc, crc, crc); // 1s complement of crc 3357 } 3358 3359 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 3360 } 3361 3362 /** 3363 * @param crc register containing existing CRC (32-bit) 3364 * @param buf register pointing to input byte buffer (byte*) 3365 * @param len register containing number of bytes (will get updated to remaining bytes) 3366 * @param constants register pointing to CRC table for 128-bit aligned memory 3367 * @param t0-t6 temp registers 3368 */ 3369 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 3370 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 3371 3372 // Save non-volatile vector registers (frameless). 3373 Register offset = t1; 3374 int offsetInt = 0; 3375 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 3376 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 3377 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 3378 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 3379 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 3380 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 3381 #ifndef VM_LITTLE_ENDIAN 3382 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 3383 #endif 3384 offsetInt -= 8; std(R14, offsetInt, R1_SP); 3385 offsetInt -= 8; std(R15, offsetInt, R1_SP); 3386 3387 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 3388 // bytes per iteration. The basic scheme is: 3389 // lvx: load vector (Big Endian needs reversal) 3390 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 3391 // vxor: xor partial results together to get unroll_factor2 vectors 3392 3393 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 3394 3395 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 3396 const int unroll_factor = CRC32_UNROLL_FACTOR, 3397 unroll_factor2 = CRC32_UNROLL_FACTOR2; 3398 3399 const int outer_consts_size = (unroll_factor2 - 1) * 16, 3400 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 3401 3402 // Support registers. 3403 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 3404 Register num_bytes = R14, 3405 loop_count = R15, 3406 cur_const = crc; // will live in VCRC 3407 // Constant array for outer loop: unroll_factor2 - 1 registers, 3408 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 3409 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 3410 consts1[] = { VR23, VR24 }; 3411 // Data register arrays: 2 arrays with unroll_factor2 registers. 3412 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 3413 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 3414 3415 VectorRegister VCRC = data0[0]; 3416 VectorRegister Vc = VR25; 3417 VectorRegister swap_bytes = VR26; // Only for Big Endian. 3418 3419 // We have at least 1 iteration (ensured by caller). 3420 Label L_outer_loop, L_inner_loop, L_last; 3421 3422 // If supported set DSCR pre-fetch to deepest. 
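  // (DSCR = Data Stream Control Register; its low-order bits select the default
  // prefetch depth, 7 being the deepest setting.)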
3423 if (VM_Version::has_mfdscr()) { 3424 load_const_optimized(t0, VM_Version::_dscr_val | 7); 3425 mtdscr(t0); 3426 } 3427 3428 mtvrwz(VCRC, crc); // crc lives in VCRC, now 3429 3430 for (int i = 1; i < unroll_factor2; ++i) { 3431 li(offs[i], 16 * i); 3432 } 3433 3434 // Load consts for outer loop 3435 lvx(consts0[0], constants); 3436 for (int i = 1; i < unroll_factor2 - 1; ++i) { 3437 lvx(consts0[i], offs[i], constants); 3438 } 3439 3440 load_const_optimized(num_bytes, 16 * unroll_factor); 3441 3442 // Reuse data registers outside of the loop. 3443 VectorRegister Vtmp = data1[0]; 3444 VectorRegister Vtmp2 = data1[1]; 3445 VectorRegister zeroes = data1[2]; 3446 3447 vspltisb(Vtmp, 0); 3448 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 3449 3450 // Load vector for vpermxor (to xor both 64 bit parts together) 3451 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 3452 vspltisb(Vc, 4); 3453 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 3454 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 3455 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 3456 3457 #ifdef VM_LITTLE_ENDIAN 3458 #define BE_swap_bytes(x) 3459 #else 3460 vspltisb(Vtmp2, 0xf); 3461 vxor(swap_bytes, Vtmp, Vtmp2); 3462 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 3463 #endif 3464 3465 cmpd(CCR0, len, num_bytes); 3466 blt(CCR0, L_last); 3467 3468 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 3469 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 3470 3471 // ********** Main loop start ********** 3472 align(32); 3473 bind(L_outer_loop); 3474 3475 // Begin of unrolled first iteration (no xor). 3476 lvx(data1[0], buf); 3477 for (int i = 1; i < unroll_factor2 / 2; ++i) { 3478 lvx(data1[i], offs[i], buf); 3479 } 3480 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3481 lvx(consts1[0], cur_const); 3482 mtctr(loop_count); 3483 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3484 BE_swap_bytes(data1[i]); 3485 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 3486 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3487 vpmsumw(data0[i], data1[i], consts1[0]); 3488 } 3489 addi(buf, buf, 16 * unroll_factor2); 3490 subf(len, num_bytes, len); 3491 lvx(consts1[1], offs[1], cur_const); 3492 addi(cur_const, cur_const, 32); 3493 // Begin of unrolled second iteration (head). 3494 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3495 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3496 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 3497 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 3498 } 3499 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3500 BE_swap_bytes(data1[i]); 3501 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3502 vpmsumw(data1[i], data1[i], consts1[1]); 3503 } 3504 addi(buf, buf, 16 * unroll_factor2); 3505 3506 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 3507 // Double-iteration allows using the 2 constant registers alternatingly. 3508 align(32); 3509 bind(L_inner_loop); 3510 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 
3511 if (j & 1) { 3512 lvx(consts1[0], cur_const); 3513 } else { 3514 lvx(consts1[1], offs[1], cur_const); 3515 addi(cur_const, cur_const, 32); 3516 } 3517 for (int i = 0; i < unroll_factor2; ++i) { 3518 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 3519 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 3520 BE_swap_bytes(data1[idx]); 3521 vxor(data0[i], data0[i], data1[i]); 3522 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 3523 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 3524 } 3525 addi(buf, buf, 16 * unroll_factor2); 3526 } 3527 bdnz(L_inner_loop); 3528 3529 addi(cur_const, constants, outer_consts_size); // Reset 3530 3531 // Tail of last iteration (no loads). 3532 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3533 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3534 vxor(data0[i], data0[i], data1[i]); 3535 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 3536 } 3537 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3538 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 3539 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 3540 } 3541 3542 // Last data register is ok, other ones need fixup shift. 3543 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 3544 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 3545 } 3546 3547 // Combine to 128 bit result vector VCRC = data0[0]. 3548 for (int i = 1; i < unroll_factor2; i<<=1) { 3549 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 3550 vxor(data0[j], data0[j], data0[j+i]); 3551 } 3552 } 3553 cmpd(CCR0, len, num_bytes); 3554 bge(CCR0, L_outer_loop); 3555 3556 // Last chance with lower num_bytes. 3557 bind(L_last); 3558 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 3559 // Point behind last const for inner loop. 3560 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3561 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 3562 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 3563 subf(cur_const, R0, cur_const); // Point to constant to be used first. 3564 3565 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 3566 bgt(CCR0, L_outer_loop); 3567 // ********** Main loop end ********** 3568 3569 // Restore DSCR pre-fetch value. 3570 if (VM_Version::has_mfdscr()) { 3571 load_const_optimized(t0, VM_Version::_dscr_val); 3572 mtdscr(t0); 3573 } 3574 3575 // ********** Simple loop for remaining 16 byte blocks ********** 3576 { 3577 Label L_loop, L_done; 3578 3579 srdi_(t0, len, 4); // 16 bytes per iteration 3580 clrldi(len, len, 64-4); 3581 beq(CCR0, L_done); 3582 3583 // Point to const (same as last const for inner loop). 3584 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 3585 mtctr(t0); 3586 lvx(Vtmp2, cur_const); 3587 3588 align(32); 3589 bind(L_loop); 3590 3591 lvx(Vtmp, buf); 3592 addi(buf, buf, 16); 3593 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 
3594 BE_swap_bytes(Vtmp); 3595 vxor(VCRC, VCRC, Vtmp); 3596 vpmsumw(VCRC, VCRC, Vtmp2); 3597 bdnz(L_loop); 3598 3599 bind(L_done); 3600 } 3601 // ********** Simple loop end ********** 3602 #undef BE_swap_bytes 3603 3604 // Point to Barrett constants 3605 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3606 3607 vspltisb(zeroes, 0); 3608 3609 // Combine to 64 bit result. 3610 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3611 3612 // Reduce to 32 bit CRC: Remainder by multiply-high. 3613 lvx(Vtmp, cur_const); 3614 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 3615 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 3616 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 3617 vsldoi(Vtmp, zeroes, Vtmp, 8); 3618 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 3619 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 3620 3621 // Move result. len is already updated. 3622 vsldoi(VCRC, VCRC, zeroes, 8); 3623 mfvrd(crc, VCRC); 3624 3625 // Restore non-volatile Vector registers (frameless). 3626 offsetInt = 0; 3627 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 3628 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 3629 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 3630 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 3631 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 3632 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 3633 #ifndef VM_LITTLE_ENDIAN 3634 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 3635 #endif 3636 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 3637 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 3638 } 3639 3640 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 3641 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 3642 load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr() 3643 : StubRoutines::crc_table_addr() , R0); 3644 3645 if (VM_Version::has_vpmsumb()) { 3646 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 3647 } else { 3648 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 3649 } 3650 } 3651 3652 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 3653 assert_different_registers(crc, val, table); 3654 3655 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 3656 if (invertCRC) { 3657 nand(crc, crc, crc); // 1s complement of crc 3658 } 3659 3660 update_byte_crc32(crc, val, table); 3661 3662 if (invertCRC) { 3663 nand(crc, crc, crc); // 1s complement of crc 3664 } 3665 } 3666 3667 // dest_lo += src1 + src2 3668 // dest_hi += carry1 + carry2 3669 void MacroAssembler::add2_with_carry(Register dest_hi, 3670 Register dest_lo, 3671 Register src1, Register src2) { 3672 li(R0, 0); 3673 addc(dest_lo, dest_lo, src1); 3674 adde(dest_hi, dest_hi, R0); 3675 addc(dest_lo, dest_lo, src2); 3676 adde(dest_hi, dest_hi, R0); 3677 } 3678 3679 // Multiply 64 bit by 64 bit first loop. 
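// x, y and z are arrays of 32-bit limbs (Java int[], most significant limb first). Two
// adjacent limbs are loaded or stored as one 64-bit value; on little-endian the two
// 32-bit halves arrive swapped, so the rldicl(reg, reg, 32, 0) rotations below swap them
// back into big-endian limb order around the 64x64 bit multiplies.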
3680 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3681 Register x_xstart, 3682 Register y, Register y_idx, 3683 Register z, 3684 Register carry, 3685 Register product_high, Register product, 3686 Register idx, Register kdx, 3687 Register tmp) { 3688 // jlong carry, x[], y[], z[]; 3689 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3690 // huge_128 product = y[idx] * x[xstart] + carry; 3691 // z[kdx] = (jlong)product; 3692 // carry = (jlong)(product >>> 64); 3693 // } 3694 // z[xstart] = carry; 3695 3696 Label L_first_loop, L_first_loop_exit; 3697 Label L_one_x, L_one_y, L_multiply; 3698 3699 addic_(xstart, xstart, -1); 3700 blt(CCR0, L_one_x); // Special case: length of x is 1. 3701 3702 // Load next two integers of x. 3703 sldi(tmp, xstart, LogBytesPerInt); 3704 ldx(x_xstart, x, tmp); 3705 #ifdef VM_LITTLE_ENDIAN 3706 rldicl(x_xstart, x_xstart, 32, 0); 3707 #endif 3708 3709 align(32, 16); 3710 bind(L_first_loop); 3711 3712 cmpdi(CCR0, idx, 1); 3713 blt(CCR0, L_first_loop_exit); 3714 addi(idx, idx, -2); 3715 beq(CCR0, L_one_y); 3716 3717 // Load next two integers of y. 3718 sldi(tmp, idx, LogBytesPerInt); 3719 ldx(y_idx, y, tmp); 3720 #ifdef VM_LITTLE_ENDIAN 3721 rldicl(y_idx, y_idx, 32, 0); 3722 #endif 3723 3724 3725 bind(L_multiply); 3726 multiply64(product_high, product, x_xstart, y_idx); 3727 3728 li(tmp, 0); 3729 addc(product, product, carry); // Add carry to result. 3730 adde(product_high, product_high, tmp); // Add carry of the last addition. 3731 addi(kdx, kdx, -2); 3732 3733 // Store result. 3734 #ifdef VM_LITTLE_ENDIAN 3735 rldicl(product, product, 32, 0); 3736 #endif 3737 sldi(tmp, kdx, LogBytesPerInt); 3738 stdx(product, z, tmp); 3739 mr_if_needed(carry, product_high); 3740 b(L_first_loop); 3741 3742 3743 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 3744 3745 lwz(y_idx, 0, y); 3746 b(L_multiply); 3747 3748 3749 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 3750 3751 lwz(x_xstart, 0, x); 3752 b(L_first_loop); 3753 3754 bind(L_first_loop_exit); 3755 } 3756 3757 // Multiply 64 bit by 64 bit and add 128 bit. 3758 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 3759 Register z, Register yz_idx, 3760 Register idx, Register carry, 3761 Register product_high, Register product, 3762 Register tmp, int offset) { 3763 3764 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 3765 // z[kdx] = (jlong)product; 3766 3767 sldi(tmp, idx, LogBytesPerInt); 3768 if (offset) { 3769 addi(tmp, tmp, offset); 3770 } 3771 ldx(yz_idx, y, tmp); 3772 #ifdef VM_LITTLE_ENDIAN 3773 rldicl(yz_idx, yz_idx, 32, 0); 3774 #endif 3775 3776 multiply64(product_high, product, x_xstart, yz_idx); 3777 ldx(yz_idx, z, tmp); 3778 #ifdef VM_LITTLE_ENDIAN 3779 rldicl(yz_idx, yz_idx, 32, 0); 3780 #endif 3781 3782 add2_with_carry(product_high, product, carry, yz_idx); 3783 3784 sldi(tmp, idx, LogBytesPerInt); 3785 if (offset) { 3786 addi(tmp, tmp, offset); 3787 } 3788 #ifdef VM_LITTLE_ENDIAN 3789 rldicl(product, product, 32, 0); 3790 #endif 3791 stdx(product, z, tmp); 3792 } 3793 3794 // Multiply 128 bit by 128 bit. Unrolled inner loop. 
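// Each iteration of the ctr-controlled loop below consumes four 32-bit limbs of y (two
// 64x64 multiply/accumulate steps via multiply_add_128_x_128), so the loop count is
// idx / 4 and any remaining 1..3 limbs are handled after the loop exit.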
3795 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 3796 Register y, Register z, 3797 Register yz_idx, Register idx, Register carry, 3798 Register product_high, Register product, 3799 Register carry2, Register tmp) { 3800 3801 // jlong carry, x[], y[], z[]; 3802 // int kdx = ystart+1; 3803 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 3804 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 3805 // z[kdx+idx+1] = (jlong)product; 3806 // jlong carry2 = (jlong)(product >>> 64); 3807 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 3808 // z[kdx+idx] = (jlong)product; 3809 // carry = (jlong)(product >>> 64); 3810 // } 3811 // idx += 2; 3812 // if (idx > 0) { 3813 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 3814 // z[kdx+idx] = (jlong)product; 3815 // carry = (jlong)(product >>> 64); 3816 // } 3817 3818 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 3819 const Register jdx = R0; 3820 3821 // Scale the index. 3822 srdi_(jdx, idx, 2); 3823 beq(CCR0, L_third_loop_exit); 3824 mtctr(jdx); 3825 3826 align(32, 16); 3827 bind(L_third_loop); 3828 3829 addi(idx, idx, -4); 3830 3831 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 3832 mr_if_needed(carry2, product_high); 3833 3834 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 3835 mr_if_needed(carry, product_high); 3836 bdnz(L_third_loop); 3837 3838 bind(L_third_loop_exit); // Handle any left-over operand parts. 3839 3840 andi_(idx, idx, 0x3); 3841 beq(CCR0, L_post_third_loop_done); 3842 3843 Label L_check_1; 3844 3845 addic_(idx, idx, -2); 3846 blt(CCR0, L_check_1); 3847 3848 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 3849 mr_if_needed(carry, product_high); 3850 3851 bind(L_check_1); 3852 3853 addi(idx, idx, 0x2); 3854 andi_(idx, idx, 0x1); 3855 addic_(idx, idx, -1); 3856 blt(CCR0, L_post_third_loop_done); 3857 3858 sldi(tmp, idx, LogBytesPerInt); 3859 lwzx(yz_idx, y, tmp); 3860 multiply64(product_high, product, x_xstart, yz_idx); 3861 lwzx(yz_idx, z, tmp); 3862 3863 add2_with_carry(product_high, product, yz_idx, carry); 3864 3865 sldi(tmp, idx, LogBytesPerInt); 3866 stwx(product, z, tmp); 3867 srdi(product, product, 32); 3868 3869 sldi(product_high, product_high, 32); 3870 orr(product, product, product_high); 3871 mr_if_needed(carry, product); 3872 3873 bind(L_post_third_loop_done); 3874 } // multiply_128_x_128_loop 3875 3876 void MacroAssembler::muladd(Register out, Register in, 3877 Register offset, Register len, Register k, 3878 Register tmp1, Register tmp2, Register carry) { 3879 3880 // Labels 3881 Label LOOP, SKIP; 3882 3883 // Make sure length is positive. 
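  // (len <= 0 simply falls through to SKIP below.)
  //
  // Roughly, the emitted code performs the following Java-style loop (cf. the mulAdd
  // intrinsic of BigInteger; 32-bit limbs, least significant limb at the highest index):
  //   long carry = 0;
  //   for (int j = len - 1; j >= 0; j--) {
  //     long product = (in[j] & 0xffffffffL) * (k & 0xffffffffL)
  //                  + (out[offset] & 0xffffffffL) + carry;
  //     out[offset--] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   // carry is left in the carry register for the caller.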
3884 cmpdi (CCR0, len, 0); 3885 3886 // Prepare variables 3887 subi (offset, offset, 4); 3888 li (carry, 0); 3889 ble (CCR0, SKIP); 3890 3891 mtctr (len); 3892 subi (len, len, 1 ); 3893 sldi (len, len, 2 ); 3894 3895 // Main loop 3896 bind(LOOP); 3897 lwzx (tmp1, len, in ); 3898 lwzx (tmp2, offset, out ); 3899 mulld (tmp1, tmp1, k ); 3900 add (tmp2, carry, tmp2 ); 3901 add (tmp2, tmp1, tmp2 ); 3902 stwx (tmp2, offset, out ); 3903 srdi (carry, tmp2, 32 ); 3904 subi (offset, offset, 4 ); 3905 subi (len, len, 4 ); 3906 bdnz (LOOP); 3907 bind(SKIP); 3908 } 3909 3910 void MacroAssembler::multiply_to_len(Register x, Register xlen, 3911 Register y, Register ylen, 3912 Register z, Register zlen, 3913 Register tmp1, Register tmp2, 3914 Register tmp3, Register tmp4, 3915 Register tmp5, Register tmp6, 3916 Register tmp7, Register tmp8, 3917 Register tmp9, Register tmp10, 3918 Register tmp11, Register tmp12, 3919 Register tmp13) { 3920 3921 ShortBranchVerifier sbv(this); 3922 3923 assert_different_registers(x, xlen, y, ylen, z, zlen, 3924 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 3925 assert_different_registers(x, xlen, y, ylen, z, zlen, 3926 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 3927 assert_different_registers(x, xlen, y, ylen, z, zlen, 3928 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 3929 3930 const Register idx = tmp1; 3931 const Register kdx = tmp2; 3932 const Register xstart = tmp3; 3933 3934 const Register y_idx = tmp4; 3935 const Register carry = tmp5; 3936 const Register product = tmp6; 3937 const Register product_high = tmp7; 3938 const Register x_xstart = tmp8; 3939 const Register tmp = tmp9; 3940 3941 // First Loop. 3942 // 3943 // final static long LONG_MASK = 0xffffffffL; 3944 // int xstart = xlen - 1; 3945 // int ystart = ylen - 1; 3946 // long carry = 0; 3947 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 3948 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 3949 // z[kdx] = (int)product; 3950 // carry = product >>> 32; 3951 // } 3952 // z[xstart] = (int)carry; 3953 3954 mr_if_needed(idx, ylen); // idx = ylen 3955 mr_if_needed(kdx, zlen); // kdx = xlen + ylen 3956 li(carry, 0); // carry = 0 3957 3958 Label L_done; 3959 3960 addic_(xstart, xlen, -1); 3961 blt(CCR0, L_done); 3962 3963 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 3964 carry, product_high, product, idx, kdx, tmp); 3965 3966 Label L_second_loop; 3967 3968 cmpdi(CCR0, kdx, 0); 3969 beq(CCR0, L_second_loop); 3970 3971 Label L_carry; 3972 3973 addic_(kdx, kdx, -1); 3974 beq(CCR0, L_carry); 3975 3976 // Store lower 32 bits of carry. 3977 sldi(tmp, kdx, LogBytesPerInt); 3978 stwx(carry, z, tmp); 3979 srdi(carry, carry, 32); 3980 addi(kdx, kdx, -1); 3981 3982 3983 bind(L_carry); 3984 3985 // Store upper 32 bits of carry. 3986 sldi(tmp, kdx, LogBytesPerInt); 3987 stwx(carry, z, tmp); 3988 3989 // Second and third (nested) loops. 
3990 // 3991 // for (int i = xstart-1; i >= 0; i--) { // Second loop 3992 // carry = 0; 3993 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 3994 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 3995 // (z[k] & LONG_MASK) + carry; 3996 // z[k] = (int)product; 3997 // carry = product >>> 32; 3998 // } 3999 // z[i] = (int)carry; 4000 // } 4001 // 4002 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 4003 4004 bind(L_second_loop); 4005 4006 li(carry, 0); // carry = 0; 4007 4008 addic_(xstart, xstart, -1); // i = xstart-1; 4009 blt(CCR0, L_done); 4010 4011 Register zsave = tmp10; 4012 4013 mr(zsave, z); 4014 4015 4016 Label L_last_x; 4017 4018 sldi(tmp, xstart, LogBytesPerInt); 4019 add(z, z, tmp); // z = z + k - j 4020 addi(z, z, 4); 4021 addic_(xstart, xstart, -1); // i = xstart-1; 4022 blt(CCR0, L_last_x); 4023 4024 sldi(tmp, xstart, LogBytesPerInt); 4025 ldx(x_xstart, x, tmp); 4026 #ifdef VM_LITTLE_ENDIAN 4027 rldicl(x_xstart, x_xstart, 32, 0); 4028 #endif 4029 4030 4031 Label L_third_loop_prologue; 4032 4033 bind(L_third_loop_prologue); 4034 4035 Register xsave = tmp11; 4036 Register xlensave = tmp12; 4037 Register ylensave = tmp13; 4038 4039 mr(xsave, x); 4040 mr(xlensave, xstart); 4041 mr(ylensave, ylen); 4042 4043 4044 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4045 carry, product_high, product, x, tmp); 4046 4047 mr(z, zsave); 4048 mr(x, xsave); 4049 mr(xlen, xlensave); // This is the decrement of the loop counter! 4050 mr(ylen, ylensave); 4051 4052 addi(tmp3, xlen, 1); 4053 sldi(tmp, tmp3, LogBytesPerInt); 4054 stwx(carry, z, tmp); 4055 addic_(tmp3, tmp3, -1); 4056 blt(CCR0, L_done); 4057 4058 srdi(carry, carry, 32); 4059 sldi(tmp, tmp3, LogBytesPerInt); 4060 stwx(carry, z, tmp); 4061 b(L_second_loop); 4062 4063 // Next infrequent code is moved outside loops. 4064 bind(L_last_x); 4065 4066 lwz(x_xstart, 0, x); 4067 b(L_third_loop_prologue); 4068 4069 bind(L_done); 4070 } // multiply_to_len 4071 4072 void MacroAssembler::asm_assert(bool check_equal, const char *msg) { 4073 #ifdef ASSERT 4074 Label ok; 4075 if (check_equal) { 4076 beq(CCR0, ok); 4077 } else { 4078 bne(CCR0, ok); 4079 } 4080 stop(msg); 4081 bind(ok); 4082 #endif 4083 } 4084 4085 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4086 Register mem_base, const char* msg) { 4087 #ifdef ASSERT 4088 switch (size) { 4089 case 4: 4090 lwz(R0, mem_offset, mem_base); 4091 cmpwi(CCR0, R0, 0); 4092 break; 4093 case 8: 4094 ld(R0, mem_offset, mem_base); 4095 cmpdi(CCR0, R0, 0); 4096 break; 4097 default: 4098 ShouldNotReachHere(); 4099 } 4100 asm_assert(check_equal, msg); 4101 #endif // ASSERT 4102 } 4103 4104 void MacroAssembler::verify_coop(Register coop, const char* msg) { 4105 if (!VerifyOops) { return; } 4106 if (UseCompressedOops) { decode_heap_oop(coop); } 4107 verify_oop(coop, msg); 4108 if (UseCompressedOops) { encode_heap_oop(coop, coop); } 4109 } 4110 4111 // READ: oop. KILL: R0. Volatile floats perhaps. 4112 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4113 if (!VerifyOops) { 4114 return; 4115 } 4116 4117 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4118 const Register tmp = R11; // Will be preserved. 
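  // Calling convention of the verify-oop stub used below: the oop is passed in R4_ARG2
  // and the message in R3_ARG1. All volatile GPRs (except R0) plus LR/CR are saved and
  // restored around the call, so apart from R0 the caller's registers are unaffected.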
4119 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4120 4121 BLOCK_COMMENT("verify_oop {"); 4122 4123 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4124 4125 mr_if_needed(R4_ARG2, oop); 4126 save_LR_CR(tmp); // save in old frame 4127 push_frame_reg_args(nbytes_save, tmp); 4128 // load FunctionDescriptor** / entry_address * 4129 load_const_optimized(tmp, fd, R0); 4130 // load FunctionDescriptor* / entry_address 4131 ld(tmp, 0, tmp); 4132 load_const_optimized(R3_ARG1, (address)msg, R0); 4133 // Call destination for its side effect. 4134 call_c(tmp); 4135 4136 pop_frame(); 4137 restore_LR_CR(tmp); 4138 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4139 4140 BLOCK_COMMENT("} verify_oop"); 4141 } 4142 4143 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4144 if (!VerifyOops) { 4145 return; 4146 } 4147 4148 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4149 const Register tmp = R11; // Will be preserved. 4150 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4151 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4152 4153 ld(R4_ARG2, offs, base); 4154 save_LR_CR(tmp); // save in old frame 4155 push_frame_reg_args(nbytes_save, tmp); 4156 // load FunctionDescriptor** / entry_address * 4157 load_const_optimized(tmp, fd, R0); 4158 // load FunctionDescriptor* / entry_address 4159 ld(tmp, 0, tmp); 4160 load_const_optimized(R3_ARG1, (address)msg, R0); 4161 // Call destination for its side effect. 4162 call_c(tmp); 4163 4164 pop_frame(); 4165 restore_LR_CR(tmp); 4166 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4167 } 4168 4169 // Call a C-function that prints output. 4170 void MacroAssembler::stop(int type, const char* msg) { 4171 bool msg_present = (msg != nullptr); 4172 4173 #ifndef PRODUCT 4174 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null")); 4175 #else 4176 block_comment("stop {"); 4177 #endif 4178 4179 if (msg_present) { 4180 type |= stop_msg_present; 4181 } 4182 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type); 4183 if (msg_present) { 4184 emit_int64((uintptr_t)msg); 4185 } 4186 4187 block_comment("} stop;"); 4188 } 4189 4190 #ifndef PRODUCT 4191 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4192 // Val, addr are temp registers. 4193 // If low == addr, addr is killed. 4194 // High is preserved. 4195 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4196 if (!ZapMemory) return; 4197 4198 assert_different_registers(low, val); 4199 4200 BLOCK_COMMENT("zap memory region {"); 4201 load_const_optimized(val, 0x0101010101010101); 4202 int size = before + after; 4203 if (low == high && size < 5 && size > 0) { 4204 int offset = -before*BytesPerWord; 4205 for (int i = 0; i < size; ++i) { 4206 std(val, offset, low); 4207 offset += (1*BytesPerWord); 4208 } 4209 } else { 4210 addi(addr, low, -before*BytesPerWord); 4211 assert_different_registers(high, val); 4212 if (after) addi(high, high, after * BytesPerWord); 4213 Label loop; 4214 bind(loop); 4215 std(val, 0, addr); 4216 addi(addr, addr, 8); 4217 cmpd(CCR6, addr, high); 4218 ble(CCR6, loop); 4219 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 
4220 }
4221 BLOCK_COMMENT("} zap memory region");
4222 }
4223
4224 #endif // !PRODUCT
4225
4226 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4227 const bool* flag_addr, Label& label) {
4228 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4229 assert(sizeof(bool) == 1, "PowerPC ABI");
4230 masm->lbz(temp, simm16_offset, temp);
4231 masm->cmpwi(CCR0, temp, 0);
4232 masm->beq(CCR0, label);
4233 }
4234
4235 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4236 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4237 }
4238
4239 SkipIfEqualZero::~SkipIfEqualZero() {
4240 _masm->bind(_label);
4241 }
4242
4243 void MacroAssembler::cache_wb(Address line) {
4244 assert(line.index() == noreg, "index should be noreg");
4245 assert(line.disp() == 0, "displacement should be 0");
4246 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4247 // Data Cache Store, not really a flush, so it works like a sync of cache
4248 // line and persistent mem, i.e. copying the cache line to persistent whilst
4249 // not invalidating the cache line.
4250 dcbst(line.base());
4251 }
4252
4253 void MacroAssembler::cache_wbsync(bool is_presync) {
4254 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4255 // We only need a post sync barrier. Post means _after_ a cache line flush or
4256 // store instruction, pre means a barrier emitted before such an instruction.
4257 if (!is_presync) {
4258 fence();
4259 }
4260 }
4261
4262 void MacroAssembler::push_cont_fastpath() {
4263 Label done;
4264 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4265 cmpld(CCR0, R1_SP, R0);
4266 ble(CCR0, done);
4267 st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4268 bind(done);
4269 }
4270
4271 void MacroAssembler::pop_cont_fastpath() {
4272 Label done;
4273 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4274 cmpld(CCR0, R1_SP, R0);
4275 ble(CCR0, done);
4276 li(R0, 0);
4277 st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4278 bind(done);
4279 }
4280
4281 // Note: Must preserve CCR0 EQ (invariant).
4282 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4283 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4284 #ifdef ASSERT
4285 Label ok;
4286 cmpdi(CCR0, tmp, 0);
4287 bge_predict_taken(CCR0, ok);
4288 stop("held monitor count is negative at increment");
4289 bind(ok);
4290 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4291 #endif
4292 addi(tmp, tmp, 1);
4293 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4294 }
4295
4296 // Note: Must preserve CCR0 EQ (invariant).
4297 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4298 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4299 #ifdef ASSERT
4300 Label ok;
4301 cmpdi(CCR0, tmp, 0);
4302 bgt_predict_taken(CCR0, ok);
4303 stop("held monitor count is <= 0 at decrement");
4304 bind(ok);
4305 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4306 #endif
4307 addi(tmp, tmp, -1);
4308 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4309 }
4310
4311 // Function to flip between unlocked and locked state (fast locking).
4312 // Branches to failed if the state is not as expected with CCR0 NE.
4313 // Falls through upon success with CCR0 EQ. 4314 // This requires fewer instructions and registers and is easier to use than the 4315 // cmpxchg based implementation. 4316 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) { 4317 assert_different_registers(obj, tmp, R0); 4318 Label retry; 4319 4320 if (semantics & MemBarRel) { 4321 release(); 4322 } 4323 4324 bind(retry); 4325 STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this! 4326 if (!is_unlock) { 4327 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock()); 4328 xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit 4329 andi_(R0, tmp, markWord::lock_mask_in_place); 4330 bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0) 4331 } else { 4332 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock()); 4333 andi_(R0, tmp, markWord::lock_mask_in_place); 4334 bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0) 4335 ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit 4336 } 4337 stdcx_(tmp, obj); 4338 bne(CCR0, retry); 4339 4340 if (semantics & MemBarFenceAfter) { 4341 fence(); 4342 } else if (semantics & MemBarAcq) { 4343 isync(); 4344 } 4345 } 4346 4347 // Implements lightweight-locking. 4348 // 4349 // - obj: the object to be locked 4350 // - t1, t2: temporary register 4351 void MacroAssembler::lightweight_lock(Register obj, Register t1, Register t2, Label& slow) { 4352 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4353 assert_different_registers(obj, t1, t2); 4354 4355 Label push; 4356 const Register top = t1; 4357 const Register mark = t2; 4358 const Register t = R0; 4359 4360 // Check if the lock-stack is full. 4361 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4362 cmplwi(CCR0, top, LockStack::end_offset()); 4363 bge(CCR0, slow); 4364 4365 // The underflow check is elided. The recursive check will always fail 4366 // when the lock stack is empty because of the _bad_oop_sentinel field. 4367 4368 // Check for recursion. 4369 subi(t, top, oopSize); 4370 ldx(t, R16_thread, t); 4371 cmpd(CCR0, obj, t); 4372 beq(CCR0, push); 4373 4374 // Check header for monitor (0b10) or locked (0b00). 4375 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4376 xori(t, mark, markWord::unlocked_value); 4377 andi_(t, t, markWord::lock_mask_in_place); 4378 bne(CCR0, slow); 4379 4380 // Try to lock. Transition lock bits 0b00 => 0b01 4381 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq); 4382 4383 bind(push); 4384 // After successful lock, push object on lock-stack 4385 stdx(obj, R16_thread, top); 4386 addi(top, top, oopSize); 4387 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4388 } 4389 4390 // Implements lightweight-unlocking. 4391 // 4392 // - obj: the object to be unlocked 4393 // - t1: temporary register 4394 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) { 4395 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4396 assert_different_registers(obj, t1); 4397 4398 #ifdef ASSERT 4399 { 4400 // The following checks rely on the fact that LockStack is only ever modified by 4401 // its owning thread, even if the lock got inflated concurrently; removal of LockStack 4402 // entries after inflation will happen delayed in that case. 4403 4404 // Check for lock-stack underflow. 
4405 Label stack_ok; 4406 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4407 cmplwi(CCR0, t1, LockStack::start_offset()); 4408 bge(CCR0, stack_ok); 4409 stop("Lock-stack underflow"); 4410 bind(stack_ok); 4411 } 4412 #endif 4413 4414 Label unlocked, push_and_slow; 4415 const Register top = t1; 4416 const Register mark = R0; 4417 Register t = R0; 4418 4419 // Check if obj is top of lock-stack. 4420 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4421 subi(top, top, oopSize); 4422 ldx(t, R16_thread, top); 4423 cmpd(CCR0, obj, t); 4424 bne(CCR0, slow); 4425 4426 // Pop lock-stack. 4427 DEBUG_ONLY(li(t, 0);) 4428 DEBUG_ONLY(stdx(t, R16_thread, top);) 4429 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4430 4431 // The underflow check is elided. The recursive check will always fail 4432 // when the lock stack is empty because of the _bad_oop_sentinel field. 4433 4434 // Check if recursive. 4435 subi(t, top, oopSize); 4436 ldx(t, R16_thread, t); 4437 cmpd(CCR0, obj, t); 4438 beq(CCR0, unlocked); 4439 4440 // Use top as tmp 4441 t = top; 4442 4443 // Not recursive. Check header for monitor (0b10). 4444 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4445 andi_(t, mark, markWord::monitor_value); 4446 bne(CCR0, push_and_slow); 4447 4448 #ifdef ASSERT 4449 // Check header not unlocked (0b01). 4450 Label not_unlocked; 4451 andi_(t, mark, markWord::unlocked_value); 4452 beq(CCR0, not_unlocked); 4453 stop("lightweight_unlock already unlocked"); 4454 bind(not_unlocked); 4455 #endif 4456 4457 // Try to unlock. Transition lock bits 0b00 => 0b01 4458 atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel); 4459 b(unlocked); 4460 4461 bind(push_and_slow); 4462 4463 // Restore lock-stack and handle the unlock in runtime. 4464 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4465 DEBUG_ONLY(stdx(obj, R16_thread, top);) 4466 addi(top, top, oopSize); 4467 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4468 b(slow); 4469 4470 bind(unlocked); 4471 }
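// Rough pseudocode of the two lightweight-locking fast paths above (illustration only;
// mark word lock bits: 01 = unlocked, 00 = fast-locked, 10 = inflated monitor):
//
//   lock(obj):
//     if (lock-stack is full)            goto slow;
//     if (lock-stack top == obj)         { push(obj); return; }            // recursive
//     if (mark(obj) lock bits != 01)     goto slow;                        // monitor or locked elsewhere
//     flip lock bits 01 -> 00 atomically, on failure goto slow;
//     push(obj);
//
//   unlock(obj):
//     if (lock-stack top != obj)         goto slow;
//     pop();
//     if (new top == obj)                return;                           // recursive
//     if (mark(obj) has monitor bit)     { push(obj) again; goto slow; }
//     flip lock bits 00 -> 01 atomically, on failure { push(obj) again; goto slow; }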