/*
 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2023 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/klass.inline.hpp"
#include "oops/methodData.hpp"
#include "prims/methodHandles.hpp"
#include "register_ppc.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/macros.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
  case 8:  ld(dst, offs, base); break;
  case 4:  is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
  case 2:  is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
  case 1:  lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                       size_t size_in_bytes) {
  switch (size_in_bytes) {
  case 8:  std(dst, offs, base); break;
  case 4:  stw(dst, offs, base); break;
  case 2:  sth(dst, offs, base); break;
  case 1:  stb(dst, offs, base); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::align(int modulus, int max, int rem) {
  int padding = (rem + modulus - (offset() % modulus)) % modulus;
  if (padding > max) return;
  for (int c = (padding >> 2); c > 0; --c) { nop(); }
}

void MacroAssembler::align_prefix() {
  if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
}

// Issue instructions that calculate given TOC from global TOC.
void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
                                                       bool add_relocation, bool emit_dummy_addr) {
  int offset = -1;
  if (emit_dummy_addr) {
    offset = -128; // dummy address
  } else if (addr != (address)(intptr_t)-1) {
    offset = MacroAssembler::offset_to_global_toc(addr);
  }

  if (hi16) {
    addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
  }
  if (lo16) {
    if (add_relocation) {
      // Relocate at the addi to avoid confusion with a load from the method's TOC.
      relocate(internal_word_Relocation::spec(addr));
    }
    addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
  }
}

address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
  const int offset = MacroAssembler::offset_to_global_toc(addr);

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // Stop, found the addis which writes dst.
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
  set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
  set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
  return inst1_addr;
}

address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the addi,
  // and the addi reads and writes the same register dst.
  const int dst = inv_rt_field(inst2);
  assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");

  // Now, find the preceding addis which writes to dst.
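  // Illustrative shape of the sequence being matched (as emitted by
  // calculate_address_from_global_toc above):
  //   addis dst, R29_TOC, offset_hi
  //   ...
  //   addi  dst, dst, offset_lo   // <- 'a' points to this addi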
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
      // stop, found the addis which writes dst
      break;
    }
    inst1_addr -= BytesPerInstWord;
  }

  assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");

  int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
  // -1 is a special case
  if (offset == -1) {
    return (address)(intptr_t)-1;
  } else {
    return global_toc() + offset;
  }
}

#ifdef _LP64
// Patch compressed oops or klass constants.
// Assembler sequence is
//  1) compressed oops:
//     lis  rx = const.hi
//     ori  rx = rx | const.lo
//  2) compressed klass:
//     lis  rx = const.hi
//     clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
//     ori  rx = rx | const.lo
// Clrldi will be passed by.
address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding addis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;
  while (inst1_addr >= bound) {
    inst1 = *(int *)inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint32_t data_value = CompressedOops::narrow_oop_value(data);
  int xc = (data_value >> 16) & 0xffff;
  int xd = (data_value >>  0) & 0xffff;

  set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
  set_imm((int *)inst2_addr, (xd)); // unsigned int
  return inst1_addr;
}

// Get compressed oop constant.
narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
  assert(UseCompressedOops, "Should only patch compressed oops");

  const address inst2_addr = a;
  const int inst2 = *(int *)inst2_addr;

  // The relocation points to the second instruction, the ori,
  // and the ori reads and writes the same register dst.
  const int dst = inv_rta_field(inst2);
  assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
  // Now, find the preceding lis which writes to dst.
  int inst1 = 0;
  address inst1_addr = inst2_addr - BytesPerInstWord;
  bool inst1_found = false;

  while (inst1_addr >= bound) {
    inst1 = *(int *) inst1_addr;
    if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
    inst1_addr -= BytesPerInstWord;
  }
  assert(inst1_found, "inst is not lis");

  uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
  uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);

  return CompressedOops::narrow_oop_cast(xl | xh);
}
#endif // _LP64

// Returns true if successful.
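// The emitted sequence is either a single 'ld dst, toc_offset(toc)' or, for
// offsets not encodable in 16 bits, an 'addis'/'ld' pair
// (see ld_largeoffset_unchecked).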
bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
                                                Register toc, bool fixed_size) {
  int toc_offset = 0;
  // Use RelocationHolder::none for the constant pool entry, otherwise
  // we will end up with a failing NativeCall::verify(x) where x is
  // the address of the constant pool entry.
  // FIXME: We should insert relocation information for oops at the constant
  // pool entries instead of inserting it at the loads; patching of a constant
  // pool entry should be less expensive.
  address const_address = address_constant((address)a.value(), RelocationHolder::none);
  if (const_address == nullptr) { return false; } // allocation failure
  // Relocate at the pc of the load.
  relocate(a.rspec());
  toc_offset = (int)(const_address - code()->consts()->start());
  ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
  return true;
}

bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  // The relocation points to the ld or the addis.
  return (is_ld(inst1)) ||
         (is_addis(inst1) && inv_ra_field(inst1) != 0);
}

int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
  assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");

  const address inst1_addr = a;
  const int inst1 = *(int *)inst1_addr;

  if (is_ld(inst1)) {
    return inv_d1_field(inst1);
  } else if (is_addis(inst1)) {
    const int dst = inv_rt_field(inst1);

    // Now, find the succeeding ld which reads and writes to dst.
    address inst2_addr = inst1_addr + BytesPerInstWord;
    int inst2 = 0;
    while (true) {
      inst2 = *(int *) inst2_addr;
      if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
        // Stop, found the ld which reads and writes dst.
        break;
      }
      inst2_addr += BytesPerInstWord;
    }
    return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
  }
  ShouldNotReachHere();
  return 0;
}

// Get the constant from a `load_const' sequence.
long MacroAssembler::get_const(address a) {
  assert(is_load_const_at(a), "not a load of a constant");
  const int *p = (const int*) a;
  unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
  if (is_ori(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
  } else if (is_lis(*(p+1))) {
    x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
    x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
    x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
  } else {
    ShouldNotReachHere();
    return (long) 0;
  }
  return (long) x;
}

// Patch the 64 bit constant of a `load_const' sequence. This is a low
// level procedure. It neither flushes the instruction cache nor is it
// mt safe.
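// Sketch of the two `load_const' layouts that get_const()/patch_const()
// distinguish by the second instruction (xa..xd are the four 16-bit chunks,
// most significant first); the immediate indices used below refer to these
// instruction positions:
//   lis d,xa;  ori d,d,xb;  sldi d,d,32;  oris d,d,xc;  ori d,d,xd
//   lis t,xa;  lis d,xc;    ori t,t,xb;   ori d,d,xd;   rldimi d,t,32,0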
void MacroAssembler::patch_const(address a, long x) {
  assert(is_load_const_at(a), "not a load of a constant");
  int *p = (int*) a;
  if (is_ori(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(1 + p, (x >> 32) & 0xffff);
    set_imm(3 + p, (x >> 16) & 0xffff);
    set_imm(4 + p, x & 0xffff);
  } else if (is_lis(*(p+1))) {
    set_imm(0 + p, (x >> 48) & 0xffff);
    set_imm(2 + p, (x >> 32) & 0xffff);
    set_imm(1 + p, (x >> 16) & 0xffff);
    set_imm(3 + p, x & 0xffff);
  } else {
    ShouldNotReachHere();
  }
}

AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
  assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
  int index = oop_recorder()->allocate_metadata_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
  assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
  int index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  return AddressLiteral((address)obj, rspec);
}

AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
  assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->allocate_oop_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
  assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
}

#ifndef PRODUCT
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT

// Conditional far branch for destinations encodable in 24+2 bits.
void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {

  // If requested by flag optimize, relocate the bc_far as a
  // runtime_call and prepare for optimizing it when the code gets
  // relocated.
  if (optimize == bc_far_optimize_on_relocate) {
    relocate(relocInfo::runtime_call_type);
  }

  // variant 2:
  //
  //    b!cxx SKIP
  //    bxx   DEST
  //  SKIP:
  //

  const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                opposite_bcond(inv_boint_bcond(boint)));

  // We emit two branches.
  // First, a conditional branch which jumps around the far branch.
  const address not_taken_pc = pc() + 2 * BytesPerInstWord;
  const address bc_pc        = pc();
  bc(opposite_boint, biint, not_taken_pc);

  const int bc_instr = *(int*)bc_pc;
  assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
  assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
  assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
                                     opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
         "postcondition");
  assert(biint == inv_bi_field(bc_instr), "postcondition");

  // Second, an unconditional far branch which jumps to dest.
  // Note: target(dest) remembers the current pc (see CodeSection::target)
  // and returns the current pc if the label is not bound yet; when
  // the label gets bound, the unconditional far branch will be patched.
  const address target_pc = target(dest);
  const address b_pc      = pc();
  b(target_pc);

  assert(not_taken_pc == pc(),                 "postcondition");
  assert(dest.is_bound() || target_pc == b_pc, "postcondition");
}

// 1 or 2 instructions
void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
  if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
    bc(boint, biint, dest);
  } else {
    bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
  }
}

bool MacroAssembler::is_bc_far_at(address instruction_addr) {
  return is_bc_far_variant1_at(instruction_addr) ||
         is_bc_far_variant2_at(instruction_addr) ||
         is_bc_far_variant3_at(instruction_addr);
}

address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
  if (is_bc_far_variant1_at(instruction_addr)) {
    const address instruction_1_addr = instruction_addr;
    const int instruction_1 = *(int*)instruction_1_addr;
    return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
  } else if (is_bc_far_variant2_at(instruction_addr)) {
    const address instruction_2_addr = instruction_addr + 4;
    return bxx_destination(instruction_2_addr);
  } else if (is_bc_far_variant3_at(instruction_addr)) {
    return instruction_addr + 8;
  }
  // variant 4 ???
  ShouldNotReachHere();
  return nullptr;
}

void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    endgroup
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.endgroup();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11); // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0); // spill R11 <- R0.
    nop();

    // do the call/jump
    if (link) {
      bctrl();
    } else {
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}

// Identify a bxx64_patchable instruction.
bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
  return is_bxx64_patchable_variant1b_at(instruction_addr, link)
    //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
      || is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}

// Identify variant 1.
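// Layout (sketch): a five-instruction `load_const' of the destination
// (instr[0..4]), followed by mtctr (instr[5]) and bctr[l] (instr[6]).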
bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[5]) // mtctr
      && is_load_const_at(instruction_addr);
}

// Identify variant 1b: load destination relative to global toc.
bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
      && is_mtctr(instr[3]) // mtctr
      && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
}

// Identify variant 2.
bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
  unsigned int* instr = (unsigned int*) instruction_addr;
  if (link) {
    return is_bl (instr[6])  // bl dest is last
        && is_nop(instr[0])  // nop
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5]); // nop
  } else {
    return is_b  (instr[0])  // b dest is first
        && is_nop(instr[1])  // nop
        && is_nop(instr[2])  // nop
        && is_nop(instr[3])  // nop
        && is_nop(instr[4])  // nop
        && is_nop(instr[5])  // nop
        && is_nop(instr[6]); // nop
  }
}

// Set dest address of a bxx64_patchable instruction.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}

// Get dest address of a bxx64_patchable instruction.
address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
  if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
    return (address) (unsigned long) get_const(instruction_addr);
  } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
    unsigned int* instr = (unsigned int*) instruction_addr;
    if (link) {
      const int instr_idx = 6; // bl is last
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    } else {
      const int instr_idx = 0; // b is first
      int branchoffset = branch_destination(instr[instr_idx], 0);
      return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
    }
    // Load dest relative to global toc.
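    // (Variant 1b computes the destination with addis/addi relative to the
    //  global TOC; see calculate_address_from_global_toc.)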
  } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
    return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
                                                               instruction_addr);
  } else {
    ShouldNotReachHere();
    return nullptr;
  }
}

void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
  const int magic_number = 0x42;

  // Preserve stack pointer register (R1_SP) and system thread id register (R13),
  // although they're technically volatile.
  for (int i = 2; i < 13; i++) {
    Register reg = as_Register(i);
    if (reg == excluded_register) {
      continue;
    }

    li(reg, magic_number);
  }
}

void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
  const int magic_number = 0x43;

  li(tmp, magic_number);
  for (int m = 0; m <= 7; m++) {
    std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
  }
}

// Uses ordering which corresponds to ABI:
//    _savegpr0_14:  std  r14,-144(r1)
//    _savegpr0_15:  std  r15,-136(r1)
//    _savegpr0_16:  std  r16,-128(r1)
void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
  std(R14, offset, dst); offset += 8;
  std(R15, offset, dst); offset += 8;
  std(R16, offset, dst); offset += 8;
  std(R17, offset, dst); offset += 8;
  std(R18, offset, dst); offset += 8;
  std(R19, offset, dst); offset += 8;
  std(R20, offset, dst); offset += 8;
  std(R21, offset, dst); offset += 8;
  std(R22, offset, dst); offset += 8;
  std(R23, offset, dst); offset += 8;
  std(R24, offset, dst); offset += 8;
  std(R25, offset, dst); offset += 8;
  std(R26, offset, dst); offset += 8;
  std(R27, offset, dst); offset += 8;
  std(R28, offset, dst); offset += 8;
  std(R29, offset, dst); offset += 8;
  std(R30, offset, dst); offset += 8;
  std(R31, offset, dst); offset += 8;

  stfd(F14, offset, dst); offset += 8;
  stfd(F15, offset, dst); offset += 8;
  stfd(F16, offset, dst); offset += 8;
  stfd(F17, offset, dst); offset += 8;
  stfd(F18, offset, dst); offset += 8;
  stfd(F19, offset, dst); offset += 8;
  stfd(F20, offset, dst); offset += 8;
  stfd(F21, offset, dst); offset += 8;
  stfd(F22, offset, dst); offset += 8;
  stfd(F23, offset, dst); offset += 8;
  stfd(F24, offset, dst); offset += 8;
  stfd(F25, offset, dst); offset += 8;
  stfd(F26, offset, dst); offset += 8;
  stfd(F27, offset, dst); offset += 8;
  stfd(F28, offset, dst); offset += 8;
  stfd(F29, offset, dst); offset += 8;
  stfd(F30, offset, dst); offset += 8;
  stfd(F31, offset, dst);
}

// Uses ordering which corresponds to ABI:
//    _restgpr0_14:  ld   r14,-144(r1)
//    _restgpr0_15:  ld   r15,-136(r1)
//    _restgpr0_16:  ld   r16,-128(r1)
void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
  ld(R14, offset, src); offset += 8;
  ld(R15, offset, src); offset += 8;
  ld(R16, offset, src); offset += 8;
  ld(R17, offset, src); offset += 8;
  ld(R18, offset, src); offset += 8;
  ld(R19, offset, src); offset += 8;
  ld(R20, offset, src); offset += 8;
  ld(R21, offset, src); offset += 8;
  ld(R22, offset, src); offset += 8;
  ld(R23, offset, src); offset += 8;
  ld(R24, offset, src); offset += 8;
  ld(R25, offset, src); offset += 8;
  ld(R26, offset, src); offset += 8;
  ld(R27, offset, src); offset += 8;
  ld(R28, offset, src); offset += 8;
  ld(R29, offset, src); offset += 8;
  ld(R30, offset, src); offset += 8;
  ld(R31, offset, src); offset += 8;

  // FP registers
  lfd(F14, offset, src); offset += 8;
  lfd(F15, offset, src); offset += 8;
  lfd(F16, offset, src); offset += 8;
  lfd(F17, offset, src); offset += 8;
  lfd(F18, offset, src); offset += 8;
  lfd(F19, offset, src); offset += 8;
  lfd(F20, offset, src); offset += 8;
  lfd(F21, offset, src); offset += 8;
  lfd(F22, offset, src); offset += 8;
  lfd(F23, offset, src); offset += 8;
  lfd(F24, offset, src); offset += 8;
  lfd(F25, offset, src); offset += 8;
  lfd(F26, offset, src); offset += 8;
  lfd(F27, offset, src); offset += 8;
  lfd(F28, offset, src); offset += 8;
  lfd(F29, offset, src); offset += 8;
  lfd(F30, offset, src); offset += 8;
  lfd(F31, offset, src);
}

// For verify_oops.
void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  std(R2, offset, dst); offset += 8;
  if (include_R3_RET_reg) {
    std(R3, offset, dst); offset += 8;
  }
  std(R4, offset, dst); offset += 8;
  std(R5, offset, dst); offset += 8;
  std(R6, offset, dst); offset += 8;
  std(R7, offset, dst); offset += 8;
  std(R8, offset, dst); offset += 8;
  std(R9, offset, dst); offset += 8;
  std(R10, offset, dst); offset += 8;
  std(R11, offset, dst); offset += 8;
  std(R12, offset, dst); offset += 8;

  if (include_fp_regs) {
    stfd(F0, offset, dst); offset += 8;
    stfd(F1, offset, dst); offset += 8;
    stfd(F2, offset, dst); offset += 8;
    stfd(F3, offset, dst); offset += 8;
    stfd(F4, offset, dst); offset += 8;
    stfd(F5, offset, dst); offset += 8;
    stfd(F6, offset, dst); offset += 8;
    stfd(F7, offset, dst); offset += 8;
    stfd(F8, offset, dst); offset += 8;
    stfd(F9, offset, dst); offset += 8;
    stfd(F10, offset, dst); offset += 8;
    stfd(F11, offset, dst); offset += 8;
    stfd(F12, offset, dst); offset += 8;
    stfd(F13, offset, dst);
  }
}

// For verify_oops.
void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
  ld(R2, offset, src); offset += 8;
  if (include_R3_RET_reg) {
    ld(R3, offset, src); offset += 8;
  }
  ld(R4, offset, src); offset += 8;
  ld(R5, offset, src); offset += 8;
  ld(R6, offset, src); offset += 8;
  ld(R7, offset, src); offset += 8;
  ld(R8, offset, src); offset += 8;
  ld(R9, offset, src); offset += 8;
  ld(R10, offset, src); offset += 8;
  ld(R11, offset, src); offset += 8;
  ld(R12, offset, src); offset += 8;

  if (include_fp_regs) {
    lfd(F0, offset, src); offset += 8;
    lfd(F1, offset, src); offset += 8;
    lfd(F2, offset, src); offset += 8;
    lfd(F3, offset, src); offset += 8;
    lfd(F4, offset, src); offset += 8;
    lfd(F5, offset, src); offset += 8;
    lfd(F6, offset, src); offset += 8;
    lfd(F7, offset, src); offset += 8;
    lfd(F8, offset, src); offset += 8;
    lfd(F9, offset, src); offset += 8;
    lfd(F10, offset, src); offset += 8;
    lfd(F11, offset, src); offset += 8;
    lfd(F12, offset, src); offset += 8;
    lfd(F13, offset, src);
  }
}

void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);
  std(tmp, _abi0(cr), R1_SP);
  mflr(tmp);
  std(tmp, _abi0(lr), R1_SP);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi0(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi0(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp_1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus native_abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C
// calling conventions. Updates and returns _last_calls_return_pc.
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}

// For tail calls: only branch, don't link, so callee returns to caller of this function.
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}

address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}

#else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}

// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}

address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
  if (rt != relocInfo::none) {
    // this call needs to be relocatable
    if (!ReoptimizeCallSequences
        || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
        || fd == nullptr   // support code-size estimation
        || !fd->is_friend_function()
        || fd->entry() == nullptr) {
      // it's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);

      bool has_env = (fd != nullptr && fd->env() != nullptr);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/has_env);
    } else {
      // It's a friend function. Load the entry point and don't care about
      // toc and env. Use an optimizable call instruction, but ensure the
      // same code-size as in the case of a non-friend function.
      nop();
      nop();
      nop();
      bl64_patchable(fd->entry(), rt);
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  } else {
    // This call does not need to be relocatable, do more aggressive
    // optimizations.
    if (!ReoptimizeCallSequences
        || !fd->is_friend_function()) {
      // It's not a friend function as defined by class FunctionDescriptor,
      // so do a full call-c here.
      load_const(R11, (address)fd, R0);
      return branch_to(R11, /*and_link=*/true,
                            /*save toc=*/false,
                            /*restore toc=*/false,
                            /*load toc=*/true,
                            /*load env=*/true);
    } else {
      // it's a friend function, load the entry point and don't care about
      // toc and env.
      address dest = fd->entry();
      if (is_within_range_of_b(dest, pc())) {
        bl(dest);
      } else {
        bl64_patchable(dest, rt);
      }
      _last_calls_return_pc = pc();
      return _last_calls_return_pc;
    }
  }
}

// Call a C function. All constants needed reside in TOC.
//
// Read the address to call from the TOC.
// Read env from TOC, if fd specifies an env.
// Read new TOC from TOC.
address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
                                         relocInfo::relocType rt, Register toc) {
  if (!ReoptimizeCallSequences
      || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
      || !fd->is_friend_function()) {
    // It's not a friend function as defined by class FunctionDescriptor,
    // so do a full call-c here.
    assert(fd->entry() != nullptr, "function must be linked");

    AddressLiteral fd_entry(fd->entry());
    bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
    mtctr(R11);
    if (fd->env() == nullptr) {
      li(R11, 0);
      nop();
    } else {
      AddressLiteral fd_env(fd->env());
      success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
    }
    AddressLiteral fd_toc(fd->toc());
    // Set R2_TOC (load from toc)
    success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
    bctrl();
    _last_calls_return_pc = pc();
    if (!success) { return nullptr; }
  } else {
    // It's a friend function, load the entry point and don't care about
    // toc and env. Use an optimizable call instruction, but ensure the
    // same code-size as in the case of a non-friend function.
    nop();
    bl64_patchable(fd->entry(), rt);
    _last_calls_return_pc = pc();
  }
  return _last_calls_return_pc;
}
#endif // ABI_ELFv2

void MacroAssembler::post_call_nop() {
  // Make inline again when loom is always enabled.
  if (!Continuations::enabled()) {
    return;
  }
  InlineSkippedInstructionsCounter skipCounter(this);
  nop();
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
#if defined(ABI_ELFv2)
  address return_pc = call_c(entry_point, relocInfo::none);
#else
  address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
#endif

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result(oop_result);
  }

  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}

void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
#if defined(ABI_ELFv2)
  call_c(entry_point, relocInfo::none);
#else
  call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
#endif
  BLOCK_COMMENT("} call_VM_leaf");
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  mr_if_needed(R4_ARG2, arg_1);
  assert(arg_2 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}

void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  mr_if_needed(R3_ARG1, arg_1);
  assert(arg_2 != R3_ARG1, "smashed argument");
  mr_if_needed(R4_ARG2, arg_2);
  assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}

// Check whether instruction is a read access to the polling page
// which was emitted by load_from_polling_page(..).
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.
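  // The checks below accept exactly the form emitted by load_from_polling_page:
  // 'ld r0, 0(ra)' with ra holding the polling page address.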

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != nullptr) {
      *polling_address_ptr = nullptr;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != nullptr) {
    *polling_address_ptr = addr;
  }
  return SafepointMechanism::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be null.
  ShouldNotReachHere();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16 bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//    std   R0,    x(Ry),       (see bang_stack_with_offset())
//    stdu  R1_SP, x(R1_SP),    (see push_frame(), resize_frame())
// or stdux R1_SP, Rx, R1_SP    (see push_frame(), resize_frame())
// return the banged address. Otherwise, return 0.
address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
#ifdef LINUX
  ucontext_t* uc = (ucontext_t*) ucontext;
  int rs = inv_rs_field(instruction);
  int ra = inv_ra_field(instruction);
  if (   (is_ld(instruction)   && rs == 0 &&  UseLoadInstructionsForStackBangingPPC64)
      || (is_std(instruction)  && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
      || (is_stdu(instruction) && rs == 1)) {
    int ds = inv_ds_field(instruction);
    // return banged address
    return ds+(address)uc->uc_mcontext.regs->gpr[ra];
  } else if (is_stdux(instruction) && rs == 1) {
    int rb = inv_rb_field(instruction);
    address sp = (address)uc->uc_mcontext.regs->gpr[1];
    long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
    return ra != 1 || rb_val >= 0 ? nullptr       // not a stack bang
                                  : sp + rb_val;  // banged address
  }
  return nullptr; // not a stack bang
#else
  // workaround not needed on !LINUX :-)
  ShouldNotCallThis();
  return nullptr;
#endif
}

void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CCR0, R1_SP, R0);
  blt_predict_taken(CCR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }
}

// Word/sub-word atomic helper functions

// Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
// Atomic add always kills tmp1.
void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
                                                   Register addr_base, Register tmp1, Register tmp2, Register tmp3,
                                                   bool cmpxchgx_hint, bool is_add, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Label retry;
  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = is_add ? tmp1 : exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
    modval = tmp1;
    shift_amount = tmp2;
    val32 = tmp3;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }

  if (is_add) { add(modval, dest_current_value, exchange_value); }

  if (instruction_type != size) {
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(modval, dest_current_value, is_add ? modval : exchange_value);
    clrldi(modval, modval, (size == 1) ? 56 : 48);
    slw(modval, modval, shift_amount);
    xorr(modval, val32, modval);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }

  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
  } else {
    bne(                  CCR0, retry); // StXcx_ sets CCR0.
  }

  // l?arx zero-extends, but Java wants byte/short values sign-extended.
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  };
}

// Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
// Only signed types are supported with size < 4.
void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
                                       Register compare_value, Register exchange_value,
                                       Register addr_base, Register tmp1, Register tmp2,
                                       Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
  // Sub-word instructions are available since Power 8.
  // For older processors, instruction_type != size holds, and we
  // emulate the sub-word instructions by constructing a 4-byte value
  // that leaves the other bytes unchanged.
  const int instruction_type = VM_Version::has_lqarx() ? size : 4;

  Register shift_amount = noreg,
           val32 = dest_current_value,
           modval = exchange_value;

  if (instruction_type != size) {
    assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
    shift_amount = tmp1;
    val32 = tmp2;
    modval = tmp2;
    // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
#ifdef VM_LITTLE_ENDIAN
    rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
    clrrdi(addr_base, addr_base, 2);
#else
    xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
    clrrdi(addr_base, addr_base, 2);
    rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
#endif
    // Transform exchange value such that the replacement can be done by one xor instruction.
    xorr(exchange_value, compare_value, exchange_value);
    clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
    slw(exchange_value, exchange_value, shift_amount);
  }

  // atomic emulation loop
  bind(retry);

  switch (instruction_type) {
    case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
    case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
    case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
    default: ShouldNotReachHere();
  }

  if (instruction_type != size) {
    srw(dest_current_value, val32, shift_amount);
  }
  if (size == 1) {
    extsb(dest_current_value, dest_current_value);
  } else if (size == 2) {
    extsh(dest_current_value, dest_current_value);
  };

  cmpw(flag, dest_current_value, compare_value);
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(flag, failed);
  } else {
    bne(                  flag, failed);
  }
  // branch to done  => (flag == ne), (dest_current_value != compare_value)
  // fall through    => (flag == eq), (dest_current_value == compare_value)

  if (instruction_type != size) {
    xorr(modval, val32, exchange_value);
  }

  switch (instruction_type) {
    case 4: stwcx_(modval, addr_base); break;
    case 2: sthcx_(modval, addr_base); break;
    case 1: stbcx_(modval, addr_base); break;
    default: ShouldNotReachHere();
  }
}

// CmpxchgX sets condition register to cmpX(current, compare).
void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
                                     Register compare_value, Register exchange_value,
                                     Register addr_base, Register tmp1, Register tmp2,
                                     int semantics, bool cmpxchgx_hint,
                                     Register int_flag_success, bool contention_hint, bool weak, int size) {
  Label retry;
  Label failed;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
                            int_flag_success != exchange_value && int_flag_success != addr_base &&
                            int_flag_success != tmp1 && int_flag_success != tmp2);
  assert(!weak || flag == CCR0, "weak only supported with CCR0");
  assert(size == 1 || size == 2 || size == 4, "unsupported");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    switch (size) {
      case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
      case 2: lha(dest_current_value, 0, addr_base); break;
      case 4: lwz(dest_current_value, 0, addr_base); break;
      default: ShouldNotReachHere();
    }
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
                    retry, failed, cmpxchgx_hint, size);
  if (!weak || use_result_reg) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
      bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
    } else {
      bne(                  CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1652 } 1653 } 1654 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1655 1656 // Result in register (must do this at the end because int_flag_success can be the 1657 // same register as one above). 1658 if (use_result_reg) { 1659 li(int_flag_success, 1); 1660 } 1661 1662 if (semantics & MemBarFenceAfter) { 1663 fence(); 1664 } else if (semantics & MemBarAcq) { 1665 isync(); 1666 } 1667 1668 if (use_result_reg && !preset_result_reg) { 1669 b(done); 1670 } 1671 1672 bind(failed); 1673 if (use_result_reg && !preset_result_reg) { 1674 li(int_flag_success, 0); 1675 } 1676 1677 bind(done); 1678 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1679 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1680 } 1681 1682 // Performs atomic compare exchange: 1683 // if (compare_value == *addr_base) 1684 // *addr_base = exchange_value 1685 // int_flag_success = 1; 1686 // else 1687 // int_flag_success = 0; 1688 // 1689 // ConditionRegister flag = cmp(compare_value, *addr_base) 1690 // Register dest_current_value = *addr_base 1691 // Register compare_value Used to compare with value in memory 1692 // Register exchange_value Written to memory if compare_value == *addr_base 1693 // Register addr_base The memory location to compareXChange 1694 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1695 // 1696 // To avoid the costly compare exchange the value is tested beforehand. 1697 // Several special cases exist to avoid that unnecessary information is generated. 1698 // 1699 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1700 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1701 Register addr_base, int semantics, bool cmpxchgx_hint, 1702 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1703 Label retry; 1704 Label failed_int; 1705 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int; 1706 Label done; 1707 1708 // Save one branch if result is returned via register and result register is different from the other ones. 1709 bool use_result_reg = (int_flag_success!=noreg); 1710 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1711 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1712 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1713 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both"); 1714 1715 if (use_result_reg && preset_result_reg) { 1716 li(int_flag_success, 0); // preset (assume cas failed) 1717 } 1718 1719 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1720 if (contention_hint) { // Don't try to reserve if cmp fails. 
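  // What the guard emitted below does, in rough C-like pseudocode (a sketch for
  // illustration only, not HotSpot code; the names mirror the registers used here):
  //
  //   int64_t cur = *(int64_t*)addr_base;     // plain load, takes no reservation
  //   if (cur != compare_value) goto failed;  // the CAS would fail anyway
  //   // otherwise fall through to the ldarx/stdcx_ retry loop below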
1721 ld(dest_current_value, 0, addr_base); 1722 cmpd(flag, compare_value, dest_current_value); 1723 bne(flag, failed); 1724 } 1725 1726 // release/fence semantics 1727 if (semantics & MemBarRel) { 1728 release(); 1729 } 1730 1731 // atomic emulation loop 1732 bind(retry); 1733 1734 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1735 cmpd(flag, compare_value, dest_current_value); 1736 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1737 bne_predict_not_taken(flag, failed); 1738 } else { 1739 bne( flag, failed); 1740 } 1741 1742 stdcx_(exchange_value, addr_base); 1743 if (!weak || use_result_reg || failed_ext) { 1744 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1745 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1746 } else { 1747 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1748 } 1749 } 1750 1751 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1752 if (use_result_reg) { 1753 li(int_flag_success, 1); 1754 } 1755 1756 if (semantics & MemBarFenceAfter) { 1757 fence(); 1758 } else if (semantics & MemBarAcq) { 1759 isync(); 1760 } 1761 1762 if (use_result_reg && !preset_result_reg) { 1763 b(done); 1764 } 1765 1766 bind(failed_int); 1767 if (use_result_reg && !preset_result_reg) { 1768 li(int_flag_success, 0); 1769 } 1770 1771 bind(done); 1772 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1773 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1774 } 1775 1776 // Look up the method for a megamorphic invokeinterface call. 1777 // The target method is determined by <intf_klass, itable_index>. 1778 // The receiver klass is in recv_klass. 1779 // On success, the result will be in method_result, and execution falls through. 1780 // On failure, execution transfers to the given label. 1781 void MacroAssembler::lookup_interface_method(Register recv_klass, 1782 Register intf_klass, 1783 RegisterOrConstant itable_index, 1784 Register method_result, 1785 Register scan_temp, 1786 Register temp2, 1787 Label& L_no_such_interface, 1788 bool return_method) { 1789 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1790 1791 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1792 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1793 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 1794 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1795 int scan_step = itableOffsetEntry::size() * wordSize; 1796 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1797 1798 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1799 // %%% We should store the aligned, prescaled offset in the klassoop. 1800 // Then the next several instructions would fold away. 1801 1802 sldi(scan_temp, scan_temp, log_vte_size); 1803 addi(scan_temp, scan_temp, vtable_base); 1804 add(scan_temp, recv_klass, scan_temp); 1805 1806 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 
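  // In rough C-like pseudocode (a sketch for illustration only), the address formed
  // in the block below is:
  //
  //   method_result = (address)recv_klass
  //                 + ((intptr_t)itable_index << logMEsize)  // scaled itableMethodEntry index
  //                 + itentry_off;                           // itableMethodEntry::method_offset()
  //
  // The interface's itable offset found by the scan further down is added by the final ldx.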
1807 if (return_method) { 1808 if (itable_index.is_register()) { 1809 Register itable_offset = itable_index.as_register(); 1810 sldi(method_result, itable_offset, logMEsize); 1811 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1812 add(method_result, method_result, recv_klass); 1813 } else { 1814 long itable_offset = (long)itable_index.as_constant(); 1815 // static address, no relocation 1816 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1817 } 1818 } 1819 1820 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) { 1821 // if (scan->interface() == intf) { 1822 // result = (klass + scan->offset() + itable_index); 1823 // } 1824 // } 1825 Label search, found_method; 1826 1827 for (int peel = 1; peel >= 0; peel--) { 1828 // %%%% Could load both offset and interface in one ldx, if they were 1829 // in the opposite order. This would save a load. 1830 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp); 1831 1832 // Check that this entry is non-null. A null entry means that 1833 // the receiver class doesn't implement the interface, and wasn't the 1834 // same as when the caller was compiled. 1835 cmpd(CCR0, temp2, intf_klass); 1836 1837 if (peel) { 1838 beq(CCR0, found_method); 1839 } else { 1840 bne(CCR0, search); 1841 // (invert the test to fall through to found_method...) 1842 } 1843 1844 if (!peel) break; 1845 1846 bind(search); 1847 1848 cmpdi(CCR0, temp2, 0); 1849 beq(CCR0, L_no_such_interface); 1850 addi(scan_temp, scan_temp, scan_step); 1851 } 1852 1853 bind(found_method); 1854 1855 // Got a hit. 1856 if (return_method) { 1857 int ito_offset = in_bytes(itableOffsetEntry::offset_offset()); 1858 lwz(scan_temp, ito_offset, scan_temp); 1859 ldx(method_result, scan_temp, method_result); 1860 } 1861 } 1862 1863 // virtual method calling 1864 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1865 RegisterOrConstant vtable_index, 1866 Register method_result) { 1867 1868 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1869 1870 const ByteSize base = Klass::vtable_start_offset(); 1871 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1872 1873 if (vtable_index.is_register()) { 1874 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1875 add(recv_klass, vtable_index.as_register(), recv_klass); 1876 } else { 1877 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1878 } 1879 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass); 1880 } 1881 1882 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1883 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1884 Register super_klass, 1885 Register temp1_reg, 1886 Register temp2_reg, 1887 Label* L_success, 1888 Label* L_failure, 1889 Label* L_slow_path, 1890 RegisterOrConstant super_check_offset) { 1891 1892 const Register check_cache_offset = temp1_reg; 1893 const Register cached_super = temp2_reg; 1894 1895 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1896 1897 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1898 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1899 1900 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1901 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == 
sco_offset); 1902 1903 Label L_fallthrough; 1904 int label_nulls = 0; 1905 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 1906 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 1907 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; } 1908 assert(label_nulls <= 1 || 1909 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1910 "at most one null in the batch, usually"); 1911 1912 // If the pointers are equal, we are done (e.g., String[] elements). 1913 // This self-check enables sharing of secondary supertype arrays among 1914 // non-primary types such as array-of-interface. Otherwise, each such 1915 // type would need its own customized SSA. 1916 // We move this check to the front of the fast path because many 1917 // type checks are in fact trivially successful in this manner, 1918 // so we get a nicely predicted branch right at the start of the check. 1919 cmpd(CCR0, sub_klass, super_klass); 1920 beq(CCR0, *L_success); 1921 1922 // Check the supertype display: 1923 if (must_load_sco) { 1924 // The super check offset is always positive... 1925 lwz(check_cache_offset, sco_offset, super_klass); 1926 super_check_offset = RegisterOrConstant(check_cache_offset); 1927 // super_check_offset is register. 1928 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 1929 } 1930 // The loaded value is the offset from KlassOopDesc. 1931 1932 ld(cached_super, super_check_offset, sub_klass); 1933 cmpd(CCR0, cached_super, super_klass); 1934 1935 // This check has worked decisively for primary supers. 1936 // Secondary supers are sought in the super_cache ('super_cache_addr'). 1937 // (Secondary supers are interfaces and very deeply nested subtypes.) 1938 // This works in the same check above because of a tricky aliasing 1939 // between the super_cache and the primary super display elements. 1940 // (The 'super_check_addr' can address either, as the case requires.) 1941 // Note that the cache is updated below if it does not help us find 1942 // what we need immediately. 1943 // So if it was a primary super, we can just fail immediately. 1944 // Otherwise, it's the slow path for us (no success at this point). 1945 1946 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 1947 1948 if (super_check_offset.is_register()) { 1949 beq(CCR0, *L_success); 1950 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 1951 if (L_failure == &L_fallthrough) { 1952 beq(CCR0, *L_slow_path); 1953 } else { 1954 bne(CCR0, *L_failure); 1955 FINAL_JUMP(*L_slow_path); 1956 } 1957 } else { 1958 if (super_check_offset.as_constant() == sc_offset) { 1959 // Need a slow path; fast failure is impossible. 1960 if (L_slow_path == &L_fallthrough) { 1961 beq(CCR0, *L_success); 1962 } else { 1963 bne(CCR0, *L_slow_path); 1964 FINAL_JUMP(*L_success); 1965 } 1966 } else { 1967 // No slow path; it's a fast decision. 
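  // For reference, the fast-path decision emitted by this function corresponds roughly to
  // this C-like sketch (illustration only; 'sco' stands for the super check offset used above):
  //
  //   if (sub_klass == super_klass)                              goto L_success;
  //   if (*(Klass**)((uintptr_t)sub_klass + sco) == super_klass) goto L_success;
  //   if (sco != sc_offset)                                      goto L_failure;   // primary super miss is final
  //   goto L_slow_path;                                          // must search the secondary supers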
1968 if (L_failure == &L_fallthrough) { 1969 beq(CCR0, *L_success); 1970 } else { 1971 bne(CCR0, *L_failure); 1972 FINAL_JUMP(*L_success); 1973 } 1974 } 1975 } 1976 1977 bind(L_fallthrough); 1978 #undef FINAL_JUMP 1979 } 1980 1981 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 1982 Register super_klass, 1983 Register temp1_reg, 1984 Register temp2_reg, 1985 Label* L_success, 1986 Register result_reg) { 1987 const Register array_ptr = temp1_reg; // current value from cache array 1988 const Register temp = temp2_reg; 1989 1990 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 1991 1992 int source_offset = in_bytes(Klass::secondary_supers_offset()); 1993 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 1994 1995 int length_offset = Array<Klass*>::length_offset_in_bytes(); 1996 int base_offset = Array<Klass*>::base_offset_in_bytes(); 1997 1998 Label hit, loop, failure, fallthru; 1999 2000 ld(array_ptr, source_offset, sub_klass); 2001 2002 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2003 lwz(temp, length_offset, array_ptr); 2004 cmpwi(CCR0, temp, 0); 2005 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2006 2007 mtctr(temp); // load ctr 2008 2009 bind(loop); 2010 // Oops in table are NO MORE compressed. 2011 ld(temp, base_offset, array_ptr); 2012 cmpd(CCR0, temp, super_klass); 2013 beq(CCR0, hit); 2014 addi(array_ptr, array_ptr, BytesPerWord); 2015 bdnz(loop); 2016 2017 bind(failure); 2018 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2019 b(fallthru); 2020 2021 bind(hit); 2022 std(super_klass, target_offset, sub_klass); // save result to cache 2023 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2024 if (L_success != nullptr) { b(*L_success); } 2025 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2026 2027 bind(fallthru); 2028 } 2029 2030 // Try fast path, then go to slow one if not successful 2031 void MacroAssembler::check_klass_subtype(Register sub_klass, 2032 Register super_klass, 2033 Register temp1_reg, 2034 Register temp2_reg, 2035 Label& L_success) { 2036 Label L_failure; 2037 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2038 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2039 bind(L_failure); // Fallthru if not successful. 
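  // Taken together, the fast and slow paths invoked above behave roughly like this
  // C-like sketch (illustration only, not the exact runtime code):
  //
  //   // fast path: pointer equality and supertype display check
  //   // slow path: linear scan of the secondary supers array, roughly:
  //   for (Klass* k : *sub_klass->secondary_supers()) {
  //     if (k == super_klass) {
  //       sub_klass->_secondary_super_cache = super_klass;   // remember the hit
  //       goto L_success;
  //     }
  //   }
  //   // fall through: not a subtype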
2040 } 2041 2042 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2043 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required"); 2044 2045 Label L_fallthrough; 2046 if (L_fast_path == nullptr) { 2047 L_fast_path = &L_fallthrough; 2048 } else if (L_slow_path == nullptr) { 2049 L_slow_path = &L_fallthrough; 2050 } 2051 2052 // Fast path check: class is fully initialized 2053 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2054 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2055 beq(CCR0, *L_fast_path); 2056 2057 // Fast path check: current thread is initializer thread 2058 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2059 cmpd(CCR0, thread, R0); 2060 if (L_slow_path == &L_fallthrough) { 2061 beq(CCR0, *L_fast_path); 2062 } else if (L_fast_path == &L_fallthrough) { 2063 bne(CCR0, *L_slow_path); 2064 } else { 2065 Unimplemented(); 2066 } 2067 2068 bind(L_fallthrough); 2069 } 2070 2071 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2072 Register temp_reg, 2073 int extra_slot_offset) { 2074 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 2075 int stackElementSize = Interpreter::stackElementSize; 2076 int offset = extra_slot_offset * stackElementSize; 2077 if (arg_slot.is_constant()) { 2078 offset += arg_slot.as_constant() * stackElementSize; 2079 return offset; 2080 } else { 2081 assert(temp_reg != noreg, "must specify"); 2082 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2083 if (offset != 0) 2084 addi(temp_reg, temp_reg, offset); 2085 return temp_reg; 2086 } 2087 } 2088 2089 void MacroAssembler::tlab_allocate( 2090 Register obj, // result: pointer to object after successful allocation 2091 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2092 int con_size_in_bytes, // object size in bytes if known at compile time 2093 Register t1, // temp register 2094 Label& slow_case // continuation point if fast allocation fails 2095 ) { 2096 // make sure arguments make sense 2097 assert_different_registers(obj, var_size_in_bytes, t1); 2098 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2099 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2100 2101 const Register new_top = t1; 2102 //verify_tlab(); not implemented 2103 2104 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2105 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2106 if (var_size_in_bytes == noreg) { 2107 addi(new_top, obj, con_size_in_bytes); 2108 } else { 2109 add(new_top, obj, var_size_in_bytes); 2110 } 2111 cmpld(CCR0, new_top, R0); 2112 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2113 2114 #ifdef ASSERT 2115 // make sure new free pointer is properly aligned 2116 { 2117 Label L; 2118 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2119 beq(CCR0, L); 2120 stop("updated TLAB free is not properly aligned"); 2121 bind(L); 2122 } 2123 #endif // ASSERT 2124 2125 // update the tlab top pointer 2126 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2127 //verify_tlab(); not implemented 2128 } 2129 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2130 unimplemented("incr_allocated_bytes"); 2131 } 2132 2133 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 
2134 int insts_call_instruction_offset, Register Rtoc) { 2135 // Start the stub. 2136 address stub = start_a_stub(64); 2137 if (stub == nullptr) { return nullptr; } // CodeCache full: bail out 2138 2139 // Create a trampoline stub relocation which relates this trampoline stub 2140 // with the call instruction at insts_call_instruction_offset in the 2141 // instructions code-section. 2142 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2143 const int stub_start_offset = offset(); 2144 2145 // For java_to_interp stubs we use R11_scratch1 as scratch register 2146 // and in call trampoline stubs we use R12_scratch2. This way we 2147 // can distinguish them (see is_NativeCallTrampolineStub_at()). 2148 Register reg_scratch = R12_scratch2; 2149 2150 // Now, create the trampoline stub's code: 2151 // - load the TOC 2152 // - load the call target from the constant pool 2153 // - call 2154 if (Rtoc == noreg) { 2155 calculate_address_from_global_toc(reg_scratch, method_toc()); 2156 Rtoc = reg_scratch; 2157 } 2158 2159 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2160 mtctr(reg_scratch); 2161 bctr(); 2162 2163 const address stub_start_addr = addr_at(stub_start_offset); 2164 2165 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2166 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2167 "encoded offset into the constant pool must match"); 2168 // Trampoline_stub_size should be good. 2169 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2170 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2171 2172 // End the stub. 2173 end_a_stub(); 2174 return stub; 2175 } 2176 2177 // "The box" is the space on the stack where we copy the object mark. 2178 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2179 Register temp, Register displaced_header, Register current_header) { 2180 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight"); 2181 assert_different_registers(oop, box, temp, displaced_header, current_header); 2182 Label object_has_monitor; 2183 Label cas_failed; 2184 Label success, failure; 2185 2186 // Load markWord from object into displaced_header. 2187 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2188 2189 if (DiagnoseSyncOnValueBasedClasses != 0) { 2190 load_klass(temp, oop); 2191 lwz(temp, in_bytes(Klass::access_flags_offset()), temp); 2192 testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2193 bne(flag, failure); 2194 } 2195 2196 // Handle existing monitor. 2197 // The object has an existing monitor iff (mark & monitor_value) != 0. 2198 andi_(temp, displaced_header, markWord::monitor_value); 2199 bne(CCR0, object_has_monitor); 2200 2201 if (LockingMode == LM_MONITOR) { 2202 // Set NE to indicate 'failure' -> take slow-path. 2203 crandc(flag, Assembler::equal, flag, Assembler::equal); 2204 b(failure); 2205 } else { 2206 assert(LockingMode == LM_LEGACY, "must be"); 2207 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2208 ori(displaced_header, displaced_header, markWord::unlocked_value); 2209 2210 // Load Compare Value application register. 2211 2212 // Initialize the box. (Must happen before we update the object mark!) 
2213 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2214 2215 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2216 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2217 cmpxchgd(/*flag=*/flag, 2218 /*current_value=*/current_header, 2219 /*compare_value=*/displaced_header, 2220 /*exchange_value=*/box, 2221 /*where=*/oop, 2222 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2223 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2224 noreg, 2225 &cas_failed, 2226 /*check without membar and ldarx first*/true); 2227 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2228 // If the compare-and-exchange succeeded, then we found an unlocked 2229 // object and we have now locked it. 2230 b(success); 2231 2232 bind(cas_failed); 2233 // We did not see an unlocked object so try the fast recursive case. 2234 2235 // Check if the owner is self by comparing the value in the markWord of object 2236 // (current_header) with the stack pointer. 2237 sub(current_header, current_header, R1_SP); 2238 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2239 2240 and_(R0/*==0?*/, current_header, temp); 2241 // If condition is true we are cont and hence we can store 0 as the 2242 // displaced header in the box, which indicates that it is a recursive lock. 2243 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2244 2245 if (flag != CCR0) { 2246 mcrf(flag, CCR0); 2247 } 2248 beq(CCR0, success); 2249 b(failure); 2250 } 2251 2252 // Handle existing monitor. 2253 bind(object_has_monitor); 2254 // The object's monitor m is unlocked iff m->owner is null, 2255 // otherwise m->owner may contain a thread or a stack address. 2256 2257 // Try to CAS m->owner from null to current thread. 2258 addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value); 2259 cmpxchgd(/*flag=*/flag, 2260 /*current_value=*/current_header, 2261 /*compare_value=*/(intptr_t)0, 2262 /*exchange_value=*/R16_thread, 2263 /*where=*/temp, 2264 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2265 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2266 2267 // Store a non-null value into the box. 2268 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2269 beq(flag, success); 2270 2271 // Check for recursive locking. 2272 cmpd(flag, current_header, R16_thread); 2273 bne(flag, failure); 2274 2275 // Current thread already owns the lock. Just increment recursions. 
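  // The inflated-monitor path (the cmpxchgd above plus the increment below), as a rough
  // C-like sketch (illustration only; 'CAS' stands for an atomic compare-and-swap):
  //
  //   if (CAS(&monitor->owner, nullptr, current_thread)) goto success;   // uncontended
  //   if (monitor->owner != current_thread)              goto failure;   // contended, take slow path
  //   monitor->recursions++;                                             // we already own it
  //   goto success;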
  Register recursions = displaced_header;
  ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
  addi(recursions, recursions, 1);
  std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);

  // flag == EQ indicates success, increment held monitor count
  // flag == NE indicates failure
  bind(success);
  inc_held_monitor_count(temp);
  bind(failure);
}

void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
                                                 Register temp, Register displaced_header, Register current_header) {
  assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight");
  assert_different_registers(oop, box, temp, displaced_header, current_header);
  Label success, failure, object_has_monitor, notRecursive;

  if (LockingMode == LM_LEGACY) {
    // Find the lock address and load the displaced header from the stack.
    ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);

    // If the displaced header is 0, we have a recursive unlock.
    cmpdi(flag, displaced_header, 0);
    beq(flag, success);
  }

  // Handle existing monitor.
  // The object has an existing monitor iff (mark & monitor_value) != 0.
  ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
  andi_(R0, current_header, markWord::monitor_value);
  bne(CCR0, object_has_monitor);

  if (LockingMode == LM_MONITOR) {
    // Set NE to indicate 'failure' -> take slow-path.
    crandc(flag, Assembler::equal, flag, Assembler::equal);
    b(failure);
  } else {
    assert(LockingMode == LM_LEGACY, "must be");
    // Check if it is still a lightweight lock; this is true if we see
    // the stack address of the basicLock in the markWord of the object.
    // Cmpxchg sets flag to cmpd(current_header, box).
    cmpxchgd(/*flag=*/flag,
             /*current_value=*/current_header,
             /*compare_value=*/box,
             /*exchange_value=*/displaced_header,
             /*where=*/oop,
             MacroAssembler::MemBarRel,
             MacroAssembler::cmpxchgx_hint_release_lock(),
             noreg,
             &failure);
    assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
    b(success);
  }

  // Handle existing monitor.
  bind(object_has_monitor);
  STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
  addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
  ld(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);

  // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0.
  // This is handled like owner thread mismatches: We take the slow path.
  cmpd(flag, temp, R16_thread);
  bne(flag, failure);

  ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);

  addic_(displaced_header, displaced_header, -1);
  blt(CCR0, notRecursive); // Not recursive if negative after decrement.
  std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
  if (flag == CCR0) { // Otherwise, flag is already EQ, here.
    crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ
  }
  b(success);

  bind(notRecursive);
  ld(temp, in_bytes(ObjectMonitor::EntryList_offset()), current_header);
  ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header);
  orr(temp, temp, displaced_header); // Will be 0 if both are 0.
  cmpdi(flag, temp, 0);
  bne(flag, failure);
  release();
  std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);

  // flag == EQ indicates success, decrement held monitor count
  // flag == NE indicates failure
  bind(success);
  dec_held_monitor_count(temp);
  bind(failure);
}

void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1,
                                                           Register tmp2, Register tmp3) {
  assert_different_registers(obj, tmp1, tmp2, tmp3);
  assert(flag == CCR0, "bad condition register");

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST reach to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    load_klass(tmp1, obj);
    lwz(tmp1, in_bytes(Klass::access_flags_offset()), tmp1);
    testbitdi(flag, R0, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
    bne(flag, slow_path);
  }

  const Register mark = tmp1;
  const Register t = tmp3; // Usage of R0 allowed!

  { // Lightweight locking

    // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ
    Label push;

    const Register top = tmp2;

    // Check if lock-stack is full.
    lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
    cmplwi(flag, top, LockStack::end_offset() - 1);
    bgt(flag, slow_path);

    // The underflow check is elided. The recursive check will always fail
    // when the lock stack is empty because of the _bad_oop_sentinel field.

    // Check if recursive.
    subi(t, top, oopSize);
    ldx(t, R16_thread, t);
    cmpd(flag, obj, t);
    beq(flag, push);

    // Check for monitor (0b10) or locked (0b00).
    ld(mark, oopDesc::mark_offset_in_bytes(), obj);
    andi_(t, mark, markWord::lock_mask_in_place);
    cmpldi(flag, t, markWord::unlocked_value);
    bgt(flag, inflated);
    bne(flag, slow_path);

    // Not inflated.

    // Try to lock. Transition lock bits 0b01 => 0b00
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
    atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq);

    bind(push);
    // After successful lock, push object on lock-stack.
    stdx(obj, R16_thread, top);
    addi(top, top, oopSize);
    stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const Register tagged_monitor = mark;
    const uintptr_t monitor_tag = markWord::monitor_value;
    const Register owner_addr = tmp2;

    // Compute owner address.
    addi(owner_addr, tagged_monitor, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag);

    // CAS owner (null => current thread).
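    // For context, the lock-stack fast path above is roughly (C-like sketch, illustration
    // only; push/top stand for the per-thread lock-stack operations):
    //
    //   if (lock-stack is full)                    goto slow_path;
    //   if (top entry == obj)                      { push(obj); goto locked; }  // recursive
    //   bits = obj->mark & lock_mask_in_place;
    //   if (bits == monitor_value /*0b10*/)        goto inflated;
    //   if (bits != unlocked_value /*0b01*/)       goto slow_path;              // already locked
    //   if (!CAS(&obj->mark, mark /*..01*/, mark ^ 1 /*..00*/)) goto slow_path;
    //   push(obj); goto locked;
    //
    // The inflated case here instead tries to CAS the monitor's owner from null to the
    // current thread, falling back to a recursion bump if we already own it.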
2444 cmpxchgd(/*flag=*/flag, 2445 /*current_value=*/t, 2446 /*compare_value=*/(intptr_t)0, 2447 /*exchange_value=*/R16_thread, 2448 /*where=*/owner_addr, 2449 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2450 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2451 beq(flag, locked); 2452 2453 // Check if recursive. 2454 cmpd(flag, t, R16_thread); 2455 bne(flag, slow_path); 2456 2457 // Recursive. 2458 ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2459 addi(tmp1, tmp1, 1); 2460 std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2461 } 2462 2463 bind(locked); 2464 inc_held_monitor_count(tmp1); 2465 2466 #ifdef ASSERT 2467 // Check that locked label is reached with flag == EQ. 2468 Label flag_correct; 2469 beq(flag, flag_correct); 2470 stop("Fast Lock Flag != EQ"); 2471 #endif 2472 bind(slow_path); 2473 #ifdef ASSERT 2474 // Check that slow_path label is reached with flag == NE. 2475 bne(flag, flag_correct); 2476 stop("Fast Lock Flag != NE"); 2477 bind(flag_correct); 2478 #endif 2479 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 2480 } 2481 2482 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1, 2483 Register tmp2, Register tmp3) { 2484 assert_different_registers(obj, tmp1, tmp2, tmp3); 2485 assert(flag == CCR0, "bad condition register"); 2486 2487 // Handle inflated monitor. 2488 Label inflated, inflated_load_monitor; 2489 // Finish fast unlock successfully. MUST reach to with flag == EQ. 2490 Label unlocked; 2491 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE. 2492 Label slow_path; 2493 2494 const Register mark = tmp1; 2495 const Register top = tmp2; 2496 const Register t = tmp3; 2497 2498 { // Lightweight unlock 2499 Label push_and_slow; 2500 2501 // Check if obj is top of lock-stack. 2502 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2503 subi(top, top, oopSize); 2504 ldx(t, R16_thread, top); 2505 cmpd(flag, obj, t); 2506 // Top of lock stack was not obj. Must be monitor. 2507 bne(flag, inflated_load_monitor); 2508 2509 // Pop lock-stack. 2510 DEBUG_ONLY(li(t, 0);) 2511 DEBUG_ONLY(stdx(t, R16_thread, top);) 2512 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2513 2514 // The underflow check is elided. The recursive check will always fail 2515 // when the lock stack is empty because of the _bad_oop_sentinel field. 2516 2517 // Check if recursive. 2518 subi(t, top, oopSize); 2519 ldx(t, R16_thread, t); 2520 cmpd(flag, obj, t); 2521 beq(flag, unlocked); 2522 2523 // Not recursive. 2524 2525 // Check for monitor (0b10). 2526 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2527 andi_(t, mark, markWord::monitor_value); 2528 bne(CCR0, inflated); 2529 2530 #ifdef ASSERT 2531 // Check header not unlocked (0b01). 2532 Label not_unlocked; 2533 andi_(t, mark, markWord::unlocked_value); 2534 beq(CCR0, not_unlocked); 2535 stop("lightweight_unlock already unlocked"); 2536 bind(not_unlocked); 2537 #endif 2538 2539 // Try to unlock. Transition lock bits 0b00 => 0b01 2540 atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel); 2541 b(unlocked); 2542 2543 bind(push_and_slow); 2544 // Restore lock-stack and handle the unlock in runtime. 
2545 DEBUG_ONLY(stdx(obj, R16_thread, top);) 2546 addi(top, top, oopSize); 2547 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2548 b(slow_path); 2549 } 2550 2551 { // Handle inflated monitor. 2552 bind(inflated_load_monitor); 2553 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2554 #ifdef ASSERT 2555 andi_(t, mark, markWord::monitor_value); 2556 bne(CCR0, inflated); 2557 stop("Fast Unlock not monitor"); 2558 #endif 2559 2560 bind(inflated); 2561 2562 #ifdef ASSERT 2563 Label check_done; 2564 subi(top, top, oopSize); 2565 cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset())); 2566 blt(CCR0, check_done); 2567 ldx(t, R16_thread, top); 2568 cmpd(flag, obj, t); 2569 bne(flag, inflated); 2570 stop("Fast Unlock lock on stack"); 2571 bind(check_done); 2572 #endif 2573 2574 // mark contains the tagged ObjectMonitor*. 2575 const Register monitor = mark; 2576 const uintptr_t monitor_tag = markWord::monitor_value; 2577 2578 // Untag the monitor. 2579 subi(monitor, mark, monitor_tag); 2580 2581 const Register recursions = tmp2; 2582 Label not_recursive; 2583 2584 // Check if recursive. 2585 ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2586 addic_(recursions, recursions, -1); 2587 blt(CCR0, not_recursive); 2588 2589 // Recursive unlock. 2590 std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2591 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); 2592 b(unlocked); 2593 2594 bind(not_recursive); 2595 2596 Label release_; 2597 const Register t2 = tmp2; 2598 2599 // Check if the entry lists are empty. 2600 ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor); 2601 ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor); 2602 orr(t, t, t2); 2603 cmpdi(flag, t, 0); 2604 beq(flag, release_); 2605 2606 // The owner may be anonymous and we removed the last obj entry in 2607 // the lock-stack. This loses the information about the owner. 2608 // Write the thread to the owner field so the runtime knows the owner. 2609 std(R16_thread, in_bytes(ObjectMonitor::owner_offset()), monitor); 2610 b(slow_path); 2611 2612 bind(release_); 2613 // Set owner to null. 2614 release(); 2615 // t contains 0 2616 std(t, in_bytes(ObjectMonitor::owner_offset()), monitor); 2617 } 2618 2619 bind(unlocked); 2620 dec_held_monitor_count(t); 2621 2622 #ifdef ASSERT 2623 // Check that unlocked label is reached with flag == EQ. 2624 Label flag_correct; 2625 beq(flag, flag_correct); 2626 stop("Fast Lock Flag != EQ"); 2627 #endif 2628 bind(slow_path); 2629 #ifdef ASSERT 2630 // Check that slow_path label is reached with flag == NE. 2631 bne(flag, flag_correct); 2632 stop("Fast Lock Flag != NE"); 2633 bind(flag_correct); 2634 #endif 2635 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 2636 } 2637 2638 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) { 2639 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread); 2640 2641 if (at_return) { 2642 if (in_nmethod) { 2643 if (UseSIGTRAP) { 2644 // Use Signal Handler. 2645 relocate(relocInfo::poll_return_type); 2646 td(traptoGreaterThanUnsigned, R1_SP, temp); 2647 } else { 2648 cmpld(CCR0, R1_SP, temp); 2649 // Stub may be out of range for short conditional branch. 2650 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path); 2651 } 2652 } else { // Not in nmethod. 2653 // Frame still on stack, need to get fp. 
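      // Roughly, in C-like pseudocode (a sketch for illustration only), this branch tests:
      //
      //   uintptr_t poll_word = thread->polling_word();    // loaded into 'temp' above
      //   uintptr_t caller_sp = /* callers_sp slot in this frame's ABI area */;
      //   if (caller_sp > poll_word) goto slow_path;        // safepoint pending or frame above the stack watermark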
2654 Register fp = R0; 2655 ld(fp, _abi0(callers_sp), R1_SP); 2656 cmpld(CCR0, fp, temp); 2657 bgt(CCR0, slow_path); 2658 } 2659 } else { // Normal safepoint poll. Not at return. 2660 assert(!in_nmethod, "should use load_from_polling_page"); 2661 andi_(temp, temp, SafepointMechanism::poll_bit()); 2662 bne(CCR0, slow_path); 2663 } 2664 } 2665 2666 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, 2667 MacroAssembler::PreservationLevel preservation_level) { 2668 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2669 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level); 2670 } 2671 2672 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2, 2673 MacroAssembler::PreservationLevel preservation_level) { 2674 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2675 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level); 2676 } 2677 2678 // Values for last_Java_pc, and last_Java_sp must comply to the rules 2679 // in frame_ppc.hpp. 2680 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 2681 // Always set last_Java_pc and flags first because once last_Java_sp 2682 // is visible has_last_Java_frame is true and users will look at the 2683 // rest of the fields. (Note: flags should always be zero before we 2684 // get here so doesn't need to be set.) 2685 2686 // Verify that last_Java_pc was zeroed on return to Java 2687 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 2688 "last_Java_pc not zeroed before leaving Java"); 2689 2690 // When returning from calling out from Java mode the frame anchor's 2691 // last_Java_pc will always be set to null. It is set here so that 2692 // if we are doing a call to native (not VM) that we capture the 2693 // known pc and don't have to rely on the native call having a 2694 // standard frame linkage where we can find the pc. 2695 if (last_Java_pc != noreg) 2696 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2697 2698 // Set last_Java_sp last. 2699 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2700 } 2701 2702 void MacroAssembler::reset_last_Java_frame(void) { 2703 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 2704 R16_thread, "SP was not set, still zero"); 2705 2706 BLOCK_COMMENT("reset_last_Java_frame {"); 2707 li(R0, 0); 2708 2709 // _last_Java_sp = 0 2710 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2711 2712 // _last_Java_pc = 0 2713 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2714 BLOCK_COMMENT("} reset_last_Java_frame"); 2715 } 2716 2717 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 2718 assert_different_registers(sp, tmp1); 2719 2720 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 2721 // TOP_IJAVA_FRAME_ABI. 2722 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 
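  // Publishing the anchor (done by set_last_Java_frame below) follows the ordering rules
  // referenced above; as a rough sketch (illustration only, field names abbreviated):
  //
  //   thread->anchor.last_Java_pc = pc;   // safe: frame not yet visible
  //   thread->anchor.last_Java_sp = sp;   // once non-null, has_last_Java_frame() is true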
2723 address entry = pc(); 2724 load_const_optimized(tmp1, entry); 2725 2726 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 2727 } 2728 2729 void MacroAssembler::get_vm_result(Register oop_result) { 2730 // Read: 2731 // R16_thread 2732 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2733 // 2734 // Updated: 2735 // oop_result 2736 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2737 2738 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2739 li(R0, 0); 2740 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2741 2742 verify_oop(oop_result, FILE_AND_LINE); 2743 } 2744 2745 void MacroAssembler::get_vm_result_2(Register metadata_result) { 2746 // Read: 2747 // R16_thread 2748 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2749 // 2750 // Updated: 2751 // metadata_result 2752 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2753 2754 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2755 li(R0, 0); 2756 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2757 } 2758 2759 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 2760 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 2761 if (CompressedKlassPointers::base() != 0) { 2762 // Use dst as temp if it is free. 2763 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 2764 current = dst; 2765 } 2766 if (CompressedKlassPointers::shift() != 0) { 2767 srdi(dst, current, CompressedKlassPointers::shift()); 2768 current = dst; 2769 } 2770 return current; 2771 } 2772 2773 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 2774 if (UseCompressedClassPointers) { 2775 Register compressedKlass = encode_klass_not_null(ck, klass); 2776 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 2777 } else { 2778 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 2779 } 2780 } 2781 2782 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 2783 if (UseCompressedClassPointers) { 2784 if (val == noreg) { 2785 val = R0; 2786 li(val, 0); 2787 } 2788 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 2789 } 2790 } 2791 2792 int MacroAssembler::instr_size_for_decode_klass_not_null() { 2793 static int computed_size = -1; 2794 2795 // Not yet computed? 2796 if (computed_size == -1) { 2797 2798 if (!UseCompressedClassPointers) { 2799 computed_size = 0; 2800 } else { 2801 // Determine by scratch emit. 2802 ResourceMark rm; 2803 int code_size = 8 * BytesPerInstWord; 2804 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0); 2805 MacroAssembler* a = new MacroAssembler(&cb); 2806 a->decode_klass_not_null(R11_scratch1); 2807 computed_size = a->offset(); 2808 } 2809 } 2810 2811 return computed_size; 2812 } 2813 2814 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 2815 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 2816 if (src == noreg) src = dst; 2817 Register shifted_src = src; 2818 if (CompressedKlassPointers::shift() != 0 || 2819 (CompressedKlassPointers::base() == 0 && src != dst)) { // Move required. 
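    // What encode/decode compute, as a rough C-like sketch (illustration only):
    //
    //   narrowKlass encode(Klass* k)      { return (uint32_t)(((uintptr_t)k - base) >> shift); }
    //   Klass*      decode(narrowKlass n) { return (Klass*)(base + ((uintptr_t)n << shift)); }
    //
    // with the base add/subtract and the shift omitted when they are zero.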
2820 shifted_src = dst; 2821 sldi(shifted_src, src, CompressedKlassPointers::shift()); 2822 } 2823 if (CompressedKlassPointers::base() != 0) { 2824 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 2825 } 2826 } 2827 2828 void MacroAssembler::load_klass(Register dst, Register src) { 2829 if (UseCompressedClassPointers) { 2830 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 2831 // Attention: no null check here! 2832 decode_klass_not_null(dst, dst); 2833 } else { 2834 ld(dst, oopDesc::klass_offset_in_bytes(), src); 2835 } 2836 } 2837 2838 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) { 2839 null_check(src, oopDesc::klass_offset_in_bytes(), is_null); 2840 load_klass(dst, src); 2841 } 2842 2843 // ((OopHandle)result).resolve(); 2844 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2, 2845 MacroAssembler::PreservationLevel preservation_level) { 2846 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level); 2847 } 2848 2849 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2, 2850 MacroAssembler::PreservationLevel preservation_level) { 2851 Label resolved; 2852 2853 // A null weak handle resolves to null. 2854 cmpdi(CCR0, result, 0); 2855 beq(CCR0, resolved); 2856 2857 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2, 2858 preservation_level); 2859 bind(resolved); 2860 } 2861 2862 void MacroAssembler::load_method_holder(Register holder, Register method) { 2863 ld(holder, in_bytes(Method::const_offset()), method); 2864 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 2865 ld(holder, ConstantPool::pool_holder_offset(), holder); 2866 } 2867 2868 // Clear Array 2869 // For very short arrays. tmp == R0 is allowed. 2870 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 2871 if (cnt_dwords > 0) { li(tmp, 0); } 2872 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 2873 } 2874 2875 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 2876 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 2877 if (cnt_dwords < 8) { 2878 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 2879 return; 2880 } 2881 2882 Label loop; 2883 const long loopcnt = cnt_dwords >> 1, 2884 remainder = cnt_dwords & 1; 2885 2886 li(tmp, loopcnt); 2887 mtctr(tmp); 2888 li(tmp, 0); 2889 bind(loop); 2890 std(tmp, 0, base_ptr); 2891 std(tmp, 8, base_ptr); 2892 addi(base_ptr, base_ptr, 16); 2893 bdnz(loop); 2894 if (remainder) { std(tmp, 0, base_ptr); } 2895 } 2896 2897 // Kills both input registers. tmp == R0 is allowed. 2898 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 2899 // Procedure for large arrays (uses data cache block zero instruction). 2900 Label startloop, fast, fastloop, small_rest, restloop, done; 2901 const int cl_size = VM_Version::L1_data_cache_line_size(), 2902 cl_dwords = cl_size >> 3, 2903 cl_dw_addr_bits = exact_log2(cl_dwords), 2904 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 2905 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 2906 2907 if (const_cnt >= 0) { 2908 // Constant case. 
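    // Overall clearing strategy of this routine, as a rough C-like sketch (illustration only,
    // with 'p' a uint64_t* view of base_ptr):
    //
    //   while (p not cache-line aligned && cnt_dwords > 0) { *p++ = 0; cnt_dwords--; }       // startloop
    //   while (cnt_dwords >= cl_dwords) { dcbz(p); p += cl_dwords; cnt_dwords -= cl_dwords; } // fastloop
    //   while (cnt_dwords-- > 0) { *p++ = 0; }                                                // restloop
    //
    // where dcbz zeroes one whole L1 data cache line in a single instruction.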
2909 if (const_cnt < min_cnt) { 2910 clear_memory_constlen(base_ptr, const_cnt, tmp); 2911 return; 2912 } 2913 load_const_optimized(cnt_dwords, const_cnt, tmp); 2914 } else { 2915 // cnt_dwords already loaded in register. Need to check size. 2916 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 2917 blt(CCR1, small_rest); 2918 } 2919 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 2920 beq(CCR0, fast); // Already 128byte aligned. 2921 2922 subfic(tmp, tmp, cl_dwords); 2923 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 2924 subf(cnt_dwords, tmp, cnt_dwords); // rest. 2925 li(tmp, 0); 2926 2927 bind(startloop); // Clear at the beginning to reach 128byte boundary. 2928 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 2929 addi(base_ptr, base_ptr, 8); 2930 bdnz(startloop); 2931 2932 bind(fast); // Clear 128byte blocks. 2933 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 2934 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 2935 mtctr(tmp); // Load counter. 2936 2937 bind(fastloop); 2938 dcbz(base_ptr); // Clear 128byte aligned block. 2939 addi(base_ptr, base_ptr, cl_size); 2940 bdnz(fastloop); 2941 2942 bind(small_rest); 2943 cmpdi(CCR0, cnt_dwords, 0); // size 0? 2944 beq(CCR0, done); // rest == 0 2945 li(tmp, 0); 2946 mtctr(cnt_dwords); // Load counter. 2947 2948 bind(restloop); // Clear rest. 2949 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 2950 addi(base_ptr, base_ptr, 8); 2951 bdnz(restloop); 2952 2953 bind(done); 2954 } 2955 2956 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 2957 2958 // Helpers for Intrinsic Emitters 2959 // 2960 // Revert the byte order of a 32bit value in a register 2961 // src: 0x44556677 2962 // dst: 0x77665544 2963 // Three steps to obtain the result: 2964 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 2965 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 2966 // This value initializes dst. 2967 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 2968 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 2969 // This value is mask inserted into dst with a [0..23] mask of 1s. 2970 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 2971 // This value is mask inserted into dst with a [8..15] mask of 1s. 2972 void MacroAssembler::load_reverse_32(Register dst, Register src) { 2973 assert_different_registers(dst, src); 2974 2975 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 2976 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 2977 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 2978 } 2979 2980 // Calculate the column addresses of the crc32 lookup table into distinct registers. 2981 // This loop-invariant calculation is moved out of the loop body, reducing the loop 2982 // body size from 20 to 16 instructions. 2983 // Returns the offset that was used to calculate the address of column tc3. 2984 // Due to register shortage, setting tc3 may overwrite table. With the return offset 2985 // at hand, the original table address can be easily reconstructed. 
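// The four column pointers set up by this function feed the slicing-by-4 update used in
// update_1word_crc32 below; as a rough C sketch (illustration only):
//
//   crc ^= *(uint32_t*)buf;                        // fold in 4 input bytes
//   crc  = tc0[(crc >>  0) & 0xff] ^ tc1[(crc >>  8) & 0xff]
//        ^ tc2[(crc >> 16) & 0xff] ^ tc3[(crc >> 24) & 0xff];
//
// (byte-reversed table columns are used on big-endian, see the layout selection below).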
2986 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 2987 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 2988 2989 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 2990 // Layout: See StubRoutines::ppc::generate_crc_constants. 2991 #ifdef VM_LITTLE_ENDIAN 2992 const int ix0 = 3 * CRC32_TABLE_SIZE; 2993 const int ix1 = 2 * CRC32_TABLE_SIZE; 2994 const int ix2 = 1 * CRC32_TABLE_SIZE; 2995 const int ix3 = 0 * CRC32_TABLE_SIZE; 2996 #else 2997 const int ix0 = 1 * CRC32_TABLE_SIZE; 2998 const int ix1 = 2 * CRC32_TABLE_SIZE; 2999 const int ix2 = 3 * CRC32_TABLE_SIZE; 3000 const int ix3 = 4 * CRC32_TABLE_SIZE; 3001 #endif 3002 assert_different_registers(table, tc0, tc1, tc2); 3003 assert(table == tc3, "must be!"); 3004 3005 addi(tc0, table, ix0); 3006 addi(tc1, table, ix1); 3007 addi(tc2, table, ix2); 3008 if (ix3 != 0) addi(tc3, table, ix3); 3009 3010 return ix3; 3011 } 3012 3013 /** 3014 * uint32_t crc; 3015 * table[crc & 0xFF] ^ (crc >> 8); 3016 */ 3017 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3018 assert_different_registers(crc, table, tmp); 3019 assert_different_registers(val, table); 3020 3021 if (crc == val) { // Must rotate first to use the unmodified value. 3022 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3023 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3024 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3025 } else { 3026 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3027 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3028 } 3029 lwzx(tmp, table, tmp); 3030 xorr(crc, crc, tmp); 3031 } 3032 3033 /** 3034 * Emits code to update CRC-32 with a byte value according to constants in table. 3035 * 3036 * @param [in,out]crc Register containing the crc. 3037 * @param [in]val Register containing the byte to fold into the CRC. 3038 * @param [in]table Register containing the table of crc constants. 3039 * 3040 * uint32_t crc; 3041 * val = crc_table[(val ^ crc) & 0xFF]; 3042 * crc = val ^ (crc >> 8); 3043 */ 3044 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3045 BLOCK_COMMENT("update_byte_crc32:"); 3046 xorr(val, val, crc); 3047 fold_byte_crc32(crc, val, table, val); 3048 } 3049 3050 /** 3051 * @param crc register containing existing CRC (32-bit) 3052 * @param buf register pointing to input byte buffer (byte*) 3053 * @param len register containing number of bytes 3054 * @param table register pointing to CRC table 3055 */ 3056 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3057 Register data, bool loopAlignment) { 3058 assert_different_registers(crc, buf, len, table, data); 3059 3060 Label L_mainLoop, L_done; 3061 const int mainLoop_stepping = 1; 3062 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3063 3064 // Process all bytes in a single-byte loop. 3065 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3066 beq(CCR0, L_done); 3067 3068 mtctr(len); 3069 align(mainLoop_alignment); 3070 BIND(L_mainLoop); 3071 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3072 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 
3073 update_byte_crc32(crc, data, table); 3074 bdnz(L_mainLoop); // Iterate. 3075 3076 bind(L_done); 3077 } 3078 3079 /** 3080 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3081 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3082 */ 3083 // A note on the lookup table address(es): 3084 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3085 // To save the effort of adding the column offset to the table address each time 3086 // a table element is looked up, it is possible to pass the pre-calculated 3087 // column addresses. 3088 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3089 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3090 Register t0, Register t1, Register t2, Register t3, 3091 Register tc0, Register tc1, Register tc2, Register tc3) { 3092 assert_different_registers(crc, t3); 3093 3094 // XOR crc with next four bytes of buffer. 3095 lwz(t3, bufDisp, buf); 3096 if (bufInc != 0) { 3097 addi(buf, buf, bufInc); 3098 } 3099 xorr(t3, t3, crc); 3100 3101 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 3102 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3103 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3104 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3105 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3106 3107 // Use the pre-calculated column addresses. 3108 // Load pre-calculated table values. 3109 lwzx(t0, tc0, t0); 3110 lwzx(t1, tc1, t1); 3111 lwzx(t2, tc2, t2); 3112 lwzx(t3, tc3, t3); 3113 3114 // Calculate new crc from table values. 3115 xorr(t0, t0, t1); 3116 xorr(t2, t2, t3); 3117 xorr(crc, t0, t2); // Now crc contains the final checksum value. 3118 } 3119 3120 /** 3121 * @param crc register containing existing CRC (32-bit) 3122 * @param buf register pointing to input byte buffer (byte*) 3123 * @param len register containing number of bytes 3124 * @param table register pointing to CRC table 3125 * 3126 * uses R9..R12 as work register. Must be saved/restored by caller! 3127 */ 3128 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 3129 Register t0, Register t1, Register t2, Register t3, 3130 Register tc0, Register tc1, Register tc2, Register tc3, 3131 bool invertCRC) { 3132 assert_different_registers(crc, buf, len, table); 3133 3134 Label L_mainLoop, L_tail; 3135 Register tmp = t0; 3136 Register data = t0; 3137 Register tmp2 = t1; 3138 const int mainLoop_stepping = 4; 3139 const int tailLoop_stepping = 1; 3140 const int log_stepping = exact_log2(mainLoop_stepping); 3141 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 3142 const int complexThreshold = 2*mainLoop_stepping; 3143 3144 // Don't test for len <= 0 here. This pathological case should not occur anyway. 3145 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 3146 // for all well-behaved cases. The situation itself is detected and handled correctly 3147 // within update_byteLoop_crc32. 3148 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 3149 3150 BLOCK_COMMENT("kernel_crc32_1word {"); 3151 3152 if (invertCRC) { 3153 nand(crc, crc, crc); // 1s complement of crc 3154 } 3155 3156 // Check for short (<mainLoop_stepping) buffer. 
3157 cmpdi(CCR0, len, complexThreshold); 3158 blt(CCR0, L_tail); 3159 3160 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 3161 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 3162 { 3163 // Align buf addr to mainLoop_stepping boundary. 3164 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 3165 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 3166 3167 if (complexThreshold > mainLoop_stepping) { 3168 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3169 } else { 3170 sub(tmp, len, tmp2); // Remaining bytes for main loop. 3171 cmpdi(CCR0, tmp, mainLoop_stepping); 3172 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 3173 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3174 } 3175 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 3176 } 3177 3178 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 3179 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 3180 mtctr(tmp2); 3181 3182 #ifdef VM_LITTLE_ENDIAN 3183 Register crc_rv = crc; 3184 #else 3185 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 3186 // Occupies tmp, but frees up crc. 3187 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 3188 tmp = crc; 3189 #endif 3190 3191 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 3192 3193 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 3194 BIND(L_mainLoop); 3195 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 3196 bdnz(L_mainLoop); 3197 3198 #ifndef VM_LITTLE_ENDIAN 3199 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 3200 tmp = crc_rv; // Tmp uses it's original register again. 3201 #endif 3202 3203 // Restore original table address for tailLoop. 3204 if (reconstructTableOffset != 0) { 3205 addi(table, table, -reconstructTableOffset); 3206 } 3207 3208 // Process last few (<complexThreshold) bytes of buffer. 3209 BIND(L_tail); 3210 update_byteLoop_crc32(crc, buf, len, table, data, false); 3211 3212 if (invertCRC) { 3213 nand(crc, crc, crc); // 1s complement of crc 3214 } 3215 BLOCK_COMMENT("} kernel_crc32_1word"); 3216 } 3217 3218 /** 3219 * @param crc register containing existing CRC (32-bit) 3220 * @param buf register pointing to input byte buffer (byte*) 3221 * @param len register containing number of bytes 3222 * @param constants register pointing to precomputed constants 3223 * @param t0-t6 temp registers 3224 */ 3225 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 3226 Register t0, Register t1, Register t2, Register t3, 3227 Register t4, Register t5, Register t6, bool invertCRC) { 3228 assert_different_registers(crc, buf, len, constants); 3229 3230 Label L_tail; 3231 3232 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 3233 3234 if (invertCRC) { 3235 nand(crc, crc, crc); // 1s complement of crc 3236 } 3237 3238 // Enforce 32 bit. 3239 clrldi(len, len, 32); 3240 3241 // Align if we have enough bytes for the fast version. 
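  // The alignment check emitted below corresponds to this plain C sketch (illustrative only):
  //
  //   size_t prealign = (-(uintptr_t)buf) & (alignment - 1);    // bytes needed to reach 16-byte alignment
  //   if ((long)(len - threshold) < (long)prealign) goto tail;   // too short for the vector kernel
  //   len -= prealign;                                           // prealign bytes go byte-wise, rest to the kernel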
3242 const int alignment = 16, 3243 threshold = 32; 3244 Register prealign = t0; 3245 3246 neg(prealign, buf); 3247 addi(t1, len, -threshold); 3248 andi(prealign, prealign, alignment - 1); 3249 cmpw(CCR0, t1, prealign); 3250 blt(CCR0, L_tail); // len - prealign < threshold? 3251 3252 subf(len, prealign, len); 3253 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 3254 3255 // Calculate from first aligned address as far as possible. 3256 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 3257 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 3258 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 3259 3260 // Remaining bytes. 3261 BIND(L_tail); 3262 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 3263 3264 if (invertCRC) { 3265 nand(crc, crc, crc); // 1s complement of crc 3266 } 3267 3268 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 3269 } 3270 3271 /** 3272 * @param crc register containing existing CRC (32-bit) 3273 * @param buf register pointing to input byte buffer (byte*) 3274 * @param len register containing number of bytes (will get updated to remaining bytes) 3275 * @param constants register pointing to CRC table for 128-bit aligned memory 3276 * @param t0-t6 temp registers 3277 */ 3278 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 3279 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 3280 3281 // Save non-volatile vector registers (frameless). 3282 Register offset = t1; 3283 int offsetInt = 0; 3284 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 3285 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 3286 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 3287 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 3288 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 3289 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 3290 #ifndef VM_LITTLE_ENDIAN 3291 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 3292 #endif 3293 offsetInt -= 8; std(R14, offsetInt, R1_SP); 3294 offsetInt -= 8; std(R15, offsetInt, R1_SP); 3295 3296 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 3297 // bytes per iteration. The basic scheme is: 3298 // lvx: load vector (Big Endian needs reversal) 3299 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 3300 // vxor: xor partial results together to get unroll_factor2 vectors 3301 3302 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 3303 3304 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 3305 const int unroll_factor = CRC32_UNROLL_FACTOR, 3306 unroll_factor2 = CRC32_UNROLL_FACTOR2; 3307 3308 const int outer_consts_size = (unroll_factor2 - 1) * 16, 3309 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 3310 3311 // Support registers. 3312 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 3313 Register num_bytes = R14, 3314 loop_count = R15, 3315 cur_const = crc; // will live in VCRC 3316 // Constant array for outer loop: unroll_factor2 - 1 registers, 3317 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 
3318 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 3319 consts1[] = { VR23, VR24 }; 3320 // Data register arrays: 2 arrays with unroll_factor2 registers. 3321 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 3322 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 3323 3324 VectorRegister VCRC = data0[0]; 3325 VectorRegister Vc = VR25; 3326 VectorRegister swap_bytes = VR26; // Only for Big Endian. 3327 3328 // We have at least 1 iteration (ensured by caller). 3329 Label L_outer_loop, L_inner_loop, L_last; 3330 3331 // If supported set DSCR pre-fetch to deepest. 3332 if (VM_Version::has_mfdscr()) { 3333 load_const_optimized(t0, VM_Version::_dscr_val | 7); 3334 mtdscr(t0); 3335 } 3336 3337 mtvrwz(VCRC, crc); // crc lives in VCRC, now 3338 3339 for (int i = 1; i < unroll_factor2; ++i) { 3340 li(offs[i], 16 * i); 3341 } 3342 3343 // Load consts for outer loop 3344 lvx(consts0[0], constants); 3345 for (int i = 1; i < unroll_factor2 - 1; ++i) { 3346 lvx(consts0[i], offs[i], constants); 3347 } 3348 3349 load_const_optimized(num_bytes, 16 * unroll_factor); 3350 3351 // Reuse data registers outside of the loop. 3352 VectorRegister Vtmp = data1[0]; 3353 VectorRegister Vtmp2 = data1[1]; 3354 VectorRegister zeroes = data1[2]; 3355 3356 vspltisb(Vtmp, 0); 3357 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 3358 3359 // Load vector for vpermxor (to xor both 64 bit parts together) 3360 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 3361 vspltisb(Vc, 4); 3362 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 3363 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 3364 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 3365 3366 #ifdef VM_LITTLE_ENDIAN 3367 #define BE_swap_bytes(x) 3368 #else 3369 vspltisb(Vtmp2, 0xf); 3370 vxor(swap_bytes, Vtmp, Vtmp2); 3371 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 3372 #endif 3373 3374 cmpd(CCR0, len, num_bytes); 3375 blt(CCR0, L_last); 3376 3377 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 3378 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 3379 3380 // ********** Main loop start ********** 3381 align(32); 3382 bind(L_outer_loop); 3383 3384 // Begin of unrolled first iteration (no xor). 3385 lvx(data1[0], buf); 3386 for (int i = 1; i < unroll_factor2 / 2; ++i) { 3387 lvx(data1[i], offs[i], buf); 3388 } 3389 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3390 lvx(consts1[0], cur_const); 3391 mtctr(loop_count); 3392 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3393 BE_swap_bytes(data1[i]); 3394 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 3395 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3396 vpmsumw(data0[i], data1[i], consts1[0]); 3397 } 3398 addi(buf, buf, 16 * unroll_factor2); 3399 subf(len, num_bytes, len); 3400 lvx(consts1[1], offs[1], cur_const); 3401 addi(cur_const, cur_const, 32); 3402 // Begin of unrolled second iteration (head). 
3403 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3404 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3405 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 3406 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 3407 } 3408 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3409 BE_swap_bytes(data1[i]); 3410 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3411 vpmsumw(data1[i], data1[i], consts1[1]); 3412 } 3413 addi(buf, buf, 16 * unroll_factor2); 3414 3415 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 3416 // Double-iteration allows using the 2 constant registers alternatingly. 3417 align(32); 3418 bind(L_inner_loop); 3419 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 3420 if (j & 1) { 3421 lvx(consts1[0], cur_const); 3422 } else { 3423 lvx(consts1[1], offs[1], cur_const); 3424 addi(cur_const, cur_const, 32); 3425 } 3426 for (int i = 0; i < unroll_factor2; ++i) { 3427 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 3428 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 3429 BE_swap_bytes(data1[idx]); 3430 vxor(data0[i], data0[i], data1[i]); 3431 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 3432 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 3433 } 3434 addi(buf, buf, 16 * unroll_factor2); 3435 } 3436 bdnz(L_inner_loop); 3437 3438 addi(cur_const, constants, outer_consts_size); // Reset 3439 3440 // Tail of last iteration (no loads). 3441 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3442 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3443 vxor(data0[i], data0[i], data1[i]); 3444 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 3445 } 3446 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3447 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 3448 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 3449 } 3450 3451 // Last data register is ok, other ones need fixup shift. 3452 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 3453 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 3454 } 3455 3456 // Combine to 128 bit result vector VCRC = data0[0]. 3457 for (int i = 1; i < unroll_factor2; i<<=1) { 3458 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 3459 vxor(data0[j], data0[j], data0[j+i]); 3460 } 3461 } 3462 cmpd(CCR0, len, num_bytes); 3463 bge(CCR0, L_outer_loop); 3464 3465 // Last chance with lower num_bytes. 3466 bind(L_last); 3467 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 3468 // Point behind last const for inner loop. 3469 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3470 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 3471 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 3472 subf(cur_const, R0, cur_const); // Point to constant to be used first. 3473 3474 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 3475 bgt(CCR0, L_outer_loop); 3476 // ********** Main loop end ********** 3477 3478 // Restore DSCR pre-fetch value. 
3479 if (VM_Version::has_mfdscr()) { 3480 load_const_optimized(t0, VM_Version::_dscr_val); 3481 mtdscr(t0); 3482 } 3483 3484 // ********** Simple loop for remaining 16 byte blocks ********** 3485 { 3486 Label L_loop, L_done; 3487 3488 srdi_(t0, len, 4); // 16 bytes per iteration 3489 clrldi(len, len, 64-4); 3490 beq(CCR0, L_done); 3491 3492 // Point to const (same as last const for inner loop). 3493 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 3494 mtctr(t0); 3495 lvx(Vtmp2, cur_const); 3496 3497 align(32); 3498 bind(L_loop); 3499 3500 lvx(Vtmp, buf); 3501 addi(buf, buf, 16); 3502 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3503 BE_swap_bytes(Vtmp); 3504 vxor(VCRC, VCRC, Vtmp); 3505 vpmsumw(VCRC, VCRC, Vtmp2); 3506 bdnz(L_loop); 3507 3508 bind(L_done); 3509 } 3510 // ********** Simple loop end ********** 3511 #undef BE_swap_bytes 3512 3513 // Point to Barrett constants 3514 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3515 3516 vspltisb(zeroes, 0); 3517 3518 // Combine to 64 bit result. 3519 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3520 3521 // Reduce to 32 bit CRC: Remainder by multiply-high. 3522 lvx(Vtmp, cur_const); 3523 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 3524 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 3525 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 3526 vsldoi(Vtmp, zeroes, Vtmp, 8); 3527 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 3528 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 3529 3530 // Move result. len is already updated. 3531 vsldoi(VCRC, VCRC, zeroes, 8); 3532 mfvrd(crc, VCRC); 3533 3534 // Restore non-volatile Vector registers (frameless). 3535 offsetInt = 0; 3536 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 3537 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 3538 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 3539 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 3540 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 3541 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 3542 #ifndef VM_LITTLE_ENDIAN 3543 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 3544 #endif 3545 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 3546 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 3547 } 3548 3549 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 3550 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 3551 load_const_optimized(t0, is_crc32c ? 
StubRoutines::crc32c_table_addr() 3552 : StubRoutines::crc_table_addr() , R0); 3553 3554 if (VM_Version::has_vpmsumb()) { 3555 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 3556 } else { 3557 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 3558 } 3559 } 3560 3561 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 3562 assert_different_registers(crc, val, table); 3563 3564 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 3565 if (invertCRC) { 3566 nand(crc, crc, crc); // 1s complement of crc 3567 } 3568 3569 update_byte_crc32(crc, val, table); 3570 3571 if (invertCRC) { 3572 nand(crc, crc, crc); // 1s complement of crc 3573 } 3574 } 3575 3576 // dest_lo += src1 + src2 3577 // dest_hi += carry1 + carry2 3578 void MacroAssembler::add2_with_carry(Register dest_hi, 3579 Register dest_lo, 3580 Register src1, Register src2) { 3581 li(R0, 0); 3582 addc(dest_lo, dest_lo, src1); 3583 adde(dest_hi, dest_hi, R0); 3584 addc(dest_lo, dest_lo, src2); 3585 adde(dest_hi, dest_hi, R0); 3586 } 3587 3588 // Multiply 64 bit by 64 bit first loop. 3589 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3590 Register x_xstart, 3591 Register y, Register y_idx, 3592 Register z, 3593 Register carry, 3594 Register product_high, Register product, 3595 Register idx, Register kdx, 3596 Register tmp) { 3597 // jlong carry, x[], y[], z[]; 3598 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3599 // huge_128 product = y[idx] * x[xstart] + carry; 3600 // z[kdx] = (jlong)product; 3601 // carry = (jlong)(product >>> 64); 3602 // } 3603 // z[xstart] = carry; 3604 3605 Label L_first_loop, L_first_loop_exit; 3606 Label L_one_x, L_one_y, L_multiply; 3607 3608 addic_(xstart, xstart, -1); 3609 blt(CCR0, L_one_x); // Special case: length of x is 1. 3610 3611 // Load next two integers of x. 3612 sldi(tmp, xstart, LogBytesPerInt); 3613 ldx(x_xstart, x, tmp); 3614 #ifdef VM_LITTLE_ENDIAN 3615 rldicl(x_xstart, x_xstart, 32, 0); 3616 #endif 3617 3618 align(32, 16); 3619 bind(L_first_loop); 3620 3621 cmpdi(CCR0, idx, 1); 3622 blt(CCR0, L_first_loop_exit); 3623 addi(idx, idx, -2); 3624 beq(CCR0, L_one_y); 3625 3626 // Load next two integers of y. 3627 sldi(tmp, idx, LogBytesPerInt); 3628 ldx(y_idx, y, tmp); 3629 #ifdef VM_LITTLE_ENDIAN 3630 rldicl(y_idx, y_idx, 32, 0); 3631 #endif 3632 3633 3634 bind(L_multiply); 3635 multiply64(product_high, product, x_xstart, y_idx); 3636 3637 li(tmp, 0); 3638 addc(product, product, carry); // Add carry to result. 3639 adde(product_high, product_high, tmp); // Add carry of the last addition. 3640 addi(kdx, kdx, -2); 3641 3642 // Store result. 3643 #ifdef VM_LITTLE_ENDIAN 3644 rldicl(product, product, 32, 0); 3645 #endif 3646 sldi(tmp, kdx, LogBytesPerInt); 3647 stdx(product, z, tmp); 3648 mr_if_needed(carry, product_high); 3649 b(L_first_loop); 3650 3651 3652 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 3653 3654 lwz(y_idx, 0, y); 3655 b(L_multiply); 3656 3657 3658 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 3659 3660 lwz(x_xstart, 0, x); 3661 b(L_first_loop); 3662 3663 bind(L_first_loop_exit); 3664 } 3665 3666 // Multiply 64 bit by 64 bit and add 128 bit. 
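// As a reference (not emitted code), the operation below corresponds to the following
// C sketch; the 128-bit type is an assumption about the host compiler, not about this file:
//
//   static inline uint64_t mul_add_64x64_128(uint64_t x, uint64_t y, uint64_t z,
//                                            uint64_t carry, uint64_t* lo) {
//     unsigned __int128 p = (unsigned __int128)x * y + z + carry; // 64x64 multiply plus 128-bit add
//     *lo = (uint64_t)p;                                          // low half is stored back into z[idx]
//     return (uint64_t)(p >> 64);                                 // high half becomes the new carry
//   }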
3667 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 3668 Register z, Register yz_idx, 3669 Register idx, Register carry, 3670 Register product_high, Register product, 3671 Register tmp, int offset) { 3672 3673 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 3674 // z[kdx] = (jlong)product; 3675 3676 sldi(tmp, idx, LogBytesPerInt); 3677 if (offset) { 3678 addi(tmp, tmp, offset); 3679 } 3680 ldx(yz_idx, y, tmp); 3681 #ifdef VM_LITTLE_ENDIAN 3682 rldicl(yz_idx, yz_idx, 32, 0); 3683 #endif 3684 3685 multiply64(product_high, product, x_xstart, yz_idx); 3686 ldx(yz_idx, z, tmp); 3687 #ifdef VM_LITTLE_ENDIAN 3688 rldicl(yz_idx, yz_idx, 32, 0); 3689 #endif 3690 3691 add2_with_carry(product_high, product, carry, yz_idx); 3692 3693 sldi(tmp, idx, LogBytesPerInt); 3694 if (offset) { 3695 addi(tmp, tmp, offset); 3696 } 3697 #ifdef VM_LITTLE_ENDIAN 3698 rldicl(product, product, 32, 0); 3699 #endif 3700 stdx(product, z, tmp); 3701 } 3702 3703 // Multiply 128 bit by 128 bit. Unrolled inner loop. 3704 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 3705 Register y, Register z, 3706 Register yz_idx, Register idx, Register carry, 3707 Register product_high, Register product, 3708 Register carry2, Register tmp) { 3709 3710 // jlong carry, x[], y[], z[]; 3711 // int kdx = ystart+1; 3712 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 3713 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 3714 // z[kdx+idx+1] = (jlong)product; 3715 // jlong carry2 = (jlong)(product >>> 64); 3716 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 3717 // z[kdx+idx] = (jlong)product; 3718 // carry = (jlong)(product >>> 64); 3719 // } 3720 // idx += 2; 3721 // if (idx > 0) { 3722 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 3723 // z[kdx+idx] = (jlong)product; 3724 // carry = (jlong)(product >>> 64); 3725 // } 3726 3727 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 3728 const Register jdx = R0; 3729 3730 // Scale the index. 3731 srdi_(jdx, idx, 2); 3732 beq(CCR0, L_third_loop_exit); 3733 mtctr(jdx); 3734 3735 align(32, 16); 3736 bind(L_third_loop); 3737 3738 addi(idx, idx, -4); 3739 3740 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 3741 mr_if_needed(carry2, product_high); 3742 3743 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 3744 mr_if_needed(carry, product_high); 3745 bdnz(L_third_loop); 3746 3747 bind(L_third_loop_exit); // Handle any left-over operand parts. 
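  // Left-over handling, as a sketch (here idx holds the remaining number of 32-bit limbs, 0..3):
  //
  //   if ((idx & 3) >= 2) { /* one more 64-bit (two-limb) multiply-add step */ }
  //   if ((idx & 3) &  1) { /* final single 32-bit multiply-add; the new carry is rebuilt
  //                            from the high and low halves of the 64-bit product */ }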
3748 3749 andi_(idx, idx, 0x3); 3750 beq(CCR0, L_post_third_loop_done); 3751 3752 Label L_check_1; 3753 3754 addic_(idx, idx, -2); 3755 blt(CCR0, L_check_1); 3756 3757 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 3758 mr_if_needed(carry, product_high); 3759 3760 bind(L_check_1); 3761 3762 addi(idx, idx, 0x2); 3763 andi_(idx, idx, 0x1); 3764 addic_(idx, idx, -1); 3765 blt(CCR0, L_post_third_loop_done); 3766 3767 sldi(tmp, idx, LogBytesPerInt); 3768 lwzx(yz_idx, y, tmp); 3769 multiply64(product_high, product, x_xstart, yz_idx); 3770 lwzx(yz_idx, z, tmp); 3771 3772 add2_with_carry(product_high, product, yz_idx, carry); 3773 3774 sldi(tmp, idx, LogBytesPerInt); 3775 stwx(product, z, tmp); 3776 srdi(product, product, 32); 3777 3778 sldi(product_high, product_high, 32); 3779 orr(product, product, product_high); 3780 mr_if_needed(carry, product); 3781 3782 bind(L_post_third_loop_done); 3783 } // multiply_128_x_128_loop 3784 3785 void MacroAssembler::muladd(Register out, Register in, 3786 Register offset, Register len, Register k, 3787 Register tmp1, Register tmp2, Register carry) { 3788 3789 // Labels 3790 Label LOOP, SKIP; 3791 3792 // Make sure length is positive. 3793 cmpdi (CCR0, len, 0); 3794 3795 // Prepare variables 3796 subi (offset, offset, 4); 3797 li (carry, 0); 3798 ble (CCR0, SKIP); 3799 3800 mtctr (len); 3801 subi (len, len, 1 ); 3802 sldi (len, len, 2 ); 3803 3804 // Main loop 3805 bind(LOOP); 3806 lwzx (tmp1, len, in ); 3807 lwzx (tmp2, offset, out ); 3808 mulld (tmp1, tmp1, k ); 3809 add (tmp2, carry, tmp2 ); 3810 add (tmp2, tmp1, tmp2 ); 3811 stwx (tmp2, offset, out ); 3812 srdi (carry, tmp2, 32 ); 3813 subi (offset, offset, 4 ); 3814 subi (len, len, 4 ); 3815 bdnz (LOOP); 3816 bind(SKIP); 3817 } 3818 3819 void MacroAssembler::multiply_to_len(Register x, Register xlen, 3820 Register y, Register ylen, 3821 Register z, Register zlen, 3822 Register tmp1, Register tmp2, 3823 Register tmp3, Register tmp4, 3824 Register tmp5, Register tmp6, 3825 Register tmp7, Register tmp8, 3826 Register tmp9, Register tmp10, 3827 Register tmp11, Register tmp12, 3828 Register tmp13) { 3829 3830 ShortBranchVerifier sbv(this); 3831 3832 assert_different_registers(x, xlen, y, ylen, z, zlen, 3833 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 3834 assert_different_registers(x, xlen, y, ylen, z, zlen, 3835 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 3836 assert_different_registers(x, xlen, y, ylen, z, zlen, 3837 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 3838 3839 const Register idx = tmp1; 3840 const Register kdx = tmp2; 3841 const Register xstart = tmp3; 3842 3843 const Register y_idx = tmp4; 3844 const Register carry = tmp5; 3845 const Register product = tmp6; 3846 const Register product_high = tmp7; 3847 const Register x_xstart = tmp8; 3848 const Register tmp = tmp9; 3849 3850 // First Loop. 
3851 // 3852 // final static long LONG_MASK = 0xffffffffL; 3853 // int xstart = xlen - 1; 3854 // int ystart = ylen - 1; 3855 // long carry = 0; 3856 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 3857 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 3858 // z[kdx] = (int)product; 3859 // carry = product >>> 32; 3860 // } 3861 // z[xstart] = (int)carry; 3862 3863 mr_if_needed(idx, ylen); // idx = ylen 3864 mr_if_needed(kdx, zlen); // kdx = xlen + ylen 3865 li(carry, 0); // carry = 0 3866 3867 Label L_done; 3868 3869 addic_(xstart, xlen, -1); 3870 blt(CCR0, L_done); 3871 3872 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 3873 carry, product_high, product, idx, kdx, tmp); 3874 3875 Label L_second_loop; 3876 3877 cmpdi(CCR0, kdx, 0); 3878 beq(CCR0, L_second_loop); 3879 3880 Label L_carry; 3881 3882 addic_(kdx, kdx, -1); 3883 beq(CCR0, L_carry); 3884 3885 // Store lower 32 bits of carry. 3886 sldi(tmp, kdx, LogBytesPerInt); 3887 stwx(carry, z, tmp); 3888 srdi(carry, carry, 32); 3889 addi(kdx, kdx, -1); 3890 3891 3892 bind(L_carry); 3893 3894 // Store upper 32 bits of carry. 3895 sldi(tmp, kdx, LogBytesPerInt); 3896 stwx(carry, z, tmp); 3897 3898 // Second and third (nested) loops. 3899 // 3900 // for (int i = xstart-1; i >= 0; i--) { // Second loop 3901 // carry = 0; 3902 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 3903 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 3904 // (z[k] & LONG_MASK) + carry; 3905 // z[k] = (int)product; 3906 // carry = product >>> 32; 3907 // } 3908 // z[i] = (int)carry; 3909 // } 3910 // 3911 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 3912 3913 bind(L_second_loop); 3914 3915 li(carry, 0); // carry = 0; 3916 3917 addic_(xstart, xstart, -1); // i = xstart-1; 3918 blt(CCR0, L_done); 3919 3920 Register zsave = tmp10; 3921 3922 mr(zsave, z); 3923 3924 3925 Label L_last_x; 3926 3927 sldi(tmp, xstart, LogBytesPerInt); 3928 add(z, z, tmp); // z = z + k - j 3929 addi(z, z, 4); 3930 addic_(xstart, xstart, -1); // i = xstart-1; 3931 blt(CCR0, L_last_x); 3932 3933 sldi(tmp, xstart, LogBytesPerInt); 3934 ldx(x_xstart, x, tmp); 3935 #ifdef VM_LITTLE_ENDIAN 3936 rldicl(x_xstart, x_xstart, 32, 0); 3937 #endif 3938 3939 3940 Label L_third_loop_prologue; 3941 3942 bind(L_third_loop_prologue); 3943 3944 Register xsave = tmp11; 3945 Register xlensave = tmp12; 3946 Register ylensave = tmp13; 3947 3948 mr(xsave, x); 3949 mr(xlensave, xstart); 3950 mr(ylensave, ylen); 3951 3952 3953 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 3954 carry, product_high, product, x, tmp); 3955 3956 mr(z, zsave); 3957 mr(x, xsave); 3958 mr(xlen, xlensave); // This is the decrement of the loop counter! 3959 mr(ylen, ylensave); 3960 3961 addi(tmp3, xlen, 1); 3962 sldi(tmp, tmp3, LogBytesPerInt); 3963 stwx(carry, z, tmp); 3964 addic_(tmp3, tmp3, -1); 3965 blt(CCR0, L_done); 3966 3967 srdi(carry, carry, 32); 3968 sldi(tmp, tmp3, LogBytesPerInt); 3969 stwx(carry, z, tmp); 3970 b(L_second_loop); 3971 3972 // Next infrequent code is moved outside loops. 
3973 bind(L_last_x); 3974 3975 lwz(x_xstart, 0, x); 3976 b(L_third_loop_prologue); 3977 3978 bind(L_done); 3979 } // multiply_to_len 3980 3981 void MacroAssembler::asm_assert(bool check_equal, const char *msg) { 3982 #ifdef ASSERT 3983 Label ok; 3984 if (check_equal) { 3985 beq(CCR0, ok); 3986 } else { 3987 bne(CCR0, ok); 3988 } 3989 stop(msg); 3990 bind(ok); 3991 #endif 3992 } 3993 3994 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 3995 Register mem_base, const char* msg) { 3996 #ifdef ASSERT 3997 switch (size) { 3998 case 4: 3999 lwz(R0, mem_offset, mem_base); 4000 cmpwi(CCR0, R0, 0); 4001 break; 4002 case 8: 4003 ld(R0, mem_offset, mem_base); 4004 cmpdi(CCR0, R0, 0); 4005 break; 4006 default: 4007 ShouldNotReachHere(); 4008 } 4009 asm_assert(check_equal, msg); 4010 #endif // ASSERT 4011 } 4012 4013 void MacroAssembler::verify_coop(Register coop, const char* msg) { 4014 if (!VerifyOops) { return; } 4015 if (UseCompressedOops) { decode_heap_oop(coop); } 4016 verify_oop(coop, msg); 4017 if (UseCompressedOops) { encode_heap_oop(coop, coop); } 4018 } 4019 4020 // READ: oop. KILL: R0. Volatile floats perhaps. 4021 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4022 if (!VerifyOops) { 4023 return; 4024 } 4025 4026 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4027 const Register tmp = R11; // Will be preserved. 4028 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4029 4030 BLOCK_COMMENT("verify_oop {"); 4031 4032 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4033 4034 mr_if_needed(R4_ARG2, oop); 4035 save_LR_CR(tmp); // save in old frame 4036 push_frame_reg_args(nbytes_save, tmp); 4037 // load FunctionDescriptor** / entry_address * 4038 load_const_optimized(tmp, fd, R0); 4039 // load FunctionDescriptor* / entry_address 4040 ld(tmp, 0, tmp); 4041 load_const_optimized(R3_ARG1, (address)msg, R0); 4042 // Call destination for its side effect. 4043 call_c(tmp); 4044 4045 pop_frame(); 4046 restore_LR_CR(tmp); 4047 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4048 4049 BLOCK_COMMENT("} verify_oop"); 4050 } 4051 4052 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4053 if (!VerifyOops) { 4054 return; 4055 } 4056 4057 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4058 const Register tmp = R11; // Will be preserved. 4059 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4060 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4061 4062 ld(R4_ARG2, offs, base); 4063 save_LR_CR(tmp); // save in old frame 4064 push_frame_reg_args(nbytes_save, tmp); 4065 // load FunctionDescriptor** / entry_address * 4066 load_const_optimized(tmp, fd, R0); 4067 // load FunctionDescriptor* / entry_address 4068 ld(tmp, 0, tmp); 4069 load_const_optimized(R3_ARG1, (address)msg, R0); 4070 // Call destination for its side effect. 4071 call_c(tmp); 4072 4073 pop_frame(); 4074 restore_LR_CR(tmp); 4075 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4076 } 4077 4078 // Call a C-function that prints output. 4079 void MacroAssembler::stop(int type, const char* msg) { 4080 bool msg_present = (msg != nullptr); 4081 4082 #ifndef PRODUCT 4083 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? 
msg : "null")); 4084 #else 4085 block_comment("stop {"); 4086 #endif 4087 4088 if (msg_present) { 4089 type |= stop_msg_present; 4090 } 4091 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type); 4092 if (msg_present) { 4093 emit_int64((uintptr_t)msg); 4094 } 4095 4096 block_comment("} stop;"); 4097 } 4098 4099 #ifndef PRODUCT 4100 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4101 // Val, addr are temp registers. 4102 // If low == addr, addr is killed. 4103 // High is preserved. 4104 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4105 if (!ZapMemory) return; 4106 4107 assert_different_registers(low, val); 4108 4109 BLOCK_COMMENT("zap memory region {"); 4110 load_const_optimized(val, 0x0101010101010101); 4111 int size = before + after; 4112 if (low == high && size < 5 && size > 0) { 4113 int offset = -before*BytesPerWord; 4114 for (int i = 0; i < size; ++i) { 4115 std(val, offset, low); 4116 offset += (1*BytesPerWord); 4117 } 4118 } else { 4119 addi(addr, low, -before*BytesPerWord); 4120 assert_different_registers(high, val); 4121 if (after) addi(high, high, after * BytesPerWord); 4122 Label loop; 4123 bind(loop); 4124 std(val, 0, addr); 4125 addi(addr, addr, 8); 4126 cmpd(CCR6, addr, high); 4127 ble(CCR6, loop); 4128 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 4129 } 4130 BLOCK_COMMENT("} zap memory region"); 4131 } 4132 4133 #endif // !PRODUCT 4134 4135 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 4136 const bool* flag_addr, Label& label) { 4137 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 4138 assert(sizeof(bool) == 1, "PowerPC ABI"); 4139 masm->lbz(temp, simm16_offset, temp); 4140 masm->cmpwi(CCR0, temp, 0); 4141 masm->beq(CCR0, label); 4142 } 4143 4144 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 4145 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 4146 } 4147 4148 SkipIfEqualZero::~SkipIfEqualZero() { 4149 _masm->bind(_label); 4150 } 4151 4152 void MacroAssembler::cache_wb(Address line) { 4153 assert(line.index() == noreg, "index should be noreg"); 4154 assert(line.disp() == 0, "displacement should be 0"); 4155 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory"); 4156 // Data Cache Store, not really a flush, so it works like a sync of the cache 4157 // line and persistent memory, i.e. it copies the cache line to persistent memory 4158 // without invalidating the cache line. 4159 dcbst(line.base()); 4160 } 4161 4162 void MacroAssembler::cache_wbsync(bool is_presync) { 4163 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory"); 4164 // We only need a post sync barrier. Post means _after_ a cache line flush or 4165 // store instruction, pre means a barrier emitted before such an instruction.
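  // A typical caller sequence is therefore (sketch, not emitted here):
  //
  //   cache_wbsync(true);   // pre-sync:  no barrier needed on PPC64
  //   cache_wb(line);       // dcbst:     write the cache line back towards persistent memory
  //   cache_wbsync(false);  // post-sync: fence() orders the dcbst before later stores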
4166 if (!is_presync) { 4167 fence(); 4168 } 4169 } 4170 4171 void MacroAssembler::push_cont_fastpath() { 4172 Label done; 4173 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4174 cmpld(CCR0, R1_SP, R0); 4175 ble(CCR0, done); 4176 st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread); 4177 bind(done); 4178 } 4179 4180 void MacroAssembler::pop_cont_fastpath() { 4181 Label done; 4182 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4183 cmpld(CCR0, R1_SP, R0); 4184 ble(CCR0, done); 4185 li(R0, 0); 4186 st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4187 bind(done); 4188 } 4189 4190 // Note: Must preserve CCR0 EQ (invariant). 4191 void MacroAssembler::inc_held_monitor_count(Register tmp) { 4192 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4193 #ifdef ASSERT 4194 Label ok; 4195 cmpdi(CCR0, tmp, 0); 4196 bge_predict_taken(CCR0, ok); 4197 stop("held monitor count is negative at increment"); 4198 bind(ok); 4199 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ 4200 #endif 4201 addi(tmp, tmp, 1); 4202 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4203 } 4204 4205 // Note: Must preserve CCR0 EQ (invariant). 4206 void MacroAssembler::dec_held_monitor_count(Register tmp) { 4207 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4208 #ifdef ASSERT 4209 Label ok; 4210 cmpdi(CCR0, tmp, 0); 4211 bgt_predict_taken(CCR0, ok); 4212 stop("held monitor count is <= 0 at decrement"); 4213 bind(ok); 4214 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ 4215 #endif 4216 addi(tmp, tmp, -1); 4217 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4218 } 4219 4220 // Function to flip between unlocked and locked state (fast locking). 4221 // Branches to failed with CCR0 NE if the state is not as expected. 4222 // Falls through upon success with CCR0 EQ. 4223 // This requires fewer instructions and registers and is easier to use than the 4224 // cmpxchg based implementation. 4225 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) { 4226 assert_different_registers(obj, tmp, R0); 4227 Label retry; 4228 4229 if (semantics & MemBarRel) { 4230 release(); 4231 } 4232 4233 bind(retry); 4234 STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this! 4235 if (!is_unlock) { 4236 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock()); 4237 xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit 4238 andi_(R0, tmp, markWord::lock_mask_in_place); 4239 bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0) 4240 } else { 4241 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock()); 4242 andi_(R0, tmp, markWord::lock_mask_in_place); 4243 bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0) 4244 ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit 4245 } 4246 stdcx_(tmp, obj); 4247 bne(CCR0, retry); 4248 4249 if (semantics & MemBarFenceAfter) { 4250 fence(); 4251 } else if (semantics & MemBarAcq) { 4252 isync(); 4253 } 4254 } 4255 4256 // Implements lightweight-locking.
4257 // 4258 // - obj: the object to be locked 4259 // - t1, t2: temporary registers 4260 void MacroAssembler::lightweight_lock(Register obj, Register t1, Register t2, Label& slow) { 4261 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4262 assert_different_registers(obj, t1, t2); 4263 4264 Label push; 4265 const Register top = t1; 4266 const Register mark = t2; 4267 const Register t = R0; 4268 4269 // Check if the lock-stack is full. 4270 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4271 cmplwi(CCR0, top, LockStack::end_offset()); 4272 bge(CCR0, slow); 4273 4274 // The underflow check is elided. The recursive check will always fail 4275 // when the lock stack is empty because of the _bad_oop_sentinel field. 4276 4277 // Check for recursion. 4278 subi(t, top, oopSize); 4279 ldx(t, R16_thread, t); 4280 cmpd(CCR0, obj, t); 4281 beq(CCR0, push); 4282 4283 // Check header for monitor (0b10) or locked (0b00). 4284 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4285 xori(t, mark, markWord::unlocked_value); 4286 andi_(t, t, markWord::lock_mask_in_place); 4287 bne(CCR0, slow); 4288 4289 // Try to lock. Transition lock bits 0b01 => 0b00 4290 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq); 4291 4292 bind(push); 4293 // After successful lock, push object on lock-stack 4294 stdx(obj, R16_thread, top); 4295 addi(top, top, oopSize); 4296 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4297 } 4298 4299 // Implements lightweight-unlocking. 4300 // 4301 // - obj: the object to be unlocked 4302 // - t1: temporary register 4303 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) { 4304 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4305 assert_different_registers(obj, t1); 4306 4307 #ifdef ASSERT 4308 { 4309 // The following checks rely on the fact that LockStack is only ever modified by 4310 // its owning thread, even if the lock got inflated concurrently; removal of LockStack 4311 // entries after inflation will be delayed in that case. 4312 4313 // Check for lock-stack underflow. 4314 Label stack_ok; 4315 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4316 cmplwi(CCR0, t1, LockStack::start_offset()); 4317 bge(CCR0, stack_ok); 4318 stop("Lock-stack underflow"); 4319 bind(stack_ok); 4320 } 4321 #endif 4322 4323 Label unlocked, push_and_slow; 4324 const Register top = t1; 4325 const Register mark = R0; 4326 Register t = R0; 4327 4328 // Check if obj is top of lock-stack. 4329 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4330 subi(top, top, oopSize); 4331 ldx(t, R16_thread, top); 4332 cmpd(CCR0, obj, t); 4333 bne(CCR0, slow); 4334 4335 // Pop lock-stack. 4336 DEBUG_ONLY(li(t, 0);) 4337 DEBUG_ONLY(stdx(t, R16_thread, top);) 4338 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4339 4340 // The underflow check is elided. The recursive check will always fail 4341 // when the lock stack is empty because of the _bad_oop_sentinel field. 4342 4343 // Check if recursive. 4344 subi(t, top, oopSize); 4345 ldx(t, R16_thread, t); 4346 cmpd(CCR0, obj, t); 4347 beq(CCR0, unlocked); 4348 4349 // Use top as tmp 4350 t = top; 4351 4352 // Not recursive. Check header for monitor (0b10).
4353 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4354 andi_(t, mark, markWord::monitor_value); 4355 bne(CCR0, push_and_slow); 4356 4357 #ifdef ASSERT 4358 // Check header not unlocked (0b01). 4359 Label not_unlocked; 4360 andi_(t, mark, markWord::unlocked_value); 4361 beq(CCR0, not_unlocked); 4362 stop("lightweight_unlock already unlocked"); 4363 bind(not_unlocked); 4364 #endif 4365 4366 // Try to unlock. Transition lock bits 0b00 => 0b01 4367 atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel); 4368 b(unlocked); 4369 4370 bind(push_and_slow); 4371 4372 // Restore lock-stack and handle the unlock in runtime. 4373 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4374 DEBUG_ONLY(stdx(obj, R16_thread, top);) 4375 addi(top, top, oopSize); 4376 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4377 b(slow); 4378 4379 bind(unlocked); 4380 }
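// For reference, the lock-stack fast paths above follow this C-like sketch (illustrative
// only; LOCKED/UNLOCKED stand for mark-word lock bits 0b00/0b01, ls for the per-thread
// lock-stack, and lock_bits/is_monitor/cas_lock_bits are placeholder names, not VM APIs;
// cas_lock_bits corresponds to the ldarx/stdcx_ loop in atomically_flip_locked_state):
//
//   bool fast_lock(oop obj) {
//     if (ls.top == ls.end) return false;                           // lock-stack full -> slow path
//     if (ls.at(ls.top - 1) == obj) { ls.push(obj); return true; }  // recursive case
//     if (lock_bits(obj->mark) != UNLOCKED) return false;           // monitor (0b10) or locked (0b00)
//     if (!cas_lock_bits(obj, UNLOCKED, LOCKED)) return false;      // 0b01 => 0b00
//     ls.push(obj); return true;
//   }
//
//   bool fast_unlock(oop obj) {
//     if (ls.at(ls.top - 1) != obj) return false;                   // not top of lock-stack -> slow path
//     ls.pop();
//     if (ls.at(ls.top - 1) == obj) return true;                    // recursive unlock
//     if (is_monitor(obj->mark)) { ls.push(obj); return false; }    // inflated meanwhile
//     if (!cas_lock_bits(obj, LOCKED, UNLOCKED)) { ls.push(obj); return false; }  // 0b00 => 0b01
//     return true;
//   }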