/*
 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2023 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "code/compiledIC.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/compressedKlass.inline.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/methodData.hpp"
#include "prims/methodHandles.hpp"
#include "register_ppc.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/vm_version.hpp"
#include "utilities/macros.hpp"
#include "utilities/powerOfTwo.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
// On RISC, there's no benefit to verifying instruction boundaries.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
  assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
  if (Assembler::is_simm(si31, 16)) {
    ld(d, si31, a);
    if (emit_filler_nop) nop();
  } else {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
    addis(d, a, hi);
    ld(d, lo, d);
  }
}

void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}

void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
                                      size_t size_in_bytes, bool is_signed) {
  switch (size_in_bytes) {
    case 8: ld(dst, offs, base); break;
    case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
    case 2: is_signed ?
lha(dst, offs, base) : lhz(dst, offs, base); break; 90 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :( 91 default: ShouldNotReachHere(); 92 } 93 } 94 95 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base, 96 size_t size_in_bytes) { 97 switch (size_in_bytes) { 98 case 8: std(dst, offs, base); break; 99 case 4: stw(dst, offs, base); break; 100 case 2: sth(dst, offs, base); break; 101 case 1: stb(dst, offs, base); break; 102 default: ShouldNotReachHere(); 103 } 104 } 105 106 void MacroAssembler::align(int modulus, int max, int rem) { 107 int padding = (rem + modulus - (offset() % modulus)) % modulus; 108 if (padding > max) return; 109 for (int c = (padding >> 2); c > 0; --c) { nop(); } 110 } 111 112 void MacroAssembler::align_prefix() { 113 if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); } 114 } 115 116 // Issue instructions that calculate given TOC from global TOC. 117 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16, 118 bool add_relocation, bool emit_dummy_addr) { 119 int offset = -1; 120 if (emit_dummy_addr) { 121 offset = -128; // dummy address 122 } else if (addr != (address)(intptr_t)-1) { 123 offset = MacroAssembler::offset_to_global_toc(addr); 124 } 125 126 if (hi16) { 127 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset)); 128 } 129 if (lo16) { 130 if (add_relocation) { 131 // Relocate at the addi to avoid confusion with a load from the method's TOC. 132 relocate(internal_word_Relocation::spec(addr)); 133 } 134 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset)); 135 } 136 } 137 138 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) { 139 const int offset = MacroAssembler::offset_to_global_toc(addr); 140 141 const address inst2_addr = a; 142 const int inst2 = *(int *)inst2_addr; 143 144 // The relocation points to the second instruction, the addi, 145 // and the addi reads and writes the same register dst. 146 const int dst = inv_rt_field(inst2); 147 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 148 149 // Now, find the preceding addis which writes to dst. 150 int inst1 = 0; 151 address inst1_addr = inst2_addr - BytesPerInstWord; 152 while (inst1_addr >= bound) { 153 inst1 = *(int *) inst1_addr; 154 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 155 // Stop, found the addis which writes dst. 156 break; 157 } 158 inst1_addr -= BytesPerInstWord; 159 } 160 161 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 162 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset)); 163 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset)); 164 return inst1_addr; 165 } 166 167 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) { 168 const address inst2_addr = a; 169 const int inst2 = *(int *)inst2_addr; 170 171 // The relocation points to the second instruction, the addi, 172 // and the addi reads and writes the same register dst. 173 const int dst = inv_rt_field(inst2); 174 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst"); 175 176 // Now, find the preceding addis which writes to dst. 
177 int inst1 = 0; 178 address inst1_addr = inst2_addr - BytesPerInstWord; 179 while (inst1_addr >= bound) { 180 inst1 = *(int *) inst1_addr; 181 if (is_addis(inst1) && inv_rt_field(inst1) == dst) { 182 // stop, found the addis which writes dst 183 break; 184 } 185 inst1_addr -= BytesPerInstWord; 186 } 187 188 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC"); 189 190 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0); 191 // -1 is a special case 192 if (offset == -1) { 193 return (address)(intptr_t)-1; 194 } else { 195 return global_toc() + offset; 196 } 197 } 198 199 #ifdef _LP64 200 // Patch compressed oops or klass constants. 201 // Assembler sequence is 202 // 1) compressed oops: 203 // lis rx = const.hi 204 // ori rx = rx | const.lo 205 // 2) compressed klass: 206 // lis rx = const.hi 207 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional 208 // ori rx = rx | const.lo 209 // Clrldi will be passed by. 210 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) { 211 assert(UseCompressedOops, "Should only patch compressed oops"); 212 213 const address inst2_addr = a; 214 const int inst2 = *(int *)inst2_addr; 215 216 // The relocation points to the second instruction, the ori, 217 // and the ori reads and writes the same register dst. 218 const int dst = inv_rta_field(inst2); 219 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 220 // Now, find the preceding addis which writes to dst. 221 int inst1 = 0; 222 address inst1_addr = inst2_addr - BytesPerInstWord; 223 bool inst1_found = false; 224 while (inst1_addr >= bound) { 225 inst1 = *(int *)inst1_addr; 226 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; } 227 inst1_addr -= BytesPerInstWord; 228 } 229 assert(inst1_found, "inst is not lis"); 230 231 uint32_t data_value = CompressedOops::narrow_oop_value(data); 232 int xc = (data_value >> 16) & 0xffff; 233 int xd = (data_value >> 0) & 0xffff; 234 235 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo 236 set_imm((int *)inst2_addr, (xd)); // unsigned int 237 return inst1_addr; 238 } 239 240 // Get compressed oop constant. 241 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) { 242 assert(UseCompressedOops, "Should only patch compressed oops"); 243 244 const address inst2_addr = a; 245 const int inst2 = *(int *)inst2_addr; 246 247 // The relocation points to the second instruction, the ori, 248 // and the ori reads and writes the same register dst. 249 const int dst = inv_rta_field(inst2); 250 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst"); 251 // Now, find the preceding lis which writes to dst. 252 int inst1 = 0; 253 address inst1_addr = inst2_addr - BytesPerInstWord; 254 bool inst1_found = false; 255 256 while (inst1_addr >= bound) { 257 inst1 = *(int *) inst1_addr; 258 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;} 259 inst1_addr -= BytesPerInstWord; 260 } 261 assert(inst1_found, "inst is not lis"); 262 263 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff)); 264 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16); 265 266 return CompressedOops::narrow_oop_cast(xl | xh); 267 } 268 #endif // _LP64 269 270 // Returns true if successful. 
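// Loads the constant given by AddressLiteral `a` from the method's TOC (base
// register `toc`) into `dst`. A constant-pool entry is allocated on the fly;
// `fixed_size` selects the fixed-length ld_largeoffset sequence so the emitted
// code size does not depend on the TOC offset.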
271 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a, 272 Register toc, bool fixed_size) { 273 int toc_offset = 0; 274 // Use RelocationHolder::none for the constant pool entry, otherwise 275 // we will end up with a failing NativeCall::verify(x) where x is 276 // the address of the constant pool entry. 277 // FIXME: We should insert relocation information for oops at the constant 278 // pool entries instead of inserting it at the loads; patching of a constant 279 // pool entry should be less expensive. 280 address const_address = address_constant((address)a.value(), RelocationHolder::none); 281 if (const_address == nullptr) { return false; } // allocation failure 282 // Relocate at the pc of the load. 283 relocate(a.rspec()); 284 toc_offset = (int)(const_address - code()->consts()->start()); 285 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size); 286 return true; 287 } 288 289 bool MacroAssembler::is_load_const_from_method_toc_at(address a) { 290 const address inst1_addr = a; 291 const int inst1 = *(int *)inst1_addr; 292 293 // The relocation points to the ld or the addis. 294 return (is_ld(inst1)) || 295 (is_addis(inst1) && inv_ra_field(inst1) != 0); 296 } 297 298 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) { 299 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc"); 300 301 const address inst1_addr = a; 302 const int inst1 = *(int *)inst1_addr; 303 304 if (is_ld(inst1)) { 305 return inv_d1_field(inst1); 306 } else if (is_addis(inst1)) { 307 const int dst = inv_rt_field(inst1); 308 309 // Now, find the succeeding ld which reads and writes to dst. 310 address inst2_addr = inst1_addr + BytesPerInstWord; 311 int inst2 = 0; 312 while (true) { 313 inst2 = *(int *) inst2_addr; 314 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) { 315 // Stop, found the ld which reads and writes dst. 316 break; 317 } 318 inst2_addr += BytesPerInstWord; 319 } 320 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2); 321 } 322 ShouldNotReachHere(); 323 return 0; 324 } 325 326 // Get the constant from a `load_const' sequence. 327 long MacroAssembler::get_const(address a) { 328 assert(is_load_const_at(a), "not a load of a constant"); 329 const int *p = (const int*) a; 330 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48); 331 if (is_ori(*(p+1))) { 332 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32); 333 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16); 334 x |= (((unsigned long) (get_imm(a,4) & 0xffff))); 335 } else if (is_lis(*(p+1))) { 336 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32); 337 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16); 338 x |= (((unsigned long) (get_imm(a,3) & 0xffff))); 339 } else { 340 ShouldNotReachHere(); 341 return (long) 0; 342 } 343 return (long) x; 344 } 345 346 // Patch the 64 bit constant of a `load_const' sequence. This is a low 347 // level procedure. It neither flushes the instruction cache nor is it 348 // mt safe. 
349 void MacroAssembler::patch_const(address a, long x) { 350 assert(is_load_const_at(a), "not a load of a constant"); 351 int *p = (int*) a; 352 if (is_ori(*(p+1))) { 353 set_imm(0 + p, (x >> 48) & 0xffff); 354 set_imm(1 + p, (x >> 32) & 0xffff); 355 set_imm(3 + p, (x >> 16) & 0xffff); 356 set_imm(4 + p, x & 0xffff); 357 } else if (is_lis(*(p+1))) { 358 set_imm(0 + p, (x >> 48) & 0xffff); 359 set_imm(2 + p, (x >> 32) & 0xffff); 360 set_imm(1 + p, (x >> 16) & 0xffff); 361 set_imm(3 + p, x & 0xffff); 362 } else { 363 ShouldNotReachHere(); 364 } 365 } 366 367 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) { 368 assert(oop_recorder() != nullptr, "this assembler needs a Recorder"); 369 int index = oop_recorder()->allocate_metadata_index(obj); 370 RelocationHolder rspec = metadata_Relocation::spec(index); 371 return AddressLiteral((address)obj, rspec); 372 } 373 374 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) { 375 assert(oop_recorder() != nullptr, "this assembler needs a Recorder"); 376 int index = oop_recorder()->find_index(obj); 377 RelocationHolder rspec = metadata_Relocation::spec(index); 378 return AddressLiteral((address)obj, rspec); 379 } 380 381 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) { 382 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 383 int oop_index = oop_recorder()->allocate_oop_index(obj); 384 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 385 } 386 387 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) { 388 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder"); 389 int oop_index = oop_recorder()->find_index(obj); 390 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index)); 391 } 392 393 #ifndef PRODUCT 394 void MacroAssembler::pd_print_patched_instruction(address branch) { 395 Unimplemented(); // TODO: PPC port 396 } 397 #endif // ndef PRODUCT 398 399 // Conditional far branch for destinations encodable in 24+2 bits. 400 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) { 401 402 // If requested by flag optimize, relocate the bc_far as a 403 // runtime_call and prepare for optimizing it when the code gets 404 // relocated. 405 if (optimize == bc_far_optimize_on_relocate) { 406 relocate(relocInfo::runtime_call_type); 407 } 408 409 // variant 2: 410 // 411 // b!cxx SKIP 412 // bxx DEST 413 // SKIP: 414 // 415 416 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 417 opposite_bcond(inv_boint_bcond(boint))); 418 419 // We emit two branches. 420 // First, a conditional branch which jumps around the far branch. 421 const address not_taken_pc = pc() + 2 * BytesPerInstWord; 422 const address bc_pc = pc(); 423 bc(opposite_boint, biint, not_taken_pc); 424 425 const int bc_instr = *(int*)bc_pc; 426 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition"); 427 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition"); 428 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))), 429 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))), 430 "postcondition"); 431 assert(biint == inv_bi_field(bc_instr), "postcondition"); 432 433 // Second, an unconditional far branch which jumps to dest. 
434 // Note: target(dest) remembers the current pc (see CodeSection::target) 435 // and returns the current pc if the label is not bound yet; when 436 // the label gets bound, the unconditional far branch will be patched. 437 const address target_pc = target(dest); 438 const address b_pc = pc(); 439 b(target_pc); 440 441 assert(not_taken_pc == pc(), "postcondition"); 442 assert(dest.is_bound() || target_pc == b_pc, "postcondition"); 443 } 444 445 // 1 or 2 instructions 446 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) { 447 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) { 448 bc(boint, biint, dest); 449 } else { 450 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate); 451 } 452 } 453 454 bool MacroAssembler::is_bc_far_at(address instruction_addr) { 455 return is_bc_far_variant1_at(instruction_addr) || 456 is_bc_far_variant2_at(instruction_addr) || 457 is_bc_far_variant3_at(instruction_addr); 458 } 459 460 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) { 461 if (is_bc_far_variant1_at(instruction_addr)) { 462 const address instruction_1_addr = instruction_addr; 463 const int instruction_1 = *(int*)instruction_1_addr; 464 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr); 465 } else if (is_bc_far_variant2_at(instruction_addr)) { 466 const address instruction_2_addr = instruction_addr + 4; 467 return bxx_destination(instruction_2_addr); 468 } else if (is_bc_far_variant3_at(instruction_addr)) { 469 return instruction_addr + 8; 470 } 471 // variant 4 ??? 472 ShouldNotReachHere(); 473 return nullptr; 474 } 475 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) { 476 477 if (is_bc_far_variant3_at(instruction_addr)) { 478 // variant 3, far cond branch to the next instruction, already patched to nops: 479 // 480 // nop 481 // endgroup 482 // SKIP/DEST: 483 // 484 return; 485 } 486 487 // first, extract boint and biint from the current branch 488 int boint = 0; 489 int biint = 0; 490 491 ResourceMark rm; 492 const int code_size = 2 * BytesPerInstWord; 493 CodeBuffer buf(instruction_addr, code_size); 494 MacroAssembler masm(&buf); 495 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) { 496 // Far branch to next instruction: Optimize it by patching nops (produce variant 3). 497 masm.nop(); 498 masm.endgroup(); 499 } else { 500 if (is_bc_far_variant1_at(instruction_addr)) { 501 // variant 1, the 1st instruction contains the destination address: 502 // 503 // bcxx DEST 504 // nop 505 // 506 const int instruction_1 = *(int*)(instruction_addr); 507 boint = inv_bo_field(instruction_1); 508 biint = inv_bi_field(instruction_1); 509 } else if (is_bc_far_variant2_at(instruction_addr)) { 510 // variant 2, the 2nd instruction contains the destination address: 511 // 512 // b!cxx SKIP 513 // bxx DEST 514 // SKIP: 515 // 516 const int instruction_1 = *(int*)(instruction_addr); 517 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))), 518 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1)))); 519 biint = inv_bi_field(instruction_1); 520 } else { 521 // variant 4??? 522 ShouldNotReachHere(); 523 } 524 525 // second, set the new branch destination and optimize the code 526 if (dest != instruction_addr + 4 && // the bc_far is still unbound! 
527 masm.is_within_range_of_bcxx(dest, instruction_addr)) { 528 // variant 1: 529 // 530 // bcxx DEST 531 // nop 532 // 533 masm.bc(boint, biint, dest); 534 masm.nop(); 535 } else { 536 // variant 2: 537 // 538 // b!cxx SKIP 539 // bxx DEST 540 // SKIP: 541 // 542 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)), 543 opposite_bcond(inv_boint_bcond(boint))); 544 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord; 545 masm.bc(opposite_boint, biint, not_taken_pc); 546 masm.b(dest); 547 } 548 } 549 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 550 } 551 552 // Emit a NOT mt-safe patchable 64 bit absolute call/jump. 553 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) { 554 // get current pc 555 uint64_t start_pc = (uint64_t) pc(); 556 557 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last 558 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first 559 560 // relocate here 561 if (rt != relocInfo::none) { 562 relocate(rt); 563 } 564 565 if ( ReoptimizeCallSequences && 566 (( link && is_within_range_of_b(dest, pc_of_bl)) || 567 (!link && is_within_range_of_b(dest, pc_of_b)))) { 568 // variant 2: 569 // Emit an optimized, pc-relative call/jump. 570 571 if (link) { 572 // some padding 573 nop(); 574 nop(); 575 nop(); 576 nop(); 577 nop(); 578 nop(); 579 580 // do the call 581 assert(pc() == pc_of_bl, "just checking"); 582 bl(dest, relocInfo::none); 583 } else { 584 // do the jump 585 assert(pc() == pc_of_b, "just checking"); 586 b(dest, relocInfo::none); 587 588 // some padding 589 nop(); 590 nop(); 591 nop(); 592 nop(); 593 nop(); 594 nop(); 595 } 596 597 // Assert that we can identify the emitted call/jump. 598 assert(is_bxx64_patchable_variant2_at((address)start_pc, link), 599 "can't identify emitted call"); 600 } else { 601 // variant 1: 602 mr(R0, R11); // spill R11 -> R0. 603 604 // Load the destination address into CTR, 605 // calculate destination relative to global toc. 606 calculate_address_from_global_toc(R11, dest, true, true, false); 607 608 mtctr(R11); 609 mr(R11, R0); // spill R11 <- R0. 610 nop(); 611 612 // do the call/jump 613 if (link) { 614 bctrl(); 615 } else{ 616 bctr(); 617 } 618 // Assert that we can identify the emitted call/jump. 619 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link), 620 "can't identify emitted call"); 621 } 622 623 // Assert that we can identify the emitted call/jump. 624 assert(is_bxx64_patchable_at((address)start_pc, link), 625 "can't identify emitted call"); 626 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest, 627 "wrong encoding of dest address"); 628 } 629 630 // Identify a bxx64_patchable instruction. 631 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) { 632 return is_bxx64_patchable_variant1b_at(instruction_addr, link) 633 //|| is_bxx64_patchable_variant1_at(instruction_addr, link) 634 || is_bxx64_patchable_variant2_at(instruction_addr, link); 635 } 636 637 // Does the call64_patchable instruction use a pc-relative encoding of 638 // the call destination? 639 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) { 640 // variant 2 is pc-relative 641 return is_bxx64_patchable_variant2_at(instruction_addr, link); 642 } 643 644 // Identify variant 1. 
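// Variant 1 is the sequence emitted via load_const: instr[0..4] materialize the
// 64-bit destination (see get_const() above), instr[5] is the mtctr and
// instr[6] the bctr (bctrl when linking), which is exactly what is checked below.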
645 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) { 646 unsigned int* instr = (unsigned int*) instruction_addr; 647 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 648 && is_mtctr(instr[5]) // mtctr 649 && is_load_const_at(instruction_addr); 650 } 651 652 // Identify variant 1b: load destination relative to global toc. 653 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) { 654 unsigned int* instr = (unsigned int*) instruction_addr; 655 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l] 656 && is_mtctr(instr[3]) // mtctr 657 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr); 658 } 659 660 // Identify variant 2. 661 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) { 662 unsigned int* instr = (unsigned int*) instruction_addr; 663 if (link) { 664 return is_bl (instr[6]) // bl dest is last 665 && is_nop(instr[0]) // nop 666 && is_nop(instr[1]) // nop 667 && is_nop(instr[2]) // nop 668 && is_nop(instr[3]) // nop 669 && is_nop(instr[4]) // nop 670 && is_nop(instr[5]); // nop 671 } else { 672 return is_b (instr[0]) // b dest is first 673 && is_nop(instr[1]) // nop 674 && is_nop(instr[2]) // nop 675 && is_nop(instr[3]) // nop 676 && is_nop(instr[4]) // nop 677 && is_nop(instr[5]) // nop 678 && is_nop(instr[6]); // nop 679 } 680 } 681 682 // Set dest address of a bxx64_patchable instruction. 683 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) { 684 ResourceMark rm; 685 int code_size = MacroAssembler::bxx64_patchable_size; 686 CodeBuffer buf(instruction_addr, code_size); 687 MacroAssembler masm(&buf); 688 masm.bxx64_patchable(dest, relocInfo::none, link); 689 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size); 690 } 691 692 // Get dest address of a bxx64_patchable instruction. 693 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) { 694 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) { 695 return (address) (unsigned long) get_const(instruction_addr); 696 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) { 697 unsigned int* instr = (unsigned int*) instruction_addr; 698 if (link) { 699 const int instr_idx = 6; // bl is last 700 int branchoffset = branch_destination(instr[instr_idx], 0); 701 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 702 } else { 703 const int instr_idx = 0; // b is first 704 int branchoffset = branch_destination(instr[instr_idx], 0); 705 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord; 706 } 707 // Load dest relative to global toc. 
708 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) { 709 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, 710 instruction_addr); 711 } else { 712 ShouldNotReachHere(); 713 return nullptr; 714 } 715 } 716 717 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) { 718 const int magic_number = 0x42; 719 720 // Preserve stack pointer register (R1_SP) and system thread id register (R13); 721 // although they're technically volatile 722 for (int i = 2; i < 13; i++) { 723 Register reg = as_Register(i); 724 if (reg == excluded_register) { 725 continue; 726 } 727 728 li(reg, magic_number); 729 } 730 } 731 732 void MacroAssembler::clobber_carg_stack_slots(Register tmp) { 733 const int magic_number = 0x43; 734 735 li(tmp, magic_number); 736 for (int m = 0; m <= 7; m++) { 737 std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP); 738 } 739 } 740 741 // Uses ordering which corresponds to ABI: 742 // _savegpr0_14: std r14,-144(r1) 743 // _savegpr0_15: std r15,-136(r1) 744 // _savegpr0_16: std r16,-128(r1) 745 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) { 746 std(R14, offset, dst); offset += 8; 747 std(R15, offset, dst); offset += 8; 748 std(R16, offset, dst); offset += 8; 749 std(R17, offset, dst); offset += 8; 750 std(R18, offset, dst); offset += 8; 751 std(R19, offset, dst); offset += 8; 752 std(R20, offset, dst); offset += 8; 753 std(R21, offset, dst); offset += 8; 754 std(R22, offset, dst); offset += 8; 755 std(R23, offset, dst); offset += 8; 756 std(R24, offset, dst); offset += 8; 757 std(R25, offset, dst); offset += 8; 758 std(R26, offset, dst); offset += 8; 759 std(R27, offset, dst); offset += 8; 760 std(R28, offset, dst); offset += 8; 761 std(R29, offset, dst); offset += 8; 762 std(R30, offset, dst); offset += 8; 763 std(R31, offset, dst); offset += 8; 764 765 stfd(F14, offset, dst); offset += 8; 766 stfd(F15, offset, dst); offset += 8; 767 stfd(F16, offset, dst); offset += 8; 768 stfd(F17, offset, dst); offset += 8; 769 stfd(F18, offset, dst); offset += 8; 770 stfd(F19, offset, dst); offset += 8; 771 stfd(F20, offset, dst); offset += 8; 772 stfd(F21, offset, dst); offset += 8; 773 stfd(F22, offset, dst); offset += 8; 774 stfd(F23, offset, dst); offset += 8; 775 stfd(F24, offset, dst); offset += 8; 776 stfd(F25, offset, dst); offset += 8; 777 stfd(F26, offset, dst); offset += 8; 778 stfd(F27, offset, dst); offset += 8; 779 stfd(F28, offset, dst); offset += 8; 780 stfd(F29, offset, dst); offset += 8; 781 stfd(F30, offset, dst); offset += 8; 782 stfd(F31, offset, dst); 783 } 784 785 // Uses ordering which corresponds to ABI: 786 // _restgpr0_14: ld r14,-144(r1) 787 // _restgpr0_15: ld r15,-136(r1) 788 // _restgpr0_16: ld r16,-128(r1) 789 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) { 790 ld(R14, offset, src); offset += 8; 791 ld(R15, offset, src); offset += 8; 792 ld(R16, offset, src); offset += 8; 793 ld(R17, offset, src); offset += 8; 794 ld(R18, offset, src); offset += 8; 795 ld(R19, offset, src); offset += 8; 796 ld(R20, offset, src); offset += 8; 797 ld(R21, offset, src); offset += 8; 798 ld(R22, offset, src); offset += 8; 799 ld(R23, offset, src); offset += 8; 800 ld(R24, offset, src); offset += 8; 801 ld(R25, offset, src); offset += 8; 802 ld(R26, offset, src); offset += 8; 803 ld(R27, offset, src); offset += 8; 804 ld(R28, offset, src); offset += 8; 805 ld(R29, offset, src); offset += 8; 806 ld(R30, offset, src); offset += 8; 807 ld(R31, offset, 
src); offset += 8; 808 809 // FP registers 810 lfd(F14, offset, src); offset += 8; 811 lfd(F15, offset, src); offset += 8; 812 lfd(F16, offset, src); offset += 8; 813 lfd(F17, offset, src); offset += 8; 814 lfd(F18, offset, src); offset += 8; 815 lfd(F19, offset, src); offset += 8; 816 lfd(F20, offset, src); offset += 8; 817 lfd(F21, offset, src); offset += 8; 818 lfd(F22, offset, src); offset += 8; 819 lfd(F23, offset, src); offset += 8; 820 lfd(F24, offset, src); offset += 8; 821 lfd(F25, offset, src); offset += 8; 822 lfd(F26, offset, src); offset += 8; 823 lfd(F27, offset, src); offset += 8; 824 lfd(F28, offset, src); offset += 8; 825 lfd(F29, offset, src); offset += 8; 826 lfd(F30, offset, src); offset += 8; 827 lfd(F31, offset, src); 828 } 829 830 // For verify_oops. 831 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 832 std(R2, offset, dst); offset += 8; 833 if (include_R3_RET_reg) { 834 std(R3, offset, dst); offset += 8; 835 } 836 std(R4, offset, dst); offset += 8; 837 std(R5, offset, dst); offset += 8; 838 std(R6, offset, dst); offset += 8; 839 std(R7, offset, dst); offset += 8; 840 std(R8, offset, dst); offset += 8; 841 std(R9, offset, dst); offset += 8; 842 std(R10, offset, dst); offset += 8; 843 std(R11, offset, dst); offset += 8; 844 std(R12, offset, dst); offset += 8; 845 846 if (include_fp_regs) { 847 stfd(F0, offset, dst); offset += 8; 848 stfd(F1, offset, dst); offset += 8; 849 stfd(F2, offset, dst); offset += 8; 850 stfd(F3, offset, dst); offset += 8; 851 stfd(F4, offset, dst); offset += 8; 852 stfd(F5, offset, dst); offset += 8; 853 stfd(F6, offset, dst); offset += 8; 854 stfd(F7, offset, dst); offset += 8; 855 stfd(F8, offset, dst); offset += 8; 856 stfd(F9, offset, dst); offset += 8; 857 stfd(F10, offset, dst); offset += 8; 858 stfd(F11, offset, dst); offset += 8; 859 stfd(F12, offset, dst); offset += 8; 860 stfd(F13, offset, dst); 861 } 862 } 863 864 // For verify_oops. 865 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) { 866 ld(R2, offset, src); offset += 8; 867 if (include_R3_RET_reg) { 868 ld(R3, offset, src); offset += 8; 869 } 870 ld(R4, offset, src); offset += 8; 871 ld(R5, offset, src); offset += 8; 872 ld(R6, offset, src); offset += 8; 873 ld(R7, offset, src); offset += 8; 874 ld(R8, offset, src); offset += 8; 875 ld(R9, offset, src); offset += 8; 876 ld(R10, offset, src); offset += 8; 877 ld(R11, offset, src); offset += 8; 878 ld(R12, offset, src); offset += 8; 879 880 if (include_fp_regs) { 881 lfd(F0, offset, src); offset += 8; 882 lfd(F1, offset, src); offset += 8; 883 lfd(F2, offset, src); offset += 8; 884 lfd(F3, offset, src); offset += 8; 885 lfd(F4, offset, src); offset += 8; 886 lfd(F5, offset, src); offset += 8; 887 lfd(F6, offset, src); offset += 8; 888 lfd(F7, offset, src); offset += 8; 889 lfd(F8, offset, src); offset += 8; 890 lfd(F9, offset, src); offset += 8; 891 lfd(F10, offset, src); offset += 8; 892 lfd(F11, offset, src); offset += 8; 893 lfd(F12, offset, src); offset += 8; 894 lfd(F13, offset, src); 895 } 896 } 897 898 void MacroAssembler::save_LR_CR(Register tmp) { 899 mfcr(tmp); 900 std(tmp, _abi0(cr), R1_SP); 901 mflr(tmp); 902 std(tmp, _abi0(lr), R1_SP); 903 // Tmp must contain lr on exit! 
// (see return_addr and prolog in ppc64.ad)
}

void MacroAssembler::restore_LR_CR(Register tmp) {
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi0(lr), R1_SP);
  mtlr(tmp);
  ld(tmp, _abi0(cr), R1_SP);
  mtcr(tmp);
}

address MacroAssembler::get_PC_trash_LR(Register result) {
  Label L;
  bl(L);
  bind(L);
  address lr_pc = pc();
  mflr(result);
  return lr_pc;
}

void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}

void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1)) == 0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}

void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // Compute offset w.r.t. current stack pointer.
  // tmp1 <- addr - SP (!)
  subf(tmp1, R1_SP, addr);

  // Atomically update SP keeping back link.
  resize_frame(tmp1 /* offset */, tmp2 /* tmp */);
}

void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  neg(tmp, bytes);
  stdux(R1_SP, R1_SP, tmp);
}

// Push a frame of size `bytes'.
void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
  long offset = align_addr(bytes, frame::alignment_in_bytes);
  if (is_simm(-offset, 16)) {
    stdu(R1_SP, -offset, R1_SP);
  } else {
    load_const_optimized(tmp, -offset);
    stdux(R1_SP, R1_SP, tmp);
  }
}

// Push a frame of size `bytes' plus native_abi_reg_args on top.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size, tmp);
}

// Set up a new C frame with a spill area for non-volatile GPRs and
// additional space for local variables.
void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
                                                      Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
}

// Pop current C frame.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}

#if defined(ABI_ELFv2)
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the time.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
1009 if (and_link) { 1010 bctrl(); 1011 } else { 1012 bctr(); 1013 } 1014 _last_calls_return_pc = pc(); 1015 1016 return _last_calls_return_pc; 1017 } 1018 1019 // Call a C function via a function descriptor and use full C 1020 // calling conventions. Updates and returns _last_calls_return_pc. 1021 address MacroAssembler::call_c(Register r_function_entry) { 1022 return branch_to(r_function_entry, /*and_link=*/true); 1023 } 1024 1025 // For tail calls: only branch, don't link, so callee returns to caller of this function. 1026 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) { 1027 return branch_to(r_function_entry, /*and_link=*/false); 1028 } 1029 1030 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) { 1031 load_const(R12, function_entry, R0); 1032 return branch_to(R12, /*and_link=*/true); 1033 } 1034 1035 #else 1036 // Generic version of a call to C function via a function descriptor 1037 // with variable support for C calling conventions (TOC, ENV, etc.). 1038 // Updates and returns _last_calls_return_pc. 1039 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call, 1040 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) { 1041 // we emit standard ptrgl glue code here 1042 assert((function_descriptor != R0), "function_descriptor cannot be R0"); 1043 1044 // retrieve necessary entries from the function descriptor 1045 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor); 1046 mtctr(R0); 1047 1048 if (load_toc_of_callee) { 1049 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor); 1050 } 1051 if (load_env_of_callee) { 1052 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor); 1053 } else if (load_toc_of_callee) { 1054 li(R11, 0); 1055 } 1056 1057 // do a call or a branch 1058 if (and_link) { 1059 bctrl(); 1060 } else { 1061 bctr(); 1062 } 1063 _last_calls_return_pc = pc(); 1064 1065 return _last_calls_return_pc; 1066 } 1067 1068 // Call a C function via a function descriptor and use full C calling 1069 // conventions. 1070 // We don't use the TOC in generated code, so there is no need to save 1071 // and restore its value. 1072 address MacroAssembler::call_c(Register fd) { 1073 return branch_to(fd, /*and_link=*/true, 1074 /*save toc=*/false, 1075 /*restore toc=*/false, 1076 /*load toc=*/true, 1077 /*load env=*/true); 1078 } 1079 1080 address MacroAssembler::call_c_and_return_to_caller(Register fd) { 1081 return branch_to(fd, /*and_link=*/false, 1082 /*save toc=*/false, 1083 /*restore toc=*/false, 1084 /*load toc=*/true, 1085 /*load env=*/true); 1086 } 1087 1088 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) { 1089 if (rt != relocInfo::none) { 1090 // this call needs to be relocatable 1091 if (!ReoptimizeCallSequences 1092 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1093 || fd == nullptr // support code-size estimation 1094 || !fd->is_friend_function() 1095 || fd->entry() == nullptr) { 1096 // it's not a friend function as defined by class FunctionDescriptor, 1097 // so do a full call-c here. 1098 load_const(R11, (address)fd, R0); 1099 1100 bool has_env = (fd != nullptr && fd->env() != nullptr); 1101 return branch_to(R11, /*and_link=*/true, 1102 /*save toc=*/false, 1103 /*restore toc=*/false, 1104 /*load toc=*/true, 1105 /*load env=*/has_env); 1106 } else { 1107 // It's a friend function. 
Load the entry point and don't care about 1108 // toc and env. Use an optimizable call instruction, but ensure the 1109 // same code-size as in the case of a non-friend function. 1110 nop(); 1111 nop(); 1112 nop(); 1113 bl64_patchable(fd->entry(), rt); 1114 _last_calls_return_pc = pc(); 1115 return _last_calls_return_pc; 1116 } 1117 } else { 1118 // This call does not need to be relocatable, do more aggressive 1119 // optimizations. 1120 if (!ReoptimizeCallSequences 1121 || !fd->is_friend_function()) { 1122 // It's not a friend function as defined by class FunctionDescriptor, 1123 // so do a full call-c here. 1124 load_const(R11, (address)fd, R0); 1125 return branch_to(R11, /*and_link=*/true, 1126 /*save toc=*/false, 1127 /*restore toc=*/false, 1128 /*load toc=*/true, 1129 /*load env=*/true); 1130 } else { 1131 // it's a friend function, load the entry point and don't care about 1132 // toc and env. 1133 address dest = fd->entry(); 1134 if (is_within_range_of_b(dest, pc())) { 1135 bl(dest); 1136 } else { 1137 bl64_patchable(dest, rt); 1138 } 1139 _last_calls_return_pc = pc(); 1140 return _last_calls_return_pc; 1141 } 1142 } 1143 } 1144 1145 // Call a C function. All constants needed reside in TOC. 1146 // 1147 // Read the address to call from the TOC. 1148 // Read env from TOC, if fd specifies an env. 1149 // Read new TOC from TOC. 1150 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd, 1151 relocInfo::relocType rt, Register toc) { 1152 if (!ReoptimizeCallSequences 1153 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none) 1154 || !fd->is_friend_function()) { 1155 // It's not a friend function as defined by class FunctionDescriptor, 1156 // so do a full call-c here. 1157 assert(fd->entry() != nullptr, "function must be linked"); 1158 1159 AddressLiteral fd_entry(fd->entry()); 1160 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true); 1161 mtctr(R11); 1162 if (fd->env() == nullptr) { 1163 li(R11, 0); 1164 nop(); 1165 } else { 1166 AddressLiteral fd_env(fd->env()); 1167 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true); 1168 } 1169 AddressLiteral fd_toc(fd->toc()); 1170 // Set R2_TOC (load from toc) 1171 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true); 1172 bctrl(); 1173 _last_calls_return_pc = pc(); 1174 if (!success) { return nullptr; } 1175 } else { 1176 // It's a friend function, load the entry point and don't care about 1177 // toc and env. Use an optimizable call instruction, but ensure the 1178 // same code-size as in the case of a non-friend function. 1179 nop(); 1180 bl64_patchable(fd->entry(), rt); 1181 _last_calls_return_pc = pc(); 1182 } 1183 return _last_calls_return_pc; 1184 } 1185 #endif // ABI_ELFv2 1186 1187 void MacroAssembler::post_call_nop() { 1188 // Make inline again when loom is always enabled. 1189 if (!Continuations::enabled()) { 1190 return; 1191 } 1192 // We use CMPI/CMPLI instructions to encode post call nops. 1193 // Refer to NativePostCallNop for details. 
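  // The cmpli emitted below never has its comparison result consumed, so it
  // executes as a no-op at the post-call site.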
  relocate(post_call_nop_Relocation::spec());
  InlineSkippedInstructionsCounter skipCounter(this);
  Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
  assert(is_post_call_nop(*(int*)(pc() - 4)), "post call nop not found");
}

int MacroAssembler::ic_check_size() {
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;

  int num_ins;
  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    num_ins = 3;
    if (use_trap_based_null_check) num_ins += 1;
  } else {
    num_ins = 7;
    if (!implicit_null_checks_available) num_ins += 2;
  }
  return num_ins * BytesPerInstWord;
}

int MacroAssembler::ic_check(int end_alignment) {
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;

  Register receiver = R3_ARG1;
  Register data = R19_inline_cache_reg;
  Register tmp1 = R11_scratch1;
  Register tmp2 = R12_scratch2;

  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
  // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after.
  align(end_alignment, end_alignment, end_alignment - ic_check_size());

  int uep_offset = offset();

  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    // Fast version which uses SIGTRAP.

    if (use_trap_based_null_check) {
      trap_null_check(receiver);
    }
    if (UseCompressedClassPointers) {
      lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    } else {
      ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    }
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    trap_ic_miss_check(tmp1, tmp2);

  } else {
    // Slower version which doesn't use SIGTRAP.

    // Load stub address using toc (fixed instruction size, unlike load_const_optimized).
    calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
                                      true, true, false); // 2 instructions
    mtctr(tmp1);

    if (!implicit_null_checks_available) {
      cmpdi(CCR0, receiver, 0);
      beqctr(CCR0);
    }
    if (UseCompressedClassPointers) {
      lwz(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    } else {
      ld(tmp1, oopDesc::klass_offset_in_bytes(), receiver);
    }
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    cmpd(CCR0, tmp1, tmp2);
    bnectr(CCR0);
  }

  assert((offset() % end_alignment) == 0, "Misaligned verified entry point");

  return uep_offset;
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
1281 if (!last_java_sp->is_valid()) { 1282 last_java_sp = R1_SP; 1283 } 1284 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1); 1285 1286 // ARG1 must hold thread address. 1287 mr(R3_ARG1, R16_thread); 1288 #if defined(ABI_ELFv2) 1289 address return_pc = call_c(entry_point, relocInfo::none); 1290 #else 1291 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none); 1292 #endif 1293 1294 reset_last_Java_frame(); 1295 1296 // Check for pending exceptions. 1297 if (check_exceptions) { 1298 // We don't check for exceptions here. 1299 ShouldNotReachHere(); 1300 } 1301 1302 // Get oop result if there is one and reset the value in the thread. 1303 if (oop_result->is_valid()) { 1304 get_vm_result(oop_result); 1305 } 1306 1307 _last_calls_return_pc = return_pc; 1308 BLOCK_COMMENT("} call_VM"); 1309 } 1310 1311 void MacroAssembler::call_VM_leaf_base(address entry_point) { 1312 BLOCK_COMMENT("call_VM_leaf {"); 1313 #if defined(ABI_ELFv2) 1314 call_c(entry_point, relocInfo::none); 1315 #else 1316 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none); 1317 #endif 1318 BLOCK_COMMENT("} call_VM_leaf"); 1319 } 1320 1321 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) { 1322 call_VM_base(oop_result, noreg, entry_point, check_exceptions); 1323 } 1324 1325 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, 1326 bool check_exceptions) { 1327 // R3_ARG1 is reserved for the thread. 1328 mr_if_needed(R4_ARG2, arg_1); 1329 call_VM(oop_result, entry_point, check_exceptions); 1330 } 1331 1332 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, 1333 bool check_exceptions) { 1334 // R3_ARG1 is reserved for the thread 1335 assert_different_registers(arg_2, R4_ARG2); 1336 mr_if_needed(R4_ARG2, arg_1); 1337 mr_if_needed(R5_ARG3, arg_2); 1338 call_VM(oop_result, entry_point, check_exceptions); 1339 } 1340 1341 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, 1342 bool check_exceptions) { 1343 // R3_ARG1 is reserved for the thread 1344 assert_different_registers(arg_2, R4_ARG2); 1345 assert_different_registers(arg_3, R4_ARG2, R5_ARG3); 1346 mr_if_needed(R4_ARG2, arg_1); 1347 mr_if_needed(R5_ARG3, arg_2); 1348 mr_if_needed(R6_ARG4, arg_3); 1349 call_VM(oop_result, entry_point, check_exceptions); 1350 } 1351 1352 void MacroAssembler::call_VM_leaf(address entry_point) { 1353 call_VM_leaf_base(entry_point); 1354 } 1355 1356 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) { 1357 mr_if_needed(R3_ARG1, arg_1); 1358 call_VM_leaf(entry_point); 1359 } 1360 1361 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) { 1362 assert_different_registers(arg_2, R3_ARG1); 1363 mr_if_needed(R3_ARG1, arg_1); 1364 mr_if_needed(R4_ARG2, arg_2); 1365 call_VM_leaf(entry_point); 1366 } 1367 1368 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) { 1369 assert_different_registers(arg_2, R3_ARG1); 1370 assert_different_registers(arg_3, R3_ARG1, R4_ARG2); 1371 mr_if_needed(R3_ARG1, arg_1); 1372 mr_if_needed(R4_ARG2, arg_2); 1373 mr_if_needed(R5_ARG3, arg_3); 1374 call_VM_leaf(entry_point); 1375 } 1376 1377 // Check whether instruction is a read access to the polling page 1378 // which was emitted by load_from_polling_page(..). 
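// The expected shape is `ld R0, 0(Ra)` with Ra != R0. Without a ucontext only
// the instruction form can be checked; with one, Ra is read from the context
// and the effective address is checked against the polling page.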
bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
                                               address* polling_address_ptr) {
  if (!is_ld(instruction))
    return false; // It's not a ld. Fail.

  int rt = inv_rt_field(instruction);
  int ra = inv_ra_field(instruction);
  int ds = inv_ds_field(instruction);
  if (!(ds == 0 && ra != 0 && rt == 0)) {
    return false; // It's not a ld(r0, X, ra). Fail.
  }

  if (!ucontext) {
    // Set polling address.
    if (polling_address_ptr != nullptr) {
      *polling_address_ptr = nullptr;
    }
    return true; // No ucontext given. Can't check value of ra. Assume true.
  }

#ifdef LINUX
  // Ucontext given. Check that register ra contains the address of
  // the safepoint polling page.
  ucontext_t* uc = (ucontext_t*) ucontext;
  // Set polling address.
  address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
  if (polling_address_ptr != nullptr) {
    *polling_address_ptr = addr;
  }
  return SafepointMechanism::is_poll_address(addr);
#else
  // Not on Linux, ucontext must be null.
  ShouldNotReachHere();
  return false;
#endif
}

void MacroAssembler::bang_stack_with_offset(int offset) {
  // When increasing the stack, the old stack pointer will be written
  // to the new top of stack according to the PPC64 ABI.
  // Therefore, stack banging is not necessary when increasing
  // the stack by <= os::vm_page_size() bytes.
  // When increasing the stack by a larger amount, this method is
  // called repeatedly to bang the intermediate pages.

  // Stack grows down, caller passes positive offset.
  assert(offset > 0, "must bang with positive offset");

  long stdoffset = -offset;

  if (is_simm(stdoffset, 16)) {
    // Signed 16-bit offset, a simple std is ok.
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, (int)(signed short)stdoffset, R1_SP);
    } else {
      std(R0, (int)(signed short)stdoffset, R1_SP);
    }
  } else if (is_simm(stdoffset, 31)) {
    const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
    const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);

    Register tmp = R11;
    addis(tmp, R1_SP, hi);
    if (UseLoadInstructionsForStackBangingPPC64) {
      ld(R0, lo, tmp);
    } else {
      std(R0, lo, tmp);
    }
  } else {
    ShouldNotReachHere();
  }
}

// If instruction is a stack bang of the form
//   std   R0, x(Ry)          (see bang_stack_with_offset())
//   stdu  R1_SP, x(R1_SP)    (see push_frame(), resize_frame())
//   stdux R1_SP, Rx, R1_SP   (see push_frame(), resize_frame())
// return the banged address. Otherwise, return nullptr.
1457 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) { 1458 #ifdef LINUX 1459 ucontext_t* uc = (ucontext_t*) ucontext; 1460 int rs = inv_rs_field(instruction); 1461 int ra = inv_ra_field(instruction); 1462 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64) 1463 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64) 1464 || (is_stdu(instruction) && rs == 1)) { 1465 int ds = inv_ds_field(instruction); 1466 // return banged address 1467 return ds+(address)uc->uc_mcontext.regs->gpr[ra]; 1468 } else if (is_stdux(instruction) && rs == 1) { 1469 int rb = inv_rb_field(instruction); 1470 address sp = (address)uc->uc_mcontext.regs->gpr[1]; 1471 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb]; 1472 return ra != 1 || rb_val >= 0 ? nullptr // not a stack bang 1473 : sp + rb_val; // banged address 1474 } 1475 return nullptr; // not a stack bang 1476 #else 1477 // workaround not needed on !LINUX :-) 1478 ShouldNotCallThis(); 1479 return nullptr; 1480 #endif 1481 } 1482 1483 void MacroAssembler::reserved_stack_check(Register return_pc) { 1484 // Test if reserved zone needs to be enabled. 1485 Label no_reserved_zone_enabling; 1486 1487 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread); 1488 cmpld(CCR0, R1_SP, R0); 1489 blt_predict_taken(CCR0, no_reserved_zone_enabling); 1490 1491 // Enable reserved zone again, throw stack overflow exception. 1492 push_frame_reg_args(0, R0); 1493 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread); 1494 pop_frame(); 1495 mtlr(return_pc); 1496 load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry()); 1497 mtctr(R0); 1498 bctr(); 1499 1500 should_not_reach_here(); 1501 1502 bind(no_reserved_zone_enabling); 1503 } 1504 1505 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base, 1506 bool cmpxchgx_hint) { 1507 Label retry; 1508 bind(retry); 1509 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1510 stdcx_(exchange_value, addr_base); 1511 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1512 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1513 } else { 1514 bne( CCR0, retry); // StXcx_ sets CCR0. 1515 } 1516 } 1517 1518 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base, 1519 Register tmp, bool cmpxchgx_hint) { 1520 Label retry; 1521 bind(retry); 1522 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1523 add(tmp, dest_current_value, inc_value); 1524 stdcx_(tmp, addr_base); 1525 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1526 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1527 } else { 1528 bne( CCR0, retry); // StXcx_ sets CCR0. 1529 } 1530 } 1531 1532 // Word/sub-word atomic helper functions 1533 1534 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions. 1535 // Only signed types are supported with size < 4. 1536 // Atomic add always kills tmp1. 1537 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value, 1538 Register addr_base, Register tmp1, Register tmp2, Register tmp3, 1539 bool cmpxchgx_hint, bool is_add, int size) { 1540 // Sub-word instructions are available since Power 8. 1541 // For older processors, instruction_type != size holds, and we 1542 // emulate the sub-word instructions by constructing a 4-byte value 1543 // that leaves the other bytes unchanged. 
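  // The emulation operates on the enclosing 4-byte word: the narrow operand is
  // shifted to its position within that word and merged back with xor, so the
  // bytes outside the operand are stored back unchanged.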
1544 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1545 1546 Label retry; 1547 Register shift_amount = noreg, 1548 val32 = dest_current_value, 1549 modval = is_add ? tmp1 : exchange_value; 1550 1551 if (instruction_type != size) { 1552 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base); 1553 modval = tmp1; 1554 shift_amount = tmp2; 1555 val32 = tmp3; 1556 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned. 1557 #ifdef VM_LITTLE_ENDIAN 1558 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1559 clrrdi(addr_base, addr_base, 2); 1560 #else 1561 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1562 clrrdi(addr_base, addr_base, 2); 1563 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1564 #endif 1565 } 1566 1567 // atomic emulation loop 1568 bind(retry); 1569 1570 switch (instruction_type) { 1571 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1572 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1573 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1574 default: ShouldNotReachHere(); 1575 } 1576 1577 if (instruction_type != size) { 1578 srw(dest_current_value, val32, shift_amount); 1579 } 1580 1581 if (is_add) { add(modval, dest_current_value, exchange_value); } 1582 1583 if (instruction_type != size) { 1584 // Transform exchange value such that the replacement can be done by one xor instruction. 1585 xorr(modval, dest_current_value, is_add ? modval : exchange_value); 1586 clrldi(modval, modval, (size == 1) ? 56 : 48); 1587 slw(modval, modval, shift_amount); 1588 xorr(modval, val32, modval); 1589 } 1590 1591 switch (instruction_type) { 1592 case 4: stwcx_(modval, addr_base); break; 1593 case 2: sthcx_(modval, addr_base); break; 1594 case 1: stbcx_(modval, addr_base); break; 1595 default: ShouldNotReachHere(); 1596 } 1597 1598 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1599 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0. 1600 } else { 1601 bne( CCR0, retry); // StXcx_ sets CCR0. 1602 } 1603 1604 // l?arx zero-extends, but Java wants byte/short values sign-extended. 1605 if (size == 1) { 1606 extsb(dest_current_value, dest_current_value); 1607 } else if (size == 2) { 1608 extsh(dest_current_value, dest_current_value); 1609 }; 1610 } 1611 1612 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions. 1613 // Only signed types are supported with size < 4. 1614 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value, 1615 Register compare_value, Register exchange_value, 1616 Register addr_base, Register tmp1, Register tmp2, 1617 Label &retry, Label &failed, bool cmpxchgx_hint, int size) { 1618 // Sub-word instructions are available since Power 8. 1619 // For older processors, instruction_type != size holds, and we 1620 // emulate the sub-word instructions by constructing a 4-byte value 1621 // that leaves the other bytes unchanged. 1622 const int instruction_type = VM_Version::has_lqarx() ? size : 4; 1623 1624 Register shift_amount = noreg, 1625 val32 = dest_current_value, 1626 modval = exchange_value; 1627 1628 if (instruction_type != size) { 1629 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base); 1630 shift_amount = tmp1; 1631 val32 = tmp2; 1632 modval = tmp2; 1633 // Need some preparation: Compute shift amount, align address. 
Note: shorts must be 2 byte aligned. 1634 #ifdef VM_LITTLE_ENDIAN 1635 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8; 1636 clrrdi(addr_base, addr_base, 2); 1637 #else 1638 xori(shift_amount, addr_base, (size == 1) ? 3 : 2); 1639 clrrdi(addr_base, addr_base, 2); 1640 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16; 1641 #endif 1642 // Transform exchange value such that the replacement can be done by one xor instruction. 1643 xorr(exchange_value, compare_value, exchange_value); 1644 clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48); 1645 slw(exchange_value, exchange_value, shift_amount); 1646 } 1647 1648 // atomic emulation loop 1649 bind(retry); 1650 1651 switch (instruction_type) { 1652 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break; 1653 case 2: lharx(val32, addr_base, cmpxchgx_hint); break; 1654 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break; 1655 default: ShouldNotReachHere(); 1656 } 1657 1658 if (instruction_type != size) { 1659 srw(dest_current_value, val32, shift_amount); 1660 } 1661 if (size == 1) { 1662 extsb(dest_current_value, dest_current_value); 1663 } else if (size == 2) { 1664 extsh(dest_current_value, dest_current_value); 1665 }; 1666 1667 cmpw(flag, dest_current_value, compare_value); 1668 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1669 bne_predict_not_taken(flag, failed); 1670 } else { 1671 bne( flag, failed); 1672 } 1673 // branch to done => (flag == ne), (dest_current_value != compare_value) 1674 // fall through => (flag == eq), (dest_current_value == compare_value) 1675 1676 if (instruction_type != size) { 1677 xorr(modval, val32, exchange_value); 1678 } 1679 1680 switch (instruction_type) { 1681 case 4: stwcx_(modval, addr_base); break; 1682 case 2: sthcx_(modval, addr_base); break; 1683 case 1: stbcx_(modval, addr_base); break; 1684 default: ShouldNotReachHere(); 1685 } 1686 } 1687 1688 // CmpxchgX sets condition register to cmpX(current, compare). 1689 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value, 1690 Register compare_value, Register exchange_value, 1691 Register addr_base, Register tmp1, Register tmp2, 1692 int semantics, bool cmpxchgx_hint, 1693 Register int_flag_success, bool contention_hint, bool weak, int size) { 1694 Label retry; 1695 Label failed; 1696 Label done; 1697 1698 // Save one branch if result is returned via register and 1699 // result register is different from the other ones. 1700 bool use_result_reg = (int_flag_success != noreg); 1701 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value && 1702 int_flag_success != exchange_value && int_flag_success != addr_base && 1703 int_flag_success != tmp1 && int_flag_success != tmp2); 1704 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1705 assert(size == 1 || size == 2 || size == 4, "unsupported"); 1706 1707 if (use_result_reg && preset_result_reg) { 1708 li(int_flag_success, 0); // preset (assume cas failed) 1709 } 1710 1711 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1712 if (contention_hint) { // Don't try to reserve if cmp fails. 
1713 switch (size) { 1714 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break; 1715 case 2: lha(dest_current_value, 0, addr_base); break; 1716 case 4: lwz(dest_current_value, 0, addr_base); break; 1717 default: ShouldNotReachHere(); 1718 } 1719 cmpw(flag, dest_current_value, compare_value); 1720 bne(flag, failed); 1721 } 1722 1723 // release/fence semantics 1724 if (semantics & MemBarRel) { 1725 release(); 1726 } 1727 1728 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2, 1729 retry, failed, cmpxchgx_hint, size); 1730 if (!weak || use_result_reg) { 1731 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1732 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1733 } else { 1734 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0. 1735 } 1736 } 1737 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped) 1738 1739 // Result in register (must do this at the end because int_flag_success can be the 1740 // same register as one above). 1741 if (use_result_reg) { 1742 li(int_flag_success, 1); 1743 } 1744 1745 if (semantics & MemBarFenceAfter) { 1746 fence(); 1747 } else if (semantics & MemBarAcq) { 1748 isync(); 1749 } 1750 1751 if (use_result_reg && !preset_result_reg) { 1752 b(done); 1753 } 1754 1755 bind(failed); 1756 if (use_result_reg && !preset_result_reg) { 1757 li(int_flag_success, 0); 1758 } 1759 1760 bind(done); 1761 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1762 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1763 } 1764 1765 // Performs atomic compare exchange: 1766 // if (compare_value == *addr_base) 1767 // *addr_base = exchange_value 1768 // int_flag_success = 1; 1769 // else 1770 // int_flag_success = 0; 1771 // 1772 // ConditionRegister flag = cmp(compare_value, *addr_base) 1773 // Register dest_current_value = *addr_base 1774 // Register compare_value Used to compare with value in memory 1775 // Register exchange_value Written to memory if compare_value == *addr_base 1776 // Register addr_base The memory location to compareXChange 1777 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base 1778 // 1779 // To avoid the costly compare exchange the value is tested beforehand. 1780 // Several special cases exist to avoid that unnecessary information is generated. 1781 // 1782 void MacroAssembler::cmpxchgd(ConditionRegister flag, 1783 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value, 1784 Register addr_base, int semantics, bool cmpxchgx_hint, 1785 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) { 1786 Label retry; 1787 Label failed_int; 1788 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int; 1789 Label done; 1790 1791 // Save one branch if result is returned via register and result register is different from the other ones. 
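  // (In that case int_flag_success is preset to 0 and only overwritten with 1 on success,
  // so the failure path needs neither its own store nor a branch around the success store.)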
1792 bool use_result_reg = (int_flag_success!=noreg); 1793 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() && 1794 int_flag_success!=exchange_value && int_flag_success!=addr_base); 1795 assert(!weak || flag == CCR0, "weak only supported with CCR0"); 1796 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both"); 1797 1798 if (use_result_reg && preset_result_reg) { 1799 li(int_flag_success, 0); // preset (assume cas failed) 1800 } 1801 1802 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM). 1803 if (contention_hint) { // Don't try to reserve if cmp fails. 1804 ld(dest_current_value, 0, addr_base); 1805 cmpd(flag, compare_value, dest_current_value); 1806 bne(flag, failed); 1807 } 1808 1809 // release/fence semantics 1810 if (semantics & MemBarRel) { 1811 release(); 1812 } 1813 1814 // atomic emulation loop 1815 bind(retry); 1816 1817 ldarx(dest_current_value, addr_base, cmpxchgx_hint); 1818 cmpd(flag, compare_value, dest_current_value); 1819 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1820 bne_predict_not_taken(flag, failed); 1821 } else { 1822 bne( flag, failed); 1823 } 1824 1825 stdcx_(exchange_value, addr_base); 1826 if (!weak || use_result_reg || failed_ext) { 1827 if (UseStaticBranchPredictionInCompareAndSwapPPC64) { 1828 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1829 } else { 1830 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0 1831 } 1832 } 1833 1834 // result in register (must do this at the end because int_flag_success can be the same register as one above) 1835 if (use_result_reg) { 1836 li(int_flag_success, 1); 1837 } 1838 1839 if (semantics & MemBarFenceAfter) { 1840 fence(); 1841 } else if (semantics & MemBarAcq) { 1842 isync(); 1843 } 1844 1845 if (use_result_reg && !preset_result_reg) { 1846 b(done); 1847 } 1848 1849 bind(failed_int); 1850 if (use_result_reg && !preset_result_reg) { 1851 li(int_flag_success, 0); 1852 } 1853 1854 bind(done); 1855 // (flag == ne) => (dest_current_value != compare_value), (!swapped) 1856 // (flag == eq) => (dest_current_value == compare_value), ( swapped) 1857 } 1858 1859 // Look up the method for a megamorphic invokeinterface call. 1860 // The target method is determined by <intf_klass, itable_index>. 1861 // The receiver klass is in recv_klass. 1862 // On success, the result will be in method_result, and execution falls through. 1863 // On failure, execution transfers to the given label. 1864 void MacroAssembler::lookup_interface_method(Register recv_klass, 1865 Register intf_klass, 1866 RegisterOrConstant itable_index, 1867 Register method_result, 1868 Register scan_temp, 1869 Register temp2, 1870 Label& L_no_such_interface, 1871 bool return_method) { 1872 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp); 1873 1874 // Compute start of first itableOffsetEntry (which is at the end of the vtable). 1875 int vtable_base = in_bytes(Klass::vtable_start_offset()); 1876 int itentry_off = in_bytes(itableMethodEntry::method_offset()); 1877 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize); 1878 int scan_step = itableOffsetEntry::size() * wordSize; 1879 int log_vte_size= exact_log2(vtableEntry::size_in_bytes()); 1880 1881 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass); 1882 // We should store the aligned, prescaled offset in the klass. 1883 // Then the next several instructions would fold away. 
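  // scan_temp := recv_klass + vtable_start_offset + vtable_length * vtableEntry::size_in_bytes(),
  // i.e. the address of the first itableOffsetEntry, which sits directly behind the vtable.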
1884 1885 sldi(scan_temp, scan_temp, log_vte_size); 1886 addi(scan_temp, scan_temp, vtable_base); 1887 add(scan_temp, recv_klass, scan_temp); 1888 1889 // Adjust recv_klass by scaled itable_index, so we can free itable_index. 1890 if (return_method) { 1891 if (itable_index.is_register()) { 1892 Register itable_offset = itable_index.as_register(); 1893 sldi(method_result, itable_offset, logMEsize); 1894 if (itentry_off) { addi(method_result, method_result, itentry_off); } 1895 add(method_result, method_result, recv_klass); 1896 } else { 1897 long itable_offset = (long)itable_index.as_constant(); 1898 // static address, no relocation 1899 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2); 1900 } 1901 } 1902 1903 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) { 1904 // if (scan->interface() == intf) { 1905 // result = (klass + scan->offset() + itable_index); 1906 // } 1907 // } 1908 Label search, found_method; 1909 1910 for (int peel = 1; peel >= 0; peel--) { 1911 // %%%% Could load both offset and interface in one ldx, if they were 1912 // in the opposite order. This would save a load. 1913 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp); 1914 1915 // Check that this entry is non-null. A null entry means that 1916 // the receiver class doesn't implement the interface, and wasn't the 1917 // same as when the caller was compiled. 1918 cmpd(CCR0, temp2, intf_klass); 1919 1920 if (peel) { 1921 beq(CCR0, found_method); 1922 } else { 1923 bne(CCR0, search); 1924 // (invert the test to fall through to found_method...) 1925 } 1926 1927 if (!peel) break; 1928 1929 bind(search); 1930 1931 cmpdi(CCR0, temp2, 0); 1932 beq(CCR0, L_no_such_interface); 1933 addi(scan_temp, scan_temp, scan_step); 1934 } 1935 1936 bind(found_method); 1937 1938 // Got a hit. 
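  // scan_temp points at the matching itableOffsetEntry. Its offset field locates the interface's
  // block of itableMethodEntries within the receiver klass; method_result already holds
  // recv_klass + scaled itable_index (+ method offset), so a single ldx yields the Method*.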
1939 if (return_method) { 1940 int ito_offset = in_bytes(itableOffsetEntry::offset_offset()); 1941 lwz(scan_temp, ito_offset, scan_temp); 1942 ldx(method_result, scan_temp, method_result); 1943 } 1944 } 1945 1946 // virtual method calling 1947 void MacroAssembler::lookup_virtual_method(Register recv_klass, 1948 RegisterOrConstant vtable_index, 1949 Register method_result) { 1950 1951 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg()); 1952 1953 const ByteSize base = Klass::vtable_start_offset(); 1954 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); 1955 1956 if (vtable_index.is_register()) { 1957 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord); 1958 add(recv_klass, vtable_index.as_register(), recv_klass); 1959 } else { 1960 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord); 1961 } 1962 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass); 1963 } 1964 1965 /////////////////////////////////////////// subtype checking //////////////////////////////////////////// 1966 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, 1967 Register super_klass, 1968 Register temp1_reg, 1969 Register temp2_reg, 1970 Label* L_success, 1971 Label* L_failure, 1972 Label* L_slow_path, 1973 RegisterOrConstant super_check_offset) { 1974 1975 const Register check_cache_offset = temp1_reg; 1976 const Register cached_super = temp2_reg; 1977 1978 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super); 1979 1980 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1981 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 1982 1983 bool must_load_sco = (super_check_offset.constant_or_zero() == -1); 1984 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset); 1985 1986 Label L_fallthrough; 1987 int label_nulls = 0; 1988 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; } 1989 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; } 1990 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; } 1991 assert(label_nulls <= 1 || 1992 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path), 1993 "at most one null in the batch, usually"); 1994 1995 // If the pointers are equal, we are done (e.g., String[] elements). 1996 // This self-check enables sharing of secondary supertype arrays among 1997 // non-primary types such as array-of-interface. Otherwise, each such 1998 // type would need its own customized SSA. 1999 // We move this check to the front of the fast path because many 2000 // type checks are in fact trivially successful in this manner, 2001 // so we get a nicely predicted branch right at the start of the check. 2002 cmpd(CCR0, sub_klass, super_klass); 2003 beq(CCR0, *L_success); 2004 2005 // Check the supertype display: 2006 if (must_load_sco) { 2007 // The super check offset is always positive... 2008 lwz(check_cache_offset, sco_offset, super_klass); 2009 super_check_offset = RegisterOrConstant(check_cache_offset); 2010 // super_check_offset is register. 2011 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register()); 2012 } 2013 // The loaded value is the offset from Klass. 2014 2015 ld(cached_super, super_check_offset, sub_klass); 2016 cmpd(CCR0, cached_super, super_klass); 2017 2018 // This check has worked decisively for primary supers. 
2019 // Secondary supers are sought in the super_cache ('super_cache_addr'). 2020 // (Secondary supers are interfaces and very deeply nested subtypes.) 2021 // This works in the same check above because of a tricky aliasing 2022 // between the super_cache and the primary super display elements. 2023 // (The 'super_check_addr' can address either, as the case requires.) 2024 // Note that the cache is updated below if it does not help us find 2025 // what we need immediately. 2026 // So if it was a primary super, we can just fail immediately. 2027 // Otherwise, it's the slow path for us (no success at this point). 2028 2029 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); } 2030 2031 if (super_check_offset.is_register()) { 2032 beq(CCR0, *L_success); 2033 cmpwi(CCR0, super_check_offset.as_register(), sc_offset); 2034 if (L_failure == &L_fallthrough) { 2035 beq(CCR0, *L_slow_path); 2036 } else { 2037 bne(CCR0, *L_failure); 2038 FINAL_JUMP(*L_slow_path); 2039 } 2040 } else { 2041 if (super_check_offset.as_constant() == sc_offset) { 2042 // Need a slow path; fast failure is impossible. 2043 if (L_slow_path == &L_fallthrough) { 2044 beq(CCR0, *L_success); 2045 } else { 2046 bne(CCR0, *L_slow_path); 2047 FINAL_JUMP(*L_success); 2048 } 2049 } else { 2050 // No slow path; it's a fast decision. 2051 if (L_failure == &L_fallthrough) { 2052 beq(CCR0, *L_success); 2053 } else { 2054 bne(CCR0, *L_failure); 2055 FINAL_JUMP(*L_success); 2056 } 2057 } 2058 } 2059 2060 bind(L_fallthrough); 2061 #undef FINAL_JUMP 2062 } 2063 2064 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, 2065 Register super_klass, 2066 Register temp1_reg, 2067 Register temp2_reg, 2068 Label* L_success, 2069 Register result_reg) { 2070 const Register array_ptr = temp1_reg; // current value from cache array 2071 const Register temp = temp2_reg; 2072 2073 assert_different_registers(sub_klass, super_klass, array_ptr, temp); 2074 2075 int source_offset = in_bytes(Klass::secondary_supers_offset()); 2076 int target_offset = in_bytes(Klass::secondary_super_cache_offset()); 2077 2078 int length_offset = Array<Klass*>::length_offset_in_bytes(); 2079 int base_offset = Array<Klass*>::base_offset_in_bytes(); 2080 2081 Label hit, loop, failure, fallthru; 2082 2083 ld(array_ptr, source_offset, sub_klass); 2084 2085 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated."); 2086 lwz(temp, length_offset, array_ptr); 2087 cmpwi(CCR0, temp, 0); 2088 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0 2089 2090 mtctr(temp); // load ctr 2091 2092 bind(loop); 2093 // Oops in table are NO MORE compressed. 
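  // Linear scan: compare each Klass* in the secondary supers array with super_klass.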
2094 ld(temp, base_offset, array_ptr); 2095 cmpd(CCR0, temp, super_klass); 2096 beq(CCR0, hit); 2097 addi(array_ptr, array_ptr, BytesPerWord); 2098 bdnz(loop); 2099 2100 bind(failure); 2101 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss) 2102 b(fallthru); 2103 2104 bind(hit); 2105 std(super_klass, target_offset, sub_klass); // save result to cache 2106 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit) 2107 if (L_success != nullptr) { b(*L_success); } 2108 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided 2109 2110 bind(fallthru); 2111 } 2112 2113 // Try fast path, then go to slow one if not successful 2114 void MacroAssembler::check_klass_subtype(Register sub_klass, 2115 Register super_klass, 2116 Register temp1_reg, 2117 Register temp2_reg, 2118 Label& L_success) { 2119 Label L_failure; 2120 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure); 2121 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success); 2122 bind(L_failure); // Fallthru if not successful. 2123 } 2124 2125 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) { 2126 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required"); 2127 2128 Label L_fallthrough; 2129 if (L_fast_path == nullptr) { 2130 L_fast_path = &L_fallthrough; 2131 } else if (L_slow_path == nullptr) { 2132 L_slow_path = &L_fallthrough; 2133 } 2134 2135 // Fast path check: class is fully initialized 2136 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass); 2137 cmpwi(CCR0, R0, InstanceKlass::fully_initialized); 2138 beq(CCR0, *L_fast_path); 2139 2140 // Fast path check: current thread is initializer thread 2141 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass); 2142 cmpd(CCR0, thread, R0); 2143 if (L_slow_path == &L_fallthrough) { 2144 beq(CCR0, *L_fast_path); 2145 } else if (L_fast_path == &L_fallthrough) { 2146 bne(CCR0, *L_slow_path); 2147 } else { 2148 Unimplemented(); 2149 } 2150 2151 bind(L_fallthrough); 2152 } 2153 2154 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot, 2155 Register temp_reg, 2156 int extra_slot_offset) { 2157 // cf. TemplateTable::prepare_invoke(), if (load_receiver). 
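  // Computes the byte offset (arg_slot + extra_slot_offset) * Interpreter::stackElementSize,
  // returned either as a constant or in temp_reg.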
2158 int stackElementSize = Interpreter::stackElementSize; 2159 int offset = extra_slot_offset * stackElementSize; 2160 if (arg_slot.is_constant()) { 2161 offset += arg_slot.as_constant() * stackElementSize; 2162 return offset; 2163 } else { 2164 assert(temp_reg != noreg, "must specify"); 2165 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize)); 2166 if (offset != 0) 2167 addi(temp_reg, temp_reg, offset); 2168 return temp_reg; 2169 } 2170 } 2171 2172 void MacroAssembler::tlab_allocate( 2173 Register obj, // result: pointer to object after successful allocation 2174 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise 2175 int con_size_in_bytes, // object size in bytes if known at compile time 2176 Register t1, // temp register 2177 Label& slow_case // continuation point if fast allocation fails 2178 ) { 2179 // make sure arguments make sense 2180 assert_different_registers(obj, var_size_in_bytes, t1); 2181 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size"); 2182 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment"); 2183 2184 const Register new_top = t1; 2185 //verify_tlab(); not implemented 2186 2187 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2188 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread); 2189 if (var_size_in_bytes == noreg) { 2190 addi(new_top, obj, con_size_in_bytes); 2191 } else { 2192 add(new_top, obj, var_size_in_bytes); 2193 } 2194 cmpld(CCR0, new_top, R0); 2195 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case); 2196 2197 #ifdef ASSERT 2198 // make sure new free pointer is properly aligned 2199 { 2200 Label L; 2201 andi_(R0, new_top, MinObjAlignmentInBytesMask); 2202 beq(CCR0, L); 2203 stop("updated TLAB free is not properly aligned"); 2204 bind(L); 2205 } 2206 #endif // ASSERT 2207 2208 // update the tlab top pointer 2209 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread); 2210 //verify_tlab(); not implemented 2211 } 2212 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) { 2213 unimplemented("incr_allocated_bytes"); 2214 } 2215 2216 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset, 2217 int insts_call_instruction_offset, Register Rtoc) { 2218 // Start the stub. 2219 address stub = start_a_stub(64); 2220 if (stub == nullptr) { return nullptr; } // CodeCache full: bail out 2221 2222 // Create a trampoline stub relocation which relates this trampoline stub 2223 // with the call instruction at insts_call_instruction_offset in the 2224 // instructions code-section. 2225 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset)); 2226 const int stub_start_offset = offset(); 2227 2228 // For java_to_interp stubs we use R11_scratch1 as scratch register 2229 // and in call trampoline stubs we use R12_scratch2. This way we 2230 // can distinguish them (see is_NativeCallTrampolineStub_at()). 
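  // The trampoline exists to keep calls within branch reach: a direct bl only spans +/-32 MB,
  // so a call whose target may lie out of range branches here instead, and the stub forwards
  // it via CTR to the 64-bit destination kept in the constant pool.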
2231 Register reg_scratch = R12_scratch2; 2232 2233 // Now, create the trampoline stub's code: 2234 // - load the TOC 2235 // - load the call target from the constant pool 2236 // - call 2237 if (Rtoc == noreg) { 2238 calculate_address_from_global_toc(reg_scratch, method_toc()); 2239 Rtoc = reg_scratch; 2240 } 2241 2242 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false); 2243 mtctr(reg_scratch); 2244 bctr(); 2245 2246 const address stub_start_addr = addr_at(stub_start_offset); 2247 2248 // Assert that the encoded destination_toc_offset can be identified and that it is correct. 2249 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(), 2250 "encoded offset into the constant pool must match"); 2251 // Trampoline_stub_size should be good. 2252 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size"); 2253 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline"); 2254 2255 // End the stub. 2256 end_a_stub(); 2257 return stub; 2258 } 2259 2260 // "The box" is the space on the stack where we copy the object mark. 2261 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box, 2262 Register temp, Register displaced_header, Register current_header) { 2263 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight"); 2264 assert_different_registers(oop, box, temp, displaced_header, current_header); 2265 Label object_has_monitor; 2266 Label cas_failed; 2267 Label success, failure; 2268 2269 // Load markWord from object into displaced_header. 2270 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop); 2271 2272 if (DiagnoseSyncOnValueBasedClasses != 0) { 2273 load_klass(temp, oop); 2274 lwz(temp, in_bytes(Klass::access_flags_offset()), temp); 2275 testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2276 bne(flag, failure); 2277 } 2278 2279 // Handle existing monitor. 2280 // The object has an existing monitor iff (mark & monitor_value) != 0. 2281 andi_(temp, displaced_header, markWord::monitor_value); 2282 bne(CCR0, object_has_monitor); 2283 2284 if (LockingMode == LM_MONITOR) { 2285 // Set NE to indicate 'failure' -> take slow-path. 2286 crandc(flag, Assembler::equal, flag, Assembler::equal); 2287 b(failure); 2288 } else { 2289 assert(LockingMode == LM_LEGACY, "must be"); 2290 // Set displaced_header to be (markWord of object | UNLOCK_VALUE). 2291 ori(displaced_header, displaced_header, markWord::unlocked_value); 2292 2293 // Load Compare Value application register. 2294 2295 // Initialize the box. (Must happen before we update the object mark!) 2296 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2297 2298 // Must fence, otherwise, preceding store(s) may float below cmpxchg. 2299 // Compare object markWord with mark and if equal exchange scratch1 with object markWord. 2300 cmpxchgd(/*flag=*/flag, 2301 /*current_value=*/current_header, 2302 /*compare_value=*/displaced_header, 2303 /*exchange_value=*/box, 2304 /*where=*/oop, 2305 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2306 MacroAssembler::cmpxchgx_hint_acquire_lock(), 2307 noreg, 2308 &cas_failed, 2309 /*check without membar and ldarx first*/true); 2310 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2311 // If the compare-and-exchange succeeded, then we found an unlocked 2312 // object and we have now locked it. 
2313 b(success); 2314 2315 bind(cas_failed); 2316 // We did not see an unlocked object so try the fast recursive case. 2317 2318 // Check if the owner is self by comparing the value in the markWord of object 2319 // (current_header) with the stack pointer. 2320 sub(current_header, current_header, R1_SP); 2321 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place); 2322 2323 and_(R0/*==0?*/, current_header, temp); 2324 // If the result is zero, the markWord is a stack address within one page of our SP, i.e. the lock is 2325 // already held by the current thread; store 0 as the displaced header in the box, which indicates a recursive lock. 2326 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box); 2327 2328 if (flag != CCR0) { 2329 mcrf(flag, CCR0); 2330 } 2331 beq(CCR0, success); 2332 b(failure); 2333 } 2334 2335 // Handle existing monitor. 2336 bind(object_has_monitor); 2337 // The object's monitor m is unlocked iff m->owner is null, 2338 // otherwise m->owner may contain a thread or a stack address. 2339 2340 // Try to CAS m->owner from null to current thread. 2341 addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value); 2342 cmpxchgd(/*flag=*/flag, 2343 /*current_value=*/current_header, 2344 /*compare_value=*/(intptr_t)0, 2345 /*exchange_value=*/R16_thread, 2346 /*where=*/temp, 2347 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2348 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2349 2350 // Store a non-null value into the box. 2351 std(box, BasicLock::displaced_header_offset_in_bytes(), box); 2352 beq(flag, success); 2353 2354 // Check for recursive locking. 2355 cmpd(flag, current_header, R16_thread); 2356 bne(flag, failure); 2357 2358 // Current thread already owns the lock. Just increment recursions. 2359 Register recursions = displaced_header; 2360 ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp); 2361 addi(recursions, recursions, 1); 2362 std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp); 2363 2364 // flag == EQ indicates success, increment held monitor count 2365 // flag == NE indicates failure 2366 bind(success); 2367 inc_held_monitor_count(temp); 2368 bind(failure); 2369 } 2370 2371 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box, 2372 Register temp, Register displaced_header, Register current_header) { 2373 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight"); 2374 assert_different_registers(oop, box, temp, displaced_header, current_header); 2375 Label success, failure, object_has_monitor, notRecursive; 2376 2377 if (LockingMode == LM_LEGACY) { 2378 // Find the lock address and load the displaced header from the stack. 2379 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box); 2380 2381 // If the displaced header is 0, we have a recursive unlock. 2382 cmpdi(flag, displaced_header, 0); 2383 beq(flag, success); 2384 } 2385 2386 // Handle existing monitor. 2387 // The object has an existing monitor iff (mark & monitor_value) != 0. 2388 ld(current_header, oopDesc::mark_offset_in_bytes(), oop); 2389 andi_(R0, current_header, markWord::monitor_value); 2390 bne(CCR0, object_has_monitor); 2391 2392 if (LockingMode == LM_MONITOR) { 2393 // Set NE to indicate 'failure' -> take slow-path.
2394 crandc(flag, Assembler::equal, flag, Assembler::equal); 2395 b(failure); 2396 } else { 2397 assert(LockingMode == LM_LEGACY, "must be"); 2398 // Check if it is still a lightweight lock; this is true if we see 2399 // the stack address of the basicLock in the markWord of the object. 2400 // Cmpxchg sets flag to cmpd(current_header, box). 2401 cmpxchgd(/*flag=*/flag, 2402 /*current_value=*/current_header, 2403 /*compare_value=*/box, 2404 /*exchange_value=*/displaced_header, 2405 /*where=*/oop, 2406 MacroAssembler::MemBarRel, 2407 MacroAssembler::cmpxchgx_hint_release_lock(), 2408 noreg, 2409 &failure); 2410 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0"); 2411 b(success); 2412 } 2413 2414 // Handle existing monitor. 2415 bind(object_has_monitor); 2416 STATIC_ASSERT(markWord::monitor_value <= INT_MAX); 2417 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor 2418 ld(temp, in_bytes(ObjectMonitor::owner_offset()), current_header); 2419 2420 // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0. 2421 // This is handled like owner thread mismatches: We take the slow path. 2422 cmpd(flag, temp, R16_thread); 2423 bne(flag, failure); 2424 2425 ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header); 2426 2427 addic_(displaced_header, displaced_header, -1); 2428 blt(CCR0, notRecursive); // Not recursive if negative after decrement. 2429 std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header); 2430 if (flag == CCR0) { // Otherwise, flag is already EQ, here. 2431 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ 2432 } 2433 b(success); 2434 2435 bind(notRecursive); 2436 ld(temp, in_bytes(ObjectMonitor::EntryList_offset()), current_header); 2437 ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header); 2438 orr(temp, temp, displaced_header); // Will be 0 if both are 0. 2439 cmpdi(flag, temp, 0); 2440 bne(flag, failure); 2441 release(); 2442 std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header); 2443 2444 // flag == EQ indicates success, decrement held monitor count 2445 // flag == NE indicates failure 2446 bind(success); 2447 dec_held_monitor_count(temp); 2448 bind(failure); 2449 } 2450 2451 void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1, 2452 Register tmp2, Register tmp3) { 2453 assert_different_registers(obj, tmp1, tmp2, tmp3); 2454 assert(flag == CCR0, "bad condition register"); 2455 2456 // Handle inflated monitor. 2457 Label inflated; 2458 // Finish fast lock successfully. MUST reach to with flag == EQ 2459 Label locked; 2460 // Finish fast lock unsuccessfully. MUST branch to with flag == NE 2461 Label slow_path; 2462 2463 if (DiagnoseSyncOnValueBasedClasses != 0) { 2464 load_klass(tmp1, obj); 2465 lwz(tmp1, in_bytes(Klass::access_flags_offset()), tmp1); 2466 testbitdi(flag, R0, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS)); 2467 bne(flag, slow_path); 2468 } 2469 2470 const Register mark = tmp1; 2471 const Register t = tmp3; // Usage of R0 allowed! 2472 2473 { // Lightweight locking 2474 2475 // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ 2476 Label push; 2477 2478 const Register top = tmp2; 2479 2480 // Check if lock-stack is full.
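  // top is the byte offset (relative to R16_thread) of the next free lock-stack slot;
  // once it reaches LockStack::end_offset() the lock stack is full.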
2481 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2482 cmplwi(flag, top, LockStack::end_offset() - 1); 2483 bgt(flag, slow_path); 2484 2485 // The underflow check is elided. The recursive check will always fail 2486 // when the lock stack is empty because of the _bad_oop_sentinel field. 2487 2488 // Check if recursive. 2489 subi(t, top, oopSize); 2490 ldx(t, R16_thread, t); 2491 cmpd(flag, obj, t); 2492 beq(flag, push); 2493 2494 // Check for monitor (0b10) or locked (0b00). 2495 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2496 andi_(t, mark, markWord::lock_mask_in_place); 2497 cmpldi(flag, t, markWord::unlocked_value); 2498 bgt(flag, inflated); 2499 bne(flag, slow_path); 2500 2501 // Not inflated. 2502 2503 // Try to lock. Transition lock bits 0b00 => 0b01 2504 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea"); 2505 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq); 2506 2507 bind(push); 2508 // After successful lock, push object on lock-stack. 2509 stdx(obj, R16_thread, top); 2510 addi(top, top, oopSize); 2511 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2512 b(locked); 2513 } 2514 2515 { // Handle inflated monitor. 2516 bind(inflated); 2517 2518 // mark contains the tagged ObjectMonitor*. 2519 const Register tagged_monitor = mark; 2520 const uintptr_t monitor_tag = markWord::monitor_value; 2521 const Register owner_addr = tmp2; 2522 2523 // Compute owner address. 2524 addi(owner_addr, tagged_monitor, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag); 2525 2526 // CAS owner (null => current thread). 2527 cmpxchgd(/*flag=*/flag, 2528 /*current_value=*/t, 2529 /*compare_value=*/(intptr_t)0, 2530 /*exchange_value=*/R16_thread, 2531 /*where=*/owner_addr, 2532 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq, 2533 MacroAssembler::cmpxchgx_hint_acquire_lock()); 2534 beq(flag, locked); 2535 2536 // Check if recursive. 2537 cmpd(flag, t, R16_thread); 2538 bne(flag, slow_path); 2539 2540 // Recursive. 2541 ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2542 addi(tmp1, tmp1, 1); 2543 std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr); 2544 } 2545 2546 bind(locked); 2547 inc_held_monitor_count(tmp1); 2548 2549 #ifdef ASSERT 2550 // Check that locked label is reached with flag == EQ. 2551 Label flag_correct; 2552 beq(flag, flag_correct); 2553 stop("Fast Lock Flag != EQ"); 2554 #endif 2555 bind(slow_path); 2556 #ifdef ASSERT 2557 // Check that slow_path label is reached with flag == NE. 2558 bne(flag, flag_correct); 2559 stop("Fast Lock Flag != NE"); 2560 bind(flag_correct); 2561 #endif 2562 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 2563 } 2564 2565 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1, 2566 Register tmp2, Register tmp3) { 2567 assert_different_registers(obj, tmp1, tmp2, tmp3); 2568 assert(flag == CCR0, "bad condition register"); 2569 2570 // Handle inflated monitor. 2571 Label inflated, inflated_load_monitor; 2572 // Finish fast unlock successfully. MUST reach to with flag == EQ. 2573 Label unlocked; 2574 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE. 
2575 Label slow_path; 2576 2577 const Register mark = tmp1; 2578 const Register top = tmp2; 2579 const Register t = tmp3; 2580 2581 { // Lightweight unlock 2582 Label push_and_slow; 2583 2584 // Check if obj is top of lock-stack. 2585 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2586 subi(top, top, oopSize); 2587 ldx(t, R16_thread, top); 2588 cmpd(flag, obj, t); 2589 // Top of lock stack was not obj. Must be monitor. 2590 bne(flag, inflated_load_monitor); 2591 2592 // Pop lock-stack. 2593 DEBUG_ONLY(li(t, 0);) 2594 DEBUG_ONLY(stdx(t, R16_thread, top);) 2595 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2596 2597 // The underflow check is elided. The recursive check will always fail 2598 // when the lock stack is empty because of the _bad_oop_sentinel field. 2599 2600 // Check if recursive. 2601 subi(t, top, oopSize); 2602 ldx(t, R16_thread, t); 2603 cmpd(flag, obj, t); 2604 beq(flag, unlocked); 2605 2606 // Not recursive. 2607 2608 // Check for monitor (0b10). 2609 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2610 andi_(t, mark, markWord::monitor_value); 2611 bne(CCR0, inflated); 2612 2613 #ifdef ASSERT 2614 // Check header not unlocked (0b01). 2615 Label not_unlocked; 2616 andi_(t, mark, markWord::unlocked_value); 2617 beq(CCR0, not_unlocked); 2618 stop("lightweight_unlock already unlocked"); 2619 bind(not_unlocked); 2620 #endif 2621 2622 // Try to unlock. Transition lock bits 0b00 => 0b01 2623 atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel); 2624 b(unlocked); 2625 2626 bind(push_and_slow); 2627 // Restore lock-stack and handle the unlock in runtime. 2628 DEBUG_ONLY(stdx(obj, R16_thread, top);) 2629 addi(top, top, oopSize); 2630 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 2631 b(slow_path); 2632 } 2633 2634 { // Handle inflated monitor. 2635 bind(inflated_load_monitor); 2636 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 2637 #ifdef ASSERT 2638 andi_(t, mark, markWord::monitor_value); 2639 bne(CCR0, inflated); 2640 stop("Fast Unlock not monitor"); 2641 #endif 2642 2643 bind(inflated); 2644 2645 #ifdef ASSERT 2646 Label check_done; 2647 subi(top, top, oopSize); 2648 cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset())); 2649 blt(CCR0, check_done); 2650 ldx(t, R16_thread, top); 2651 cmpd(flag, obj, t); 2652 bne(flag, inflated); 2653 stop("Fast Unlock lock on stack"); 2654 bind(check_done); 2655 #endif 2656 2657 // mark contains the tagged ObjectMonitor*. 2658 const Register monitor = mark; 2659 const uintptr_t monitor_tag = markWord::monitor_value; 2660 2661 // Untag the monitor. 2662 subi(monitor, mark, monitor_tag); 2663 2664 const Register recursions = tmp2; 2665 Label not_recursive; 2666 2667 // Check if recursive. 2668 ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2669 addic_(recursions, recursions, -1); 2670 blt(CCR0, not_recursive); 2671 2672 // Recursive unlock. 2673 std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor); 2674 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); 2675 b(unlocked); 2676 2677 bind(not_recursive); 2678 2679 Label release_; 2680 const Register t2 = tmp2; 2681 2682 // Check if the entry lists are empty. 
2683 ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor); 2684 ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor); 2685 orr(t, t, t2); 2686 cmpdi(flag, t, 0); 2687 beq(flag, release_); 2688 2689 // The owner may be anonymous and we removed the last obj entry in 2690 // the lock-stack. This loses the information about the owner. 2691 // Write the thread to the owner field so the runtime knows the owner. 2692 std(R16_thread, in_bytes(ObjectMonitor::owner_offset()), monitor); 2693 b(slow_path); 2694 2695 bind(release_); 2696 // Set owner to null. 2697 release(); 2698 // t contains 0 2699 std(t, in_bytes(ObjectMonitor::owner_offset()), monitor); 2700 } 2701 2702 bind(unlocked); 2703 dec_held_monitor_count(t); 2704 2705 #ifdef ASSERT 2706 // Check that unlocked label is reached with flag == EQ. 2707 Label flag_correct; 2708 beq(flag, flag_correct); 2709 stop("Fast Lock Flag != EQ"); 2710 #endif 2711 bind(slow_path); 2712 #ifdef ASSERT 2713 // Check that slow_path label is reached with flag == NE. 2714 bne(flag, flag_correct); 2715 stop("Fast Lock Flag != NE"); 2716 bind(flag_correct); 2717 #endif 2718 // C2 uses the value of flag (NE vs EQ) to determine the continuation. 2719 } 2720 2721 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) { 2722 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread); 2723 2724 if (at_return) { 2725 if (in_nmethod) { 2726 if (UseSIGTRAP) { 2727 // Use Signal Handler. 2728 relocate(relocInfo::poll_return_type); 2729 td(traptoGreaterThanUnsigned, R1_SP, temp); 2730 } else { 2731 cmpld(CCR0, R1_SP, temp); 2732 // Stub may be out of range for short conditional branch. 2733 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path); 2734 } 2735 } else { // Not in nmethod. 2736 // Frame still on stack, need to get fp. 2737 Register fp = R0; 2738 ld(fp, _abi0(callers_sp), R1_SP); 2739 cmpld(CCR0, fp, temp); 2740 bgt(CCR0, slow_path); 2741 } 2742 } else { // Normal safepoint poll. Not at return. 2743 assert(!in_nmethod, "should use load_from_polling_page"); 2744 andi_(temp, temp, SafepointMechanism::poll_bit()); 2745 bne(CCR0, slow_path); 2746 } 2747 } 2748 2749 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2, 2750 MacroAssembler::PreservationLevel preservation_level) { 2751 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2752 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level); 2753 } 2754 2755 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2, 2756 MacroAssembler::PreservationLevel preservation_level) { 2757 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2758 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level); 2759 } 2760 2761 // Values for last_Java_pc, and last_Java_sp must comply to the rules 2762 // in frame_ppc.hpp. 2763 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) { 2764 // Always set last_Java_pc and flags first because once last_Java_sp 2765 // is visible has_last_Java_frame is true and users will look at the 2766 // rest of the fields. (Note: flags should always be zero before we 2767 // get here so doesn't need to be set.) 
2768 2769 // Verify that last_Java_pc was zeroed on return to Java 2770 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread, 2771 "last_Java_pc not zeroed before leaving Java"); 2772 2773 // When returning from calling out from Java mode the frame anchor's 2774 // last_Java_pc will always be set to null. It is set here so that 2775 // if we are doing a call to native (not VM) that we capture the 2776 // known pc and don't have to rely on the native call having a 2777 // standard frame linkage where we can find the pc. 2778 if (last_Java_pc != noreg) 2779 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2780 2781 // Set last_Java_sp last. 2782 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2783 } 2784 2785 void MacroAssembler::reset_last_Java_frame(void) { 2786 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()), 2787 R16_thread, "SP was not set, still zero"); 2788 2789 BLOCK_COMMENT("reset_last_Java_frame {"); 2790 li(R0, 0); 2791 2792 // _last_Java_sp = 0 2793 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread); 2794 2795 // _last_Java_pc = 0 2796 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread); 2797 BLOCK_COMMENT("} reset_last_Java_frame"); 2798 } 2799 2800 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) { 2801 assert_different_registers(sp, tmp1); 2802 2803 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via 2804 // TOP_IJAVA_FRAME_ABI. 2805 // FIXME: assert that we really have a TOP_IJAVA_FRAME here! 2806 address entry = pc(); 2807 load_const_optimized(tmp1, entry); 2808 2809 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1); 2810 } 2811 2812 void MacroAssembler::get_vm_result(Register oop_result) { 2813 // Read: 2814 // R16_thread 2815 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2816 // 2817 // Updated: 2818 // oop_result 2819 // R16_thread->in_bytes(JavaThread::vm_result_offset()) 2820 2821 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2822 li(R0, 0); 2823 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread); 2824 2825 verify_oop(oop_result, FILE_AND_LINE); 2826 } 2827 2828 void MacroAssembler::get_vm_result_2(Register metadata_result) { 2829 // Read: 2830 // R16_thread 2831 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2832 // 2833 // Updated: 2834 // metadata_result 2835 // R16_thread->in_bytes(JavaThread::vm_result_2_offset()) 2836 2837 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2838 li(R0, 0); 2839 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread); 2840 } 2841 2842 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) { 2843 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided. 2844 if (CompressedKlassPointers::base() != 0) { 2845 // Use dst as temp if it is free. 
2846 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0); 2847 current = dst; 2848 } 2849 if (CompressedKlassPointers::shift() != 0) { 2850 srdi(dst, current, CompressedKlassPointers::shift()); 2851 current = dst; 2852 } 2853 return current; 2854 } 2855 2856 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) { 2857 if (UseCompressedClassPointers) { 2858 Register compressedKlass = encode_klass_not_null(ck, klass); 2859 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop); 2860 } else { 2861 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop); 2862 } 2863 } 2864 2865 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) { 2866 if (UseCompressedClassPointers) { 2867 if (val == noreg) { 2868 val = R0; 2869 li(val, 0); 2870 } 2871 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed 2872 } 2873 } 2874 2875 int MacroAssembler::instr_size_for_decode_klass_not_null() { 2876 static int computed_size = -1; 2877 2878 // Not yet computed? 2879 if (computed_size == -1) { 2880 2881 if (!UseCompressedClassPointers) { 2882 computed_size = 0; 2883 } else { 2884 // Determine by scratch emit. 2885 ResourceMark rm; 2886 int code_size = 8 * BytesPerInstWord; 2887 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0); 2888 MacroAssembler* a = new MacroAssembler(&cb); 2889 a->decode_klass_not_null(R11_scratch1); 2890 computed_size = a->offset(); 2891 } 2892 } 2893 2894 return computed_size; 2895 } 2896 2897 void MacroAssembler::decode_klass_not_null(Register dst, Register src) { 2898 assert(dst != R0, "Dst reg may not be R0, as R0 is used here."); 2899 if (src == noreg) src = dst; 2900 Register shifted_src = src; 2901 if (CompressedKlassPointers::shift() != 0 || 2902 (CompressedKlassPointers::base() == 0 && src != dst)) { // Move required. 2903 shifted_src = dst; 2904 sldi(shifted_src, src, CompressedKlassPointers::shift()); 2905 } 2906 if (CompressedKlassPointers::base() != 0) { 2907 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0); 2908 } 2909 } 2910 2911 void MacroAssembler::load_klass(Register dst, Register src) { 2912 if (UseCompressedClassPointers) { 2913 lwz(dst, oopDesc::klass_offset_in_bytes(), src); 2914 // Attention: no null check here! 2915 decode_klass_not_null(dst, dst); 2916 } else { 2917 ld(dst, oopDesc::klass_offset_in_bytes(), src); 2918 } 2919 } 2920 2921 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) { 2922 null_check(src, oopDesc::klass_offset_in_bytes(), is_null); 2923 load_klass(dst, src); 2924 } 2925 2926 // ((OopHandle)result).resolve(); 2927 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2, 2928 MacroAssembler::PreservationLevel preservation_level) { 2929 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level); 2930 } 2931 2932 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2, 2933 MacroAssembler::PreservationLevel preservation_level) { 2934 Label resolved; 2935 2936 // A null weak handle resolves to null. 
2937 cmpdi(CCR0, result, 0); 2938 beq(CCR0, resolved); 2939 2940 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2, 2941 preservation_level); 2942 bind(resolved); 2943 } 2944 2945 void MacroAssembler::load_method_holder(Register holder, Register method) { 2946 ld(holder, in_bytes(Method::const_offset()), method); 2947 ld(holder, in_bytes(ConstMethod::constants_offset()), holder); 2948 ld(holder, ConstantPool::pool_holder_offset(), holder); 2949 } 2950 2951 // Clear Array 2952 // For very short arrays. tmp == R0 is allowed. 2953 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) { 2954 if (cnt_dwords > 0) { li(tmp, 0); } 2955 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); } 2956 } 2957 2958 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed. 2959 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) { 2960 if (cnt_dwords < 8) { 2961 clear_memory_unrolled(base_ptr, cnt_dwords, tmp); 2962 return; 2963 } 2964 2965 Label loop; 2966 const long loopcnt = cnt_dwords >> 1, 2967 remainder = cnt_dwords & 1; 2968 2969 li(tmp, loopcnt); 2970 mtctr(tmp); 2971 li(tmp, 0); 2972 bind(loop); 2973 std(tmp, 0, base_ptr); 2974 std(tmp, 8, base_ptr); 2975 addi(base_ptr, base_ptr, 16); 2976 bdnz(loop); 2977 if (remainder) { std(tmp, 0, base_ptr); } 2978 } 2979 2980 // Kills both input registers. tmp == R0 is allowed. 2981 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) { 2982 // Procedure for large arrays (uses data cache block zero instruction). 2983 Label startloop, fast, fastloop, small_rest, restloop, done; 2984 const int cl_size = VM_Version::L1_data_cache_line_size(), 2985 cl_dwords = cl_size >> 3, 2986 cl_dw_addr_bits = exact_log2(cl_dwords), 2987 dcbz_min = 1, // Min count of dcbz executions, needs to be >0. 2988 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1; 2989 2990 if (const_cnt >= 0) { 2991 // Constant case. 2992 if (const_cnt < min_cnt) { 2993 clear_memory_constlen(base_ptr, const_cnt, tmp); 2994 return; 2995 } 2996 load_const_optimized(cnt_dwords, const_cnt, tmp); 2997 } else { 2998 // cnt_dwords already loaded in register. Need to check size. 2999 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included). 3000 blt(CCR1, small_rest); 3001 } 3002 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line. 3003 beq(CCR0, fast); // Already 128byte aligned. 3004 3005 subfic(tmp, tmp, cl_dwords); 3006 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords). 3007 subf(cnt_dwords, tmp, cnt_dwords); // rest. 3008 li(tmp, 0); 3009 3010 bind(startloop); // Clear at the beginning to reach 128byte boundary. 3011 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3012 addi(base_ptr, base_ptr, 8); 3013 bdnz(startloop); 3014 3015 bind(fast); // Clear 128byte blocks. 3016 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0). 3017 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords. 3018 mtctr(tmp); // Load counter. 3019 3020 bind(fastloop); 3021 dcbz(base_ptr); // Clear 128byte aligned block. 3022 addi(base_ptr, base_ptr, cl_size); 3023 bdnz(fastloop); 3024 3025 bind(small_rest); 3026 cmpdi(CCR0, cnt_dwords, 0); // size 0? 3027 beq(CCR0, done); // rest == 0 3028 li(tmp, 0); 3029 mtctr(cnt_dwords); // Load counter. 
3030 3031 bind(restloop); // Clear rest. 3032 std(tmp, 0, base_ptr); // Clear 8byte aligned block. 3033 addi(base_ptr, base_ptr, 8); 3034 bdnz(restloop); 3035 3036 bind(done); 3037 } 3038 3039 /////////////////////////////////////////// String intrinsics //////////////////////////////////////////// 3040 3041 // Helpers for Intrinsic Emitters 3042 // 3043 // Revert the byte order of a 32bit value in a register 3044 // src: 0x44556677 3045 // dst: 0x77665544 3046 // Three steps to obtain the result: 3047 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word 3048 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared. 3049 // This value initializes dst. 3050 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost 3051 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go. 3052 // This value is mask inserted into dst with a [0..23] mask of 1s. 3053 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position. 3054 // This value is mask inserted into dst with a [8..15] mask of 1s. 3055 void MacroAssembler::load_reverse_32(Register dst, Register src) { 3056 assert_different_registers(dst, src); 3057 3058 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left. 3059 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone. 3060 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone. 3061 } 3062 3063 // Calculate the column addresses of the crc32 lookup table into distinct registers. 3064 // This loop-invariant calculation is moved out of the loop body, reducing the loop 3065 // body size from 20 to 16 instructions. 3066 // Returns the offset that was used to calculate the address of column tc3. 3067 // Due to register shortage, setting tc3 may overwrite table. With the return offset 3068 // at hand, the original table address can be easily reconstructed. 3069 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) { 3070 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!"); 3071 3072 // Point to 4 byte folding tables (byte-reversed version for Big Endian) 3073 // Layout: See StubRoutines::ppc::generate_crc_constants. 3074 #ifdef VM_LITTLE_ENDIAN 3075 const int ix0 = 3 * CRC32_TABLE_SIZE; 3076 const int ix1 = 2 * CRC32_TABLE_SIZE; 3077 const int ix2 = 1 * CRC32_TABLE_SIZE; 3078 const int ix3 = 0 * CRC32_TABLE_SIZE; 3079 #else 3080 const int ix0 = 1 * CRC32_TABLE_SIZE; 3081 const int ix1 = 2 * CRC32_TABLE_SIZE; 3082 const int ix2 = 3 * CRC32_TABLE_SIZE; 3083 const int ix3 = 4 * CRC32_TABLE_SIZE; 3084 #endif 3085 assert_different_registers(table, tc0, tc1, tc2); 3086 assert(table == tc3, "must be!"); 3087 3088 addi(tc0, table, ix0); 3089 addi(tc1, table, ix1); 3090 addi(tc2, table, ix2); 3091 if (ix3 != 0) addi(tc3, table, ix3); 3092 3093 return ix3; 3094 } 3095 3096 /** 3097 * uint32_t crc; 3098 * table[crc & 0xFF] ^ (crc >> 8); 3099 */ 3100 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) { 3101 assert_different_registers(crc, table, tmp); 3102 assert_different_registers(val, table); 3103 3104 if (crc == val) { // Must rotate first to use the unmodified value. 
3105 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3106 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions. 3107 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3108 } else { 3109 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits. 3110 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest. 3111 } 3112 lwzx(tmp, table, tmp); 3113 xorr(crc, crc, tmp); 3114 } 3115 3116 /** 3117 * Emits code to update CRC-32 with a byte value according to constants in table. 3118 * 3119 * @param [in,out]crc Register containing the crc. 3120 * @param [in]val Register containing the byte to fold into the CRC. 3121 * @param [in]table Register containing the table of crc constants. 3122 * 3123 * uint32_t crc; 3124 * val = crc_table[(val ^ crc) & 0xFF]; 3125 * crc = val ^ (crc >> 8); 3126 */ 3127 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 3128 BLOCK_COMMENT("update_byte_crc32:"); 3129 xorr(val, val, crc); 3130 fold_byte_crc32(crc, val, table, val); 3131 } 3132 3133 /** 3134 * @param crc register containing existing CRC (32-bit) 3135 * @param buf register pointing to input byte buffer (byte*) 3136 * @param len register containing number of bytes 3137 * @param table register pointing to CRC table 3138 */ 3139 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table, 3140 Register data, bool loopAlignment) { 3141 assert_different_registers(crc, buf, len, table, data); 3142 3143 Label L_mainLoop, L_done; 3144 const int mainLoop_stepping = 1; 3145 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4; 3146 3147 // Process all bytes in a single-byte loop. 3148 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do? 3149 beq(CCR0, L_done); 3150 3151 mtctr(len); 3152 align(mainLoop_alignment); 3153 BIND(L_mainLoop); 3154 lbz(data, 0, buf); // Byte from buffer, zero-extended. 3155 addi(buf, buf, mainLoop_stepping); // Advance buffer position. 3156 update_byte_crc32(crc, data, table); 3157 bdnz(L_mainLoop); // Iterate. 3158 3159 bind(L_done); 3160 } 3161 3162 /** 3163 * Emits code to update CRC-32 with a 4-byte value according to constants in table 3164 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c 3165 */ 3166 // A note on the lookup table address(es): 3167 // The implementation uses 4 table columns (byte-reversed versions for Big Endian). 3168 // To save the effort of adding the column offset to the table address each time 3169 // a table element is looked up, it is possible to pass the pre-calculated 3170 // column addresses. 3171 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary. 3172 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc, 3173 Register t0, Register t1, Register t2, Register t3, 3174 Register tc0, Register tc1, Register tc2, Register tc3) { 3175 assert_different_registers(crc, t3); 3176 3177 // XOR crc with next four bytes of buffer. 3178 lwz(t3, bufDisp, buf); 3179 if (bufInc != 0) { 3180 addi(buf, buf, bufInc); 3181 } 3182 xorr(t3, t3, crc); 3183 3184 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices. 
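  // (Shifted left by 2 because each table entry is a 4-byte uint32_t, so the byte value
  // scaled by 4 is the byte offset into its 256-entry table column.)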
3185 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2 3186 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2 3187 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2 3188 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2 3189 3190 // Use the pre-calculated column addresses. 3191 // Load pre-calculated table values. 3192 lwzx(t0, tc0, t0); 3193 lwzx(t1, tc1, t1); 3194 lwzx(t2, tc2, t2); 3195 lwzx(t3, tc3, t3); 3196 3197 // Calculate new crc from table values. 3198 xorr(t0, t0, t1); 3199 xorr(t2, t2, t3); 3200 xorr(crc, t0, t2); // Now crc contains the final checksum value. 3201 } 3202 3203 /** 3204 * @param crc register containing existing CRC (32-bit) 3205 * @param buf register pointing to input byte buffer (byte*) 3206 * @param len register containing number of bytes 3207 * @param table register pointing to CRC table 3208 * 3209 * uses R9..R12 as work register. Must be saved/restored by caller! 3210 */ 3211 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table, 3212 Register t0, Register t1, Register t2, Register t3, 3213 Register tc0, Register tc1, Register tc2, Register tc3, 3214 bool invertCRC) { 3215 assert_different_registers(crc, buf, len, table); 3216 3217 Label L_mainLoop, L_tail; 3218 Register tmp = t0; 3219 Register data = t0; 3220 Register tmp2 = t1; 3221 const int mainLoop_stepping = 4; 3222 const int tailLoop_stepping = 1; 3223 const int log_stepping = exact_log2(mainLoop_stepping); 3224 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32; 3225 const int complexThreshold = 2*mainLoop_stepping; 3226 3227 // Don't test for len <= 0 here. This pathological case should not occur anyway. 3228 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles 3229 // for all well-behaved cases. The situation itself is detected and handled correctly 3230 // within update_byteLoop_crc32. 3231 assert(tailLoop_stepping == 1, "check tailLoop_stepping!"); 3232 3233 BLOCK_COMMENT("kernel_crc32_1word {"); 3234 3235 if (invertCRC) { 3236 nand(crc, crc, crc); // 1s complement of crc 3237 } 3238 3239 // Check for short (<mainLoop_stepping) buffer. 3240 cmpdi(CCR0, len, complexThreshold); 3241 blt(CCR0, L_tail); 3242 3243 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance. 3244 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions. 3245 { 3246 // Align buf addr to mainLoop_stepping boundary. 3247 neg(tmp2, buf); // Calculate # preLoop iterations for alignment. 3248 rldicl(tmp2, tmp2, 0, 64-log_stepping); // Rotate tmp2 0 bits, insert into tmp2, anding with mask with 1s from 62..63. 3249 3250 if (complexThreshold > mainLoop_stepping) { 3251 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 3252 } else { 3253 sub(tmp, len, tmp2); // Remaining bytes for main loop. 3254 cmpdi(CCR0, tmp, mainLoop_stepping); 3255 blt(CCR0, L_tail); // For less than one mainloop_stepping left, do only tail processing 3256 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed). 
3257 } 3258 update_byteLoop_crc32(crc, buf, tmp2, table, data, false); 3259 } 3260 3261 srdi(tmp2, len, log_stepping); // #iterations for mainLoop 3262 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop 3263 mtctr(tmp2); 3264 3265 #ifdef VM_LITTLE_ENDIAN 3266 Register crc_rv = crc; 3267 #else 3268 Register crc_rv = tmp; // Load_reverse needs separate registers to work on. 3269 // Occupies tmp, but frees up crc. 3270 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data. 3271 tmp = crc; 3272 #endif 3273 3274 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3); 3275 3276 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement. 3277 BIND(L_mainLoop); 3278 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3); 3279 bdnz(L_mainLoop); 3280 3281 #ifndef VM_LITTLE_ENDIAN 3282 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data. 3283 tmp = crc_rv; // Tmp uses it's original register again. 3284 #endif 3285 3286 // Restore original table address for tailLoop. 3287 if (reconstructTableOffset != 0) { 3288 addi(table, table, -reconstructTableOffset); 3289 } 3290 3291 // Process last few (<complexThreshold) bytes of buffer. 3292 BIND(L_tail); 3293 update_byteLoop_crc32(crc, buf, len, table, data, false); 3294 3295 if (invertCRC) { 3296 nand(crc, crc, crc); // 1s complement of crc 3297 } 3298 BLOCK_COMMENT("} kernel_crc32_1word"); 3299 } 3300 3301 /** 3302 * @param crc register containing existing CRC (32-bit) 3303 * @param buf register pointing to input byte buffer (byte*) 3304 * @param len register containing number of bytes 3305 * @param constants register pointing to precomputed constants 3306 * @param t0-t6 temp registers 3307 */ 3308 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants, 3309 Register t0, Register t1, Register t2, Register t3, 3310 Register t4, Register t5, Register t6, bool invertCRC) { 3311 assert_different_registers(crc, buf, len, constants); 3312 3313 Label L_tail; 3314 3315 BLOCK_COMMENT("kernel_crc32_vpmsum {"); 3316 3317 if (invertCRC) { 3318 nand(crc, crc, crc); // 1s complement of crc 3319 } 3320 3321 // Enforce 32 bit. 3322 clrldi(len, len, 32); 3323 3324 // Align if we have enough bytes for the fast version. 3325 const int alignment = 16, 3326 threshold = 32; 3327 Register prealign = t0; 3328 3329 neg(prealign, buf); 3330 addi(t1, len, -threshold); 3331 andi(prealign, prealign, alignment - 1); 3332 cmpw(CCR0, t1, prealign); 3333 blt(CCR0, L_tail); // len - prealign < threshold? 3334 3335 subf(len, prealign, len); 3336 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false); 3337 3338 // Calculate from first aligned address as far as possible. 3339 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants. 3340 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6); 3341 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again. 3342 3343 // Remaining bytes. 
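  // L_tail covers both the short-buffer case branched to above and whatever the aligned
  // vector kernel leaves over; the bytes are folded in one at a time.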
3344 BIND(L_tail); 3345 update_byteLoop_crc32(crc, buf, len, constants, t2, false); 3346 3347 if (invertCRC) { 3348 nand(crc, crc, crc); // 1s complement of crc 3349 } 3350 3351 BLOCK_COMMENT("} kernel_crc32_vpmsum"); 3352 } 3353 3354 /** 3355 * @param crc register containing existing CRC (32-bit) 3356 * @param buf register pointing to input byte buffer (byte*) 3357 * @param len register containing number of bytes (will get updated to remaining bytes) 3358 * @param constants register pointing to CRC table for 128-bit aligned memory 3359 * @param t0-t6 temp registers 3360 */ 3361 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants, 3362 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) { 3363 3364 // Save non-volatile vector registers (frameless). 3365 Register offset = t1; 3366 int offsetInt = 0; 3367 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP); 3368 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP); 3369 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP); 3370 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP); 3371 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP); 3372 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP); 3373 #ifndef VM_LITTLE_ENDIAN 3374 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP); 3375 #endif 3376 offsetInt -= 8; std(R14, offsetInt, R1_SP); 3377 offsetInt -= 8; std(R15, offsetInt, R1_SP); 3378 3379 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor 3380 // bytes per iteration. The basic scheme is: 3381 // lvx: load vector (Big Endian needs reversal) 3382 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift 3383 // vxor: xor partial results together to get unroll_factor2 vectors 3384 3385 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors. 3386 3387 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants. 3388 const int unroll_factor = CRC32_UNROLL_FACTOR, 3389 unroll_factor2 = CRC32_UNROLL_FACTOR2; 3390 3391 const int outer_consts_size = (unroll_factor2 - 1) * 16, 3392 inner_consts_size = (unroll_factor / unroll_factor2) * 16; 3393 3394 // Support registers. 3395 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 }; 3396 Register num_bytes = R14, 3397 loop_count = R15, 3398 cur_const = crc; // will live in VCRC 3399 // Constant array for outer loop: unroll_factor2 - 1 registers, 3400 // Constant array for inner loop: unroll_factor / unroll_factor2 registers. 3401 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 }, 3402 consts1[] = { VR23, VR24 }; 3403 // Data register arrays: 2 arrays with unroll_factor2 registers. 3404 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 }, 3405 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 }; 3406 3407 VectorRegister VCRC = data0[0]; 3408 VectorRegister Vc = VR25; 3409 VectorRegister swap_bytes = VR26; // Only for Big Endian. 3410 3411 // We have at least 1 iteration (ensured by caller). 3412 Label L_outer_loop, L_inner_loop, L_last; 3413 3414 // If supported set DSCR pre-fetch to deepest. 
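  // (Assumption about the DSCR layout: the low-order bits hold the default prefetch depth,
  // so or-ing in 7 requests the deepest setting.)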
3415 if (VM_Version::has_mfdscr()) { 3416 load_const_optimized(t0, VM_Version::_dscr_val | 7); 3417 mtdscr(t0); 3418 } 3419 3420 mtvrwz(VCRC, crc); // crc lives in VCRC, now 3421 3422 for (int i = 1; i < unroll_factor2; ++i) { 3423 li(offs[i], 16 * i); 3424 } 3425 3426 // Load consts for outer loop 3427 lvx(consts0[0], constants); 3428 for (int i = 1; i < unroll_factor2 - 1; ++i) { 3429 lvx(consts0[i], offs[i], constants); 3430 } 3431 3432 load_const_optimized(num_bytes, 16 * unroll_factor); 3433 3434 // Reuse data registers outside of the loop. 3435 VectorRegister Vtmp = data1[0]; 3436 VectorRegister Vtmp2 = data1[1]; 3437 VectorRegister zeroes = data1[2]; 3438 3439 vspltisb(Vtmp, 0); 3440 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC. 3441 3442 // Load vector for vpermxor (to xor both 64 bit parts together) 3443 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f 3444 vspltisb(Vc, 4); 3445 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0 3446 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0); 3447 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f 3448 3449 #ifdef VM_LITTLE_ENDIAN 3450 #define BE_swap_bytes(x) 3451 #else 3452 vspltisb(Vtmp2, 0xf); 3453 vxor(swap_bytes, Vtmp, Vtmp2); 3454 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes) 3455 #endif 3456 3457 cmpd(CCR0, len, num_bytes); 3458 blt(CCR0, L_last); 3459 3460 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop 3461 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off. 3462 3463 // ********** Main loop start ********** 3464 align(32); 3465 bind(L_outer_loop); 3466 3467 // Begin of unrolled first iteration (no xor). 3468 lvx(data1[0], buf); 3469 for (int i = 1; i < unroll_factor2 / 2; ++i) { 3470 lvx(data1[i], offs[i], buf); 3471 } 3472 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3473 lvx(consts1[0], cur_const); 3474 mtctr(loop_count); 3475 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3476 BE_swap_bytes(data1[i]); 3477 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC. 3478 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3479 vpmsumw(data0[i], data1[i], consts1[0]); 3480 } 3481 addi(buf, buf, 16 * unroll_factor2); 3482 subf(len, num_bytes, len); 3483 lvx(consts1[1], offs[1], cur_const); 3484 addi(cur_const, cur_const, 32); 3485 // Begin of unrolled second iteration (head). 3486 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3487 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3488 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); } 3489 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]); 3490 } 3491 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3492 BE_swap_bytes(data1[i]); 3493 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf); 3494 vpmsumw(data1[i], data1[i], consts1[1]); 3495 } 3496 addi(buf, buf, 16 * unroll_factor2); 3497 3498 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated. 3499 // Double-iteration allows using the 2 constant registers alternatingly. 3500 align(32); 3501 bind(L_inner_loop); 3502 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling. 
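    // The two constant registers are used alternately: odd j reloads consts1[0], even j
    // reloads consts1[1] and advances cur_const, so fetching the next constants overlaps
    // with the vpmsumw operations that still use the current ones.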
3503 if (j & 1) { 3504 lvx(consts1[0], cur_const); 3505 } else { 3506 lvx(consts1[1], offs[1], cur_const); 3507 addi(cur_const, cur_const, 32); 3508 } 3509 for (int i = 0; i < unroll_factor2; ++i) { 3510 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input. 3511 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; } 3512 BE_swap_bytes(data1[idx]); 3513 vxor(data0[i], data0[i], data1[i]); 3514 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf); 3515 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]); 3516 } 3517 addi(buf, buf, 16 * unroll_factor2); 3518 } 3519 bdnz(L_inner_loop); 3520 3521 addi(cur_const, constants, outer_consts_size); // Reset 3522 3523 // Tail of last iteration (no loads). 3524 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3525 BE_swap_bytes(data1[i + unroll_factor2 / 2]); 3526 vxor(data0[i], data0[i], data1[i]); 3527 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]); 3528 } 3529 for (int i = 0; i < unroll_factor2 / 2; ++i) { 3530 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts. 3531 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]); 3532 } 3533 3534 // Last data register is ok, other ones need fixup shift. 3535 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) { 3536 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); 3537 } 3538 3539 // Combine to 128 bit result vector VCRC = data0[0]. 3540 for (int i = 1; i < unroll_factor2; i<<=1) { 3541 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) { 3542 vxor(data0[j], data0[j], data0[j+i]); 3543 } 3544 } 3545 cmpd(CCR0, len, num_bytes); 3546 bge(CCR0, L_outer_loop); 3547 3548 // Last chance with lower num_bytes. 3549 bind(L_last); 3550 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations. 3551 // Point behind last const for inner loop. 3552 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3553 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used. 3554 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2)); 3555 subf(cur_const, R0, cur_const); // Point to constant to be used first. 3556 3557 addic_(loop_count, loop_count, -1); // One double-iteration peeled off. 3558 bgt(CCR0, L_outer_loop); 3559 // ********** Main loop end ********** 3560 3561 // Restore DSCR pre-fetch value. 3562 if (VM_Version::has_mfdscr()) { 3563 load_const_optimized(t0, VM_Version::_dscr_val); 3564 mtdscr(t0); 3565 } 3566 3567 // ********** Simple loop for remaining 16 byte blocks ********** 3568 { 3569 Label L_loop, L_done; 3570 3571 srdi_(t0, len, 4); // 16 bytes per iteration 3572 clrldi(len, len, 64-4); 3573 beq(CCR0, L_done); 3574 3575 // Point to const (same as last const for inner loop). 3576 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16); 3577 mtctr(t0); 3578 lvx(Vtmp2, cur_const); 3579 3580 align(32); 3581 bind(L_loop); 3582 3583 lvx(Vtmp, buf); 3584 addi(buf, buf, 16); 3585 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 
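    // Fold in the next 16-byte block: byte-swap on Big Endian, xor into the accumulator,
    // then multiply by the single per-block shift constant (Vtmp2, loaded once before the loop).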
3586 BE_swap_bytes(Vtmp); 3587 vxor(VCRC, VCRC, Vtmp); 3588 vpmsumw(VCRC, VCRC, Vtmp2); 3589 bdnz(L_loop); 3590 3591 bind(L_done); 3592 } 3593 // ********** Simple loop end ********** 3594 #undef BE_swap_bytes 3595 3596 // Point to Barrett constants 3597 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size); 3598 3599 vspltisb(zeroes, 0); 3600 3601 // Combine to 64 bit result. 3602 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result. 3603 3604 // Reduce to 32 bit CRC: Remainder by multiply-high. 3605 lvx(Vtmp, cur_const); 3606 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit. 3607 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly. 3608 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit. 3609 vsldoi(Vtmp, zeroes, Vtmp, 8); 3610 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly. 3611 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit. 3612 3613 // Move result. len is already updated. 3614 vsldoi(VCRC, VCRC, zeroes, 8); 3615 mfvrd(crc, VCRC); 3616 3617 // Restore non-volatile Vector registers (frameless). 3618 offsetInt = 0; 3619 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP); 3620 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP); 3621 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP); 3622 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP); 3623 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP); 3624 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP); 3625 #ifndef VM_LITTLE_ENDIAN 3626 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP); 3627 #endif 3628 offsetInt -= 8; ld(R14, offsetInt, R1_SP); 3629 offsetInt -= 8; ld(R15, offsetInt, R1_SP); 3630 } 3631 3632 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2, 3633 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) { 3634 load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr() 3635 : StubRoutines::crc_table_addr() , R0); 3636 3637 if (VM_Version::has_vpmsumb()) { 3638 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c); 3639 } else { 3640 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c); 3641 } 3642 } 3643 3644 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) { 3645 assert_different_registers(crc, val, table); 3646 3647 BLOCK_COMMENT("kernel_crc32_singleByteReg:"); 3648 if (invertCRC) { 3649 nand(crc, crc, crc); // 1s complement of crc 3650 } 3651 3652 update_byte_crc32(crc, val, table); 3653 3654 if (invertCRC) { 3655 nand(crc, crc, crc); // 1s complement of crc 3656 } 3657 } 3658 3659 // dest_lo += src1 + src2 3660 // dest_hi += carry1 + carry2 3661 void MacroAssembler::add2_with_carry(Register dest_hi, 3662 Register dest_lo, 3663 Register src1, Register src2) { 3664 li(R0, 0); 3665 addc(dest_lo, dest_lo, src1); 3666 adde(dest_hi, dest_hi, R0); 3667 addc(dest_lo, dest_lo, src2); 3668 adde(dest_hi, dest_hi, R0); 3669 } 3670 3671 // Multiply 64 bit by 64 bit first loop. 
3672 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, 3673 Register x_xstart, 3674 Register y, Register y_idx, 3675 Register z, 3676 Register carry, 3677 Register product_high, Register product, 3678 Register idx, Register kdx, 3679 Register tmp) { 3680 // jlong carry, x[], y[], z[]; 3681 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3682 // huge_128 product = y[idx] * x[xstart] + carry; 3683 // z[kdx] = (jlong)product; 3684 // carry = (jlong)(product >>> 64); 3685 // } 3686 // z[xstart] = carry; 3687 3688 Label L_first_loop, L_first_loop_exit; 3689 Label L_one_x, L_one_y, L_multiply; 3690 3691 addic_(xstart, xstart, -1); 3692 blt(CCR0, L_one_x); // Special case: length of x is 1. 3693 3694 // Load next two integers of x. 3695 sldi(tmp, xstart, LogBytesPerInt); 3696 ldx(x_xstart, x, tmp); 3697 #ifdef VM_LITTLE_ENDIAN 3698 rldicl(x_xstart, x_xstart, 32, 0); 3699 #endif 3700 3701 align(32, 16); 3702 bind(L_first_loop); 3703 3704 cmpdi(CCR0, idx, 1); 3705 blt(CCR0, L_first_loop_exit); 3706 addi(idx, idx, -2); 3707 beq(CCR0, L_one_y); 3708 3709 // Load next two integers of y. 3710 sldi(tmp, idx, LogBytesPerInt); 3711 ldx(y_idx, y, tmp); 3712 #ifdef VM_LITTLE_ENDIAN 3713 rldicl(y_idx, y_idx, 32, 0); 3714 #endif 3715 3716 3717 bind(L_multiply); 3718 multiply64(product_high, product, x_xstart, y_idx); 3719 3720 li(tmp, 0); 3721 addc(product, product, carry); // Add carry to result. 3722 adde(product_high, product_high, tmp); // Add carry of the last addition. 3723 addi(kdx, kdx, -2); 3724 3725 // Store result. 3726 #ifdef VM_LITTLE_ENDIAN 3727 rldicl(product, product, 32, 0); 3728 #endif 3729 sldi(tmp, kdx, LogBytesPerInt); 3730 stdx(product, z, tmp); 3731 mr_if_needed(carry, product_high); 3732 b(L_first_loop); 3733 3734 3735 bind(L_one_y); // Load one 32 bit portion of y as (0,value). 3736 3737 lwz(y_idx, 0, y); 3738 b(L_multiply); 3739 3740 3741 bind(L_one_x); // Load one 32 bit portion of x as (0,value). 3742 3743 lwz(x_xstart, 0, x); 3744 b(L_first_loop); 3745 3746 bind(L_first_loop_exit); 3747 } 3748 3749 // Multiply 64 bit by 64 bit and add 128 bit. 3750 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, 3751 Register z, Register yz_idx, 3752 Register idx, Register carry, 3753 Register product_high, Register product, 3754 Register tmp, int offset) { 3755 3756 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry; 3757 // z[kdx] = (jlong)product; 3758 3759 sldi(tmp, idx, LogBytesPerInt); 3760 if (offset) { 3761 addi(tmp, tmp, offset); 3762 } 3763 ldx(yz_idx, y, tmp); 3764 #ifdef VM_LITTLE_ENDIAN 3765 rldicl(yz_idx, yz_idx, 32, 0); 3766 #endif 3767 3768 multiply64(product_high, product, x_xstart, yz_idx); 3769 ldx(yz_idx, z, tmp); 3770 #ifdef VM_LITTLE_ENDIAN 3771 rldicl(yz_idx, yz_idx, 32, 0); 3772 #endif 3773 3774 add2_with_carry(product_high, product, carry, yz_idx); 3775 3776 sldi(tmp, idx, LogBytesPerInt); 3777 if (offset) { 3778 addi(tmp, tmp, offset); 3779 } 3780 #ifdef VM_LITTLE_ENDIAN 3781 rldicl(product, product, 32, 0); 3782 #endif 3783 stdx(product, z, tmp); 3784 } 3785 3786 // Multiply 128 bit by 128 bit. Unrolled inner loop. 
3787 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, 3788 Register y, Register z, 3789 Register yz_idx, Register idx, Register carry, 3790 Register product_high, Register product, 3791 Register carry2, Register tmp) { 3792 3793 // jlong carry, x[], y[], z[]; 3794 // int kdx = ystart+1; 3795 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 3796 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 3797 // z[kdx+idx+1] = (jlong)product; 3798 // jlong carry2 = (jlong)(product >>> 64); 3799 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 3800 // z[kdx+idx] = (jlong)product; 3801 // carry = (jlong)(product >>> 64); 3802 // } 3803 // idx += 2; 3804 // if (idx > 0) { 3805 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 3806 // z[kdx+idx] = (jlong)product; 3807 // carry = (jlong)(product >>> 64); 3808 // } 3809 3810 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 3811 const Register jdx = R0; 3812 3813 // Scale the index. 3814 srdi_(jdx, idx, 2); 3815 beq(CCR0, L_third_loop_exit); 3816 mtctr(jdx); 3817 3818 align(32, 16); 3819 bind(L_third_loop); 3820 3821 addi(idx, idx, -4); 3822 3823 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8); 3824 mr_if_needed(carry2, product_high); 3825 3826 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0); 3827 mr_if_needed(carry, product_high); 3828 bdnz(L_third_loop); 3829 3830 bind(L_third_loop_exit); // Handle any left-over operand parts. 3831 3832 andi_(idx, idx, 0x3); 3833 beq(CCR0, L_post_third_loop_done); 3834 3835 Label L_check_1; 3836 3837 addic_(idx, idx, -2); 3838 blt(CCR0, L_check_1); 3839 3840 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0); 3841 mr_if_needed(carry, product_high); 3842 3843 bind(L_check_1); 3844 3845 addi(idx, idx, 0x2); 3846 andi_(idx, idx, 0x1); 3847 addic_(idx, idx, -1); 3848 blt(CCR0, L_post_third_loop_done); 3849 3850 sldi(tmp, idx, LogBytesPerInt); 3851 lwzx(yz_idx, y, tmp); 3852 multiply64(product_high, product, x_xstart, yz_idx); 3853 lwzx(yz_idx, z, tmp); 3854 3855 add2_with_carry(product_high, product, yz_idx, carry); 3856 3857 sldi(tmp, idx, LogBytesPerInt); 3858 stwx(product, z, tmp); 3859 srdi(product, product, 32); 3860 3861 sldi(product_high, product_high, 32); 3862 orr(product, product, product_high); 3863 mr_if_needed(carry, product); 3864 3865 bind(L_post_third_loop_done); 3866 } // multiply_128_x_128_loop 3867 3868 void MacroAssembler::muladd(Register out, Register in, 3869 Register offset, Register len, Register k, 3870 Register tmp1, Register tmp2, Register carry) { 3871 3872 // Labels 3873 Label LOOP, SKIP; 3874 3875 // Make sure length is positive. 
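  // Rough sketch of what this computes, in the style of java.math.BigInteger::implMulAdd
  // (variable names below are illustrative, not part of this file):
  //   long kLong = k & 0xffffffffL, carry = 0;
  //   for (int j = len - 1; j >= 0; j--) {
  //     long product = (in[j] & 0xffffffffL) * kLong + (out[offset] & 0xffffffffL) + carry;
  //     out[offset--] = (int)product;
  //     carry = product >>> 32;
  //   }
  // The compare below branches to SKIP for len <= 0.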
3876 cmpdi (CCR0, len, 0); 3877 3878 // Prepare variables 3879 subi (offset, offset, 4); 3880 li (carry, 0); 3881 ble (CCR0, SKIP); 3882 3883 mtctr (len); 3884 subi (len, len, 1 ); 3885 sldi (len, len, 2 ); 3886 3887 // Main loop 3888 bind(LOOP); 3889 lwzx (tmp1, len, in ); 3890 lwzx (tmp2, offset, out ); 3891 mulld (tmp1, tmp1, k ); 3892 add (tmp2, carry, tmp2 ); 3893 add (tmp2, tmp1, tmp2 ); 3894 stwx (tmp2, offset, out ); 3895 srdi (carry, tmp2, 32 ); 3896 subi (offset, offset, 4 ); 3897 subi (len, len, 4 ); 3898 bdnz (LOOP); 3899 bind(SKIP); 3900 } 3901 3902 void MacroAssembler::multiply_to_len(Register x, Register xlen, 3903 Register y, Register ylen, 3904 Register z, Register zlen, 3905 Register tmp1, Register tmp2, 3906 Register tmp3, Register tmp4, 3907 Register tmp5, Register tmp6, 3908 Register tmp7, Register tmp8, 3909 Register tmp9, Register tmp10, 3910 Register tmp11, Register tmp12, 3911 Register tmp13) { 3912 3913 ShortBranchVerifier sbv(this); 3914 3915 assert_different_registers(x, xlen, y, ylen, z, zlen, 3916 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 3917 assert_different_registers(x, xlen, y, ylen, z, zlen, 3918 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7); 3919 assert_different_registers(x, xlen, y, ylen, z, zlen, 3920 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8); 3921 3922 const Register idx = tmp1; 3923 const Register kdx = tmp2; 3924 const Register xstart = tmp3; 3925 3926 const Register y_idx = tmp4; 3927 const Register carry = tmp5; 3928 const Register product = tmp6; 3929 const Register product_high = tmp7; 3930 const Register x_xstart = tmp8; 3931 const Register tmp = tmp9; 3932 3933 // First Loop. 3934 // 3935 // final static long LONG_MASK = 0xffffffffL; 3936 // int xstart = xlen - 1; 3937 // int ystart = ylen - 1; 3938 // long carry = 0; 3939 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 3940 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 3941 // z[kdx] = (int)product; 3942 // carry = product >>> 32; 3943 // } 3944 // z[xstart] = (int)carry; 3945 3946 mr_if_needed(idx, ylen); // idx = ylen 3947 mr_if_needed(kdx, zlen); // kdx = xlen + ylen 3948 li(carry, 0); // carry = 0 3949 3950 Label L_done; 3951 3952 addic_(xstart, xlen, -1); 3953 blt(CCR0, L_done); 3954 3955 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, 3956 carry, product_high, product, idx, kdx, tmp); 3957 3958 Label L_second_loop; 3959 3960 cmpdi(CCR0, kdx, 0); 3961 beq(CCR0, L_second_loop); 3962 3963 Label L_carry; 3964 3965 addic_(kdx, kdx, -1); 3966 beq(CCR0, L_carry); 3967 3968 // Store lower 32 bits of carry. 3969 sldi(tmp, kdx, LogBytesPerInt); 3970 stwx(carry, z, tmp); 3971 srdi(carry, carry, 32); 3972 addi(kdx, kdx, -1); 3973 3974 3975 bind(L_carry); 3976 3977 // Store upper 32 bits of carry. 3978 sldi(tmp, kdx, LogBytesPerInt); 3979 stwx(carry, z, tmp); 3980 3981 // Second and third (nested) loops.
3982 // 3983 // for (int i = xstart-1; i >= 0; i--) { // Second loop 3984 // carry = 0; 3985 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 3986 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 3987 // (z[k] & LONG_MASK) + carry; 3988 // z[k] = (int)product; 3989 // carry = product >>> 32; 3990 // } 3991 // z[i] = (int)carry; 3992 // } 3993 // 3994 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 3995 3996 bind(L_second_loop); 3997 3998 li(carry, 0); // carry = 0; 3999 4000 addic_(xstart, xstart, -1); // i = xstart-1; 4001 blt(CCR0, L_done); 4002 4003 Register zsave = tmp10; 4004 4005 mr(zsave, z); 4006 4007 4008 Label L_last_x; 4009 4010 sldi(tmp, xstart, LogBytesPerInt); 4011 add(z, z, tmp); // z = z + k - j 4012 addi(z, z, 4); 4013 addic_(xstart, xstart, -1); // i = xstart-1; 4014 blt(CCR0, L_last_x); 4015 4016 sldi(tmp, xstart, LogBytesPerInt); 4017 ldx(x_xstart, x, tmp); 4018 #ifdef VM_LITTLE_ENDIAN 4019 rldicl(x_xstart, x_xstart, 32, 0); 4020 #endif 4021 4022 4023 Label L_third_loop_prologue; 4024 4025 bind(L_third_loop_prologue); 4026 4027 Register xsave = tmp11; 4028 Register xlensave = tmp12; 4029 Register ylensave = tmp13; 4030 4031 mr(xsave, x); 4032 mr(xlensave, xstart); 4033 mr(ylensave, ylen); 4034 4035 4036 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen, 4037 carry, product_high, product, x, tmp); 4038 4039 mr(z, zsave); 4040 mr(x, xsave); 4041 mr(xlen, xlensave); // This is the decrement of the loop counter! 4042 mr(ylen, ylensave); 4043 4044 addi(tmp3, xlen, 1); 4045 sldi(tmp, tmp3, LogBytesPerInt); 4046 stwx(carry, z, tmp); 4047 addic_(tmp3, tmp3, -1); 4048 blt(CCR0, L_done); 4049 4050 srdi(carry, carry, 32); 4051 sldi(tmp, tmp3, LogBytesPerInt); 4052 stwx(carry, z, tmp); 4053 b(L_second_loop); 4054 4055 // Next infrequent code is moved outside loops. 4056 bind(L_last_x); 4057 4058 lwz(x_xstart, 0, x); 4059 b(L_third_loop_prologue); 4060 4061 bind(L_done); 4062 } // multiply_to_len 4063 4064 void MacroAssembler::asm_assert(bool check_equal, const char *msg) { 4065 #ifdef ASSERT 4066 Label ok; 4067 if (check_equal) { 4068 beq(CCR0, ok); 4069 } else { 4070 bne(CCR0, ok); 4071 } 4072 stop(msg); 4073 bind(ok); 4074 #endif 4075 } 4076 4077 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset, 4078 Register mem_base, const char* msg) { 4079 #ifdef ASSERT 4080 switch (size) { 4081 case 4: 4082 lwz(R0, mem_offset, mem_base); 4083 cmpwi(CCR0, R0, 0); 4084 break; 4085 case 8: 4086 ld(R0, mem_offset, mem_base); 4087 cmpdi(CCR0, R0, 0); 4088 break; 4089 default: 4090 ShouldNotReachHere(); 4091 } 4092 asm_assert(check_equal, msg); 4093 #endif // ASSERT 4094 } 4095 4096 void MacroAssembler::verify_coop(Register coop, const char* msg) { 4097 if (!VerifyOops) { return; } 4098 if (UseCompressedOops) { decode_heap_oop(coop); } 4099 verify_oop(coop, msg); 4100 if (UseCompressedOops) { encode_heap_oop(coop, coop); } 4101 } 4102 4103 // READ: oop. KILL: R0. Volatile floats perhaps. 4104 void MacroAssembler::verify_oop(Register oop, const char* msg) { 4105 if (!VerifyOops) { 4106 return; 4107 } 4108 4109 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4110 const Register tmp = R11; // Will be preserved. 
4111 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4112 4113 BLOCK_COMMENT("verify_oop {"); 4114 4115 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4116 4117 mr_if_needed(R4_ARG2, oop); 4118 save_LR_CR(tmp); // save in old frame 4119 push_frame_reg_args(nbytes_save, tmp); 4120 // load FunctionDescriptor** / entry_address * 4121 load_const_optimized(tmp, fd, R0); 4122 // load FunctionDescriptor* / entry_address 4123 ld(tmp, 0, tmp); 4124 load_const_optimized(R3_ARG1, (address)msg, R0); 4125 // Call destination for its side effect. 4126 call_c(tmp); 4127 4128 pop_frame(); 4129 restore_LR_CR(tmp); 4130 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4131 4132 BLOCK_COMMENT("} verify_oop"); 4133 } 4134 4135 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) { 4136 if (!VerifyOops) { 4137 return; 4138 } 4139 4140 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address(); 4141 const Register tmp = R11; // Will be preserved. 4142 const int nbytes_save = MacroAssembler::num_volatile_regs * 8; 4143 save_volatile_gprs(R1_SP, -nbytes_save); // except R0 4144 4145 ld(R4_ARG2, offs, base); 4146 save_LR_CR(tmp); // save in old frame 4147 push_frame_reg_args(nbytes_save, tmp); 4148 // load FunctionDescriptor** / entry_address * 4149 load_const_optimized(tmp, fd, R0); 4150 // load FunctionDescriptor* / entry_address 4151 ld(tmp, 0, tmp); 4152 load_const_optimized(R3_ARG1, (address)msg, R0); 4153 // Call destination for its side effect. 4154 call_c(tmp); 4155 4156 pop_frame(); 4157 restore_LR_CR(tmp); 4158 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0 4159 } 4160 4161 // Call a C-function that prints output. 4162 void MacroAssembler::stop(int type, const char* msg) { 4163 bool msg_present = (msg != nullptr); 4164 4165 #ifndef PRODUCT 4166 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null")); 4167 #else 4168 block_comment("stop {"); 4169 #endif 4170 4171 if (msg_present) { 4172 type |= stop_msg_present; 4173 } 4174 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type); 4175 if (msg_present) { 4176 emit_int64((uintptr_t)msg); 4177 } 4178 4179 block_comment("} stop;"); 4180 } 4181 4182 #ifndef PRODUCT 4183 // Write pattern 0x0101010101010101 in memory region [low-before, high+after]. 4184 // Val, addr are temp registers. 4185 // If low == addr, addr is killed. 4186 // High is preserved. 4187 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) { 4188 if (!ZapMemory) return; 4189 4190 assert_different_registers(low, val); 4191 4192 BLOCK_COMMENT("zap memory region {"); 4193 load_const_optimized(val, 0x0101010101010101); 4194 int size = before + after; 4195 if (low == high && size < 5 && size > 0) { 4196 int offset = -before*BytesPerWord; 4197 for (int i = 0; i < size; ++i) { 4198 std(val, offset, low); 4199 offset += (1*BytesPerWord); 4200 } 4201 } else { 4202 addi(addr, low, -before*BytesPerWord); 4203 assert_different_registers(high, val); 4204 if (after) addi(high, high, after * BytesPerWord); 4205 Label loop; 4206 bind(loop); 4207 std(val, 0, addr); 4208 addi(addr, addr, 8); 4209 cmpd(CCR6, addr, high); 4210 ble(CCR6, loop); 4211 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value. 
4212 } 4213 BLOCK_COMMENT("} zap memory region"); 4214 } 4215 4216 #endif // !PRODUCT 4217 4218 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp, 4219 const bool* flag_addr, Label& label) { 4220 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true); 4221 assert(sizeof(bool) == 1, "PowerPC ABI"); 4222 masm->lbz(temp, simm16_offset, temp); 4223 masm->cmpwi(CCR0, temp, 0); 4224 masm->beq(CCR0, label); 4225 } 4226 4227 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() { 4228 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label); 4229 } 4230 4231 SkipIfEqualZero::~SkipIfEqualZero() { 4232 _masm->bind(_label); 4233 } 4234 4235 void MacroAssembler::cache_wb(Address line) { 4236 assert(line.index() == noreg, "index should be noreg"); 4237 assert(line.disp() == 0, "displacement should be 0"); 4238 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory"); 4239 // Data Cache Store, not really a flush, so it works like a sync of cache 4240 // line and persistent mem, i.e. copying the cache line to persistent memory whilst 4241 // not invalidating the cache line. 4242 dcbst(line.base()); 4243 } 4244 4245 void MacroAssembler::cache_wbsync(bool is_presync) { 4246 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory"); 4247 // We only need a post sync barrier. Post means _after_ a cache line flush or 4248 // store instruction, pre means a barrier emitted before such instructions. 4249 if (!is_presync) { 4250 fence(); 4251 } 4252 } 4253 4254 void MacroAssembler::push_cont_fastpath() { 4255 Label done; 4256 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4257 cmpld(CCR0, R1_SP, R0); 4258 ble(CCR0, done); 4259 st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread); 4260 bind(done); 4261 } 4262 4263 void MacroAssembler::pop_cont_fastpath() { 4264 Label done; 4265 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4266 cmpld(CCR0, R1_SP, R0); 4267 ble(CCR0, done); 4268 li(R0, 0); 4269 st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread); 4270 bind(done); 4271 } 4272 4273 // Note: Must preserve CCR0 EQ (invariant). 4274 void MacroAssembler::inc_held_monitor_count(Register tmp) { 4275 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4276 #ifdef ASSERT 4277 Label ok; 4278 cmpdi(CCR0, tmp, 0); 4279 bge_predict_taken(CCR0, ok); 4280 stop("held monitor count is negative at increment"); 4281 bind(ok); 4282 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ 4283 #endif 4284 addi(tmp, tmp, 1); 4285 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4286 } 4287 4288 // Note: Must preserve CCR0 EQ (invariant). 4289 void MacroAssembler::dec_held_monitor_count(Register tmp) { 4290 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4291 #ifdef ASSERT 4292 Label ok; 4293 cmpdi(CCR0, tmp, 0); 4294 bgt_predict_taken(CCR0, ok); 4295 stop("held monitor count is <= 0 at decrement"); 4296 bind(ok); 4297 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ 4298 #endif 4299 addi(tmp, tmp, -1); 4300 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread); 4301 } 4302 4303 // Function to flip between unlocked and locked state (fast locking). 4304 // Branches to failed if the state is not as expected with CCR0 NE.
4305 // Falls through upon success with CCR0 EQ. 4306 // This requires fewer instructions and registers and is easier to use than the 4307 // cmpxchg based implementation. 4308 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) { 4309 assert_different_registers(obj, tmp, R0); 4310 Label retry; 4311 4312 if (semantics & MemBarRel) { 4313 release(); 4314 } 4315 4316 bind(retry); 4317 STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this! 4318 if (!is_unlock) { 4319 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock()); 4320 xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit 4321 andi_(R0, tmp, markWord::lock_mask_in_place); 4322 bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0) 4323 } else { 4324 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock()); 4325 andi_(R0, tmp, markWord::lock_mask_in_place); 4326 bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0) 4327 ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit 4328 } 4329 stdcx_(tmp, obj); 4330 bne(CCR0, retry); 4331 4332 if (semantics & MemBarFenceAfter) { 4333 fence(); 4334 } else if (semantics & MemBarAcq) { 4335 isync(); 4336 } 4337 } 4338 4339 // Implements lightweight-locking. 4340 // 4341 // - obj: the object to be locked 4342 // - t1, t2: temporary registers 4343 void MacroAssembler::lightweight_lock(Register obj, Register t1, Register t2, Label& slow) { 4344 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4345 assert_different_registers(obj, t1, t2); 4346 4347 Label push; 4348 const Register top = t1; 4349 const Register mark = t2; 4350 const Register t = R0; 4351 4352 // Check if the lock-stack is full. 4353 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4354 cmplwi(CCR0, top, LockStack::end_offset()); 4355 bge(CCR0, slow); 4356 4357 // The underflow check is elided. The recursive check will always fail 4358 // when the lock stack is empty because of the _bad_oop_sentinel field. 4359 4360 // Check for recursion. 4361 subi(t, top, oopSize); 4362 ldx(t, R16_thread, t); 4363 cmpd(CCR0, obj, t); 4364 beq(CCR0, push); 4365 4366 // Check header for monitor (0b10) or locked (0b00). 4367 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4368 xori(t, mark, markWord::unlocked_value); 4369 andi_(t, t, markWord::lock_mask_in_place); 4370 bne(CCR0, slow); 4371 4372 // Try to lock. Transition lock bits 0b01 => 0b00 4373 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq); 4374 4375 bind(push); 4376 // After successful lock, push object on lock-stack 4377 stdx(obj, R16_thread, top); 4378 addi(top, top, oopSize); 4379 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4380 } 4381 4382 // Implements lightweight-unlocking. 4383 // 4384 // - obj: the object to be unlocked 4385 // - t1: temporary register 4386 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) { 4387 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking"); 4388 assert_different_registers(obj, t1); 4389 4390 #ifdef ASSERT 4391 { 4392 // The following checks rely on the fact that LockStack is only ever modified by 4393 // its owning thread, even if the lock got inflated concurrently; removal of LockStack 4394 // entries after inflation will be delayed in that case. 4395 4396 // Check for lock-stack underflow.
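    // (The lock stack is a small per-thread array of oop entries bounded by
    // LockStack::start_offset() and LockStack::end_offset(); JavaThread::lock_stack_top_offset()
    // locates the byte offset of the next free slot.)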
4397 Label stack_ok; 4398 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4399 cmplwi(CCR0, t1, LockStack::start_offset()); 4400 bge(CCR0, stack_ok); 4401 stop("Lock-stack underflow"); 4402 bind(stack_ok); 4403 } 4404 #endif 4405 4406 Label unlocked, push_and_slow; 4407 const Register top = t1; 4408 const Register mark = R0; 4409 Register t = R0; 4410 4411 // Check if obj is top of lock-stack. 4412 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4413 subi(top, top, oopSize); 4414 ldx(t, R16_thread, top); 4415 cmpd(CCR0, obj, t); 4416 bne(CCR0, slow); 4417 4418 // Pop lock-stack. 4419 DEBUG_ONLY(li(t, 0);) 4420 DEBUG_ONLY(stdx(t, R16_thread, top);) 4421 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4422 4423 // The underflow check is elided. The recursive check will always fail 4424 // when the lock stack is empty because of the _bad_oop_sentinel field. 4425 4426 // Check if recursive. 4427 subi(t, top, oopSize); 4428 ldx(t, R16_thread, t); 4429 cmpd(CCR0, obj, t); 4430 beq(CCR0, unlocked); 4431 4432 // Use top as tmp 4433 t = top; 4434 4435 // Not recursive. Check header for monitor (0b10). 4436 ld(mark, oopDesc::mark_offset_in_bytes(), obj); 4437 andi_(t, mark, markWord::monitor_value); 4438 bne(CCR0, push_and_slow); 4439 4440 #ifdef ASSERT 4441 // Check header not unlocked (0b01). 4442 Label not_unlocked; 4443 andi_(t, mark, markWord::unlocked_value); 4444 beq(CCR0, not_unlocked); 4445 stop("lightweight_unlock already unlocked"); 4446 bind(not_unlocked); 4447 #endif 4448 4449 // Try to unlock. Transition lock bits 0b00 => 0b01 4450 atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel); 4451 b(unlocked); 4452 4453 bind(push_and_slow); 4454 4455 // Restore lock-stack and handle the unlock in runtime. 4456 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4457 DEBUG_ONLY(stdx(obj, R16_thread, top);) 4458 addi(top, top, oopSize); 4459 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread); 4460 b(slow); 4461 4462 bind(unlocked); 4463 }