1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2012, 2026 SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.inline.hpp"
27 #include "code/compiledIC.hpp"
28 #include "compiler/disassembler.hpp"
29 #include "gc/shared/collectedHeap.inline.hpp"
30 #include "gc/shared/barrierSet.hpp"
31 #include "gc/shared/barrierSetAssembler.hpp"
32 #include "interpreter/interpreter.hpp"
33 #include "interpreter/interpreterRuntime.hpp"
34 #include "memory/resourceArea.hpp"
35 #include "nativeInst_ppc.hpp"
36 #include "oops/compressedKlass.inline.hpp"
37 #include "oops/compressedOops.inline.hpp"
38 #include "oops/klass.inline.hpp"
39 #include "oops/methodData.hpp"
40 #include "prims/methodHandles.hpp"
41 #include "register_ppc.hpp"
42 #include "runtime/icache.hpp"
43 #include "runtime/interfaceSupport.inline.hpp"
44 #include "runtime/objectMonitor.hpp"
45 #include "runtime/objectMonitorTable.hpp"
46 #include "runtime/os.hpp"
47 #include "runtime/safepoint.hpp"
48 #include "runtime/safepointMechanism.hpp"
49 #include "runtime/sharedRuntime.hpp"
50 #include "runtime/stubRoutines.hpp"
51 #include "runtime/vm_version.hpp"
52 #include "utilities/macros.hpp"
53 #include "utilities/powerOfTwo.hpp"
54
// BLOCK_COMMENT(str): expands to block_comment(str) in non-PRODUCT builds
// and to nothing in PRODUCT builds.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
// BIND(label): binds the label, then emits a block comment made of the
// label's name followed by ':'. Note that this expands to two statements,
// so it must not be used as the sole body of an unbraced if/else/loop.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
61
62 #ifdef ASSERT
63 // On RISC, there's no benefit to verifying instruction boundaries.
64 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
65 #endif
66
67 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
68 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
69 if (Assembler::is_simm(si31, 16)) {
70 ld(d, si31, a);
71 if (emit_filler_nop) nop();
72 } else {
73 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
74 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
75 addis(d, a, hi);
76 ld(d, lo, d);
77 }
78 }
79
80 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
81 assert_different_registers(d, a);
82 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
83 }
84
85 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
86 size_t size_in_bytes, bool is_signed) {
87 switch (size_in_bytes) {
88 case 8: ld(dst, offs, base); break;
89 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
90 case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
91 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
92 default: ShouldNotReachHere();
93 }
94 }
95
96 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
97 size_t size_in_bytes) {
98 switch (size_in_bytes) {
99 case 8: std(dst, offs, base); break;
100 case 4: stw(dst, offs, base); break;
101 case 2: sth(dst, offs, base); break;
102 case 1: stb(dst, offs, base); break;
103 default: ShouldNotReachHere();
104 }
105 }
106
107 void MacroAssembler::align(int modulus, int max, int rem) {
108 int padding = (rem + modulus - (offset() % modulus)) % modulus;
109 if (padding > max) return;
110 for (int c = (padding >> 2); c > 0; --c) { nop(); }
111 }
112
113 void MacroAssembler::align_prefix() {
114 if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
115 }
116
117 // Issue instructions that calculate given TOC from global TOC.
118 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
119 bool add_relocation, bool emit_dummy_addr,
120 bool add_addr_to_reloc) {
121 int offset = -1;
122 if (emit_dummy_addr) {
123 offset = -128; // dummy address
124 } else if (addr != (address)(intptr_t)-1) {
125 offset = MacroAssembler::offset_to_global_toc(addr);
126 }
127
128 if (hi16) {
129 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
130 }
131 if (lo16) {
132 if (add_relocation) {
133 // Relocate at the addi to avoid confusion with a load from the method's TOC.
134 RelocationHolder rh = add_addr_to_reloc ?
135 internal_word_Relocation::spec(addr) :
136 internal_word_Relocation::spec_for_immediate();
137 relocate(rh);
138 }
139 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
140 }
141 }
142
143 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
144 const int offset = MacroAssembler::offset_to_global_toc(addr);
145
146 const address inst2_addr = a;
147 const int inst2 = *(int *)inst2_addr;
148
149 // The relocation points to the second instruction, the addi,
150 // and the addi reads and writes the same register dst.
151 const int dst = inv_rt_field(inst2);
152 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
153
154 // Now, find the preceding addis which writes to dst.
155 int inst1 = 0;
156 address inst1_addr = inst2_addr - BytesPerInstWord;
157 while (inst1_addr >= bound) {
158 inst1 = *(int *) inst1_addr;
159 if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
160 // Stop, found the addis which writes dst.
161 break;
162 }
163 inst1_addr -= BytesPerInstWord;
164 }
165
166 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
167 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
168 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
169 return inst1_addr;
170 }
171
172 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
173 const address inst2_addr = a;
174 const int inst2 = *(int *)inst2_addr;
175
176 // The relocation points to the second instruction, the addi,
177 // and the addi reads and writes the same register dst.
178 const int dst = inv_rt_field(inst2);
179 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
180
181 // Now, find the preceding addis which writes to dst.
182 int inst1 = 0;
183 address inst1_addr = inst2_addr - BytesPerInstWord;
184 while (inst1_addr >= bound) {
185 inst1 = *(int *) inst1_addr;
186 if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
187 // stop, found the addis which writes dst
188 break;
189 }
190 inst1_addr -= BytesPerInstWord;
191 }
192
193 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
194
195 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
196 // -1 is a special case
197 if (offset == -1) {
198 return (address)(intptr_t)-1;
199 } else {
200 return global_toc() + offset;
201 }
202 }
203
204 #ifdef _LP64
205 // Patch compressed oops or klass constants.
206 // Assembler sequence is
207 // 1) compressed oops:
208 // lis rx = const.hi
209 // ori rx = rx | const.lo
210 // 2) compressed klass:
211 // lis rx = const.hi
212 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
213 // ori rx = rx | const.lo
214 // Clrldi will be passed by.
215 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
216 assert(UseCompressedOops, "Should only patch compressed oops");
217
218 const address inst2_addr = a;
219 const int inst2 = *(int *)inst2_addr;
220
221 // The relocation points to the second instruction, the ori,
222 // and the ori reads and writes the same register dst.
223 const int dst = inv_rta_field(inst2);
224 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
225 // Now, find the preceding addis which writes to dst.
226 int inst1 = 0;
227 address inst1_addr = inst2_addr - BytesPerInstWord;
228 bool inst1_found = false;
229 while (inst1_addr >= bound) {
230 inst1 = *(int *)inst1_addr;
231 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
232 inst1_addr -= BytesPerInstWord;
233 }
234 assert(inst1_found, "inst is not lis");
235
236 uint32_t data_value = CompressedOops::narrow_oop_value(data);
237 int xc = (data_value >> 16) & 0xffff;
238 int xd = (data_value >> 0) & 0xffff;
239
240 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
241 set_imm((int *)inst2_addr, (xd)); // unsigned int
242 return inst1_addr;
243 }
244
245 // Get compressed oop constant.
246 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
247 assert(UseCompressedOops, "Should only patch compressed oops");
248
249 const address inst2_addr = a;
250 const int inst2 = *(int *)inst2_addr;
251
252 // The relocation points to the second instruction, the ori,
253 // and the ori reads and writes the same register dst.
254 const int dst = inv_rta_field(inst2);
255 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
256 // Now, find the preceding lis which writes to dst.
257 int inst1 = 0;
258 address inst1_addr = inst2_addr - BytesPerInstWord;
259 bool inst1_found = false;
260
261 while (inst1_addr >= bound) {
262 inst1 = *(int *) inst1_addr;
263 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
264 inst1_addr -= BytesPerInstWord;
265 }
266 assert(inst1_found, "inst is not lis");
267
268 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
269 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
270
271 return CompressedOops::narrow_oop_cast(xl | xh);
272 }
273 #endif // _LP64
274
275 // Returns true if successful.
276 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
277 Register toc, bool fixed_size) {
278 int toc_offset = 0;
279 // Use RelocationHolder::none for the constant pool entry, otherwise
280 // we will end up with a failing NativeCall::verify(x) where x is
281 // the address of the constant pool entry.
282 // FIXME: We should insert relocation information for oops at the constant
283 // pool entries instead of inserting it at the loads; patching of a constant
284 // pool entry should be less expensive.
285 address const_address = address_constant((address)a.value(), RelocationHolder::none);
286 if (const_address == nullptr) { return false; } // allocation failure
287 // Relocate at the pc of the load.
288 relocate(a.rspec());
289 toc_offset = (int)(const_address - code()->consts()->start());
290 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
291 return true;
292 }
293
294 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
295 const address inst1_addr = a;
296 const int inst1 = *(int *)inst1_addr;
297
298 // The relocation points to the ld or the addis.
299 return (is_ld(inst1)) ||
300 (is_addis(inst1) && inv_ra_field(inst1) != 0);
301 }
302
303 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
304 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
305
306 const address inst1_addr = a;
307 const int inst1 = *(int *)inst1_addr;
308
309 if (is_ld(inst1)) {
310 return inv_d1_field(inst1);
311 } else if (is_addis(inst1)) {
312 const int dst = inv_rt_field(inst1);
313
314 // Now, find the succeeding ld which reads and writes to dst.
315 address inst2_addr = inst1_addr + BytesPerInstWord;
316 int inst2 = 0;
317 while (true) {
318 inst2 = *(int *) inst2_addr;
319 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
320 // Stop, found the ld which reads and writes dst.
321 break;
322 }
323 inst2_addr += BytesPerInstWord;
324 }
325 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
326 }
327 ShouldNotReachHere();
328 return 0;
329 }
330
331 // Get the constant from a `load_const' sequence.
332 long MacroAssembler::get_const(address a) {
333 assert(is_load_const_at(a), "not a load of a constant");
334 const int *p = (const int*) a;
335 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
336 if (is_ori(*(p+1))) {
337 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
338 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
339 x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
340 } else if (is_lis(*(p+1))) {
341 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
342 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
343 x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
344 } else {
345 ShouldNotReachHere();
346 return (long) 0;
347 }
348 return (long) x;
349 }
350
351 // Patch the 64 bit constant of a `load_const' sequence. This is a low
352 // level procedure. It neither flushes the instruction cache nor is it
353 // mt safe.
354 void MacroAssembler::patch_const(address a, long x) {
355 assert(is_load_const_at(a), "not a load of a constant");
356 int *p = (int*) a;
357 if (is_ori(*(p+1))) {
358 set_imm(0 + p, (x >> 48) & 0xffff);
359 set_imm(1 + p, (x >> 32) & 0xffff);
360 set_imm(3 + p, (x >> 16) & 0xffff);
361 set_imm(4 + p, x & 0xffff);
362 } else if (is_lis(*(p+1))) {
363 set_imm(0 + p, (x >> 48) & 0xffff);
364 set_imm(2 + p, (x >> 32) & 0xffff);
365 set_imm(1 + p, (x >> 16) & 0xffff);
366 set_imm(3 + p, x & 0xffff);
367 } else {
368 ShouldNotReachHere();
369 }
370 }
371
372 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
373 assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
374 int index = oop_recorder()->allocate_metadata_index(obj);
375 RelocationHolder rspec = metadata_Relocation::spec(index);
376 return AddressLiteral((address)obj, rspec);
377 }
378
379 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
380 assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
381 int index = oop_recorder()->find_index(obj);
382 RelocationHolder rspec = metadata_Relocation::spec(index);
383 return AddressLiteral((address)obj, rspec);
384 }
385
386 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
387 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
388 int oop_index = oop_recorder()->allocate_oop_index(obj);
389 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
390 }
391
392 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
393 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
394 int oop_index = oop_recorder()->find_index(obj);
395 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
396 }
397
398 #ifndef PRODUCT
399 void MacroAssembler::pd_print_patched_instruction(address branch) {
400 Unimplemented(); // TODO: PPC port
401 }
402 #endif // ndef PRODUCT
403
404 // Conditional far branch for destinations encodable in 24+2 bits.
405 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
406
407 // If requested by flag optimize, relocate the bc_far as a
408 // runtime_call and prepare for optimizing it when the code gets
409 // relocated.
410 if (optimize == bc_far_optimize_on_relocate) {
411 relocate(relocInfo::runtime_call_type);
412 }
413
414 // variant 2:
415 //
416 // b!cxx SKIP
417 // bxx DEST
418 // SKIP:
419 //
420
421 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
422 opposite_bcond(inv_boint_bcond(boint)));
423
424 // We emit two branches.
425 // First, a conditional branch which jumps around the far branch.
426 const address not_taken_pc = pc() + 2 * BytesPerInstWord;
427 const address bc_pc = pc();
428 bc(opposite_boint, biint, not_taken_pc);
429
430 const int bc_instr = *(int*)bc_pc;
431 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
432 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
433 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
434 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
435 "postcondition");
436 assert(biint == inv_bi_field(bc_instr), "postcondition");
437
438 // Second, an unconditional far branch which jumps to dest.
439 // Note: target(dest) remembers the current pc (see CodeSection::target)
440 // and returns the current pc if the label is not bound yet; when
441 // the label gets bound, the unconditional far branch will be patched.
442 const address target_pc = target(dest);
443 const address b_pc = pc();
444 b(target_pc);
445
446 assert(not_taken_pc == pc(), "postcondition");
447 assert(dest.is_bound() || target_pc == b_pc, "postcondition");
448 }
449
450 // 1 or 2 instructions
451 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
452 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
453 bc(boint, biint, dest);
454 } else {
455 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
456 }
457 }
458
459 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
460 return is_bc_far_variant1_at(instruction_addr) ||
461 is_bc_far_variant2_at(instruction_addr) ||
462 is_bc_far_variant3_at(instruction_addr);
463 }
464
465 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
466 if (is_bc_far_variant1_at(instruction_addr)) {
467 const address instruction_1_addr = instruction_addr;
468 const int instruction_1 = *(int*)instruction_1_addr;
469 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
470 } else if (is_bc_far_variant2_at(instruction_addr)) {
471 const address instruction_2_addr = instruction_addr + 4;
472 return bxx_destination(instruction_2_addr);
473 } else if (is_bc_far_variant3_at(instruction_addr)) {
474 return instruction_addr + 8;
475 }
476 // variant 4 ???
477 ShouldNotReachHere();
478 return nullptr;
479 }
// Re-target the two-instruction bc_far sequence at instruction_addr to dest,
// rewriting it in place into the cheapest shape that reaches dest:
//   - variant 3 (two nops) if a variant 2 sequence now branches to the
//     instruction right behind it,
//   - variant 1 (bcxx DEST; nop) if dest is within conditional-branch range,
//   - variant 2 (b!cxx SKIP; bxx DEST) otherwise.
// A sequence that is already variant 3 is left untouched. After rewriting,
// the instruction cache is flushed for the two patched words.
void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {

  if (is_bc_far_variant3_at(instruction_addr)) {
    // variant 3, far cond branch to the next instruction, already patched to nops:
    //
    //    nop
    //    nop
    //  SKIP/DEST:
    //
    return;
  }

  // first, extract boint and biint from the current branch
  int boint = 0;
  int biint = 0;

  // Assemble the replacement directly over the existing two instructions.
  ResourceMark rm;
  const int code_size = 2 * BytesPerInstWord;
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
    // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
    masm.nop();
    masm.nop();
  } else {
    if (is_bc_far_variant1_at(instruction_addr)) {
      // variant 1, the 1st instruction contains the destination address:
      //
      //    bcxx  DEST
      //    nop
      //
      const int instruction_1 = *(int*)(instruction_addr);
      boint = inv_bo_field(instruction_1);
      biint = inv_bi_field(instruction_1);
    } else if (is_bc_far_variant2_at(instruction_addr)) {
      // variant 2, the 2nd instruction contains the destination address:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      // The first instruction holds the inverted condition, so invert it
      // again to recover the condition of the original far branch.
      const int instruction_1 = *(int*)(instruction_addr);
      boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
                                 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
      biint = inv_bi_field(instruction_1);
    } else {
      // variant 4???
      ShouldNotReachHere();
    }

    // second, set the new branch destination and optimize the code
    if (dest != instruction_addr + 4 && // the bc_far is still unbound!
        masm.is_within_range_of_bcxx(dest, instruction_addr)) {
      // variant 1:
      //
      //    bcxx  DEST
      //    nop
      //
      masm.bc(boint, biint, dest);
      masm.nop();
    } else {
      // variant 2:
      //
      //    b!cxx SKIP
      //    bxx   DEST
      //  SKIP:
      //
      const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
                                                    opposite_bcond(inv_boint_bcond(boint)));
      const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
      masm.bc(opposite_boint, biint, not_taken_pc);
      masm.b(dest);
    }
  }
  // Make the rewritten instructions visible to instruction fetch.
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
556
// Emit a NOT mt-safe patchable 64 bit absolute call/jump.
//
// The emitted sequence always occupies seven instruction words so that it
// can later be rewritten in place (see set_dest_of_bxx64_patchable_at):
//   variant 2 (pc-relative, used when ReoptimizeCallSequences is set and
//              dest is within range of a b/bl):
//       link:  6 x nop ; bl dest
//       !link: b dest  ; 6 x nop
//   variant 1b (destination computed relative to the global TOC):
//       mr R0,R11 ; addis/addi R11 ; mtctr R11 ; mr R11,R0 ; nop ; bctr[l]
//
//   dest  branch target
//   rt    relocation type to attach at the start of the sequence
//         (relocInfo::none for no relocation)
//   link  true to emit a call (bl/bctrl), false for a jump (b/bctr)
void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
  // get current pc
  uint64_t start_pc = (uint64_t) pc();

  const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
  const address pc_of_b  = (address) (start_pc + (0*BytesPerInstWord)); // b is first

  // relocate here
  if (rt != relocInfo::none) {
    relocate(rt);
  }

  // Range is checked from the position the branch instruction will actually
  // occupy: last word for a call, first word for a jump.
  if ( ReoptimizeCallSequences &&
       (( link && is_within_range_of_b(dest, pc_of_bl)) ||
        (!link && is_within_range_of_b(dest, pc_of_b)))) {
    // variant 2:
    // Emit an optimized, pc-relative call/jump.

    if (link) {
      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();

      // do the call
      assert(pc() == pc_of_bl, "just checking");
      bl(dest, relocInfo::none);
    } else {
      // do the jump
      assert(pc() == pc_of_b, "just checking");
      b(dest, relocInfo::none);

      // some padding
      nop();
      nop();
      nop();
      nop();
      nop();
      nop();
    }

    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
           "can't identify emitted call");
  } else {
    // variant 1:
    mr(R0, R11);  // spill R11 -> R0.

    // Load the destination address into CTR,
    // calculate destination relative to global toc.
    calculate_address_from_global_toc(R11, dest, true, true, false);

    mtctr(R11);
    mr(R11, R0);  // spill R11 <- R0.
    nop();        // filler: brings the sequence to seven words

    // do the call/jump
    if (link) {
      bctrl();
    } else{
      bctr();
    }
    // Assert that we can identify the emitted call/jump.
    assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
           "can't identify emitted call");
  }

  // Assert that we can identify the emitted call/jump.
  assert(is_bxx64_patchable_at((address)start_pc, link),
         "can't identify emitted call");
  assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
         "wrong encoding of dest address");
}
634
635 // Identify a bxx64_patchable instruction.
636 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
637 return is_bxx64_patchable_variant1b_at(instruction_addr, link)
638 //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
639 || is_bxx64_patchable_variant2_at(instruction_addr, link);
640 }
641
642 // Does the call64_patchable instruction use a pc-relative encoding of
643 // the call destination?
644 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
645 // variant 2 is pc-relative
646 return is_bxx64_patchable_variant2_at(instruction_addr, link);
647 }
648
649 // Identify variant 1.
650 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
651 unsigned int* instr = (unsigned int*) instruction_addr;
652 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
653 && is_mtctr(instr[5]) // mtctr
654 && is_load_const_at(instruction_addr);
655 }
656
657 // Identify variant 1b: load destination relative to global toc.
658 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
659 unsigned int* instr = (unsigned int*) instruction_addr;
660 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
661 && is_mtctr(instr[3]) // mtctr
662 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
663 }
664
665 // Identify variant 2.
666 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
667 unsigned int* instr = (unsigned int*) instruction_addr;
668 if (link) {
669 return is_bl (instr[6]) // bl dest is last
670 && is_nop(instr[0]) // nop
671 && is_nop(instr[1]) // nop
672 && is_nop(instr[2]) // nop
673 && is_nop(instr[3]) // nop
674 && is_nop(instr[4]) // nop
675 && is_nop(instr[5]); // nop
676 } else {
677 return is_b (instr[0]) // b dest is first
678 && is_nop(instr[1]) // nop
679 && is_nop(instr[2]) // nop
680 && is_nop(instr[3]) // nop
681 && is_nop(instr[4]) // nop
682 && is_nop(instr[5]) // nop
683 && is_nop(instr[6]); // nop
684 }
685 }
686
687 // Set dest address of a bxx64_patchable instruction.
688 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
689 ResourceMark rm;
690 int code_size = MacroAssembler::bxx64_patchable_size;
691 CodeBuffer buf(instruction_addr, code_size);
692 MacroAssembler masm(&buf);
693 masm.bxx64_patchable(dest, relocInfo::none, link);
694 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
695 }
696
697 // Get dest address of a bxx64_patchable instruction.
698 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
699 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
700 return (address) (unsigned long) get_const(instruction_addr);
701 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
702 unsigned int* instr = (unsigned int*) instruction_addr;
703 if (link) {
704 const int instr_idx = 6; // bl is last
705 int branchoffset = branch_destination(instr[instr_idx], 0);
706 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
707 } else {
708 const int instr_idx = 0; // b is first
709 int branchoffset = branch_destination(instr[instr_idx], 0);
710 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
711 }
712 // Load dest relative to global toc.
713 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
714 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
715 instruction_addr);
716 } else {
717 ShouldNotReachHere();
718 return nullptr;
719 }
720 }
721
722 #ifdef ASSERT
723 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
724 const int magic_number = 0x42;
725
726 // Preserve stack pointer register (R1_SP) and system thread id register (R13);
727 // although they're technically volatile
728 for (int i = 2; i < 13; i++) {
729 Register reg = as_Register(i);
730 if (reg == excluded_register) {
731 continue;
732 }
733
734 li(reg, magic_number);
735 }
736 }
737
738 void MacroAssembler::clobber_nonvolatile_registers() {
739 BLOCK_COMMENT("clobber nonvolatile registers {");
740 static const Register regs[] = {
741 R14,
742 R15,
743 // don't zap R16_thread
744 R17,
745 R18,
746 R19,
747 R20,
748 R21,
749 R22,
750 R23,
751 R24,
752 R25,
753 R26,
754 R27,
755 R28,
756 // don't zap R29_TOC
757 R30,
758 R31
759 };
760 Register bad = regs[0];
761 load_const_optimized(bad, 0xbad0101babe00000);
762 for (int i = (sizeof(regs) / sizeof(Register)) - 1; i >= 0; i--) {
763 addi(regs[i], bad, regs[i]->encoding());
764 }
765 BLOCK_COMMENT("} clobber nonvolatile registers");
766 }
767 #endif // ASSERT
768
769 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
770 const int magic_number = 0x43;
771
772 li(tmp, magic_number);
773 for (int m = 0; m <= 7; m++) {
774 std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
775 }
776 }
777
778 void MacroAssembler::save_nonvolatile_registers(Register dst, int offset, bool include_fp_regs, bool include_vector_regs) {
779 BLOCK_COMMENT("save_nonvolatile_registers {");
780
781 for (int i = 14; i < 32; i++) {
782 std(as_Register(i), offset, dst);
783 offset += 8;
784 }
785
786 if (include_fp_regs) {
787 for (int i = 14; i < 32; i++) {
788 stfd(as_FloatRegister(i), offset, dst);
789 offset += 8;
790 }
791 }
792
793 if (include_vector_regs) {
794 assert(is_aligned(offset, StackAlignmentInBytes), "should be");
795 if (PowerArchitecturePPC64 >= 10) {
796 for (int i = 20; i < 32; i += 2) {
797 stxvp(as_VectorRegister(i)->to_vsr(), offset, dst);
798 offset += 32;
799 }
800 } else {
801 for (int i = 20; i < 32; i++) {
802 stxv(as_VectorRegister(i)->to_vsr(), offset, dst);
803 offset += 16;
804 }
805 }
806 }
807
808 BLOCK_COMMENT("} save_nonvolatile_registers ");
809 }
810
811 void MacroAssembler::restore_nonvolatile_registers(Register src, int offset, bool include_fp_regs, bool include_vector_regs) {
812 BLOCK_COMMENT("restore_nonvolatile_registers {");
813
814 for (int i = 14; i < 32; i++) {
815 ld(as_Register(i), offset, src);
816 offset += 8;
817 }
818
819 if (include_fp_regs) {
820 for (int i = 14; i < 32; i++) {
821 lfd(as_FloatRegister(i), offset, src);
822 offset += 8;
823 }
824 }
825
826 if (include_vector_regs) {
827 assert(is_aligned(offset, StackAlignmentInBytes), "should be");
828 if (PowerArchitecturePPC64 >= 10) {
829 for (int i = 20; i < 32; i += 2) {
830 lxvp(as_VectorRegister(i)->to_vsr(), offset, src);
831 offset += 32;
832 }
833 } else {
834 for (int i = 20; i < 32; i++) {
835 lxv(as_VectorRegister(i)->to_vsr(), offset, src);
836 offset += 16;
837 }
838 }
839 }
840
841 BLOCK_COMMENT("} restore_nonvolatile_registers");
842 }
843
844 // For verify_oops.
845 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
846 std(R2, offset, dst); offset += 8;
847 if (include_R3_RET_reg) {
848 std(R3, offset, dst); offset += 8;
849 }
850 std(R4, offset, dst); offset += 8;
851 std(R5, offset, dst); offset += 8;
852 std(R6, offset, dst); offset += 8;
853 std(R7, offset, dst); offset += 8;
854 std(R8, offset, dst); offset += 8;
855 std(R9, offset, dst); offset += 8;
856 std(R10, offset, dst); offset += 8;
857 std(R11, offset, dst); offset += 8;
858 std(R12, offset, dst); offset += 8;
859
860 if (include_fp_regs) {
861 stfd(F0, offset, dst); offset += 8;
862 stfd(F1, offset, dst); offset += 8;
863 stfd(F2, offset, dst); offset += 8;
864 stfd(F3, offset, dst); offset += 8;
865 stfd(F4, offset, dst); offset += 8;
866 stfd(F5, offset, dst); offset += 8;
867 stfd(F6, offset, dst); offset += 8;
868 stfd(F7, offset, dst); offset += 8;
869 stfd(F8, offset, dst); offset += 8;
870 stfd(F9, offset, dst); offset += 8;
871 stfd(F10, offset, dst); offset += 8;
872 stfd(F11, offset, dst); offset += 8;
873 stfd(F12, offset, dst); offset += 8;
874 stfd(F13, offset, dst);
875 }
876 }
877
878 // For verify_oops.
879 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
880 ld(R2, offset, src); offset += 8;
881 if (include_R3_RET_reg) {
882 ld(R3, offset, src); offset += 8;
883 }
884 ld(R4, offset, src); offset += 8;
885 ld(R5, offset, src); offset += 8;
886 ld(R6, offset, src); offset += 8;
887 ld(R7, offset, src); offset += 8;
888 ld(R8, offset, src); offset += 8;
889 ld(R9, offset, src); offset += 8;
890 ld(R10, offset, src); offset += 8;
891 ld(R11, offset, src); offset += 8;
892 ld(R12, offset, src); offset += 8;
893
894 if (include_fp_regs) {
895 lfd(F0, offset, src); offset += 8;
896 lfd(F1, offset, src); offset += 8;
897 lfd(F2, offset, src); offset += 8;
898 lfd(F3, offset, src); offset += 8;
899 lfd(F4, offset, src); offset += 8;
900 lfd(F5, offset, src); offset += 8;
901 lfd(F6, offset, src); offset += 8;
902 lfd(F7, offset, src); offset += 8;
903 lfd(F8, offset, src); offset += 8;
904 lfd(F9, offset, src); offset += 8;
905 lfd(F10, offset, src); offset += 8;
906 lfd(F11, offset, src); offset += 8;
907 lfd(F12, offset, src); offset += 8;
908 lfd(F13, offset, src);
909 }
910 }
911
912 void MacroAssembler::save_LR(Register tmp) {
913 mflr(tmp);
914 std(tmp, _abi0(lr), R1_SP);
915 }
916
917 void MacroAssembler::restore_LR(Register tmp) {
918 assert(tmp != R1_SP, "must be distinct");
919 ld(tmp, _abi0(lr), R1_SP);
920 mtlr(tmp);
921 }
922
923 void MacroAssembler::save_LR_CR(Register tmp) {
924 mfcr(tmp);
925 std(tmp, _abi0(cr), R1_SP);
926 save_LR(tmp);
927 // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
928 }
929
930 void MacroAssembler::restore_LR_CR(Register tmp) {
931 restore_LR(tmp);
932 ld(tmp, _abi0(cr), R1_SP);
933 mtcr(tmp);
934 }
935
936 address MacroAssembler::get_PC_trash_LR(Register result) {
937 Label L;
938 bl(L);
939 bind(L);
940 address lr_pc = pc();
941 mflr(result);
942 return lr_pc;
943 }
944
945 void MacroAssembler::resize_frame(Register offset, Register tmp) {
946 #ifdef ASSERT
947 assert_different_registers(offset, tmp, R1_SP);
948 andi_(tmp, offset, frame::alignment_in_bytes-1);
949 asm_assert_eq("resize_frame: unaligned");
950 #endif
951
952 // tmp <- *(SP)
953 ld(tmp, _abi0(callers_sp), R1_SP);
954 // addr <- SP + offset;
955 // *(addr) <- tmp;
956 // SP <- addr
957 stdux(tmp, R1_SP, offset);
958 }
959
960 void MacroAssembler::resize_frame(int offset, Register tmp) {
961 assert(is_simm(offset, 16), "too big an offset");
962 assert_different_registers(tmp, R1_SP);
963 assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
964 // tmp <- *(SP)
965 ld(tmp, _abi0(callers_sp), R1_SP);
966 // addr <- SP + offset;
967 // *(addr) <- tmp;
968 // SP <- addr
969 stdu(tmp, offset, R1_SP);
970 }
971
972 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
973 // (addr == tmp1) || (addr == tmp2) is allowed here!
974 assert(tmp1 != tmp2, "must be distinct");
975
976 // compute offset w.r.t. current stack pointer
977 // tmp_1 <- addr - SP (!)
978 subf(tmp1, R1_SP, addr);
979
980 // atomically update SP keeping back link.
981 resize_frame(tmp1/* offset */, tmp2/* tmp */);
982 }
983
984 void MacroAssembler::push_frame(Register bytes, Register tmp) {
985 #ifdef ASSERT
986 assert(bytes != R0, "r0 not allowed here");
987 andi_(R0, bytes, frame::alignment_in_bytes-1);
988 asm_assert_eq("push_frame(Reg, Reg): unaligned");
989 #endif
990 neg(tmp, bytes);
991 stdux(R1_SP, R1_SP, tmp);
992 }
993
994 // Push a frame of size `bytes'.
995 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
996 long offset = align_addr(bytes, frame::alignment_in_bytes);
997 if (is_simm(-offset, 16)) {
998 stdu(R1_SP, -offset, R1_SP);
999 } else {
1000 load_const_optimized(tmp, -offset);
1001 stdux(R1_SP, R1_SP, tmp);
1002 }
1003 }
1004
1005 // Push a frame of size `bytes' plus native_abi_reg_args on top.
1006 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
1007 push_frame(bytes + frame::native_abi_reg_args_size, tmp);
1008 }
1009
1010 // Pop current C frame.
1011 void MacroAssembler::pop_frame() {
1012 ld(R1_SP, _abi0(callers_sp), R1_SP);
1013 }
1014
1015 #if defined(ABI_ELFv2)
1016 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
1017 // TODO(asmundak): make sure the caller uses R12 as function descriptor
1018 // most of the times.
1019 if (R12 != r_function_entry) {
1020 mr(R12, r_function_entry);
1021 }
1022 mtctr(R12);
1023 // Do a call or a branch.
1024 if (and_link) {
1025 bctrl();
1026 } else {
1027 bctr();
1028 }
1029 _last_calls_return_pc = pc();
1030
1031 return _last_calls_return_pc;
1032 }
1033
1034 // Call a C function via a function descriptor and use full C
1035 // calling conventions. Updates and returns _last_calls_return_pc.
1036 address MacroAssembler::call_c(Register r_function_entry) {
1037 return branch_to(r_function_entry, /*and_link=*/true);
1038 }
1039
1040 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1041 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1042 return branch_to(r_function_entry, /*and_link=*/false);
1043 }
1044
1045 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1046 load_const(R12, function_entry, R0);
1047 return branch_to(R12, /*and_link=*/true);
1048 }
1049
1050 #else
1051 // Generic version of a call to C function via a function descriptor
1052 // with variable support for C calling conventions (TOC, ENV, etc.).
1053 // Updates and returns _last_calls_return_pc.
1054 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1055 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1056 // we emit standard ptrgl glue code here
1057 assert((function_descriptor != R0), "function_descriptor cannot be R0");
1058
1059 // retrieve necessary entries from the function descriptor
1060 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1061 mtctr(R0);
1062
1063 if (load_toc_of_callee) {
1064 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1065 }
1066 if (load_env_of_callee) {
1067 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1068 } else if (load_toc_of_callee) {
1069 li(R11, 0);
1070 }
1071
1072 // do a call or a branch
1073 if (and_link) {
1074 bctrl();
1075 } else {
1076 bctr();
1077 }
1078 _last_calls_return_pc = pc();
1079
1080 return _last_calls_return_pc;
1081 }
1082
1083 // Call a C function via a function descriptor and use full C calling
1084 // conventions.
1085 // We don't use the TOC in generated code, so there is no need to save
1086 // and restore its value.
1087 address MacroAssembler::call_c(Register fd) {
1088 return branch_to(fd, /*and_link=*/true,
1089 /*save toc=*/false,
1090 /*restore toc=*/false,
1091 /*load toc=*/true,
1092 /*load env=*/true);
1093 }
1094
1095 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1096 return branch_to(fd, /*and_link=*/false,
1097 /*save toc=*/false,
1098 /*restore toc=*/false,
1099 /*load toc=*/true,
1100 /*load env=*/true);
1101 }
1102
1103 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1104 if (rt != relocInfo::none) {
1105 // this call needs to be relocatable
1106 if (!ReoptimizeCallSequences
1107 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1108 || fd == nullptr // support code-size estimation
1109 || !fd->is_friend_function()
1110 || fd->entry() == nullptr) {
1111 // it's not a friend function as defined by class FunctionDescriptor,
1112 // so do a full call-c here.
1113 load_const(R11, (address)fd, R0);
1114
1115 bool has_env = (fd != nullptr && fd->env() != nullptr);
1116 return branch_to(R11, /*and_link=*/true,
1117 /*save toc=*/false,
1118 /*restore toc=*/false,
1119 /*load toc=*/true,
1120 /*load env=*/has_env);
1121 } else {
1122 // It's a friend function. Load the entry point and don't care about
1123 // toc and env. Use an optimizable call instruction, but ensure the
1124 // same code-size as in the case of a non-friend function.
1125 nop();
1126 nop();
1127 nop();
1128 bl64_patchable(fd->entry(), rt);
1129 _last_calls_return_pc = pc();
1130 return _last_calls_return_pc;
1131 }
1132 } else {
1133 // This call does not need to be relocatable, do more aggressive
1134 // optimizations.
1135 if (!ReoptimizeCallSequences
1136 || !fd->is_friend_function()) {
1137 // It's not a friend function as defined by class FunctionDescriptor,
1138 // so do a full call-c here.
1139 load_const(R11, (address)fd, R0);
1140 return branch_to(R11, /*and_link=*/true,
1141 /*save toc=*/false,
1142 /*restore toc=*/false,
1143 /*load toc=*/true,
1144 /*load env=*/true);
1145 } else {
1146 // it's a friend function, load the entry point and don't care about
1147 // toc and env.
1148 address dest = fd->entry();
1149 if (is_within_range_of_b(dest, pc())) {
1150 bl(dest);
1151 } else {
1152 bl64_patchable(dest, rt);
1153 }
1154 _last_calls_return_pc = pc();
1155 return _last_calls_return_pc;
1156 }
1157 }
1158 }
1159
1160 // Call a C function. All constants needed reside in TOC.
1161 //
1162 // Read the address to call from the TOC.
1163 // Read env from TOC, if fd specifies an env.
1164 // Read new TOC from TOC.
1165 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1166 relocInfo::relocType rt, Register toc) {
1167 if (!ReoptimizeCallSequences
1168 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1169 || !fd->is_friend_function()) {
1170 // It's not a friend function as defined by class FunctionDescriptor,
1171 // so do a full call-c here.
1172 assert(fd->entry() != nullptr, "function must be linked");
1173
1174 AddressLiteral fd_entry(fd->entry());
1175 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1176 mtctr(R11);
1177 if (fd->env() == nullptr) {
1178 li(R11, 0);
1179 nop();
1180 } else {
1181 AddressLiteral fd_env(fd->env());
1182 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1183 }
1184 AddressLiteral fd_toc(fd->toc());
1185 // Set R2_TOC (load from toc)
1186 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1187 bctrl();
1188 _last_calls_return_pc = pc();
1189 if (!success) { return nullptr; }
1190 } else {
1191 // It's a friend function, load the entry point and don't care about
1192 // toc and env. Use an optimizable call instruction, but ensure the
1193 // same code-size as in the case of a non-friend function.
1194 nop();
1195 bl64_patchable(fd->entry(), rt);
1196 _last_calls_return_pc = pc();
1197 }
1198 return _last_calls_return_pc;
1199 }
1200 #endif // ABI_ELFv2
1201
1202 bool MacroAssembler::ic_call(Register Rmethod_toc,
1203 address target,
1204 jint method_index,
1205 bool scratch_emit,
1206 bool fixed_size) {
1207 AddressLiteral target_al(target, virtual_call_Relocation::spec(pc(), method_index));
1208 DEBUG_ONLY(int ic_load_offset = offset());
1209
1210 // Load a clear inline cache.
1211 AddressLiteral empty_ic((address) Universe::non_oop_word());
1212 bool success = load_const_from_method_toc(R19_inline_cache_reg, empty_ic, Rmethod_toc, fixed_size);
1213 if (!success) return false;
1214
1215 assert(MacroAssembler::is_load_const_from_method_toc_at(addr_at(ic_load_offset)),
1216 "should be load from TOC");
1217
1218 address call_pc = trampoline_call(target_al, Rmethod_toc, scratch_emit);
1219 return call_pc != nullptr;
1220 }
1221
1222 address MacroAssembler::trampoline_call(AddressLiteral target,
1223 Register Rmethod_toc,
1224 bool scratch_emit) {
1225 // First, emit the trampoline stub
1226 if (!scratch_emit) {
1227 RelocationHolder rh = trampoline_stub_Relocation::spec(pc() /* of the bl below */);
1228
1229 // Put the target's entry point as a constant into the constant pool.
1230 const address target_toc_addr = address_constant((address)target.value());
1231 if (target_toc_addr == nullptr) return nullptr;
1232
1233 const int target_toc_offset = offset_to_method_toc(target_toc_addr);
1234 address stub = start_a_stub(64);
1235 if (stub == nullptr) return nullptr;
1236
1237 // Annotate the stub with a relocation that points to the owning call instruction.
1238 relocate(rh);
1239 DEBUG_ONLY(int stub_start_offset = offset());
1240
1241 // For java_to_interp stubs we use R11_scratch1 as scratch register
1242 // and in call trampoline stubs we use R12_scratch2. This way we
1243 // can distinguish them (see is_NativeCallTrampolineStub_at()).
1244 Register reg_scratch = R12_scratch2;
1245
1246 if (Rmethod_toc == noreg) {
1247 calculate_address_from_global_toc(reg_scratch, method_toc());
1248 Rmethod_toc = reg_scratch;
1249 }
1250
1251 ld_largeoffset_unchecked(reg_scratch, target_toc_offset, Rmethod_toc, false);
1252 mtctr(reg_scratch);
1253 bctr();
1254
1255 assert(target_toc_offset == NativeCallTrampolineStub_at(addr_at(stub_start_offset))->destination_toc_offset(),
1256 "encoded offset into the constant pool must match");
1257 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
1258 assert(is_NativeCallTrampolineStub_at(addr_at(stub_start_offset)), "doesn't look like a trampoline");
1259
1260 // End the stub.
1261 end_a_stub();
1262 }
1263
1264 // The call will be resolved / patched later.
1265 address call_pc = pc();
1266 relocate(target.rspec());
1267 bl(call_pc);
1268 return call_pc;
1269 }
1270
1271 void MacroAssembler::post_call_nop() {
1272 // Make inline again when loom is always enabled.
1273 if (!Continuations::enabled()) {
1274 return;
1275 }
1276 // We use CMPI/CMPLI instructions to encode post call nops.
1277 // Refer to NativePostCallNop for details.
1278 relocate(post_call_nop_Relocation::spec());
1279 InlineSkippedInstructionsCounter skipCounter(this);
1280 Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
1281 assert(is_post_call_nop(*(int*)(pc() - 4)), "post call not not found");
1282 }
1283
1284 int MacroAssembler::ic_check_size() {
1285 bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1286 use_fast_receiver_null_check = implicit_null_checks_available || TrapBasedNullChecks,
1287 use_trap_based_null_check = !implicit_null_checks_available && TrapBasedNullChecks;
1288
1289 int num_ins;
1290 if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1291 num_ins = 3;
1292 if (use_trap_based_null_check) num_ins += 1;
1293 } else {
1294 num_ins = 7;
1295 if (!implicit_null_checks_available) num_ins += 2;
1296 }
1297
1298 if (UseCompactObjectHeaders) num_ins++;
1299
1300 return num_ins * BytesPerInstWord;
1301 }
1302
1303 int MacroAssembler::ic_check(int end_alignment) {
1304 bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1305 use_fast_receiver_null_check = implicit_null_checks_available || TrapBasedNullChecks,
1306 use_trap_based_null_check = !implicit_null_checks_available && TrapBasedNullChecks;
1307
1308 Register receiver = R3_ARG1;
1309 Register data = R19_inline_cache_reg;
1310 Register tmp1 = R11_scratch1;
1311 Register tmp2 = R12_scratch2;
1312
1313 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
1314 // before the inline cache check, so we don't have to execute any nop instructions when dispatching
1315 // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
1316 // before the inline cache check here, and not after
1317 align(end_alignment, end_alignment, end_alignment - ic_check_size());
1318
1319 int uep_offset = offset();
1320
1321 if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1322 // Fast version which uses SIGTRAP
1323
1324 if (use_trap_based_null_check) {
1325 trap_null_check(receiver);
1326 }
1327 load_klass_no_decode(tmp1, receiver); // 2 instructions with UseCompactObjectHeaders
1328 ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1329 trap_ic_miss_check(tmp1, tmp2);
1330
1331 } else {
1332 // Slower version which doesn't use SIGTRAP
1333
1334 // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
1335 calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
1336 true, true, false); // 2 instructions
1337 mtctr(tmp1);
1338
1339 if (!implicit_null_checks_available) {
1340 cmpdi(CR0, receiver, 0);
1341 beqctr(CR0);
1342 }
1343 load_klass_no_decode(tmp1, receiver); // 2 instructions with UseCompactObjectHeaders
1344 ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1345 cmpd(CR0, tmp1, tmp2);
1346 bnectr(CR0);
1347 }
1348
1349 assert((offset() % end_alignment) == 0, "Misaligned verified entry point");
1350
1351 return uep_offset;
1352 }
1353
1354 void MacroAssembler::call_VM_base(Register oop_result,
1355 Register last_java_sp,
1356 address entry_point,
1357 bool check_exceptions,
1358 Label* last_java_pc) {
1359 BLOCK_COMMENT("call_VM {");
1360 // Determine last_java_sp register.
1361 if (!last_java_sp->is_valid()) {
1362 last_java_sp = R1_SP;
1363 }
1364 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1, last_java_pc);
1365
1366 // ARG1 must hold thread address.
1367 mr(R3_ARG1, R16_thread);
1368 address return_pc = call_c(entry_point, relocInfo::none);
1369
1370 reset_last_Java_frame();
1371
1372 // Check for pending exceptions.
1373 if (check_exceptions) {
1374 // We don't check for exceptions here.
1375 ShouldNotReachHere();
1376 }
1377
1378 // Get oop result if there is one and reset the value in the thread.
1379 if (oop_result->is_valid()) {
1380 get_vm_result_oop(oop_result);
1381 }
1382
1383 _last_calls_return_pc = return_pc;
1384 BLOCK_COMMENT("} call_VM");
1385 }
1386
1387 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1388 BLOCK_COMMENT("call_VM_leaf {");
1389 call_c(entry_point);
1390 BLOCK_COMMENT("} call_VM_leaf");
1391 }
1392
1393 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions, Label* last_java_pc) {
1394 call_VM_base(oop_result, noreg, entry_point, check_exceptions, last_java_pc);
1395 }
1396
1397 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1398 bool check_exceptions) {
1399 // R3_ARG1 is reserved for the thread.
1400 mr_if_needed(R4_ARG2, arg_1);
1401 call_VM(oop_result, entry_point, check_exceptions);
1402 }
1403
1404 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1405 bool check_exceptions) {
1406 // R3_ARG1 is reserved for the thread
1407 assert_different_registers(arg_2, R4_ARG2);
1408 mr_if_needed(R4_ARG2, arg_1);
1409 mr_if_needed(R5_ARG3, arg_2);
1410 call_VM(oop_result, entry_point, check_exceptions);
1411 }
1412
1413 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1414 bool check_exceptions) {
1415 // R3_ARG1 is reserved for the thread
1416 assert_different_registers(arg_2, R4_ARG2);
1417 assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
1418 mr_if_needed(R4_ARG2, arg_1);
1419 mr_if_needed(R5_ARG3, arg_2);
1420 mr_if_needed(R6_ARG4, arg_3);
1421 call_VM(oop_result, entry_point, check_exceptions);
1422 }
1423
1424 void MacroAssembler::call_VM_leaf(address entry_point) {
1425 call_VM_leaf_base(entry_point);
1426 }
1427
1428 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1429 mr_if_needed(R3_ARG1, arg_1);
1430 call_VM_leaf(entry_point);
1431 }
1432
1433 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1434 assert_different_registers(arg_2, R3_ARG1);
1435 mr_if_needed(R3_ARG1, arg_1);
1436 mr_if_needed(R4_ARG2, arg_2);
1437 call_VM_leaf(entry_point);
1438 }
1439
1440 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1441 assert_different_registers(arg_2, R3_ARG1);
1442 assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
1443 mr_if_needed(R3_ARG1, arg_1);
1444 mr_if_needed(R4_ARG2, arg_2);
1445 mr_if_needed(R5_ARG3, arg_3);
1446 call_VM_leaf(entry_point);
1447 }
1448
1449 // Check whether instruction is a read access to the polling page
1450 // which was emitted by load_from_polling_page(..).
1451 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1452 address* polling_address_ptr) {
1453 if (!is_ld(instruction))
1454 return false; // It's not a ld. Fail.
1455
1456 int rt = inv_rt_field(instruction);
1457 int ra = inv_ra_field(instruction);
1458 int ds = inv_ds_field(instruction);
1459 if (!(ds == 0 && ra != 0 && rt == 0)) {
1460 return false; // It's not a ld(r0, X, ra). Fail.
1461 }
1462
1463 if (!ucontext) {
1464 // Set polling address.
1465 if (polling_address_ptr != nullptr) {
1466 *polling_address_ptr = nullptr;
1467 }
1468 return true; // No ucontext given. Can't check value of ra. Assume true.
1469 }
1470
1471 #ifdef LINUX
1472 // Ucontext given. Check that register ra contains the address of
1473 // the safepoing polling page.
1474 ucontext_t* uc = (ucontext_t*) ucontext;
1475 // Set polling address.
1476 address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1477 if (polling_address_ptr != nullptr) {
1478 *polling_address_ptr = addr;
1479 }
1480 return SafepointMechanism::is_poll_address(addr);
1481 #else
1482 // Not on Linux, ucontext must be null.
1483 ShouldNotReachHere();
1484 return false;
1485 #endif
1486 }
1487
1488 void MacroAssembler::bang_stack_with_offset(int offset) {
1489 // When increasing the stack, the old stack pointer will be written
1490 // to the new top of stack according to the PPC64 abi.
1491 // Therefore, stack banging is not necessary when increasing
1492 // the stack by <= os::vm_page_size() bytes.
1493 // When increasing the stack by a larger amount, this method is
1494 // called repeatedly to bang the intermediate pages.
1495
1496 // Stack grows down, caller passes positive offset.
1497 assert(offset > 0, "must bang with positive offset");
1498
1499 long stdoffset = -offset;
1500
1501 if (is_simm(stdoffset, 16)) {
1502 // Signed 16 bit offset, a simple std is ok.
1503 if (UseLoadInstructionsForStackBangingPPC64) {
1504 ld(R0, (int)(signed short)stdoffset, R1_SP);
1505 } else {
1506 std(R0,(int)(signed short)stdoffset, R1_SP);
1507 }
1508 } else if (is_simm(stdoffset, 31)) {
1509 const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1510 const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1511
1512 Register tmp = R11;
1513 addis(tmp, R1_SP, hi);
1514 if (UseLoadInstructionsForStackBangingPPC64) {
1515 ld(R0, lo, tmp);
1516 } else {
1517 std(R0, lo, tmp);
1518 }
1519 } else {
1520 ShouldNotReachHere();
1521 }
1522 }
1523
1524 // If instruction is a stack bang of the form
1525 // std R0, x(Ry), (see bang_stack_with_offset())
1526 // stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame())
1527 // or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame())
1528 // return the banged address. Otherwise, return 0.
1529 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1530 #ifdef LINUX
1531 ucontext_t* uc = (ucontext_t*) ucontext;
1532 int rs = inv_rs_field(instruction);
1533 int ra = inv_ra_field(instruction);
1534 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64)
1535 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1536 || (is_stdu(instruction) && rs == 1)) {
1537 int ds = inv_ds_field(instruction);
1538 // return banged address
1539 return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1540 } else if (is_stdux(instruction) && rs == 1) {
1541 int rb = inv_rb_field(instruction);
1542 address sp = (address)uc->uc_mcontext.regs->gpr[1];
1543 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1544 return ra != 1 || rb_val >= 0 ? nullptr // not a stack bang
1545 : sp + rb_val; // banged address
1546 }
1547 return nullptr; // not a stack bang
1548 #else
1549 // workaround not needed on !LINUX :-)
1550 ShouldNotCallThis();
1551 return nullptr;
1552 #endif
1553 }
1554
1555 void MacroAssembler::reserved_stack_check(Register return_pc) {
1556 // Test if reserved zone needs to be enabled.
1557 Label no_reserved_zone_enabling;
1558
1559 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1560 cmpld(CR0, R1_SP, R0);
1561 blt_predict_taken(CR0, no_reserved_zone_enabling);
1562
1563 // Enable reserved zone again, throw stack overflow exception.
1564 push_frame_reg_args(0, R0);
1565 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1566 pop_frame();
1567 mtlr(return_pc);
1568 load_const_optimized(R0, SharedRuntime::throw_delayed_StackOverflowError_entry());
1569 mtctr(R0);
1570 bctr();
1571
1572 should_not_reach_here();
1573
1574 bind(no_reserved_zone_enabling);
1575 }
1576
1577 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1578 bool cmpxchgx_hint) {
1579 Label retry;
1580 bind(retry);
1581 ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1582 stdcx_(exchange_value, addr_base);
1583 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1584 bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1585 } else {
1586 bne( CR0, retry); // StXcx_ sets CR0.
1587 }
1588 }
1589
1590 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1591 Register tmp, bool cmpxchgx_hint) {
1592 Label retry;
1593 bind(retry);
1594 ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1595 add(tmp, dest_current_value, inc_value);
1596 stdcx_(tmp, addr_base);
1597 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1598 bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1599 } else {
1600 bne( CR0, retry); // StXcx_ sets CR0.
1601 }
1602 }
1603
1604 // Word/sub-word atomic helper functions
1605
1606 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1607 // Only signed types are supported with size < 4.
1608 // Atomic add always kills tmp1.
1609 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1610 Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1611 bool cmpxchgx_hint, bool is_add, int size) {
1612 // Sub-word instructions are available since Power 8.
1613
1614 Label retry;
1615 Register shift_amount = noreg,
1616 val32 = dest_current_value,
1617 modval = is_add ? tmp1 : exchange_value;
1618
1619
1620 // atomic emulation loop
1621 bind(retry);
1622
1623 switch (size) {
1624 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1625 case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1626 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1627 default: ShouldNotReachHere();
1628 }
1629
1630 if (is_add) { add(modval, dest_current_value, exchange_value); }
1631
1632
1633 switch (size) {
1634 case 4: stwcx_(modval, addr_base); break;
1635 case 2: sthcx_(modval, addr_base); break;
1636 case 1: stbcx_(modval, addr_base); break;
1637 default: ShouldNotReachHere();
1638 }
1639
1640 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1641 bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1642 } else {
1643 bne( CR0, retry); // StXcx_ sets CR0.
1644 }
1645
1646 // l?arx zero-extends, but Java wants byte/short values sign-extended.
1647 if (size == 1) {
1648 extsb(dest_current_value, dest_current_value);
1649 } else if (size == 2) {
1650 extsh(dest_current_value, dest_current_value);
1651 };
1652 }
1653
1654 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1655 // Only signed types are supported with size < 4.
1656 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1657 RegisterOrConstant compare_value, Register exchange_value,
1658 Register addr_base, Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1659 // Sub-word instructions are available since Power 8.
1660 Register shift_amount = noreg,
1661 val32 = dest_current_value,
1662 modval = exchange_value;
1663
1664 // atomic emulation loop
1665 bind(retry);
1666
1667 switch (size) {
1668 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1669 case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1670 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1671 default: ShouldNotReachHere();
1672 }
1673
1674 if (size == 1) {
1675 extsb(dest_current_value, dest_current_value);
1676 } else if (size == 2) {
1677 extsh(dest_current_value, dest_current_value);
1678 };
1679
1680 cmpw(flag, dest_current_value, compare_value);
1681 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1682 bne_predict_not_taken(flag, failed);
1683 } else {
1684 bne( flag, failed);
1685 }
1686 // branch to done => (flag == ne), (dest_current_value != compare_value)
1687 // fall through => (flag == eq), (dest_current_value == compare_value)
1688
1689 switch (size) {
1690 case 4: stwcx_(modval, addr_base); break;
1691 case 2: sthcx_(modval, addr_base); break;
1692 case 1: stbcx_(modval, addr_base); break;
1693 default: ShouldNotReachHere();
1694 }
1695 }
1696
1697 // CmpxchgX sets condition register to cmpX(current, compare).
1698 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1699 RegisterOrConstant compare_value, Register exchange_value,
1700 Register addr_base, int semantics, bool cmpxchgx_hint, Register int_flag_success,
1701 Label* failed_ext, bool contention_hint, bool weak, int size) {
1702 Label retry;
1703 Label failed_int;
1704 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1705 Label done;
1706
1707 // Save one branch if result is returned via register and
1708 // result register is different from the other ones.
1709 bool use_result_reg = (int_flag_success != noreg);
1710 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
1711 int_flag_success != exchange_value && int_flag_success != addr_base);
1712 assert(!weak || flag == CR0, "weak only supported with CR0");
1713 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1714 assert(size == 1 || size == 2 || size == 4, "unsupported");
1715
1716 if (use_result_reg && preset_result_reg) {
1717 li(int_flag_success, 0); // preset (assume cas failed)
1718 }
1719
1720 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1721 if (contention_hint) { // Don't try to reserve if cmp fails.
1722 switch (size) {
1723 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1724 case 2: lha(dest_current_value, 0, addr_base); break;
1725 case 4: lwz(dest_current_value, 0, addr_base); break;
1726 default: ShouldNotReachHere();
1727 }
1728 cmpw(flag, dest_current_value, compare_value);
1729 bne(flag, failed);
1730 }
1731
1732 // release/fence semantics
1733 if (semantics & MemBarRel) {
1734 release();
1735 }
1736
1737 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base,
1738 retry, failed, cmpxchgx_hint, size);
1739 if (!weak || use_result_reg || failed_ext) {
1740 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1741 bne_predict_not_taken(CR0, weak ? failed : retry); // StXcx_ sets CR0.
1742 } else {
1743 bne( CR0, weak ? failed : retry); // StXcx_ sets CR0.
1744 }
1745 }
1746 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped)
1747
1748 // Result in register (must do this at the end because int_flag_success can be the
1749 // same register as one above).
1750 if (use_result_reg) {
1751 li(int_flag_success, 1);
1752 }
1753
1754 if (semantics & MemBarFenceAfter) {
1755 fence();
1756 } else if (semantics & MemBarAcq) {
1757 isync();
1758 }
1759
1760 if (use_result_reg && !preset_result_reg) {
1761 b(done);
1762 }
1763
1764 bind(failed_int);
1765 if (use_result_reg && !preset_result_reg) {
1766 li(int_flag_success, 0);
1767 }
1768
1769 bind(done);
1770 // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1771 // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1772 }
1773
1774 // Performs atomic compare exchange:
1775 // if (compare_value == *addr_base)
1776 // *addr_base = exchange_value
1777 // int_flag_success = 1;
1778 // else
1779 // int_flag_success = 0;
1780 //
1781 // ConditionRegister flag = cmp(compare_value, *addr_base)
1782 // Register dest_current_value = *addr_base
1783 // Register compare_value Used to compare with value in memory
1784 // Register exchange_value Written to memory if compare_value == *addr_base
1785 // Register addr_base The memory location to compareXChange
1786 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base
1787 //
1788 // To avoid the costly compare exchange the value is tested beforehand.
1789 // Several special cases exist to avoid that unnecessary information is generated.
1790 //
1791 void MacroAssembler::cmpxchgd(ConditionRegister flag, Register dest_current_value,
1792 RegisterOrConstant compare_value, Register exchange_value,
1793 Register addr_base,
1794 int semantics, bool cmpxchgx_hint, Register int_flag_success,
1795 Label* failed_ext, bool contention_hint, bool weak) {
1796 Label retry;
1797 Label failed_int;
1798 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1799 Label done;
1800
1801 // Save one branch if result is returned via register and result register is different from the other ones.
1802 bool use_result_reg = (int_flag_success!=noreg);
1803 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1804 int_flag_success!=exchange_value && int_flag_success!=addr_base);
1805 assert(!weak || flag == CR0, "weak only supported with CR0");
1806 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1807
1808 if (use_result_reg && preset_result_reg) {
1809 li(int_flag_success, 0); // preset (assume cas failed)
1810 }
1811
1812 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1813 if (contention_hint) { // Don't try to reserve if cmp fails.
1814 ld(dest_current_value, 0, addr_base);
1815 cmpd(flag, dest_current_value, compare_value);
1816 bne(flag, failed);
1817 }
1818
1819 // release/fence semantics
1820 if (semantics & MemBarRel) {
1821 release();
1822 }
1823
1824 // atomic emulation loop
1825 bind(retry);
1826
1827 ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1828 cmpd(flag, dest_current_value, compare_value);
1829 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1830 bne_predict_not_taken(flag, failed);
1831 } else {
1832 bne( flag, failed);
1833 }
1834
1835 stdcx_(exchange_value, addr_base);
1836 if (!weak || use_result_reg || failed_ext) {
1837 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1838 bne_predict_not_taken(CR0, weak ? failed : retry); // stXcx_ sets CR0
1839 } else {
1840 bne( CR0, weak ? failed : retry); // stXcx_ sets CR0
1841 }
1842 }
1843
1844 // result in register (must do this at the end because int_flag_success can be the same register as one above)
1845 if (use_result_reg) {
1846 li(int_flag_success, 1);
1847 }
1848
1849 if (semantics & MemBarFenceAfter) {
1850 fence();
1851 } else if (semantics & MemBarAcq) {
1852 isync();
1853 }
1854
1855 if (use_result_reg && !preset_result_reg) {
1856 b(done);
1857 }
1858
1859 bind(failed_int);
1860 if (use_result_reg && !preset_result_reg) {
1861 li(int_flag_success, 0);
1862 }
1863
1864 bind(done);
1865 // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1866 // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1867 }
1868
1869 // Look up the method for a megamorphic invokeinterface call.
1870 // The target method is determined by <intf_klass, itable_index>.
1871 // The receiver klass is in recv_klass.
1872 // On success, the result will be in method_result, and execution falls through.
1873 // On failure, execution transfers to the given label.
1874 void MacroAssembler::lookup_interface_method(Register recv_klass,
1875 Register intf_klass,
1876 RegisterOrConstant itable_index,
1877 Register method_result,
1878 Register scan_temp,
1879 Register temp2,
1880 Label& L_no_such_interface,
1881 bool return_method) {
1882 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1883
1884 // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1885 int vtable_base = in_bytes(Klass::vtable_start_offset());
1886 int itentry_off = in_bytes(itableMethodEntry::method_offset());
1887 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize);
1888 int scan_step = itableOffsetEntry::size() * wordSize;
1889 int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1890
1891 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1892 // We should store the aligned, prescaled offset in the klass.
1893 // Then the next several instructions would fold away.
1894
1895 sldi(scan_temp, scan_temp, log_vte_size);
1896 addi(scan_temp, scan_temp, vtable_base);
1897 add(scan_temp, recv_klass, scan_temp);
1898
1899 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1900 if (return_method) {
1901 if (itable_index.is_register()) {
1902 Register itable_offset = itable_index.as_register();
1903 sldi(method_result, itable_offset, logMEsize);
1904 if (itentry_off) { addi(method_result, method_result, itentry_off); }
1905 add(method_result, method_result, recv_klass);
1906 } else {
1907 long itable_offset = (long)itable_index.as_constant();
1908 // static address, no relocation
1909 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1910 }
1911 }
1912
1913 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1914 // if (scan->interface() == intf) {
1915 // result = (klass + scan->offset() + itable_index);
1916 // }
1917 // }
1918 Label search, found_method;
1919
1920 for (int peel = 1; peel >= 0; peel--) {
1921 // %%%% Could load both offset and interface in one ldx, if they were
1922 // in the opposite order. This would save a load.
1923 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1924
1925 // Check that this entry is non-null. A null entry means that
1926 // the receiver class doesn't implement the interface, and wasn't the
1927 // same as when the caller was compiled.
1928 cmpd(CR0, temp2, intf_klass);
1929
1930 if (peel) {
1931 beq(CR0, found_method);
1932 } else {
1933 bne(CR0, search);
1934 // (invert the test to fall through to found_method...)
1935 }
1936
1937 if (!peel) break;
1938
1939 bind(search);
1940
1941 cmpdi(CR0, temp2, 0);
1942 beq(CR0, L_no_such_interface);
1943 addi(scan_temp, scan_temp, scan_step);
1944 }
1945
1946 bind(found_method);
1947
1948 // Got a hit.
1949 if (return_method) {
1950 int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1951 lwz(scan_temp, ito_offset, scan_temp);
1952 ldx(method_result, scan_temp, method_result);
1953 }
1954 }
1955
1956 // virtual method calling
1957 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1958 RegisterOrConstant vtable_index,
1959 Register method_result) {
1960
1961 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1962
1963 const ByteSize base = Klass::vtable_start_offset();
1964 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1965
1966 if (vtable_index.is_register()) {
1967 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1968 add(recv_klass, vtable_index.as_register(), recv_klass);
1969 } else {
1970 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1971 }
1972 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1973 }
1974
1975 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1976 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1977 Register super_klass,
1978 Register temp1_reg,
1979 Register temp2_reg,
1980 Label* L_success,
1981 Label* L_failure,
1982 Label* L_slow_path,
1983 RegisterOrConstant super_check_offset) {
1984
1985 const Register check_cache_offset = temp1_reg;
1986 const Register cached_super = temp2_reg;
1987
1988 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1989
1990 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1991 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1992
1993 bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1994 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1995
1996 Label L_fallthrough;
1997 int label_nulls = 0;
1998 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
1999 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
2000 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
2001 assert(label_nulls <= 1 ||
2002 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
2003 "at most one null in the batch, usually");
2004
2005 // If the pointers are equal, we are done (e.g., String[] elements).
2006 // This self-check enables sharing of secondary supertype arrays among
2007 // non-primary types such as array-of-interface. Otherwise, each such
2008 // type would need its own customized SSA.
2009 // We move this check to the front of the fast path because many
2010 // type checks are in fact trivially successful in this manner,
2011 // so we get a nicely predicted branch right at the start of the check.
2012 cmpd(CR0, sub_klass, super_klass);
2013 beq(CR0, *L_success);
2014
2015 // Check the supertype display:
2016 if (must_load_sco) {
2017 // The super check offset is always positive...
2018 lwz(check_cache_offset, sco_offset, super_klass);
2019 super_check_offset = RegisterOrConstant(check_cache_offset);
2020 // super_check_offset is register.
2021 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
2022 }
2023 // The loaded value is the offset from Klass.
2024
2025 ld(cached_super, super_check_offset, sub_klass);
2026 cmpd(CR0, cached_super, super_klass);
2027
2028 // This check has worked decisively for primary supers.
2029 // Secondary supers are sought in the super_cache ('super_cache_addr').
2030 // (Secondary supers are interfaces and very deeply nested subtypes.)
2031 // This works in the same check above because of a tricky aliasing
2032 // between the super_cache and the primary super display elements.
2033 // (The 'super_check_addr' can address either, as the case requires.)
2034 // Note that the cache is updated below if it does not help us find
2035 // what we need immediately.
2036 // So if it was a primary super, we can just fail immediately.
2037 // Otherwise, it's the slow path for us (no success at this point).
2038
2039 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
2040
2041 if (super_check_offset.is_register()) {
2042 beq(CR0, *L_success);
2043 cmpwi(CR0, super_check_offset.as_register(), sc_offset);
2044 if (L_failure == &L_fallthrough) {
2045 beq(CR0, *L_slow_path);
2046 } else {
2047 bne(CR0, *L_failure);
2048 FINAL_JUMP(*L_slow_path);
2049 }
2050 } else {
2051 if (super_check_offset.as_constant() == sc_offset) {
2052 // Need a slow path; fast failure is impossible.
2053 if (L_slow_path == &L_fallthrough) {
2054 beq(CR0, *L_success);
2055 } else {
2056 bne(CR0, *L_slow_path);
2057 FINAL_JUMP(*L_success);
2058 }
2059 } else {
2060 // No slow path; it's a fast decision.
2061 if (L_failure == &L_fallthrough) {
2062 beq(CR0, *L_success);
2063 } else {
2064 bne(CR0, *L_failure);
2065 FINAL_JUMP(*L_success);
2066 }
2067 }
2068 }
2069
2070 bind(L_fallthrough);
2071 #undef FINAL_JUMP
2072 }
2073
2074 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
2075 Register super_klass,
2076 Register temp1_reg,
2077 Register temp2_reg,
2078 Label* L_success,
2079 Register result_reg) {
2080 const Register array_ptr = temp1_reg; // current value from cache array
2081 const Register temp = temp2_reg;
2082
2083 assert_different_registers(sub_klass, super_klass, array_ptr, temp);
2084 assert(L_success == nullptr || result_reg == noreg, "can't have both");
2085
2086 int source_offset = in_bytes(Klass::secondary_supers_offset());
2087 int target_offset = in_bytes(Klass::secondary_super_cache_offset());
2088
2089 int length_offset = Array<Klass*>::length_offset_in_bytes();
2090 int base_offset = Array<Klass*>::base_offset_in_bytes();
2091
2092 Label hit, loop, failure, fallthru;
2093
2094 ld(array_ptr, source_offset, sub_klass);
2095
2096 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2097 lwz(temp, length_offset, array_ptr);
2098 cmpwi(CR0, temp, 0);
2099 beq(CR0, (L_success == nullptr) ? failure : fallthru); // indicate failure if length 0
2100
2101 mtctr(temp); // load ctr
2102
2103 bind(loop);
2104 // Oops in table are NO MORE compressed.
2105 ld(temp, base_offset, array_ptr);
2106 cmpd(CR0, temp, super_klass);
2107 beq(CR0, hit);
2108 addi(array_ptr, array_ptr, BytesPerWord);
2109 bdnz(loop);
2110
2111 bind(failure);
2112 if (result_reg != noreg) {
2113 li(result_reg, 1); // load non-zero result (indicates a miss)
2114 } else if (L_success == nullptr) {
2115 crandc(CR0, Assembler::equal, CR0, Assembler::equal); // miss indicated by CR0.ne
2116 }
2117 b(fallthru);
2118
2119 bind(hit);
2120 std(super_klass, target_offset, sub_klass); // save result to cache
2121 if (result_reg != noreg) {
2122 li(result_reg, 0); // load zero result (indicates a hit)
2123 } else if (L_success != nullptr) {
2124 b(*L_success);
2125 }
2126
2127 bind(fallthru);
2128 }
2129
2130 Register MacroAssembler::allocate_if_noreg(Register r,
2131 RegSetIterator<Register> &available_regs,
2132 RegSet ®s_to_push) {
2133 if (!r->is_valid()) {
2134 r = *available_regs++;
2135 regs_to_push += r;
2136 }
2137 return r;
2138 }
2139
2140 void MacroAssembler::push_set(RegSet set)
2141 {
2142 int spill_offset = 0;
2143 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
2144 spill_offset += wordSize;
2145 std(*it, -spill_offset, R1_SP);
2146 }
2147 }
2148
2149 void MacroAssembler::pop_set(RegSet set)
2150 {
2151 int spill_offset = 0;
2152 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
2153 spill_offset += wordSize;
2154 ld(*it, -spill_offset, R1_SP);
2155 }
2156 }
2157
2158 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
2159 Register super_klass,
2160 Register temp1_reg,
2161 Register temp2_reg,
2162 Label* L_success,
2163 Register result_reg) {
2164 RegSet temps = RegSet::of(temp1_reg, temp2_reg);
2165
2166 assert_different_registers(sub_klass, super_klass, temp1_reg, temp2_reg, result_reg, R0);
2167
2168 Register temp3_reg = noreg, temp4_reg = noreg;
2169 bool result_reg_provided = (result_reg != noreg); // otherwise, result will be in CR0
2170
2171 BLOCK_COMMENT("check_klass_subtype_slow_path_table");
2172
2173 RegSetIterator<Register> available_regs
2174 = (RegSet::range(R2, R12) - temps - sub_klass - super_klass).begin();
2175
2176 RegSet pushed_regs;
2177
2178 temp1_reg = allocate_if_noreg(temp1_reg, available_regs, pushed_regs);
2179 temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs);
2180 temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs);
2181 temp4_reg = allocate_if_noreg(temp4_reg, available_regs, pushed_regs);
2182 result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
2183
2184 push_set(pushed_regs);
2185
2186 lookup_secondary_supers_table_var(sub_klass, super_klass,
2187 temp1_reg, temp2_reg, temp3_reg, temp4_reg,
2188 result_reg);
2189
2190 if (L_success != nullptr || !result_reg_provided) {
2191 // result_reg may get overwritten by pop_set
2192 cmpdi(CR0, result_reg, 0);
2193 }
2194
2195 // Unspill the temp. registers:
2196 pop_set(pushed_regs);
2197
2198 if (L_success != nullptr) {
2199 beq(CR0, *L_success);
2200 }
2201 }
2202
2203 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
2204 Register super_klass,
2205 Register temp1_reg,
2206 Register temp2_reg,
2207 Label* L_success,
2208 Register result_reg) {
2209 if (UseSecondarySupersTable) {
2210 check_klass_subtype_slow_path_table(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, result_reg);
2211 } else {
2212 if (temp2_reg == noreg) temp2_reg = R0;
2213 check_klass_subtype_slow_path_linear(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, result_reg);
2214 }
2215 }
2216
2217 // Try fast path, then go to slow one if not successful
2218 void MacroAssembler::check_klass_subtype(Register sub_klass,
2219 Register super_klass,
2220 Register temp1_reg,
2221 Register temp2_reg,
2222 Label& L_success) {
2223 Label L_failure;
2224 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2225 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2226 bind(L_failure); // Fallthru if not successful.
2227 }
2228
2229 // scans count pointer sized words at [addr] for occurrence of value,
2230 // generic (count must be >0)
2231 // iff found: CR0 eq, scratch == 0
2232 void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) {
2233 Label Lloop, Lafter_loop, Lexit;
2234
2235 srdi_(scratch, count, 1);
2236 beq(CR0, Lafter_loop);
2237 mtctr(scratch);
2238
2239 bind(Lloop); // 2x unrolled
2240 ld(scratch, 0, addr);
2241 xor_(scratch, scratch, value);
2242 beq(CR0, Lexit);
2243 ld(scratch, 8, addr);
2244 xor_(scratch, scratch, value);
2245 beq(CR0, Lexit);
2246 addi(addr, addr, 2 * wordSize);
2247 bdnz(Lloop);
2248
2249 bind(Lafter_loop);
2250 andi_(scratch, count, 1);
2251 beq(CR0, Lexit); // if taken: CR0 eq and scratch == 0
2252 ld(scratch, 0, addr);
2253 xor_(scratch, scratch, value);
2254
2255 bind(Lexit);
2256 }
2257
// Debug check that the inline code and the stub use the same registers.
// Expands to an assert over the locals r_super_klass, r_array_base,
// r_array_length, r_array_index, r_sub_klass, r_bitmap and result, which must
// be in scope where the macro is used. The last four may also be noreg.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS                                     \
  do {                                                                              \
    assert(r_super_klass == R4_ARG2 &&                                              \
           r_array_base == R3_ARG1 &&                                               \
           r_array_length == R7_ARG5 &&                                             \
           (r_array_index == R6_ARG4 || r_array_index == noreg) &&                  \
           (r_sub_klass == R5_ARG3 || r_sub_klass == noreg) &&                      \
           (r_bitmap == R11_scratch1 || r_bitmap == noreg) &&                       \
           (result == R8_ARG6 || result == noreg), "registers must match ppc64.ad"); \
  } while(0)
2269
2270 void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
2271 Register r_super_klass,
2272 Register temp1,
2273 Register temp2,
2274 Register temp3,
2275 Register temp4,
2276 Register result,
2277 u1 super_klass_slot) {
2278 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
2279
2280 Label L_done;
2281
2282 BLOCK_COMMENT("lookup_secondary_supers_table_const {");
2283
2284 const Register
2285 r_array_base = temp1,
2286 r_array_length = temp2,
2287 r_array_index = temp3,
2288 r_bitmap = temp4;
2289
2290 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; // Required for stub call below.
2291
2292 ld(r_bitmap, in_bytes(Klass::secondary_supers_bitmap_offset()), r_sub_klass);
2293
2294 // First check the bitmap to see if super_klass might be present. If
2295 // the bit is zero, we are certain that super_klass is not one of
2296 // the secondary supers.
2297 u1 bit = super_klass_slot;
2298 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
2299
2300 // if (shift_count == 0) this is used for comparing with 0:
2301 sldi_(r_array_index, r_bitmap, shift_count);
2302
2303 li(result, 1); // failure
2304 // We test the MSB of r_array_index, i.e. its sign bit
2305 bge(CR0, L_done);
2306
2307 // We will consult the secondary-super array.
2308 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2309
2310 // The value i in r_array_index is >= 1, so even though r_array_base
2311 // points to the length, we don't need to adjust it to point to the
2312 // data.
2313 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
2314
2315 // Get the first array index that can contain super_klass.
2316 if (bit != 0) {
2317 popcntd(r_array_index, r_array_index);
2318 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
2319 sldi(r_array_index, r_array_index, LogBytesPerWord); // scale
2320 ldx(result, r_array_base, r_array_index);
2321 } else {
2322 // Actually use index 0, but r_array_base and r_array_index are off by 1 word
2323 // such that the sum is precise.
2324 ld(result, BytesPerWord, r_array_base);
2325 li(r_array_index, BytesPerWord); // for slow path (scaled)
2326 }
2327
2328 xor_(result, result, r_super_klass);
2329 beq(CR0, L_done); // Found a match (result == 0)
2330
2331 // Is there another entry to check? Consult the bitmap.
2332 testbitdi(CR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
2333 beq(CR0, L_done); // (result != 0)
2334
2335 // Linear probe. Rotate the bitmap so that the next bit to test is
2336 // in Bit 2 for the look-ahead check in the slow path.
2337 if (bit != 0) {
2338 rldicl(r_bitmap, r_bitmap, 64 - bit, 0);
2339 }
2340
2341 // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
2342 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
2343 // Kills: r_array_length.
2344 // Returns: result.
2345 address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub();
2346 Register r_stub_addr = r_array_length;
2347 add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0);
2348 mtctr(r_stub_addr);
2349 bctrl();
2350
2351 bind(L_done);
2352 BLOCK_COMMENT("} lookup_secondary_supers_table_const");
2353
2354 if (VerifySecondarySupers) {
2355 verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
2356 temp1, temp2, temp3);
2357 }
2358 }
2359
2360 // At runtime, return 0 in result if r_super_klass is a superclass of
2361 // r_sub_klass, otherwise return nonzero. Use this version of
2362 // lookup_secondary_supers_table() if you don't know ahead of time
2363 // which superclass will be searched for. Used by interpreter and
2364 // runtime stubs. It is larger and has somewhat greater latency than
2365 // the version above, which takes a constant super_klass_slot.
2366 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
2367 Register r_super_klass,
2368 Register temp1,
2369 Register temp2,
2370 Register temp3,
2371 Register temp4,
2372 Register result) {
2373 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result, R0);
2374
2375 Label L_done;
2376
2377 BLOCK_COMMENT("lookup_secondary_supers_table_var {");
2378
2379 const Register
2380 r_array_base = temp1,
2381 slot = temp2,
2382 r_array_index = temp3,
2383 r_bitmap = temp4;
2384
2385 lbz(slot, in_bytes(Klass::hash_slot_offset()), r_super_klass);
2386 ld(r_bitmap, in_bytes(Klass::secondary_supers_bitmap_offset()), r_sub_klass);
2387
2388 li(result, 1); // Make sure that result is nonzero if the test below misses.
2389
2390 // First check the bitmap to see if super_klass might be present. If
2391 // the bit is zero, we are certain that super_klass is not one of
2392 // the secondary supers.
2393 xori(R0, slot, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1); // slot ^ 63 === 63 - slot (mod 64)
2394 sld_(r_array_index, r_bitmap, R0); // shift left by 63-slot
2395
2396 // We test the MSB of r_array_index, i.e. its sign bit
2397 bge(CR0, L_done);
2398
2399 // We will consult the secondary-super array.
2400 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2401
2402 // The value i in r_array_index is >= 1, so even though r_array_base
2403 // points to the length, we don't need to adjust it to point to the data.
2404 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
2405 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
2406
2407 // Get the first array index that can contain super_klass into r_array_index.
2408 popcntd(r_array_index, r_array_index);
2409
2410 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
2411 sldi(r_array_index, r_array_index, LogBytesPerWord); // scale
2412
2413 ldx(R0, r_array_base, r_array_index);
2414 xor_(result, R0, r_super_klass);
2415 beq(CR0, L_done); // found a match, result is 0 in this case
2416
2417 // Linear probe. Rotate the bitmap so that the next bit to test is
2418 // in Bit 1.
2419 neg(R0, slot); // rotate right
2420 rldcl(r_bitmap, r_bitmap, R0, 0);
2421 Register temp = slot;
2422 andi_(temp, r_bitmap, 2);
2423 beq(CR0, L_done); // fail (result != 0)
2424
2425 // The slot we just inspected is at secondary_supers[r_array_index - 1].
2426 // The next slot to be inspected, by the logic we're about to call,
2427 // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
2428 // have been checked.
2429 lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
2430 r_bitmap, result, temp);
2431 // return whatever we got from slow path
2432
2433 bind(L_done);
2434
2435 BLOCK_COMMENT("} lookup_secondary_supers_table_var");
2436
2437 if (VerifySecondarySupers) {
2438 verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
2439 temp1, temp2, temp3);
2440 }
2441 }
2442
2443 // Called by code generated by check_klass_subtype_slow_path
2444 // above. This is called when there is a collision in the hashed
2445 // lookup in the secondary supers array.
2446 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
2447 Register r_array_base,
2448 Register r_array_index,
2449 Register r_bitmap,
2450 Register result,
2451 Register temp1) {
2452 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
2453
2454 const Register
2455 r_array_length = temp1,
2456 r_sub_klass = noreg;
2457
2458 Label L_done;
2459
2460 // Load the array length.
2461 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
2462 // And adjust the array base to point to the data.
2463 // NB! Effectively increments current slot index by 1.
2464 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
2465 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
2466
2467 // Linear probe
2468 Label L_huge;
2469
2470 // The bitmap is full to bursting.
2471 // Implicit invariant: BITMAP_FULL implies (length > 0)
2472 cmpwi(CR0, r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
2473 bgt(CR0, L_huge);
2474
2475 // NB! Our caller has checked bits 0 and 1 in the bitmap. The
2476 // current slot (at secondary_supers[r_array_index]) has not yet
2477 // been inspected, and r_array_index may be out of bounds if we
2478 // wrapped around the end of the array.
2479
2480 { // This is conventional linear probing, but instead of terminating
2481 // when a null entry is found in the table, we maintain a bitmap
2482 // in which a 0 indicates missing entries.
2483 // The check above guarantees there are 0s in the bitmap, so the loop
2484 // eventually terminates.
2485
2486 #ifdef ASSERT
2487 {
2488 // We should only reach here after having found a bit in the bitmap.
2489 // Invariant: array_length == popcount(bitmap)
2490 Label ok;
2491 cmpdi(CR0, r_array_length, 0);
2492 bgt(CR0, ok);
2493 stop("array_length must be positive");
2494 bind(ok);
2495 }
2496 #endif
2497
2498 // Compute limit in r_array_length
2499 addi(r_array_length, r_array_length, -1);
2500 sldi(r_array_length, r_array_length, LogBytesPerWord);
2501
2502 Label L_loop;
2503 bind(L_loop);
2504
2505 // Check for wraparound.
2506 cmpd(CR0, r_array_index, r_array_length);
2507 isel_0(r_array_index, CR0, Assembler::greater);
2508
2509 ldx(result, r_array_base, r_array_index);
2510 xor_(result, result, r_super_klass);
2511 beq(CR0, L_done); // success (result == 0)
2512
2513 // look-ahead check (Bit 2); result is non-zero
2514 testbitdi(CR0, R0, r_bitmap, 2);
2515 beq(CR0, L_done); // fail (result != 0)
2516
2517 rldicl(r_bitmap, r_bitmap, 64 - 1, 0);
2518 addi(r_array_index, r_array_index, BytesPerWord);
2519 b(L_loop);
2520 }
2521
2522 { // Degenerate case: more than 64 secondary supers.
2523 // FIXME: We could do something smarter here, maybe a vectorized
2524 // comparison or a binary search, but is that worth any added
2525 // complexity?
2526 bind(L_huge);
2527 repne_scan(r_array_base, r_super_klass, r_array_length, result);
2528 }
2529
2530 bind(L_done);
2531 }
2532
2533 // Make sure that the hashed lookup and a linear scan agree.
2534 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
2535 Register r_super_klass,
2536 Register result,
2537 Register temp1,
2538 Register temp2,
2539 Register temp3) {
2540 assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3);
2541
2542 const Register
2543 r_array_base = temp1,
2544 r_array_length = temp2,
2545 r_array_index = temp3,
2546 r_bitmap = noreg; // unused
2547
2548 BLOCK_COMMENT("verify_secondary_supers_table {");
2549
2550 Label passed, failure;
2551
2552 // We will consult the secondary-super array.
2553 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2554 // Load the array length.
2555 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
2556 // And adjust the array base to point to the data.
2557 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
2558
2559 // convert !=0 to 1
2560 normalize_bool(result, R0, true);
2561 const Register linear_result = r_array_index; // reuse
2562 li(linear_result, 1);
2563 cmpdi(CR0, r_array_length, 0);
2564 ble(CR0, failure);
2565 repne_scan(r_array_base, r_super_klass, r_array_length, linear_result);
2566 bind(failure);
2567
2568 // convert !=0 to 1
2569 normalize_bool(linear_result, R0, true);
2570
2571 cmpd(CR0, result, linear_result);
2572 beq(CR0, passed);
2573
2574 // report fatal error and terminate VM
2575
2576 // Argument shuffle. Using stack to avoid clashes.
2577 std(r_super_klass, -8, R1_SP);
2578 std(r_sub_klass, -16, R1_SP);
2579 std(linear_result, -24, R1_SP);
2580 mr_if_needed(R6_ARG4, result);
2581 ld(R3_ARG1, -8, R1_SP);
2582 ld(R4_ARG2, -16, R1_SP);
2583 ld(R5_ARG3, -24, R1_SP);
2584
2585 const char* msg = "mismatch";
2586 load_const_optimized(R7_ARG5, (intptr_t)msg, R0);
2587 call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
2588 should_not_reach_here();
2589
2590 bind(passed);
2591
2592 BLOCK_COMMENT("} verify_secondary_supers_table");
2593 }
2594
2595 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2596 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2597
2598 Label L_check_thread, L_fallthrough;
2599 if (L_fast_path == nullptr) {
2600 L_fast_path = &L_fallthrough;
2601 } else if (L_slow_path == nullptr) {
2602 L_slow_path = &L_fallthrough;
2603 }
2604
2605 // Fast path check: class is fully initialized
2606 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2607 // acquire by cmp-branch-isync if fully_initialized
2608 cmpwi(CR0, R0, InstanceKlass::fully_initialized);
2609 bne(CR0, L_check_thread);
2610 isync();
2611 b(*L_fast_path);
2612
2613 // Fast path check: current thread is initializer thread
2614 bind(L_check_thread);
2615 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2616 cmpd(CR0, thread, R0);
2617 if (L_slow_path == &L_fallthrough) {
2618 beq(CR0, *L_fast_path);
2619 } else if (L_fast_path == &L_fallthrough) {
2620 bne(CR0, *L_slow_path);
2621 } else {
2622 Unimplemented();
2623 }
2624
2625 bind(L_fallthrough);
2626 }
2627
2628 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2629 Register temp_reg,
2630 int extra_slot_offset) {
2631 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2632 int stackElementSize = Interpreter::stackElementSize;
2633 int offset = extra_slot_offset * stackElementSize;
2634 if (arg_slot.is_constant()) {
2635 offset += arg_slot.as_constant() * stackElementSize;
2636 return offset;
2637 } else {
2638 assert(temp_reg != noreg, "must specify");
2639 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2640 if (offset != 0)
2641 addi(temp_reg, temp_reg, offset);
2642 return temp_reg;
2643 }
2644 }
2645
2646 void MacroAssembler::tlab_allocate(
2647 Register obj, // result: pointer to object after successful allocation
2648 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
2649 int con_size_in_bytes, // object size in bytes if known at compile time
2650 Register t1, // temp register
2651 Label& slow_case // continuation point if fast allocation fails
2652 ) {
2653 // make sure arguments make sense
2654 assert_different_registers(obj, var_size_in_bytes, t1);
2655 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2656 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2657
2658 const Register new_top = t1;
2659 //verify_tlab(); not implemented
2660
2661 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2662 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2663 if (var_size_in_bytes == noreg) {
2664 addi(new_top, obj, con_size_in_bytes);
2665 } else {
2666 add(new_top, obj, var_size_in_bytes);
2667 }
2668 cmpld(CR0, new_top, R0);
2669 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CR0, Assembler::greater), slow_case);
2670
2671 #ifdef ASSERT
2672 // make sure new free pointer is properly aligned
2673 {
2674 Label L;
2675 andi_(R0, new_top, MinObjAlignmentInBytesMask);
2676 beq(CR0, L);
2677 stop("updated TLAB free is not properly aligned");
2678 bind(L);
2679 }
2680 #endif // ASSERT
2681
2682 // update the tlab top pointer
2683 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2684 //verify_tlab(); not implemented
2685 }
2686
// "The box" is the space on the stack where we copy the object mark.
//
// Emits the fast path for locking obj from compiled code.
//   flag - must be CR0 (asserted); carries the result, see below.
//   obj  - the object to lock.
//   box  - the on-stack lock slot. With UseObjectMonitorTable its monitor cache
//          field is cleared on entry and set to the monitor once an inflated
//          monitor has been locked here.
//   tmp1, tmp2 - temp registers.
//   tmp3 - temp register; must be noreg unless UseObjectMonitorTable (asserted).
// R0 is used as an additional scratch register.
// On exit CR0 is EQ if the lock was acquired here and NE if the caller has to
// take the slow path (checked by the ASSERT code at the end of this function).
void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register obj, Register box,
                                               Register tmp1, Register tmp2, Register tmp3) {
  assert_different_registers(obj, box, tmp1, tmp2, tmp3);
  assert(UseObjectMonitorTable || tmp3 == noreg, "tmp3 not needed");
  assert(flag == CR0, "bad condition register");

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST reach to with flag == EQ
  Label locked;
  // Finish fast lock unsuccessfully. MUST branch to with flag == NE
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    li(tmp1, 0);
    std(tmp1, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Test the value-based-class bit in the klass' misc flags and go to the
    // slow path if the comparison result is NE.
    load_klass(tmp1, obj);
    lbz(tmp1, in_bytes(Klass::misc_flags_offset()), tmp1);
    testbitdi(CR0, R0, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
    bne(CR0, slow_path);
  }

  // Holds the mark word; shares tmp1 with the temporaries used above.
  Register mark = tmp1;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ
    Label push;

    const Register top = tmp2;

    // Check if lock-stack is full.
    lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
    cmplwi(CR0, top, LockStack::end_offset() - 1);
    bgt(CR0, slow_path);

    // The underflow check is elided. The recursive check will always fail
    // when the lock stack is empty because of the _bad_oop_sentinel field.

    // Check if recursive.
    // top is an offset relative to R16_thread; compare obj with the entry just below it.
    subi(R0, top, oopSize);
    ldx(R0, R16_thread, R0);
    cmpd(CR0, obj, R0);
    beq(CR0, push);

    // Check for monitor (0b10) or locked (0b00).
    // Lock bits greater than the unlocked value go to the inflated path,
    // any other value different from unlocked goes to the slow path.
    ld(mark, oopDesc::mark_offset_in_bytes(), obj);
    andi_(R0, mark, markWord::lock_mask_in_place);
    cmpldi(CR0, R0, markWord::unlocked_value);
    bgt(CR0, inflated);
    bne(CR0, slow_path);

    // Not inflated.

    // Try to lock. Transition lock bits 0b01 => 0b00
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
    atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq);

    bind(push);
    // After successful lock, push object on lock-stack.
    stdx(obj, R16_thread, top);
    addi(top, top, oopSize);
    stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const uintptr_t monitor_tag = markWord::monitor_value;
    // Register assignment depends on UseObjectMonitorTable: the table lookup
    // needs tmp1 for the monitor, so the owner id moves to tmp3.
    const Register monitor    = UseObjectMonitorTable ? tmp1 : noreg;
    const Register owner_addr = tmp2;
    const Register thread_id  = UseObjectMonitorTable ? tmp3 : tmp1;
    Label monitor_locked;

    if (!UseObjectMonitorTable) {
      // Compute owner address.
      // The tag is removed by folding it into the displacement.
      addi(owner_addr, mark, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag);
      mark = noreg;
    } else {
      const Register tmp3_bucket = tmp3;
      const Register tmp2_hash = tmp2;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mr(tmp2_hash, mark);

      // Look for the monitor in the om_cache.

      // The lookup is unrolled at code generation time: one compare per cache entry.
      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ld(R0, in_bytes(cache_offset), R16_thread);
        ld(monitor, in_bytes(cache_offset + monitor_offset), R16_thread);
        cmpd(CR0, R0, obj);
        beq(CR0, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      srdi(tmp2_hash, tmp2_hash, markWord::hash_shift);

      // Get the table and calculate the bucket's address
      int simm16_rest = load_const_optimized(tmp3, ObjectMonitorTable::current_table_address(), R0, true);
      ld_ptr(tmp3, simm16_rest, tmp3);
      ld(tmp1, in_bytes(ObjectMonitorTable::table_capacity_mask_offset()), tmp3);
      andr(tmp2_hash, tmp2_hash, tmp1);
      ld(tmp3_bucket, in_bytes(ObjectMonitorTable::table_buckets_offset()), tmp3);

      // Read the monitor from the bucket.
      // Scale the masked hash to a word index into the bucket array.
      sldi(tmp2_hash, tmp2_hash, LogBytesPerWord);
      ldx(monitor, tmp3_bucket, tmp2_hash);

      // Check if the monitor in the bucket is special (empty, tombstone or removed).
      cmpldi(CR0, monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      blt(CR0, slow_path);

      // Check if object matches.
      ld(tmp3, in_bytes(ObjectMonitor::object_offset()), monitor);
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_peek_weak_handle_in_nmethod(this, tmp3, tmp3, tmp2, slow_path);
      cmpd(CR0, tmp3, obj);
      bne(CR0, slow_path);

      bind(monitor_found);

      // Compute owner address.
      addi(owner_addr, monitor, in_bytes(ObjectMonitor::owner_offset()));
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    assert_different_registers(thread_id, monitor, owner_addr, box, R0);
    ld(thread_id, in_bytes(JavaThread::monitor_owner_id_offset()), R16_thread);
    cmpxchgd(/*flag=*/CR0,
             /*current_value=*/R0,
             /*compare_value=*/(intptr_t)0,
             /*exchange_value=*/thread_id,
             /*where=*/owner_addr,
             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
             MacroAssembler::cmpxchgx_hint_acquire_lock());
    beq(CR0, monitor_locked);

    // Check if recursive.
    // R0 holds the owner value found by the CAS.
    cmpd(CR0, R0, thread_id);
    bne(CR0, slow_path);

    // Recursive.
    // Increment the recursion count; flag stays EQ from the compare above.
    if (!UseObjectMonitorTable) {
      assert_different_registers(tmp1, owner_addr);
      ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
      addi(tmp1, tmp1, 1);
      std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
    } else {
      assert_different_registers(tmp2, monitor);
      ld(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
      addi(tmp2, tmp2, 1);
      std(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
    }

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Remember the monitor in the box' monitor cache field.
      std(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with flag == EQ.
  Label flag_correct;
  beq(CR0, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif
  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag == NE.
  bne(CR0, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (NE vs EQ) to determine the continuation.
}
2878
// Emits the fast path for unlocking obj from compiled code.
//   flag - must be CR0 (asserted); carries the result, see below.
//   obj  - the object to unlock.
//   box  - the on-stack lock slot; only read here, and only with
//          UseObjectMonitorTable, to fetch the cached monitor.
//   tmp1, tmp2, tmp3 - temp registers.
// On exit CR0 is EQ if the unlock was completed here and NE if the caller has
// to take the slow path (checked by the ASSERT code at the end of this function).
void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register obj, Register box,
                                                 Register tmp1, Register tmp2, Register tmp3) {
  assert_different_registers(obj, tmp1, tmp2, tmp3);
  assert(flag == CR0, "bad condition register");

  // Handle inflated monitor.
  Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. MUST reach to with flag == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST branch to with flag == NE.
  Label slow_path;

  const Register mark = tmp1;
  const Register top = tmp2;
  const Register t = tmp3;

  { // Fast unlock
    Label push_and_slow;

    // Check if obj is top of lock-stack.
    // top is an offset relative to R16_thread; after the subi it addresses the topmost entry.
    lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
    subi(top, top, oopSize);
    ldx(t, R16_thread, top);
    cmpd(CR0, obj, t);
    // Top of lock stack was not obj. Must be monitor.
    bne(CR0, inflated_load_monitor);

    // Pop lock-stack.
    // Debug builds additionally zero the popped slot.
    DEBUG_ONLY(li(t, 0);)
    DEBUG_ONLY(stdx(t, R16_thread, top);)
    stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);

    // The underflow check is elided. The recursive check will always fail
    // when the lock stack is empty because of the _bad_oop_sentinel field.

    // Check if recursive.
    // If the entry below the popped one is obj as well, the pop was all there is to do.
    subi(t, top, oopSize);
    ldx(t, R16_thread, t);
    cmpd(CR0, obj, t);
    beq(CR0, unlocked);

    // Not recursive.

    // Check for monitor (0b10).
    // With UseObjectMonitorTable an inflated mark is not handled here:
    // the lock-stack entry is pushed back and the slow path is taken.
    ld(mark, oopDesc::mark_offset_in_bytes(), obj);
    andi_(t, mark, markWord::monitor_value);
    if (!UseObjectMonitorTable) {
      bne(CR0, inflated);
    } else {
      bne(CR0, push_and_slow);
    }

#ifdef ASSERT
    // Check header not unlocked (0b01).
    Label not_unlocked;
    andi_(t, mark, markWord::unlocked_value);
    beq(CR0, not_unlocked);
    stop("fast_unlock already unlocked");
    bind(not_unlocked);
#endif

    // Try to unlock. Transition lock bits 0b00 => 0b01
    atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel);
    b(unlocked);

    bind(push_and_slow);
    // Restore lock-stack and handle the unlock in runtime.
    DEBUG_ONLY(stdx(obj, R16_thread, top);)
    addi(top, top, oopSize);
    stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
    b(slow_path);
  }

  { // Handle inflated monitor.
    bind(inflated_load_monitor);
    ld(mark, oopDesc::mark_offset_in_bytes(), obj);
#ifdef ASSERT
    andi_(t, mark, markWord::monitor_value);
    bne(CR0, inflated);
    stop("Fast Unlock not monitor");
#endif

    bind(inflated);

#ifdef ASSERT
    // Debug check: step down the lock-stack one entry per iteration (the bne
    // loops back to inflated) and stop if obj is found on it.
    Label check_done;
    subi(top, top, oopSize);
    cmplwi(CR0, top, in_bytes(JavaThread::lock_stack_base_offset()));
    blt(CR0, check_done);
    ldx(t, R16_thread, top);
    cmpd(CR0, obj, t);
    bne(CR0, inflated);
    stop("Fast Unlock lock on stack");
    bind(check_done);
#endif

    // mark contains the tagged ObjectMonitor*.
    const Register monitor = mark;
    const uintptr_t monitor_tag = markWord::monitor_value;

    if (!UseObjectMonitorTable) {
      // Untag the monitor.
      subi(monitor, mark, monitor_tag);
    } else {
      // Take the monitor from the box' monitor cache field.
      ld(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
      // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
      cmpldi(CR0, monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
      blt(CR0, slow_path);
    }

    // tmp2 is reused: top is no longer needed from here on.
    const Register recursions = tmp2;
    Label not_recursive;

    // Check if recursive.
    // addic_ records the result in CR0: a negative result means the count was zero.
    ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
    addic_(recursions, recursions, -1);
    blt(CR0, not_recursive);

    // Recursive unlock.
    std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
    // EQ := EQ | !EQ, i.e. force flag == EQ.
    crorc(CR0, Assembler::equal, CR0, Assembler::equal);
    b(unlocked);

    bind(not_recursive);

    // Set owner to null.
    // Release to satisfy the JMM
    release();
    li(t, 0);
    std(t, in_bytes(ObjectMonitor::owner_offset()), monitor);
    // We need a full fence after clearing owner to avoid stranding.
    // StoreLoad achieves this.
    membar(StoreLoad);

    // Check if the entry_list is empty.
    ld(t, in_bytes(ObjectMonitor::entry_list_offset()), monitor);
    cmpdi(CR0, t, 0);
    beq(CR0, unlocked); // If so we are done.

    // Check if there is a successor.
    ld(t, in_bytes(ObjectMonitor::succ_offset()), monitor);
    cmpdi(CR0, t, 0);
    // Invert equal bit
    // (EQ after the compare means "no successor"; success has to be reported as EQ.)
    crnand(flag, Assembler::equal, flag, Assembler::equal);
    beq(CR0, unlocked); // If there is a successor we are done.

    // Save the monitor pointer in the current thread, so we can try
    // to reacquire the lock in SharedRuntime::monitor_exit_helper().
    std(monitor, in_bytes(JavaThread::unlocked_inflated_monitor_offset()), R16_thread);
    b(slow_path); // flag == NE
  }

  bind(unlocked);

#ifdef ASSERT
  // Check that unlocked label is reached with flag == EQ.
  Label flag_correct;
  beq(CR0, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif
  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag == NE.
  bne(CR0, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (NE vs EQ) to determine the continuation.
}
3048
3049 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
3050 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
3051
3052 if (at_return) {
3053 if (in_nmethod) {
3054 if (UseSIGTRAP) {
3055 // Use Signal Handler.
3056 relocate(relocInfo::poll_return_type);
3057 td(traptoGreaterThanUnsigned, R1_SP, temp);
3058 } else {
3059 cmpld(CR0, R1_SP, temp);
3060 // Stub may be out of range for short conditional branch.
3061 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CR0, Assembler::greater), slow_path);
3062 }
3063 } else { // Not in nmethod.
3064 // Frame still on stack, need to get fp.
3065 Register fp = R0;
3066 ld(fp, _abi0(callers_sp), R1_SP);
3067 cmpld(CR0, fp, temp);
3068 bgt(CR0, slow_path);
3069 }
3070 } else { // Normal safepoint poll. Not at return.
3071 assert(!in_nmethod, "should use load_from_polling_page");
3072 andi_(temp, temp, SafepointMechanism::poll_bit());
3073 bne(CR0, slow_path);
3074 }
3075 }
3076
3077 void MacroAssembler::jump_to_polling_page_return_handler_blob(int safepoint_offset, bool fixed_size) {
3078 assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
3079 "polling page return stub not created yet");
3080 address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
3081
3082 // Determine saved exception pc using pc relative address computation.
3083 {
3084 Label next_pc;
3085 bl(next_pc);
3086 bind(next_pc);
3087 }
3088 int current_offset = offset();
3089
3090 if (fixed_size) {
3091 // Code size must not depend on offsets.
3092 load_const32(R12, safepoint_offset - current_offset);
3093 mflr(R0);
3094 add(R12, R12, R0);
3095 } else {
3096 mflr(R12);
3097 add_const_optimized(R12, R12, safepoint_offset - current_offset);
3098 }
3099 std(R12, in_bytes(JavaThread::saved_exception_pc_offset()), R16_thread);
3100
3101 add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
3102 mtctr(R0);
3103 bctr();
3104 }
3105
3106 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
3107 MacroAssembler::PreservationLevel preservation_level) {
3108 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3109 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
3110 }
3111
3112 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
3113 MacroAssembler::PreservationLevel preservation_level) {
3114 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3115 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
3116 }
3117
3118 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3119 // in frame_ppc.hpp.
3120 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3121 // Always set last_Java_pc and flags first because once last_Java_sp
3122 // is visible has_last_Java_frame is true and users will look at the
3123 // rest of the fields. (Note: flags should always be zero before we
3124 // get here so doesn't need to be set.)
3125
3126 // Verify that last_Java_pc was zeroed on return to Java
3127 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3128 "last_Java_pc not zeroed before leaving Java");
3129
3130 // When returning from calling out from Java mode the frame anchor's
3131 // last_Java_pc will always be set to null. It is set here so that
3132 // if we are doing a call to native (not VM) that we capture the
3133 // known pc and don't have to rely on the native call having a
3134 // standard frame linkage where we can find the pc.
3135 if (last_Java_pc != noreg)
3136 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3137
3138 // Set last_Java_sp last.
3139 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3140 }
3141
3142 void MacroAssembler::reset_last_Java_frame(bool check_last_java_sp) {
3143 if (check_last_java_sp) {
3144 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3145 R16_thread, "SP was not set, still zero");
3146 }
3147
3148 BLOCK_COMMENT("reset_last_Java_frame {");
3149 li(R0, 0);
3150
3151 // _last_Java_sp = 0
3152 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3153
3154 // _last_Java_pc = 0
3155 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3156 BLOCK_COMMENT("} reset_last_Java_frame");
3157 }
3158
3159 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1, Label* jpc) {
3160 assert_different_registers(sp, tmp1);
3161
3162 if (jpc == nullptr || jpc->is_bound()) {
3163 load_const_optimized(tmp1, jpc == nullptr ? pc() : target(*jpc));
3164 } else {
3165 load_const(tmp1, *jpc, R12_scratch2);
3166 }
3167
3168 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3169 }
3170
3171 void MacroAssembler::get_vm_result_oop(Register oop_result) {
3172 // Read:
3173 // R16_thread
3174 // R16_thread->in_bytes(JavaThread::vm_result_oop_offset())
3175 //
3176 // Updated:
3177 // oop_result
3178 // R16_thread->in_bytes(JavaThread::vm_result_oop_offset())
3179
3180 ld(oop_result, in_bytes(JavaThread::vm_result_oop_offset()), R16_thread);
3181 li(R0, 0);
3182 std(R0, in_bytes(JavaThread::vm_result_oop_offset()), R16_thread);
3183
3184 verify_oop(oop_result, FILE_AND_LINE);
3185 }
3186
3187 void MacroAssembler::get_vm_result_metadata(Register metadata_result) {
3188 // Read:
3189 // R16_thread
3190 // R16_thread->in_bytes(JavaThread::vm_result_metadata_offset())
3191 //
3192 // Updated:
3193 // metadata_result
3194 // R16_thread->in_bytes(JavaThread::vm_result_metadata_offset())
3195
3196 ld(metadata_result, in_bytes(JavaThread::vm_result_metadata_offset()), R16_thread);
3197 li(R0, 0);
3198 std(R0, in_bytes(JavaThread::vm_result_metadata_offset()), R16_thread);
3199 }
3200
3201 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3202 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3203 if (CompressedKlassPointers::base() != nullptr) {
3204 // Use dst as temp if it is free.
3205 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3206 current = dst;
3207 }
3208 if (CompressedKlassPointers::shift() != 0) {
3209 srdi(dst, current, CompressedKlassPointers::shift());
3210 current = dst;
3211 }
3212 return current;
3213 }
3214
3215 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3216 assert(!UseCompactObjectHeaders, "not with compact headers");
3217 Register compressedKlass = encode_klass_not_null(ck, klass);
3218 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3219 }
3220
3221 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3222 assert(!UseCompactObjectHeaders, "not with compact headers");
3223 if (val == noreg) {
3224 val = R0;
3225 li(val, 0);
3226 }
3227 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop);
3228 }
3229
3230 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3231 assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3232 if (src == noreg) src = dst;
3233 Register shifted_src = src;
3234 if (CompressedKlassPointers::shift() != 0 ||
3235 (CompressedKlassPointers::base() == nullptr && src != dst)) { // Move required.
3236 shifted_src = dst;
3237 sldi(shifted_src, src, CompressedKlassPointers::shift());
3238 }
3239 if (CompressedKlassPointers::base() != nullptr) {
3240 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3241 }
3242 }
3243
3244 void MacroAssembler::load_klass_no_decode(Register dst, Register src) {
3245 if (UseCompactObjectHeaders) {
3246 load_narrow_klass_compact(dst, src);
3247 } else {
3248 lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3249 }
3250 }
3251
3252 void MacroAssembler::load_klass(Register dst, Register src) {
3253 load_klass_no_decode(dst, src);
3254 decode_klass_not_null(dst);
3255 }
3256
3257 // Loads the obj's Klass* into dst.
3258 // Preserves all registers (incl src, rscratch1 and rscratch2).
3259 // Input:
3260 // src - the oop we want to load the klass from.
3261 // dst - output nklass.
3262 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3263 assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
3264 ld(dst, oopDesc::mark_offset_in_bytes(), src);
3265 srdi(dst, dst, markWord::klass_shift);
3266 }
3267
3268 void MacroAssembler::cmp_klass(ConditionRegister dst, Register obj, Register klass, Register tmp, Register tmp2) {
3269 assert_different_registers(obj, klass, tmp);
3270 if (UseCompactObjectHeaders) {
3271 load_narrow_klass_compact(tmp, obj);
3272 } else {
3273 lwz(tmp, oopDesc::klass_offset_in_bytes(), obj);
3274 }
3275 Register encoded_klass = encode_klass_not_null(tmp2, klass);
3276 cmpw(dst, tmp, encoded_klass);
3277 }
3278
3279 void MacroAssembler::cmp_klasses_from_objects(ConditionRegister dst, Register obj1, Register obj2, Register tmp1, Register tmp2) {
3280 if (UseCompactObjectHeaders) {
3281 load_narrow_klass_compact(tmp1, obj1);
3282 load_narrow_klass_compact(tmp2, obj2);
3283 cmpw(dst, tmp1, tmp2);
3284 } else {
3285 lwz(tmp1, oopDesc::klass_offset_in_bytes(), obj1);
3286 lwz(tmp2, oopDesc::klass_offset_in_bytes(), obj2);
3287 cmpw(dst, tmp1, tmp2);
3288 }
3289 }
3290
3291 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
3292 null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
3293 load_klass(dst, src);
3294 }
3295
3296 // ((OopHandle)result).resolve();
3297 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
3298 MacroAssembler::PreservationLevel preservation_level) {
3299 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
3300 }
3301
3302 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
3303 MacroAssembler::PreservationLevel preservation_level) {
3304 Label resolved;
3305
3306 // A null weak handle resolves to null.
3307 cmpdi(CR0, result, 0);
3308 beq(CR0, resolved);
3309
3310 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3311 preservation_level);
3312 bind(resolved);
3313 }
3314
3315 void MacroAssembler::load_method_holder(Register holder, Register method) {
3316 ld(holder, in_bytes(Method::const_offset()), method);
3317 ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3318 ld(holder, ConstantPool::pool_holder_offset(), holder);
3319 }
3320
3321 // Clear Array
3322 // For very short arrays. tmp == R0 is allowed.
3323 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3324 if (cnt_dwords > 0) { li(tmp, 0); }
3325 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3326 }
3327
3328 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3329 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3330 if (cnt_dwords < 8) {
3331 clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3332 return;
3333 }
3334
3335 Label loop;
3336 const long loopcnt = cnt_dwords >> 1,
3337 remainder = cnt_dwords & 1;
3338
3339 li(tmp, loopcnt);
3340 mtctr(tmp);
3341 li(tmp, 0);
3342 bind(loop);
3343 std(tmp, 0, base_ptr);
3344 std(tmp, 8, base_ptr);
3345 addi(base_ptr, base_ptr, 16);
3346 bdnz(loop);
3347 if (remainder) { std(tmp, 0, base_ptr); }
3348 }
3349
3350 // Kills both input registers. tmp == R0 is allowed.
3351 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3352 // Procedure for large arrays (uses data cache block zero instruction).
3353 Label startloop, fast, fastloop, small_rest, restloop, done;
3354 const int cl_size = VM_Version::L1_data_cache_line_size(),
3355 cl_dwords = cl_size >> 3,
3356 cl_dw_addr_bits = exact_log2(cl_dwords),
3357 dcbz_min = 1, // Min count of dcbz executions, needs to be >0.
3358 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3359
3360 if (const_cnt >= 0) {
3361 // Constant case.
3362 if (const_cnt < min_cnt) {
3363 clear_memory_constlen(base_ptr, const_cnt, tmp);
3364 return;
3365 }
3366 load_const_optimized(cnt_dwords, const_cnt, tmp);
3367 } else {
3368 // cnt_dwords already loaded in register. Need to check size.
3369 cmpdi(CR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3370 blt(CR1, small_rest);
3371 }
3372 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3373 beq(CR0, fast); // Already 128byte aligned.
3374
3375 subfic(tmp, tmp, cl_dwords);
3376 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3377 subf(cnt_dwords, tmp, cnt_dwords); // rest.
3378 li(tmp, 0);
3379
3380 bind(startloop); // Clear at the beginning to reach 128byte boundary.
3381 std(tmp, 0, base_ptr); // Clear 8byte aligned block.
3382 addi(base_ptr, base_ptr, 8);
3383 bdnz(startloop);
3384
3385 bind(fast); // Clear 128byte blocks.
3386 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0).
3387 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3388 mtctr(tmp); // Load counter.
3389
3390 bind(fastloop);
3391 dcbz(base_ptr); // Clear 128byte aligned block.
3392 addi(base_ptr, base_ptr, cl_size);
3393 bdnz(fastloop);
3394
3395 bind(small_rest);
3396 cmpdi(CR0, cnt_dwords, 0); // size 0?
3397 beq(CR0, done); // rest == 0
3398 li(tmp, 0);
3399 mtctr(cnt_dwords); // Load counter.
3400
3401 bind(restloop); // Clear rest.
3402 std(tmp, 0, base_ptr); // Clear 8byte aligned block.
3403 addi(base_ptr, base_ptr, 8);
3404 bdnz(restloop);
3405
3406 bind(done);
3407 }
3408
3409 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3410
3411 // Helpers for Intrinsic Emitters
3412 //
3413 // Revert the byte order of a 32bit value in a register
3414 // src: 0x44556677
3415 // dst: 0x77665544
3416 // Three steps to obtain the result:
3417 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3418 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3419 // This value initializes dst.
3420 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3421 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3422 // This value is mask inserted into dst with a [0..23] mask of 1s.
3423 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3424 // This value is mask inserted into dst with a [8..15] mask of 1s.
3425 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3426 assert_different_registers(dst, src);
3427
3428 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3429 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3430 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
3431 }
3432
3433 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3434 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3435 // body size from 20 to 16 instructions.
3436 // Returns the offset that was used to calculate the address of column tc3.
3437 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3438 // at hand, the original table address can be easily reconstructed.
3439 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3440
3441 // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3442 // Layout: See StubRoutines::ppc::generate_crc_constants.
3443 #ifdef VM_LITTLE_ENDIAN
3444 const int ix0 = 3 * CRC32_TABLE_SIZE;
3445 const int ix1 = 2 * CRC32_TABLE_SIZE;
3446 const int ix2 = 1 * CRC32_TABLE_SIZE;
3447 const int ix3 = 0 * CRC32_TABLE_SIZE;
3448 #else
3449 const int ix0 = 1 * CRC32_TABLE_SIZE;
3450 const int ix1 = 2 * CRC32_TABLE_SIZE;
3451 const int ix2 = 3 * CRC32_TABLE_SIZE;
3452 const int ix3 = 4 * CRC32_TABLE_SIZE;
3453 #endif
3454 assert_different_registers(table, tc0, tc1, tc2);
3455 assert(table == tc3, "must be!");
3456
3457 addi(tc0, table, ix0);
3458 addi(tc1, table, ix1);
3459 addi(tc2, table, ix2);
3460 if (ix3 != 0) addi(tc3, table, ix3);
3461
3462 return ix3;
3463 }
3464
3465 /**
3466 * uint32_t crc;
3467 * table[crc & 0xFF] ^ (crc >> 8);
3468 */
3469 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3470 assert_different_registers(crc, table, tmp);
3471 assert_different_registers(val, table);
3472
3473 if (crc == val) { // Must rotate first to use the unmodified value.
3474 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3475 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3476 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3477 } else {
3478 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3479 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3480 }
3481 lwzx(tmp, table, tmp);
3482 xorr(crc, crc, tmp);
3483 }
3484
3485 /**
3486 * Emits code to update CRC-32 with a byte value according to constants in table.
3487 *
3488 * @param [in,out]crc Register containing the crc.
3489 * @param [in]val Register containing the byte to fold into the CRC.
3490 * @param [in]table Register containing the table of crc constants.
3491 *
3492 * uint32_t crc;
3493 * val = crc_table[(val ^ crc) & 0xFF];
3494 * crc = val ^ (crc >> 8);
3495 */
3496 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3497 BLOCK_COMMENT("update_byte_crc32:");
3498 xorr(val, val, crc);
3499 fold_byte_crc32(crc, val, table, val);
3500 }
3501
3502 /**
3503 * @param crc register containing existing CRC (32-bit)
3504 * @param buf register pointing to input byte buffer (byte*)
3505 * @param len register containing number of bytes
3506 * @param table register pointing to CRC table
3507 */
3508 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3509 Register data, bool loopAlignment) {
3510 assert_different_registers(crc, buf, len, table, data);
3511
3512 Label L_mainLoop, L_done;
3513 const int mainLoop_stepping = 1;
3514 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3515
3516 // Process all bytes in a single-byte loop.
3517 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
3518 beq(CR0, L_done);
3519
3520 mtctr(len);
3521 align(mainLoop_alignment);
3522 BIND(L_mainLoop);
3523 lbz(data, 0, buf); // Byte from buffer, zero-extended.
3524 addi(buf, buf, mainLoop_stepping); // Advance buffer position.
3525 update_byte_crc32(crc, data, table);
3526 bdnz(L_mainLoop); // Iterate.
3527
3528 bind(L_done);
3529 }
3530
3531 /**
3532 * Emits code to update CRC-32 with a 4-byte value according to constants in table
3533 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3534 */
3535 // A note on the lookup table address(es):
3536 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3537 // To save the effort of adding the column offset to the table address each time
3538 // a table element is looked up, it is possible to pass the pre-calculated
3539 // column addresses.
3540 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
3541 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3542 Register t0, Register t1, Register t2, Register t3,
3543 Register tc0, Register tc1, Register tc2, Register tc3) {
3544 assert_different_registers(crc, t3);
3545
3546 // XOR crc with next four bytes of buffer.
3547 lwz(t3, bufDisp, buf);
3548 if (bufInc != 0) {
3549 addi(buf, buf, bufInc);
3550 }
3551 xorr(t3, t3, crc);
3552
3553 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3554 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2
3555 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2
3556 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2
3557 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2
3558
3559 // Use the pre-calculated column addresses.
3560 // Load pre-calculated table values.
3561 lwzx(t0, tc0, t0);
3562 lwzx(t1, tc1, t1);
3563 lwzx(t2, tc2, t2);
3564 lwzx(t3, tc3, t3);
3565
3566 // Calculate new crc from table values.
3567 xorr(t0, t0, t1);
3568 xorr(t2, t2, t3);
3569 xorr(crc, t0, t2); // Now crc contains the final checksum value.
3570 }
3571
3572
3573 /**
3574 * @param crc register containing existing CRC (32-bit)
3575 * @param buf register pointing to input byte buffer (byte*)
3576 * @param len register containing number of bytes
3577 * @param constants register pointing to precomputed constants
3578 * @param t0-t6 temp registers
3579 */
3580 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3581 Register t0, Register t1, Register t2, Register t3,
3582 Register t4, Register t5, Register t6, bool invertCRC) {
3583 assert_different_registers(crc, buf, len, constants);
3584
3585 Label L_tail;
3586
3587 BLOCK_COMMENT("kernel_crc32_vpmsum {");
3588
3589 if (invertCRC) {
3590 nand(crc, crc, crc); // 1s complement of crc
3591 }
3592
3593 // Enforce 32 bit.
3594 clrldi(len, len, 32);
3595
3596 // Align if we have enough bytes for the fast version.
3597 const int alignment = 16,
3598 threshold = 32;
3599 Register prealign = t0;
3600
3601 neg(prealign, buf);
3602 addi(t1, len, -threshold);
3603 andi(prealign, prealign, alignment - 1);
3604 cmpw(CR0, t1, prealign);
3605 blt(CR0, L_tail); // len - prealign < threshold?
3606
3607 subf(len, prealign, len);
3608 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3609
3610 // Calculate from first aligned address as far as possible.
3611 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3612 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3613 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3614
3615 // Remaining bytes.
3616 BIND(L_tail);
3617 update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3618
3619 if (invertCRC) {
3620 nand(crc, crc, crc); // 1s complement of crc
3621 }
3622
3623 BLOCK_COMMENT("} kernel_crc32_vpmsum");
3624 }
3625
/**
 * Emits the vector (vpmsum) part of the CRC-32 kernel. It consumes as many
 * 16 byte blocks of the buffer as possible and leaves the remaining byte count in len.
 *
 * The code is frameless: R14, R15 and VR20..VR25 (plus VR26 on Big Endian) are
 * stored at negative offsets from R1_SP on entry and reloaded on exit.
 * The crc register is reused as a pointer into the constants while the CRC value
 * lives in a vector register; the result is moved back into crc at the end.
 *
 * @param crc       register containing existing CRC (32-bit)
 * @param buf       register pointing to input byte buffer (byte*)
 * @param len       register containing number of bytes (will get updated to remaining bytes)
 * @param constants register pointing to CRC table for 128-bit aligned memory
 * @param t0-t6     temp registers
 */
void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
    Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {

  // Save non-volatile vector registers (frameless).
  // stvx only has an indexed form, so t1 carries the (negative) offset from R1_SP.
  Register offset = t1;
  int offsetInt = 0;
  offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
#ifndef VM_LITTLE_ENDIAN
  offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
#endif
  // The two general purpose registers used below follow the vector save area.
  offsetInt -= 8; std(R14, offsetInt, R1_SP);
  offsetInt -= 8; std(R15, offsetInt, R1_SP);

  // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
  // bytes per iteration. The basic scheme is:
  // lvx: load vector (Big Endian needs reversal)
  // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
  // vxor: xor partial results together to get unroll_factor2 vectors

  // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.

  // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
  const int unroll_factor = CRC32_UNROLL_FACTOR,
            unroll_factor2 = CRC32_UNROLL_FACTOR2;

  // Sizes (in bytes) of the two groups of vector constants which start at 'constants'.
  const int outer_consts_size = (unroll_factor2 - 1) * 16,
            inner_consts_size = (unroll_factor / unroll_factor2) * 16;

  // Support registers.
  // offs[i] is loaded with the byte offset 16 * i below. Offset 0 needs no register
  // (the non-indexed lvx form is used), hence noreg in slot 0.
  Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
  Register num_bytes = R14,
           loop_count = R15,
           cur_const = crc; // will live in VCRC
  // Constant array for outer loop: unroll_factor2 - 1 registers,
  // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
  VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
                 consts1[] = { VR23, VR24 };
  // Data register arrays: 2 arrays with unroll_factor2 registers.
  VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
                 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };

  VectorRegister VCRC = data0[0];
  VectorRegister Vc = VR25;
  VectorRegister swap_bytes = VR26; // Only for Big Endian.

  // We have at least 1 iteration (ensured by caller).
  Label L_outer_loop, L_inner_loop, L_last;

  // Set DSCR pre-fetch to deepest.
  if (VM_Version::has_mfdscr()) {
    load_const_optimized(t0, VM_Version::_dscr_val | 7);
    mtdscr(t0);
  }

  // From here on the crc register is free to be used as cur_const.
  mtvrwz(VCRC, crc); // crc lives in VCRC, now

  for (int i = 1; i < unroll_factor2; ++i) {
    li(offs[i], 16 * i);
  }

  // Load consts for outer loop
  lvx(consts0[0], constants);
  for (int i = 1; i < unroll_factor2 - 1; ++i) {
    lvx(consts0[i], offs[i], constants);
  }

  // Number of bytes consumed by one pass of the outer loop with full unrolling.
  load_const_optimized(num_bytes, 16 * unroll_factor);

  // Reuse data registers outside of the loop.
  VectorRegister Vtmp = data1[0];
  VectorRegister Vtmp2 = data1[1];
  VectorRegister zeroes = data1[2];

  vspltisb(Vtmp, 0);
  vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.

  // Load vector for vpermxor (to xor both 64 bit parts together)
  lvsl(Vtmp, buf);   // 000102030405060708090a0b0c0d0e0f
  vspltisb(Vc, 4);
  vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
  xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
  vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f

  // BE_swap_bytes(x) reverses the bytes of a loaded vector on Big Endian
  // and expands to nothing on Little Endian.
#ifdef VM_LITTLE_ENDIAN
#define BE_swap_bytes(x)
#else
  vspltisb(Vtmp2, 0xf);
  vxor(swap_bytes, Vtmp, Vtmp2);
#define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
#endif

  // Not enough data for a fully unrolled pass: try with a lower num_bytes.
  cmpd(CR0, len, num_bytes);
  blt(CR0, L_last);

  addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
  load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.

  // ********** Main loop start **********
  align(32);
  bind(L_outer_loop);

  // Begin of unrolled first iteration (no xor).
  lvx(data1[0], buf);
  for (int i = 1; i < unroll_factor2 / 2; ++i) {
    lvx(data1[i], offs[i], buf);
  }
  vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
  lvx(consts1[0], cur_const);
  mtctr(loop_count);
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i]);
    if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
    lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
    vpmsumw(data0[i], data1[i], consts1[0]);
  }
  addi(buf, buf, 16 * unroll_factor2);
  // len is reduced by the whole pass up front; num_bytes may have been lowered at L_last.
  subf(len, num_bytes, len);
  lvx(consts1[1], offs[1], cur_const);
  addi(cur_const, cur_const, 32);
  // Begin of unrolled second iteration (head).
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i + unroll_factor2 / 2]);
    if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
    vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
  }
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i]);
    lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
    vpmsumw(data1[i], data1[i], consts1[1]);
  }
  addi(buf, buf, 16 * unroll_factor2);

  // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
  // Double-iteration allows using the 2 constant registers alternatingly.
  align(32);
  bind(L_inner_loop);
  for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
    if (j & 1) {
      lvx(consts1[0], cur_const);
    } else {
      lvx(consts1[1], offs[1], cur_const);
      addi(cur_const, cur_const, 32);
    }
    for (int i = 0; i < unroll_factor2; ++i) {
      int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
      if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
      BE_swap_bytes(data1[idx]);
      vxor(data0[i], data0[i], data1[i]);
      if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
      vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
    }
    addi(buf, buf, 16 * unroll_factor2);
  }
  bdnz(L_inner_loop);

  addi(cur_const, constants, outer_consts_size); // Reset

  // Tail of last iteration (no loads).
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    BE_swap_bytes(data1[i + unroll_factor2 / 2]);
    vxor(data0[i], data0[i], data1[i]);
    vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
  }
  for (int i = 0; i < unroll_factor2 / 2; ++i) {
    vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
    vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
  }

  // Last data register is ok, other ones need fixup shift.
  for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
    vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
  }

  // Combine to 128 bit result vector VCRC = data0[0].
  // Pairwise xor with doubling distance: (0,1) (2,3) ..., then (0,2) (4,6), ...
  for (int i = 1; i < unroll_factor2; i<<=1) {
    for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
      vxor(data0[j], data0[j], data0[j+i]);
    }
  }
  // Another pass with the current num_bytes, if enough data is left.
  cmpd(CR0, len, num_bytes);
  bge(CR0, L_outer_loop);

  // Last chance with lower num_bytes.
  bind(L_last);
  srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
  // Point behind last const for inner loop.
  add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
  sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
  clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
  subf(cur_const, R0, cur_const); // Point to constant to be used first.

  // Re-enter the outer loop only if more than the peeled double-iteration remains.
  addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
  bgt(CR0, L_outer_loop);
  // ********** Main loop end **********

  // Restore DSCR pre-fetch value.
  if (VM_Version::has_mfdscr()) {
    load_const_optimized(t0, VM_Version::_dscr_val);
    mtdscr(t0);
  }

  // ********** Simple loop for remaining 16 byte blocks **********
  {
    Label L_loop, L_done;

    // t0 = number of 16 byte blocks, len = bytes left for the caller.
    // The record form of srdi_ sets CR0 for the branch below; clrldi leaves it alone.
    srdi_(t0, len, 4); // 16 bytes per iteration
    clrldi(len, len, 64-4);
    beq(CR0, L_done);

    // Point to const (same as last const for inner loop).
    add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
    mtctr(t0);
    lvx(Vtmp2, cur_const);

    align(32);
    bind(L_loop);

    lvx(Vtmp, buf);
    addi(buf, buf, 16);
    vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
    BE_swap_bytes(Vtmp);
    vxor(VCRC, VCRC, Vtmp);
    vpmsumw(VCRC, VCRC, Vtmp2);
    bdnz(L_loop);

    bind(L_done);
  }
  // ********** Simple loop end **********
#undef BE_swap_bytes

  // Point to Barrett constants
  add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);

  vspltisb(zeroes, 0);

  // Combine to 64 bit result.
  vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.

  // Reduce to 32 bit CRC: Remainder by multiply-high.
  lvx(Vtmp, cur_const);
  vsldoi(Vtmp2, zeroes, VCRC, 12);  // Extract high 32 bit.
  vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply by inverse long poly.
  vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
  vsldoi(Vtmp, zeroes, Vtmp, 8);
  vpmsumd(Vtmp2, Vtmp2, Vtmp);      // Multiply quotient by long poly.
  vxor(VCRC, VCRC, Vtmp2);          // Remainder fits into 32 bit.

  // Move result. len is already updated.
  // This overwrites cur_const, which shares the crc register.
  vsldoi(VCRC, VCRC, zeroes, 8);
  mfvrd(crc, VCRC);

  // Restore non-volatile Vector registers (frameless).
  // Same offsets and order as the save sequence at the top.
  offsetInt = 0;
  offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
  offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
#ifndef VM_LITTLE_ENDIAN
  offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
#endif
  offsetInt -= 8;  ld(R14, offsetInt, R1_SP);
  offsetInt -= 8;  ld(R15, offsetInt, R1_SP);
}
3903
3904 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3905 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3906 load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3907 : StubRoutines::crc_table_addr() , R0);
3908
3909 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3910 }
3911
3912 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3913 assert_different_registers(crc, val, table);
3914
3915 BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3916 if (invertCRC) {
3917 nand(crc, crc, crc); // 1s complement of crc
3918 }
3919
3920 update_byte_crc32(crc, val, table);
3921
3922 if (invertCRC) {
3923 nand(crc, crc, crc); // 1s complement of crc
3924 }
3925 }
3926
3927 // dest_lo += src1 + src2
3928 // dest_hi += carry1 + carry2
3929 void MacroAssembler::add2_with_carry(Register dest_hi,
3930 Register dest_lo,
3931 Register src1, Register src2) {
3932 li(R0, 0);
3933 addc(dest_lo, dest_lo, src1);
3934 adde(dest_hi, dest_hi, R0);
3935 addc(dest_lo, dest_lo, src2);
3936 adde(dest_hi, dest_hi, R0);
3937 }
3938
3939 // Multiply 64 bit by 64 bit first loop.
3940 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3941 Register x_xstart,
3942 Register y, Register y_idx,
3943 Register z,
3944 Register carry,
3945 Register product_high, Register product,
3946 Register idx, Register kdx,
3947 Register tmp) {
3948 // jlong carry, x[], y[], z[];
3949 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3950 // huge_128 product = y[idx] * x[xstart] + carry;
3951 // z[kdx] = (jlong)product;
3952 // carry = (jlong)(product >>> 64);
3953 // }
3954 // z[xstart] = carry;
3955
3956 Label L_first_loop, L_first_loop_exit;
3957 Label L_one_x, L_one_y, L_multiply;
3958
3959 addic_(xstart, xstart, -1);
3960 blt(CR0, L_one_x); // Special case: length of x is 1.
3961
3962 // Load next two integers of x.
3963 sldi(tmp, xstart, LogBytesPerInt);
3964 ldx(x_xstart, x, tmp);
3965 #ifdef VM_LITTLE_ENDIAN
3966 rldicl(x_xstart, x_xstart, 32, 0);
3967 #endif
3968
3969 align(32, 16);
3970 bind(L_first_loop);
3971
3972 cmpdi(CR0, idx, 1);
3973 blt(CR0, L_first_loop_exit);
3974 addi(idx, idx, -2);
3975 beq(CR0, L_one_y);
3976
3977 // Load next two integers of y.
3978 sldi(tmp, idx, LogBytesPerInt);
3979 ldx(y_idx, y, tmp);
3980 #ifdef VM_LITTLE_ENDIAN
3981 rldicl(y_idx, y_idx, 32, 0);
3982 #endif
3983
3984
3985 bind(L_multiply);
3986 multiply64(product_high, product, x_xstart, y_idx);
3987
3988 li(tmp, 0);
3989 addc(product, product, carry); // Add carry to result.
3990 adde(product_high, product_high, tmp); // Add carry of the last addition.
3991 addi(kdx, kdx, -2);
3992
3993 // Store result.
3994 #ifdef VM_LITTLE_ENDIAN
3995 rldicl(product, product, 32, 0);
3996 #endif
3997 sldi(tmp, kdx, LogBytesPerInt);
3998 stdx(product, z, tmp);
3999 mr_if_needed(carry, product_high);
4000 b(L_first_loop);
4001
4002
4003 bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4004
4005 lwz(y_idx, 0, y);
4006 b(L_multiply);
4007
4008
4009 bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4010
4011 lwz(x_xstart, 0, x);
4012 b(L_first_loop);
4013
4014 bind(L_first_loop_exit);
4015 }
4016
4017 // Multiply 64 bit by 64 bit and add 128 bit.
4018 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4019 Register z, Register yz_idx,
4020 Register idx, Register carry,
4021 Register product_high, Register product,
4022 Register tmp, int offset) {
4023
4024 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4025 // z[kdx] = (jlong)product;
4026
4027 sldi(tmp, idx, LogBytesPerInt);
4028 if (offset) {
4029 addi(tmp, tmp, offset);
4030 }
4031 ldx(yz_idx, y, tmp);
4032 #ifdef VM_LITTLE_ENDIAN
4033 rldicl(yz_idx, yz_idx, 32, 0);
4034 #endif
4035
4036 multiply64(product_high, product, x_xstart, yz_idx);
4037 ldx(yz_idx, z, tmp);
4038 #ifdef VM_LITTLE_ENDIAN
4039 rldicl(yz_idx, yz_idx, 32, 0);
4040 #endif
4041
4042 add2_with_carry(product_high, product, carry, yz_idx);
4043
4044 sldi(tmp, idx, LogBytesPerInt);
4045 if (offset) {
4046 addi(tmp, tmp, offset);
4047 }
4048 #ifdef VM_LITTLE_ENDIAN
4049 rldicl(product, product, 32, 0);
4050 #endif
4051 stdx(product, z, tmp);
4052 }
4053
4054 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4055 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4056 Register y, Register z,
4057 Register yz_idx, Register idx, Register carry,
4058 Register product_high, Register product,
4059 Register carry2, Register tmp) {
4060
4061 // jlong carry, x[], y[], z[];
4062 // int kdx = ystart+1;
4063 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4064 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4065 // z[kdx+idx+1] = (jlong)product;
4066 // jlong carry2 = (jlong)(product >>> 64);
4067 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4068 // z[kdx+idx] = (jlong)product;
4069 // carry = (jlong)(product >>> 64);
4070 // }
4071 // idx += 2;
4072 // if (idx > 0) {
4073 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4074 // z[kdx+idx] = (jlong)product;
4075 // carry = (jlong)(product >>> 64);
4076 // }
4077
4078 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4079 const Register jdx = R0;
4080
4081 // Scale the index.
4082 srdi_(jdx, idx, 2);
4083 beq(CR0, L_third_loop_exit);
4084 mtctr(jdx);
4085
4086 align(32, 16);
4087 bind(L_third_loop);
4088
4089 addi(idx, idx, -4);
4090
4091 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4092 mr_if_needed(carry2, product_high);
4093
4094 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4095 mr_if_needed(carry, product_high);
4096 bdnz(L_third_loop);
4097
4098 bind(L_third_loop_exit); // Handle any left-over operand parts.
4099
4100 andi_(idx, idx, 0x3);
4101 beq(CR0, L_post_third_loop_done);
4102
4103 Label L_check_1;
4104
4105 addic_(idx, idx, -2);
4106 blt(CR0, L_check_1);
4107
4108 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4109 mr_if_needed(carry, product_high);
4110
4111 bind(L_check_1);
4112
4113 addi(idx, idx, 0x2);
4114 andi_(idx, idx, 0x1);
4115 addic_(idx, idx, -1);
4116 blt(CR0, L_post_third_loop_done);
4117
4118 sldi(tmp, idx, LogBytesPerInt);
4119 lwzx(yz_idx, y, tmp);
4120 multiply64(product_high, product, x_xstart, yz_idx);
4121 lwzx(yz_idx, z, tmp);
4122
4123 add2_with_carry(product_high, product, yz_idx, carry);
4124
4125 sldi(tmp, idx, LogBytesPerInt);
4126 stwx(product, z, tmp);
4127 srdi(product, product, 32);
4128
4129 sldi(product_high, product_high, 32);
4130 orr(product, product, product_high);
4131 mr_if_needed(carry, product);
4132
4133 bind(L_post_third_loop_done);
4134 } // multiply_128_x_128_loop
4135
4136 void MacroAssembler::muladd(Register out, Register in,
4137 Register offset, Register len, Register k,
4138 Register tmp1, Register tmp2, Register carry) {
4139
4140 // Labels
4141 Label LOOP, SKIP;
4142
4143 // Make sure length is positive.
4144 cmpdi (CR0, len, 0);
4145
4146 // Prepare variables
4147 subi (offset, offset, 4);
4148 li (carry, 0);
4149 ble (CR0, SKIP);
4150
4151 mtctr (len);
4152 subi (len, len, 1 );
4153 sldi (len, len, 2 );
4154
4155 // Main loop
4156 bind(LOOP);
4157 lwzx (tmp1, len, in );
4158 lwzx (tmp2, offset, out );
4159 mulld (tmp1, tmp1, k );
4160 add (tmp2, carry, tmp2 );
4161 add (tmp2, tmp1, tmp2 );
4162 stwx (tmp2, offset, out );
4163 srdi (carry, tmp2, 32 );
4164 subi (offset, offset, 4 );
4165 subi (len, len, 4 );
4166 bdnz (LOOP);
4167 bind(SKIP);
4168 }
4169
// Emits code which multiplies the int arrays x (xlen ints) and y (ylen ints) and
// writes the result to z, following the loop structure shown in the pseudo code
// comments below.
// The first loop multiplies x[xstart] with all of y (multiply_64_x_64_loop), the
// second and third (nested) loops add the products for the remaining parts of x
// (multiply_128_x_128_loop).
// All of tmp1..tmp13 are used as work registers. x, xlen, ylen and z are modified
// while the code runs; x, ylen and z are reloaded from their save registers after
// each pass of the third loop.
void MacroAssembler::multiply_to_len(Register x, Register xlen,
                                     Register y, Register ylen,
                                     Register z,
                                     Register tmp1, Register tmp2,
                                     Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6,
                                     Register tmp7, Register tmp8,
                                     Register tmp9, Register tmp10,
                                     Register tmp11, Register tmp12,
                                     Register tmp13) {

  ShortBranchVerifier sbv(this);

  assert_different_registers(x, xlen, y, ylen, z,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
  assert_different_registers(x, xlen, y, ylen, z,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
  assert_different_registers(x, xlen, y, ylen, z,
                             tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);

  // Roles of the temp registers.
  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = tmp6;
  const Register product_high = tmp7;
  const Register x_xstart = tmp8;
  const Register tmp = tmp9;

  // First Loop.
  //
  // final static long LONG_MASK = 0xffffffffL;
  // int xstart = xlen - 1;
  // int ystart = ylen - 1;
  // long carry = 0;
  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //   long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //   z[kdx] = (int)product;
  //   carry = product >>> 32;
  // }
  // z[xstart] = (int)carry;

  mr_if_needed(idx, ylen);        // idx = ylen
  add(kdx, xlen, ylen);           // kdx = xlen + ylen
  li(carry, 0);                   // carry = 0

  Label L_done;

  // xstart = xlen - 1. Nothing to do if this is negative.
  addic_(xstart, xlen, -1);
  blt(CR0, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
                        carry, product_high, product, idx, kdx, tmp);

  // Store the carry left over by the first loop in front of the part of z
  // written so far: up to two ints, depending on how many are left (kdx).
  Label L_second_loop;

  cmpdi(CR0, kdx, 0);
  beq(CR0, L_second_loop);

  Label L_carry;

  addic_(kdx, kdx, -1);
  beq(CR0, L_carry);

  // Store lower 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);
  srdi(carry, carry, 32);
  addi(kdx, kdx, -1);


  bind(L_carry);

  // Store upper 32 bits of carry.
  sldi(tmp, kdx, LogBytesPerInt);
  stwx(carry, z, tmp);

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // Registers: idx = tmp1, kdx = tmp2, xstart = tmp3, carry = tmp5, x_xstart = tmp8

  bind(L_second_loop);

  li(carry, 0);                   // carry = 0;

  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CR0, L_done);

  // z is moved to the position this pass works on; keep the original address.
  Register zsave = tmp10;

  mr(zsave, z);


  Label L_last_x;

  sldi(tmp, xstart, LogBytesPerInt);
  add(z, z, tmp);                 // z = z + k - j
  addi(z, z, 4);
  addic_(xstart, xstart, -1);     // i = xstart-1;
  blt(CR0, L_last_x);             // Only one int of x is left.

  // Load next two integers of x.
  sldi(tmp, xstart, LogBytesPerInt);
  ldx(x_xstart, x, tmp);
#ifdef VM_LITTLE_ENDIAN
  rldicl(x_xstart, x_xstart, 32, 0);
#endif


  Label L_third_loop_prologue;

  bind(L_third_loop_prologue);

  // multiply_128_x_128_loop gets ylen as its idx argument and x as its carry2
  // argument, so both are saved here and restored after the call.
  Register xsave = tmp11;
  Register xlensave = tmp12;
  Register ylensave = tmp13;

  mr(xsave, x);
  mr(xlensave, xstart);
  mr(ylensave, ylen);


  multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
                          carry, product_high, product, x, tmp);

  mr(z, zsave);
  mr(x, xsave);
  mr(xlen, xlensave);   // This is the decrement of the loop counter!
  mr(ylen, ylensave);

  // tmp3 is xstart. Store the carry of this pass (up to two ints) and
  // leave xstart set for the next pass of the second loop.
  addi(tmp3, xlen, 1);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  addic_(tmp3, tmp3, -1);
  blt(CR0, L_done);

  srdi(carry, carry, 32);
  sldi(tmp, tmp3, LogBytesPerInt);
  stwx(carry, z, tmp);
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);

  // Load one 32 bit portion of x as (0,value).
  lwz(x_xstart, 0, x);
  b(L_third_loop_prologue);

  bind(L_done);
}   // multiply_to_len
4331
4332 void MacroAssembler::increment_mem64(Register base, RegisterOrConstant ind_or_offs, int val, Register tmp) {
4333 ld(tmp, ind_or_offs, base);
4334 addi(tmp, tmp, val);
4335 std(tmp, ind_or_offs, base);
4336 }
4337
4338 // Handle the receiver type profile update given the "recv" klass.
4339 //
4340 // Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
4341 // If there are no matching or claimable receiver entries in RD, updates
4342 // the polymorphic counter.
4343 //
// This code is expected to be run by either the interpreter or JIT-ed code, without
4345 // extra synchronization. For safety, receiver cells are claimed atomically, which
4346 // avoids grossly misrepresenting the profiles under concurrent updates. For speed,
4347 // counter updates are not atomic.
4348 //
4349 void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset, Register tmp1, Register tmp2) {
4350 assert_different_registers(recv, mdp, tmp1, tmp2);
4351
4352 int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0));
4353 int poly_count_offset = in_bytes(CounterData::count_offset());
4354 int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
4355 int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;
4356
4357 // Adjust for MDP offsets.
4358 base_receiver_offset += mdp_offset;
4359 poly_count_offset += mdp_offset;
4360
4361 #ifdef ASSERT
4362 // We are about to walk the MDO slots without asking for offsets.
4363 // Check that our math hits all the right spots.
4364 for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
4365 int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
4366 int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
4367 int offset = base_receiver_offset + receiver_step*c;
4368 int count_offset = offset + receiver_to_count_step;
4369 assert(offset == real_recv_offset, "receiver slot math");
4370 assert(count_offset == real_count_offset, "receiver count math");
4371 }
4372 int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
4373 assert(poly_count_offset == real_poly_count_offset, "poly counter math");
4374 #endif
4375
4376 // Corner case: no profile table. Increment poly counter and exit.
4377 if (ReceiverTypeData::row_limit() == 0) {
4378 increment_mem64(mdp, poly_count_offset, DataLayout::counter_increment, tmp1);
4379 return;
4380 }
4381
4382 Label L_loop_search_receiver, L_loop_search_empty;
4383 Label L_restart, L_found_recv, L_found_empty, L_count_update;
4384 Register offset = tmp1, count = tmp2;
4385
4386 // The code here recognizes three major cases:
4387 // A. Fastest: receiver found in the table
4388 // B. Fast: no receiver in the table, and the table is full
4389 // C. Slow: no receiver in the table, free slots in the table
4390 //
4391 // The case A performance is most important, as perfectly-behaved code would end up
4392 // there, especially with larger TypeProfileWidth. The case B performance is
4393 // important as well, this is where bulk of code would land for normally megamorphic
4394 // cases. The case C performance is not essential, its job is to deal with installation
4395 // races, we optimize for code density instead. Case C needs to make sure that receiver
4396 // rows are only claimed once. This makes sure we never overwrite a row for another
4397 // receiver and never duplicate the receivers in the list, making profile type-accurate.
4398 //
4399 // It is very tempting to handle these cases in a single loop, and claim the first slot
4400 // without checking the rest of the table. But, profiling code should tolerate free slots
4401 // in the table, as class unloading can clear them. After such cleanup, the receiver
4402 // we need might be _after_ the free slot. Therefore, we need to let at least full scan
4403 // to complete, before trying to install new slots. Splitting the code in several tight
4404 // loops also helpfully optimizes for cases A and B.
4405 //
4406 // This code is effectively:
4407 //
4408 // restart:
4409 // // Fastest: receiver is already installed
4410 // for (i = 0; i < receiver_count(); i++) {
4411 // if (receiver(i) == recv) goto found_recv(i);
4412 // }
4413 //
4414 // // Fast: no receiver, but profile is not full
4415 // for (i = 0; i < receiver_count(); i++) {
4416 // if (receiver(i) == null) goto found_null(i);
4417 // }
4418 //
4419 // // Slow: profile is full, polymorphic case
4420 // count++;
4421 // return
4422 //
4423 // // Slow: try to install receiver
4424 // found_null(i):
4425 // CAS(&receiver(i), null, recv);
4426 // goto restart
4427 //
4428 // found_recv(i):
4429 // *receiver_count(i)++
4430 //
4431
4432 if (count != noreg) {
4433 li(count, ReceiverTypeData::row_limit());
4434 }
4435
4436 bind(L_restart);
4437
4438 // Fastest: receiver is already installed
4439 if (count != noreg) {
4440 mtctr(count);
4441 } else {
4442 li(R0, ReceiverTypeData::row_limit());
4443 mtctr(R0);
4444 }
4445 li(offset, base_receiver_offset);
4446 bind(L_loop_search_receiver);
4447 ldx(R0, offset, mdp);
4448 cmpd(CR0, R0, recv);
4449 beq(CR0, L_found_recv);
4450 addi(offset, offset, receiver_step);
4451 bdnz(L_loop_search_receiver);
4452
4453 // Fast: no receiver, but profile is full
4454 if (count != noreg) {
4455 mtctr(count);
4456 } else {
4457 li(R0, ReceiverTypeData::row_limit());
4458 mtctr(R0);
4459 }
4460 li(offset, base_receiver_offset);
4461 bind(L_loop_search_empty);
4462 ldx(R0, offset, mdp);
4463 cmpdi(CR0, R0, 0);
4464 beq(CR0, L_found_empty);
4465 addi(offset, offset, receiver_step);
4466 bdnz(L_loop_search_empty);
4467
4468 // Slow: Receiver is not found and table is full.
4469 // Increment polymorphic counter instead of receiver slot.
4470 li(offset, poly_count_offset);
4471 b(L_count_update);
4472
4473 // Slowest: try to install receiver
4474 bind(L_found_empty);
4475
4476 // Atomically swing receiver slot: null -> recv.
4477 {
4478 Register receiver_addr = offset;
4479 add(receiver_addr, mdp, offset); // kills offset
4480 cmpxchgd(CR0, R0, RegisterOrConstant(0), recv, receiver_addr, MemBarNone, cmpxchgx_hint_atomic_update(),
4481 noreg, nullptr, /* check without ldarx first */ false, /* weak */ true);
4482 }
4483
4484 // CAS success means the slot now has the receiver we want. CAS failure means
4485 // something had claimed the slot concurrently: it can be the same receiver we want,
4486 // or something else. Since this is a slow path, we can optimize for code density,
4487 // and just restart the search from the beginning.
4488 b(L_restart);
4489
4490 // Found a receiver, convert its slot offset to corresponding count offset.
4491 bind(L_found_recv);
4492 addi(offset, offset, receiver_to_count_step);
4493
4494 // Finally, update the counter
4495 bind(L_count_update);
4496 increment_mem64(mdp, offset, DataLayout::counter_increment, /* temp */ (count != noreg) ? count : recv);
4497 }
4498
4499 #ifdef ASSERT
4500 void MacroAssembler::asm_assert(AsmAssertCond cond, const char *msg) {
4501 Label ok;
4502 switch (cond) {
4503 case eq:
4504 beq(CR0, ok);
4505 break;
4506 case ne:
4507 bne(CR0, ok);
4508 break;
4509 case ge:
4510 bge(CR0, ok);
4511 break;
4512 case gt:
4513 bgt(CR0, ok);
4514 break;
4515 case lt:
4516 blt(CR0, ok);
4517 break;
4518 case le:
4519 ble(CR0, ok);
4520 break;
4521 default:
4522 assert(false, "unknown cond:%d", cond);
4523 }
4524 stop(msg);
4525 bind(ok);
4526 }
4527
4528 void MacroAssembler::asm_assert_mems_zero(AsmAssertCond cond, int size, int mem_offset,
4529 Register mem_base, const char* msg) {
4530 switch (size) {
4531 case 4:
4532 lwz(R0, mem_offset, mem_base);
4533 cmpwi(CR0, R0, 0);
4534 break;
4535 case 8:
4536 ld(R0, mem_offset, mem_base);
4537 cmpdi(CR0, R0, 0);
4538 break;
4539 default:
4540 ShouldNotReachHere();
4541 }
4542 asm_assert(cond, msg);
4543 }
4544 #endif // ASSERT
4545
4546 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4547 if (!VerifyOops) { return; }
4548 if (UseCompressedOops) { decode_heap_oop(coop); }
4549 verify_oop(coop, msg);
4550 if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4551 }
4552
4553 // READ: oop. KILL: R0. Volatile floats perhaps.
4554 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4555 if (!VerifyOops) {
4556 return;
4557 }
4558
4559 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4560 const Register tmp = R11; // Will be preserved.
4561 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4562
4563 BLOCK_COMMENT("verify_oop {");
4564
4565 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4566
4567 mr_if_needed(R4_ARG2, oop);
4568 save_LR_CR(tmp); // save in old frame
4569 push_frame_reg_args(nbytes_save, tmp);
4570 // load FunctionDescriptor** / entry_address *
4571 load_const_optimized(tmp, fd, R0);
4572 // load FunctionDescriptor* / entry_address
4573 ld(tmp, 0, tmp);
4574 load_const_optimized(R3_ARG1, (address)msg, R0);
4575 // Call destination for its side effect.
4576 call_c(tmp);
4577
4578 pop_frame();
4579 restore_LR_CR(tmp);
4580 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4581
4582 BLOCK_COMMENT("} verify_oop");
4583 }
4584
4585 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4586 if (!VerifyOops) {
4587 return;
4588 }
4589
4590 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4591 const Register tmp = R11; // Will be preserved.
4592 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4593 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4594
4595 ld(R4_ARG2, offs, base);
4596 save_LR_CR(tmp); // save in old frame
4597 push_frame_reg_args(nbytes_save, tmp);
4598 // load FunctionDescriptor** / entry_address *
4599 load_const_optimized(tmp, fd, R0);
4600 // load FunctionDescriptor* / entry_address
4601 ld(tmp, 0, tmp);
4602 load_const_optimized(R3_ARG1, (address)msg, R0);
4603 // Call destination for its side effect.
4604 call_c(tmp);
4605
4606 pop_frame();
4607 restore_LR_CR(tmp);
4608 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4609 }
4610
4611 // Call a C-function that prints output.
4612 void MacroAssembler::stop(int type, const char* msg) {
4613 bool msg_present = (msg != nullptr);
4614
4615 #ifndef PRODUCT
4616 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4617 #else
4618 block_comment("stop {");
4619 #endif
4620
4621 if (msg_present) {
4622 type |= stop_msg_present;
4623 }
4624 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4625 if (msg_present) {
4626 emit_int64((uintptr_t)msg);
4627 }
4628
4629 block_comment("} stop;");
4630 }
4631
4632 #ifndef PRODUCT
4633 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4634 // Val, addr are temp registers.
4635 // If low == addr, addr is killed.
4636 // High is preserved.
4637 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4638 if (!ZapMemory) return;
4639
4640 assert_different_registers(low, val);
4641
4642 BLOCK_COMMENT("zap memory region {");
4643 load_const_optimized(val, 0x0101010101010101);
4644 int size = before + after;
4645 if (low == high && size < 5 && size > 0) {
4646 int offset = -before*BytesPerWord;
4647 for (int i = 0; i < size; ++i) {
4648 std(val, offset, low);
4649 offset += (1*BytesPerWord);
4650 }
4651 } else {
4652 addi(addr, low, -before*BytesPerWord);
4653 assert_different_registers(high, val);
4654 if (after) addi(high, high, after * BytesPerWord);
4655 Label loop;
4656 bind(loop);
4657 std(val, 0, addr);
4658 addi(addr, addr, 8);
4659 cmpd(CR6, addr, high);
4660 ble(CR6, loop);
4661 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
4662 }
4663 BLOCK_COMMENT("} zap memory region");
4664 }
4665
4666 #endif // !PRODUCT
4667
4668 void MacroAssembler::cache_wb(Address line) {
4669 assert(line.index() == noreg, "index should be noreg");
4670 assert(line.disp() == 0, "displacement should be 0");
4671 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4672 // Data Cache Store, not really a flush, so it works like a sync of cache
4673 // line and persistent mem, i.e. copying the cache line to persistent whilst
4674 // not invalidating the cache line.
4675 dcbst(line.base());
4676 }
4677
4678 void MacroAssembler::cache_wbsync(bool is_presync) {
4679 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4680 // We only need a post sync barrier. Post means _after_ a cache line flush or
4681 // store instruction, pre means a barrier emitted before such a instructions.
4682 if (!is_presync) {
4683 fence();
4684 }
4685 }
4686
4687 void MacroAssembler::push_cont_fastpath() {
4688 if (!Continuations::enabled()) return;
4689
4690 Label done;
4691 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4692 cmpld(CR0, R1_SP, R0);
4693 ble(CR0, done); // if (SP <= _cont_fastpath) goto done;
4694 st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4695 bind(done);
4696 }
4697
4698 void MacroAssembler::pop_cont_fastpath() {
4699 if (!Continuations::enabled()) return;
4700
4701 Label done;
4702 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4703 cmpld(CR0, R1_SP, R0);
4704 blt(CR0, done); // if (SP < _cont_fastpath) goto done;
4705 li(R0, 0);
4706 st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4707 bind(done);
4708 }
4709
4710 // Function to flip between unlocked and locked state (fast locking).
4711 // Branches to failed if the state is not as expected with CR0 NE.
4712 // Falls through upon success with CR0 EQ.
4713 // This requires fewer instructions and registers and is easier to use than the
4714 // cmpxchg based implementation.
4715 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4716 assert_different_registers(obj, tmp, R0);
4717 Label retry;
4718
4719 if (semantics & MemBarRel) {
4720 release();
4721 }
4722
4723 bind(retry);
4724 STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4725 if (!is_unlock) {
4726 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4727 xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
4728 andi_(R0, tmp, markWord::lock_mask_in_place);
4729 bne(CR0, failed); // failed if new header doesn't contain locked_value (which is 0)
4730 } else {
4731 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4732 andi_(R0, tmp, markWord::lock_mask_in_place);
4733 bne(CR0, failed); // failed if old header doesn't contain locked_value (which is 0)
4734 ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
4735 }
4736 stdcx_(tmp, obj);
4737 bne(CR0, retry);
4738
4739 if (semantics & MemBarFenceAfter) {
4740 fence();
4741 } else if (semantics & MemBarAcq) {
4742 isync();
4743 }
4744 }
4745
4746 // Implements fast-locking.
4747 //
4748 // - obj: the object to be locked
4749 // - t1, t2: temporary register
4750 void MacroAssembler::fast_lock(Register box, Register obj, Register t1, Register t2, Label& slow) {
4751 assert_different_registers(box, obj, t1, t2, R0);
4752
4753 Label push;
4754 const Register t = R0;
4755
4756 if (UseObjectMonitorTable) {
4757 // Clear cache in case fast locking succeeds or we need to take the slow-path.
4758 li(t, 0);
4759 std(t, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
4760 }
4761
4762 if (DiagnoseSyncOnValueBasedClasses != 0) {
4763 load_klass(t1, obj);
4764 lbz(t1, in_bytes(Klass::misc_flags_offset()), t1);
4765 testbitdi(CR0, R0, t1, exact_log2(KlassFlags::_misc_is_value_based_class));
4766 bne(CR0, slow);
4767 }
4768
4769 const Register top = t1;
4770 const Register mark = t2;
4771
4772 // Check if the lock-stack is full.
4773 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4774 cmplwi(CR0, top, LockStack::end_offset());
4775 bge(CR0, slow);
4776
4777 // The underflow check is elided. The recursive check will always fail
4778 // when the lock stack is empty because of the _bad_oop_sentinel field.
4779
4780 // Check for recursion.
4781 subi(t, top, oopSize);
4782 ldx(t, R16_thread, t);
4783 cmpd(CR0, obj, t);
4784 beq(CR0, push);
4785
4786 // Check header for monitor (0b10) or locked (0b00).
4787 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4788 xori(t, mark, markWord::unlocked_value);
4789 andi_(t, t, markWord::lock_mask_in_place);
4790 bne(CR0, slow);
4791
4792 // Try to lock. Transition lock bits 0b01 => 0b00
4793 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq);
4794
4795 bind(push);
4796 // After successful lock, push object on lock-stack
4797 stdx(obj, R16_thread, top);
4798 addi(top, top, oopSize);
4799 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4800 }
4801
4802 // Implements fast-unlocking.
4803 //
4804 // - obj: the object to be unlocked
4805 // - t1: temporary register
4806 void MacroAssembler::fast_unlock(Register obj, Register t1, Label& slow) {
4807 assert_different_registers(obj, t1);
4808
4809 #ifdef ASSERT
4810 {
4811 // The following checks rely on the fact that LockStack is only ever modified by
4812 // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4813 // entries after inflation will happen delayed in that case.
4814
4815 // Check for lock-stack underflow.
4816 Label stack_ok;
4817 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4818 cmplwi(CR0, t1, LockStack::start_offset());
4819 bge(CR0, stack_ok);
4820 stop("Lock-stack underflow");
4821 bind(stack_ok);
4822 }
4823 #endif
4824
4825 Label unlocked, push_and_slow;
4826 const Register top = t1;
4827 const Register mark = R0;
4828 Register t = R0;
4829
4830 // Check if obj is top of lock-stack.
4831 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4832 subi(top, top, oopSize);
4833 ldx(t, R16_thread, top);
4834 cmpd(CR0, obj, t);
4835 bne(CR0, slow);
4836
4837 // Pop lock-stack.
4838 DEBUG_ONLY(li(t, 0);)
4839 DEBUG_ONLY(stdx(t, R16_thread, top);)
4840 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4841
4842 // The underflow check is elided. The recursive check will always fail
4843 // when the lock stack is empty because of the _bad_oop_sentinel field.
4844
4845 // Check if recursive.
4846 subi(t, top, oopSize);
4847 ldx(t, R16_thread, t);
4848 cmpd(CR0, obj, t);
4849 beq(CR0, unlocked);
4850
4851 // Use top as tmp
4852 t = top;
4853
4854 // Not recursive. Check header for monitor (0b10).
4855 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4856 andi_(t, mark, markWord::monitor_value);
4857 bne(CR0, push_and_slow);
4858
4859 #ifdef ASSERT
4860 // Check header not unlocked (0b01).
4861 Label not_unlocked;
4862 andi_(t, mark, markWord::unlocked_value);
4863 beq(CR0, not_unlocked);
4864 stop("fast_unlock already unlocked");
4865 bind(not_unlocked);
4866 #endif
4867
4868 // Try to unlock. Transition lock bits 0b00 => 0b01
4869 atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel);
4870 b(unlocked);
4871
4872 bind(push_and_slow);
4873
4874 // Restore lock-stack and handle the unlock in runtime.
4875 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4876 DEBUG_ONLY(stdx(obj, R16_thread, top);)
4877 addi(top, top, oopSize);
4878 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4879 b(slow);
4880
4881 bind(unlocked);
4882 }
4883
4884 // Unimplemented methods for inline types.
4885 int MacroAssembler::store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter) {
4886 Unimplemented();
4887 }
4888
4889 bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]) {
4890 Unimplemented();
4891 }
4892
4893 bool MacroAssembler::unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
4894 VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
4895 RegState reg_state[]) {
4896 Unimplemented();
4897 }
4898
4899 bool MacroAssembler::pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
4900 VMRegPair* from, int from_count, int& from_index, VMReg to,
4901 RegState reg_state[], Register val_array) {
4902 Unimplemented();
4903 }
4904
4905 int MacroAssembler::extend_stack_for_inline_args(int args_on_stack) {
4906 Unimplemented();
4907 }
4908
4909 VMReg MacroAssembler::spill_reg_for(VMReg reg) {
4910 Unimplemented();
4911 }