1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2012, 2026 SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.inline.hpp"
27 #include "code/compiledIC.hpp"
28 #include "compiler/disassembler.hpp"
29 #include "gc/shared/collectedHeap.inline.hpp"
30 #include "gc/shared/barrierSet.hpp"
31 #include "gc/shared/barrierSetAssembler.hpp"
32 #include "interpreter/interpreter.hpp"
33 #include "interpreter/interpreterRuntime.hpp"
34 #include "memory/resourceArea.hpp"
35 #include "nativeInst_ppc.hpp"
36 #include "oops/compressedKlass.inline.hpp"
37 #include "oops/compressedOops.inline.hpp"
38 #include "oops/klass.inline.hpp"
39 #include "oops/methodData.hpp"
40 #include "prims/methodHandles.hpp"
41 #include "register_ppc.hpp"
42 #include "runtime/icache.hpp"
43 #include "runtime/interfaceSupport.inline.hpp"
44 #include "runtime/objectMonitor.hpp"
45 #include "runtime/objectMonitorTable.hpp"
46 #include "runtime/os.hpp"
47 #include "runtime/safepoint.hpp"
48 #include "runtime/safepointMechanism.hpp"
49 #include "runtime/sharedRuntime.hpp"
50 #include "runtime/stubRoutines.hpp"
51 #include "runtime/vm_version.hpp"
52 #include "utilities/macros.hpp"
53 #include "utilities/powerOfTwo.hpp"
54
// BLOCK_COMMENT(str) expands to block_comment(str) in non-PRODUCT builds and
// to nothing in PRODUCT builds.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
// BIND(label) binds the label and then passes the stringized label name
// followed by ":" to BLOCK_COMMENT.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
61
#ifdef ASSERT
// Platform-dependent hook, only compiled when ASSERT is defined.
// On RISC, there's no benefit to verifying instruction boundaries,
// so this implementation always returns false.
bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif
66
67 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
68 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
69 if (Assembler::is_simm(si31, 16)) {
70 ld(d, si31, a);
71 if (emit_filler_nop) nop();
72 } else {
73 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
74 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
75 addis(d, a, hi);
76 ld(d, lo, d);
77 }
78 }
79
// Checked variant of ld_largeoffset_unchecked: asserts that destination d and
// base a are different registers and then emits the same load sequence.
void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
  assert_different_registers(d, a);
  ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
}
84
85 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
86 size_t size_in_bytes, bool is_signed) {
87 switch (size_in_bytes) {
88 case 8: ld(dst, offs, base); break;
89 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
90 case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
91 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
92 default: ShouldNotReachHere();
93 }
94 }
95
96 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
97 size_t size_in_bytes) {
98 switch (size_in_bytes) {
99 case 8: std(dst, offs, base); break;
100 case 4: stw(dst, offs, base); break;
101 case 2: sth(dst, offs, base); break;
102 case 1: stb(dst, offs, base); break;
103 default: ShouldNotReachHere();
104 }
105 }
106
107 void MacroAssembler::align(int modulus, int max, int rem) {
108 int padding = (rem + modulus - (offset() % modulus)) % modulus;
109 if (padding > max) return;
110 for (int c = (padding >> 2); c > 0; --c) { nop(); }
111 }
112
113 void MacroAssembler::align_prefix() {
114 if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
115 }
116
117 // Issue instructions that calculate given TOC from global TOC.
118 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
119 bool add_relocation, bool emit_dummy_addr,
120 bool add_addr_to_reloc) {
121 int offset = -1;
122 if (emit_dummy_addr) {
123 offset = -128; // dummy address
124 } else if (addr != (address)(intptr_t)-1) {
125 offset = MacroAssembler::offset_to_global_toc(addr);
126 }
127
128 if (hi16) {
129 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
130 }
131 if (lo16) {
132 if (add_relocation) {
133 // Relocate at the addi to avoid confusion with a load from the method's TOC.
134 RelocationHolder rh = add_addr_to_reloc ?
135 internal_word_Relocation::spec(addr) :
136 internal_word_Relocation::spec_for_immediate();
137 relocate(rh);
138 }
139 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
140 }
141 }
142
143 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
144 const int offset = MacroAssembler::offset_to_global_toc(addr);
145
146 const address inst2_addr = a;
147 const int inst2 = *(int *)inst2_addr;
148
149 // The relocation points to the second instruction, the addi,
150 // and the addi reads and writes the same register dst.
151 const int dst = inv_rt_field(inst2);
152 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
153
154 // Now, find the preceding addis which writes to dst.
155 int inst1 = 0;
156 address inst1_addr = inst2_addr - BytesPerInstWord;
157 while (inst1_addr >= bound) {
158 inst1 = *(int *) inst1_addr;
159 if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
160 // Stop, found the addis which writes dst.
161 break;
162 }
163 inst1_addr -= BytesPerInstWord;
164 }
165
166 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
167 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
168 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
169 return inst1_addr;
170 }
171
172 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
173 const address inst2_addr = a;
174 const int inst2 = *(int *)inst2_addr;
175
176 // The relocation points to the second instruction, the addi,
177 // and the addi reads and writes the same register dst.
178 const int dst = inv_rt_field(inst2);
179 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
180
181 // Now, find the preceding addis which writes to dst.
182 int inst1 = 0;
183 address inst1_addr = inst2_addr - BytesPerInstWord;
184 while (inst1_addr >= bound) {
185 inst1 = *(int *) inst1_addr;
186 if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
187 // stop, found the addis which writes dst
188 break;
189 }
190 inst1_addr -= BytesPerInstWord;
191 }
192
193 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
194
195 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
196 // -1 is a special case
197 if (offset == -1) {
198 return (address)(intptr_t)-1;
199 } else {
200 return global_toc() + offset;
201 }
202 }
203
204 #ifdef _LP64
205 // Patch compressed oops or klass constants.
206 // Assembler sequence is
207 // 1) compressed oops:
208 // lis rx = const.hi
209 // ori rx = rx | const.lo
210 // 2) compressed klass:
211 // lis rx = const.hi
212 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
213 // ori rx = rx | const.lo
// The optional clrldi is skipped when searching backwards from the ori for the lis.
215 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
216 assert(UseCompressedOops, "Should only patch compressed oops");
217
218 const address inst2_addr = a;
219 const int inst2 = *(int *)inst2_addr;
220
221 // The relocation points to the second instruction, the ori,
222 // and the ori reads and writes the same register dst.
223 const int dst = inv_rta_field(inst2);
224 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
225 // Now, find the preceding addis which writes to dst.
226 int inst1 = 0;
227 address inst1_addr = inst2_addr - BytesPerInstWord;
228 bool inst1_found = false;
229 while (inst1_addr >= bound) {
230 inst1 = *(int *)inst1_addr;
231 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
232 inst1_addr -= BytesPerInstWord;
233 }
234 assert(inst1_found, "inst is not lis");
235
236 uint32_t data_value = CompressedOops::narrow_oop_value(data);
237 int xc = (data_value >> 16) & 0xffff;
238 int xd = (data_value >> 0) & 0xffff;
239
240 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
241 set_imm((int *)inst2_addr, (xd)); // unsigned int
242 return inst1_addr;
243 }
244
245 // Get compressed oop constant.
246 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
247 assert(UseCompressedOops, "Should only patch compressed oops");
248
249 const address inst2_addr = a;
250 const int inst2 = *(int *)inst2_addr;
251
252 // The relocation points to the second instruction, the ori,
253 // and the ori reads and writes the same register dst.
254 const int dst = inv_rta_field(inst2);
255 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
256 // Now, find the preceding lis which writes to dst.
257 int inst1 = 0;
258 address inst1_addr = inst2_addr - BytesPerInstWord;
259 bool inst1_found = false;
260
261 while (inst1_addr >= bound) {
262 inst1 = *(int *) inst1_addr;
263 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
264 inst1_addr -= BytesPerInstWord;
265 }
266 assert(inst1_found, "inst is not lis");
267
268 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
269 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
270
271 return CompressedOops::narrow_oop_cast(xl | xh);
272 }
273 #endif // _LP64
274
275 // Returns true if successful.
276 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
277 Register toc, bool fixed_size) {
278 int toc_offset = 0;
279 // Use RelocationHolder::none for the constant pool entry, otherwise
280 // we will end up with a failing NativeCall::verify(x) where x is
281 // the address of the constant pool entry.
282 // FIXME: We should insert relocation information for oops at the constant
283 // pool entries instead of inserting it at the loads; patching of a constant
284 // pool entry should be less expensive.
285 address const_address = address_constant((address)a.value(), RelocationHolder::none);
286 if (const_address == nullptr) { return false; } // allocation failure
287 // Relocate at the pc of the load.
288 relocate(a.rspec());
289 toc_offset = (int)(const_address - code()->consts()->start());
290 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
291 return true;
292 }
293
294 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
295 const address inst1_addr = a;
296 const int inst1 = *(int *)inst1_addr;
297
298 // The relocation points to the ld or the addis.
299 return (is_ld(inst1)) ||
300 (is_addis(inst1) && inv_ra_field(inst1) != 0);
301 }
302
303 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
304 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
305
306 const address inst1_addr = a;
307 const int inst1 = *(int *)inst1_addr;
308
309 if (is_ld(inst1)) {
310 return inv_d1_field(inst1);
311 } else if (is_addis(inst1)) {
312 const int dst = inv_rt_field(inst1);
313
314 // Now, find the succeeding ld which reads and writes to dst.
315 address inst2_addr = inst1_addr + BytesPerInstWord;
316 int inst2 = 0;
317 while (true) {
318 inst2 = *(int *) inst2_addr;
319 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
320 // Stop, found the ld which reads and writes dst.
321 break;
322 }
323 inst2_addr += BytesPerInstWord;
324 }
325 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
326 }
327 ShouldNotReachHere();
328 return 0;
329 }
330
331 // Get the constant from a `load_const' sequence.
332 long MacroAssembler::get_const(address a) {
333 assert(is_load_const_at(a), "not a load of a constant");
334 const int *p = (const int*) a;
335 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
336 if (is_ori(*(p+1))) {
337 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
338 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
339 x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
340 } else if (is_lis(*(p+1))) {
341 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
342 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
343 x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
344 } else {
345 ShouldNotReachHere();
346 return (long) 0;
347 }
348 return (long) x;
349 }
350
351 // Patch the 64 bit constant of a `load_const' sequence. This is a low
352 // level procedure. It neither flushes the instruction cache nor is it
353 // mt safe.
354 void MacroAssembler::patch_const(address a, long x) {
355 assert(is_load_const_at(a), "not a load of a constant");
356 int *p = (int*) a;
357 if (is_ori(*(p+1))) {
358 set_imm(0 + p, (x >> 48) & 0xffff);
359 set_imm(1 + p, (x >> 32) & 0xffff);
360 set_imm(3 + p, (x >> 16) & 0xffff);
361 set_imm(4 + p, x & 0xffff);
362 } else if (is_lis(*(p+1))) {
363 set_imm(0 + p, (x >> 48) & 0xffff);
364 set_imm(2 + p, (x >> 32) & 0xffff);
365 set_imm(1 + p, (x >> 16) & 0xffff);
366 set_imm(3 + p, x & 0xffff);
367 } else {
368 ShouldNotReachHere();
369 }
370 }
371
372 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
373 assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
374 int index = oop_recorder()->allocate_metadata_index(obj);
375 RelocationHolder rspec = metadata_Relocation::spec(index);
376 return AddressLiteral((address)obj, rspec);
377 }
378
379 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
380 assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
381 int index = oop_recorder()->find_index(obj);
382 RelocationHolder rspec = metadata_Relocation::spec(index);
383 return AddressLiteral((address)obj, rspec);
384 }
385
386 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
387 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
388 int oop_index = oop_recorder()->allocate_oop_index(obj);
389 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
390 }
391
392 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
393 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
394 int oop_index = oop_recorder()->find_index(obj);
395 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
396 }
397
#ifndef PRODUCT
// Non-PRODUCT hook for printing a patched branch instruction.
// Not implemented on this platform: calling it hits Unimplemented().
void MacroAssembler::pd_print_patched_instruction(address branch) {
  Unimplemented(); // TODO: PPC port
}
#endif // ndef PRODUCT
403
404 // Conditional far branch for destinations encodable in 24+2 bits.
405 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
406
407 // If requested by flag optimize, relocate the bc_far as a
408 // runtime_call and prepare for optimizing it when the code gets
409 // relocated.
410 if (optimize == bc_far_optimize_on_relocate) {
411 relocate(relocInfo::runtime_call_type);
412 }
413
414 // variant 2:
415 //
416 // b!cxx SKIP
417 // bxx DEST
418 // SKIP:
419 //
420
421 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
422 opposite_bcond(inv_boint_bcond(boint)));
423
424 // We emit two branches.
425 // First, a conditional branch which jumps around the far branch.
426 const address not_taken_pc = pc() + 2 * BytesPerInstWord;
427 const address bc_pc = pc();
428 bc(opposite_boint, biint, not_taken_pc);
429
430 const int bc_instr = *(int*)bc_pc;
431 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
432 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
433 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
434 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
435 "postcondition");
436 assert(biint == inv_bi_field(bc_instr), "postcondition");
437
438 // Second, an unconditional far branch which jumps to dest.
439 // Note: target(dest) remembers the current pc (see CodeSection::target)
440 // and returns the current pc if the label is not bound yet; when
441 // the label gets bound, the unconditional far branch will be patched.
442 const address target_pc = target(dest);
443 const address b_pc = pc();
444 b(target_pc);
445
446 assert(not_taken_pc == pc(), "postcondition");
447 assert(dest.is_bound() || target_pc == b_pc, "postcondition");
448 }
449
450 // 1 or 2 instructions
451 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
452 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
453 bc(boint, biint, dest);
454 } else {
455 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
456 }
457 }
458
459 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
460 return is_bc_far_variant1_at(instruction_addr) ||
461 is_bc_far_variant2_at(instruction_addr) ||
462 is_bc_far_variant3_at(instruction_addr);
463 }
464
465 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
466 if (is_bc_far_variant1_at(instruction_addr)) {
467 const address instruction_1_addr = instruction_addr;
468 const int instruction_1 = *(int*)instruction_1_addr;
469 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
470 } else if (is_bc_far_variant2_at(instruction_addr)) {
471 const address instruction_2_addr = instruction_addr + 4;
472 return bxx_destination(instruction_2_addr);
473 } else if (is_bc_far_variant3_at(instruction_addr)) {
474 return instruction_addr + 8;
475 }
476 // variant 4 ???
477 ShouldNotReachHere();
478 return nullptr;
479 }
480 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
481
482 if (is_bc_far_variant3_at(instruction_addr)) {
483 // variant 3, far cond branch to the next instruction, already patched to nops:
484 //
485 // nop
486 // nop
487 // SKIP/DEST:
488 //
489 return;
490 }
491
492 // first, extract boint and biint from the current branch
493 int boint = 0;
494 int biint = 0;
495
496 ResourceMark rm;
497 const int code_size = 2 * BytesPerInstWord;
498 CodeBuffer buf(instruction_addr, code_size);
499 MacroAssembler masm(&buf);
500 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
501 // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
502 masm.nop();
503 masm.nop();
504 } else {
505 if (is_bc_far_variant1_at(instruction_addr)) {
506 // variant 1, the 1st instruction contains the destination address:
507 //
508 // bcxx DEST
509 // nop
510 //
511 const int instruction_1 = *(int*)(instruction_addr);
512 boint = inv_bo_field(instruction_1);
513 biint = inv_bi_field(instruction_1);
514 } else if (is_bc_far_variant2_at(instruction_addr)) {
515 // variant 2, the 2nd instruction contains the destination address:
516 //
517 // b!cxx SKIP
518 // bxx DEST
519 // SKIP:
520 //
521 const int instruction_1 = *(int*)(instruction_addr);
522 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
523 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
524 biint = inv_bi_field(instruction_1);
525 } else {
526 // variant 4???
527 ShouldNotReachHere();
528 }
529
530 // second, set the new branch destination and optimize the code
531 if (dest != instruction_addr + 4 && // the bc_far is still unbound!
532 masm.is_within_range_of_bcxx(dest, instruction_addr)) {
533 // variant 1:
534 //
535 // bcxx DEST
536 // nop
537 //
538 masm.bc(boint, biint, dest);
539 masm.nop();
540 } else {
541 // variant 2:
542 //
543 // b!cxx SKIP
544 // bxx DEST
545 // SKIP:
546 //
547 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
548 opposite_bcond(inv_boint_bcond(boint)));
549 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
550 masm.bc(opposite_boint, biint, not_taken_pc);
551 masm.b(dest);
552 }
553 }
554 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
555 }
556
557 // Emit a NOT mt-safe patchable 64 bit absolute call/jump.
558 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
559 // get current pc
560 uint64_t start_pc = (uint64_t) pc();
561
562 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
563 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first
564
565 // relocate here
566 if (rt != relocInfo::none) {
567 relocate(rt);
568 }
569
570 if ( ReoptimizeCallSequences &&
571 (( link && is_within_range_of_b(dest, pc_of_bl)) ||
572 (!link && is_within_range_of_b(dest, pc_of_b)))) {
573 // variant 2:
574 // Emit an optimized, pc-relative call/jump.
575
576 if (link) {
577 // some padding
578 nop();
579 nop();
580 nop();
581 nop();
582 nop();
583 nop();
584
585 // do the call
586 assert(pc() == pc_of_bl, "just checking");
587 bl(dest, relocInfo::none);
588 } else {
589 // do the jump
590 assert(pc() == pc_of_b, "just checking");
591 b(dest, relocInfo::none);
592
593 // some padding
594 nop();
595 nop();
596 nop();
597 nop();
598 nop();
599 nop();
600 }
601
602 // Assert that we can identify the emitted call/jump.
603 assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
604 "can't identify emitted call");
605 } else {
606 // variant 1:
607 mr(R0, R11); // spill R11 -> R0.
608
609 // Load the destination address into CTR,
610 // calculate destination relative to global toc.
611 calculate_address_from_global_toc(R11, dest, true, true, false);
612
613 mtctr(R11);
614 mr(R11, R0); // spill R11 <- R0.
615 nop();
616
617 // do the call/jump
618 if (link) {
619 bctrl();
620 } else{
621 bctr();
622 }
623 // Assert that we can identify the emitted call/jump.
624 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
625 "can't identify emitted call");
626 }
627
628 // Assert that we can identify the emitted call/jump.
629 assert(is_bxx64_patchable_at((address)start_pc, link),
630 "can't identify emitted call");
631 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
632 "wrong encoding of dest address");
633 }
634
635 // Identify a bxx64_patchable instruction.
636 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
637 return is_bxx64_patchable_variant1b_at(instruction_addr, link)
638 //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
639 || is_bxx64_patchable_variant2_at(instruction_addr, link);
640 }
641
// Does the call64_patchable instruction use a pc-relative encoding of
// the call destination?
// Returns true exactly when the sequence at instruction_addr is variant 2.
bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
  // variant 2 is pc-relative
  return is_bxx64_patchable_variant2_at(instruction_addr, link);
}
648
649 // Identify variant 1.
650 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
651 unsigned int* instr = (unsigned int*) instruction_addr;
652 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
653 && is_mtctr(instr[5]) // mtctr
654 && is_load_const_at(instruction_addr);
655 }
656
657 // Identify variant 1b: load destination relative to global toc.
658 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
659 unsigned int* instr = (unsigned int*) instruction_addr;
660 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
661 && is_mtctr(instr[3]) // mtctr
662 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
663 }
664
665 // Identify variant 2.
666 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
667 unsigned int* instr = (unsigned int*) instruction_addr;
668 if (link) {
669 return is_bl (instr[6]) // bl dest is last
670 && is_nop(instr[0]) // nop
671 && is_nop(instr[1]) // nop
672 && is_nop(instr[2]) // nop
673 && is_nop(instr[3]) // nop
674 && is_nop(instr[4]) // nop
675 && is_nop(instr[5]); // nop
676 } else {
677 return is_b (instr[0]) // b dest is first
678 && is_nop(instr[1]) // nop
679 && is_nop(instr[2]) // nop
680 && is_nop(instr[3]) // nop
681 && is_nop(instr[4]) // nop
682 && is_nop(instr[5]) // nop
683 && is_nop(instr[6]); // nop
684 }
685 }
686
// Set dest address of a bxx64_patchable instruction.
// Re-emits the whole bxx64_patchable sequence in place (without relocation)
// through a temporary assembler over the existing code, then flushes the
// instruction cache for the patched range.
void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
  ResourceMark rm;
  int code_size = MacroAssembler::bxx64_patchable_size;
  // The code buffer is laid over the instructions being patched.
  CodeBuffer buf(instruction_addr, code_size);
  MacroAssembler masm(&buf);
  masm.bxx64_patchable(dest, relocInfo::none, link);
  ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
}
696
697 // Get dest address of a bxx64_patchable instruction.
698 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
699 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
700 return (address) (unsigned long) get_const(instruction_addr);
701 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
702 unsigned int* instr = (unsigned int*) instruction_addr;
703 if (link) {
704 const int instr_idx = 6; // bl is last
705 int branchoffset = branch_destination(instr[instr_idx], 0);
706 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
707 } else {
708 const int instr_idx = 0; // b is first
709 int branchoffset = branch_destination(instr[instr_idx], 0);
710 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
711 }
712 // Load dest relative to global toc.
713 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
714 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
715 instruction_addr);
716 } else {
717 ShouldNotReachHere();
718 return nullptr;
719 }
720 }
721
722 #ifdef ASSERT
723 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
724 const int magic_number = 0x42;
725
726 // Preserve stack pointer register (R1_SP) and system thread id register (R13);
727 // although they're technically volatile
728 for (int i = 2; i < 13; i++) {
729 Register reg = as_Register(i);
730 if (reg == excluded_register) {
731 continue;
732 }
733
734 li(reg, magic_number);
735 }
736 }
737
738 void MacroAssembler::clobber_nonvolatile_registers() {
739 BLOCK_COMMENT("clobber nonvolatile registers {");
740 static const Register regs[] = {
741 R14,
742 R15,
743 // don't zap R16_thread
744 R17,
745 R18,
746 R19,
747 R20,
748 R21,
749 R22,
750 R23,
751 R24,
752 R25,
753 R26,
754 R27,
755 R28,
756 // don't zap R29_TOC
757 R30,
758 R31
759 };
760 Register bad = regs[0];
761 load_const_optimized(bad, 0xbad0101babe00000);
762 for (int i = (sizeof(regs) / sizeof(Register)) - 1; i >= 0; i--) {
763 addi(regs[i], bad, regs[i]->encoding());
764 }
765 BLOCK_COMMENT("} clobber nonvolatile registers");
766 }
767 #endif // ASSERT
768
769 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
770 const int magic_number = 0x43;
771
772 li(tmp, magic_number);
773 for (int m = 0; m <= 7; m++) {
774 std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
775 }
776 }
777
778 void MacroAssembler::save_nonvolatile_registers(Register dst, int offset, bool include_fp_regs, bool include_vector_regs) {
779 BLOCK_COMMENT("save_nonvolatile_registers {");
780
781 for (int i = 14; i < 32; i++) {
782 std(as_Register(i), offset, dst);
783 offset += 8;
784 }
785
786 if (include_fp_regs) {
787 for (int i = 14; i < 32; i++) {
788 stfd(as_FloatRegister(i), offset, dst);
789 offset += 8;
790 }
791 }
792
793 if (include_vector_regs) {
794 assert(is_aligned(offset, StackAlignmentInBytes), "should be");
795 if (PowerArchitecturePPC64 >= 10) {
796 for (int i = 20; i < 32; i += 2) {
797 stxvp(as_VectorRegister(i)->to_vsr(), offset, dst);
798 offset += 32;
799 }
800 } else {
801 for (int i = 20; i < 32; i++) {
802 stxv(as_VectorRegister(i)->to_vsr(), offset, dst);
803 offset += 16;
804 }
805 }
806 }
807
808 BLOCK_COMMENT("} save_nonvolatile_registers ");
809 }
810
811 void MacroAssembler::restore_nonvolatile_registers(Register src, int offset, bool include_fp_regs, bool include_vector_regs) {
812 BLOCK_COMMENT("restore_nonvolatile_registers {");
813
814 for (int i = 14; i < 32; i++) {
815 ld(as_Register(i), offset, src);
816 offset += 8;
817 }
818
819 if (include_fp_regs) {
820 for (int i = 14; i < 32; i++) {
821 lfd(as_FloatRegister(i), offset, src);
822 offset += 8;
823 }
824 }
825
826 if (include_vector_regs) {
827 assert(is_aligned(offset, StackAlignmentInBytes), "should be");
828 if (PowerArchitecturePPC64 >= 10) {
829 for (int i = 20; i < 32; i += 2) {
830 lxvp(as_VectorRegister(i)->to_vsr(), offset, src);
831 offset += 32;
832 }
833 } else {
834 for (int i = 20; i < 32; i++) {
835 lxv(as_VectorRegister(i)->to_vsr(), offset, src);
836 offset += 16;
837 }
838 }
839 }
840
841 BLOCK_COMMENT("} restore_nonvolatile_registers");
842 }
843
844 // For verify_oops.
845 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
846 std(R2, offset, dst); offset += 8;
847 if (include_R3_RET_reg) {
848 std(R3, offset, dst); offset += 8;
849 }
850 std(R4, offset, dst); offset += 8;
851 std(R5, offset, dst); offset += 8;
852 std(R6, offset, dst); offset += 8;
853 std(R7, offset, dst); offset += 8;
854 std(R8, offset, dst); offset += 8;
855 std(R9, offset, dst); offset += 8;
856 std(R10, offset, dst); offset += 8;
857 std(R11, offset, dst); offset += 8;
858 std(R12, offset, dst); offset += 8;
859
860 if (include_fp_regs) {
861 stfd(F0, offset, dst); offset += 8;
862 stfd(F1, offset, dst); offset += 8;
863 stfd(F2, offset, dst); offset += 8;
864 stfd(F3, offset, dst); offset += 8;
865 stfd(F4, offset, dst); offset += 8;
866 stfd(F5, offset, dst); offset += 8;
867 stfd(F6, offset, dst); offset += 8;
868 stfd(F7, offset, dst); offset += 8;
869 stfd(F8, offset, dst); offset += 8;
870 stfd(F9, offset, dst); offset += 8;
871 stfd(F10, offset, dst); offset += 8;
872 stfd(F11, offset, dst); offset += 8;
873 stfd(F12, offset, dst); offset += 8;
874 stfd(F13, offset, dst);
875 }
876 }
877
878 // For verify_oops.
879 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
880 ld(R2, offset, src); offset += 8;
881 if (include_R3_RET_reg) {
882 ld(R3, offset, src); offset += 8;
883 }
884 ld(R4, offset, src); offset += 8;
885 ld(R5, offset, src); offset += 8;
886 ld(R6, offset, src); offset += 8;
887 ld(R7, offset, src); offset += 8;
888 ld(R8, offset, src); offset += 8;
889 ld(R9, offset, src); offset += 8;
890 ld(R10, offset, src); offset += 8;
891 ld(R11, offset, src); offset += 8;
892 ld(R12, offset, src); offset += 8;
893
894 if (include_fp_regs) {
895 lfd(F0, offset, src); offset += 8;
896 lfd(F1, offset, src); offset += 8;
897 lfd(F2, offset, src); offset += 8;
898 lfd(F3, offset, src); offset += 8;
899 lfd(F4, offset, src); offset += 8;
900 lfd(F5, offset, src); offset += 8;
901 lfd(F6, offset, src); offset += 8;
902 lfd(F7, offset, src); offset += 8;
903 lfd(F8, offset, src); offset += 8;
904 lfd(F9, offset, src); offset += 8;
905 lfd(F10, offset, src); offset += 8;
906 lfd(F11, offset, src); offset += 8;
907 lfd(F12, offset, src); offset += 8;
908 lfd(F13, offset, src);
909 }
910 }
911
// Saves the link register: copies LR into tmp and stores it at offset
// _abi0(lr) from R1_SP. On exit tmp still holds the LR value.
void MacroAssembler::save_LR(Register tmp) {
  mflr(tmp);                  // tmp <- LR
  std(tmp, _abi0(lr), R1_SP); // *(SP + _abi0(lr)) <- tmp
}
916
// Restores the link register from the slot at offset _abi0(lr) from R1_SP,
// using tmp as transfer register. tmp is overwritten with the loaded value.
void MacroAssembler::restore_LR(Register tmp) {
  // tmp is loaded before being moved to LR, so it must not be the stack pointer.
  assert(tmp != R1_SP, "must be distinct");
  ld(tmp, _abi0(lr), R1_SP); // tmp <- *(SP + _abi0(lr))
  mtlr(tmp);                 // LR  <- tmp
}
922
// Saves the condition register and the link register relative to R1_SP:
// CR goes to offset _abi0(cr), LR (via save_LR) to offset _abi0(lr).
// CR is handled first so that tmp ends up holding LR.
void MacroAssembler::save_LR_CR(Register tmp) {
  mfcr(tmp);                  // tmp <- CR
  std(tmp, _abi0(cr), R1_SP); // *(SP + _abi0(cr)) <- tmp
  save_LR(tmp);
  // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
}
929
// Restores the link register (via restore_LR) and then the condition register
// from offset _abi0(cr) relative to R1_SP. tmp is used as transfer register
// for both and holds the loaded CR value on exit.
void MacroAssembler::restore_LR_CR(Register tmp) {
  restore_LR(tmp);
  ld(tmp, _abi0(cr), R1_SP); // tmp <- *(SP + _abi0(cr))
  mtcr(tmp);                 // CR  <- tmp
}
935
936 address MacroAssembler::get_PC_trash_LR(Register result) {
937 Label L;
938 bl(L);
939 bind(L);
940 address lr_pc = pc();
941 mflr(result);
942 return lr_pc;
943 }
944
// Resizes the current frame by the run-time byte count held in `offset`:
// the value found at _abi0(callers_sp) of the current SP is re-stored at
// SP + offset and SP is updated to that address in the same instruction.
//   offset - register holding the signed SP adjustment in bytes
//   tmp    - scratch register; overwritten
void MacroAssembler::resize_frame(Register offset, Register tmp) {
#ifdef ASSERT
  assert_different_registers(offset, tmp, R1_SP);
  // Debug-only check emitted into the code stream: the low bits of offset
  // (below frame::alignment_in_bytes) must be zero.
  andi_(tmp, offset, frame::alignment_in_bytes-1);
  asm_assert_eq("resize_frame: unaligned");
#endif

  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdux(tmp, R1_SP, offset);
}
959
// Resizes the current frame by a compile-time constant byte count.
//   offset - signed SP adjustment; must fit into 16 bits and be a multiple of
//            frame::alignment_in_bytes (both are asserted at code generation time)
//   tmp    - scratch register, must not be R1_SP; overwritten
void MacroAssembler::resize_frame(int offset, Register tmp) {
  assert(is_simm(offset, 16), "too big an offset");
  assert_different_registers(tmp, R1_SP);
  assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
  // tmp <- *(SP)
  ld(tmp, _abi0(callers_sp), R1_SP);
  // addr <- SP + offset;
  // *(addr) <- tmp;
  // SP <- addr
  stdu(tmp, offset, R1_SP);
}
971
// Resizes the current frame so that SP becomes the absolute address in `addr`.
// The distance from the current SP is computed into tmp1 and handed to
// resize_frame(Register, Register), which uses tmp2 as its scratch register.
// Both tmp1 and tmp2 are overwritten.
void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
  // (addr == tmp1) || (addr == tmp2) is allowed here!
  assert(tmp1 != tmp2, "must be distinct");

  // compute offset w.r.t. current stack pointer
  // tmp1 <- addr - SP (!)  (subf computes second source minus first source)
  subf(tmp1, R1_SP, addr);

  // atomically update SP keeping back link.
  resize_frame(tmp1/* offset */, tmp2/* tmp */);
}
983
// Pushes a new frame whose size in bytes is held in register `bytes`.
// The old SP is stored at the new top of stack and SP is updated by the
// same stdux instruction.
//   bytes - frame size; must not be R0 because the debug check below uses R0
//   tmp   - scratch register; receives the negated size
void MacroAssembler::push_frame(Register bytes, Register tmp) {
#ifdef ASSERT
  assert(bytes != R0, "r0 not allowed here");
  // Debug-only check emitted into the code stream: size must be aligned.
  andi_(R0, bytes, frame::alignment_in_bytes-1);
  asm_assert_eq("push_frame(Reg, Reg): unaligned");
#endif
  neg(tmp, bytes);          // tmp <- -bytes
  stdux(R1_SP, R1_SP, tmp); // *(SP + tmp) <- SP; SP <- SP + tmp
}
993
994 // Push a frame of size `bytes'.
995 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
996 long offset = align_addr(bytes, frame::alignment_in_bytes);
997 if (is_simm(-offset, 16)) {
998 stdu(R1_SP, -offset, R1_SP);
999 } else {
1000 load_const_optimized(tmp, -offset);
1001 stdux(R1_SP, R1_SP, tmp);
1002 }
1003 }
1004
// Push a frame of size `bytes' plus native_abi_reg_args on top.
// Simply forwards to push_frame with frame::native_abi_reg_args_size added;
// `tmp' is passed through as that function's scratch register.
void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
  push_frame(bytes + frame::native_abi_reg_args_size, tmp);
}
1009
// Pop current C frame.
// Reloads SP from the callers_sp slot (offset _abi0(callers_sp)) of the
// current frame. No other register is touched.
void MacroAssembler::pop_frame() {
  ld(R1_SP, _abi0(callers_sp), R1_SP);
}
1014
1015 #if defined(ABI_ELFv2)
// ELFv2 variant: branch to the code address held in r_function_entry.
// The address is copied to R12 first if it is not already there and the
// branch goes through CTR.
// NOTE(review): presumably R12 is used because the ELFv2 ABI expects the
// callee's entry address in R12 - confirm against the ABI document.
//   and_link - true emits bctrl (call), false emits bctr (plain branch)
// Records the pc following the branch in _last_calls_return_pc and returns it.
address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
  // TODO(asmundak): make sure the caller uses R12 as function descriptor
  // most of the times.
  if (R12 != r_function_entry) {
    mr(R12, r_function_entry);
  }
  mtctr(R12);
  // Do a call or a branch.
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}
1033
// Call a C function whose entry address is held in r_function_entry and use
// full C calling conventions. Updates and returns _last_calls_return_pc.
// (ELFv2 variant: forwards to branch_to with linking enabled.)
address MacroAssembler::call_c(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/true);
}
1039
// For tail calls: only branch, don't link, so callee returns to caller of this function.
// Returns the pc following the emitted branch (as recorded by branch_to).
address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
  return branch_to(r_function_entry, /*and_link=*/false);
}
1044
// Call the C function at the constant address function_entry (ELFv2 variant).
// The address is materialized in R12 with load_const (R0 is passed as its
// temp register) and called via branch_to. The rt argument is not used by
// this variant. Returns the pc following the call.
address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
  load_const(R12, function_entry, R0);
  return branch_to(R12, /*and_link=*/true);
}
1049
1050 #else
// Generic version of a call to C function via a function descriptor
// with variable support for C calling conventions (TOC, ENV, etc.).
// Updates and returns _last_calls_return_pc.
//   function_descriptor - register holding the descriptor address; must not be R0
//                         because R0 is used to transfer the entry point to CTR
//   and_link            - true emits bctrl (call), false emits bctr (plain branch)
//   load_toc_of_callee  - load R2_TOC from the descriptor's toc field
//   load_env_of_callee  - load R11 from the descriptor's env field; if false but the
//                         TOC is loaded, R11 is set to 0 instead
// save_toc_before_call and restore_toc_after_call are not referenced in this body.
address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
                                  bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
  // we emit standard ptrgl glue code here
  assert((function_descriptor != R0), "function_descriptor cannot be R0");

  // retrieve necessary entries from the function descriptor
  ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
  mtctr(R0);

  if (load_toc_of_callee) {
    ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
  }
  if (load_env_of_callee) {
    ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
  } else if (load_toc_of_callee) {
    li(R11, 0);
  }

  // do a call or a branch
  if (and_link) {
    bctrl();
  } else {
    bctr();
  }
  _last_calls_return_pc = pc();

  return _last_calls_return_pc;
}
1082
// Call a C function via a function descriptor and use full C calling
// conventions.
// We don't use the TOC in generated code, so there is no need to save
// and restore its value.
// The callee's TOC and environment are both loaded from the descriptor in fd.
// Returns the pc following the call.
address MacroAssembler::call_c(Register fd) {
  return branch_to(fd, /*and_link=*/true,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}
1094
// Tail-call variant of call_c(Register): same descriptor handling (TOC and
// env are loaded), but the branch does not link, so LR is left unchanged by
// the branch itself. Returns the pc following the emitted branch.
address MacroAssembler::call_c_and_return_to_caller(Register fd) {
  return branch_to(fd, /*and_link=*/false,
                       /*save toc=*/false,
                       /*restore toc=*/false,
                       /*load toc=*/true,
                       /*load env=*/true);
}
1102
1103 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1104 if (rt != relocInfo::none) {
1105 // this call needs to be relocatable
1106 if (!ReoptimizeCallSequences
1107 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1108 || fd == nullptr // support code-size estimation
1109 || !fd->is_friend_function()
1110 || fd->entry() == nullptr) {
1111 // it's not a friend function as defined by class FunctionDescriptor,
1112 // so do a full call-c here.
1113 load_const(R11, (address)fd, R0);
1114
1115 bool has_env = (fd != nullptr && fd->env() != nullptr);
1116 return branch_to(R11, /*and_link=*/true,
1117 /*save toc=*/false,
1118 /*restore toc=*/false,
1119 /*load toc=*/true,
1120 /*load env=*/has_env);
1121 } else {
1122 // It's a friend function. Load the entry point and don't care about
1123 // toc and env. Use an optimizable call instruction, but ensure the
1124 // same code-size as in the case of a non-friend function.
1125 nop();
1126 nop();
1127 nop();
1128 bl64_patchable(fd->entry(), rt);
1129 _last_calls_return_pc = pc();
1130 return _last_calls_return_pc;
1131 }
1132 } else {
1133 // This call does not need to be relocatable, do more aggressive
1134 // optimizations.
1135 if (!ReoptimizeCallSequences
1136 || !fd->is_friend_function()) {
1137 // It's not a friend function as defined by class FunctionDescriptor,
1138 // so do a full call-c here.
1139 load_const(R11, (address)fd, R0);
1140 return branch_to(R11, /*and_link=*/true,
1141 /*save toc=*/false,
1142 /*restore toc=*/false,
1143 /*load toc=*/true,
1144 /*load env=*/true);
1145 } else {
1146 // it's a friend function, load the entry point and don't care about
1147 // toc and env.
1148 address dest = fd->entry();
1149 if (is_within_range_of_b(dest, pc())) {
1150 bl(dest);
1151 } else {
1152 bl64_patchable(dest, rt);
1153 }
1154 _last_calls_return_pc = pc();
1155 return _last_calls_return_pc;
1156 }
1157 }
1158 }
1159
1160 // Call a C function. All constants needed reside in TOC.
1161 //
1162 // Read the address to call from the TOC.
1163 // Read env from TOC, if fd specifies an env.
1164 // Read new TOC from TOC.
1165 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1166 relocInfo::relocType rt, Register toc) {
1167 if (!ReoptimizeCallSequences
1168 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1169 || !fd->is_friend_function()) {
1170 // It's not a friend function as defined by class FunctionDescriptor,
1171 // so do a full call-c here.
1172 assert(fd->entry() != nullptr, "function must be linked");
1173
1174 AddressLiteral fd_entry(fd->entry());
1175 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1176 mtctr(R11);
1177 if (fd->env() == nullptr) {
1178 li(R11, 0);
1179 nop();
1180 } else {
1181 AddressLiteral fd_env(fd->env());
1182 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1183 }
1184 AddressLiteral fd_toc(fd->toc());
1185 // Set R2_TOC (load from toc)
1186 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1187 bctrl();
1188 _last_calls_return_pc = pc();
1189 if (!success) { return nullptr; }
1190 } else {
1191 // It's a friend function, load the entry point and don't care about
1192 // toc and env. Use an optimizable call instruction, but ensure the
1193 // same code-size as in the case of a non-friend function.
1194 nop();
1195 bl64_patchable(fd->entry(), rt);
1196 _last_calls_return_pc = pc();
1197 }
1198 return _last_calls_return_pc;
1199 }
1200 #endif // ABI_ELFv2
1201
// Emits an inline-cache call to `target`:
//   1. loads the "clear" inline cache value (Universe::non_oop_word()) from the
//      method TOC into R19_inline_cache_reg,
//   2. emits a trampoline call to target that carries a virtual_call relocation.
// The relocation spec is created from pc() before anything is emitted, i.e. it
// refers to the position of the inline cache load.
//   Rmethod_toc  - register holding the method TOC base
//   method_index - passed on to virtual_call_Relocation::spec
//   scratch_emit - passed on to trampoline_call
//   fixed_size   - passed on to load_const_from_method_toc
// Returns false if the TOC load or the trampoline call could not be emitted.
bool MacroAssembler::ic_call(Register Rmethod_toc,
                             address target,
                             jint method_index,
                             bool scratch_emit,
                             bool fixed_size) {
  AddressLiteral target_al(target, virtual_call_Relocation::spec(pc(), method_index));
  DEBUG_ONLY(int ic_load_offset = offset());

  // Load a clear inline cache.
  AddressLiteral empty_ic((address) Universe::non_oop_word());
  bool success = load_const_from_method_toc(R19_inline_cache_reg, empty_ic, Rmethod_toc, fixed_size);
  if (!success) return false;

  assert(MacroAssembler::is_load_const_from_method_toc_at(addr_at(ic_load_offset)),
         "should be load from TOC");

  address call_pc = trampoline_call(target_al, Rmethod_toc, scratch_emit);
  return call_pc != nullptr;
}
1221
// Emits a `bl` call site for `target` together with its trampoline stub.
//
// Unless scratch_emit is set, a stub is emitted first: the target address is
// placed into the constant pool and the stub loads it from there into
// R12_scratch2 and branches via CTR. The stub is annotated with a
// trampoline_stub relocation that refers to the pc of the `bl` emitted below.
// If Rmethod_toc is noreg, the method TOC address is computed into the scratch
// register inside the stub.
//
// The `bl` itself initially targets its own address and carries target's
// relocation; it is resolved / patched later.
//
// Returns the pc of the `bl`, or nullptr if the constant pool entry or the
// stub could not be allocated.
address MacroAssembler::trampoline_call(AddressLiteral target,
                                        Register Rmethod_toc,
                                        bool scratch_emit) {
  // First, emit the trampoline stub
  if (!scratch_emit) {
    RelocationHolder rh = trampoline_stub_Relocation::spec(pc() /* of the bl below */);

    // Put the target's entry point as a constant into the constant pool.
    const address target_toc_addr = address_constant((address)target.value());
    if (target_toc_addr == nullptr) return nullptr;

    const int target_toc_offset = offset_to_method_toc(target_toc_addr);
    address stub = start_a_stub(64);
    if (stub == nullptr) return nullptr;

    // Annotate the stub with a relocation that points to the owning call instruction.
    relocate(rh);
    DEBUG_ONLY(int stub_start_offset = offset());

    // For java_to_interp stubs we use R11_scratch1 as scratch register
    // and in call trampoline stubs we use R12_scratch2. This way we
    // can distinguish them (see is_NativeCallTrampolineStub_at()).
    Register reg_scratch = R12_scratch2;

    if (Rmethod_toc == noreg) {
      calculate_address_from_global_toc(reg_scratch, method_toc());
      Rmethod_toc = reg_scratch;
    }

    // Stub body: load destination from the constant pool and jump to it.
    ld_largeoffset_unchecked(reg_scratch, target_toc_offset, Rmethod_toc, false);
    mtctr(reg_scratch);
    bctr();

    assert(target_toc_offset == NativeCallTrampolineStub_at(addr_at(stub_start_offset))->destination_toc_offset(),
           "encoded offset into the constant pool must match");
    assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
    assert(is_NativeCallTrampolineStub_at(addr_at(stub_start_offset)), "doesn't look like a trampoline");

    // End the stub.
    end_a_stub();
  }

  // The call will be resolved / patched later.
  address call_pc = pc();
  relocate(target.rspec());
  bl(call_pc);
  return call_pc;
}
1270
1271 void MacroAssembler::post_call_nop() {
1272 // Make inline again when loom is always enabled.
1273 if (!Continuations::enabled()) {
1274 return;
1275 }
1276 // We use CMPI/CMPLI instructions to encode post call nops.
1277 // Refer to NativePostCallNop for details.
1278 relocate(post_call_nop_Relocation::spec());
1279 InlineSkippedInstructionsCounter skipCounter(this);
1280 Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
1281 assert(is_post_call_nop(*(int*)(pc() - 4)), "post call not not found");
1282 }
1283
1284 int MacroAssembler::ic_check_size() {
1285 bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1286 use_fast_receiver_null_check = implicit_null_checks_available || TrapBasedNullChecks,
1287 use_trap_based_null_check = !implicit_null_checks_available && TrapBasedNullChecks;
1288
1289 int num_ins;
1290 if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1291 num_ins = 3;
1292 if (use_trap_based_null_check) num_ins += 1;
1293 } else {
1294 num_ins = 7;
1295 if (!implicit_null_checks_available) num_ins += 2;
1296 }
1297
1298 if (UseCompactObjectHeaders) num_ins++;
1299
1300 return num_ins * BytesPerInstWord;
1301 }
1302
// Emits the inline cache check executed at the unverified entry point (UEP):
// the receiver's klass (receiver in R3_ARG1) is compared with the speculated
// klass stored in the CompiledICData referenced by R19_inline_cache_reg.
// R11_scratch1 and R12_scratch2 are used as temporaries.
//
// Padding is emitted BEFORE the check such that the code following it (the
// verified entry point) is aligned to end_alignment.
// Returns the code offset of the UEP, i.e. of the first check instruction.
int MacroAssembler::ic_check(int end_alignment) {
  // Must be kept in sync with ic_check_size(), which computes the size of the
  // sequence emitted below from the same conditions.
  bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
       use_fast_receiver_null_check   = implicit_null_checks_available || TrapBasedNullChecks,
       use_trap_based_null_check      = !implicit_null_checks_available && TrapBasedNullChecks;

  Register receiver = R3_ARG1;
  Register data = R19_inline_cache_reg;
  Register tmp1 = R11_scratch1;
  Register tmp2 = R12_scratch2;

  // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
  // before the inline cache check, so we don't have to execute any nop instructions when dispatching
  // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
  // before the inline cache check here, and not after
  align(end_alignment, end_alignment, end_alignment - ic_check_size());

  int uep_offset = offset();

  if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
    // Fast version which uses SIGTRAP

    if (use_trap_based_null_check) {
      trap_null_check(receiver);
    }
    load_klass_no_decode(tmp1, receiver); // 2 instructions with UseCompactObjectHeaders
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    trap_ic_miss_check(tmp1, tmp2);

  } else {
    // Slower version which doesn't use SIGTRAP

    // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
    calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
                                      true, true, false); // 2 instructions
    // CTR holds the IC miss stub; the conditional branches below jump there.
    mtctr(tmp1);

    if (!implicit_null_checks_available) {
      // Explicit null check: a null receiver goes to the IC miss stub.
      cmpdi(CR0, receiver, 0);
      beqctr(CR0);
    }
    load_klass_no_decode(tmp1, receiver); // 2 instructions with UseCompactObjectHeaders
    ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
    cmpd(CR0, tmp1, tmp2);
    bnectr(CR0);
  }

  assert((offset() % end_alignment) == 0, "Misaligned verified entry point");

  return uep_offset;
}
1353
// Common code for calls into the VM runtime.
//   oop_result       - if valid, receives the oop result fetched via get_vm_result_oop
//   last_java_sp     - register holding the last Java SP; R1_SP is used if invalid
//   entry_point      - address of the runtime function to call
//   check_exceptions - must be false: exception checking is not implemented here
//   last_java_pc     - passed on to set_top_ijava_frame_at_SP_as_last_Java_frame
// R3_ARG1 is set to R16_thread before the call; R11_scratch1 is passed as
// temp register when the last Java frame is set up.
// _last_calls_return_pc is set to the return pc of the C call.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register last_java_sp,
                                  address entry_point,
                                  bool check_exceptions,
                                  Label* last_java_pc) {
  BLOCK_COMMENT("call_VM {");
  // Determine last_java_sp register.
  if (!last_java_sp->is_valid()) {
    last_java_sp = R1_SP;
  }
  set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1, last_java_pc);

  // ARG1 must hold thread address.
  mr(R3_ARG1, R16_thread);
  address return_pc = call_c(entry_point, relocInfo::none);

  reset_last_Java_frame();

  // Check for pending exceptions.
  if (check_exceptions) {
    // We don't check for exceptions here.
    ShouldNotReachHere();
  }

  // Get oop result if there is one and reset the value in the thread.
  if (oop_result->is_valid()) {
    get_vm_result_oop(oop_result);
  }

  // Re-establish the return pc of the C call as the last call's return pc.
  _last_calls_return_pc = return_pc;
  BLOCK_COMMENT("} call_VM");
}
1386
// Common code for leaf calls into the VM: a plain C call to entry_point,
// bracketed by block comments. No last Java frame is set up and no thread
// argument is passed here.
void MacroAssembler::call_VM_leaf_base(address entry_point) {
  BLOCK_COMMENT("call_VM_leaf {");
  call_c(entry_point);
  BLOCK_COMMENT("} call_VM_leaf");
}
1392
// Call into the VM without explicit arguments. Forwards to call_VM_base with
// noreg as last_java_sp, which makes call_VM_base use R1_SP.
void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions, Label* last_java_pc) {
  call_VM_base(oop_result, noreg, entry_point, check_exceptions, last_java_pc);
}
1396
// Call into the VM with one argument. arg_1 is moved to R4_ARG2 (if it is not
// already there) before the call.
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread.
  mr_if_needed(R4_ARG2, arg_1);
  call_VM(oop_result, entry_point, check_exceptions);
}
1403
// Call into the VM with two arguments, passed in R4_ARG2 and R5_ARG3.
// arg_2 must not live in R4_ARG2 because that register is written first.
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  assert_different_registers(arg_2, R4_ARG2);
  mr_if_needed(R4_ARG2, arg_1);
  mr_if_needed(R5_ARG3, arg_2);
  call_VM(oop_result, entry_point, check_exceptions);
}
1412
// Call into the VM with three arguments, passed in R4_ARG2, R5_ARG3 and
// R6_ARG4. The asserts make sure that no argument lives in a register that is
// written by an earlier move.
void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
                             bool check_exceptions) {
  // R3_ARG1 is reserved for the thread
  assert_different_registers(arg_2, R4_ARG2);
  assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
  mr_if_needed(R4_ARG2, arg_1);
  mr_if_needed(R5_ARG3, arg_2);
  mr_if_needed(R6_ARG4, arg_3);
  call_VM(oop_result, entry_point, check_exceptions);
}
1423
// Leaf call into the VM without arguments; forwards to call_VM_leaf_base.
void MacroAssembler::call_VM_leaf(address entry_point) {
  call_VM_leaf_base(entry_point);
}
1427
// Leaf call into the VM with one argument, passed in R3_ARG1.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
  mr_if_needed(R3_ARG1, arg_1);
  call_VM_leaf(entry_point);
}
1432
// Leaf call into the VM with two arguments, passed in R3_ARG1 and R4_ARG2.
// arg_2 must not live in R3_ARG1 because that register is written first.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
  assert_different_registers(arg_2, R3_ARG1);
  mr_if_needed(R3_ARG1, arg_1);
  mr_if_needed(R4_ARG2, arg_2);
  call_VM_leaf(entry_point);
}
1439
// Leaf call into the VM with three arguments, passed in R3_ARG1, R4_ARG2 and
// R5_ARG3. The asserts make sure that no argument lives in a register that is
// written by an earlier move.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
  assert_different_registers(arg_2, R3_ARG1);
  assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
  mr_if_needed(R3_ARG1, arg_1);
  mr_if_needed(R4_ARG2, arg_2);
  mr_if_needed(R5_ARG3, arg_3);
  call_VM_leaf(entry_point);
}
1448
1449 // Check whether instruction is a read access to the polling page
1450 // which was emitted by load_from_polling_page(..).
1451 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1452 address* polling_address_ptr) {
1453 if (!is_ld(instruction))
1454 return false; // It's not a ld. Fail.
1455
1456 int rt = inv_rt_field(instruction);
1457 int ra = inv_ra_field(instruction);
1458 int ds = inv_ds_field(instruction);
1459 if (!(ds == 0 && ra != 0 && rt == 0)) {
1460 return false; // It's not a ld(r0, X, ra). Fail.
1461 }
1462
1463 if (!ucontext) {
1464 // Set polling address.
1465 if (polling_address_ptr != nullptr) {
1466 *polling_address_ptr = nullptr;
1467 }
1468 return true; // No ucontext given. Can't check value of ra. Assume true.
1469 }
1470
1471 #ifdef LINUX
1472 // Ucontext given. Check that register ra contains the address of
1473 // the safepoing polling page.
1474 ucontext_t* uc = (ucontext_t*) ucontext;
1475 // Set polling address.
1476 address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1477 if (polling_address_ptr != nullptr) {
1478 *polling_address_ptr = addr;
1479 }
1480 return SafepointMechanism::is_poll_address(addr);
1481 #else
1482 // Not on Linux, ucontext must be null.
1483 ShouldNotReachHere();
1484 return false;
1485 #endif
1486 }
1487
1488 void MacroAssembler::bang_stack_with_offset(int offset) {
1489 // When increasing the stack, the old stack pointer will be written
1490 // to the new top of stack according to the PPC64 abi.
1491 // Therefore, stack banging is not necessary when increasing
1492 // the stack by <= os::vm_page_size() bytes.
1493 // When increasing the stack by a larger amount, this method is
1494 // called repeatedly to bang the intermediate pages.
1495
1496 // Stack grows down, caller passes positive offset.
1497 assert(offset > 0, "must bang with positive offset");
1498
1499 long stdoffset = -offset;
1500
1501 if (is_simm(stdoffset, 16)) {
1502 // Signed 16 bit offset, a simple std is ok.
1503 if (UseLoadInstructionsForStackBangingPPC64) {
1504 ld(R0, (int)(signed short)stdoffset, R1_SP);
1505 } else {
1506 std(R0,(int)(signed short)stdoffset, R1_SP);
1507 }
1508 } else if (is_simm(stdoffset, 31)) {
1509 const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1510 const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1511
1512 Register tmp = R11;
1513 addis(tmp, R1_SP, hi);
1514 if (UseLoadInstructionsForStackBangingPPC64) {
1515 ld(R0, lo, tmp);
1516 } else {
1517 std(R0, lo, tmp);
1518 }
1519 } else {
1520 ShouldNotReachHere();
1521 }
1522 }
1523
1524 // If instruction is a stack bang of the form
1525 // std R0, x(Ry), (see bang_stack_with_offset())
1526 // stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame())
1527 // or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame())
1528 // return the banged address. Otherwise, return 0.
1529 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1530 #ifdef LINUX
1531 ucontext_t* uc = (ucontext_t*) ucontext;
1532 int rs = inv_rs_field(instruction);
1533 int ra = inv_ra_field(instruction);
1534 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64)
1535 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1536 || (is_stdu(instruction) && rs == 1)) {
1537 int ds = inv_ds_field(instruction);
1538 // return banged address
1539 return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1540 } else if (is_stdux(instruction) && rs == 1) {
1541 int rb = inv_rb_field(instruction);
1542 address sp = (address)uc->uc_mcontext.regs->gpr[1];
1543 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1544 return ra != 1 || rb_val >= 0 ? nullptr // not a stack bang
1545 : sp + rb_val; // banged address
1546 }
1547 return nullptr; // not a stack bang
1548 #else
1549 // workaround not needed on !LINUX :-)
1550 ShouldNotCallThis();
1551 return nullptr;
1552 #endif
1553 }
1554
// Emits the reserved stack zone check: compares SP (unsigned) with the
// thread's reserved_stack_activation value. If SP is below it, execution
// simply continues behind this sequence. Otherwise the reserved zone is
// re-enabled via a leaf call into the runtime and control is transferred to
// the delayed StackOverflowError entry with LR set to return_pc.
// R0 and CR0 are used as temporaries.
void MacroAssembler::reserved_stack_check(Register return_pc) {
  // Test if reserved zone needs to be enabled.
  Label no_reserved_zone_enabling;

  ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
  cmpld(CR0, R1_SP, R0);
  blt_predict_taken(CR0, no_reserved_zone_enabling);

  // Enable reserved zone again, throw stack overflow exception.
  // A temporary frame is pushed around the runtime call.
  push_frame_reg_args(0, R0);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
  pop_frame();
  mtlr(return_pc);
  load_const_optimized(R0, SharedRuntime::throw_delayed_StackOverflowError_entry());
  mtctr(R0);
  bctr();

  // The branch above does not return here.
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}
1576
// Atomic 64-bit exchange: emits a ldarx/stdcx_ retry loop that loads the old
// value at addr_base into dest_current_value and stores exchange_value.
// CR0 is set by the store-conditional. No memory barriers are emitted here.
//   cmpxchgx_hint - passed on to ldarx
void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
                                bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  stdcx_(exchange_value, addr_base);
  // Retry if the store-conditional failed.
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
  } else {
    bne(                  CR0, retry); // StXcx_ sets CR0.
  }
}
1589
// Atomic 64-bit fetch-and-add: emits a ldarx/stdcx_ retry loop that loads the
// old value at addr_base into dest_current_value and stores old + inc_value.
// tmp receives the sum and is overwritten; CR0 is set by the
// store-conditional. No memory barriers are emitted here.
//   cmpxchgx_hint - passed on to ldarx
void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
                                Register tmp, bool cmpxchgx_hint) {
  Label retry;
  bind(retry);
  ldarx(dest_current_value, addr_base, cmpxchgx_hint);
  add(tmp, dest_current_value, inc_value);
  stdcx_(tmp, addr_base);
  // Retry if the store-conditional failed.
  if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
    bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
  } else {
    bne(                  CR0, retry); // StXcx_ sets CR0.
  }
}
1603
1604 // Word/sub-word atomic helper functions
1605
1606 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1607 // Only signed types are supported with size < 4.
1608 // Atomic add always kills tmp1.
1609 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1610 Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1611 bool cmpxchgx_hint, bool is_add, int size) {
1612 // Sub-word instructions are available since Power 8.
1613
1614 Label retry;
1615 Register shift_amount = noreg,
1616 val32 = dest_current_value,
1617 modval = is_add ? tmp1 : exchange_value;
1618
1619
1620 // atomic emulation loop
1621 bind(retry);
1622
1623 switch (size) {
1624 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1625 case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1626 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1627 default: ShouldNotReachHere();
1628 }
1629
1630 if (is_add) { add(modval, dest_current_value, exchange_value); }
1631
1632
1633 switch (size) {
1634 case 4: stwcx_(modval, addr_base); break;
1635 case 2: sthcx_(modval, addr_base); break;
1636 case 1: stbcx_(modval, addr_base); break;
1637 default: ShouldNotReachHere();
1638 }
1639
1640 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1641 bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1642 } else {
1643 bne( CR0, retry); // StXcx_ sets CR0.
1644 }
1645
1646 // l?arx zero-extends, but Java wants byte/short values sign-extended.
1647 if (size == 1) {
1648 extsb(dest_current_value, dest_current_value);
1649 } else if (size == 2) {
1650 extsh(dest_current_value, dest_current_value);
1651 };
1652 }
1653
1654 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1655 // Only signed types are supported with size < 4.
1656 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1657 RegisterOrConstant compare_value, Register exchange_value,
1658 Register addr_base, Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1659 // Sub-word instructions are available since Power 8.
1660 Register shift_amount = noreg,
1661 val32 = dest_current_value,
1662 modval = exchange_value;
1663
1664 // atomic emulation loop
1665 bind(retry);
1666
1667 switch (size) {
1668 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1669 case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1670 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1671 default: ShouldNotReachHere();
1672 }
1673
1674 if (size == 1) {
1675 extsb(dest_current_value, dest_current_value);
1676 } else if (size == 2) {
1677 extsh(dest_current_value, dest_current_value);
1678 };
1679
1680 cmpw(flag, dest_current_value, compare_value);
1681 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1682 bne_predict_not_taken(flag, failed);
1683 } else {
1684 bne( flag, failed);
1685 }
1686 // branch to done => (flag == ne), (dest_current_value != compare_value)
1687 // fall through => (flag == eq), (dest_current_value == compare_value)
1688
1689 switch (size) {
1690 case 4: stwcx_(modval, addr_base); break;
1691 case 2: sthcx_(modval, addr_base); break;
1692 case 1: stbcx_(modval, addr_base); break;
1693 default: ShouldNotReachHere();
1694 }
1695 }
1696
// CmpxchgX sets condition register to cmpX(current, compare).
//
// Compare-and-exchange for 1, 2 or 4 byte values at addr_base.
//   flag               - condition register receiving the compare result;
//                        must be CR0 if weak is set
//   dest_current_value - receives the value loaded from memory
//   compare_value      - expected value (register or constant)
//   exchange_value     - value stored if memory matches compare_value
//   semantics          - bit set of MemBarRel / MemBarFenceAfter / MemBarAcq
//                        selecting the barriers emitted before / after
//   cmpxchgx_hint      - passed on to the load-reserve instruction
//   int_flag_success   - if not noreg, set to 1 on success and 0 on failure;
//                        must not be combined with failed_ext
//   failed_ext         - optional external label to branch to on failure
//   contention_hint    - emit a plain load and compare first and skip the
//                        reservation if the values already differ
//   weak               - do not retry when the store-conditional fails
//   size               - operand size in bytes: 1, 2 or 4
void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
                                     RegisterOrConstant compare_value, Register exchange_value,
                                     Register addr_base, int semantics, bool cmpxchgx_hint, Register int_flag_success,
                                     Label* failed_ext, bool contention_hint, bool weak, int size) {
  Label retry;
  Label failed_int;
  // Failure target: the caller's label if given, otherwise the internal one bound below.
  Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
  Label done;

  // Save one branch if result is returned via register and
  // result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
  assert(!weak || flag == CR0, "weak only supported with CR0");
  assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
  assert(size == 1 || size == 2 || size == 4, "unsupported");

  if (use_result_reg && preset_result_reg) {
    li(int_flag_success, 0); // preset (assume cas failed)
  }

  // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
  if (contention_hint) { // Don't try to reserve if cmp fails.
    // Plain (non-reserving) load, sign-extended for sub-word sizes.
    switch (size) {
      case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
      case 2: lha(dest_current_value, 0, addr_base); break;
      case 4: lwz(dest_current_value, 0, addr_base); break;
      default: ShouldNotReachHere();
    }
    cmpw(flag, dest_current_value, compare_value);
    bne(flag, failed);
  }

  // release/fence semantics
  if (semantics & MemBarRel) {
    release();
  }

  // Binds `retry` and emits load-reserve, compare, branch to `failed` and
  // store-conditional.
  cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base,
                    retry, failed, cmpxchgx_hint, size);
  // Evaluate the store-conditional: strong variants retry, weak variants
  // branch to `failed` (only emitted for weak if a result register or an
  // external failure label is used).
  if (!weak || use_result_reg || failed_ext) {
    if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
      bne_predict_not_taken(CR0, weak ? failed : retry); // StXcx_ sets CR0.
    } else {
      bne(                  CR0, weak ? failed : retry); // StXcx_ sets CR0.
    }
  }
  // fall through    => (flag == eq), (dest_current_value == compare_value), (swapped)

  // Result in register (must do this at the end because int_flag_success can be the
  // same register as one above).
  if (use_result_reg) {
    li(int_flag_success, 1);
  }

  if (semantics & MemBarFenceAfter) {
    fence();
  } else if (semantics & MemBarAcq) {
    isync();
  }

  // Without preset, the success path has to jump over the failure code below.
  if (use_result_reg && !preset_result_reg) {
    b(done);
  }

  bind(failed_int);
  if (use_result_reg && !preset_result_reg) {
    li(int_flag_success, 0);
  }

  bind(done);
  // (flag == ne) => (dest_current_value != compare_value), (!swapped)
  // (flag == eq) => (dest_current_value == compare_value), ( swapped)
}
1773
1774 // Performs atomic compare exchange:
1775 // if (compare_value == *addr_base)
1776 // *addr_base = exchange_value
1777 // int_flag_success = 1;
1778 // else
1779 // int_flag_success = 0;
1780 //
1781 // ConditionRegister flag = cmp(compare_value, *addr_base)
1782 // Register dest_current_value = *addr_base
1783 // Register compare_value Used to compare with value in memory
1784 // Register exchange_value Written to memory if compare_value == *addr_base
1785 // Register addr_base The memory location to compareXChange
1786 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base
1787 //
1788 // To avoid the costly compare exchange the value is tested beforehand.
1789 // Several special cases exist to avoid that unnecessary information is generated.
1790 //
1791 void MacroAssembler::cmpxchgd(ConditionRegister flag, Register dest_current_value,
1792 RegisterOrConstant compare_value, Register exchange_value,
1793 Register addr_base,
1794 int semantics, bool cmpxchgx_hint, Register int_flag_success,
1795 Label* failed_ext, bool contention_hint, bool weak) {
1796 Label retry;
1797 Label failed_int;
1798 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1799 Label done;
1800
1801 // Save one branch if result is returned via register and result register is different from the other ones.
1802 bool use_result_reg = (int_flag_success!=noreg);
1803 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1804 int_flag_success!=exchange_value && int_flag_success!=addr_base);
1805 assert(!weak || flag == CR0, "weak only supported with CR0");
1806 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1807
1808 if (use_result_reg && preset_result_reg) {
1809 li(int_flag_success, 0); // preset (assume cas failed)
1810 }
1811
1812 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1813 if (contention_hint) { // Don't try to reserve if cmp fails.
1814 ld(dest_current_value, 0, addr_base);
1815 cmpd(flag, dest_current_value, compare_value);
1816 bne(flag, failed);
1817 }
1818
1819 // release/fence semantics
1820 if (semantics & MemBarRel) {
1821 release();
1822 }
1823
1824 // atomic emulation loop
1825 bind(retry);
1826
1827 ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1828 cmpd(flag, dest_current_value, compare_value);
1829 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1830 bne_predict_not_taken(flag, failed);
1831 } else {
1832 bne( flag, failed);
1833 }
1834
1835 stdcx_(exchange_value, addr_base);
1836 if (!weak || use_result_reg || failed_ext) {
1837 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1838 bne_predict_not_taken(CR0, weak ? failed : retry); // stXcx_ sets CR0
1839 } else {
1840 bne( CR0, weak ? failed : retry); // stXcx_ sets CR0
1841 }
1842 }
1843
1844 // result in register (must do this at the end because int_flag_success can be the same register as one above)
1845 if (use_result_reg) {
1846 li(int_flag_success, 1);
1847 }
1848
1849 if (semantics & MemBarFenceAfter) {
1850 fence();
1851 } else if (semantics & MemBarAcq) {
1852 isync();
1853 }
1854
1855 if (use_result_reg && !preset_result_reg) {
1856 b(done);
1857 }
1858
1859 bind(failed_int);
1860 if (use_result_reg && !preset_result_reg) {
1861 li(int_flag_success, 0);
1862 }
1863
1864 bind(done);
1865 // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1866 // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1867 }
1868
1869 // Look up the method for a megamorphic invokeinterface call.
1870 // The target method is determined by <intf_klass, itable_index>.
1871 // The receiver klass is in recv_klass.
1872 // On success, the result will be in method_result, and execution falls through.
1873 // On failure, execution transfers to the given label.
1874 void MacroAssembler::lookup_interface_method(Register recv_klass,
1875 Register intf_klass,
1876 RegisterOrConstant itable_index,
1877 Register method_result,
1878 Register scan_temp,
1879 Register temp2,
1880 Label& L_no_such_interface,
1881 bool return_method) {
1882 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1883
1884 // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1885 int vtable_base = in_bytes(Klass::vtable_start_offset());
1886 int itentry_off = in_bytes(itableMethodEntry::method_offset());
1887 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize);
1888 int scan_step = itableOffsetEntry::size() * wordSize;
1889 int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1890
1891 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1892 // We should store the aligned, prescaled offset in the klass.
1893 // Then the next several instructions would fold away.
1894
1895 sldi(scan_temp, scan_temp, log_vte_size);
1896 addi(scan_temp, scan_temp, vtable_base);
1897 add(scan_temp, recv_klass, scan_temp);
1898
1899 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1900 if (return_method) {
1901 if (itable_index.is_register()) {
1902 Register itable_offset = itable_index.as_register();
1903 sldi(method_result, itable_offset, logMEsize);
1904 if (itentry_off) { addi(method_result, method_result, itentry_off); }
1905 add(method_result, method_result, recv_klass);
1906 } else {
1907 long itable_offset = (long)itable_index.as_constant();
1908 // static address, no relocation
1909 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1910 }
1911 }
1912
1913 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1914 // if (scan->interface() == intf) {
1915 // result = (klass + scan->offset() + itable_index);
1916 // }
1917 // }
1918 Label search, found_method;
1919
1920 for (int peel = 1; peel >= 0; peel--) {
1921 // %%%% Could load both offset and interface in one ldx, if they were
1922 // in the opposite order. This would save a load.
1923 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1924
1925 // Check that this entry is non-null. A null entry means that
1926 // the receiver class doesn't implement the interface, and wasn't the
1927 // same as when the caller was compiled.
1928 cmpd(CR0, temp2, intf_klass);
1929
1930 if (peel) {
1931 beq(CR0, found_method);
1932 } else {
1933 bne(CR0, search);
1934 // (invert the test to fall through to found_method...)
1935 }
1936
1937 if (!peel) break;
1938
1939 bind(search);
1940
1941 cmpdi(CR0, temp2, 0);
1942 beq(CR0, L_no_such_interface);
1943 addi(scan_temp, scan_temp, scan_step);
1944 }
1945
1946 bind(found_method);
1947
1948 // Got a hit.
1949 if (return_method) {
1950 int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1951 lwz(scan_temp, ito_offset, scan_temp);
1952 ldx(method_result, scan_temp, method_result);
1953 }
1954 }
1955
1956 // virtual method calling
1957 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1958 RegisterOrConstant vtable_index,
1959 Register method_result) {
1960
1961 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1962
1963 const ByteSize base = Klass::vtable_start_offset();
1964 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1965
1966 if (vtable_index.is_register()) {
1967 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1968 add(recv_klass, vtable_index.as_register(), recv_klass);
1969 } else {
1970 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1971 }
1972 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1973 }
1974
1975 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1976 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1977 Register super_klass,
1978 Register temp1_reg,
1979 Register temp2_reg,
1980 Label* L_success,
1981 Label* L_failure,
1982 Label* L_slow_path,
1983 RegisterOrConstant super_check_offset) {
1984
1985 const Register check_cache_offset = temp1_reg;
1986 const Register cached_super = temp2_reg;
1987
1988 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1989
1990 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1991 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1992
1993 bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1994 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1995
1996 Label L_fallthrough;
1997 int label_nulls = 0;
1998 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
1999 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
2000 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
2001 assert(label_nulls <= 1 ||
2002 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
2003 "at most one null in the batch, usually");
2004
2005 // If the pointers are equal, we are done (e.g., String[] elements).
2006 // This self-check enables sharing of secondary supertype arrays among
2007 // non-primary types such as array-of-interface. Otherwise, each such
2008 // type would need its own customized SSA.
2009 // We move this check to the front of the fast path because many
2010 // type checks are in fact trivially successful in this manner,
2011 // so we get a nicely predicted branch right at the start of the check.
2012 cmpd(CR0, sub_klass, super_klass);
2013 beq(CR0, *L_success);
2014
2015 // Check the supertype display:
2016 if (must_load_sco) {
2017 // The super check offset is always positive...
2018 lwz(check_cache_offset, sco_offset, super_klass);
2019 super_check_offset = RegisterOrConstant(check_cache_offset);
2020 // super_check_offset is register.
2021 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
2022 }
2023 // The loaded value is the offset from Klass.
2024
2025 ld(cached_super, super_check_offset, sub_klass);
2026 cmpd(CR0, cached_super, super_klass);
2027
2028 // This check has worked decisively for primary supers.
2029 // Secondary supers are sought in the super_cache ('super_cache_addr').
2030 // (Secondary supers are interfaces and very deeply nested subtypes.)
2031 // This works in the same check above because of a tricky aliasing
2032 // between the super_cache and the primary super display elements.
2033 // (The 'super_check_addr' can address either, as the case requires.)
2034 // Note that the cache is updated below if it does not help us find
2035 // what we need immediately.
2036 // So if it was a primary super, we can just fail immediately.
2037 // Otherwise, it's the slow path for us (no success at this point).
2038
2039 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
2040
2041 if (super_check_offset.is_register()) {
2042 beq(CR0, *L_success);
2043 cmpwi(CR0, super_check_offset.as_register(), sc_offset);
2044 if (L_failure == &L_fallthrough) {
2045 beq(CR0, *L_slow_path);
2046 } else {
2047 bne(CR0, *L_failure);
2048 FINAL_JUMP(*L_slow_path);
2049 }
2050 } else {
2051 if (super_check_offset.as_constant() == sc_offset) {
2052 // Need a slow path; fast failure is impossible.
2053 if (L_slow_path == &L_fallthrough) {
2054 beq(CR0, *L_success);
2055 } else {
2056 bne(CR0, *L_slow_path);
2057 FINAL_JUMP(*L_success);
2058 }
2059 } else {
2060 // No slow path; it's a fast decision.
2061 if (L_failure == &L_fallthrough) {
2062 beq(CR0, *L_success);
2063 } else {
2064 bne(CR0, *L_failure);
2065 FINAL_JUMP(*L_success);
2066 }
2067 }
2068 }
2069
2070 bind(L_fallthrough);
2071 #undef FINAL_JUMP
2072 }
2073
2074 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
2075 Register super_klass,
2076 Register temp1_reg,
2077 Register temp2_reg,
2078 Label* L_success,
2079 Register result_reg) {
2080 const Register array_ptr = temp1_reg; // current value from cache array
2081 const Register temp = temp2_reg;
2082
2083 assert_different_registers(sub_klass, super_klass, array_ptr, temp);
2084 assert(L_success == nullptr || result_reg == noreg, "can't have both");
2085
2086 int source_offset = in_bytes(Klass::secondary_supers_offset());
2087 int target_offset = in_bytes(Klass::secondary_super_cache_offset());
2088
2089 int length_offset = Array<Klass*>::length_offset_in_bytes();
2090 int base_offset = Array<Klass*>::base_offset_in_bytes();
2091
2092 Label hit, loop, failure, fallthru;
2093
2094 ld(array_ptr, source_offset, sub_klass);
2095
2096 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2097 lwz(temp, length_offset, array_ptr);
2098 cmpwi(CR0, temp, 0);
2099 beq(CR0, (L_success == nullptr) ? failure : fallthru); // indicate failure if length 0
2100
2101 mtctr(temp); // load ctr
2102
2103 bind(loop);
2104 // Oops in table are NO MORE compressed.
2105 ld(temp, base_offset, array_ptr);
2106 cmpd(CR0, temp, super_klass);
2107 beq(CR0, hit);
2108 addi(array_ptr, array_ptr, BytesPerWord);
2109 bdnz(loop);
2110
2111 bind(failure);
2112 if (result_reg != noreg) {
2113 li(result_reg, 1); // load non-zero result (indicates a miss)
2114 } else if (L_success == nullptr) {
2115 crandc(CR0, Assembler::equal, CR0, Assembler::equal); // miss indicated by CR0.ne
2116 }
2117 b(fallthru);
2118
2119 bind(hit);
2120 std(super_klass, target_offset, sub_klass); // save result to cache
2121 if (result_reg != noreg) {
2122 li(result_reg, 0); // load zero result (indicates a hit)
2123 } else if (L_success != nullptr) {
2124 b(*L_success);
2125 }
2126
2127 bind(fallthru);
2128 }
2129
2130 Register MacroAssembler::allocate_if_noreg(Register r,
2131 RegSetIterator<Register> &available_regs,
2132 RegSet ®s_to_push) {
2133 if (!r->is_valid()) {
2134 r = *available_regs++;
2135 regs_to_push += r;
2136 }
2137 return r;
2138 }
2139
2140 void MacroAssembler::push_set(RegSet set)
2141 {
2142 int spill_offset = 0;
2143 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
2144 spill_offset += wordSize;
2145 std(*it, -spill_offset, R1_SP);
2146 }
2147 }
2148
2149 void MacroAssembler::pop_set(RegSet set)
2150 {
2151 int spill_offset = 0;
2152 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
2153 spill_offset += wordSize;
2154 ld(*it, -spill_offset, R1_SP);
2155 }
2156 }
2157
2158 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
2159 Register super_klass,
2160 Register temp1_reg,
2161 Register temp2_reg,
2162 Label* L_success,
2163 Register result_reg) {
2164 RegSet temps = RegSet::of(temp1_reg, temp2_reg);
2165
2166 assert_different_registers(sub_klass, super_klass, temp1_reg, temp2_reg, result_reg, R0);
2167
2168 Register temp3_reg = noreg, temp4_reg = noreg;
2169 bool result_reg_provided = (result_reg != noreg); // otherwise, result will be in CR0
2170
2171 BLOCK_COMMENT("check_klass_subtype_slow_path_table");
2172
2173 RegSetIterator<Register> available_regs
2174 = (RegSet::range(R2, R12) - temps - sub_klass - super_klass).begin();
2175
2176 RegSet pushed_regs;
2177
2178 temp1_reg = allocate_if_noreg(temp1_reg, available_regs, pushed_regs);
2179 temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs);
2180 temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs);
2181 temp4_reg = allocate_if_noreg(temp4_reg, available_regs, pushed_regs);
2182 result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
2183
2184 push_set(pushed_regs);
2185
2186 lookup_secondary_supers_table_var(sub_klass, super_klass,
2187 temp1_reg, temp2_reg, temp3_reg, temp4_reg,
2188 result_reg);
2189
2190 if (L_success != nullptr || !result_reg_provided) {
2191 // result_reg may get overwritten by pop_set
2192 cmpdi(CR0, result_reg, 0);
2193 }
2194
2195 // Unspill the temp. registers:
2196 pop_set(pushed_regs);
2197
2198 if (L_success != nullptr) {
2199 beq(CR0, *L_success);
2200 }
2201 }
2202
2203 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
2204 Register super_klass,
2205 Register temp1_reg,
2206 Register temp2_reg,
2207 Label* L_success,
2208 Register result_reg) {
2209 if (UseSecondarySupersTable) {
2210 check_klass_subtype_slow_path_table(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, result_reg);
2211 } else {
2212 if (temp2_reg == noreg) temp2_reg = R0;
2213 check_klass_subtype_slow_path_linear(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, result_reg);
2214 }
2215 }
2216
2217 // Try fast path, then go to slow one if not successful
2218 void MacroAssembler::check_klass_subtype(Register sub_klass,
2219 Register super_klass,
2220 Register temp1_reg,
2221 Register temp2_reg,
2222 Label& L_success) {
2223 Label L_failure;
2224 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2225 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2226 bind(L_failure); // Fallthru if not successful.
2227 }
2228
2229 // scans count pointer sized words at [addr] for occurrence of value,
2230 // generic (count must be >0)
2231 // iff found: CR0 eq, scratch == 0
2232 void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) {
2233 Label Lloop, Lafter_loop, Lexit;
2234
2235 srdi_(scratch, count, 1);
2236 beq(CR0, Lafter_loop);
2237 mtctr(scratch);
2238
2239 bind(Lloop); // 2x unrolled
2240 ld(scratch, 0, addr);
2241 xor_(scratch, scratch, value);
2242 beq(CR0, Lexit);
2243 ld(scratch, 8, addr);
2244 xor_(scratch, scratch, value);
2245 beq(CR0, Lexit);
2246 addi(addr, addr, 2 * wordSize);
2247 bdnz(Lloop);
2248
2249 bind(Lafter_loop);
2250 andi_(scratch, count, 1);
2251 beq(CR0, Lexit); // if taken: CR0 eq and scratch == 0
2252 ld(scratch, 0, addr);
2253 xor_(scratch, scratch, value);
2254
2255 bind(Lexit);
2256 }
2257
// Ensure that the inline code and the stub are using the same registers.
// Expands to a single assert over the register aliases r_super_klass,
// r_array_base, r_array_length, r_array_index, r_sub_klass, r_bitmap and
// result, all of which must be in scope where the macro is used.
// r_array_index, r_sub_klass, r_bitmap and result are also accepted as noreg.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS                       \
do {                                                                  \
  assert(r_super_klass  == R4_ARG2                                 && \
         r_array_base   == R3_ARG1                                 && \
         r_array_length == R7_ARG5                                 && \
         (r_array_index == R6_ARG4      || r_array_index == noreg) && \
         (r_sub_klass   == R5_ARG3      || r_sub_klass   == noreg) && \
         (r_bitmap      == R11_scratch1 || r_bitmap      == noreg) && \
         (result        == R8_ARG6      || result        == noreg), "registers must match ppc64.ad"); \
} while(0)
2269
2270 void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
2271 Register r_super_klass,
2272 Register temp1,
2273 Register temp2,
2274 Register temp3,
2275 Register temp4,
2276 Register result,
2277 u1 super_klass_slot) {
2278 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
2279
2280 Label L_done;
2281
2282 BLOCK_COMMENT("lookup_secondary_supers_table_const {");
2283
2284 const Register
2285 r_array_base = temp1,
2286 r_array_length = temp2,
2287 r_array_index = temp3,
2288 r_bitmap = temp4;
2289
2290 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; // Required for stub call below.
2291
2292 ld(r_bitmap, in_bytes(Klass::secondary_supers_bitmap_offset()), r_sub_klass);
2293
2294 // First check the bitmap to see if super_klass might be present. If
2295 // the bit is zero, we are certain that super_klass is not one of
2296 // the secondary supers.
2297 u1 bit = super_klass_slot;
2298 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
2299
2300 // if (shift_count == 0) this is used for comparing with 0:
2301 sldi_(r_array_index, r_bitmap, shift_count);
2302
2303 li(result, 1); // failure
2304 // We test the MSB of r_array_index, i.e. its sign bit
2305 bge(CR0, L_done);
2306
2307 // We will consult the secondary-super array.
2308 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2309
2310 // The value i in r_array_index is >= 1, so even though r_array_base
2311 // points to the length, we don't need to adjust it to point to the
2312 // data.
2313 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
2314
2315 // Get the first array index that can contain super_klass.
2316 if (bit != 0) {
2317 popcntd(r_array_index, r_array_index);
2318 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
2319 sldi(r_array_index, r_array_index, LogBytesPerWord); // scale
2320 ldx(result, r_array_base, r_array_index);
2321 } else {
2322 // Actually use index 0, but r_array_base and r_array_index are off by 1 word
2323 // such that the sum is precise.
2324 ld(result, BytesPerWord, r_array_base);
2325 li(r_array_index, BytesPerWord); // for slow path (scaled)
2326 }
2327
2328 xor_(result, result, r_super_klass);
2329 beq(CR0, L_done); // Found a match (result == 0)
2330
2331 // Is there another entry to check? Consult the bitmap.
2332 testbitdi(CR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
2333 beq(CR0, L_done); // (result != 0)
2334
2335 // Linear probe. Rotate the bitmap so that the next bit to test is
2336 // in Bit 2 for the look-ahead check in the slow path.
2337 if (bit != 0) {
2338 rldicl(r_bitmap, r_bitmap, 64 - bit, 0);
2339 }
2340
2341 // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
2342 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
2343 // Kills: r_array_length.
2344 // Returns: result.
2345 address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub();
2346 Register r_stub_addr = r_array_length;
2347 add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0);
2348 mtctr(r_stub_addr);
2349 bctrl();
2350
2351 bind(L_done);
2352 BLOCK_COMMENT("} lookup_secondary_supers_table_const");
2353
2354 if (VerifySecondarySupers) {
2355 verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
2356 temp1, temp2, temp3);
2357 }
2358 }
2359
2360 // At runtime, return 0 in result if r_super_klass is a superclass of
2361 // r_sub_klass, otherwise return nonzero. Use this version of
2362 // lookup_secondary_supers_table() if you don't know ahead of time
2363 // which superclass will be searched for. Used by interpreter and
2364 // runtime stubs. It is larger and has somewhat greater latency than
2365 // the version above, which takes a constant super_klass_slot.
2366 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
2367 Register r_super_klass,
2368 Register temp1,
2369 Register temp2,
2370 Register temp3,
2371 Register temp4,
2372 Register result) {
2373 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result, R0);
2374
2375 Label L_done;
2376
2377 BLOCK_COMMENT("lookup_secondary_supers_table_var {");
2378
2379 const Register
2380 r_array_base = temp1,
2381 slot = temp2,
2382 r_array_index = temp3,
2383 r_bitmap = temp4;
2384
2385 lbz(slot, in_bytes(Klass::hash_slot_offset()), r_super_klass);
2386 ld(r_bitmap, in_bytes(Klass::secondary_supers_bitmap_offset()), r_sub_klass);
2387
2388 li(result, 1); // Make sure that result is nonzero if the test below misses.
2389
2390 // First check the bitmap to see if super_klass might be present. If
2391 // the bit is zero, we are certain that super_klass is not one of
2392 // the secondary supers.
2393 xori(R0, slot, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1); // slot ^ 63 === 63 - slot (mod 64)
2394 sld_(r_array_index, r_bitmap, R0); // shift left by 63-slot
2395
2396 // We test the MSB of r_array_index, i.e. its sign bit
2397 bge(CR0, L_done);
2398
2399 // We will consult the secondary-super array.
2400 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2401
2402 // The value i in r_array_index is >= 1, so even though r_array_base
2403 // points to the length, we don't need to adjust it to point to the data.
2404 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
2405 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
2406
2407 // Get the first array index that can contain super_klass into r_array_index.
2408 popcntd(r_array_index, r_array_index);
2409
2410 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
2411 sldi(r_array_index, r_array_index, LogBytesPerWord); // scale
2412
2413 ldx(R0, r_array_base, r_array_index);
2414 xor_(result, R0, r_super_klass);
2415 beq(CR0, L_done); // found a match, result is 0 in this case
2416
2417 // Linear probe. Rotate the bitmap so that the next bit to test is
2418 // in Bit 1.
2419 neg(R0, slot); // rotate right
2420 rldcl(r_bitmap, r_bitmap, R0, 0);
2421 Register temp = slot;
2422 andi_(temp, r_bitmap, 2);
2423 beq(CR0, L_done); // fail (result != 0)
2424
2425 // The slot we just inspected is at secondary_supers[r_array_index - 1].
2426 // The next slot to be inspected, by the logic we're about to call,
2427 // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
2428 // have been checked.
2429 lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
2430 r_bitmap, result, temp);
2431 // return whatever we got from slow path
2432
2433 bind(L_done);
2434
2435 BLOCK_COMMENT("} lookup_secondary_supers_table_var");
2436
2437 if (VerifySecondarySupers) {
2438 verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
2439 temp1, temp2, temp3);
2440 }
2441 }
2442
2443 // Called by code generated by check_klass_subtype_slow_path
2444 // above. This is called when there is a collision in the hashed
2445 // lookup in the secondary supers array.
2446 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
2447 Register r_array_base,
2448 Register r_array_index,
2449 Register r_bitmap,
2450 Register result,
2451 Register temp1) {
2452 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
2453
2454 const Register
2455 r_array_length = temp1,
2456 r_sub_klass = noreg;
2457
2458 Label L_done;
2459
2460 // Load the array length.
2461 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
2462 // And adjust the array base to point to the data.
2463 // NB! Effectively increments current slot index by 1.
2464 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
2465 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
2466
2467 // Linear probe
2468 Label L_huge;
2469
2470 // The bitmap is full to bursting.
2471 // Implicit invariant: BITMAP_FULL implies (length > 0)
2472 cmpwi(CR0, r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
2473 bgt(CR0, L_huge);
2474
2475 // NB! Our caller has checked bits 0 and 1 in the bitmap. The
2476 // current slot (at secondary_supers[r_array_index]) has not yet
2477 // been inspected, and r_array_index may be out of bounds if we
2478 // wrapped around the end of the array.
2479
2480 { // This is conventional linear probing, but instead of terminating
2481 // when a null entry is found in the table, we maintain a bitmap
2482 // in which a 0 indicates missing entries.
2483 // The check above guarantees there are 0s in the bitmap, so the loop
2484 // eventually terminates.
2485
2486 #ifdef ASSERT
2487 {
2488 // We should only reach here after having found a bit in the bitmap.
2489 // Invariant: array_length == popcount(bitmap)
2490 Label ok;
2491 cmpdi(CR0, r_array_length, 0);
2492 bgt(CR0, ok);
2493 stop("array_length must be positive");
2494 bind(ok);
2495 }
2496 #endif
2497
2498 // Compute limit in r_array_length
2499 addi(r_array_length, r_array_length, -1);
2500 sldi(r_array_length, r_array_length, LogBytesPerWord);
2501
2502 Label L_loop;
2503 bind(L_loop);
2504
2505 // Check for wraparound.
2506 cmpd(CR0, r_array_index, r_array_length);
2507 isel_0(r_array_index, CR0, Assembler::greater);
2508
2509 ldx(result, r_array_base, r_array_index);
2510 xor_(result, result, r_super_klass);
2511 beq(CR0, L_done); // success (result == 0)
2512
2513 // look-ahead check (Bit 2); result is non-zero
2514 testbitdi(CR0, R0, r_bitmap, 2);
2515 beq(CR0, L_done); // fail (result != 0)
2516
2517 rldicl(r_bitmap, r_bitmap, 64 - 1, 0);
2518 addi(r_array_index, r_array_index, BytesPerWord);
2519 b(L_loop);
2520 }
2521
2522 { // Degenerate case: more than 64 secondary supers.
2523 // FIXME: We could do something smarter here, maybe a vectorized
2524 // comparison or a binary search, but is that worth any added
2525 // complexity?
2526 bind(L_huge);
2527 repne_scan(r_array_base, r_super_klass, r_array_length, result);
2528 }
2529
2530 bind(L_done);
2531 }
2532
2533 // Make sure that the hashed lookup and a linear scan agree.
2534 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
2535 Register r_super_klass,
2536 Register result,
2537 Register temp1,
2538 Register temp2,
2539 Register temp3) {
2540 assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3);
2541
2542 const Register
2543 r_array_base = temp1,
2544 r_array_length = temp2,
2545 r_array_index = temp3,
2546 r_bitmap = noreg; // unused
2547
2548 BLOCK_COMMENT("verify_secondary_supers_table {");
2549
2550 Label passed, failure;
2551
2552 // We will consult the secondary-super array.
2553 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2554 // Load the array length.
2555 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
2556 // And adjust the array base to point to the data.
2557 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
2558
2559 // convert !=0 to 1
2560 normalize_bool(result, R0, true);
2561 const Register linear_result = r_array_index; // reuse
2562 li(linear_result, 1);
2563 cmpdi(CR0, r_array_length, 0);
2564 ble(CR0, failure);
2565 repne_scan(r_array_base, r_super_klass, r_array_length, linear_result);
2566 bind(failure);
2567
2568 // convert !=0 to 1
2569 normalize_bool(linear_result, R0, true);
2570
2571 cmpd(CR0, result, linear_result);
2572 beq(CR0, passed);
2573
2574 // report fatal error and terminate VM
2575
2576 // Argument shuffle. Using stack to avoid clashes.
2577 std(r_super_klass, -8, R1_SP);
2578 std(r_sub_klass, -16, R1_SP);
2579 std(linear_result, -24, R1_SP);
2580 mr_if_needed(R6_ARG4, result);
2581 ld(R3_ARG1, -8, R1_SP);
2582 ld(R4_ARG2, -16, R1_SP);
2583 ld(R5_ARG3, -24, R1_SP);
2584
2585 const char* msg = "mismatch";
2586 load_const_optimized(R7_ARG5, (intptr_t)msg, R0);
2587 call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
2588 should_not_reach_here();
2589
2590 bind(passed);
2591
2592 BLOCK_COMMENT("} verify_secondary_supers_table");
2593 }
2594
2595 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2596 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2597
2598 Label L_check_thread, L_fallthrough;
2599 if (L_fast_path == nullptr) {
2600 L_fast_path = &L_fallthrough;
2601 } else if (L_slow_path == nullptr) {
2602 L_slow_path = &L_fallthrough;
2603 }
2604
2605 // Fast path check: class is fully initialized
2606 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2607 // acquire by cmp-branch-isync if fully_initialized
2608 cmpwi(CR0, R0, InstanceKlass::fully_initialized);
2609 bne(CR0, L_check_thread);
2610 isync();
2611 b(*L_fast_path);
2612
2613 // Fast path check: current thread is initializer thread
2614 bind(L_check_thread);
2615 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2616 cmpd(CR0, thread, R0);
2617 if (L_slow_path == &L_fallthrough) {
2618 beq(CR0, *L_fast_path);
2619 } else if (L_fast_path == &L_fallthrough) {
2620 bne(CR0, *L_slow_path);
2621 } else {
2622 Unimplemented();
2623 }
2624
2625 bind(L_fallthrough);
2626 }
2627
2628 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2629 Register temp_reg,
2630 int extra_slot_offset) {
2631 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2632 int stackElementSize = Interpreter::stackElementSize;
2633 int offset = extra_slot_offset * stackElementSize;
2634 if (arg_slot.is_constant()) {
2635 offset += arg_slot.as_constant() * stackElementSize;
2636 return offset;
2637 } else {
2638 assert(temp_reg != noreg, "must specify");
2639 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2640 if (offset != 0)
2641 addi(temp_reg, temp_reg, offset);
2642 return temp_reg;
2643 }
2644 }
2645
2646 void MacroAssembler::tlab_allocate(
2647 Register obj, // result: pointer to object after successful allocation
2648 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
2649 int con_size_in_bytes, // object size in bytes if known at compile time
2650 Register t1, // temp register
2651 Label& slow_case // continuation point if fast allocation fails
2652 ) {
2653 // make sure arguments make sense
2654 assert_different_registers(obj, var_size_in_bytes, t1);
2655 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2656 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2657
2658 const Register new_top = t1;
2659 //verify_tlab(); not implemented
2660
2661 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2662 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2663 if (var_size_in_bytes == noreg) {
2664 addi(new_top, obj, con_size_in_bytes);
2665 } else {
2666 add(new_top, obj, var_size_in_bytes);
2667 }
2668 cmpld(CR0, new_top, R0);
2669 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CR0, Assembler::greater), slow_case);
2670
2671 #ifdef ASSERT
2672 // make sure new free pointer is properly aligned
2673 {
2674 Label L;
2675 andi_(R0, new_top, MinObjAlignmentInBytesMask);
2676 beq(CR0, L);
2677 stop("updated TLAB free is not properly aligned");
2678 bind(L);
2679 }
2680 #endif // ASSERT
2681
2682 // update the tlab top pointer
2683 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2684 //verify_tlab(); not implemented
2685 }
2686
// Emits the compiled-code fast path for monitor enter on 'obj'.
//   flag - condition register carrying the result; must be CR0.
//          EQ on exit means the lock was acquired, NE means the slow path must be taken.
//   obj  - the object to lock.
//   box  - the on-stack lock slot. In this code it is only written when
//          UseObjectMonitorTable is set (its object monitor cache field).
//   tmp1, tmp2, tmp3 - temp registers; tmp3 must be noreg unless UseObjectMonitorTable is set.
// R0 is used as an additional scratch register.
void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register obj, Register box,
                                               Register tmp1, Register tmp2, Register tmp3) {
  assert_different_registers(obj, box, tmp1, tmp2, tmp3);
  assert(UseObjectMonitorTable || tmp3 == noreg, "tmp3 not needed");
  assert(flag == CR0, "bad condition register");

  // Handle inflated monitor.
  Label inflated;
  // Finish fast lock successfully. MUST be reached with flag == EQ (checked below in debug builds).
  Label locked;
  // Finish fast lock unsuccessfully. MUST be branched to with flag == NE (checked below in debug builds).
  Label slow_path;

  if (UseObjectMonitorTable) {
    // Clear cache in case fast locking succeeds or we need to take the slow-path.
    // NOTE(review): this store adds BasicObjectLock::lock_offset() while the store at
    // monitor_locked below does not -- confirm both address the same field.
    li(tmp1, 0);
    std(tmp1, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
  }

  if (DiagnoseSyncOnValueBasedClasses != 0) {
    // Objects of value based classes always take the slow path.
    load_klass(tmp1, obj);
    lbz(tmp1, in_bytes(Klass::misc_flags_offset()), tmp1);
    testbitdi(CR0, R0, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
    bne(CR0, slow_path);
  }

  Register mark = tmp1;

  { // Fast locking

    // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ
    Label push;

    const Register top = tmp2;

    // Check if lock-stack is full.
    lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
    cmplwi(CR0, top, LockStack::end_offset() - 1);
    bgt(CR0, slow_path);

    // The underflow check is elided. The recursive check will always fail
    // when the lock stack is empty because of the _bad_oop_sentinel field.

    // Check if recursive.
    subi(R0, top, oopSize);
    ldx(R0, R16_thread, R0);
    cmpd(CR0, obj, R0);
    beq(CR0, push);

    // Check for monitor (0b10) or locked (0b00).
    ld(mark, oopDesc::mark_offset_in_bytes(), obj);
    andi_(R0, mark, markWord::lock_mask_in_place);
    cmpldi(CR0, R0, markWord::unlocked_value);
    bgt(CR0, inflated);
    bne(CR0, slow_path);

    // Not inflated.

    // Try to lock. Transition lock bits 0b01 => 0b00
    assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
    atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq);

    bind(push);
    // After successful lock, push object on lock-stack.
    // The following instructions do not modify CR0.
    stdx(obj, R16_thread, top);
    addi(top, top, oopSize);
    stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
    b(locked);
  }

  { // Handle inflated monitor.
    bind(inflated);

    // mark contains the tagged ObjectMonitor*.
    const uintptr_t monitor_tag = markWord::monitor_value;
    const Register monitor    = UseObjectMonitorTable ? tmp1 : noreg;
    const Register owner_addr = tmp2;
    const Register thread_id  = UseObjectMonitorTable ? tmp3 : tmp1;
    Label monitor_locked;

    if (!UseObjectMonitorTable) {
      // Compute owner address.
      // The tag is removed and the owner offset is added in a single addi.
      addi(owner_addr, mark, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag);
      mark = noreg;
    } else {
      const Register tmp3_bucket = tmp3;
      const Register tmp2_hash = tmp2;
      Label monitor_found;

      // Save the mark, we might need it to extract the hash.
      mr(tmp2_hash, mark);

      // Look for the monitor in the om_cache.
      // The loop is unrolled at code generation time: one compare per cache entry.

      ByteSize cache_offset   = JavaThread::om_cache_oops_offset();
      ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
      const int num_unrolled  = OMCache::CAPACITY;
      for (int i = 0; i < num_unrolled; i++) {
        ld(R0, in_bytes(cache_offset), R16_thread);
        ld(monitor, in_bytes(cache_offset + monitor_offset), R16_thread);
        cmpd(CR0, R0, obj);
        beq(CR0, monitor_found);
        cache_offset = cache_offset + OMCache::oop_to_oop_difference();
      }

      // Look for the monitor in the table.

      // Get the hash code.
      srdi(tmp2_hash, tmp2_hash, markWord::hash_shift);

      // Get the table and calculate the bucket's address
      int simm16_rest = load_const_optimized(tmp3, ObjectMonitorTable::current_table_address(), R0, true);
      ld_ptr(tmp3, simm16_rest, tmp3);
      ld(tmp1, in_bytes(ObjectMonitorTable::table_capacity_mask_offset()), tmp3);
      andr(tmp2_hash, tmp2_hash, tmp1);
      ld(tmp3_bucket, in_bytes(ObjectMonitorTable::table_buckets_offset()), tmp3);

      // Read the monitor from the bucket.
      sldi(tmp2_hash, tmp2_hash, LogBytesPerWord);
      ldx(monitor, tmp3_bucket, tmp2_hash);

      // Check if the monitor in the bucket is special (empty, tombstone or removed).
      // "less than" implies "not equal", so slow_path is reached with flag == NE.
      cmpldi(CR0, monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
      blt(CR0, slow_path);

      // Check if object matches.
      ld(tmp3, in_bytes(ObjectMonitor::object_offset()), monitor);
      BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
      bs_asm->try_peek_weak_handle_in_nmethod(this, tmp3, tmp3, tmp2, slow_path);
      cmpd(CR0, tmp3, obj);
      bne(CR0, slow_path);

      bind(monitor_found);

      // Compute owner address.
      addi(owner_addr, monitor, in_bytes(ObjectMonitor::owner_offset()));
    }

    // Try to CAS owner (no owner => current thread's _monitor_owner_id).
    assert_different_registers(thread_id, monitor, owner_addr, box, R0);
    ld(thread_id, in_bytes(JavaThread::monitor_owner_id_offset()), R16_thread);
    cmpxchgd(/*flag=*/CR0,
             /*current_value=*/R0,
             /*compare_value=*/(intptr_t)0,
             /*exchange_value=*/thread_id,
             /*where=*/owner_addr,
             MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
             MacroAssembler::cmpxchgx_hint_acquire_lock());
    beq(CR0, monitor_locked);

    // Check if recursive.
    // R0 holds the owner value observed by the failed CAS.
    cmpd(CR0, R0, thread_id);
    bne(CR0, slow_path);

    // Recursive.
    // Increment the recursion counter. CR0 stays EQ from the compare above.
    if (!UseObjectMonitorTable) {
      assert_different_registers(tmp1, owner_addr);
      ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
      addi(tmp1, tmp1, 1);
      std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
    } else {
      assert_different_registers(tmp2, monitor);
      ld(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
      addi(tmp2, tmp2, 1);
      std(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
    }

    bind(monitor_locked);
    if (UseObjectMonitorTable) {
      // Remember the monitor in the box; compiler_fast_unlock_object reads it from there.
      std(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
    }
  }

  bind(locked);

#ifdef ASSERT
  // Check that locked label is reached with flag == EQ.
  Label flag_correct;
  beq(CR0, flag_correct);
  stop("Fast Lock Flag != EQ");
#endif
  bind(slow_path);
#ifdef ASSERT
  // Check that slow_path label is reached with flag == NE.
  bne(CR0, flag_correct);
  stop("Fast Lock Flag != NE");
  bind(flag_correct);
#endif
  // C2 uses the value of flag (NE vs EQ) to determine the continuation.
}
2878
2879 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register obj, Register box,
2880 Register tmp1, Register tmp2, Register tmp3) {
2881 assert_different_registers(obj, tmp1, tmp2, tmp3);
2882 assert(flag == CR0, "bad condition register");
2883
2884 // Handle inflated monitor.
2885 Label inflated, inflated_load_monitor;
2886 // Finish fast unlock successfully. MUST reach to with flag == EQ.
2887 Label unlocked;
2888 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE.
2889 Label slow_path;
2890
2891 const Register mark = tmp1;
2892 const Register top = tmp2;
2893 const Register t = tmp3;
2894
2895 { // Fast unlock
2896 Label push_and_slow;
2897
2898 // Check if obj is top of lock-stack.
2899 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2900 subi(top, top, oopSize);
2901 ldx(t, R16_thread, top);
2902 cmpd(CR0, obj, t);
2903 // Top of lock stack was not obj. Must be monitor.
2904 bne(CR0, inflated_load_monitor);
2905
2906 // Pop lock-stack.
2907 DEBUG_ONLY(li(t, 0);)
2908 DEBUG_ONLY(stdx(t, R16_thread, top);)
2909 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2910
2911 // The underflow check is elided. The recursive check will always fail
2912 // when the lock stack is empty because of the _bad_oop_sentinel field.
2913
2914 // Check if recursive.
2915 subi(t, top, oopSize);
2916 ldx(t, R16_thread, t);
2917 cmpd(CR0, obj, t);
2918 beq(CR0, unlocked);
2919
2920 // Not recursive.
2921
2922 // Check for monitor (0b10).
2923 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2924 andi_(t, mark, markWord::monitor_value);
2925 if (!UseObjectMonitorTable) {
2926 bne(CR0, inflated);
2927 } else {
2928 bne(CR0, push_and_slow);
2929 }
2930
2931 #ifdef ASSERT
2932 // Check header not unlocked (0b01).
2933 Label not_unlocked;
2934 andi_(t, mark, markWord::unlocked_value);
2935 beq(CR0, not_unlocked);
2936 stop("fast_unlock already unlocked");
2937 bind(not_unlocked);
2938 #endif
2939
2940 // Try to unlock. Transition lock bits 0b00 => 0b01
2941 atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel);
2942 b(unlocked);
2943
2944 bind(push_and_slow);
2945 // Restore lock-stack and handle the unlock in runtime.
2946 DEBUG_ONLY(stdx(obj, R16_thread, top);)
2947 addi(top, top, oopSize);
2948 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2949 b(slow_path);
2950 }
2951
2952 { // Handle inflated monitor.
2953 bind(inflated_load_monitor);
2954 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2955 #ifdef ASSERT
2956 andi_(t, mark, markWord::monitor_value);
2957 bne(CR0, inflated);
2958 stop("Fast Unlock not monitor");
2959 #endif
2960
2961 bind(inflated);
2962
2963 #ifdef ASSERT
2964 Label check_done;
2965 subi(top, top, oopSize);
2966 cmplwi(CR0, top, in_bytes(JavaThread::lock_stack_base_offset()));
2967 blt(CR0, check_done);
2968 ldx(t, R16_thread, top);
2969 cmpd(CR0, obj, t);
2970 bne(CR0, inflated);
2971 stop("Fast Unlock lock on stack");
2972 bind(check_done);
2973 #endif
2974
2975 // mark contains the tagged ObjectMonitor*.
2976 const Register monitor = mark;
2977 const uintptr_t monitor_tag = markWord::monitor_value;
2978
2979 if (!UseObjectMonitorTable) {
2980 // Untag the monitor.
2981 subi(monitor, mark, monitor_tag);
2982 } else {
2983 ld(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
2984 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
2985 cmpldi(CR0, monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
2986 blt(CR0, slow_path);
2987 }
2988
2989 const Register recursions = tmp2;
2990 Label not_recursive;
2991
2992 // Check if recursive.
2993 ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2994 addic_(recursions, recursions, -1);
2995 blt(CR0, not_recursive);
2996
2997 // Recursive unlock.
2998 std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2999 crorc(CR0, Assembler::equal, CR0, Assembler::equal);
3000 b(unlocked);
3001
3002 bind(not_recursive);
3003
3004 // Set owner to null.
3005 // Release to satisfy the JMM
3006 release();
3007 li(t, 0);
3008 std(t, in_bytes(ObjectMonitor::owner_offset()), monitor);
3009 // We need a full fence after clearing owner to avoid stranding.
3010 // StoreLoad achieves this.
3011 membar(StoreLoad);
3012
3013 // Check if the entry_list is empty.
3014 ld(t, in_bytes(ObjectMonitor::entry_list_offset()), monitor);
3015 cmpdi(CR0, t, 0);
3016 beq(CR0, unlocked); // If so we are done.
3017
3018 // Check if there is a successor.
3019 ld(t, in_bytes(ObjectMonitor::succ_offset()), monitor);
3020 cmpdi(CR0, t, 0);
3021 // Invert equal bit
3022 crnand(flag, Assembler::equal, flag, Assembler::equal);
3023 beq(CR0, unlocked); // If there is a successor we are done.
3024
3025 // Save the monitor pointer in the current thread, so we can try
3026 // to reacquire the lock in SharedRuntime::monitor_exit_helper().
3027 std(monitor, in_bytes(JavaThread::unlocked_inflated_monitor_offset()), R16_thread);
3028 b(slow_path); // flag == NE
3029 }
3030
3031 bind(unlocked);
3032
3033 #ifdef ASSERT
3034 // Check that unlocked label is reached with flag == EQ.
3035 Label flag_correct;
3036 beq(CR0, flag_correct);
3037 stop("Fast Lock Flag != EQ");
3038 #endif
3039 bind(slow_path);
3040 #ifdef ASSERT
3041 // Check that slow_path label is reached with flag == NE.
3042 bne(CR0, flag_correct);
3043 stop("Fast Lock Flag != NE");
3044 bind(flag_correct);
3045 #endif
3046 // C2 uses the value of flag (NE vs EQ) to determine the continuation.
3047 }
3048
3049 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
3050 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
3051
3052 if (at_return) {
3053 if (in_nmethod) {
3054 if (UseSIGTRAP) {
3055 // Use Signal Handler.
3056 relocate(relocInfo::poll_return_type);
3057 td(traptoGreaterThanUnsigned, R1_SP, temp);
3058 } else {
3059 cmpld(CR0, R1_SP, temp);
3060 // Stub may be out of range for short conditional branch.
3061 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CR0, Assembler::greater), slow_path);
3062 }
3063 } else { // Not in nmethod.
3064 // Frame still on stack, need to get fp.
3065 Register fp = R0;
3066 ld(fp, _abi0(callers_sp), R1_SP);
3067 cmpld(CR0, fp, temp);
3068 bgt(CR0, slow_path);
3069 }
3070 } else { // Normal safepoint poll. Not at return.
3071 assert(!in_nmethod, "should use load_from_polling_page");
3072 andi_(temp, temp, SafepointMechanism::poll_bit());
3073 bne(CR0, slow_path);
3074 }
3075 }
3076
3077 void MacroAssembler::jump_to_polling_page_return_handler_blob(int safepoint_offset, bool fixed_size) {
3078 assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
3079 "polling page return stub not created yet");
3080 address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
3081
3082 // Determine saved exception pc using pc relative address computation.
3083 {
3084 Label next_pc;
3085 bl(next_pc);
3086 bind(next_pc);
3087 }
3088 int current_offset = offset();
3089
3090 if (fixed_size) {
3091 // Code size must not depend on offsets.
3092 load_const32(R12, safepoint_offset - current_offset);
3093 mflr(R0);
3094 add(R12, R12, R0);
3095 } else {
3096 mflr(R12);
3097 add_const_optimized(R12, R12, safepoint_offset - current_offset);
3098 }
3099 std(R12, in_bytes(JavaThread::saved_exception_pc_offset()), R16_thread);
3100
3101 add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
3102 mtctr(R0);
3103 bctr();
3104 }
3105
3106 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
3107 MacroAssembler::PreservationLevel preservation_level) {
3108 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3109 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
3110 }
3111
3112 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
3113 MacroAssembler::PreservationLevel preservation_level) {
3114 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3115 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
3116 }
3117
3118 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3119 // in frame_ppc.hpp.
3120 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
3121 // Always set last_Java_pc and flags first because once last_Java_sp
3122 // is visible has_last_Java_frame is true and users will look at the
3123 // rest of the fields. (Note: flags should always be zero before we
3124 // get here so doesn't need to be set.)
3125
3126 // Verify that last_Java_pc was zeroed on return to Java
3127 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3128 "last_Java_pc not zeroed before leaving Java");
3129
3130 // When returning from calling out from Java mode the frame anchor's
3131 // last_Java_pc will always be set to null. It is set here so that
3132 // if we are doing a call to native (not VM) that we capture the
3133 // known pc and don't have to rely on the native call having a
3134 // standard frame linkage where we can find the pc.
3135 if (last_Java_pc != noreg)
3136 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3137
3138 // Set last_Java_sp last.
3139 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3140 }
3141
3142 void MacroAssembler::reset_last_Java_frame(bool check_last_java_sp) {
3143 if (check_last_java_sp) {
3144 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3145 R16_thread, "SP was not set, still zero");
3146 }
3147
3148 BLOCK_COMMENT("reset_last_Java_frame {");
3149 li(R0, 0);
3150
3151 // _last_Java_sp = 0
3152 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3153
3154 // _last_Java_pc = 0
3155 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3156 BLOCK_COMMENT("} reset_last_Java_frame");
3157 }
3158
3159 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1, Label* jpc) {
3160 assert_different_registers(sp, tmp1);
3161
3162 if (jpc == nullptr || jpc->is_bound()) {
3163 load_const_optimized(tmp1, jpc == nullptr ? pc() : target(*jpc));
3164 } else {
3165 load_const(tmp1, *jpc, R12_scratch2);
3166 }
3167
3168 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3169 }
3170
3171 void MacroAssembler::get_vm_result_oop(Register oop_result) {
3172 // Read:
3173 // R16_thread
3174 // R16_thread->in_bytes(JavaThread::vm_result_oop_offset())
3175 //
3176 // Updated:
3177 // oop_result
3178 // R16_thread->in_bytes(JavaThread::vm_result_oop_offset())
3179
3180 ld(oop_result, in_bytes(JavaThread::vm_result_oop_offset()), R16_thread);
3181 li(R0, 0);
3182 std(R0, in_bytes(JavaThread::vm_result_oop_offset()), R16_thread);
3183
3184 verify_oop(oop_result, FILE_AND_LINE);
3185 }
3186
3187 void MacroAssembler::get_vm_result_metadata(Register metadata_result) {
3188 // Read:
3189 // R16_thread
3190 // R16_thread->in_bytes(JavaThread::vm_result_metadata_offset())
3191 //
3192 // Updated:
3193 // metadata_result
3194 // R16_thread->in_bytes(JavaThread::vm_result_metadata_offset())
3195
3196 ld(metadata_result, in_bytes(JavaThread::vm_result_metadata_offset()), R16_thread);
3197 li(R0, 0);
3198 std(R0, in_bytes(JavaThread::vm_result_metadata_offset()), R16_thread);
3199 }
3200
3201 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3202 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3203 if (CompressedKlassPointers::base() != nullptr) {
3204 // Use dst as temp if it is free.
3205 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3206 current = dst;
3207 }
3208 if (CompressedKlassPointers::shift() != 0) {
3209 srdi(dst, current, CompressedKlassPointers::shift());
3210 current = dst;
3211 }
3212 return current;
3213 }
3214
// Encodes 'klass' and stores the 32-bit result into the klass field of dst_oop.
// Must not be used with compact object headers.
//   ck - register handed to encode_klass_not_null as destination for the encoded klass.
void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
  assert(!UseCompactObjectHeaders, "not with compact headers");
  // The encoder returns the register that actually holds the encoded value.
  Register compressedKlass = encode_klass_not_null(ck, klass);
  stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
}
3220
3221 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3222 assert(!UseCompactObjectHeaders, "not with compact headers");
3223 if (val == noreg) {
3224 val = R0;
3225 li(val, 0);
3226 }
3227 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop);
3228 }
3229
3230 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3231 assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3232 if (src == noreg) src = dst;
3233 Register shifted_src = src;
3234 if (CompressedKlassPointers::shift() != 0 ||
3235 (CompressedKlassPointers::base() == nullptr && src != dst)) { // Move required.
3236 shifted_src = dst;
3237 sldi(shifted_src, src, CompressedKlassPointers::shift());
3238 }
3239 if (CompressedKlassPointers::base() != nullptr) {
3240 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3241 }
3242 }
3243
3244 void MacroAssembler::load_klass_no_decode(Register dst, Register src) {
3245 if (UseCompactObjectHeaders) {
3246 load_narrow_klass_compact(dst, src);
3247 } else {
3248 lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3249 }
3250 }
3251
// Loads the Klass pointer of the object in src into dst:
// reads the encoded klass and decodes it in place.
// dst must not be R0 (asserted by decode_klass_not_null).
void MacroAssembler::load_klass(Register dst, Register src) {
  load_klass_no_decode(dst, src);
  decode_klass_not_null(dst);
}
3256
// Loads the obj's narrow klass into dst (compact object headers only).
// Only dst is written; src is preserved unless it is the same register as dst.
// Input:
// src - the oop we want to load the klass from.
// dst - output nklass.
void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
  assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
  // Read the mark word and shift the klass bits down.
  ld(dst, oopDesc::mark_offset_in_bytes(), src);
  srdi(dst, dst, markWord::klass_shift);
}
3267
3268 void MacroAssembler::cmp_klass(ConditionRegister dst, Register obj, Register klass, Register tmp, Register tmp2) {
3269 assert_different_registers(obj, klass, tmp);
3270 if (UseCompactObjectHeaders) {
3271 load_narrow_klass_compact(tmp, obj);
3272 } else {
3273 lwz(tmp, oopDesc::klass_offset_in_bytes(), obj);
3274 }
3275 Register encoded_klass = encode_klass_not_null(tmp2, klass);
3276 cmpw(dst, tmp, encoded_klass);
3277 }
3278
3279 void MacroAssembler::cmp_klasses_from_objects(ConditionRegister dst, Register obj1, Register obj2, Register tmp1, Register tmp2) {
3280 if (UseCompactObjectHeaders) {
3281 load_narrow_klass_compact(tmp1, obj1);
3282 load_narrow_klass_compact(tmp2, obj2);
3283 cmpw(dst, tmp1, tmp2);
3284 } else {
3285 lwz(tmp1, oopDesc::klass_offset_in_bytes(), obj1);
3286 lwz(tmp2, oopDesc::klass_offset_in_bytes(), obj2);
3287 cmpw(dst, tmp1, tmp2);
3288 }
3289 }
3290
// Null-checks src (passing is_null on to null_check) and then loads its Klass pointer into dst.
void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
  null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
  load_klass(dst, src);
}
3295
// ((OopHandle)result).resolve();
// 'result' holds the handle on entry and the loaded oop on exit.
// The load is performed as a T_OBJECT access with the IN_NATIVE decorator.
void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
                                        MacroAssembler::PreservationLevel preservation_level) {
  access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
}
3301
3302 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
3303 MacroAssembler::PreservationLevel preservation_level) {
3304 Label resolved;
3305
3306 // A null weak handle resolves to null.
3307 cmpdi(CR0, result, 0);
3308 beq(CR0, resolved);
3309
3310 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3311 preservation_level);
3312 bind(resolved);
3313 }
3314
// Loads the holder of 'method' into 'holder' by following
// Method -> ConstMethod -> ConstantPool -> pool holder.
// 'holder' is also used for the intermediate pointers.
void MacroAssembler::load_method_holder(Register holder, Register method) {
  ld(holder, in_bytes(Method::const_offset()), method);
  ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
  ld(holder, ConstantPool::pool_holder_offset(), holder);
}
3320
// Branches to is_inline_type if the bits of 'markword' selected by
// markWord::inline_type_pattern_mask equal markWord::inline_type_pattern.
// Kills R0 and CR0; markword must not be R0.
void MacroAssembler::test_markword_is_inline_type(Register markword, Label& is_inline_type) {
  assert_different_registers(markword, R0);
  andi(R0, markword, markWord::inline_type_pattern_mask);
  cmpwi(CR0, R0, markWord::inline_type_pattern);
  beq(CR0, is_inline_type);
}
3327
// Branches to not_inline_type if 'object' does not carry the inline type
// pattern in its mark word. With can_be_null, a null object also branches there.
// Kills R0 and CR0.
void MacroAssembler::test_oop_is_not_inline_type(Register object, Label& not_inline_type, bool can_be_null) {
  if (can_be_null) {
    // Null is treated as "not an inline type".
    cmpdi(CR0, object, 0);
    beq(CR0, not_inline_type);
  }
  ld(R0, oopDesc::mark_offset_in_bytes(), object);
  andi(R0, R0, markWord::inline_type_pattern_mask);
  cmpwi(CR0, R0, markWord::inline_type_pattern);
  bne(CR0, not_inline_type);
}
3338
// Branches to is_null_free_inline_type if the bit at position
// ResolvedFieldEntry::is_null_free_inline_type_shift is set in 'flags'.
// Uses R0 and CR0 for the bit test.
void MacroAssembler::test_field_is_null_free_inline_type(Register flags, Label& is_null_free_inline_type) {
  testbitdi(CR0, R0, flags, ResolvedFieldEntry::is_null_free_inline_type_shift);
  bne(CR0, is_null_free_inline_type);
}
3343
// Branches to not_null_free_inline_type if the bit at position
// ResolvedFieldEntry::is_null_free_inline_type_shift is clear in 'flags'.
// Uses R0 and CR0 for the bit test.
void MacroAssembler::test_field_is_not_null_free_inline_type(Register flags, Label& not_null_free_inline_type) {
  testbitdi(CR0, R0, flags, ResolvedFieldEntry::is_null_free_inline_type_shift);
  beq(CR0, not_null_free_inline_type);
}
3348
// Branches to is_flat if the bit at position ResolvedFieldEntry::is_flat_shift
// is set in 'flags'. Uses R0 and CR0 for the bit test.
void MacroAssembler::test_field_is_flat(Register flags, Label& is_flat) {
  testbitdi(CR0, R0, flags, ResolvedFieldEntry::is_flat_shift);
  bne(CR0, is_flat);
}
3353
3354 void MacroAssembler::test_oop_prototype_bit(Register oop, Register temp_reg, int32_t test_bit, bool jmp_set,
3355 Label& jmp_label, bool maybe_far) {
3356 Label test_mark_word;
3357 // load mark word
3358 ld(temp_reg, oopDesc::mark_offset_in_bytes(), oop);
3359 // if unlocked bit is set we can directly use the mark word
3360 andi_(R0, temp_reg, markWord::unlocked_value);
3361 bne(CR0, test_mark_word);
3362 // slow path use klass prototype
3363 load_prototype_header(temp_reg, oop);
3364
3365 bind(test_mark_word);
3366 andi_(R0, temp_reg, test_bit);
3367 if (maybe_far) {
3368 bc_far_optimized(jmp_set ? Assembler::bcondCRbiIs0 : Assembler::bcondCRbiIs1,
3369 bi0(CR0, Assembler::equal), jmp_label);
3370 } else {
3371 if (jmp_set) {
3372 bne(CR0, jmp_label);
3373 } else {
3374 beq(CR0, jmp_label);
3375 }
3376 }
3377 }
3378
// Branches to is_flat_array if markWord::flat_array_bit_in_place is set in the
// header tested by test_oop_prototype_bit. temp_reg is killed.
void MacroAssembler::test_flat_array_oop(Register oop, Register temp_reg, Label& is_flat_array, bool maybe_far) {
  test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, true, is_flat_array, maybe_far);
}
3382
// Branches to is_non_flat_array if markWord::flat_array_bit_in_place is clear in the
// header tested by test_oop_prototype_bit. temp_reg is killed.
void MacroAssembler::test_non_flat_array_oop(Register oop, Register temp_reg, Label& is_non_flat_array) {
  test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, false, is_non_flat_array);
}
3386
// Branches to is_null_free_array if markWord::null_free_array_bit_in_place is set in the
// header tested by test_oop_prototype_bit. temp_reg is killed.
void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array, bool maybe_far) {
  test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, true, is_null_free_array, maybe_far);
}
3390
// Branches to is_non_null_free_array if markWord::null_free_array_bit_in_place is clear in the
// header tested by test_oop_prototype_bit. temp_reg is killed.
void MacroAssembler::test_non_null_free_array_oop(Register oop, Register temp_reg, Label& is_non_null_free_array) {
  test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, false, is_non_null_free_array);
}
3394
// Branches to is_flat_array if the Klass::_lh_array_tag_flat_value_bit_inplace bit
// is set in 'lh'. Uses R0 and CR0 for the bit test.
void MacroAssembler::test_flat_array_layout(Register lh, Label& is_flat_array) {
  testbitdi(CR0, R0, lh, exact_log2(Klass::_lh_array_tag_flat_value_bit_inplace));
  bne(CR0, is_flat_array);
}
3399
3400 void MacroAssembler::load_metadata(Register dst, Register src) {
3401 if (UseCompactObjectHeaders) {
3402 load_narrow_klass_compact(dst, src);
3403 } else {
3404 lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3405 }
3406 }
3407
// Loads the prototype header of the klass of the object in src into dst.
// dst temporarily holds the Klass pointer.
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ld(dst, Klass::prototype_header_offset(), dst);
}
3412
3413 void MacroAssembler::flat_field_copy(DecoratorSet decorators, Register src, Register dst, Register inline_layout_info) {
3414 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3415 bs->flat_field_copy(this, decorators, src, dst, inline_layout_info);
3416 }
3417
// Loads the payload offset of 'inline_klass' into 'offset'.
// The value is a 32-bit field reached through the pointer stored at
// InlineKlass::adr_members_offset(); 'offset' temporarily holds that pointer.
void MacroAssembler::payload_offset(Register inline_klass, Register offset) {
  ld(offset, in_bytes(InlineKlass::adr_members_offset()), inline_klass);
  lwz(offset, in_bytes(InlineKlass::payload_offset_offset()), offset);
}
3422
// Computes data = oop + payload offset of inline_klass.
// t1 is killed; it receives the payload offset.
void MacroAssembler::payload_address(Register oop, Register data, Register inline_klass, Register t1) {
  // ((address) (void*) o) + vk->payload_offset();
  payload_offset(inline_klass, t1);
  add(data, oop, t1);
}
3428
3429 void MacroAssembler::inline_layout_info(Register holder_klass, Register index, Register layout_info) {
3430 assert_different_registers(holder_klass, index, layout_info);
3431 InlineLayoutInfo array[2];
3432 int size = (char*)&array[1] - (char*)&array[0]; // computing size of array elements
3433 if (is_power_of_2(size)) {
3434 sldi(index, index, log2i_exact(size)); // Scale index by power of 2
3435 } else {
3436 mulld(index, index, size); // Scale the index to be the entry index * array_element_size
3437 }
3438 ld(layout_info, InstanceKlass::inline_layout_info_array_offset(), holder_klass);
3439 addi(layout_info, layout_info, Array<InlineLayoutInfo>::base_offset_in_bytes());
3440 add(layout_info, layout_info, index);
3441 }
3442
3443
3444 // Clear Array
3445 // For very short arrays. tmp == R0 is allowed.
3446 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3447 if (cnt_dwords > 0) { li(tmp, 0); }
3448 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3449 }
3450
3451 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3452 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3453 if (cnt_dwords < 8) {
3454 clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3455 return;
3456 }
3457
3458 Label loop;
3459 const long loopcnt = cnt_dwords >> 1,
3460 remainder = cnt_dwords & 1;
3461
3462 li(tmp, loopcnt);
3463 mtctr(tmp);
3464 li(tmp, 0);
3465 bind(loop);
3466 std(tmp, 0, base_ptr);
3467 std(tmp, 8, base_ptr);
3468 addi(base_ptr, base_ptr, 16);
3469 bdnz(loop);
3470 if (remainder) { std(tmp, 0, base_ptr); }
3471 }
3472
// Clears cnt doublewords starting at base_ptr, using dcbz for whole cache lines.
// Kills both input registers. tmp == R0 is allowed.
//   base_ptr   - start address of the area to clear.
//   cnt_dwords - number of doublewords to clear if const_cnt < 0; otherwise only used as
//                a scratch register for large constant counts.
//   tmp        - temp register.
//   const_cnt  - number of doublewords if known at code generation time (>= 0), negative otherwise.
// Kills CR0 and CTR; CR1 is additionally killed when the count is passed in a register.
void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
  // Procedure for large arrays (uses data cache block zero instruction).
  Label startloop, fast, fastloop, small_rest, restloop, done;
  // The comments below mention 128 bytes; the code itself uses the cache line
  // size reported by VM_Version::L1_data_cache_line_size().
  const int cl_size         = VM_Version::L1_data_cache_line_size(),
            cl_dwords       = cl_size >> 3,
            cl_dw_addr_bits = exact_log2(cl_dwords),
            dcbz_min        = 1,  // Min count of dcbz executions, needs to be >0.
            min_cnt         = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;

  if (const_cnt >= 0) {
    // Constant case.
    if (const_cnt < min_cnt) {
      // Too small for dcbz: use the constant-length variant.
      clear_memory_constlen(base_ptr, const_cnt, tmp);
      return;
    }
    load_const_optimized(cnt_dwords, const_cnt, tmp);
  } else {
    // cnt_dwords already loaded in register. Need to check size.
    cmpdi(CR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
    blt(CR1, small_rest);
  }
  rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
  beq(CR0, fast);                                   // Already 128byte aligned.

  subfic(tmp, tmp, cl_dwords);
  mtctr(tmp);                        // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
  subf(cnt_dwords, tmp, cnt_dwords); // rest.
  li(tmp, 0);

  bind(startloop);                   // Clear at the beginning to reach 128byte boundary.
  std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
  addi(base_ptr, base_ptr, 8);
  bdnz(startloop);

  bind(fast);                                  // Clear 128byte blocks.
  srdi(tmp, cnt_dwords, cl_dw_addr_bits);      // Loop count for 128byte loop (>0).
  andi(cnt_dwords, cnt_dwords, cl_dwords-1);   // Rest in dwords.
  mtctr(tmp);                                  // Load counter.

  bind(fastloop);
  dcbz(base_ptr);                    // Clear 128byte aligned block.
  addi(base_ptr, base_ptr, cl_size);
  bdnz(fastloop);

  // Also the entry point for counts in a register that are too small for dcbz.
  bind(small_rest);
  cmpdi(CR0, cnt_dwords, 0);         // size 0?
  beq(CR0, done);                    // rest == 0
  li(tmp, 0);
  mtctr(cnt_dwords);                 // Load counter.

  bind(restloop);                    // Clear rest.
  std(tmp, 0, base_ptr);             // Clear 8byte aligned block.
  addi(base_ptr, base_ptr, 8);
  bdnz(restloop);

  bind(done);
}
3531
3532 // base: Address of a buffer to be filled, 8 bytes aligned. Killed.
3533 // cnt: Count in 8-byte unit.
3534 // value: Value to be filled with.
3535 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
3536 Label loop, loop_end, done;
3537
3538 // 2x unrolled loop
3539 srdi_(R0, cnt, 1);
3540 beq(CR0, loop_end); // less than 2 elements
3541 mtctr(R0);
3542
3543 bind(loop);
3544 std(value, 0, base);
3545 std(value, 8, base);
3546 addi(base, base, 16);
3547 bdnz(loop);
3548
3549 bind(loop_end);
3550 andi_(R0, cnt, 1);
3551 beq(CR0, done);
3552 std(value, 0, base); // last element
3553
3554 bind(done);
3555 }
3556
3557 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3558
3559 // Helpers for Intrinsic Emitters
3560 //
3561 // Revert the byte order of a 32bit value in a register
3562 // src: 0x44556677
3563 // dst: 0x77665544
3564 // Three steps to obtain the result:
3565 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3566 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3567 // This value initializes dst.
3568 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3569 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3570 // This value is mask inserted into dst with a [0..23] mask of 1s.
3571 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3572 // This value is mask inserted into dst with a [8..15] mask of 1s.
3573 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3574 assert_different_registers(dst, src);
3575
3576 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3577 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3578 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
3579 }
3580
3581 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3582 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3583 // body size from 20 to 16 instructions.
3584 // Returns the offset that was used to calculate the address of column tc3.
3585 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3586 // at hand, the original table address can be easily reconstructed.
3587 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3588
3589 // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3590 // Layout: See StubRoutines::ppc::generate_crc_constants.
3591 #ifdef VM_LITTLE_ENDIAN
3592 const int ix0 = 3 * CRC32_TABLE_SIZE;
3593 const int ix1 = 2 * CRC32_TABLE_SIZE;
3594 const int ix2 = 1 * CRC32_TABLE_SIZE;
3595 const int ix3 = 0 * CRC32_TABLE_SIZE;
3596 #else
3597 const int ix0 = 1 * CRC32_TABLE_SIZE;
3598 const int ix1 = 2 * CRC32_TABLE_SIZE;
3599 const int ix2 = 3 * CRC32_TABLE_SIZE;
3600 const int ix3 = 4 * CRC32_TABLE_SIZE;
3601 #endif
3602 assert_different_registers(table, tc0, tc1, tc2);
3603 assert(table == tc3, "must be!");
3604
3605 addi(tc0, table, ix0);
3606 addi(tc1, table, ix1);
3607 addi(tc2, table, ix2);
3608 if (ix3 != 0) addi(tc3, table, ix3);
3609
3610 return ix3;
3611 }
3612
3613 /**
3614 * uint32_t crc;
3615 * table[crc & 0xFF] ^ (crc >> 8);
3616 */
3617 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3618 assert_different_registers(crc, table, tmp);
3619 assert_different_registers(val, table);
3620
3621 if (crc == val) { // Must rotate first to use the unmodified value.
3622 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3623 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3624 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3625 } else {
3626 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3627 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3628 }
3629 lwzx(tmp, table, tmp);
3630 xorr(crc, crc, tmp);
3631 }
3632
3633 /**
3634 * Emits code to update CRC-32 with a byte value according to constants in table.
3635 *
3636 * @param [in,out]crc Register containing the crc.
3637 * @param [in]val Register containing the byte to fold into the CRC.
3638 * @param [in]table Register containing the table of crc constants.
3639 *
3640 * uint32_t crc;
3641 * val = crc_table[(val ^ crc) & 0xFF];
3642 * crc = val ^ (crc >> 8);
3643 */
3644 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3645 BLOCK_COMMENT("update_byte_crc32:");
3646 xorr(val, val, crc);
3647 fold_byte_crc32(crc, val, table, val);
3648 }
3649
3650 /**
3651 * @param crc register containing existing CRC (32-bit)
3652 * @param buf register pointing to input byte buffer (byte*)
3653 * @param len register containing number of bytes
3654 * @param table register pointing to CRC table
3655 */
3656 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3657 Register data, bool loopAlignment) {
3658 assert_different_registers(crc, buf, len, table, data);
3659
3660 Label L_mainLoop, L_done;
3661 const int mainLoop_stepping = 1;
3662 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3663
3664 // Process all bytes in a single-byte loop.
3665 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
3666 beq(CR0, L_done);
3667
3668 mtctr(len);
3669 align(mainLoop_alignment);
3670 BIND(L_mainLoop);
3671 lbz(data, 0, buf); // Byte from buffer, zero-extended.
3672 addi(buf, buf, mainLoop_stepping); // Advance buffer position.
3673 update_byte_crc32(crc, data, table);
3674 bdnz(L_mainLoop); // Iterate.
3675
3676 bind(L_done);
3677 }
3678
3679 /**
3680 * Emits code to update CRC-32 with a 4-byte value according to constants in table
3681 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3682 */
3683 // A note on the lookup table address(es):
3684 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3685 // To save the effort of adding the column offset to the table address each time
3686 // a table element is looked up, it is possible to pass the pre-calculated
3687 // column addresses.
3688 // Uses R9..R12 as work register. Must be saved/restored by caller, if necessary.
3689 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3690 Register t0, Register t1, Register t2, Register t3,
3691 Register tc0, Register tc1, Register tc2, Register tc3) {
3692 assert_different_registers(crc, t3);
3693
3694 // XOR crc with next four bytes of buffer.
3695 lwz(t3, bufDisp, buf);
3696 if (bufInc != 0) {
3697 addi(buf, buf, bufInc);
3698 }
3699 xorr(t3, t3, crc);
3700
3701 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
3702 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t1 >> 0) & 0xff) << 2
3703 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t1 >> 8) & 0xff) << 2
3704 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t1 >> 16) & 0xff) << 2
3705 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t1 >> 24) & 0xff) << 2
3706
3707 // Use the pre-calculated column addresses.
3708 // Load pre-calculated table values.
3709 lwzx(t0, tc0, t0);
3710 lwzx(t1, tc1, t1);
3711 lwzx(t2, tc2, t2);
3712 lwzx(t3, tc3, t3);
3713
3714 // Calculate new crc from table values.
3715 xorr(t0, t0, t1);
3716 xorr(t2, t2, t3);
3717 xorr(crc, t0, t2); // Now crc contains the final checksum value.
3718 }
3719
3720
3721 /**
3722 * @param crc register containing existing CRC (32-bit)
3723 * @param buf register pointing to input byte buffer (byte*)
3724 * @param len register containing number of bytes
3725 * @param constants register pointing to precomputed constants
3726 * @param t0-t6 temp registers
3727 */
3728 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3729 Register t0, Register t1, Register t2, Register t3,
3730 Register t4, Register t5, Register t6, bool invertCRC) {
3731 assert_different_registers(crc, buf, len, constants);
3732
3733 Label L_tail;
3734
3735 BLOCK_COMMENT("kernel_crc32_vpmsum {");
3736
3737 if (invertCRC) {
3738 nand(crc, crc, crc); // 1s complement of crc
3739 }
3740
3741 // Enforce 32 bit.
3742 clrldi(len, len, 32);
3743
3744 // Align if we have enough bytes for the fast version.
3745 const int alignment = 16,
3746 threshold = 32;
3747 Register prealign = t0;
3748
3749 neg(prealign, buf);
3750 addi(t1, len, -threshold);
3751 andi(prealign, prealign, alignment - 1);
3752 cmpw(CR0, t1, prealign);
3753 blt(CR0, L_tail); // len - prealign < threshold?
3754
3755 subf(len, prealign, len);
3756 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3757
3758 // Calculate from first aligned address as far as possible.
3759 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3760 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3761 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3762
3763 // Remaining bytes.
3764 BIND(L_tail);
3765 update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3766
3767 if (invertCRC) {
3768 nand(crc, crc, crc); // 1s complement of crc
3769 }
3770
3771 BLOCK_COMMENT("} kernel_crc32_vpmsum");
3772 }
3773
3774 /**
3775 * @param crc register containing existing CRC (32-bit)
3776 * @param buf register pointing to input byte buffer (byte*)
3777 * @param len register containing number of bytes (will get updated to remaining bytes)
3778 * @param constants register pointing to CRC table for 128-bit aligned memory
3779 * @param t0-t6 temp registers
3780 */
3781 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3782 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3783
3784 // Save non-volatile vector registers (frameless).
3785 Register offset = t1;
3786 int offsetInt = 0;
3787 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3788 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3789 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3790 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3791 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3792 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3793 #ifndef VM_LITTLE_ENDIAN
3794 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3795 #endif
3796 offsetInt -= 8; std(R14, offsetInt, R1_SP);
3797 offsetInt -= 8; std(R15, offsetInt, R1_SP);
3798
3799 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3800 // bytes per iteration. The basic scheme is:
3801 // lvx: load vector (Big Endian needs reversal)
3802 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3803 // vxor: xor partial results together to get unroll_factor2 vectors
3804
3805 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3806
3807 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3808 const int unroll_factor = CRC32_UNROLL_FACTOR,
3809 unroll_factor2 = CRC32_UNROLL_FACTOR2;
3810
3811 const int outer_consts_size = (unroll_factor2 - 1) * 16,
3812 inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3813
3814 // Support registers.
3815 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3816 Register num_bytes = R14,
3817 loop_count = R15,
3818 cur_const = crc; // will live in VCRC
3819 // Constant array for outer loop: unroll_factor2 - 1 registers,
3820 // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3821 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3822 consts1[] = { VR23, VR24 };
3823 // Data register arrays: 2 arrays with unroll_factor2 registers.
3824 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3825 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3826
3827 VectorRegister VCRC = data0[0];
3828 VectorRegister Vc = VR25;
3829 VectorRegister swap_bytes = VR26; // Only for Big Endian.
3830
3831 // We have at least 1 iteration (ensured by caller).
3832 Label L_outer_loop, L_inner_loop, L_last;
3833
3834 // Set DSCR pre-fetch to deepest.
3835 if (VM_Version::has_mfdscr()) {
3836 load_const_optimized(t0, VM_Version::_dscr_val | 7);
3837 mtdscr(t0);
3838 }
3839
3840 mtvrwz(VCRC, crc); // crc lives in VCRC, now
3841
3842 for (int i = 1; i < unroll_factor2; ++i) {
3843 li(offs[i], 16 * i);
3844 }
3845
3846 // Load consts for outer loop
3847 lvx(consts0[0], constants);
3848 for (int i = 1; i < unroll_factor2 - 1; ++i) {
3849 lvx(consts0[i], offs[i], constants);
3850 }
3851
3852 load_const_optimized(num_bytes, 16 * unroll_factor);
3853
3854 // Reuse data registers outside of the loop.
3855 VectorRegister Vtmp = data1[0];
3856 VectorRegister Vtmp2 = data1[1];
3857 VectorRegister zeroes = data1[2];
3858
3859 vspltisb(Vtmp, 0);
3860 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3861
3862 // Load vector for vpermxor (to xor both 64 bit parts together)
3863 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f
3864 vspltisb(Vc, 4);
3865 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3866 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3867 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3868
3869 #ifdef VM_LITTLE_ENDIAN
3870 #define BE_swap_bytes(x)
3871 #else
3872 vspltisb(Vtmp2, 0xf);
3873 vxor(swap_bytes, Vtmp, Vtmp2);
3874 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3875 #endif
3876
3877 cmpd(CR0, len, num_bytes);
3878 blt(CR0, L_last);
3879
3880 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3881 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3882
3883 // ********** Main loop start **********
3884 align(32);
3885 bind(L_outer_loop);
3886
3887 // Begin of unrolled first iteration (no xor).
3888 lvx(data1[0], buf);
3889 for (int i = 1; i < unroll_factor2 / 2; ++i) {
3890 lvx(data1[i], offs[i], buf);
3891 }
3892 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3893 lvx(consts1[0], cur_const);
3894 mtctr(loop_count);
3895 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3896 BE_swap_bytes(data1[i]);
3897 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3898 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3899 vpmsumw(data0[i], data1[i], consts1[0]);
3900 }
3901 addi(buf, buf, 16 * unroll_factor2);
3902 subf(len, num_bytes, len);
3903 lvx(consts1[1], offs[1], cur_const);
3904 addi(cur_const, cur_const, 32);
3905 // Begin of unrolled second iteration (head).
3906 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3907 BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3908 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3909 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3910 }
3911 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3912 BE_swap_bytes(data1[i]);
3913 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3914 vpmsumw(data1[i], data1[i], consts1[1]);
3915 }
3916 addi(buf, buf, 16 * unroll_factor2);
3917
3918 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
3919 // Double-iteration allows using the 2 constant registers alternatingly.
3920 align(32);
3921 bind(L_inner_loop);
3922 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3923 if (j & 1) {
3924 lvx(consts1[0], cur_const);
3925 } else {
3926 lvx(consts1[1], offs[1], cur_const);
3927 addi(cur_const, cur_const, 32);
3928 }
3929 for (int i = 0; i < unroll_factor2; ++i) {
3930 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3931 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3932 BE_swap_bytes(data1[idx]);
3933 vxor(data0[i], data0[i], data1[i]);
3934 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3935 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3936 }
3937 addi(buf, buf, 16 * unroll_factor2);
3938 }
3939 bdnz(L_inner_loop);
3940
3941 addi(cur_const, constants, outer_consts_size); // Reset
3942
3943 // Tail of last iteration (no loads).
3944 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3945 BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3946 vxor(data0[i], data0[i], data1[i]);
3947 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3948 }
3949 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3950 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3951 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3952 }
3953
3954 // Last data register is ok, other ones need fixup shift.
3955 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3956 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3957 }
3958
3959 // Combine to 128 bit result vector VCRC = data0[0].
3960 for (int i = 1; i < unroll_factor2; i<<=1) {
3961 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3962 vxor(data0[j], data0[j], data0[j+i]);
3963 }
3964 }
3965 cmpd(CR0, len, num_bytes);
3966 bge(CR0, L_outer_loop);
3967
3968 // Last chance with lower num_bytes.
3969 bind(L_last);
3970 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3971 // Point behind last const for inner loop.
3972 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3973 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3974 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3975 subf(cur_const, R0, cur_const); // Point to constant to be used first.
3976
3977 addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3978 bgt(CR0, L_outer_loop);
3979 // ********** Main loop end **********
3980
3981 // Restore DSCR pre-fetch value.
3982 if (VM_Version::has_mfdscr()) {
3983 load_const_optimized(t0, VM_Version::_dscr_val);
3984 mtdscr(t0);
3985 }
3986
3987 // ********** Simple loop for remaining 16 byte blocks **********
3988 {
3989 Label L_loop, L_done;
3990
3991 srdi_(t0, len, 4); // 16 bytes per iteration
3992 clrldi(len, len, 64-4);
3993 beq(CR0, L_done);
3994
3995 // Point to const (same as last const for inner loop).
3996 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3997 mtctr(t0);
3998 lvx(Vtmp2, cur_const);
3999
4000 align(32);
4001 bind(L_loop);
4002
4003 lvx(Vtmp, buf);
4004 addi(buf, buf, 16);
4005 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4006 BE_swap_bytes(Vtmp);
4007 vxor(VCRC, VCRC, Vtmp);
4008 vpmsumw(VCRC, VCRC, Vtmp2);
4009 bdnz(L_loop);
4010
4011 bind(L_done);
4012 }
4013 // ********** Simple loop end **********
4014 #undef BE_swap_bytes
4015
4016 // Point to Barrett constants
4017 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
4018
4019 vspltisb(zeroes, 0);
4020
4021 // Combine to 64 bit result.
4022 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
4023
4024 // Reduce to 32 bit CRC: Remainder by multiply-high.
4025 lvx(Vtmp, cur_const);
4026 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit.
4027 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly.
4028 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
4029 vsldoi(Vtmp, zeroes, Vtmp, 8);
4030 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly.
4031 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit.
4032
4033 // Move result. len is already updated.
4034 vsldoi(VCRC, VCRC, zeroes, 8);
4035 mfvrd(crc, VCRC);
4036
4037 // Restore non-volatile Vector registers (frameless).
4038 offsetInt = 0;
4039 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
4040 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
4041 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
4042 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
4043 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
4044 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
4045 #ifndef VM_LITTLE_ENDIAN
4046 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
4047 #endif
4048 offsetInt -= 8; ld(R14, offsetInt, R1_SP);
4049 offsetInt -= 8; ld(R15, offsetInt, R1_SP);
4050 }
4051
4052 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
4053 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
4054 load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
4055 : StubRoutines::crc_table_addr() , R0);
4056
4057 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
4058 }
4059
4060 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
4061 assert_different_registers(crc, val, table);
4062
4063 BLOCK_COMMENT("kernel_crc32_singleByteReg:");
4064 if (invertCRC) {
4065 nand(crc, crc, crc); // 1s complement of crc
4066 }
4067
4068 update_byte_crc32(crc, val, table);
4069
4070 if (invertCRC) {
4071 nand(crc, crc, crc); // 1s complement of crc
4072 }
4073 }
4074
4075 // dest_lo += src1 + src2
4076 // dest_hi += carry1 + carry2
4077 void MacroAssembler::add2_with_carry(Register dest_hi,
4078 Register dest_lo,
4079 Register src1, Register src2) {
4080 li(R0, 0);
4081 addc(dest_lo, dest_lo, src1);
4082 adde(dest_hi, dest_hi, R0);
4083 addc(dest_lo, dest_lo, src2);
4084 adde(dest_hi, dest_hi, R0);
4085 }
4086
4087 // Multiply 64 bit by 64 bit first loop.
4088 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
4089 Register x_xstart,
4090 Register y, Register y_idx,
4091 Register z,
4092 Register carry,
4093 Register product_high, Register product,
4094 Register idx, Register kdx,
4095 Register tmp) {
4096 // jlong carry, x[], y[], z[];
4097 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4098 // huge_128 product = y[idx] * x[xstart] + carry;
4099 // z[kdx] = (jlong)product;
4100 // carry = (jlong)(product >>> 64);
4101 // }
4102 // z[xstart] = carry;
4103
4104 Label L_first_loop, L_first_loop_exit;
4105 Label L_one_x, L_one_y, L_multiply;
4106
4107 addic_(xstart, xstart, -1);
4108 blt(CR0, L_one_x); // Special case: length of x is 1.
4109
4110 // Load next two integers of x.
4111 sldi(tmp, xstart, LogBytesPerInt);
4112 ldx(x_xstart, x, tmp);
4113 #ifdef VM_LITTLE_ENDIAN
4114 rldicl(x_xstart, x_xstart, 32, 0);
4115 #endif
4116
4117 align(32, 16);
4118 bind(L_first_loop);
4119
4120 cmpdi(CR0, idx, 1);
4121 blt(CR0, L_first_loop_exit);
4122 addi(idx, idx, -2);
4123 beq(CR0, L_one_y);
4124
4125 // Load next two integers of y.
4126 sldi(tmp, idx, LogBytesPerInt);
4127 ldx(y_idx, y, tmp);
4128 #ifdef VM_LITTLE_ENDIAN
4129 rldicl(y_idx, y_idx, 32, 0);
4130 #endif
4131
4132
4133 bind(L_multiply);
4134 multiply64(product_high, product, x_xstart, y_idx);
4135
4136 li(tmp, 0);
4137 addc(product, product, carry); // Add carry to result.
4138 adde(product_high, product_high, tmp); // Add carry of the last addition.
4139 addi(kdx, kdx, -2);
4140
4141 // Store result.
4142 #ifdef VM_LITTLE_ENDIAN
4143 rldicl(product, product, 32, 0);
4144 #endif
4145 sldi(tmp, kdx, LogBytesPerInt);
4146 stdx(product, z, tmp);
4147 mr_if_needed(carry, product_high);
4148 b(L_first_loop);
4149
4150
4151 bind(L_one_y); // Load one 32 bit portion of y as (0,value).
4152
4153 lwz(y_idx, 0, y);
4154 b(L_multiply);
4155
4156
4157 bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4158
4159 lwz(x_xstart, 0, x);
4160 b(L_first_loop);
4161
4162 bind(L_first_loop_exit);
4163 }
4164
4165 // Multiply 64 bit by 64 bit and add 128 bit.
4166 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4167 Register z, Register yz_idx,
4168 Register idx, Register carry,
4169 Register product_high, Register product,
4170 Register tmp, int offset) {
4171
4172 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4173 // z[kdx] = (jlong)product;
4174
4175 sldi(tmp, idx, LogBytesPerInt);
4176 if (offset) {
4177 addi(tmp, tmp, offset);
4178 }
4179 ldx(yz_idx, y, tmp);
4180 #ifdef VM_LITTLE_ENDIAN
4181 rldicl(yz_idx, yz_idx, 32, 0);
4182 #endif
4183
4184 multiply64(product_high, product, x_xstart, yz_idx);
4185 ldx(yz_idx, z, tmp);
4186 #ifdef VM_LITTLE_ENDIAN
4187 rldicl(yz_idx, yz_idx, 32, 0);
4188 #endif
4189
4190 add2_with_carry(product_high, product, carry, yz_idx);
4191
4192 sldi(tmp, idx, LogBytesPerInt);
4193 if (offset) {
4194 addi(tmp, tmp, offset);
4195 }
4196 #ifdef VM_LITTLE_ENDIAN
4197 rldicl(product, product, 32, 0);
4198 #endif
4199 stdx(product, z, tmp);
4200 }
4201
4202 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4203 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4204 Register y, Register z,
4205 Register yz_idx, Register idx, Register carry,
4206 Register product_high, Register product,
4207 Register carry2, Register tmp) {
4208
4209 // jlong carry, x[], y[], z[];
4210 // int kdx = ystart+1;
4211 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4212 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4213 // z[kdx+idx+1] = (jlong)product;
4214 // jlong carry2 = (jlong)(product >>> 64);
4215 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4216 // z[kdx+idx] = (jlong)product;
4217 // carry = (jlong)(product >>> 64);
4218 // }
4219 // idx += 2;
4220 // if (idx > 0) {
4221 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4222 // z[kdx+idx] = (jlong)product;
4223 // carry = (jlong)(product >>> 64);
4224 // }
4225
4226 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4227 const Register jdx = R0;
4228
4229 // Scale the index.
4230 srdi_(jdx, idx, 2);
4231 beq(CR0, L_third_loop_exit);
4232 mtctr(jdx);
4233
4234 align(32, 16);
4235 bind(L_third_loop);
4236
4237 addi(idx, idx, -4);
4238
4239 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4240 mr_if_needed(carry2, product_high);
4241
4242 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4243 mr_if_needed(carry, product_high);
4244 bdnz(L_third_loop);
4245
4246 bind(L_third_loop_exit); // Handle any left-over operand parts.
4247
4248 andi_(idx, idx, 0x3);
4249 beq(CR0, L_post_third_loop_done);
4250
4251 Label L_check_1;
4252
4253 addic_(idx, idx, -2);
4254 blt(CR0, L_check_1);
4255
4256 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4257 mr_if_needed(carry, product_high);
4258
4259 bind(L_check_1);
4260
4261 addi(idx, idx, 0x2);
4262 andi_(idx, idx, 0x1);
4263 addic_(idx, idx, -1);
4264 blt(CR0, L_post_third_loop_done);
4265
4266 sldi(tmp, idx, LogBytesPerInt);
4267 lwzx(yz_idx, y, tmp);
4268 multiply64(product_high, product, x_xstart, yz_idx);
4269 lwzx(yz_idx, z, tmp);
4270
4271 add2_with_carry(product_high, product, yz_idx, carry);
4272
4273 sldi(tmp, idx, LogBytesPerInt);
4274 stwx(product, z, tmp);
4275 srdi(product, product, 32);
4276
4277 sldi(product_high, product_high, 32);
4278 orr(product, product, product_high);
4279 mr_if_needed(carry, product);
4280
4281 bind(L_post_third_loop_done);
4282 } // multiply_128_x_128_loop
4283
4284 void MacroAssembler::muladd(Register out, Register in,
4285 Register offset, Register len, Register k,
4286 Register tmp1, Register tmp2, Register carry) {
4287
4288 // Labels
4289 Label LOOP, SKIP;
4290
4291 // Make sure length is positive.
4292 cmpdi (CR0, len, 0);
4293
4294 // Prepare variables
4295 subi (offset, offset, 4);
4296 li (carry, 0);
4297 ble (CR0, SKIP);
4298
4299 mtctr (len);
4300 subi (len, len, 1 );
4301 sldi (len, len, 2 );
4302
4303 // Main loop
4304 bind(LOOP);
4305 lwzx (tmp1, len, in );
4306 lwzx (tmp2, offset, out );
4307 mulld (tmp1, tmp1, k );
4308 add (tmp2, carry, tmp2 );
4309 add (tmp2, tmp1, tmp2 );
4310 stwx (tmp2, offset, out );
4311 srdi (carry, tmp2, 32 );
4312 subi (offset, offset, 4 );
4313 subi (len, len, 4 );
4314 bdnz (LOOP);
4315 bind(SKIP);
4316 }
4317
// Emits code that multiplies the word arrays x (xlen words) and y (ylen words)
// into z, following the Java-like pseudo code quoted in the comments below.
//
// Register usage visible in this function:
//  - tmp1..tmp9 are working registers (idx, kdx, xstart, y_idx, carry, product,
//    product_high, x_xstart and a general scratch register).
//  - tmp10..tmp13 hold copies of z, x, the outer loop index and ylen across the
//    call to multiply_128_x_128_loop; z, x and ylen are copied back afterwards.
//  - xlen is overwritten with the saved outer loop index.
//  - CR0 is written by the addic_/cmpdi instructions emitted here.
// NOTE(review): only tmp1..tmp8 take part in the assert_different_registers
// checks below; tmp9..tmp13 are presumably expected to be distinct too - confirm.
4318 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4319 Register y, Register ylen,
4320 Register z,
4321 Register tmp1, Register tmp2,
4322 Register tmp3, Register tmp4,
4323 Register tmp5, Register tmp6,
4324 Register tmp7, Register tmp8,
4325 Register tmp9, Register tmp10,
4326 Register tmp11, Register tmp12,
4327 Register tmp13) {
4328
4329 ShortBranchVerifier sbv(this);
4330
4331 assert_different_registers(x, xlen, y, ylen, z,
4332 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4333 assert_different_registers(x, xlen, y, ylen, z,
4334 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4335 assert_different_registers(x, xlen, y, ylen, z,
4336 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4337
// Symbolic names for the temporaries used below.
4338 const Register idx = tmp1;
4339 const Register kdx = tmp2;
4340 const Register xstart = tmp3;
4341
4342 const Register y_idx = tmp4;
4343 const Register carry = tmp5;
4344 const Register product = tmp6;
4345 const Register product_high = tmp7;
4346 const Register x_xstart = tmp8;
4347 const Register tmp = tmp9;
4348
4349 // First Loop.
4350 //
4351 // final static long LONG_MASK = 0xffffffffL;
4352 // int xstart = xlen - 1;
4353 // int ystart = ylen - 1;
4354 // long carry = 0;
4355 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4356 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4357 // z[kdx] = (int)product;
4358 // carry = product >>> 32;
4359 // }
4360 // z[xstart] = (int)carry;
4361
4362 mr_if_needed(idx, ylen); // idx = ylen
4363 add(kdx, xlen, ylen); // kdx = xlen + ylen
4364 li(carry, 0); // carry = 0
4365
4366 Label L_done;
4367
// xstart = xlen - 1; skip everything if the result is negative.
4368 addic_(xstart, xlen, -1);
4369 blt(CR0, L_done);
4370
4371 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4372 carry, product_high, product, idx, kdx, tmp);
4373
4374 Label L_second_loop;
4375
// kdx == 0: no carry word left to store, go straight to the second loop.
4376 cmpdi(CR0, kdx, 0);
4377 beq(CR0, L_second_loop);
4378
4379 Label L_carry;
4380
// kdx == 1 (now 0): only one 32-bit slot is left, store just one carry word.
4381 addic_(kdx, kdx, -1);
4382 beq(CR0, L_carry);
4383
4384 // Store lower 32 bits of carry.
4385 sldi(tmp, kdx, LogBytesPerInt);
4386 stwx(carry, z, tmp);
4387 srdi(carry, carry, 32);
4388 addi(kdx, kdx, -1);
4389
4390
4391 bind(L_carry);
4392
4393 // Store upper 32 bits of carry.
4394 sldi(tmp, kdx, LogBytesPerInt);
4395 stwx(carry, z, tmp);
4396
4397 // Second and third (nested) loops.
4398 //
4399 // for (int i = xstart-1; i >= 0; i--) { // Second loop
4400 // carry = 0;
4401 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4402 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4403 // (z[k] & LONG_MASK) + carry;
4404 // z[k] = (int)product;
4405 // carry = product >>> 32;
4406 // }
4407 // z[i] = (int)carry;
4408 // }
4409 //
4410 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = x_xstart (tmp8)
4411
4412 bind(L_second_loop);
4413
4414 li(carry, 0); // carry = 0;
4415
4416 addic_(xstart, xstart, -1); // i = xstart-1;
4417 blt(CR0, L_done);
4418
// z is advanced for the inner loop; keep the original value in zsave.
4419 Register zsave = tmp10;
4420
4421 mr(zsave, z);
4422
4423
4424 Label L_last_x;
4425
4426 sldi(tmp, xstart, LogBytesPerInt);
4427 add(z, z, tmp); // z = z + k - j
4428 addi(z, z, 4);
4429 addic_(xstart, xstart, -1); // i = xstart-1;
4430 blt(CR0, L_last_x);
4431
// Load 64 bits of x at byte offset xstart * 4.
4432 sldi(tmp, xstart, LogBytesPerInt);
4433 ldx(x_xstart, x, tmp);
4434 #ifdef VM_LITTLE_ENDIAN
// Rotate by 32 bits, i.e. swap the two 32-bit halves of the loaded doubleword.
4435 rldicl(x_xstart, x_xstart, 32, 0);
4436 #endif
4437
4438
4439 Label L_third_loop_prologue;
4440
4441 bind(L_third_loop_prologue);
4442
// Copies of x, the loop index and ylen, restored after the inner loop helper.
4443 Register xsave = tmp11;
4444 Register xlensave = tmp12;
4445 Register ylensave = tmp13;
4446
4447 mr(xsave, x);
4448 mr(xlensave, xstart);
4449 mr(ylensave, ylen);
4450
4451
4452 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4453 carry, product_high, product, x, tmp);
4454
4455 mr(z, zsave);
4456 mr(x, xsave);
4457 mr(xlen, xlensave); // This is the decrement of the loop counter!
4458 mr(ylen, ylensave);
4459
// Store the low carry word at index xlen + 1; tmp3 aliases xstart.
4460 addi(tmp3, xlen, 1);
4461 sldi(tmp, tmp3, LogBytesPerInt);
4462 stwx(carry, z, tmp);
4463 addic_(tmp3, tmp3, -1);
4464 blt(CR0, L_done);
4465
// Store the high carry word one slot below and continue with the next x word(s).
4466 srdi(carry, carry, 32);
4467 sldi(tmp, tmp3, LogBytesPerInt);
4468 stwx(carry, z, tmp);
4469 b(L_second_loop);
4470
4471 // Next infrequent code is moved outside loops.
4472 bind(L_last_x);
4473
// Only a single 32-bit word of x is left: load x[0] zero-extended.
4474 lwz(x_xstart, 0, x);
4475 b(L_third_loop_prologue);
4476
4477 bind(L_done);
4478 } // multiply_to_len
4479
4480 void MacroAssembler::increment_mem64(Register base, RegisterOrConstant ind_or_offs, int val, Register tmp) {
4481 ld(tmp, ind_or_offs, base);
4482 addi(tmp, tmp, val);
4483 std(tmp, ind_or_offs, base);
4484 }
4485
4486 // Handle the receiver type profile update given the "recv" klass.
4487 //
4488 // Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
4489 // If there are no matching or claimable receiver entries in RD, updates
4490 // the polymorphic counter.
4491 //
4492 // This code is expected to run by either the interpreter or JIT-ed code, without
4493 // extra synchronization. For safety, receiver cells are claimed atomically, which
4494 // avoids grossly misrepresenting the profiles under concurrent updates. For speed,
4495 // counter updates are not atomic.
4496 //
// Registers:
//  - recv: receiver klass that is compared against / installed into the rows.
//  - mdp:  base register for all profile accesses; mdp_offset is added to the
//          row and counter offsets.
//  - tmp1: running byte offset into the profile (also the CAS address).
//  - tmp2: holds the row limit for reloading CTR; may be noreg, in which case
//          R0 is used to set up CTR and recv serves as the scratch register of
//          the final counter increment (recv is overwritten then).
//  R0, CR0 and CTR are written by the emitted code.
4497 void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset, Register tmp1, Register tmp2) {
4498 assert_different_registers(recv, mdp, tmp1, tmp2);
4499
// Distances between rows and from a receiver cell to its counter cell.
4500 int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0));
4501 int poly_count_offset = in_bytes(CounterData::count_offset());
4502 int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
4503 int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;
4504
4505 // Adjust for MDP offsets.
4506 base_receiver_offset += mdp_offset;
4507 poly_count_offset += mdp_offset;
4508
4509 #ifdef ASSERT
4510 // We are about to walk the MDO slots without asking for offsets.
4511 // Check that our math hits all the right spots.
4512 for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
4513 int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
4514 int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
4515 int offset = base_receiver_offset + receiver_step*c;
4516 int count_offset = offset + receiver_to_count_step;
4517 assert(offset == real_recv_offset, "receiver slot math");
4518 assert(count_offset == real_count_offset, "receiver count math");
4519 }
4520 int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
4521 assert(poly_count_offset == real_poly_count_offset, "poly counter math");
4522 #endif
4523
4524 // Corner case: no profile table. Increment poly counter and exit.
4525 if (ReceiverTypeData::row_limit() == 0) {
4526 increment_mem64(mdp, poly_count_offset, DataLayout::counter_increment, tmp1);
4527 return;
4528 }
4529
4530 Label L_loop_search_receiver, L_loop_search_empty;
4531 Label L_restart, L_found_recv, L_found_empty, L_count_update;
4532 Register offset = tmp1, count = tmp2;
4533
4534 // The code here recognizes three major cases:
4535 // A. Fastest: receiver found in the table
4536 // B. Fast: no receiver in the table, and the table is full
4537 // C. Slow: no receiver in the table, free slots in the table
4538 //
4539 // The case A performance is most important, as perfectly-behaved code would end up
4540 // there, especially with larger TypeProfileWidth. The case B performance is
4541 // important as well, this is where bulk of code would land for normally megamorphic
4542 // cases. The case C performance is not essential, its job is to deal with installation
4543 // races, we optimize for code density instead. Case C needs to make sure that receiver
4544 // rows are only claimed once. This makes sure we never overwrite a row for another
4545 // receiver and never duplicate the receivers in the list, making profile type-accurate.
4546 //
4547 // It is very tempting to handle these cases in a single loop, and claim the first slot
4548 // without checking the rest of the table. But, profiling code should tolerate free slots
4549 // in the table, as class unloading can clear them. After such cleanup, the receiver
4550 // we need might be _after_ the free slot. Therefore, we need to let at least full scan
4551 // to complete, before trying to install new slots. Splitting the code in several tight
4552 // loops also helpfully optimizes for cases A and B.
4553 //
4554 // This code is effectively:
4555 //
4556 // restart:
4557 // // Fastest: receiver is already installed
4558 // for (i = 0; i < receiver_count(); i++) {
4559 // if (receiver(i) == recv) goto found_recv(i);
4560 // }
4561 //
4562 // // Fast: no receiver, but profile is not full
4563 // for (i = 0; i < receiver_count(); i++) {
4564 // if (receiver(i) == null) goto found_null(i);
4565 // }
4566 //
4567 // // Slow: profile is full, polymorphic case
4568 // count++;
4569 // return
4570 //
4571 // // Slow: try to install receiver
4572 // found_null(i):
4573 // CAS(&receiver(i), null, recv);
4574 // goto restart
4575 //
4576 // found_recv(i):
4577 // *receiver_count(i)++
4578 //
4579
// Keep the row limit in a register (if one was provided) so CTR can be
// reloaded cheaply on every restart.
4580 if (count != noreg) {
4581 li(count, ReceiverTypeData::row_limit());
4582 }
4583
4584 bind(L_restart);
4585
4586 // Fastest: receiver is already installed
4587 if (count != noreg) {
4588 mtctr(count);
4589 } else {
4590 li(R0, ReceiverTypeData::row_limit());
4591 mtctr(R0);
4592 }
4593 li(offset, base_receiver_offset);
4594 bind(L_loop_search_receiver);
4595 ldx(R0, offset, mdp);
4596 cmpd(CR0, R0, recv);
4597 beq(CR0, L_found_recv);
4598 addi(offset, offset, receiver_step);
4599 bdnz(L_loop_search_receiver);
4600
4601 // Fast: no receiver, but profile is not full
4602 if (count != noreg) {
4603 mtctr(count);
4604 } else {
4605 li(R0, ReceiverTypeData::row_limit());
4606 mtctr(R0);
4607 }
4608 li(offset, base_receiver_offset);
4609 bind(L_loop_search_empty);
4610 ldx(R0, offset, mdp);
4611 cmpdi(CR0, R0, 0);
4612 beq(CR0, L_found_empty);
4613 addi(offset, offset, receiver_step);
4614 bdnz(L_loop_search_empty);
4615
4616 // Slow: Receiver is not found and table is full.
4617 // Increment polymorphic counter instead of receiver slot.
4618 li(offset, poly_count_offset);
4619 b(L_count_update);
4620
4621 // Slowest: try to install receiver
4622 bind(L_found_empty);
4623
4624 // Atomically swing receiver slot: null -> recv.
4625 {
4626 Register receiver_addr = offset;
4627 add(receiver_addr, mdp, offset); // kills offset
// Weak CAS without memory barriers; its outcome is not inspected because the
// search is restarted in either case.
4628 cmpxchgd(CR0, R0, RegisterOrConstant(0), recv, receiver_addr, MemBarNone, cmpxchgx_hint_atomic_update(),
4629 noreg, nullptr, /* check without ldarx first */ false, /* weak */ true);
4630 }
4631
4632 // CAS success means the slot now has the receiver we want. CAS failure means
4633 // something had claimed the slot concurrently: it can be the same receiver we want,
4634 // or something else. Since this is a slow path, we can optimize for code density,
4635 // and just restart the search from the beginning.
4636 b(L_restart);
4637
4638 // Found a receiver, convert its slot offset to corresponding count offset.
4639 bind(L_found_recv);
4640 addi(offset, offset, receiver_to_count_step);
4641
4642 // Finally, update the counter
4643 bind(L_count_update);
// Without a dedicated count register, recv is reused as scratch and is overwritten here.
4644 increment_mem64(mdp, offset, DataLayout::counter_increment, /* temp */ (count != noreg) ? count : recv);
4645 }
4646
4647 #ifdef ASSERT
4648 void MacroAssembler::asm_assert(AsmAssertCond cond, const char *msg) {
4649 Label ok;
4650 switch (cond) {
4651 case eq:
4652 beq(CR0, ok);
4653 break;
4654 case ne:
4655 bne(CR0, ok);
4656 break;
4657 case ge:
4658 bge(CR0, ok);
4659 break;
4660 case gt:
4661 bgt(CR0, ok);
4662 break;
4663 case lt:
4664 blt(CR0, ok);
4665 break;
4666 case le:
4667 ble(CR0, ok);
4668 break;
4669 default:
4670 assert(false, "unknown cond:%d", cond);
4671 }
4672 stop(msg);
4673 bind(ok);
4674 }
4675
4676 void MacroAssembler::asm_assert_mems_zero(AsmAssertCond cond, int size, int mem_offset,
4677 Register mem_base, const char* msg) {
4678 switch (size) {
4679 case 4:
4680 lwz(R0, mem_offset, mem_base);
4681 cmpwi(CR0, R0, 0);
4682 break;
4683 case 8:
4684 ld(R0, mem_offset, mem_base);
4685 cmpdi(CR0, R0, 0);
4686 break;
4687 default:
4688 ShouldNotReachHere();
4689 }
4690 asm_assert(cond, msg);
4691 }
4692 #endif // ASSERT
4693
4694 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4695 if (!VerifyOops) { return; }
4696 if (UseCompressedOops) { decode_heap_oop(coop); }
4697 verify_oop(coop, msg);
4698 if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4699 }
4700
4701 // READ: oop. KILL: R0. Volatile floats perhaps.
4702 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4703 if (!VerifyOops) {
4704 return;
4705 }
4706
4707 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4708 const Register tmp = R11; // Will be preserved.
4709 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4710
4711 BLOCK_COMMENT("verify_oop {");
4712
4713 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4714
4715 mr_if_needed(R4_ARG2, oop);
4716 save_LR_CR(tmp); // save in old frame
4717 push_frame_reg_args(nbytes_save, tmp);
4718 // load FunctionDescriptor** / entry_address *
4719 load_const_optimized(tmp, fd, R0);
4720 // load FunctionDescriptor* / entry_address
4721 ld(tmp, 0, tmp);
4722 load_const_optimized(R3_ARG1, (address)msg, R0);
4723 // Call destination for its side effect.
4724 call_c(tmp);
4725
4726 pop_frame();
4727 restore_LR_CR(tmp);
4728 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4729
4730 BLOCK_COMMENT("} verify_oop");
4731 }
4732
4733 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4734 if (!VerifyOops) {
4735 return;
4736 }
4737
4738 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4739 const Register tmp = R11; // Will be preserved.
4740 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4741 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4742
4743 ld(R4_ARG2, offs, base);
4744 save_LR_CR(tmp); // save in old frame
4745 push_frame_reg_args(nbytes_save, tmp);
4746 // load FunctionDescriptor** / entry_address *
4747 load_const_optimized(tmp, fd, R0);
4748 // load FunctionDescriptor* / entry_address
4749 ld(tmp, 0, tmp);
4750 load_const_optimized(R3_ARG1, (address)msg, R0);
4751 // Call destination for its side effect.
4752 call_c(tmp);
4753
4754 pop_frame();
4755 restore_LR_CR(tmp);
4756 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4757 }
4758
4759 // Call a C-function that prints output.
4760 void MacroAssembler::stop(int type, const char* msg) {
4761 bool msg_present = (msg != nullptr);
4762
4763 #ifndef PRODUCT
4764 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4765 #else
4766 block_comment("stop {");
4767 #endif
4768
4769 if (msg_present) {
4770 type |= stop_msg_present;
4771 }
4772 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4773 if (msg_present) {
4774 emit_int64((uintptr_t)msg);
4775 }
4776
4777 block_comment("} stop;");
4778 }
4779
4780 #ifndef PRODUCT
4781 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4782 // Val, addr are temp registers.
4783 // If low == addr, addr is killed.
4784 // High is preserved.
// 'before' and 'after' are counted in words (they are scaled by BytesPerWord).
// Emits nothing unless ZapMemory is set. The loop variant writes CR6.
// NOTE(review): the unrolled variant (low == high, 1..4 words) stores
// before + after words starting at low - before, i.e. it stops one word short
// of low + after, whereas the loop variant stores up to and including
// high + after - confirm which extent is intended.
4785 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4786 if (!ZapMemory) return;
4787
4788 assert_different_registers(low, val);
4789
4790 BLOCK_COMMENT("zap memory region {");
4791 load_const_optimized(val, 0x0101010101010101);
4792 int size = before + after;
4793 if (low == high && size < 5 && size > 0) {
// Small fixed-size region: emit one store per word, no loop and no addr register.
4794 int offset = -before*BytesPerWord;
4795 for (int i = 0; i < size; ++i) {
4796 std(val, offset, low);
4797 offset += (1*BytesPerWord);
4798 }
4799 } else {
4800 addi(addr, low, -before*BytesPerWord);
4801 assert_different_registers(high, val);
// Temporarily move 'high' to the end of the region so it can serve as loop bound.
4802 if (after) addi(high, high, after * BytesPerWord);
4803 Label loop;
4804 bind(loop);
4805 std(val, 0, addr);
4806 addi(addr, addr, 8);
4807 cmpd(CR6, addr, high);
4808 ble(CR6, loop);
4809 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
4810 }
4811 BLOCK_COMMENT("} zap memory region");
4812 }
4813
4814 #endif // !PRODUCT
4815
4816 void MacroAssembler::cache_wb(Address line) {
4817 assert(line.index() == noreg, "index should be noreg");
4818 assert(line.disp() == 0, "displacement should be 0");
4819 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4820 // Data Cache Store, not really a flush, so it works like a sync of cache
4821 // line and persistent mem, i.e. copying the cache line to persistent whilst
4822 // not invalidating the cache line.
4823 dcbst(line.base());
4824 }
4825
4826 void MacroAssembler::cache_wbsync(bool is_presync) {
4827 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4828 // We only need a post sync barrier. Post means _after_ a cache line flush or
4829 // store instruction, pre means a barrier emitted before such a instructions.
4830 if (!is_presync) {
4831 fence();
4832 }
4833 }
4834
4835 void MacroAssembler::push_cont_fastpath() {
4836 if (!Continuations::enabled()) return;
4837
4838 Label done;
4839 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4840 cmpld(CR0, R1_SP, R0);
4841 ble(CR0, done); // if (SP <= _cont_fastpath) goto done;
4842 st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4843 bind(done);
4844 }
4845
4846 void MacroAssembler::pop_cont_fastpath() {
4847 if (!Continuations::enabled()) return;
4848
4849 Label done;
4850 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4851 cmpld(CR0, R1_SP, R0);
4852 blt(CR0, done); // if (SP < _cont_fastpath) goto done;
4853 li(R0, 0);
4854 st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4855 bind(done);
4856 }
4857
4858 // Function to flip between unlocked and locked state (fast locking).
4859 // Branches to failed if the state is not as expected with CR0 NE.
4860 // Falls through upon success with CR0 EQ.
4861 // This requires fewer instructions and registers and is easier to use than the
4862 // cmpxchg based implementation.
//
//  - is_unlock: false emits the unlocked -> locked transition, true the reverse.
//  - obj:       register used directly as the address for ldarx/stdcx_.
//  - tmp:       receives the loaded header and holds the value that is stored.
//  - failed:    branch target when the loaded header is not in the expected state.
//  - semantics: bit set of MemBarRel (release before the update) and
//               MemBarFenceAfter / MemBarAcq (fence resp. isync after it).
//  R0 and CR0 are written by the emitted code.
4863 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4864 assert_different_registers(obj, tmp, R0);
4865 Label retry;
4866
4867 if (semantics & MemBarRel) {
4868 release();
4869 }
4870
// Load-reserve / store-conditional loop; retried while the conditional store fails.
4871 bind(retry);
4872 STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4873 if (!is_unlock) {
4874 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4875 xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
4876 andi_(R0, tmp, markWord::lock_mask_in_place | markWord::inline_type_bit_in_place);
4877 bne(CR0, failed); // failed if new header doesn't contain locked_value (which is 0) or belongs to an inline type
4878 } else {
4879 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4880 andi_(R0, tmp, markWord::lock_mask_in_place);
4881 bne(CR0, failed); // failed if old header doesn't contain locked_value (which is 0)
4882 ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
4883 }
4884 stdcx_(tmp, obj);
4885 bne(CR0, retry);
4886
// A trailing fence takes precedence over a plain acquire (isync).
4887 if (semantics & MemBarFenceAfter) {
4888 fence();
4889 } else if (semantics & MemBarAcq) {
4890 isync();
4891 }
4892 }
4893
4894 // Implements fast-locking.
4895 //
// - box: only accessed when UseObjectMonitorTable is set, to clear the
//        object monitor cache field of its lock
4896 // - obj: the object to be locked
4897 // - t1, t2: temporary register
// - slow: branch target for all cases that are not handled here
//
// R0 and CR0 are written by the emitted code. On fall-through the object has
// been pushed onto the thread's lock-stack.
4898 void MacroAssembler::fast_lock(Register box, Register obj, Register t1, Register t2, Label& slow) {
4899 assert_different_registers(box, obj, t1, t2, R0);
4900
4901 Label push;
4902 const Register t = R0;
4903
4904 if (UseObjectMonitorTable) {
4905 // Clear cache in case fast locking succeeds or we need to take the slow-path.
4906 li(t, 0);
4907 std(t, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
4908 }
4909
4910 if (DiagnoseSyncOnValueBasedClasses != 0) {
// Take the slow path when the klass is flagged as value-based class.
4911 load_klass(t1, obj);
4912 lbz(t1, in_bytes(Klass::misc_flags_offset()), t1);
4913 testbitdi(CR0, R0, t1, exact_log2(KlassFlags::_misc_is_value_based_class));
4914 bne(CR0, slow);
4915 }
4916
4917 const Register top = t1;
4918 const Register mark = t2;
4919
4920 // Check if the lock-stack is full.
4921 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4922 cmplwi(CR0, top, LockStack::end_offset());
4923 bge(CR0, slow);
4924
4925 // The underflow check is elided. The recursive check will always fail
4926 // when the lock stack is empty because of the _bad_oop_sentinel field.
4927
4928 // Check for recursion.
// top is an offset relative to R16_thread; the entry below it is the most recent one.
4929 subi(t, top, oopSize);
4930 ldx(t, R16_thread, t);
4931 cmpd(CR0, obj, t);
4932 beq(CR0, push);
4933
4934 // Check header for monitor (0b10) or locked (0b00).
4935 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4936 xori(t, mark, markWord::unlocked_value);
4937 andi_(t, t, markWord::lock_mask_in_place);
4938 bne(CR0, slow);
4939
4940 // Try to lock. Transition lock bits 0b01 => 0b00
4941 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq);
4942
4943 bind(push);
4944 // After successful lock, push object on lock-stack
4945 stdx(obj, R16_thread, top);
4946 addi(top, top, oopSize);
4947 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4948 }
4949
4950 // Implements fast-unlocking.
4951 //
4952 // - obj: the object to be unlocked
4953 // - t1: temporary register
// - slow: branch target for all cases that are not handled here; the
//         lock-stack top is put back to its previous value before branching
//         there from push_and_slow
//
// R0 and CR0 are written by the emitted code.
4954 void MacroAssembler::fast_unlock(Register obj, Register t1, Label& slow) {
4955 assert_different_registers(obj, t1);
4956
4957 #ifdef ASSERT
4958 {
4959 // The following checks rely on the fact that LockStack is only ever modified by
4960 // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4961 // entries after inflation will happen delayed in that case.
4962
4963 // Check for lock-stack underflow.
4964 Label stack_ok;
4965 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4966 cmplwi(CR0, t1, LockStack::start_offset());
4967 bge(CR0, stack_ok);
4968 stop("Lock-stack underflow");
4969 bind(stack_ok);
4970 }
4971 #endif
4972
4973 Label unlocked, push_and_slow;
4974 const Register top = t1;
4975 const Register mark = R0;
// t starts out as R0 and is switched to 'top' further down, once R0 is needed for the mark.
4976 Register t = R0;
4977
4978 // Check if obj is top of lock-stack.
4979 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4980 subi(top, top, oopSize);
4981 ldx(t, R16_thread, top);
4982 cmpd(CR0, obj, t);
4983 bne(CR0, slow);
4984
4985 // Pop lock-stack.
// Debug builds additionally zero the popped slot.
4986 DEBUG_ONLY(li(t, 0);)
4987 DEBUG_ONLY(stdx(t, R16_thread, top);)
4988 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4989
4990 // The underflow check is elided. The recursive check will always fail
4991 // when the lock stack is empty because of the _bad_oop_sentinel field.
4992
4993 // Check if recursive.
4994 subi(t, top, oopSize);
4995 ldx(t, R16_thread, t);
4996 cmpd(CR0, obj, t);
4997 beq(CR0, unlocked);
4998
4999 // Use top as tmp
5000 t = top;
5001
5002 // Not recursive. Check header for monitor (0b10).
5003 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
5004 andi_(t, mark, markWord::monitor_value);
5005 bne(CR0, push_and_slow);
5006
5007 #ifdef ASSERT
5008 // Check header not unlocked (0b01).
5009 Label not_unlocked;
5010 andi_(t, mark, markWord::unlocked_value);
5011 beq(CR0, not_unlocked);
5012 stop("fast_unlock already unlocked");
5013 bind(not_unlocked);
5014 #endif
5015
5016 // Try to unlock. Transition lock bits 0b00 => 0b01
5017 atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel);
5018 b(unlocked);
5019
5020 bind(push_and_slow);
5021
5022 // Restore lock-stack and handle the unlock in runtime.
// top was used as scratch above, so the (already decremented) value is reloaded first.
5023 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
5024 DEBUG_ONLY(stdx(obj, R16_thread, top);)
5025 addi(top, top, oopSize);
5026 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
5027 b(slow);
5028
5029 bind(unlocked);
5030 }
5031
5032 // Unimplemented methods for inline types.
// Placeholder: the body only calls Unimplemented(); no code is emitted and no
// value is returned.
5033 int MacroAssembler::store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter) {
5034 Unimplemented();
5035 }
5036
// Placeholder for inline type support: the body only calls Unimplemented();
// none of the parameters are used.
5037 bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]) {
5038 Unimplemented();
5039 }
5040
// Placeholder for inline type support: the body only calls Unimplemented();
// none of the parameters are used.
5041 bool MacroAssembler::unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
5042 VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
5043 RegState reg_state[]) {
5044 Unimplemented();
5045 }
5046
// Placeholder for inline type support: the body only calls Unimplemented();
// none of the parameters are used.
5047 bool MacroAssembler::pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
5048 VMRegPair* from, int from_count, int& from_index, VMReg to,
5049 RegState reg_state[], Register val_array) {
5050 Unimplemented();
5051 }
5052
// Placeholder for inline type support: the body only calls Unimplemented();
// args_on_stack is not used.
5053 int MacroAssembler::extend_stack_for_inline_args(int args_on_stack) {
5054 Unimplemented();
5055 }
5056
// Placeholder for inline type support: the body only calls Unimplemented();
// reg is not used.
5057 VMReg MacroAssembler::spill_reg_for(VMReg reg) {
5058 Unimplemented();
5059 }