1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2012, 2026 SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.inline.hpp"
27 #include "code/compiledIC.hpp"
28 #include "compiler/disassembler.hpp"
29 #include "gc/shared/collectedHeap.inline.hpp"
30 #include "gc/shared/barrierSet.hpp"
31 #include "gc/shared/barrierSetAssembler.hpp"
32 #include "interpreter/interpreter.hpp"
33 #include "interpreter/interpreterRuntime.hpp"
34 #include "memory/resourceArea.hpp"
35 #include "nativeInst_ppc.hpp"
36 #include "oops/compressedKlass.inline.hpp"
37 #include "oops/compressedOops.inline.hpp"
38 #include "oops/klass.inline.hpp"
39 #include "oops/methodData.hpp"
40 #include "prims/methodHandles.hpp"
41 #include "register_ppc.hpp"
42 #include "runtime/icache.hpp"
43 #include "runtime/interfaceSupport.inline.hpp"
44 #include "runtime/objectMonitor.hpp"
45 #include "runtime/objectMonitorTable.hpp"
46 #include "runtime/os.hpp"
47 #include "runtime/safepoint.hpp"
48 #include "runtime/safepointMechanism.hpp"
49 #include "runtime/sharedRuntime.hpp"
50 #include "runtime/stubRoutines.hpp"
51 #include "runtime/vm_version.hpp"
52 #include "utilities/macros.hpp"
53 #include "utilities/powerOfTwo.hpp"
54
55 #ifdef PRODUCT
56 #define BLOCK_COMMENT(str) // nothing
57 #else
58 #define BLOCK_COMMENT(str) block_comment(str)
59 #endif
60 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
61
62 #ifdef ASSERT
63 // On RISC, there's no benefit to verifying instruction boundaries.
64 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
65 #endif
66
67 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
68 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
69 if (Assembler::is_simm(si31, 16)) {
70 ld(d, si31, a);
71 if (emit_filler_nop) nop();
72 } else {
73 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
74 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
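    // Illustrative example of the split (the ld displacement is a signed 16-bit value,
    // so the high half compensates for the sign extension of the low half):
    //   si31 = 0x12348000  ->  hi = 0x1235, lo = -0x8000;  (hi << 16) + lo == si31.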
75 addis(d, a, hi);
76 ld(d, lo, d);
77 }
78 }
79
80 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
81 assert_different_registers(d, a);
82 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
83 }
84
85 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
86 size_t size_in_bytes, bool is_signed) {
87 switch (size_in_bytes) {
88 case 8: ld(dst, offs, base); break;
89 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
90 case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
91 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
92 default: ShouldNotReachHere();
93 }
94 }
95
96 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
97 size_t size_in_bytes) {
98 switch (size_in_bytes) {
99 case 8: std(dst, offs, base); break;
100 case 4: stw(dst, offs, base); break;
101 case 2: sth(dst, offs, base); break;
102 case 1: stb(dst, offs, base); break;
103 default: ShouldNotReachHere();
104 }
105 }
106
107 void MacroAssembler::align(int modulus, int max, int rem) {
108 int padding = (rem + modulus - (offset() % modulus)) % modulus;
109 if (padding > max) return;
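  // E.g., modulus = 16, rem = 0, offset() % 16 == 4  ->  padding = 12 bytes, i.e. 3 nops below,
  // after which offset() % modulus == rem.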
110 for (int c = (padding >> 2); c > 0; --c) { nop(); }
111 }
112
113 void MacroAssembler::align_prefix() {
114 if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
115 }
116
117 // Issue instructions that calculate the given address from the global TOC.
118 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
119 bool add_relocation, bool emit_dummy_addr,
120 bool add_addr_to_reloc) {
121 int offset = -1;
122 if (emit_dummy_addr) {
123 offset = -128; // dummy address
124 } else if (addr != (address)(intptr_t)-1) {
125 offset = MacroAssembler::offset_to_global_toc(addr);
126 }
127
128 if (hi16) {
129 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
130 }
131 if (lo16) {
132 if (add_relocation) {
133 // Relocate at the addi to avoid confusion with a load from the method's TOC.
134 RelocationHolder rh = add_addr_to_reloc ?
135 internal_word_Relocation::spec(addr) :
136 internal_word_Relocation::spec_for_immediate();
137 relocate(rh);
138 }
139 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
140 }
141 }
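
// With both halves requested, the sequence emitted above is
//   addis dst, R29_TOC, largeoffset_si16_si16_hi(offset)
//   addi  dst, dst,     largeoffset_si16_si16_lo(offset)
// i.e. dst = global TOC + offset, which is what the patching code below relies on.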
142
143 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
144 const int offset = MacroAssembler::offset_to_global_toc(addr);
145
146 const address inst2_addr = a;
147 const int inst2 = *(int *)inst2_addr;
148
149 // The relocation points to the second instruction, the addi,
150 // and the addi reads and writes the same register dst.
151 const int dst = inv_rt_field(inst2);
152 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
153
154 // Now, find the preceding addis which writes to dst.
155 int inst1 = 0;
156 address inst1_addr = inst2_addr - BytesPerInstWord;
157 while (inst1_addr >= bound) {
158 inst1 = *(int *) inst1_addr;
159 if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
160 // Stop, found the addis which writes dst.
161 break;
162 }
163 inst1_addr -= BytesPerInstWord;
164 }
165
166 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
167 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
168 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
169 return inst1_addr;
170 }
171
172 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
173 const address inst2_addr = a;
174 const int inst2 = *(int *)inst2_addr;
175
176 // The relocation points to the second instruction, the addi,
177 // and the addi reads and writes the same register dst.
178 const int dst = inv_rt_field(inst2);
179 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
180
181 // Now, find the preceding addis which writes to dst.
182 int inst1 = 0;
183 address inst1_addr = inst2_addr - BytesPerInstWord;
184 while (inst1_addr >= bound) {
185 inst1 = *(int *) inst1_addr;
186 if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
187 // stop, found the addis which writes dst
188 break;
189 }
190 inst1_addr -= BytesPerInstWord;
191 }
192
193 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
194
195 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
196 // -1 is a special case
197 if (offset == -1) {
198 return (address)(intptr_t)-1;
199 } else {
200 return global_toc() + offset;
201 }
202 }
203
204 #ifdef _LP64
205 // Patch compressed oops or klass constants.
206 // Assembler sequence is
207 // 1) compressed oops:
208 // lis rx = const.hi
209 // ori rx = rx | const.lo
210 // 2) compressed klass:
211 // lis rx = const.hi
212 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
213 // ori rx = rx | const.lo
214 // The clrldi, if present, is skipped over when patching.
215 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
216 assert(UseCompressedOops, "Should only patch compressed oops");
217
218 const address inst2_addr = a;
219 const int inst2 = *(int *)inst2_addr;
220
221 // The relocation points to the second instruction, the ori,
222 // and the ori reads and writes the same register dst.
223 const int dst = inv_rta_field(inst2);
224 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
225   // Now, find the preceding lis which writes to dst.
226 int inst1 = 0;
227 address inst1_addr = inst2_addr - BytesPerInstWord;
228 bool inst1_found = false;
229 while (inst1_addr >= bound) {
230 inst1 = *(int *)inst1_addr;
231 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
232 inst1_addr -= BytesPerInstWord;
233 }
234 assert(inst1_found, "inst is not lis");
235
236 uint32_t data_value = CompressedOops::narrow_oop_value(data);
237 int xc = (data_value >> 16) & 0xffff;
238 int xd = (data_value >> 0) & 0xffff;
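  // E.g., data_value = 0x1234abcd  ->  xc = 0x1234 (lis half), xd = 0xabcd (ori half).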
239
240 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
241 set_imm((int *)inst2_addr, (xd)); // unsigned int
242 return inst1_addr;
243 }
244
245 // Get compressed oop constant.
246 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
247 assert(UseCompressedOops, "Should only patch compressed oops");
248
249 const address inst2_addr = a;
250 const int inst2 = *(int *)inst2_addr;
251
252 // The relocation points to the second instruction, the ori,
253 // and the ori reads and writes the same register dst.
254 const int dst = inv_rta_field(inst2);
255 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
256 // Now, find the preceding lis which writes to dst.
257 int inst1 = 0;
258 address inst1_addr = inst2_addr - BytesPerInstWord;
259 bool inst1_found = false;
260
261 while (inst1_addr >= bound) {
262 inst1 = *(int *) inst1_addr;
263 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
264 inst1_addr -= BytesPerInstWord;
265 }
266 assert(inst1_found, "inst is not lis");
267
268 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
269 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
270
271 return CompressedOops::narrow_oop_cast(xl | xh);
272 }
273 #endif // _LP64
274
275 // Returns true if successful.
276 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
277 Register toc, bool fixed_size) {
278 int toc_offset = 0;
279 // Use RelocationHolder::none for the constant pool entry, otherwise
280 // we will end up with a failing NativeCall::verify(x) where x is
281 // the address of the constant pool entry.
282 // FIXME: We should insert relocation information for oops at the constant
283 // pool entries instead of inserting it at the loads; patching of a constant
284 // pool entry should be less expensive.
285 address const_address = address_constant((address)a.value(), RelocationHolder::none);
286 if (const_address == nullptr) { return false; } // allocation failure
287 // Relocate at the pc of the load.
288 relocate(a.rspec());
289 toc_offset = (int)(const_address - code()->consts()->start());
290 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
291 return true;
292 }
293
294 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
295 const address inst1_addr = a;
296 const int inst1 = *(int *)inst1_addr;
297
298 // The relocation points to the ld or the addis.
299 return (is_ld(inst1)) ||
300 (is_addis(inst1) && inv_ra_field(inst1) != 0);
301 }
302
303 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
304 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
305
306 const address inst1_addr = a;
307 const int inst1 = *(int *)inst1_addr;
308
309 if (is_ld(inst1)) {
310 return inv_d1_field(inst1);
311 } else if (is_addis(inst1)) {
312 const int dst = inv_rt_field(inst1);
313
314 // Now, find the succeeding ld which reads and writes to dst.
315 address inst2_addr = inst1_addr + BytesPerInstWord;
316 int inst2 = 0;
317 while (true) {
318 inst2 = *(int *) inst2_addr;
319 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
320 // Stop, found the ld which reads and writes dst.
321 break;
322 }
323 inst2_addr += BytesPerInstWord;
324 }
325 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
326 }
327 ShouldNotReachHere();
328 return 0;
329 }
330
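// Both get_const and patch_const below assume the 5-instruction `load_const' sequence.
// The 16-bit immediate slots map onto the 64-bit constant as follows (derived from the
// get_imm/set_imm indices used below; the remaining instruction carries no immediate):
//   ori variant (is_ori(*(p+1))): imm 0 = bits 63..48, imm 1 = bits 47..32, imm 3 = bits 31..16, imm 4 = bits 15..0
//   lis variant (is_lis(*(p+1))): imm 0 = bits 63..48, imm 2 = bits 47..32, imm 1 = bits 31..16, imm 3 = bits 15..0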
331 // Get the constant from a `load_const' sequence.
332 long MacroAssembler::get_const(address a) {
333 assert(is_load_const_at(a), "not a load of a constant");
334 const int *p = (const int*) a;
335 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
336 if (is_ori(*(p+1))) {
337 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
338 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
339 x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
340 } else if (is_lis(*(p+1))) {
341 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
342 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
343 x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
344 } else {
345 ShouldNotReachHere();
346 return (long) 0;
347 }
348 return (long) x;
349 }
350
351 // Patch the 64 bit constant of a `load_const' sequence. This is a low
352 // level procedure. It neither flushes the instruction cache nor is it
353 // MT-safe.
354 void MacroAssembler::patch_const(address a, long x) {
355 assert(is_load_const_at(a), "not a load of a constant");
356 int *p = (int*) a;
357 if (is_ori(*(p+1))) {
358 set_imm(0 + p, (x >> 48) & 0xffff);
359 set_imm(1 + p, (x >> 32) & 0xffff);
360 set_imm(3 + p, (x >> 16) & 0xffff);
361 set_imm(4 + p, x & 0xffff);
362 } else if (is_lis(*(p+1))) {
363 set_imm(0 + p, (x >> 48) & 0xffff);
364 set_imm(2 + p, (x >> 32) & 0xffff);
365 set_imm(1 + p, (x >> 16) & 0xffff);
366 set_imm(3 + p, x & 0xffff);
367 } else {
368 ShouldNotReachHere();
369 }
370 }
371
372 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
373 assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
374 int index = oop_recorder()->allocate_metadata_index(obj);
375 RelocationHolder rspec = metadata_Relocation::spec(index);
376 return AddressLiteral((address)obj, rspec);
377 }
378
379 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
380 assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
381 int index = oop_recorder()->find_index(obj);
382 RelocationHolder rspec = metadata_Relocation::spec(index);
383 return AddressLiteral((address)obj, rspec);
384 }
385
386 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
387 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
388 int oop_index = oop_recorder()->allocate_oop_index(obj);
389 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
390 }
391
392 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
393 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
394 int oop_index = oop_recorder()->find_index(obj);
395 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
396 }
397
398 #ifndef PRODUCT
399 void MacroAssembler::pd_print_patched_instruction(address branch) {
400 Unimplemented(); // TODO: PPC port
401 }
402 #endif // ndef PRODUCT
403
404 // Conditional far branch for destinations encodable in 24+2 bits.
405 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
406
407 // If requested by flag optimize, relocate the bc_far as a
408 // runtime_call and prepare for optimizing it when the code gets
409 // relocated.
410 if (optimize == bc_far_optimize_on_relocate) {
411 relocate(relocInfo::runtime_call_type);
412 }
413
414 // variant 2:
415 //
416 // b!cxx SKIP
417 // bxx DEST
418 // SKIP:
419 //
420
421 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
422 opposite_bcond(inv_boint_bcond(boint)));
423
424 // We emit two branches.
425 // First, a conditional branch which jumps around the far branch.
426 const address not_taken_pc = pc() + 2 * BytesPerInstWord;
427 const address bc_pc = pc();
428 bc(opposite_boint, biint, not_taken_pc);
429
430 const int bc_instr = *(int*)bc_pc;
431 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
432 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
433 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
434 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
435 "postcondition");
436 assert(biint == inv_bi_field(bc_instr), "postcondition");
437
438 // Second, an unconditional far branch which jumps to dest.
439 // Note: target(dest) remembers the current pc (see CodeSection::target)
440 // and returns the current pc if the label is not bound yet; when
441 // the label gets bound, the unconditional far branch will be patched.
442 const address target_pc = target(dest);
443 const address b_pc = pc();
444 b(target_pc);
445
446 assert(not_taken_pc == pc(), "postcondition");
447 assert(dest.is_bound() || target_pc == b_pc, "postcondition");
448 }
449
450 // 1 or 2 instructions
451 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
452 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
453 bc(boint, biint, dest);
454 } else {
455 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
456 }
457 }
458
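// A bc_far site always occupies two instruction words and occurs in one of three shapes
// (see bc_far() above and set_dest_of_bc_far_at() below):
//   variant 1: bcxx DEST; nop
//   variant 2: b!cxx SKIP; bxx DEST; SKIP:
//   variant 3: nop; nop   (far branch to the immediately following instruction)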
459 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
460 return is_bc_far_variant1_at(instruction_addr) ||
461 is_bc_far_variant2_at(instruction_addr) ||
462 is_bc_far_variant3_at(instruction_addr);
463 }
464
465 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
466 if (is_bc_far_variant1_at(instruction_addr)) {
467 const address instruction_1_addr = instruction_addr;
468 const int instruction_1 = *(int*)instruction_1_addr;
469 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
470 } else if (is_bc_far_variant2_at(instruction_addr)) {
471 const address instruction_2_addr = instruction_addr + 4;
472 return bxx_destination(instruction_2_addr);
473 } else if (is_bc_far_variant3_at(instruction_addr)) {
474 return instruction_addr + 8;
475 }
476 // variant 4 ???
477 ShouldNotReachHere();
478 return nullptr;
479 }
480 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
481
482 if (is_bc_far_variant3_at(instruction_addr)) {
483 // variant 3, far cond branch to the next instruction, already patched to nops:
484 //
485 // nop
486 // nop
487 // SKIP/DEST:
488 //
489 return;
490 }
491
492 // first, extract boint and biint from the current branch
493 int boint = 0;
494 int biint = 0;
495
496 ResourceMark rm;
497 const int code_size = 2 * BytesPerInstWord;
498 CodeBuffer buf(instruction_addr, code_size);
499 MacroAssembler masm(&buf);
500 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
501 // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
502 masm.nop();
503 masm.nop();
504 } else {
505 if (is_bc_far_variant1_at(instruction_addr)) {
506 // variant 1, the 1st instruction contains the destination address:
507 //
508 // bcxx DEST
509 // nop
510 //
511 const int instruction_1 = *(int*)(instruction_addr);
512 boint = inv_bo_field(instruction_1);
513 biint = inv_bi_field(instruction_1);
514 } else if (is_bc_far_variant2_at(instruction_addr)) {
515 // variant 2, the 2nd instruction contains the destination address:
516 //
517 // b!cxx SKIP
518 // bxx DEST
519 // SKIP:
520 //
521 const int instruction_1 = *(int*)(instruction_addr);
522 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
523 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
524 biint = inv_bi_field(instruction_1);
525 } else {
526 // variant 4???
527 ShouldNotReachHere();
528 }
529
530 // second, set the new branch destination and optimize the code
531 if (dest != instruction_addr + 4 && // the bc_far is still unbound!
532 masm.is_within_range_of_bcxx(dest, instruction_addr)) {
533 // variant 1:
534 //
535 // bcxx DEST
536 // nop
537 //
538 masm.bc(boint, biint, dest);
539 masm.nop();
540 } else {
541 // variant 2:
542 //
543 // b!cxx SKIP
544 // bxx DEST
545 // SKIP:
546 //
547 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
548 opposite_bcond(inv_boint_bcond(boint)));
549 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
550 masm.bc(opposite_boint, biint, not_taken_pc);
551 masm.b(dest);
552 }
553 }
554 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
555 }
556
557 // Emit a NOT mt-safe patchable 64 bit absolute call/jump.
558 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
559 // get current pc
560 uint64_t start_pc = (uint64_t) pc();
561
562 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
563 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first
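  // Note: both variants emitted below occupy 7 instruction words, i.e. bxx64_patchable_size
  // bytes, so a call/jump site can later be repatched in place between the two forms
  // (see set_dest_of_bxx64_patchable_at()).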
564
565 // relocate here
566 if (rt != relocInfo::none) {
567 relocate(rt);
568 }
569
570 if ( ReoptimizeCallSequences &&
571 (( link && is_within_range_of_b(dest, pc_of_bl)) ||
572 (!link && is_within_range_of_b(dest, pc_of_b)))) {
573 // variant 2:
574 // Emit an optimized, pc-relative call/jump.
575
576 if (link) {
577 // some padding
578 nop();
579 nop();
580 nop();
581 nop();
582 nop();
583 nop();
584
585 // do the call
586 assert(pc() == pc_of_bl, "just checking");
587 bl(dest, relocInfo::none);
588 } else {
589 // do the jump
590 assert(pc() == pc_of_b, "just checking");
591 b(dest, relocInfo::none);
592
593 // some padding
594 nop();
595 nop();
596 nop();
597 nop();
598 nop();
599 nop();
600 }
601
602 // Assert that we can identify the emitted call/jump.
603 assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
604 "can't identify emitted call");
605 } else {
606 // variant 1:
607 mr(R0, R11); // spill R11 -> R0.
608
609 // Load the destination address into CTR,
610 // calculate destination relative to global toc.
611 calculate_address_from_global_toc(R11, dest, true, true, false);
612
613 mtctr(R11);
614     mr(R11, R0); // restore R11 <- R0.
615 nop();
616
617 // do the call/jump
618 if (link) {
619 bctrl();
620     } else {
621 bctr();
622 }
623 // Assert that we can identify the emitted call/jump.
624 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
625 "can't identify emitted call");
626 }
627
628 // Assert that we can identify the emitted call/jump.
629 assert(is_bxx64_patchable_at((address)start_pc, link),
630 "can't identify emitted call");
631 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
632 "wrong encoding of dest address");
633 }
634
635 // Identify a bxx64_patchable instruction.
636 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
637 return is_bxx64_patchable_variant1b_at(instruction_addr, link)
638 //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
639 || is_bxx64_patchable_variant2_at(instruction_addr, link);
640 }
641
642 // Does the call64_patchable instruction use a pc-relative encoding of
643 // the call destination?
644 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
645 // variant 2 is pc-relative
646 return is_bxx64_patchable_variant2_at(instruction_addr, link);
647 }
648
649 // Identify variant 1.
650 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
651 unsigned int* instr = (unsigned int*) instruction_addr;
652 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
653 && is_mtctr(instr[5]) // mtctr
654 && is_load_const_at(instruction_addr);
655 }
656
657 // Identify variant 1b: load destination relative to global toc.
658 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
659 unsigned int* instr = (unsigned int*) instruction_addr;
660 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
661 && is_mtctr(instr[3]) // mtctr
662 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
663 }
664
665 // Identify variant 2.
666 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
667 unsigned int* instr = (unsigned int*) instruction_addr;
668 if (link) {
669 return is_bl (instr[6]) // bl dest is last
670 && is_nop(instr[0]) // nop
671 && is_nop(instr[1]) // nop
672 && is_nop(instr[2]) // nop
673 && is_nop(instr[3]) // nop
674 && is_nop(instr[4]) // nop
675 && is_nop(instr[5]); // nop
676 } else {
677 return is_b (instr[0]) // b dest is first
678 && is_nop(instr[1]) // nop
679 && is_nop(instr[2]) // nop
680 && is_nop(instr[3]) // nop
681 && is_nop(instr[4]) // nop
682 && is_nop(instr[5]) // nop
683 && is_nop(instr[6]); // nop
684 }
685 }
686
687 // Set dest address of a bxx64_patchable instruction.
688 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
689 ResourceMark rm;
690 int code_size = MacroAssembler::bxx64_patchable_size;
691 CodeBuffer buf(instruction_addr, code_size);
692 MacroAssembler masm(&buf);
693 masm.bxx64_patchable(dest, relocInfo::none, link);
694 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
695 }
696
697 // Get dest address of a bxx64_patchable instruction.
698 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
699 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
700 return (address) (unsigned long) get_const(instruction_addr);
701 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
702 unsigned int* instr = (unsigned int*) instruction_addr;
703 if (link) {
704 const int instr_idx = 6; // bl is last
705 int branchoffset = branch_destination(instr[instr_idx], 0);
706 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
707 } else {
708 const int instr_idx = 0; // b is first
709 int branchoffset = branch_destination(instr[instr_idx], 0);
710 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
711 }
712 // Load dest relative to global toc.
713 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
714 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
715 instruction_addr);
716 } else {
717 ShouldNotReachHere();
718 return nullptr;
719 }
720 }
721
722 #ifdef ASSERT
723 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
724 const int magic_number = 0x42;
725
726 // Preserve stack pointer register (R1_SP) and system thread id register (R13);
727 // although they're technically volatile
728 for (int i = 2; i < 13; i++) {
729 Register reg = as_Register(i);
730 if (reg == excluded_register) {
731 continue;
732 }
733
734 li(reg, magic_number);
735 }
736 }
737
738 void MacroAssembler::clobber_nonvolatile_registers() {
739 BLOCK_COMMENT("clobber nonvolatile registers {");
740 static const Register regs[] = {
741 R14,
742 R15,
743 // don't zap R16_thread
744 R17,
745 R18,
746 R19,
747 R20,
748 R21,
749 R22,
750 R23,
751 R24,
752 R25,
753 R26,
754 R27,
755 R28,
756 // don't zap R29_TOC
757 R30,
758 R31
759 };
760 Register bad = regs[0];
761 load_const_optimized(bad, 0xbad0101babe00000);
762 for (int i = (sizeof(regs) / sizeof(Register)) - 1; i >= 0; i--) {
763 addi(regs[i], bad, regs[i]->encoding());
764 }
765 BLOCK_COMMENT("} clobber nonvolatile registers");
766 }
767 #endif // ASSERT
768
769 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
770 const int magic_number = 0x43;
771
772 li(tmp, magic_number);
773 for (int m = 0; m <= 7; m++) {
774 std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
775 }
776 }
777
778 void MacroAssembler::save_nonvolatile_registers(Register dst, int offset, bool include_fp_regs, bool include_vector_regs) {
779 BLOCK_COMMENT("save_nonvolatile_registers {");
780
781 for (int i = 14; i < 32; i++) {
782 std(as_Register(i), offset, dst);
783 offset += 8;
784 }
785
786 if (include_fp_regs) {
787 for (int i = 14; i < 32; i++) {
788 stfd(as_FloatRegister(i), offset, dst);
789 offset += 8;
790 }
791 }
792
793 if (include_vector_regs) {
794 assert(is_aligned(offset, StackAlignmentInBytes), "should be");
795 if (PowerArchitecturePPC64 >= 10) {
796 for (int i = 20; i < 32; i += 2) {
797 stxvp(as_VectorRegister(i)->to_vsr(), offset, dst);
798 offset += 32;
799 }
800 } else {
801 for (int i = 20; i < 32; i++) {
802 stxv(as_VectorRegister(i)->to_vsr(), offset, dst);
803 offset += 16;
804 }
805 }
806 }
807
808   BLOCK_COMMENT("} save_nonvolatile_registers");
809 }
810
811 void MacroAssembler::restore_nonvolatile_registers(Register src, int offset, bool include_fp_regs, bool include_vector_regs) {
812 BLOCK_COMMENT("restore_nonvolatile_registers {");
813
814 for (int i = 14; i < 32; i++) {
815 ld(as_Register(i), offset, src);
816 offset += 8;
817 }
818
819 if (include_fp_regs) {
820 for (int i = 14; i < 32; i++) {
821 lfd(as_FloatRegister(i), offset, src);
822 offset += 8;
823 }
824 }
825
826 if (include_vector_regs) {
827 assert(is_aligned(offset, StackAlignmentInBytes), "should be");
828 if (PowerArchitecturePPC64 >= 10) {
829 for (int i = 20; i < 32; i += 2) {
830 lxvp(as_VectorRegister(i)->to_vsr(), offset, src);
831 offset += 32;
832 }
833 } else {
834 for (int i = 20; i < 32; i++) {
835 lxv(as_VectorRegister(i)->to_vsr(), offset, src);
836 offset += 16;
837 }
838 }
839 }
840
841 BLOCK_COMMENT("} restore_nonvolatile_registers");
842 }
843
844 // For verify_oops.
845 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
846 std(R2, offset, dst); offset += 8;
847 if (include_R3_RET_reg) {
848 std(R3, offset, dst); offset += 8;
849 }
850 std(R4, offset, dst); offset += 8;
851 std(R5, offset, dst); offset += 8;
852 std(R6, offset, dst); offset += 8;
853 std(R7, offset, dst); offset += 8;
854 std(R8, offset, dst); offset += 8;
855 std(R9, offset, dst); offset += 8;
856 std(R10, offset, dst); offset += 8;
857 std(R11, offset, dst); offset += 8;
858 std(R12, offset, dst); offset += 8;
859
860 if (include_fp_regs) {
861 stfd(F0, offset, dst); offset += 8;
862 stfd(F1, offset, dst); offset += 8;
863 stfd(F2, offset, dst); offset += 8;
864 stfd(F3, offset, dst); offset += 8;
865 stfd(F4, offset, dst); offset += 8;
866 stfd(F5, offset, dst); offset += 8;
867 stfd(F6, offset, dst); offset += 8;
868 stfd(F7, offset, dst); offset += 8;
869 stfd(F8, offset, dst); offset += 8;
870 stfd(F9, offset, dst); offset += 8;
871 stfd(F10, offset, dst); offset += 8;
872 stfd(F11, offset, dst); offset += 8;
873 stfd(F12, offset, dst); offset += 8;
874 stfd(F13, offset, dst);
875 }
876 }
877
878 // For verify_oops.
879 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
880 ld(R2, offset, src); offset += 8;
881 if (include_R3_RET_reg) {
882 ld(R3, offset, src); offset += 8;
883 }
884 ld(R4, offset, src); offset += 8;
885 ld(R5, offset, src); offset += 8;
886 ld(R6, offset, src); offset += 8;
887 ld(R7, offset, src); offset += 8;
888 ld(R8, offset, src); offset += 8;
889 ld(R9, offset, src); offset += 8;
890 ld(R10, offset, src); offset += 8;
891 ld(R11, offset, src); offset += 8;
892 ld(R12, offset, src); offset += 8;
893
894 if (include_fp_regs) {
895 lfd(F0, offset, src); offset += 8;
896 lfd(F1, offset, src); offset += 8;
897 lfd(F2, offset, src); offset += 8;
898 lfd(F3, offset, src); offset += 8;
899 lfd(F4, offset, src); offset += 8;
900 lfd(F5, offset, src); offset += 8;
901 lfd(F6, offset, src); offset += 8;
902 lfd(F7, offset, src); offset += 8;
903 lfd(F8, offset, src); offset += 8;
904 lfd(F9, offset, src); offset += 8;
905 lfd(F10, offset, src); offset += 8;
906 lfd(F11, offset, src); offset += 8;
907 lfd(F12, offset, src); offset += 8;
908 lfd(F13, offset, src);
909 }
910 }
911
912 void MacroAssembler::save_LR(Register tmp) {
913 mflr(tmp);
914 std(tmp, _abi0(lr), R1_SP);
915 }
916
917 void MacroAssembler::restore_LR(Register tmp) {
918 assert(tmp != R1_SP, "must be distinct");
919 ld(tmp, _abi0(lr), R1_SP);
920 mtlr(tmp);
921 }
922
923 void MacroAssembler::save_LR_CR(Register tmp) {
924 mfcr(tmp);
925 std(tmp, _abi0(cr), R1_SP);
926 save_LR(tmp);
927 // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
928 }
929
930 void MacroAssembler::restore_LR_CR(Register tmp) {
931 restore_LR(tmp);
932 ld(tmp, _abi0(cr), R1_SP);
933 mtcr(tmp);
934 }
935
936 address MacroAssembler::get_PC_trash_LR(Register result) {
937 Label L;
938 bl(L);
939 bind(L);
940 address lr_pc = pc();
941 mflr(result);
942 return lr_pc;
943 }
944
945 void MacroAssembler::resize_frame(Register offset, Register tmp) {
946 #ifdef ASSERT
947 assert_different_registers(offset, tmp, R1_SP);
948 andi_(tmp, offset, frame::alignment_in_bytes-1);
949 asm_assert_eq("resize_frame: unaligned");
950 #endif
951
952 // tmp <- *(SP)
953 ld(tmp, _abi0(callers_sp), R1_SP);
954 // addr <- SP + offset;
955 // *(addr) <- tmp;
956 // SP <- addr
957 stdux(tmp, R1_SP, offset);
958 }
959
960 void MacroAssembler::resize_frame(int offset, Register tmp) {
961 assert(is_simm(offset, 16), "too big an offset");
962 assert_different_registers(tmp, R1_SP);
963 assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
964 // tmp <- *(SP)
965 ld(tmp, _abi0(callers_sp), R1_SP);
966 // addr <- SP + offset;
967 // *(addr) <- tmp;
968 // SP <- addr
969 stdu(tmp, offset, R1_SP);
970 }
971
972 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
973 // (addr == tmp1) || (addr == tmp2) is allowed here!
974 assert(tmp1 != tmp2, "must be distinct");
975
976 // compute offset w.r.t. current stack pointer
977 // tmp_1 <- addr - SP (!)
978 subf(tmp1, R1_SP, addr);
979
980 // atomically update SP keeping back link.
981 resize_frame(tmp1/* offset */, tmp2/* tmp */);
982 }
983
984 void MacroAssembler::push_frame(Register bytes, Register tmp) {
985 #ifdef ASSERT
986 assert(bytes != R0, "r0 not allowed here");
987 andi_(R0, bytes, frame::alignment_in_bytes-1);
988 asm_assert_eq("push_frame(Reg, Reg): unaligned");
989 #endif
990 neg(tmp, bytes);
991 stdux(R1_SP, R1_SP, tmp);
992 }
993
994 // Push a frame of size `bytes'.
995 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
996 long offset = align_addr(bytes, frame::alignment_in_bytes);
997 if (is_simm(-offset, 16)) {
998 stdu(R1_SP, -offset, R1_SP);
999 } else {
1000 load_const_optimized(tmp, -offset);
1001 stdux(R1_SP, R1_SP, tmp);
1002 }
1003 }
1004
1005 // Push a frame of size `bytes' plus native_abi_reg_args on top.
1006 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
1007 push_frame(bytes + frame::native_abi_reg_args_size, tmp);
1008 }
1009
1010 // Pop current C frame.
1011 void MacroAssembler::pop_frame() {
1012 ld(R1_SP, _abi0(callers_sp), R1_SP);
1013 }
1014
1015 #if defined(ABI_ELFv2)
1016 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
1017 // TODO(asmundak): make sure the caller uses R12 as function descriptor
1018   // most of the time.
1019 if (R12 != r_function_entry) {
1020 mr(R12, r_function_entry);
1021 }
1022 mtctr(R12);
1023 // Do a call or a branch.
1024 if (and_link) {
1025 bctrl();
1026 } else {
1027 bctr();
1028 }
1029 _last_calls_return_pc = pc();
1030
1031 return _last_calls_return_pc;
1032 }
1033
1034 // Call a C function via a function descriptor and use full C
1035 // calling conventions. Updates and returns _last_calls_return_pc.
1036 address MacroAssembler::call_c(Register r_function_entry) {
1037 return branch_to(r_function_entry, /*and_link=*/true);
1038 }
1039
1040 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1041 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1042 return branch_to(r_function_entry, /*and_link=*/false);
1043 }
1044
1045 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1046 load_const(R12, function_entry, R0);
1047 return branch_to(R12, /*and_link=*/true);
1048 }
1049
1050 #else
1051 // Generic version of a call to C function via a function descriptor
1052 // with variable support for C calling conventions (TOC, ENV, etc.).
1053 // Updates and returns _last_calls_return_pc.
1054 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1055 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1056 // we emit standard ptrgl glue code here
1057 assert((function_descriptor != R0), "function_descriptor cannot be R0");
1058
1059 // retrieve necessary entries from the function descriptor
1060 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1061 mtctr(R0);
1062
1063 if (load_toc_of_callee) {
1064 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1065 }
1066 if (load_env_of_callee) {
1067 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1068 } else if (load_toc_of_callee) {
1069 li(R11, 0);
1070 }
1071
1072 // do a call or a branch
1073 if (and_link) {
1074 bctrl();
1075 } else {
1076 bctr();
1077 }
1078 _last_calls_return_pc = pc();
1079
1080 return _last_calls_return_pc;
1081 }
1082
1083 // Call a C function via a function descriptor and use full C calling
1084 // conventions.
1085 // We don't use the TOC in generated code, so there is no need to save
1086 // and restore its value.
1087 address MacroAssembler::call_c(Register fd) {
1088 return branch_to(fd, /*and_link=*/true,
1089 /*save toc=*/false,
1090 /*restore toc=*/false,
1091 /*load toc=*/true,
1092 /*load env=*/true);
1093 }
1094
1095 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1096 return branch_to(fd, /*and_link=*/false,
1097 /*save toc=*/false,
1098 /*restore toc=*/false,
1099 /*load toc=*/true,
1100 /*load env=*/true);
1101 }
1102
1103 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1104 if (rt != relocInfo::none) {
1105 // this call needs to be relocatable
1106 if (!ReoptimizeCallSequences
1107 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1108 || fd == nullptr // support code-size estimation
1109 || !fd->is_friend_function()
1110 || fd->entry() == nullptr) {
1111 // it's not a friend function as defined by class FunctionDescriptor,
1112 // so do a full call-c here.
1113 load_const(R11, (address)fd, R0);
1114
1115 bool has_env = (fd != nullptr && fd->env() != nullptr);
1116 return branch_to(R11, /*and_link=*/true,
1117 /*save toc=*/false,
1118 /*restore toc=*/false,
1119 /*load toc=*/true,
1120 /*load env=*/has_env);
1121 } else {
1122 // It's a friend function. Load the entry point and don't care about
1123 // toc and env. Use an optimizable call instruction, but ensure the
1124 // same code-size as in the case of a non-friend function.
1125 nop();
1126 nop();
1127 nop();
1128 bl64_patchable(fd->entry(), rt);
1129 _last_calls_return_pc = pc();
1130 return _last_calls_return_pc;
1131 }
1132 } else {
1133 // This call does not need to be relocatable, do more aggressive
1134 // optimizations.
1135 if (!ReoptimizeCallSequences
1136 || !fd->is_friend_function()) {
1137 // It's not a friend function as defined by class FunctionDescriptor,
1138 // so do a full call-c here.
1139 load_const(R11, (address)fd, R0);
1140 return branch_to(R11, /*and_link=*/true,
1141 /*save toc=*/false,
1142 /*restore toc=*/false,
1143 /*load toc=*/true,
1144 /*load env=*/true);
1145 } else {
1146 // it's a friend function, load the entry point and don't care about
1147 // toc and env.
1148 address dest = fd->entry();
1149 if (is_within_range_of_b(dest, pc())) {
1150 bl(dest);
1151 } else {
1152 bl64_patchable(dest, rt);
1153 }
1154 _last_calls_return_pc = pc();
1155 return _last_calls_return_pc;
1156 }
1157 }
1158 }
1159
1160 // Call a C function. All constants needed reside in TOC.
1161 //
1162 // Read the address to call from the TOC.
1163 // Read env from TOC, if fd specifies an env.
1164 // Read new TOC from TOC.
1165 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1166 relocInfo::relocType rt, Register toc) {
1167 if (!ReoptimizeCallSequences
1168 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1169 || !fd->is_friend_function()) {
1170 // It's not a friend function as defined by class FunctionDescriptor,
1171 // so do a full call-c here.
1172 assert(fd->entry() != nullptr, "function must be linked");
1173
1174 AddressLiteral fd_entry(fd->entry());
1175 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1176 mtctr(R11);
1177 if (fd->env() == nullptr) {
1178 li(R11, 0);
1179 nop();
1180 } else {
1181 AddressLiteral fd_env(fd->env());
1182 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1183 }
1184 AddressLiteral fd_toc(fd->toc());
1185 // Set R2_TOC (load from toc)
1186 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1187 bctrl();
1188 _last_calls_return_pc = pc();
1189 if (!success) { return nullptr; }
1190 } else {
1191 // It's a friend function, load the entry point and don't care about
1192 // toc and env. Use an optimizable call instruction, but ensure the
1193 // same code-size as in the case of a non-friend function.
1194 nop();
1195 bl64_patchable(fd->entry(), rt);
1196 _last_calls_return_pc = pc();
1197 }
1198 return _last_calls_return_pc;
1199 }
1200 #endif // ABI_ELFv2
1201
1202 void MacroAssembler::post_call_nop() {
1203 // Make inline again when loom is always enabled.
1204 if (!Continuations::enabled()) {
1205 return;
1206 }
1207 // We use CMPI/CMPLI instructions to encode post call nops.
1208 // Refer to NativePostCallNop for details.
1209 relocate(post_call_nop_Relocation::spec());
1210 InlineSkippedInstructionsCounter skipCounter(this);
1211 Assembler::emit_int32(Assembler::CMPLI_OPCODE | Assembler::opp_u_field(1, 9, 9));
1212   assert(is_post_call_nop(*(int*)(pc() - 4)), "post call nop not found");
1213 }
1214
1215 int MacroAssembler::ic_check_size() {
1216 bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1217 use_fast_receiver_null_check = implicit_null_checks_available || TrapBasedNullChecks,
1218 use_trap_based_null_check = !implicit_null_checks_available && TrapBasedNullChecks;
1219
1220 int num_ins;
1221 if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1222 num_ins = 3;
1223 if (use_trap_based_null_check) num_ins += 1;
1224 } else {
1225 num_ins = 7;
1226 if (!implicit_null_checks_available) num_ins += 2;
1227 }
1228
1229 if (UseCompactObjectHeaders) num_ins++;
1230
1231 return num_ins * BytesPerInstWord;
1232 }
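// Note: the instruction counts above must match what ic_check() below emits for the same
// flag settings; ic_check() uses ic_check_size() to compute its alignment padding.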
1233
1234 int MacroAssembler::ic_check(int end_alignment) {
1235 bool implicit_null_checks_available = ImplicitNullChecks && os::zero_page_read_protected(),
1236 use_fast_receiver_null_check = implicit_null_checks_available || TrapBasedNullChecks,
1237 use_trap_based_null_check = !implicit_null_checks_available && TrapBasedNullChecks;
1238
1239 Register receiver = R3_ARG1;
1240 Register data = R19_inline_cache_reg;
1241 Register tmp1 = R11_scratch1;
1242 Register tmp2 = R12_scratch2;
1243
1244 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
1245 // before the inline cache check, so we don't have to execute any nop instructions when dispatching
1246 // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
1247 // before the inline cache check here, and not after it.
1248 align(end_alignment, end_alignment, end_alignment - ic_check_size());
1249
1250 int uep_offset = offset();
1251
1252 if (use_fast_receiver_null_check && TrapBasedICMissChecks) {
1253 // Fast version which uses SIGTRAP
1254
1255 if (use_trap_based_null_check) {
1256 trap_null_check(receiver);
1257 }
1258 load_klass_no_decode(tmp1, receiver); // 2 instructions with UseCompactObjectHeaders
1259 ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1260 trap_ic_miss_check(tmp1, tmp2);
1261
1262 } else {
1263 // Slower version which doesn't use SIGTRAP
1264
1265 // Load stub address using toc (fixed instruction size, unlike load_const_optimized)
1266 calculate_address_from_global_toc(tmp1, SharedRuntime::get_ic_miss_stub(),
1267 true, true, false); // 2 instructions
1268 mtctr(tmp1);
1269
1270 if (!implicit_null_checks_available) {
1271 cmpdi(CR0, receiver, 0);
1272 beqctr(CR0);
1273 }
1274 load_klass_no_decode(tmp1, receiver); // 2 instructions with UseCompactObjectHeaders
1275 ld(tmp2, in_bytes(CompiledICData::speculated_klass_offset()), data);
1276 cmpd(CR0, tmp1, tmp2);
1277 bnectr(CR0);
1278 }
1279
1280 assert((offset() % end_alignment) == 0, "Misaligned verified entry point");
1281
1282 return uep_offset;
1283 }
1284
1285 void MacroAssembler::call_VM_base(Register oop_result,
1286 Register last_java_sp,
1287 address entry_point,
1288 bool check_exceptions,
1289 Label* last_java_pc) {
1290 BLOCK_COMMENT("call_VM {");
1291 // Determine last_java_sp register.
1292 if (!last_java_sp->is_valid()) {
1293 last_java_sp = R1_SP;
1294 }
1295 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1, last_java_pc);
1296
1297 // ARG1 must hold thread address.
1298 mr(R3_ARG1, R16_thread);
1299 address return_pc = call_c(entry_point, relocInfo::none);
1300
1301 reset_last_Java_frame();
1302
1303 // Check for pending exceptions.
1304 if (check_exceptions) {
1305 // We don't check for exceptions here.
1306 ShouldNotReachHere();
1307 }
1308
1309 // Get oop result if there is one and reset the value in the thread.
1310 if (oop_result->is_valid()) {
1311 get_vm_result_oop(oop_result);
1312 }
1313
1314 _last_calls_return_pc = return_pc;
1315 BLOCK_COMMENT("} call_VM");
1316 }
1317
1318 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1319 BLOCK_COMMENT("call_VM_leaf {");
1320 call_c(entry_point);
1321 BLOCK_COMMENT("} call_VM_leaf");
1322 }
1323
1324 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions, Label* last_java_pc) {
1325 call_VM_base(oop_result, noreg, entry_point, check_exceptions, last_java_pc);
1326 }
1327
1328 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1329 bool check_exceptions) {
1330 // R3_ARG1 is reserved for the thread.
1331 mr_if_needed(R4_ARG2, arg_1);
1332 call_VM(oop_result, entry_point, check_exceptions);
1333 }
1334
1335 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1336 bool check_exceptions) {
1337 // R3_ARG1 is reserved for the thread
1338 assert_different_registers(arg_2, R4_ARG2);
1339 mr_if_needed(R4_ARG2, arg_1);
1340 mr_if_needed(R5_ARG3, arg_2);
1341 call_VM(oop_result, entry_point, check_exceptions);
1342 }
1343
1344 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1345 bool check_exceptions) {
1346 // R3_ARG1 is reserved for the thread
1347 assert_different_registers(arg_2, R4_ARG2);
1348 assert_different_registers(arg_3, R4_ARG2, R5_ARG3);
1349 mr_if_needed(R4_ARG2, arg_1);
1350 mr_if_needed(R5_ARG3, arg_2);
1351 mr_if_needed(R6_ARG4, arg_3);
1352 call_VM(oop_result, entry_point, check_exceptions);
1353 }
1354
1355 void MacroAssembler::call_VM_leaf(address entry_point) {
1356 call_VM_leaf_base(entry_point);
1357 }
1358
1359 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1360 mr_if_needed(R3_ARG1, arg_1);
1361 call_VM_leaf(entry_point);
1362 }
1363
1364 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1365 assert_different_registers(arg_2, R3_ARG1);
1366 mr_if_needed(R3_ARG1, arg_1);
1367 mr_if_needed(R4_ARG2, arg_2);
1368 call_VM_leaf(entry_point);
1369 }
1370
1371 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1372 assert_different_registers(arg_2, R3_ARG1);
1373 assert_different_registers(arg_3, R3_ARG1, R4_ARG2);
1374 mr_if_needed(R3_ARG1, arg_1);
1375 mr_if_needed(R4_ARG2, arg_2);
1376 mr_if_needed(R5_ARG3, arg_3);
1377 call_VM_leaf(entry_point);
1378 }
1379
1380 // Check whether instruction is a read access to the polling page
1381 // which was emitted by load_from_polling_page(..).
1382 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1383 address* polling_address_ptr) {
1384 if (!is_ld(instruction))
1385 return false; // It's not a ld. Fail.
1386
1387 int rt = inv_rt_field(instruction);
1388 int ra = inv_ra_field(instruction);
1389 int ds = inv_ds_field(instruction);
1390 if (!(ds == 0 && ra != 0 && rt == 0)) {
1391 return false; // It's not a ld(r0, X, ra). Fail.
1392 }
1393
1394 if (!ucontext) {
1395 // Set polling address.
1396 if (polling_address_ptr != nullptr) {
1397 *polling_address_ptr = nullptr;
1398 }
1399 return true; // No ucontext given. Can't check value of ra. Assume true.
1400 }
1401
1402 #ifdef LINUX
1403 // Ucontext given. Check that register ra contains the address of
1404 // the safepoint polling page.
1405 ucontext_t* uc = (ucontext_t*) ucontext;
1406 // Set polling address.
1407 address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1408 if (polling_address_ptr != nullptr) {
1409 *polling_address_ptr = addr;
1410 }
1411 return SafepointMechanism::is_poll_address(addr);
1412 #else
1413 // Not on Linux, ucontext must be null.
1414 ShouldNotReachHere();
1415 return false;
1416 #endif
1417 }
1418
1419 void MacroAssembler::bang_stack_with_offset(int offset) {
1420 // When increasing the stack, the old stack pointer will be written
1421 // to the new top of stack according to the PPC64 ABI.
1422 // Therefore, stack banging is not necessary when increasing
1423 // the stack by <= os::vm_page_size() bytes.
1424 // When increasing the stack by a larger amount, this method is
1425 // called repeatedly to bang the intermediate pages.
1426
1427 // Stack grows down, caller passes positive offset.
1428 assert(offset > 0, "must bang with positive offset");
1429
1430 long stdoffset = -offset;
1431
1432 if (is_simm(stdoffset, 16)) {
1433 // Signed 16 bit offset, a simple std is ok.
1434 if (UseLoadInstructionsForStackBangingPPC64) {
1435 ld(R0, (int)(signed short)stdoffset, R1_SP);
1436 } else {
1437 std(R0,(int)(signed short)stdoffset, R1_SP);
1438 }
1439 } else if (is_simm(stdoffset, 31)) {
1440 const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1441 const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
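    // Illustrative: offset = 0x14000  ->  stdoffset = -0x14000, which splits into
    // hi = -1 (addis immediate) and lo = -0x4000 (load/store displacement),
    // since (-1 << 16) + (-0x4000) == -0x14000.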
1442
1443 Register tmp = R11;
1444 addis(tmp, R1_SP, hi);
1445 if (UseLoadInstructionsForStackBangingPPC64) {
1446 ld(R0, lo, tmp);
1447 } else {
1448 std(R0, lo, tmp);
1449 }
1450 } else {
1451 ShouldNotReachHere();
1452 }
1453 }
1454
1455 // If instruction is a stack bang of the form
1456 // std R0, x(Ry), (see bang_stack_with_offset())
1457 // stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame())
1458 // or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame())
1459 // return the banged address. Otherwise, return nullptr.
1460 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1461 #ifdef LINUX
1462 ucontext_t* uc = (ucontext_t*) ucontext;
1463 int rs = inv_rs_field(instruction);
1464 int ra = inv_ra_field(instruction);
1465 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64)
1466 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1467 || (is_stdu(instruction) && rs == 1)) {
1468 int ds = inv_ds_field(instruction);
1469 // return banged address
1470 return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1471 } else if (is_stdux(instruction) && rs == 1) {
1472 int rb = inv_rb_field(instruction);
1473 address sp = (address)uc->uc_mcontext.regs->gpr[1];
1474 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1475 return ra != 1 || rb_val >= 0 ? nullptr // not a stack bang
1476 : sp + rb_val; // banged address
1477 }
1478 return nullptr; // not a stack bang
1479 #else
1480 // workaround not needed on !LINUX :-)
1481 ShouldNotCallThis();
1482 return nullptr;
1483 #endif
1484 }
1485
1486 void MacroAssembler::reserved_stack_check(Register return_pc) {
1487 // Test if reserved zone needs to be enabled.
1488 Label no_reserved_zone_enabling;
1489
1490 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1491 cmpld(CR0, R1_SP, R0);
1492 blt_predict_taken(CR0, no_reserved_zone_enabling);
1493
1494 // Enable reserved zone again, throw stack overflow exception.
1495 push_frame_reg_args(0, R0);
1496 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1497 pop_frame();
1498 mtlr(return_pc);
1499 load_const_optimized(R0, SharedRuntime::throw_delayed_StackOverflowError_entry());
1500 mtctr(R0);
1501 bctr();
1502
1503 should_not_reach_here();
1504
1505 bind(no_reserved_zone_enabling);
1506 }
1507
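// 64-bit atomic exchange and fetch-and-add, implemented with the ldarx/stdcx. load-reserve /
// store-conditional retry loop below; stdcx_ sets CR0 and the loop retries until the
// store-conditional succeeds.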
1508 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1509 bool cmpxchgx_hint) {
1510 Label retry;
1511 bind(retry);
1512 ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1513 stdcx_(exchange_value, addr_base);
1514 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1515 bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1516 } else {
1517 bne( CR0, retry); // StXcx_ sets CR0.
1518 }
1519 }
1520
1521 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1522 Register tmp, bool cmpxchgx_hint) {
1523 Label retry;
1524 bind(retry);
1525 ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1526 add(tmp, dest_current_value, inc_value);
1527 stdcx_(tmp, addr_base);
1528 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1529 bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1530 } else {
1531 bne( CR0, retry); // StXcx_ sets CR0.
1532 }
1533 }
1534
1535 // Word/sub-word atomic helper functions
1536
1537 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1538 // Only signed types are supported with size < 4.
1539 // Atomic add always kills tmp1.
1540 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1541 Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1542 bool cmpxchgx_hint, bool is_add, int size) {
1543 // Sub-word instructions are available since Power 8.
1544
1545 Label retry;
1546 Register shift_amount = noreg,
1547 val32 = dest_current_value,
1548 modval = is_add ? tmp1 : exchange_value;
1549
1550
1551 // atomic emulation loop
1552 bind(retry);
1553
1554 switch (size) {
1555 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1556 case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1557 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1558 default: ShouldNotReachHere();
1559 }
1560
1561 if (is_add) { add(modval, dest_current_value, exchange_value); }
1562
1563
1564 switch (size) {
1565 case 4: stwcx_(modval, addr_base); break;
1566 case 2: sthcx_(modval, addr_base); break;
1567 case 1: stbcx_(modval, addr_base); break;
1568 default: ShouldNotReachHere();
1569 }
1570
1571 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1572 bne_predict_not_taken(CR0, retry); // StXcx_ sets CR0.
1573 } else {
1574 bne( CR0, retry); // StXcx_ sets CR0.
1575 }
1576
1577 // l?arx zero-extends, but Java wants byte/short values sign-extended.
1578 if (size == 1) {
1579 extsb(dest_current_value, dest_current_value);
1580 } else if (size == 2) {
1581 extsh(dest_current_value, dest_current_value);
1582   }
1583 }
1584
1585 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1586 // Only signed types are supported with size < 4.
1587 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1588 RegisterOrConstant compare_value, Register exchange_value,
1589 Register addr_base, Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1590 // Sub-word instructions are available since Power 8.
1591 Register shift_amount = noreg,
1592 val32 = dest_current_value,
1593 modval = exchange_value;
1594
1595 // atomic emulation loop
1596 bind(retry);
1597
1598 switch (size) {
1599 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1600 case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1601 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1602 default: ShouldNotReachHere();
1603 }
1604
1605 if (size == 1) {
1606 extsb(dest_current_value, dest_current_value);
1607 } else if (size == 2) {
1608 extsh(dest_current_value, dest_current_value);
  }
1610
1611 cmpw(flag, dest_current_value, compare_value);
1612 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1613 bne_predict_not_taken(flag, failed);
1614 } else {
1615 bne( flag, failed);
1616 }
  // branch to failed => (flag == ne), (dest_current_value != compare_value)
1618 // fall through => (flag == eq), (dest_current_value == compare_value)
1619
1620 switch (size) {
1621 case 4: stwcx_(modval, addr_base); break;
1622 case 2: sthcx_(modval, addr_base); break;
1623 case 1: stbcx_(modval, addr_base); break;
1624 default: ShouldNotReachHere();
1625 }
1626 }
1627
1628 // CmpxchgX sets condition register to cmpX(current, compare).
1629 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1630 RegisterOrConstant compare_value, Register exchange_value,
1631 Register addr_base, int semantics, bool cmpxchgx_hint, Register int_flag_success,
1632 Label* failed_ext, bool contention_hint, bool weak, int size) {
1633 Label retry;
1634 Label failed_int;
1635 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1636 Label done;
1637
1638 // Save one branch if result is returned via register and
1639 // result register is different from the other ones.
1640 bool use_result_reg = (int_flag_success != noreg);
1641 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
1642 int_flag_success != exchange_value && int_flag_success != addr_base);
1643 assert(!weak || flag == CR0, "weak only supported with CR0");
1644 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1645 assert(size == 1 || size == 2 || size == 4, "unsupported");
1646
1647 if (use_result_reg && preset_result_reg) {
1648 li(int_flag_success, 0); // preset (assume cas failed)
1649 }
1650
1651 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1652 if (contention_hint) { // Don't try to reserve if cmp fails.
1653 switch (size) {
1654 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1655 case 2: lha(dest_current_value, 0, addr_base); break;
1656 case 4: lwz(dest_current_value, 0, addr_base); break;
1657 default: ShouldNotReachHere();
1658 }
1659 cmpw(flag, dest_current_value, compare_value);
1660 bne(flag, failed);
1661 }
1662
1663 // release/fence semantics
1664 if (semantics & MemBarRel) {
1665 release();
1666 }
1667
1668 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base,
1669 retry, failed, cmpxchgx_hint, size);
1670 if (!weak || use_result_reg || failed_ext) {
1671 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1672 bne_predict_not_taken(CR0, weak ? failed : retry); // StXcx_ sets CR0.
1673 } else {
1674 bne( CR0, weak ? failed : retry); // StXcx_ sets CR0.
1675 }
1676 }
1677 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped)
1678
1679 // Result in register (must do this at the end because int_flag_success can be the
1680 // same register as one above).
1681 if (use_result_reg) {
1682 li(int_flag_success, 1);
1683 }
1684
1685 if (semantics & MemBarFenceAfter) {
1686 fence();
1687 } else if (semantics & MemBarAcq) {
1688 isync();
1689 }
1690
1691 if (use_result_reg && !preset_result_reg) {
1692 b(done);
1693 }
1694
1695 bind(failed_int);
1696 if (use_result_reg && !preset_result_reg) {
1697 li(int_flag_success, 0);
1698 }
1699
1700 bind(done);
1701 // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1702 // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1703 }
1704
1705 // Performs atomic compare exchange:
1706 // if (compare_value == *addr_base)
1707 // *addr_base = exchange_value
1708 // int_flag_success = 1;
1709 // else
1710 // int_flag_success = 0;
1711 //
1712 // ConditionRegister flag = cmp(compare_value, *addr_base)
1713 // Register dest_current_value = *addr_base
1714 // Register compare_value Used to compare with value in memory
1715 // Register exchange_value Written to memory if compare_value == *addr_base
1716 // Register addr_base The memory location to compareXChange
1717 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base
1718 //
// To avoid the costly compare-exchange, the value can be tested beforehand (contention_hint).
// Several special cases exist to avoid generating unnecessary code.
1721 //
1722 void MacroAssembler::cmpxchgd(ConditionRegister flag, Register dest_current_value,
1723 RegisterOrConstant compare_value, Register exchange_value,
1724 Register addr_base,
1725 int semantics, bool cmpxchgx_hint, Register int_flag_success,
1726 Label* failed_ext, bool contention_hint, bool weak) {
1727 Label retry;
1728 Label failed_int;
1729 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1730 Label done;
1731
1732 // Save one branch if result is returned via register and result register is different from the other ones.
  bool use_result_reg    = (int_flag_success != noreg);
  bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value.register_or_noreg() &&
                            int_flag_success != exchange_value && int_flag_success != addr_base);
1736 assert(!weak || flag == CR0, "weak only supported with CR0");
1737 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1738
1739 if (use_result_reg && preset_result_reg) {
1740 li(int_flag_success, 0); // preset (assume cas failed)
1741 }
1742
1743 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1744 if (contention_hint) { // Don't try to reserve if cmp fails.
1745 ld(dest_current_value, 0, addr_base);
1746 cmpd(flag, dest_current_value, compare_value);
1747 bne(flag, failed);
1748 }
1749
1750 // release/fence semantics
1751 if (semantics & MemBarRel) {
1752 release();
1753 }
1754
1755 // atomic emulation loop
1756 bind(retry);
1757
1758 ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1759 cmpd(flag, dest_current_value, compare_value);
1760 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1761 bne_predict_not_taken(flag, failed);
1762 } else {
1763 bne( flag, failed);
1764 }
1765
1766 stdcx_(exchange_value, addr_base);
1767 if (!weak || use_result_reg || failed_ext) {
1768 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1769 bne_predict_not_taken(CR0, weak ? failed : retry); // stXcx_ sets CR0
1770 } else {
1771 bne( CR0, weak ? failed : retry); // stXcx_ sets CR0
1772 }
1773 }
1774
1775 // result in register (must do this at the end because int_flag_success can be the same register as one above)
1776 if (use_result_reg) {
1777 li(int_flag_success, 1);
1778 }
1779
1780 if (semantics & MemBarFenceAfter) {
1781 fence();
1782 } else if (semantics & MemBarAcq) {
1783 isync();
1784 }
1785
1786 if (use_result_reg && !preset_result_reg) {
1787 b(done);
1788 }
1789
1790 bind(failed_int);
1791 if (use_result_reg && !preset_result_reg) {
1792 li(int_flag_success, 0);
1793 }
1794
1795 bind(done);
1796 // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1797 // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1798 }
1799
1800 // Look up the method for a megamorphic invokeinterface call.
1801 // The target method is determined by <intf_klass, itable_index>.
1802 // The receiver klass is in recv_klass.
1803 // On success, the result will be in method_result, and execution falls through.
1804 // On failure, execution transfers to the given label.
1805 void MacroAssembler::lookup_interface_method(Register recv_klass,
1806 Register intf_klass,
1807 RegisterOrConstant itable_index,
1808 Register method_result,
1809 Register scan_temp,
1810 Register temp2,
1811 Label& L_no_such_interface,
1812 bool return_method) {
1813 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1814
1815 // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1816 int vtable_base = in_bytes(Klass::vtable_start_offset());
1817 int itentry_off = in_bytes(itableMethodEntry::method_offset());
1818 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize);
1819 int scan_step = itableOffsetEntry::size() * wordSize;
1820 int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1821
1822 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1823 // We should store the aligned, prescaled offset in the klass.
1824 // Then the next several instructions would fold away.
1825
1826 sldi(scan_temp, scan_temp, log_vte_size);
1827 addi(scan_temp, scan_temp, vtable_base);
1828 add(scan_temp, recv_klass, scan_temp);
1829
1830 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1831 if (return_method) {
1832 if (itable_index.is_register()) {
1833 Register itable_offset = itable_index.as_register();
1834 sldi(method_result, itable_offset, logMEsize);
1835 if (itentry_off) { addi(method_result, method_result, itentry_off); }
1836 add(method_result, method_result, recv_klass);
1837 } else {
1838 long itable_offset = (long)itable_index.as_constant();
1839 // static address, no relocation
1840 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1841 }
1842 }
1843
1844 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1845 // if (scan->interface() == intf) {
1846 // result = (klass + scan->offset() + itable_index);
1847 // }
1848 // }
1849 Label search, found_method;
1850
1851 for (int peel = 1; peel >= 0; peel--) {
1852 // %%%% Could load both offset and interface in one ldx, if they were
1853 // in the opposite order. This would save a load.
1854 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1855
1856 // Check that this entry is non-null. A null entry means that
1857 // the receiver class doesn't implement the interface, and wasn't the
1858 // same as when the caller was compiled.
1859 cmpd(CR0, temp2, intf_klass);
1860
1861 if (peel) {
1862 beq(CR0, found_method);
1863 } else {
1864 bne(CR0, search);
1865 // (invert the test to fall through to found_method...)
1866 }
1867
1868 if (!peel) break;
1869
1870 bind(search);
1871
1872 cmpdi(CR0, temp2, 0);
1873 beq(CR0, L_no_such_interface);
1874 addi(scan_temp, scan_temp, scan_step);
1875 }
1876
1877 bind(found_method);
1878
1879 // Got a hit.
1880 if (return_method) {
1881 int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1882 lwz(scan_temp, ito_offset, scan_temp);
1883 ldx(method_result, scan_temp, method_result);
1884 }
1885 }
1886
1887 // virtual method calling
1888 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1889 RegisterOrConstant vtable_index,
1890 Register method_result) {
1891
1892 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1893
1894 const ByteSize base = Klass::vtable_start_offset();
1895 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1896
1897 if (vtable_index.is_register()) {
1898 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1899 add(recv_klass, vtable_index.as_register(), recv_klass);
1900 } else {
1901 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1902 }
1903 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1904 }
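// For reference, the load above is roughly (sketch only):
//   method = *(Method**)((address)recv_klass + vtable_start_offset + vtable_index * wordSize + method_offset)
// i.e. the vtable is embedded in the Klass and each entry occupies one word.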
1905
1906 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1907 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1908 Register super_klass,
1909 Register temp1_reg,
1910 Register temp2_reg,
1911 Label* L_success,
1912 Label* L_failure,
1913 Label* L_slow_path,
1914 RegisterOrConstant super_check_offset) {
1915
1916 const Register check_cache_offset = temp1_reg;
1917 const Register cached_super = temp2_reg;
1918
1919 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1920
1921 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1922 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1923
1924 bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1925 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sco_offset);
1926
1927 Label L_fallthrough;
1928 int label_nulls = 0;
1929 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
1930 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
1931 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
1932 assert(label_nulls <= 1 ||
1933 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1934 "at most one null in the batch, usually");
1935
1936 // If the pointers are equal, we are done (e.g., String[] elements).
1937 // This self-check enables sharing of secondary supertype arrays among
1938 // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized SSA (secondary supertype array).
1940 // We move this check to the front of the fast path because many
1941 // type checks are in fact trivially successful in this manner,
1942 // so we get a nicely predicted branch right at the start of the check.
1943 cmpd(CR0, sub_klass, super_klass);
1944 beq(CR0, *L_success);
1945
1946 // Check the supertype display:
1947 if (must_load_sco) {
1948 // The super check offset is always positive...
1949 lwz(check_cache_offset, sco_offset, super_klass);
1950 super_check_offset = RegisterOrConstant(check_cache_offset);
1951 // super_check_offset is register.
1952 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1953 }
1954 // The loaded value is the offset from Klass.
1955
1956 ld(cached_super, super_check_offset, sub_klass);
1957 cmpd(CR0, cached_super, super_klass);
1958
1959 // This check has worked decisively for primary supers.
1960 // Secondary supers are sought in the super_cache ('super_cache_addr').
1961 // (Secondary supers are interfaces and very deeply nested subtypes.)
1962 // This works in the same check above because of a tricky aliasing
1963 // between the super_cache and the primary super display elements.
1964 // (The 'super_check_addr' can address either, as the case requires.)
1965 // Note that the cache is updated below if it does not help us find
1966 // what we need immediately.
1967 // So if it was a primary super, we can just fail immediately.
1968 // Otherwise, it's the slow path for us (no success at this point).
1969
1970 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1971
1972 if (super_check_offset.is_register()) {
1973 beq(CR0, *L_success);
1974 cmpwi(CR0, super_check_offset.as_register(), sc_offset);
1975 if (L_failure == &L_fallthrough) {
1976 beq(CR0, *L_slow_path);
1977 } else {
1978 bne(CR0, *L_failure);
1979 FINAL_JUMP(*L_slow_path);
1980 }
1981 } else {
1982 if (super_check_offset.as_constant() == sc_offset) {
1983 // Need a slow path; fast failure is impossible.
1984 if (L_slow_path == &L_fallthrough) {
1985 beq(CR0, *L_success);
1986 } else {
1987 bne(CR0, *L_slow_path);
1988 FINAL_JUMP(*L_success);
1989 }
1990 } else {
1991 // No slow path; it's a fast decision.
1992 if (L_failure == &L_fallthrough) {
1993 beq(CR0, *L_success);
1994 } else {
1995 bne(CR0, *L_failure);
1996 FINAL_JUMP(*L_success);
1997 }
1998 }
1999 }
2000
2001 bind(L_fallthrough);
2002 #undef FINAL_JUMP
2003 }
2004
2005 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
2006 Register super_klass,
2007 Register temp1_reg,
2008 Register temp2_reg,
2009 Label* L_success,
2010 Register result_reg) {
2011 const Register array_ptr = temp1_reg; // current value from cache array
2012 const Register temp = temp2_reg;
2013
2014 assert_different_registers(sub_klass, super_klass, array_ptr, temp);
2015 assert(L_success == nullptr || result_reg == noreg, "can't have both");
2016
2017 int source_offset = in_bytes(Klass::secondary_supers_offset());
2018 int target_offset = in_bytes(Klass::secondary_super_cache_offset());
2019
2020 int length_offset = Array<Klass*>::length_offset_in_bytes();
2021 int base_offset = Array<Klass*>::base_offset_in_bytes();
2022
2023 Label hit, loop, failure, fallthru;
2024
2025 ld(array_ptr, source_offset, sub_klass);
2026
2027 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2028 lwz(temp, length_offset, array_ptr);
2029 cmpwi(CR0, temp, 0);
2030 beq(CR0, (L_success == nullptr) ? failure : fallthru); // indicate failure if length 0
2031
2032 mtctr(temp); // load ctr
2033
2034 bind(loop);
  // Oops in the table are no longer compressed.
2036 ld(temp, base_offset, array_ptr);
2037 cmpd(CR0, temp, super_klass);
2038 beq(CR0, hit);
2039 addi(array_ptr, array_ptr, BytesPerWord);
2040 bdnz(loop);
2041
2042 bind(failure);
2043 if (result_reg != noreg) {
2044 li(result_reg, 1); // load non-zero result (indicates a miss)
2045 } else if (L_success == nullptr) {
2046 crandc(CR0, Assembler::equal, CR0, Assembler::equal); // miss indicated by CR0.ne
2047 }
2048 b(fallthru);
2049
2050 bind(hit);
2051 std(super_klass, target_offset, sub_klass); // save result to cache
2052 if (result_reg != noreg) {
2053 li(result_reg, 0); // load zero result (indicates a hit)
2054 } else if (L_success != nullptr) {
2055 b(*L_success);
2056 }
2057
2058 bind(fallthru);
2059 }
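// C-like sketch of the linear scan generated above (pseudo-code based on the offsets
// used; not generated code):
//   Array<Klass*>* ss = sub_klass->secondary_supers();
//   for (int i = 0; i < ss->length(); i++) {
//     if (ss->at(i) == super_klass) { sub_klass->set_secondary_super_cache(super_klass); /* hit */ }
//   }
//   /* otherwise: miss */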
2060
2061 Register MacroAssembler::allocate_if_noreg(Register r,
2062 RegSetIterator<Register> &available_regs,
                                            RegSet &regs_to_push) {
2064 if (!r->is_valid()) {
2065 r = *available_regs++;
2066 regs_to_push += r;
2067 }
2068 return r;
2069 }
2070
2071 void MacroAssembler::push_set(RegSet set)
2072 {
2073 int spill_offset = 0;
2074 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
2075 spill_offset += wordSize;
2076 std(*it, -spill_offset, R1_SP);
2077 }
2078 }
2079
2080 void MacroAssembler::pop_set(RegSet set)
2081 {
2082 int spill_offset = 0;
2083 for (RegSetIterator<Register> it = set.begin(); *it != noreg; ++it) {
2084 spill_offset += wordSize;
2085 ld(*it, -spill_offset, R1_SP);
2086 }
2087 }
2088
2089 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
2090 Register super_klass,
2091 Register temp1_reg,
2092 Register temp2_reg,
2093 Label* L_success,
2094 Register result_reg) {
2095 RegSet temps = RegSet::of(temp1_reg, temp2_reg);
2096
2097 assert_different_registers(sub_klass, super_klass, temp1_reg, temp2_reg, result_reg, R0);
2098
2099 Register temp3_reg = noreg, temp4_reg = noreg;
2100 bool result_reg_provided = (result_reg != noreg); // otherwise, result will be in CR0
2101
2102 BLOCK_COMMENT("check_klass_subtype_slow_path_table");
2103
2104 RegSetIterator<Register> available_regs
2105 = (RegSet::range(R2, R12) - temps - sub_klass - super_klass).begin();
2106
2107 RegSet pushed_regs;
2108
2109 temp1_reg = allocate_if_noreg(temp1_reg, available_regs, pushed_regs);
2110 temp2_reg = allocate_if_noreg(temp2_reg, available_regs, pushed_regs);
2111 temp3_reg = allocate_if_noreg(temp3_reg, available_regs, pushed_regs);
2112 temp4_reg = allocate_if_noreg(temp4_reg, available_regs, pushed_regs);
2113 result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
2114
2115 push_set(pushed_regs);
2116
2117 lookup_secondary_supers_table_var(sub_klass, super_klass,
2118 temp1_reg, temp2_reg, temp3_reg, temp4_reg,
2119 result_reg);
2120
2121 if (L_success != nullptr || !result_reg_provided) {
2122 // result_reg may get overwritten by pop_set
2123 cmpdi(CR0, result_reg, 0);
2124 }
2125
2126 // Unspill the temp. registers:
2127 pop_set(pushed_regs);
2128
2129 if (L_success != nullptr) {
2130 beq(CR0, *L_success);
2131 }
2132 }
2133
2134 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
2135 Register super_klass,
2136 Register temp1_reg,
2137 Register temp2_reg,
2138 Label* L_success,
2139 Register result_reg) {
2140 if (UseSecondarySupersTable) {
2141 check_klass_subtype_slow_path_table(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, result_reg);
2142 } else {
2143 if (temp2_reg == noreg) temp2_reg = R0;
2144 check_klass_subtype_slow_path_linear(sub_klass, super_klass, temp1_reg, temp2_reg, L_success, result_reg);
2145 }
2146 }
2147
2148 // Try fast path, then go to slow one if not successful
2149 void MacroAssembler::check_klass_subtype(Register sub_klass,
2150 Register super_klass,
2151 Register temp1_reg,
2152 Register temp2_reg,
2153 Label& L_success) {
2154 Label L_failure;
2155 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2156 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2157 bind(L_failure); // Fallthru if not successful.
2158 }
2159
// Scans 'count' pointer-sized words at [addr] for an occurrence of 'value',
// generic version (count must be > 0).
// Iff found: CR0 is eq and scratch == 0.
2163 void MacroAssembler::repne_scan(Register addr, Register value, Register count, Register scratch) {
2164 Label Lloop, Lafter_loop, Lexit;
2165
2166 srdi_(scratch, count, 1);
2167 beq(CR0, Lafter_loop);
2168 mtctr(scratch);
2169
2170 bind(Lloop); // 2x unrolled
2171 ld(scratch, 0, addr);
2172 xor_(scratch, scratch, value);
2173 beq(CR0, Lexit);
2174 ld(scratch, 8, addr);
2175 xor_(scratch, scratch, value);
2176 beq(CR0, Lexit);
2177 addi(addr, addr, 2 * wordSize);
2178 bdnz(Lloop);
2179
2180 bind(Lafter_loop);
2181 andi_(scratch, count, 1);
2182 beq(CR0, Lexit); // if taken: CR0 eq and scratch == 0
2183 ld(scratch, 0, addr);
2184 xor_(scratch, scratch, value);
2185
2186 bind(Lexit);
2187 }
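// Usage sketch: repne_scan(addr, value, count, scratch) behaves like
//   for (i = 0; i < count; i++) { if (((intptr_t*)addr)[i] == value) break; }
// with the loop 2x unrolled via CTR; on a hit CR0 is eq and scratch == 0.
// Note that addr is advanced by the loop, so callers must treat it as clobbered.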
2188
2189 // Ensure that the inline code and the stub are using the same registers.
2190 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS \
2191 do { \
2192 assert(r_super_klass == R4_ARG2 && \
2193 r_array_base == R3_ARG1 && \
2194 r_array_length == R7_ARG5 && \
2195 (r_array_index == R6_ARG4 || r_array_index == noreg) && \
2196 (r_sub_klass == R5_ARG3 || r_sub_klass == noreg) && \
2197 (r_bitmap == R11_scratch1 || r_bitmap == noreg) && \
2198 (result == R8_ARG6 || result == noreg), "registers must match ppc64.ad"); \
2199 } while(0)
2200
2201 void MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
2202 Register r_super_klass,
2203 Register temp1,
2204 Register temp2,
2205 Register temp3,
2206 Register temp4,
2207 Register result,
2208 u1 super_klass_slot) {
2209 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result);
2210
2211 Label L_done;
2212
2213 BLOCK_COMMENT("lookup_secondary_supers_table_const {");
2214
2215 const Register
2216 r_array_base = temp1,
2217 r_array_length = temp2,
2218 r_array_index = temp3,
2219 r_bitmap = temp4;
2220
2221 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS; // Required for stub call below.
2222
2223 ld(r_bitmap, in_bytes(Klass::secondary_supers_bitmap_offset()), r_sub_klass);
2224
2225 // First check the bitmap to see if super_klass might be present. If
2226 // the bit is zero, we are certain that super_klass is not one of
2227 // the secondary supers.
2228 u1 bit = super_klass_slot;
2229 int shift_count = Klass::SECONDARY_SUPERS_TABLE_MASK - bit;
2230
2231 // if (shift_count == 0) this is used for comparing with 0:
2232 sldi_(r_array_index, r_bitmap, shift_count);
2233
2234 li(result, 1); // failure
2235 // We test the MSB of r_array_index, i.e. its sign bit
2236 bge(CR0, L_done);
2237
2238 // We will consult the secondary-super array.
2239 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2240
2241 // The value i in r_array_index is >= 1, so even though r_array_base
2242 // points to the length, we don't need to adjust it to point to the
2243 // data.
2244 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
2245
2246 // Get the first array index that can contain super_klass.
2247 if (bit != 0) {
2248 popcntd(r_array_index, r_array_index);
2249 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
2250 sldi(r_array_index, r_array_index, LogBytesPerWord); // scale
2251 ldx(result, r_array_base, r_array_index);
2252 } else {
2253 // Actually use index 0, but r_array_base and r_array_index are off by 1 word
2254 // such that the sum is precise.
2255 ld(result, BytesPerWord, r_array_base);
2256 li(r_array_index, BytesPerWord); // for slow path (scaled)
2257 }
2258
2259 xor_(result, result, r_super_klass);
2260 beq(CR0, L_done); // Found a match (result == 0)
2261
2262 // Is there another entry to check? Consult the bitmap.
2263 testbitdi(CR0, /* temp */ r_array_length, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
2264 beq(CR0, L_done); // (result != 0)
2265
2266 // Linear probe. Rotate the bitmap so that the next bit to test is
2267 // in Bit 2 for the look-ahead check in the slow path.
2268 if (bit != 0) {
2269 rldicl(r_bitmap, r_bitmap, 64 - bit, 0);
2270 }
2271
2272 // Calls into the stub generated by lookup_secondary_supers_table_slow_path.
2273 // Arguments: r_super_klass, r_array_base, r_array_index, r_bitmap.
2274 // Kills: r_array_length.
2275 // Returns: result.
2276 address stub = StubRoutines::lookup_secondary_supers_table_slow_path_stub();
2277 Register r_stub_addr = r_array_length;
2278 add_const_optimized(r_stub_addr, R29_TOC, MacroAssembler::offset_to_global_toc(stub), R0);
2279 mtctr(r_stub_addr);
2280 bctrl();
2281
2282 bind(L_done);
2283 BLOCK_COMMENT("} lookup_secondary_supers_table_const");
2284
2285 if (VerifySecondarySupers) {
2286 verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
2287 temp1, temp2, temp3);
2288 }
2289 }
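// Rough sketch of the hashed lookup above (pseudo-C, not generated code):
//   if (((bitmap >> slot) & 1) == 0) return 1;               // cannot be a secondary super
//   idx = popcount(bitmap << (63 - slot));                    // 1-based index into the array
//   if (secondary_supers[idx - 1] == super_klass) return 0;   // hit on the hashed slot
//   // otherwise, linear-probe the following slots via the slow-path stub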
2290
2291 // At runtime, return 0 in result if r_super_klass is a superclass of
2292 // r_sub_klass, otherwise return nonzero. Use this version of
2293 // lookup_secondary_supers_table() if you don't know ahead of time
2294 // which superclass will be searched for. Used by interpreter and
2295 // runtime stubs. It is larger and has somewhat greater latency than
2296 // the version above, which takes a constant super_klass_slot.
2297 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
2298 Register r_super_klass,
2299 Register temp1,
2300 Register temp2,
2301 Register temp3,
2302 Register temp4,
2303 Register result) {
2304 assert_different_registers(r_sub_klass, r_super_klass, temp1, temp2, temp3, temp4, result, R0);
2305
2306 Label L_done;
2307
2308 BLOCK_COMMENT("lookup_secondary_supers_table_var {");
2309
2310 const Register
2311 r_array_base = temp1,
2312 slot = temp2,
2313 r_array_index = temp3,
2314 r_bitmap = temp4;
2315
2316 lbz(slot, in_bytes(Klass::hash_slot_offset()), r_super_klass);
2317 ld(r_bitmap, in_bytes(Klass::secondary_supers_bitmap_offset()), r_sub_klass);
2318
2319 li(result, 1); // Make sure that result is nonzero if the test below misses.
2320
2321 // First check the bitmap to see if super_klass might be present. If
2322 // the bit is zero, we are certain that super_klass is not one of
2323 // the secondary supers.
2324 xori(R0, slot, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1); // slot ^ 63 === 63 - slot (mod 64)
2325 sld_(r_array_index, r_bitmap, R0); // shift left by 63-slot
2326
2327 // We test the MSB of r_array_index, i.e. its sign bit
2328 bge(CR0, L_done);
2329
2330 // We will consult the secondary-super array.
2331 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2332
2333 // The value i in r_array_index is >= 1, so even though r_array_base
2334 // points to the length, we don't need to adjust it to point to the data.
2335 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
2336 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
2337
2338 // Get the first array index that can contain super_klass into r_array_index.
2339 popcntd(r_array_index, r_array_index);
2340
2341 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
2342 sldi(r_array_index, r_array_index, LogBytesPerWord); // scale
2343
2344 ldx(R0, r_array_base, r_array_index);
2345 xor_(result, R0, r_super_klass);
2346 beq(CR0, L_done); // found a match, result is 0 in this case
2347
2348 // Linear probe. Rotate the bitmap so that the next bit to test is
2349 // in Bit 1.
2350 neg(R0, slot); // rotate right
2351 rldcl(r_bitmap, r_bitmap, R0, 0);
2352 Register temp = slot;
2353 andi_(temp, r_bitmap, 2);
2354 beq(CR0, L_done); // fail (result != 0)
2355
2356 // The slot we just inspected is at secondary_supers[r_array_index - 1].
2357 // The next slot to be inspected, by the logic we're about to call,
2358 // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
2359 // have been checked.
2360 lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
2361 r_bitmap, result, temp);
2362 // return whatever we got from slow path
2363
2364 bind(L_done);
2365
2366 BLOCK_COMMENT("} lookup_secondary_supers_table_var");
2367
2368 if (VerifySecondarySupers) {
2369 verify_secondary_supers_table(r_sub_klass, r_super_klass, result,
2370 temp1, temp2, temp3);
2371 }
2372 }
2373
2374 // Called by code generated by check_klass_subtype_slow_path
2375 // above. This is called when there is a collision in the hashed
2376 // lookup in the secondary supers array.
2377 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
2378 Register r_array_base,
2379 Register r_array_index,
2380 Register r_bitmap,
2381 Register result,
2382 Register temp1) {
2383 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
2384
2385 const Register
2386 r_array_length = temp1,
2387 r_sub_klass = noreg;
2388
2389 Label L_done;
2390
2391 // Load the array length.
2392 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
2393 // And adjust the array base to point to the data.
2394 // NB! Effectively increments current slot index by 1.
2395 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
2396 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
2397
2398 // Linear probe
2399 Label L_huge;
2400
2401 // The bitmap is full to bursting.
2402 // Implicit invariant: BITMAP_FULL implies (length > 0)
2403 cmpwi(CR0, r_array_length, (int32_t)Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
2404 bgt(CR0, L_huge);
2405
2406 // NB! Our caller has checked bits 0 and 1 in the bitmap. The
2407 // current slot (at secondary_supers[r_array_index]) has not yet
2408 // been inspected, and r_array_index may be out of bounds if we
2409 // wrapped around the end of the array.
2410
2411 { // This is conventional linear probing, but instead of terminating
2412 // when a null entry is found in the table, we maintain a bitmap
2413 // in which a 0 indicates missing entries.
2414 // The check above guarantees there are 0s in the bitmap, so the loop
2415 // eventually terminates.
2416
2417 #ifdef ASSERT
2418 {
2419 // We should only reach here after having found a bit in the bitmap.
2420 // Invariant: array_length == popcount(bitmap)
2421 Label ok;
2422 cmpdi(CR0, r_array_length, 0);
2423 bgt(CR0, ok);
2424 stop("array_length must be positive");
2425 bind(ok);
2426 }
2427 #endif
2428
2429 // Compute limit in r_array_length
2430 addi(r_array_length, r_array_length, -1);
2431 sldi(r_array_length, r_array_length, LogBytesPerWord);
2432
2433 Label L_loop;
2434 bind(L_loop);
2435
2436 // Check for wraparound.
2437 cmpd(CR0, r_array_index, r_array_length);
2438 isel_0(r_array_index, CR0, Assembler::greater);
2439
2440 ldx(result, r_array_base, r_array_index);
2441 xor_(result, result, r_super_klass);
2442 beq(CR0, L_done); // success (result == 0)
2443
2444 // look-ahead check (Bit 2); result is non-zero
2445 testbitdi(CR0, R0, r_bitmap, 2);
2446 beq(CR0, L_done); // fail (result != 0)
2447
2448 rldicl(r_bitmap, r_bitmap, 64 - 1, 0);
2449 addi(r_array_index, r_array_index, BytesPerWord);
2450 b(L_loop);
2451 }
2452
2453 { // Degenerate case: more than 64 secondary supers.
2454 // FIXME: We could do something smarter here, maybe a vectorized
2455 // comparison or a binary search, but is that worth any added
2456 // complexity?
2457 bind(L_huge);
2458 repne_scan(r_array_base, r_super_klass, r_array_length, result);
2459 }
2460
2461 bind(L_done);
2462 }
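// Sketch of the probe loop above: starting at secondary_supers[r_array_index], wrap the
// index back to 0 once it passes (length - 1), and stop when an entry matches
// (result == 0) or when the look-ahead bit (Bit 2 of the rotated bitmap) is 0
// (result != 0). Tables with more than SECONDARY_SUPERS_TABLE_SIZE - 2 entries fall
// back to the plain repne_scan.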
2463
2464 // Make sure that the hashed lookup and a linear scan agree.
2465 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
2466 Register r_super_klass,
2467 Register result,
2468 Register temp1,
2469 Register temp2,
2470 Register temp3) {
2471 assert_different_registers(r_sub_klass, r_super_klass, result, temp1, temp2, temp3);
2472
2473 const Register
2474 r_array_base = temp1,
2475 r_array_length = temp2,
2476 r_array_index = temp3,
2477 r_bitmap = noreg; // unused
2478
2479 BLOCK_COMMENT("verify_secondary_supers_table {");
2480
2481 Label passed, failure;
2482
2483 // We will consult the secondary-super array.
2484 ld(r_array_base, in_bytes(Klass::secondary_supers_offset()), r_sub_klass);
2485 // Load the array length.
2486 lwa(r_array_length, Array<Klass*>::length_offset_in_bytes(), r_array_base);
2487 // And adjust the array base to point to the data.
2488 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
2489
2490 // convert !=0 to 1
2491 normalize_bool(result, R0, true);
2492 const Register linear_result = r_array_index; // reuse
2493 li(linear_result, 1);
2494 cmpdi(CR0, r_array_length, 0);
2495 ble(CR0, failure);
2496 repne_scan(r_array_base, r_super_klass, r_array_length, linear_result);
2497 bind(failure);
2498
2499 // convert !=0 to 1
2500 normalize_bool(linear_result, R0, true);
2501
2502 cmpd(CR0, result, linear_result);
2503 beq(CR0, passed);
2504
2505 // report fatal error and terminate VM
2506
2507 // Argument shuffle. Using stack to avoid clashes.
2508 std(r_super_klass, -8, R1_SP);
2509 std(r_sub_klass, -16, R1_SP);
2510 std(linear_result, -24, R1_SP);
2511 mr_if_needed(R6_ARG4, result);
2512 ld(R3_ARG1, -8, R1_SP);
2513 ld(R4_ARG2, -16, R1_SP);
2514 ld(R5_ARG3, -24, R1_SP);
2515
2516 const char* msg = "mismatch";
2517 load_const_optimized(R7_ARG5, (intptr_t)msg, R0);
2518 call_VM_leaf(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
2519 should_not_reach_here();
2520
2521 bind(passed);
2522
2523 BLOCK_COMMENT("} verify_secondary_supers_table");
2524 }
2525
2526 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2527 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2528
2529 Label L_check_thread, L_fallthrough;
2530 if (L_fast_path == nullptr) {
2531 L_fast_path = &L_fallthrough;
2532 } else if (L_slow_path == nullptr) {
2533 L_slow_path = &L_fallthrough;
2534 }
2535
2536 // Fast path check: class is fully initialized
2537 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2538 // acquire by cmp-branch-isync if fully_initialized
2539 cmpwi(CR0, R0, InstanceKlass::fully_initialized);
2540 bne(CR0, L_check_thread);
2541 isync();
2542 b(*L_fast_path);
2543
2544 // Fast path check: current thread is initializer thread
2545 bind(L_check_thread);
2546 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2547 cmpd(CR0, thread, R0);
2548 if (L_slow_path == &L_fallthrough) {
2549 beq(CR0, *L_fast_path);
2550 } else if (L_fast_path == &L_fallthrough) {
2551 bne(CR0, *L_slow_path);
2552 } else {
2553 Unimplemented();
2554 }
2555
2556 bind(L_fallthrough);
2557 }
2558
2559 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2560 Register temp_reg,
2561 int extra_slot_offset) {
2562 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2563 int stackElementSize = Interpreter::stackElementSize;
2564 int offset = extra_slot_offset * stackElementSize;
2565 if (arg_slot.is_constant()) {
2566 offset += arg_slot.as_constant() * stackElementSize;
2567 return offset;
2568 } else {
2569 assert(temp_reg != noreg, "must specify");
2570 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2571 if (offset != 0)
2572 addi(temp_reg, temp_reg, offset);
2573 return temp_reg;
2574 }
2575 }
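// Example (sketch, assuming Interpreter::stackElementSize == 8): arg_slot = 2 and
// extra_slot_offset = 1 yield the constant offset (1 + 2) * 8 = 24, while a register
// arg_slot is scaled by 8 and biased by extra_slot_offset * 8 into temp_reg.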
2576
2577 void MacroAssembler::tlab_allocate(
2578 Register obj, // result: pointer to object after successful allocation
2579 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
2580 int con_size_in_bytes, // object size in bytes if known at compile time
2581 Register t1, // temp register
2582 Label& slow_case // continuation point if fast allocation fails
2583 ) {
2584 // make sure arguments make sense
2585 assert_different_registers(obj, var_size_in_bytes, t1);
2586 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2587 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2588
2589 const Register new_top = t1;
2590 //verify_tlab(); not implemented
2591
2592 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2593 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2594 if (var_size_in_bytes == noreg) {
2595 addi(new_top, obj, con_size_in_bytes);
2596 } else {
2597 add(new_top, obj, var_size_in_bytes);
2598 }
2599 cmpld(CR0, new_top, R0);
2600 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CR0, Assembler::greater), slow_case);
2601
2602 #ifdef ASSERT
2603 // make sure new free pointer is properly aligned
2604 {
2605 Label L;
2606 andi_(R0, new_top, MinObjAlignmentInBytesMask);
2607 beq(CR0, L);
2608 stop("updated TLAB free is not properly aligned");
2609 bind(L);
2610 }
2611 #endif // ASSERT
2612
2613 // update the tlab top pointer
2614 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2615 //verify_tlab(); not implemented
2616 }
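// C-like sketch of the bump-pointer allocation above (not generated code):
//   obj = thread->tlab_top();
//   new_top = obj + size;                  // size = con_size_in_bytes or var_size_in_bytes
//   if (new_top > thread->tlab_end()) goto slow_case;
//   thread->set_tlab_top(new_top);         // obj now points to the new object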
2617
2618 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2619 int insts_call_instruction_offset, Register Rtoc) {
2620 // Start the stub.
2621 address stub = start_a_stub(64);
2622 if (stub == nullptr) { return nullptr; } // CodeCache full: bail out
2623
2624 // Create a trampoline stub relocation which relates this trampoline stub
2625 // with the call instruction at insts_call_instruction_offset in the
2626 // instructions code-section.
2627 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2628 const int stub_start_offset = offset();
2629
2630 // For java_to_interp stubs we use R11_scratch1 as scratch register
2631 // and in call trampoline stubs we use R12_scratch2. This way we
2632 // can distinguish them (see is_NativeCallTrampolineStub_at()).
2633 Register reg_scratch = R12_scratch2;
2634
2635 // Now, create the trampoline stub's code:
2636 // - load the TOC
2637 // - load the call target from the constant pool
2638 // - call
2639 if (Rtoc == noreg) {
2640 calculate_address_from_global_toc(reg_scratch, method_toc());
2641 Rtoc = reg_scratch;
2642 }
2643
2644 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2645 mtctr(reg_scratch);
2646 bctr();
2647
2648 const address stub_start_addr = addr_at(stub_start_offset);
2649
2650 // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2651 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2652 "encoded offset into the constant pool must match");
  // The emitted code must not exceed trampoline_stub_size.
2654 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2655 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2656
2657 // End the stub.
2658 end_a_stub();
2659 return stub;
2660 }
2661
2662 // "The box" is the space on the stack where we copy the object mark.
2663 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register obj, Register box,
2664 Register tmp1, Register tmp2, Register tmp3) {
2665 assert_different_registers(obj, box, tmp1, tmp2, tmp3);
2666 assert(UseObjectMonitorTable || tmp3 == noreg, "tmp3 not needed");
2667 assert(flag == CR0, "bad condition register");
2668
2669 // Handle inflated monitor.
2670 Label inflated;
  // Finish fast lock successfully. MUST be reached with flag == EQ.
  Label locked;
  // Finish fast lock unsuccessfully. MUST be reached with flag == NE.
  Label slow_path;
2675
2676 if (UseObjectMonitorTable) {
2677 // Clear cache in case fast locking succeeds or we need to take the slow-path.
2678 li(tmp1, 0);
2679 std(tmp1, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
2680 }
2681
2682 if (DiagnoseSyncOnValueBasedClasses != 0) {
2683 load_klass(tmp1, obj);
2684 lbz(tmp1, in_bytes(Klass::misc_flags_offset()), tmp1);
2685 testbitdi(CR0, R0, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
2686 bne(CR0, slow_path);
2687 }
2688
2689 Register mark = tmp1;
2690
2691 { // Fast locking
2692
    // Push lock to the lock stack and finish successfully. MUST be reached with flag == EQ.
2694 Label push;
2695
2696 const Register top = tmp2;
2697
2698 // Check if lock-stack is full.
2699 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2700 cmplwi(CR0, top, LockStack::end_offset() - 1);
2701 bgt(CR0, slow_path);
2702
2703 // The underflow check is elided. The recursive check will always fail
2704 // when the lock stack is empty because of the _bad_oop_sentinel field.
2705
2706 // Check if recursive.
2707 subi(R0, top, oopSize);
2708 ldx(R0, R16_thread, R0);
2709 cmpd(CR0, obj, R0);
2710 beq(CR0, push);
2711
2712 // Check for monitor (0b10) or locked (0b00).
2713 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2714 andi_(R0, mark, markWord::lock_mask_in_place);
2715 cmpldi(CR0, R0, markWord::unlocked_value);
2716 bgt(CR0, inflated);
2717 bne(CR0, slow_path);
2718
2719 // Not inflated.
2720
2721 // Try to lock. Transition lock bits 0b01 => 0b00
2722 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
2723 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq);
2724
2725 bind(push);
2726 // After successful lock, push object on lock-stack.
2727 stdx(obj, R16_thread, top);
2728 addi(top, top, oopSize);
2729 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2730 b(locked);
2731 }
2732
2733 { // Handle inflated monitor.
2734 bind(inflated);
2735
2736 // mark contains the tagged ObjectMonitor*.
2737 const uintptr_t monitor_tag = markWord::monitor_value;
2738 const Register monitor = UseObjectMonitorTable ? tmp1 : noreg;
2739 const Register owner_addr = tmp2;
2740 const Register thread_id = UseObjectMonitorTable ? tmp3 : tmp1;
2741 Label monitor_locked;
2742
2743 if (!UseObjectMonitorTable) {
2744 // Compute owner address.
2745 addi(owner_addr, mark, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag);
2746 mark = noreg;
2747 } else {
2748 const Register tmp3_bucket = tmp3;
2749 const Register tmp2_hash = tmp2;
2750 Label monitor_found;
2751
2752 // Save the mark, we might need it to extract the hash.
2753 mr(tmp2_hash, mark);
2754
2755 // Look for the monitor in the om_cache.
2756
2757 ByteSize cache_offset = JavaThread::om_cache_oops_offset();
2758 ByteSize monitor_offset = OMCache::oop_to_monitor_difference();
2759 const int num_unrolled = OMCache::CAPACITY;
2760 for (int i = 0; i < num_unrolled; i++) {
2761 ld(R0, in_bytes(cache_offset), R16_thread);
2762 ld(monitor, in_bytes(cache_offset + monitor_offset), R16_thread);
2763 cmpd(CR0, R0, obj);
2764 beq(CR0, monitor_found);
2765 cache_offset = cache_offset + OMCache::oop_to_oop_difference();
2766 }
2767
2768 // Look for the monitor in the table.
2769
2770 // Get the hash code.
2771 srdi(tmp2_hash, tmp2_hash, markWord::hash_shift);
2772
2773 // Get the table and calculate the bucket's address
2774 int simm16_rest = load_const_optimized(tmp3, ObjectMonitorTable::current_table_address(), R0, true);
2775 ld_ptr(tmp3, simm16_rest, tmp3);
2776 ld(tmp1, in_bytes(ObjectMonitorTable::table_capacity_mask_offset()), tmp3);
2777 andr(tmp2_hash, tmp2_hash, tmp1);
2778 ld(tmp3_bucket, in_bytes(ObjectMonitorTable::table_buckets_offset()), tmp3);
2779
2780 // Read the monitor from the bucket.
2781 sldi(tmp2_hash, tmp2_hash, LogBytesPerWord);
2782 ldx(monitor, tmp3_bucket, tmp2_hash);
2783
2784 // Check if the monitor in the bucket is special (empty, tombstone or removed).
2785 cmpldi(CR0, monitor, ObjectMonitorTable::SpecialPointerValues::below_is_special);
2786 blt(CR0, slow_path);
2787
2788 // Check if object matches.
2789 ld(tmp3, in_bytes(ObjectMonitor::object_offset()), monitor);
2790 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
2791 bs_asm->try_resolve_weak_handle(this, tmp3, tmp2, slow_path);
2792 cmpd(CR0, tmp3, obj);
2793 bne(CR0, slow_path);
2794
2795 bind(monitor_found);
2796
2797 // Compute owner address.
2798 addi(owner_addr, monitor, in_bytes(ObjectMonitor::owner_offset()));
2799 }
2800
2801 // Try to CAS owner (no owner => current thread's _monitor_owner_id).
2802 assert_different_registers(thread_id, monitor, owner_addr, box, R0);
2803 ld(thread_id, in_bytes(JavaThread::monitor_owner_id_offset()), R16_thread);
2804 cmpxchgd(/*flag=*/CR0,
2805 /*current_value=*/R0,
2806 /*compare_value=*/(intptr_t)0,
2807 /*exchange_value=*/thread_id,
2808 /*where=*/owner_addr,
2809 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2810 MacroAssembler::cmpxchgx_hint_acquire_lock());
2811 beq(CR0, monitor_locked);
2812
2813 // Check if recursive.
2814 cmpd(CR0, R0, thread_id);
2815 bne(CR0, slow_path);
2816
2817 // Recursive.
2818 if (!UseObjectMonitorTable) {
2819 assert_different_registers(tmp1, owner_addr);
2820 ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2821 addi(tmp1, tmp1, 1);
2822 std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2823 } else {
2824 assert_different_registers(tmp2, monitor);
2825 ld(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2826 addi(tmp2, tmp2, 1);
2827 std(tmp2, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2828 }
2829
2830 bind(monitor_locked);
2831 if (UseObjectMonitorTable) {
2832 std(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
2833 }
2834 }
2835
2836 bind(locked);
2837
2838 #ifdef ASSERT
2839 // Check that locked label is reached with flag == EQ.
2840 Label flag_correct;
2841 beq(CR0, flag_correct);
2842 stop("Fast Lock Flag != EQ");
2843 #endif
2844 bind(slow_path);
2845 #ifdef ASSERT
2846 // Check that slow_path label is reached with flag == NE.
2847 bne(CR0, flag_correct);
2848 stop("Fast Lock Flag != NE");
2849 bind(flag_correct);
2850 #endif
2851 // C2 uses the value of flag (NE vs EQ) to determine the continuation.
2852 }
2853
2854 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register obj, Register box,
2855 Register tmp1, Register tmp2, Register tmp3) {
2856 assert_different_registers(obj, tmp1, tmp2, tmp3);
2857 assert(flag == CR0, "bad condition register");
2858
2859 // Handle inflated monitor.
2860 Label inflated, inflated_load_monitor;
  // Finish fast unlock successfully. MUST be reached with flag == EQ.
  Label unlocked;
  // Finish fast unlock unsuccessfully. MUST be reached with flag == NE.
  Label slow_path;
2865
2866 const Register mark = tmp1;
2867 const Register top = tmp2;
2868 const Register t = tmp3;
2869
2870 { // Fast unlock
2871 Label push_and_slow;
2872
2873 // Check if obj is top of lock-stack.
2874 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2875 subi(top, top, oopSize);
2876 ldx(t, R16_thread, top);
2877 cmpd(CR0, obj, t);
2878 // Top of lock stack was not obj. Must be monitor.
2879 bne(CR0, inflated_load_monitor);
2880
2881 // Pop lock-stack.
2882 DEBUG_ONLY(li(t, 0);)
2883 DEBUG_ONLY(stdx(t, R16_thread, top);)
2884 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2885
2886 // The underflow check is elided. The recursive check will always fail
2887 // when the lock stack is empty because of the _bad_oop_sentinel field.
2888
2889 // Check if recursive.
2890 subi(t, top, oopSize);
2891 ldx(t, R16_thread, t);
2892 cmpd(CR0, obj, t);
2893 beq(CR0, unlocked);
2894
2895 // Not recursive.
2896
2897 // Check for monitor (0b10).
2898 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2899 andi_(t, mark, markWord::monitor_value);
2900 if (!UseObjectMonitorTable) {
2901 bne(CR0, inflated);
2902 } else {
2903 bne(CR0, push_and_slow);
2904 }
2905
2906 #ifdef ASSERT
2907 // Check header not unlocked (0b01).
2908 Label not_unlocked;
2909 andi_(t, mark, markWord::unlocked_value);
2910 beq(CR0, not_unlocked);
2911 stop("fast_unlock already unlocked");
2912 bind(not_unlocked);
2913 #endif
2914
2915 // Try to unlock. Transition lock bits 0b00 => 0b01
2916 atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel);
2917 b(unlocked);
2918
2919 bind(push_and_slow);
2920 // Restore lock-stack and handle the unlock in runtime.
2921 DEBUG_ONLY(stdx(obj, R16_thread, top);)
2922 addi(top, top, oopSize);
2923 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2924 b(slow_path);
2925 }
2926
2927 { // Handle inflated monitor.
2928 bind(inflated_load_monitor);
2929 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2930 #ifdef ASSERT
2931 andi_(t, mark, markWord::monitor_value);
2932 bne(CR0, inflated);
2933 stop("Fast Unlock not monitor");
2934 #endif
2935
2936 bind(inflated);
2937
2938 #ifdef ASSERT
2939 Label check_done;
2940 subi(top, top, oopSize);
2941 cmplwi(CR0, top, in_bytes(JavaThread::lock_stack_base_offset()));
2942 blt(CR0, check_done);
2943 ldx(t, R16_thread, top);
2944 cmpd(CR0, obj, t);
2945 bne(CR0, inflated);
2946 stop("Fast Unlock lock on stack");
2947 bind(check_done);
2948 #endif
2949
2950 // mark contains the tagged ObjectMonitor*.
2951 const Register monitor = mark;
2952 const uintptr_t monitor_tag = markWord::monitor_value;
2953
2954 if (!UseObjectMonitorTable) {
2955 // Untag the monitor.
2956 subi(monitor, mark, monitor_tag);
2957 } else {
2958 ld(monitor, BasicLock::object_monitor_cache_offset_in_bytes(), box);
2959 // null check with Flags == NE, no valid pointer below alignof(ObjectMonitor*)
2960 cmpldi(CR0, monitor, checked_cast<uint8_t>(alignof(ObjectMonitor*)));
2961 blt(CR0, slow_path);
2962 }
2963
2964 const Register recursions = tmp2;
2965 Label not_recursive;
2966
2967 // Check if recursive.
2968 ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2969 addic_(recursions, recursions, -1);
2970 blt(CR0, not_recursive);
2971
2972 // Recursive unlock.
2973 std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2974 crorc(CR0, Assembler::equal, CR0, Assembler::equal);
2975 b(unlocked);
2976
2977 bind(not_recursive);
2978
2979 // Set owner to null.
2980 // Release to satisfy the JMM
2981 release();
2982 li(t, 0);
2983 std(t, in_bytes(ObjectMonitor::owner_offset()), monitor);
2984 // We need a full fence after clearing owner to avoid stranding.
2985 // StoreLoad achieves this.
2986 membar(StoreLoad);
2987
2988 // Check if the entry_list is empty.
2989 ld(t, in_bytes(ObjectMonitor::entry_list_offset()), monitor);
2990 cmpdi(CR0, t, 0);
2991 beq(CR0, unlocked); // If so we are done.
2992
2993 // Check if there is a successor.
2994 ld(t, in_bytes(ObjectMonitor::succ_offset()), monitor);
2995 cmpdi(CR0, t, 0);
2996 // Invert equal bit
2997 crnand(flag, Assembler::equal, flag, Assembler::equal);
2998 beq(CR0, unlocked); // If there is a successor we are done.
2999
3000 // Save the monitor pointer in the current thread, so we can try
3001 // to reacquire the lock in SharedRuntime::monitor_exit_helper().
3002 std(monitor, in_bytes(JavaThread::unlocked_inflated_monitor_offset()), R16_thread);
3003 b(slow_path); // flag == NE
3004 }
3005
3006 bind(unlocked);
3007
3008 #ifdef ASSERT
3009 // Check that unlocked label is reached with flag == EQ.
3010 Label flag_correct;
3011 beq(CR0, flag_correct);
  stop("Fast Unlock Flag != EQ");
3013 #endif
3014 bind(slow_path);
3015 #ifdef ASSERT
3016 // Check that slow_path label is reached with flag == NE.
3017 bne(CR0, flag_correct);
  stop("Fast Unlock Flag != NE");
3019 bind(flag_correct);
3020 #endif
3021 // C2 uses the value of flag (NE vs EQ) to determine the continuation.
3022 }
3023
3024 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
3025 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
3026
3027 if (at_return) {
3028 if (in_nmethod) {
3029 if (UseSIGTRAP) {
3030 // Use Signal Handler.
3031 relocate(relocInfo::poll_return_type);
3032 td(traptoGreaterThanUnsigned, R1_SP, temp);
3033 } else {
3034 cmpld(CR0, R1_SP, temp);
3035 // Stub may be out of range for short conditional branch.
3036 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CR0, Assembler::greater), slow_path);
3037 }
3038 } else { // Not in nmethod.
3039 // Frame still on stack, need to get fp.
3040 Register fp = R0;
3041 ld(fp, _abi0(callers_sp), R1_SP);
3042 cmpld(CR0, fp, temp);
3043 bgt(CR0, slow_path);
3044 }
3045 } else { // Normal safepoint poll. Not at return.
3046 assert(!in_nmethod, "should use load_from_polling_page");
3047 andi_(temp, temp, SafepointMechanism::poll_bit());
3048 bne(CR0, slow_path);
3049 }
3050 }
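// Poll semantics (sketch, based on the code above): at return, the stack pointer (or the
// caller's SP when not in an nmethod) is compared unsigned-greater against the polling
// word and we trap or branch to the slow path if it is greater; otherwise only the poll
// bit of the polling word is tested:
//   if (thread->polling_word() & SafepointMechanism::poll_bit()) goto slow_path;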
3051
3052 void MacroAssembler::jump_to_polling_page_return_handler_blob(int safepoint_offset, bool fixed_size) {
3053 assert(SharedRuntime::polling_page_return_handler_blob() != nullptr,
3054 "polling page return stub not created yet");
3055 address stub = SharedRuntime::polling_page_return_handler_blob()->entry_point();
3056
3057 // Determine saved exception pc using pc relative address computation.
3058 {
3059 Label next_pc;
3060 bl(next_pc);
3061 bind(next_pc);
3062 }
3063 int current_offset = offset();
3064
3065 if (fixed_size) {
3066 // Code size must not depend on offsets.
3067 load_const32(R12, safepoint_offset - current_offset);
3068 mflr(R0);
3069 add(R12, R12, R0);
3070 } else {
3071 mflr(R12);
3072 add_const_optimized(R12, R12, safepoint_offset - current_offset);
3073 }
3074 std(R12, in_bytes(JavaThread::saved_exception_pc_offset()), R16_thread);
3075
3076 add_const_optimized(R0, R29_TOC, MacroAssembler::offset_to_global_toc(stub));
3077 mtctr(R0);
3078 bctr();
3079 }
3080
3081 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
3082 MacroAssembler::PreservationLevel preservation_level) {
3083 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3084 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
3085 }
3086
3087 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
3088 MacroAssembler::PreservationLevel preservation_level) {
3089 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3090 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
3091 }
3092
3093 // Values for last_Java_pc, and last_Java_sp must comply to the rules
3094 // in frame_ppc.hpp.
3095 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
  // Always set last_Java_pc and flags first because once last_Java_sp
  // is visible, has_last_Java_frame is true and users will look at the
  // rest of the fields. (Note: flags should always be zero before we
  // get here, so they don't need to be set.)
3100
3101 // Verify that last_Java_pc was zeroed on return to Java
3102 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
3103 "last_Java_pc not zeroed before leaving Java");
3104
  // When returning from a call out of Java mode, the frame anchor's
  // last_Java_pc will always be set to null. It is set here so that,
  // if we are doing a call to native (not VM) code, we capture the
  // known pc and don't have to rely on the native call having a
  // standard frame linkage where we could find the pc.
3110 if (last_Java_pc != noreg)
3111 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3112
3113 // Set last_Java_sp last.
3114 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3115 }
3116
3117 void MacroAssembler::reset_last_Java_frame(bool check_last_java_sp) {
3118 if (check_last_java_sp) {
3119 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
3120 R16_thread, "SP was not set, still zero");
3121 }
3122
3123 BLOCK_COMMENT("reset_last_Java_frame {");
3124 li(R0, 0);
3125
3126 // _last_Java_sp = 0
3127 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
3128
3129 // _last_Java_pc = 0
3130 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
3131 BLOCK_COMMENT("} reset_last_Java_frame");
3132 }
3133
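// Make the frame at the given SP the last Java frame. The pc is taken from the
// label jpc (which may not be bound yet), or from the current pc if jpc is null.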
3134 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1, Label* jpc) {
3135 assert_different_registers(sp, tmp1);
3136
3137 if (jpc == nullptr || jpc->is_bound()) {
3138 load_const_optimized(tmp1, jpc == nullptr ? pc() : target(*jpc));
3139 } else {
3140 load_const(tmp1, *jpc, R12_scratch2);
3141 }
3142
3143 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
3144 }
3145
3146 void MacroAssembler::get_vm_result_oop(Register oop_result) {
3147 // Read:
3148 // R16_thread
3149 // R16_thread->in_bytes(JavaThread::vm_result_oop_offset())
3150 //
3151 // Updated:
3152 // oop_result
3153 // R16_thread->in_bytes(JavaThread::vm_result_oop_offset())
3154
3155 ld(oop_result, in_bytes(JavaThread::vm_result_oop_offset()), R16_thread);
3156 li(R0, 0);
3157 std(R0, in_bytes(JavaThread::vm_result_oop_offset()), R16_thread);
3158
3159 verify_oop(oop_result, FILE_AND_LINE);
3160 }
3161
3162 void MacroAssembler::get_vm_result_metadata(Register metadata_result) {
3163 // Read:
3164 // R16_thread
3165 // R16_thread->in_bytes(JavaThread::vm_result_metadata_offset())
3166 //
3167 // Updated:
3168 // metadata_result
3169 // R16_thread->in_bytes(JavaThread::vm_result_metadata_offset())
3170
3171 ld(metadata_result, in_bytes(JavaThread::vm_result_metadata_offset()), R16_thread);
3172 li(R0, 0);
3173 std(R0, in_bytes(JavaThread::vm_result_metadata_offset()), R16_thread);
3174 }
3175
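// Encode the Klass* held in src (or in dst if src == noreg) into a narrow klass.
// Returns the register holding the encoded value (dst, or src if no
// transformation was needed).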
3176 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3177 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
3178 if (CompressedKlassPointers::base() != nullptr) {
3179 // Use dst as temp if it is free.
3180 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
3181 current = dst;
3182 }
3183 if (CompressedKlassPointers::shift() != 0) {
3184 srdi(dst, current, CompressedKlassPointers::shift());
3185 current = dst;
3186 }
3187 return current;
3188 }
3189
3190 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
3191 assert(!UseCompactObjectHeaders, "not with compact headers");
3192 Register compressedKlass = encode_klass_not_null(ck, klass);
3193 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
3194 }
3195
3196 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
3197 assert(!UseCompactObjectHeaders, "not with compact headers");
3198 if (val == noreg) {
3199 val = R0;
3200 li(val, 0);
3201 }
3202 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop);
3203 }
3204
3205 int MacroAssembler::instr_size_for_load_klass() {
3206 static int computed_size = -1;
3207
3208 // Not yet computed?
3209 if (computed_size == -1) {
3210
3211 // Determine by scratch emit.
3212 ResourceMark rm;
3213 int code_size = 16 * BytesPerInstWord;
3214 CodeBuffer cb("load_klass scratch buffer", code_size, 0);
3215 MacroAssembler* a = new MacroAssembler(&cb);
3216 a->load_klass(R11_scratch1, R11_scratch1);
3217 computed_size = a->offset();
3218 }
3219
3220 return computed_size;
3221 }
3222
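// Decode a narrow klass in src (or in dst if src == noreg) into dst.
// The narrow klass must not be null.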
3223 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3224 assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
3225 if (src == noreg) src = dst;
3226 Register shifted_src = src;
3227 if (CompressedKlassPointers::shift() != 0 ||
3228 (CompressedKlassPointers::base() == nullptr && src != dst)) { // Move required.
3229 shifted_src = dst;
3230 sldi(shifted_src, src, CompressedKlassPointers::shift());
3231 }
3232 if (CompressedKlassPointers::base() != nullptr) {
3233 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
3234 }
3235 }
3236
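// Load the narrow klass of an object into dst without decoding it:
// either from the compact mark word or from the klass field.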
3237 void MacroAssembler::load_klass_no_decode(Register dst, Register src) {
3238 if (UseCompactObjectHeaders) {
3239 load_narrow_klass_compact(dst, src);
3240 } else {
3241 lwz(dst, oopDesc::klass_offset_in_bytes(), src);
3242 }
3243 }
3244
3245 void MacroAssembler::load_klass(Register dst, Register src) {
3246 load_klass_no_decode(dst, src);
3247 decode_klass_not_null(dst);
3248 }
3249
// Loads the obj's narrow Klass (nklass) into dst.
// Preserves all registers except dst; in particular, src is preserved.
// Input:
//  src - the oop we want to load the klass from.
//  dst - output nklass.
3255 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3256 assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
3257 ld(dst, oopDesc::mark_offset_in_bytes(), src);
3258 srdi(dst, dst, markWord::klass_shift);
3259 }
3260
3261 void MacroAssembler::cmp_klass(ConditionRegister dst, Register obj, Register klass, Register tmp, Register tmp2) {
3262 assert_different_registers(obj, klass, tmp);
3263 if (UseCompactObjectHeaders) {
3264 load_narrow_klass_compact(tmp, obj);
3265 } else {
3266 lwz(tmp, oopDesc::klass_offset_in_bytes(), obj);
3267 }
3268 Register encoded_klass = encode_klass_not_null(tmp2, klass);
3269 cmpw(dst, tmp, encoded_klass);
3270 }
3271
3272 void MacroAssembler::cmp_klasses_from_objects(ConditionRegister dst, Register obj1, Register obj2, Register tmp1, Register tmp2) {
3273 if (UseCompactObjectHeaders) {
3274 load_narrow_klass_compact(tmp1, obj1);
3275 load_narrow_klass_compact(tmp2, obj2);
3276 cmpw(dst, tmp1, tmp2);
3277 } else {
3278 lwz(tmp1, oopDesc::klass_offset_in_bytes(), obj1);
3279 lwz(tmp2, oopDesc::klass_offset_in_bytes(), obj2);
3280 cmpw(dst, tmp1, tmp2);
3281 }
3282 }
3283
3284 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
3285 null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
3286 load_klass(dst, src);
3287 }
3288
3289 // ((OopHandle)result).resolve();
3290 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
3291 MacroAssembler::PreservationLevel preservation_level) {
3292 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
3293 }
3294
3295 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
3296 MacroAssembler::PreservationLevel preservation_level) {
3297 Label resolved;
3298
3299 // A null weak handle resolves to null.
3300 cmpdi(CR0, result, 0);
3301 beq(CR0, resolved);
3302
3303 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
3304 preservation_level);
3305 bind(resolved);
3306 }
3307
3308 void MacroAssembler::load_method_holder(Register holder, Register method) {
3309 ld(holder, in_bytes(Method::const_offset()), method);
3310 ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
3311 ld(holder, ConstantPool::pool_holder_offset(), holder);
3312 }
3313
3314 // Clear Array
3315 // For very short arrays. tmp == R0 is allowed.
3316 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
3317 if (cnt_dwords > 0) { li(tmp, 0); }
3318 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
3319 }
3320
3321 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
3322 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
3323 if (cnt_dwords < 8) {
3324 clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
3325 return;
3326 }
3327
3328 Label loop;
3329 const long loopcnt = cnt_dwords >> 1,
3330 remainder = cnt_dwords & 1;
3331
3332 li(tmp, loopcnt);
3333 mtctr(tmp);
3334 li(tmp, 0);
3335 bind(loop);
3336 std(tmp, 0, base_ptr);
3337 std(tmp, 8, base_ptr);
3338 addi(base_ptr, base_ptr, 16);
3339 bdnz(loop);
3340 if (remainder) { std(tmp, 0, base_ptr); }
3341 }
3342
3343 // Kills both input registers. tmp == R0 is allowed.
3344 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
3345 // Procedure for large arrays (uses data cache block zero instruction).
3346 Label startloop, fast, fastloop, small_rest, restloop, done;
3347 const int cl_size = VM_Version::L1_data_cache_line_size(),
3348 cl_dwords = cl_size >> 3,
3349 cl_dw_addr_bits = exact_log2(cl_dwords),
3350 dcbz_min = 1, // Min count of dcbz executions, needs to be >0.
3351 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
3352
3353 if (const_cnt >= 0) {
3354 // Constant case.
3355 if (const_cnt < min_cnt) {
3356 clear_memory_constlen(base_ptr, const_cnt, tmp);
3357 return;
3358 }
3359 load_const_optimized(cnt_dwords, const_cnt, tmp);
3360 } else {
3361 // cnt_dwords already loaded in register. Need to check size.
3362 cmpdi(CR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
3363 blt(CR1, small_rest);
3364 }
3365 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
3366 beq(CR0, fast); // Already 128byte aligned.
3367
3368 subfic(tmp, tmp, cl_dwords);
3369 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
3370 subf(cnt_dwords, tmp, cnt_dwords); // rest.
3371 li(tmp, 0);
3372
3373 bind(startloop); // Clear at the beginning to reach 128byte boundary.
3374 std(tmp, 0, base_ptr); // Clear 8byte aligned block.
3375 addi(base_ptr, base_ptr, 8);
3376 bdnz(startloop);
3377
3378 bind(fast); // Clear 128byte blocks.
3379 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0).
3380 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
3381 mtctr(tmp); // Load counter.
3382
3383 bind(fastloop);
3384 dcbz(base_ptr); // Clear 128byte aligned block.
3385 addi(base_ptr, base_ptr, cl_size);
3386 bdnz(fastloop);
3387
3388 bind(small_rest);
3389 cmpdi(CR0, cnt_dwords, 0); // size 0?
3390 beq(CR0, done); // rest == 0
3391 li(tmp, 0);
3392 mtctr(cnt_dwords); // Load counter.
3393
3394 bind(restloop); // Clear rest.
3395 std(tmp, 0, base_ptr); // Clear 8byte aligned block.
3396 addi(base_ptr, base_ptr, 8);
3397 bdnz(restloop);
3398
3399 bind(done);
3400 }
3401
3402 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
3403
3404 // Helpers for Intrinsic Emitters
3405 //
// Reverse the byte order of a 32-bit value in a register
3407 // src: 0x44556677
3408 // dst: 0x77665544
3409 // Three steps to obtain the result:
3410 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
3411 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
3412 // This value initializes dst.
3413 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
3414 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
3415 // This value is mask inserted into dst with a [0..23] mask of 1s.
3416 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
3417 // This value is mask inserted into dst with a [8..15] mask of 1s.
3418 void MacroAssembler::load_reverse_32(Register dst, Register src) {
3419 assert_different_registers(dst, src);
3420
3421 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
3422 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
3423 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
3424 }
3425
3426 // Calculate the column addresses of the crc32 lookup table into distinct registers.
3427 // This loop-invariant calculation is moved out of the loop body, reducing the loop
3428 // body size from 20 to 16 instructions.
3429 // Returns the offset that was used to calculate the address of column tc3.
3430 // Due to register shortage, setting tc3 may overwrite table. With the return offset
3431 // at hand, the original table address can be easily reconstructed.
3432 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
3433
3434 // Point to 4 byte folding tables (byte-reversed version for Big Endian)
3435 // Layout: See StubRoutines::ppc::generate_crc_constants.
3436 #ifdef VM_LITTLE_ENDIAN
3437 const int ix0 = 3 * CRC32_TABLE_SIZE;
3438 const int ix1 = 2 * CRC32_TABLE_SIZE;
3439 const int ix2 = 1 * CRC32_TABLE_SIZE;
3440 const int ix3 = 0 * CRC32_TABLE_SIZE;
3441 #else
3442 const int ix0 = 1 * CRC32_TABLE_SIZE;
3443 const int ix1 = 2 * CRC32_TABLE_SIZE;
3444 const int ix2 = 3 * CRC32_TABLE_SIZE;
3445 const int ix3 = 4 * CRC32_TABLE_SIZE;
3446 #endif
3447 assert_different_registers(table, tc0, tc1, tc2);
3448 assert(table == tc3, "must be!");
3449
3450 addi(tc0, table, ix0);
3451 addi(tc1, table, ix1);
3452 addi(tc2, table, ix2);
3453 if (ix3 != 0) addi(tc3, table, ix3);
3454
3455 return ix3;
3456 }
3457
/**
 * uint32_t crc;
 * crc = table[val & 0xFF] ^ (crc >> 8);
 */
3462 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3463 assert_different_registers(crc, table, tmp);
3464 assert_different_registers(val, table);
3465
3466 if (crc == val) { // Must rotate first to use the unmodified value.
3467 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3468 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3469 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3470 } else {
3471 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3472 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3473 }
3474 lwzx(tmp, table, tmp);
3475 xorr(crc, crc, tmp);
3476 }
3477
3478 /**
3479 * Emits code to update CRC-32 with a byte value according to constants in table.
3480 *
3481 * @param [in,out]crc Register containing the crc.
3482 * @param [in]val Register containing the byte to fold into the CRC.
3483 * @param [in]table Register containing the table of crc constants.
3484 *
3485 * uint32_t crc;
3486 * val = crc_table[(val ^ crc) & 0xFF];
3487 * crc = val ^ (crc >> 8);
3488 */
3489 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3490 BLOCK_COMMENT("update_byte_crc32:");
3491 xorr(val, val, crc);
3492 fold_byte_crc32(crc, val, table, val);
3493 }
3494
3495 /**
3496 * @param crc register containing existing CRC (32-bit)
3497 * @param buf register pointing to input byte buffer (byte*)
3498 * @param len register containing number of bytes
3499 * @param table register pointing to CRC table
3500 */
3501 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3502 Register data, bool loopAlignment) {
3503 assert_different_registers(crc, buf, len, table, data);
3504
3505 Label L_mainLoop, L_done;
3506 const int mainLoop_stepping = 1;
3507 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3508
3509 // Process all bytes in a single-byte loop.
3510 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
3511 beq(CR0, L_done);
3512
3513 mtctr(len);
3514 align(mainLoop_alignment);
3515 BIND(L_mainLoop);
3516 lbz(data, 0, buf); // Byte from buffer, zero-extended.
3517 addi(buf, buf, mainLoop_stepping); // Advance buffer position.
3518 update_byte_crc32(crc, data, table);
3519 bdnz(L_mainLoop); // Iterate.
3520
3521 bind(L_done);
3522 }
3523
3524 /**
3525 * Emits code to update CRC-32 with a 4-byte value according to constants in table
3526 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3527 */
3528 // A note on the lookup table address(es):
3529 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3530 // To save the effort of adding the column offset to the table address each time
3531 // a table element is looked up, it is possible to pass the pre-calculated
3532 // column addresses.
// Uses R9..R12 as work registers. These must be saved/restored by the caller, if necessary.
3534 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3535 Register t0, Register t1, Register t2, Register t3,
3536 Register tc0, Register tc1, Register tc2, Register tc3) {
3537 assert_different_registers(crc, t3);
3538
3539 // XOR crc with next four bytes of buffer.
3540 lwz(t3, bufDisp, buf);
3541 if (bufInc != 0) {
3542 addi(buf, buf, bufInc);
3543 }
3544 xorr(t3, t3, crc);
3545
3546 // Chop crc into 4 single-byte pieces, shifted left 2 bits, to form the table indices.
  rlwinm(t0, t3, 2,         24-2, 31-2);  // ((t3 >>  0) & 0xff) << 2
  rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2);  // ((t3 >>  8) & 0xff) << 2
  rlwinm(t2, t3, 32+(2-16), 24-2, 31-2);  // ((t3 >> 16) & 0xff) << 2
  rlwinm(t3, t3, 32+(2-24), 24-2, 31-2);  // ((t3 >> 24) & 0xff) << 2
3551
3552 // Use the pre-calculated column addresses.
3553 // Load pre-calculated table values.
3554 lwzx(t0, tc0, t0);
3555 lwzx(t1, tc1, t1);
3556 lwzx(t2, tc2, t2);
3557 lwzx(t3, tc3, t3);
3558
3559 // Calculate new crc from table values.
3560 xorr(t0, t0, t1);
3561 xorr(t2, t2, t3);
3562 xorr(crc, t0, t2); // Now crc contains the final checksum value.
3563 }
3564
3565
3566 /**
3567 * @param crc register containing existing CRC (32-bit)
3568 * @param buf register pointing to input byte buffer (byte*)
3569 * @param len register containing number of bytes
3570 * @param constants register pointing to precomputed constants
3571 * @param t0-t6 temp registers
3572 */
3573 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3574 Register t0, Register t1, Register t2, Register t3,
3575 Register t4, Register t5, Register t6, bool invertCRC) {
3576 assert_different_registers(crc, buf, len, constants);
3577
3578 Label L_tail;
3579
3580 BLOCK_COMMENT("kernel_crc32_vpmsum {");
3581
3582 if (invertCRC) {
3583 nand(crc, crc, crc); // 1s complement of crc
3584 }
3585
3586 // Enforce 32 bit.
3587 clrldi(len, len, 32);
3588
3589 // Align if we have enough bytes for the fast version.
3590 const int alignment = 16,
3591 threshold = 32;
3592 Register prealign = t0;
3593
3594 neg(prealign, buf);
3595 addi(t1, len, -threshold);
3596 andi(prealign, prealign, alignment - 1);
3597 cmpw(CR0, t1, prealign);
3598 blt(CR0, L_tail); // len - prealign < threshold?
3599
3600 subf(len, prealign, len);
3601 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3602
3603 // Calculate from first aligned address as far as possible.
3604 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3605 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3606 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3607
3608 // Remaining bytes.
3609 BIND(L_tail);
3610 update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3611
3612 if (invertCRC) {
3613 nand(crc, crc, crc); // 1s complement of crc
3614 }
3615
3616 BLOCK_COMMENT("} kernel_crc32_vpmsum");
3617 }
3618
3619 /**
3620 * @param crc register containing existing CRC (32-bit)
3621 * @param buf register pointing to input byte buffer (byte*)
3622 * @param len register containing number of bytes (will get updated to remaining bytes)
3623 * @param constants register pointing to CRC table for 128-bit aligned memory
3624 * @param t0-t6 temp registers
3625 */
3626 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3627 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3628
3629 // Save non-volatile vector registers (frameless).
3630 Register offset = t1;
3631 int offsetInt = 0;
3632 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3633 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3634 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3635 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3636 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3637 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3638 #ifndef VM_LITTLE_ENDIAN
3639 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3640 #endif
3641 offsetInt -= 8; std(R14, offsetInt, R1_SP);
3642 offsetInt -= 8; std(R15, offsetInt, R1_SP);
3643
3644 // Implementation uses an inner loop which uses between 256 and 16 * unroll_factor
3645 // bytes per iteration. The basic scheme is:
3646 // lvx: load vector (Big Endian needs reversal)
3647 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3648 // vxor: xor partial results together to get unroll_factor2 vectors
3649
3650 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3651
3652 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3653 const int unroll_factor = CRC32_UNROLL_FACTOR,
3654 unroll_factor2 = CRC32_UNROLL_FACTOR2;
3655
3656 const int outer_consts_size = (unroll_factor2 - 1) * 16,
3657 inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3658
3659 // Support registers.
3660 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3661 Register num_bytes = R14,
3662 loop_count = R15,
3663 cur_const = crc; // will live in VCRC
3664 // Constant array for outer loop: unroll_factor2 - 1 registers,
3665 // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3666 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3667 consts1[] = { VR23, VR24 };
3668 // Data register arrays: 2 arrays with unroll_factor2 registers.
3669 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3670 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3671
3672 VectorRegister VCRC = data0[0];
3673 VectorRegister Vc = VR25;
3674 VectorRegister swap_bytes = VR26; // Only for Big Endian.
3675
3676 // We have at least 1 iteration (ensured by caller).
3677 Label L_outer_loop, L_inner_loop, L_last;
3678
3679 // Set DSCR pre-fetch to deepest.
3680 if (VM_Version::has_mfdscr()) {
3681 load_const_optimized(t0, VM_Version::_dscr_val | 7);
3682 mtdscr(t0);
3683 }
3684
3685 mtvrwz(VCRC, crc); // crc lives in VCRC, now
3686
3687 for (int i = 1; i < unroll_factor2; ++i) {
3688 li(offs[i], 16 * i);
3689 }
3690
3691 // Load consts for outer loop
3692 lvx(consts0[0], constants);
3693 for (int i = 1; i < unroll_factor2 - 1; ++i) {
3694 lvx(consts0[i], offs[i], constants);
3695 }
3696
3697 load_const_optimized(num_bytes, 16 * unroll_factor);
3698
3699 // Reuse data registers outside of the loop.
3700 VectorRegister Vtmp = data1[0];
3701 VectorRegister Vtmp2 = data1[1];
3702 VectorRegister zeroes = data1[2];
3703
3704 vspltisb(Vtmp, 0);
3705 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3706
3707 // Load vector for vpermxor (to xor both 64 bit parts together)
3708 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f
3709 vspltisb(Vc, 4);
3710 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3711 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3712 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3713
3714 #ifdef VM_LITTLE_ENDIAN
3715 #define BE_swap_bytes(x)
3716 #else
3717 vspltisb(Vtmp2, 0xf);
3718 vxor(swap_bytes, Vtmp, Vtmp2);
3719 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3720 #endif
3721
3722 cmpd(CR0, len, num_bytes);
3723 blt(CR0, L_last);
3724
3725 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3726 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3727
3728 // ********** Main loop start **********
3729 align(32);
3730 bind(L_outer_loop);
3731
3732 // Begin of unrolled first iteration (no xor).
3733 lvx(data1[0], buf);
3734 for (int i = 1; i < unroll_factor2 / 2; ++i) {
3735 lvx(data1[i], offs[i], buf);
3736 }
3737 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3738 lvx(consts1[0], cur_const);
3739 mtctr(loop_count);
3740 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3741 BE_swap_bytes(data1[i]);
3742 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3743 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3744 vpmsumw(data0[i], data1[i], consts1[0]);
3745 }
3746 addi(buf, buf, 16 * unroll_factor2);
3747 subf(len, num_bytes, len);
3748 lvx(consts1[1], offs[1], cur_const);
3749 addi(cur_const, cur_const, 32);
3750 // Begin of unrolled second iteration (head).
3751 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3752 BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3753 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3754 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3755 }
3756 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3757 BE_swap_bytes(data1[i]);
3758 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3759 vpmsumw(data1[i], data1[i], consts1[1]);
3760 }
3761 addi(buf, buf, 16 * unroll_factor2);
3762
3763 // Generate most performance relevant code. Loads + half of the vpmsumw have been generated.
3764 // Double-iteration allows using the 2 constant registers alternatingly.
3765 align(32);
3766 bind(L_inner_loop);
3767 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3768 if (j & 1) {
3769 lvx(consts1[0], cur_const);
3770 } else {
3771 lvx(consts1[1], offs[1], cur_const);
3772 addi(cur_const, cur_const, 32);
3773 }
3774 for (int i = 0; i < unroll_factor2; ++i) {
3775 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3776 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3777 BE_swap_bytes(data1[idx]);
3778 vxor(data0[i], data0[i], data1[i]);
3779 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3780 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3781 }
3782 addi(buf, buf, 16 * unroll_factor2);
3783 }
3784 bdnz(L_inner_loop);
3785
3786 addi(cur_const, constants, outer_consts_size); // Reset
3787
3788 // Tail of last iteration (no loads).
3789 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3790 BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3791 vxor(data0[i], data0[i], data1[i]);
3792 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3793 }
3794 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3795 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3796 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3797 }
3798
3799 // Last data register is ok, other ones need fixup shift.
3800 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3801 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3802 }
3803
3804 // Combine to 128 bit result vector VCRC = data0[0].
3805 for (int i = 1; i < unroll_factor2; i<<=1) {
3806 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3807 vxor(data0[j], data0[j], data0[j+i]);
3808 }
3809 }
3810 cmpd(CR0, len, num_bytes);
3811 bge(CR0, L_outer_loop);
3812
3813 // Last chance with lower num_bytes.
3814 bind(L_last);
3815 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3816 // Point behind last const for inner loop.
3817 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3818 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3819 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3820 subf(cur_const, R0, cur_const); // Point to constant to be used first.
3821
3822 addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3823 bgt(CR0, L_outer_loop);
3824 // ********** Main loop end **********
3825
3826 // Restore DSCR pre-fetch value.
3827 if (VM_Version::has_mfdscr()) {
3828 load_const_optimized(t0, VM_Version::_dscr_val);
3829 mtdscr(t0);
3830 }
3831
3832 // ********** Simple loop for remaining 16 byte blocks **********
3833 {
3834 Label L_loop, L_done;
3835
3836 srdi_(t0, len, 4); // 16 bytes per iteration
3837 clrldi(len, len, 64-4);
3838 beq(CR0, L_done);
3839
3840 // Point to const (same as last const for inner loop).
3841 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3842 mtctr(t0);
3843 lvx(Vtmp2, cur_const);
3844
3845 align(32);
3846 bind(L_loop);
3847
3848 lvx(Vtmp, buf);
3849 addi(buf, buf, 16);
3850 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3851 BE_swap_bytes(Vtmp);
3852 vxor(VCRC, VCRC, Vtmp);
3853 vpmsumw(VCRC, VCRC, Vtmp2);
3854 bdnz(L_loop);
3855
3856 bind(L_done);
3857 }
3858 // ********** Simple loop end **********
3859 #undef BE_swap_bytes
3860
3861 // Point to Barrett constants
3862 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3863
3864 vspltisb(zeroes, 0);
3865
3866 // Combine to 64 bit result.
3867 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3868
3869 // Reduce to 32 bit CRC: Remainder by multiply-high.
3870 lvx(Vtmp, cur_const);
3871 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit.
3872 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly.
3873 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3874 vsldoi(Vtmp, zeroes, Vtmp, 8);
3875 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly.
3876 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit.
3877
3878 // Move result. len is already updated.
3879 vsldoi(VCRC, VCRC, zeroes, 8);
3880 mfvrd(crc, VCRC);
3881
3882 // Restore non-volatile Vector registers (frameless).
3883 offsetInt = 0;
3884 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3885 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3886 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3887 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3888 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3889 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3890 #ifndef VM_LITTLE_ENDIAN
3891 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3892 #endif
3893 offsetInt -= 8; ld(R14, offsetInt, R1_SP);
3894 offsetInt -= 8; ld(R15, offsetInt, R1_SP);
3895 }
3896
3897 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3898 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3899 load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3900 : StubRoutines::crc_table_addr() , R0);
3901
3902 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3903 }
3904
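// Fold a single byte (in val) into the CRC using the lookup table.
// Optionally applies the one's complement of crc before and after the update.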
3905 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3906 assert_different_registers(crc, val, table);
3907
3908 BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3909 if (invertCRC) {
3910 nand(crc, crc, crc); // 1s complement of crc
3911 }
3912
3913 update_byte_crc32(crc, val, table);
3914
3915 if (invertCRC) {
3916 nand(crc, crc, crc); // 1s complement of crc
3917 }
3918 }
3919
3920 // dest_lo += src1 + src2
3921 // dest_hi += carry1 + carry2
3922 void MacroAssembler::add2_with_carry(Register dest_hi,
3923 Register dest_lo,
3924 Register src1, Register src2) {
3925 li(R0, 0);
3926 addc(dest_lo, dest_lo, src1);
3927 adde(dest_hi, dest_hi, R0);
3928 addc(dest_lo, dest_lo, src2);
3929 adde(dest_hi, dest_hi, R0);
3930 }
3931
3932 // Multiply 64 bit by 64 bit first loop.
3933 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3934 Register x_xstart,
3935 Register y, Register y_idx,
3936 Register z,
3937 Register carry,
3938 Register product_high, Register product,
3939 Register idx, Register kdx,
3940 Register tmp) {
3941 // jlong carry, x[], y[], z[];
3942 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3943 // huge_128 product = y[idx] * x[xstart] + carry;
3944 // z[kdx] = (jlong)product;
3945 // carry = (jlong)(product >>> 64);
3946 // }
3947 // z[xstart] = carry;
3948
3949 Label L_first_loop, L_first_loop_exit;
3950 Label L_one_x, L_one_y, L_multiply;
3951
3952 addic_(xstart, xstart, -1);
3953 blt(CR0, L_one_x); // Special case: length of x is 1.
3954
3955 // Load next two integers of x.
3956 sldi(tmp, xstart, LogBytesPerInt);
3957 ldx(x_xstart, x, tmp);
3958 #ifdef VM_LITTLE_ENDIAN
3959 rldicl(x_xstart, x_xstart, 32, 0);
3960 #endif
3961
3962 align(32, 16);
3963 bind(L_first_loop);
3964
3965 cmpdi(CR0, idx, 1);
3966 blt(CR0, L_first_loop_exit);
3967 addi(idx, idx, -2);
3968 beq(CR0, L_one_y);
3969
3970 // Load next two integers of y.
3971 sldi(tmp, idx, LogBytesPerInt);
3972 ldx(y_idx, y, tmp);
3973 #ifdef VM_LITTLE_ENDIAN
3974 rldicl(y_idx, y_idx, 32, 0);
3975 #endif
3976
3977
3978 bind(L_multiply);
3979 multiply64(product_high, product, x_xstart, y_idx);
3980
3981 li(tmp, 0);
3982 addc(product, product, carry); // Add carry to result.
3983 adde(product_high, product_high, tmp); // Add carry of the last addition.
3984 addi(kdx, kdx, -2);
3985
3986 // Store result.
3987 #ifdef VM_LITTLE_ENDIAN
3988 rldicl(product, product, 32, 0);
3989 #endif
3990 sldi(tmp, kdx, LogBytesPerInt);
3991 stdx(product, z, tmp);
3992 mr_if_needed(carry, product_high);
3993 b(L_first_loop);
3994
3995
3996 bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3997
3998 lwz(y_idx, 0, y);
3999 b(L_multiply);
4000
4001
4002 bind(L_one_x); // Load one 32 bit portion of x as (0,value).
4003
4004 lwz(x_xstart, 0, x);
4005 b(L_first_loop);
4006
4007 bind(L_first_loop_exit);
4008 }
4009
4010 // Multiply 64 bit by 64 bit and add 128 bit.
4011 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
4012 Register z, Register yz_idx,
4013 Register idx, Register carry,
4014 Register product_high, Register product,
4015 Register tmp, int offset) {
4016
4017 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
4018 // z[kdx] = (jlong)product;
4019
4020 sldi(tmp, idx, LogBytesPerInt);
4021 if (offset) {
4022 addi(tmp, tmp, offset);
4023 }
4024 ldx(yz_idx, y, tmp);
4025 #ifdef VM_LITTLE_ENDIAN
4026 rldicl(yz_idx, yz_idx, 32, 0);
4027 #endif
4028
4029 multiply64(product_high, product, x_xstart, yz_idx);
4030 ldx(yz_idx, z, tmp);
4031 #ifdef VM_LITTLE_ENDIAN
4032 rldicl(yz_idx, yz_idx, 32, 0);
4033 #endif
4034
4035 add2_with_carry(product_high, product, carry, yz_idx);
4036
4037 sldi(tmp, idx, LogBytesPerInt);
4038 if (offset) {
4039 addi(tmp, tmp, offset);
4040 }
4041 #ifdef VM_LITTLE_ENDIAN
4042 rldicl(product, product, 32, 0);
4043 #endif
4044 stdx(product, z, tmp);
4045 }
4046
4047 // Multiply 128 bit by 128 bit. Unrolled inner loop.
4048 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
4049 Register y, Register z,
4050 Register yz_idx, Register idx, Register carry,
4051 Register product_high, Register product,
4052 Register carry2, Register tmp) {
4053
4054 // jlong carry, x[], y[], z[];
4055 // int kdx = ystart+1;
4056 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
4057 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
4058 // z[kdx+idx+1] = (jlong)product;
4059 // jlong carry2 = (jlong)(product >>> 64);
4060 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
4061 // z[kdx+idx] = (jlong)product;
4062 // carry = (jlong)(product >>> 64);
4063 // }
4064 // idx += 2;
4065 // if (idx > 0) {
4066 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
4067 // z[kdx+idx] = (jlong)product;
4068 // carry = (jlong)(product >>> 64);
4069 // }
4070
4071 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
4072 const Register jdx = R0;
4073
4074 // Scale the index.
4075 srdi_(jdx, idx, 2);
4076 beq(CR0, L_third_loop_exit);
4077 mtctr(jdx);
4078
4079 align(32, 16);
4080 bind(L_third_loop);
4081
4082 addi(idx, idx, -4);
4083
4084 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
4085 mr_if_needed(carry2, product_high);
4086
4087 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
4088 mr_if_needed(carry, product_high);
4089 bdnz(L_third_loop);
4090
4091 bind(L_third_loop_exit); // Handle any left-over operand parts.
4092
4093 andi_(idx, idx, 0x3);
4094 beq(CR0, L_post_third_loop_done);
4095
4096 Label L_check_1;
4097
4098 addic_(idx, idx, -2);
4099 blt(CR0, L_check_1);
4100
4101 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
4102 mr_if_needed(carry, product_high);
4103
4104 bind(L_check_1);
4105
4106 addi(idx, idx, 0x2);
4107 andi_(idx, idx, 0x1);
4108 addic_(idx, idx, -1);
4109 blt(CR0, L_post_third_loop_done);
4110
4111 sldi(tmp, idx, LogBytesPerInt);
4112 lwzx(yz_idx, y, tmp);
4113 multiply64(product_high, product, x_xstart, yz_idx);
4114 lwzx(yz_idx, z, tmp);
4115
4116 add2_with_carry(product_high, product, yz_idx, carry);
4117
4118 sldi(tmp, idx, LogBytesPerInt);
4119 stwx(product, z, tmp);
4120 srdi(product, product, 32);
4121
4122 sldi(product_high, product_high, 32);
4123 orr(product, product, product_high);
4124 mr_if_needed(carry, product);
4125
4126 bind(L_post_third_loop_done);
4127 } // multiply_128_x_128_loop
4128
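// Emits the muladd intrinsic: multiply each 32-bit element of 'in' by 'k' and add the
// product into 'out' at the given offset, propagating a 32-bit carry. Roughly
// (illustrative pseudocode mirroring the loop emitted below):
//
//   long carry = 0;
//   for (int j = len - 1; j >= 0; j--) {
//     long product = (in[j] & LONG_MASK) * (k & LONG_MASK) + (out[offset] & LONG_MASK) + carry;
//     out[offset--] = (int)product;
//     carry = product >>> 32;   // the final carry is left in the 'carry' register
//   }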
4129 void MacroAssembler::muladd(Register out, Register in,
4130 Register offset, Register len, Register k,
4131 Register tmp1, Register tmp2, Register carry) {
4132
4133 // Labels
4134 Label LOOP, SKIP;
4135
4136 // Make sure length is positive.
4137 cmpdi (CR0, len, 0);
4138
4139 // Prepare variables
4140 subi (offset, offset, 4);
4141 li (carry, 0);
4142 ble (CR0, SKIP);
4143
4144 mtctr (len);
4145 subi (len, len, 1 );
4146 sldi (len, len, 2 );
4147
4148 // Main loop
4149 bind(LOOP);
4150 lwzx (tmp1, len, in );
4151 lwzx (tmp2, offset, out );
4152 mulld (tmp1, tmp1, k );
4153 add (tmp2, carry, tmp2 );
4154 add (tmp2, tmp1, tmp2 );
4155 stwx (tmp2, offset, out );
4156 srdi (carry, tmp2, 32 );
4157 subi (offset, offset, 4 );
4158 subi (len, len, 4 );
4159 bdnz (LOOP);
4160 bind(SKIP);
4161 }
4162
4163 void MacroAssembler::multiply_to_len(Register x, Register xlen,
4164 Register y, Register ylen,
4165 Register z,
4166 Register tmp1, Register tmp2,
4167 Register tmp3, Register tmp4,
4168 Register tmp5, Register tmp6,
4169 Register tmp7, Register tmp8,
4170 Register tmp9, Register tmp10,
4171 Register tmp11, Register tmp12,
4172 Register tmp13) {
4173
4174 ShortBranchVerifier sbv(this);
4175
4176 assert_different_registers(x, xlen, y, ylen, z,
4177 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
4178 assert_different_registers(x, xlen, y, ylen, z,
4179 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
4180 assert_different_registers(x, xlen, y, ylen, z,
4181 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
4182
4183 const Register idx = tmp1;
4184 const Register kdx = tmp2;
4185 const Register xstart = tmp3;
4186
4187 const Register y_idx = tmp4;
4188 const Register carry = tmp5;
4189 const Register product = tmp6;
4190 const Register product_high = tmp7;
4191 const Register x_xstart = tmp8;
4192 const Register tmp = tmp9;
4193
4194 // First Loop.
4195 //
4196 // final static long LONG_MASK = 0xffffffffL;
4197 // int xstart = xlen - 1;
4198 // int ystart = ylen - 1;
4199 // long carry = 0;
  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
4201 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
4202 // z[kdx] = (int)product;
4203 // carry = product >>> 32;
4204 // }
4205 // z[xstart] = (int)carry;
4206
4207 mr_if_needed(idx, ylen); // idx = ylen
4208 add(kdx, xlen, ylen); // kdx = xlen + ylen
4209 li(carry, 0); // carry = 0
4210
4211 Label L_done;
4212
4213 addic_(xstart, xlen, -1);
4214 blt(CR0, L_done);
4215
4216 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
4217 carry, product_high, product, idx, kdx, tmp);
4218
4219 Label L_second_loop;
4220
4221 cmpdi(CR0, kdx, 0);
4222 beq(CR0, L_second_loop);
4223
4224 Label L_carry;
4225
4226 addic_(kdx, kdx, -1);
4227 beq(CR0, L_carry);
4228
4229 // Store lower 32 bits of carry.
4230 sldi(tmp, kdx, LogBytesPerInt);
4231 stwx(carry, z, tmp);
4232 srdi(carry, carry, 32);
4233 addi(kdx, kdx, -1);
4234
4235
4236 bind(L_carry);
4237
4238 // Store upper 32 bits of carry.
4239 sldi(tmp, kdx, LogBytesPerInt);
4240 stwx(carry, z, tmp);
4241
4242 // Second and third (nested) loops.
4243 //
4244 // for (int i = xstart-1; i >= 0; i--) { // Second loop
4245 // carry = 0;
4246 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
4247 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
4248 // (z[k] & LONG_MASK) + carry;
4249 // z[k] = (int)product;
4250 // carry = product >>> 32;
4251 // }
4252 // z[i] = (int)carry;
4253 // }
4254 //
4255 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
4256
4257 bind(L_second_loop);
4258
4259 li(carry, 0); // carry = 0;
4260
4261 addic_(xstart, xstart, -1); // i = xstart-1;
4262 blt(CR0, L_done);
4263
4264 Register zsave = tmp10;
4265
4266 mr(zsave, z);
4267
4268
4269 Label L_last_x;
4270
4271 sldi(tmp, xstart, LogBytesPerInt);
4272 add(z, z, tmp); // z = z + k - j
4273 addi(z, z, 4);
4274 addic_(xstart, xstart, -1); // i = xstart-1;
4275 blt(CR0, L_last_x);
4276
4277 sldi(tmp, xstart, LogBytesPerInt);
4278 ldx(x_xstart, x, tmp);
4279 #ifdef VM_LITTLE_ENDIAN
4280 rldicl(x_xstart, x_xstart, 32, 0);
4281 #endif
4282
4283
4284 Label L_third_loop_prologue;
4285
4286 bind(L_third_loop_prologue);
4287
4288 Register xsave = tmp11;
4289 Register xlensave = tmp12;
4290 Register ylensave = tmp13;
4291
4292 mr(xsave, x);
4293 mr(xlensave, xstart);
4294 mr(ylensave, ylen);
4295
4296
4297 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
4298 carry, product_high, product, x, tmp);
4299
4300 mr(z, zsave);
4301 mr(x, xsave);
4302 mr(xlen, xlensave); // This is the decrement of the loop counter!
4303 mr(ylen, ylensave);
4304
4305 addi(tmp3, xlen, 1);
4306 sldi(tmp, tmp3, LogBytesPerInt);
4307 stwx(carry, z, tmp);
4308 addic_(tmp3, tmp3, -1);
4309 blt(CR0, L_done);
4310
4311 srdi(carry, carry, 32);
4312 sldi(tmp, tmp3, LogBytesPerInt);
4313 stwx(carry, z, tmp);
4314 b(L_second_loop);
4315
  // The following infrequently executed code is placed outside the loops.
4317 bind(L_last_x);
4318
4319 lwz(x_xstart, 0, x);
4320 b(L_third_loop_prologue);
4321
4322 bind(L_done);
4323 } // multiply_to_len
4324
4325 #ifdef ASSERT
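// Trap (stop) with the given message unless the condition evaluated on CR0 holds.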
4326 void MacroAssembler::asm_assert(AsmAssertCond cond, const char *msg) {
4327 Label ok;
4328 switch (cond) {
4329 case eq:
4330 beq(CR0, ok);
4331 break;
4332 case ne:
4333 bne(CR0, ok);
4334 break;
4335 case ge:
4336 bge(CR0, ok);
4337 break;
4338 case gt:
4339 bgt(CR0, ok);
4340 break;
4341 case lt:
4342 blt(CR0, ok);
4343 break;
4344 case le:
4345 ble(CR0, ok);
4346 break;
4347 default:
4348 assert(false, "unknown cond:%d", cond);
4349 }
4350 stop(msg);
4351 bind(ok);
4352 }
4353
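// Load size (4 or 8) bytes at mem_offset(mem_base), compare the value against zero
// and assert the given condition on the result.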
4354 void MacroAssembler::asm_assert_mems_zero(AsmAssertCond cond, int size, int mem_offset,
4355 Register mem_base, const char* msg) {
4356 switch (size) {
4357 case 4:
4358 lwz(R0, mem_offset, mem_base);
4359 cmpwi(CR0, R0, 0);
4360 break;
4361 case 8:
4362 ld(R0, mem_offset, mem_base);
4363 cmpdi(CR0, R0, 0);
4364 break;
4365 default:
4366 ShouldNotReachHere();
4367 }
4368 asm_assert(cond, msg);
4369 }
4370 #endif // ASSERT
4371
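// Verify a (possibly compressed) oop: decode it if compressed oops are used,
// run the oop verification, then re-encode it.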
4372 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4373 if (!VerifyOops) { return; }
4374 if (UseCompressedOops) { decode_heap_oop(coop); }
4375 verify_oop(coop, msg);
4376 if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4377 }
4378
4379 // READ: oop. KILL: R0. Volatile floats perhaps.
4380 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4381 if (!VerifyOops) {
4382 return;
4383 }
4384
4385 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4386 const Register tmp = R11; // Will be preserved.
4387 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4388
4389 BLOCK_COMMENT("verify_oop {");
4390
4391 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4392
4393 mr_if_needed(R4_ARG2, oop);
4394 save_LR_CR(tmp); // save in old frame
4395 push_frame_reg_args(nbytes_save, tmp);
4396 // load FunctionDescriptor** / entry_address *
4397 load_const_optimized(tmp, fd, R0);
4398 // load FunctionDescriptor* / entry_address
4399 ld(tmp, 0, tmp);
4400 load_const_optimized(R3_ARG1, (address)msg, R0);
4401 // Call destination for its side effect.
4402 call_c(tmp);
4403
4404 pop_frame();
4405 restore_LR_CR(tmp);
4406 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4407
4408 BLOCK_COMMENT("} verify_oop");
4409 }
4410
4411 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4412 if (!VerifyOops) {
4413 return;
4414 }
4415
4416 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4417 const Register tmp = R11; // Will be preserved.
4418 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4419 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4420
4421 ld(R4_ARG2, offs, base);
4422 save_LR_CR(tmp); // save in old frame
4423 push_frame_reg_args(nbytes_save, tmp);
4424 // load FunctionDescriptor** / entry_address *
4425 load_const_optimized(tmp, fd, R0);
4426 // load FunctionDescriptor* / entry_address
4427 ld(tmp, 0, tmp);
4428 load_const_optimized(R3_ARG1, (address)msg, R0);
4429 // Call destination for its side effect.
4430 call_c(tmp);
4431
4432 pop_frame();
4433 restore_LR_CR(tmp);
4434 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4435 }
4436
// Halt execution: emit an unconditional trap encoding the stop type; the optional
// message pointer is embedded after the trap so the trap handler can report it.
4438 void MacroAssembler::stop(int type, const char* msg) {
4439 bool msg_present = (msg != nullptr);
4440
4441 #ifndef PRODUCT
4442 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4443 #else
4444 block_comment("stop {");
4445 #endif
4446
4447 if (msg_present) {
4448 type |= stop_msg_present;
4449 }
4450 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4451 if (msg_present) {
4452 emit_int64((uintptr_t)msg);
4453 }
4454
4455 block_comment("} stop;");
4456 }
4457
4458 #ifndef PRODUCT
4459 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4460 // Val, addr are temp registers.
4461 // If low == addr, addr is killed.
4462 // High is preserved.
4463 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4464 if (!ZapMemory) return;
4465
4466 assert_different_registers(low, val);
4467
4468 BLOCK_COMMENT("zap memory region {");
4469 load_const_optimized(val, 0x0101010101010101);
4470 int size = before + after;
4471 if (low == high && size < 5 && size > 0) {
4472 int offset = -before*BytesPerWord;
4473 for (int i = 0; i < size; ++i) {
4474 std(val, offset, low);
4475 offset += (1*BytesPerWord);
4476 }
4477 } else {
4478 addi(addr, low, -before*BytesPerWord);
4479 assert_different_registers(high, val);
4480 if (after) addi(high, high, after * BytesPerWord);
4481 Label loop;
4482 bind(loop);
4483 std(val, 0, addr);
4484 addi(addr, addr, 8);
4485 cmpd(CR6, addr, high);
4486 ble(CR6, loop);
4487 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
4488 }
4489 BLOCK_COMMENT("} zap memory region");
4490 }
4491
4492 #endif // !PRODUCT
4493
4494 void MacroAssembler::cache_wb(Address line) {
4495 assert(line.index() == noreg, "index should be noreg");
4496 assert(line.disp() == 0, "displacement should be 0");
4497 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
  // Data Cache Block Store (dcbst) is not really a flush: it works like a sync of the
  // cache line with persistent memory, i.e. it copies the cache line to persistent
  // memory without invalidating the cache line.
4501 dcbst(line.base());
4502 }
4503
4504 void MacroAssembler::cache_wbsync(bool is_presync) {
4505 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
  // We only need a post sync barrier. Post means _after_ a cache line flush or
  // store instruction, pre means a barrier emitted before such an instruction.
4508 if (!is_presync) {
4509 fence();
4510 }
4511 }
4512
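// Continuation support: update the thread's _cont_fastpath to the current SP
// unless SP is already at or below the recorded value (pop_cont_fastpath is
// the inverse operation).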
4513 void MacroAssembler::push_cont_fastpath() {
4514 if (!Continuations::enabled()) return;
4515
4516 Label done;
4517 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4518 cmpld(CR0, R1_SP, R0);
4519 ble(CR0, done); // if (SP <= _cont_fastpath) goto done;
4520 st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4521 bind(done);
4522 }
4523
4524 void MacroAssembler::pop_cont_fastpath() {
4525 if (!Continuations::enabled()) return;
4526
4527 Label done;
4528 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4529 cmpld(CR0, R1_SP, R0);
4530 blt(CR0, done); // if (SP < _cont_fastpath) goto done;
4531 li(R0, 0);
4532 st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4533 bind(done);
4534 }
4535
4536 // Function to flip between unlocked and locked state (fast locking).
4537 // Branches to failed if the state is not as expected with CR0 NE.
4538 // Falls through upon success with CR0 EQ.
4539 // This requires fewer instructions and registers and is easier to use than the
4540 // cmpxchg based implementation.
4541 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4542 assert_different_registers(obj, tmp, R0);
4543 Label retry;
4544
4545 if (semantics & MemBarRel) {
4546 release();
4547 }
4548
4549 bind(retry);
4550 STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4551 if (!is_unlock) {
4552 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4553 xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
4554 andi_(R0, tmp, markWord::lock_mask_in_place);
4555 bne(CR0, failed); // failed if new header doesn't contain locked_value (which is 0)
4556 } else {
4557 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4558 andi_(R0, tmp, markWord::lock_mask_in_place);
4559 bne(CR0, failed); // failed if old header doesn't contain locked_value (which is 0)
4560 ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
4561 }
4562 stdcx_(tmp, obj);
4563 bne(CR0, retry);
4564
4565 if (semantics & MemBarFenceAfter) {
4566 fence();
4567 } else if (semantics & MemBarAcq) {
4568 isync();
4569 }
4570 }
4571
4572 // Implements fast-locking.
4573 //
// - box: the BasicObjectLock; its object monitor cache is cleared if UseObjectMonitorTable
// - obj: the object to be locked
// - t1, t2: temporary registers
4576 void MacroAssembler::fast_lock(Register box, Register obj, Register t1, Register t2, Label& slow) {
4577 assert_different_registers(box, obj, t1, t2, R0);
4578
4579 Label push;
4580 const Register t = R0;
4581
4582 if (UseObjectMonitorTable) {
4583 // Clear cache in case fast locking succeeds or we need to take the slow-path.
4584 li(t, 0);
4585 std(t, in_bytes(BasicObjectLock::lock_offset()) + BasicLock::object_monitor_cache_offset_in_bytes(), box);
4586 }
4587
4588 if (DiagnoseSyncOnValueBasedClasses != 0) {
4589 load_klass(t1, obj);
4590 lbz(t1, in_bytes(Klass::misc_flags_offset()), t1);
4591 testbitdi(CR0, R0, t1, exact_log2(KlassFlags::_misc_is_value_based_class));
4592 bne(CR0, slow);
4593 }
4594
4595 const Register top = t1;
4596 const Register mark = t2;
4597
4598 // Check if the lock-stack is full.
4599 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4600 cmplwi(CR0, top, LockStack::end_offset());
4601 bge(CR0, slow);
4602
4603 // The underflow check is elided. The recursive check will always fail
4604 // when the lock stack is empty because of the _bad_oop_sentinel field.
4605
4606 // Check for recursion.
4607 subi(t, top, oopSize);
4608 ldx(t, R16_thread, t);
4609 cmpd(CR0, obj, t);
4610 beq(CR0, push);
4611
4612 // Check header for monitor (0b10) or locked (0b00).
4613 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4614 xori(t, mark, markWord::unlocked_value);
4615 andi_(t, t, markWord::lock_mask_in_place);
4616 bne(CR0, slow);
4617
4618 // Try to lock. Transition lock bits 0b01 => 0b00
4619 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq);
4620
4621 bind(push);
4622 // After successful lock, push object on lock-stack
4623 stdx(obj, R16_thread, top);
4624 addi(top, top, oopSize);
4625 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4626 }
4627
4628 // Implements fast-unlocking.
4629 //
4630 // - obj: the object to be unlocked
4631 // - t1: temporary register
4632 void MacroAssembler::fast_unlock(Register obj, Register t1, Label& slow) {
4633 assert_different_registers(obj, t1);
4634
4635 #ifdef ASSERT
4636 {
4637 // The following checks rely on the fact that LockStack is only ever modified by
4638 // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4639 // entries after inflation will happen delayed in that case.
4640
4641 // Check for lock-stack underflow.
4642 Label stack_ok;
4643 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4644 cmplwi(CR0, t1, LockStack::start_offset());
4645 bge(CR0, stack_ok);
4646 stop("Lock-stack underflow");
4647 bind(stack_ok);
4648 }
4649 #endif
4650
4651 Label unlocked, push_and_slow;
4652 const Register top = t1;
4653 const Register mark = R0;
4654 Register t = R0;
4655
4656 // Check if obj is top of lock-stack.
4657 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4658 subi(top, top, oopSize);
4659 ldx(t, R16_thread, top);
4660 cmpd(CR0, obj, t);
4661 bne(CR0, slow);
4662
4663 // Pop lock-stack.
4664 DEBUG_ONLY(li(t, 0);)
4665 DEBUG_ONLY(stdx(t, R16_thread, top);)
4666 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4667
4668 // The underflow check is elided. The recursive check will always fail
4669 // when the lock stack is empty because of the _bad_oop_sentinel field.
4670
4671 // Check if recursive.
4672 subi(t, top, oopSize);
4673 ldx(t, R16_thread, t);
4674 cmpd(CR0, obj, t);
4675 beq(CR0, unlocked);
4676
4677 // Use top as tmp
4678 t = top;
4679
4680 // Not recursive. Check header for monitor (0b10).
4681 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4682 andi_(t, mark, markWord::monitor_value);
4683 bne(CR0, push_and_slow);
4684
4685 #ifdef ASSERT
4686 // Check header not unlocked (0b01).
4687 Label not_unlocked;
4688 andi_(t, mark, markWord::unlocked_value);
4689 beq(CR0, not_unlocked);
4690 stop("fast_unlock already unlocked");
4691 bind(not_unlocked);
4692 #endif
4693
4694 // Try to unlock. Transition lock bits 0b00 => 0b01
4695 atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel);
4696 b(unlocked);
4697
4698 bind(push_and_slow);
4699
4700 // Restore lock-stack and handle the unlock in runtime.
4701 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4702 DEBUG_ONLY(stdx(obj, R16_thread, top);)
4703 addi(top, top, oopSize);
4704 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4705 b(slow);
4706
4707 bind(unlocked);
4708 }