1 /*
2 * Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2012, 2023 SAP SE. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "precompiled.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "compiler/disassembler.hpp"
29 #include "gc/shared/collectedHeap.inline.hpp"
30 #include "gc/shared/barrierSet.hpp"
31 #include "gc/shared/barrierSetAssembler.hpp"
32 #include "interpreter/interpreter.hpp"
33 #include "memory/resourceArea.hpp"
34 #include "nativeInst_ppc.hpp"
35 #include "oops/klass.inline.hpp"
36 #include "oops/methodData.hpp"
37 #include "prims/methodHandles.hpp"
38 #include "register_ppc.hpp"
39 #include "runtime/icache.hpp"
40 #include "runtime/interfaceSupport.inline.hpp"
41 #include "runtime/objectMonitor.hpp"
42 #include "runtime/os.hpp"
43 #include "runtime/safepoint.hpp"
44 #include "runtime/safepointMechanism.hpp"
45 #include "runtime/sharedRuntime.hpp"
46 #include "runtime/stubRoutines.hpp"
47 #include "runtime/vm_version.hpp"
48 #include "utilities/macros.hpp"
49 #include "utilities/powerOfTwo.hpp"
50
51 #ifdef PRODUCT
52 #define BLOCK_COMMENT(str) // nothing
53 #else
54 #define BLOCK_COMMENT(str) block_comment(str)
55 #endif
56 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
57
58 #ifdef ASSERT
59 // On RISC, there's no benefit to verifying instruction boundaries.
60 bool AbstractAssembler::pd_check_instruction_mark() { return false; }
61 #endif
62
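// Load a doubleword from a + si31, where si31 is a non-negative offset of up
// to 31 bits. If the offset fits into the 16-bit ld displacement, a single ld
// is emitted (plus an optional filler nop to keep the sequence at a fixed
// two-instruction size); otherwise the offset is split into addis + ld.
// ld_largeoffset below additionally asserts that d and a differ.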
63 void MacroAssembler::ld_largeoffset_unchecked(Register d, int si31, Register a, int emit_filler_nop) {
64 assert(Assembler::is_simm(si31, 31) && si31 >= 0, "si31 out of range");
65 if (Assembler::is_simm(si31, 16)) {
66 ld(d, si31, a);
67 if (emit_filler_nop) nop();
68 } else {
69 const int hi = MacroAssembler::largeoffset_si16_si16_hi(si31);
70 const int lo = MacroAssembler::largeoffset_si16_si16_lo(si31);
71 addis(d, a, hi);
72 ld(d, lo, d);
73 }
74 }
75
76 void MacroAssembler::ld_largeoffset(Register d, int si31, Register a, int emit_filler_nop) {
77 assert_different_registers(d, a);
78 ld_largeoffset_unchecked(d, si31, a, emit_filler_nop);
79 }
80
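// Load a value of size_in_bytes (1, 2, 4 or 8) from base + offs into dst,
// sign- or zero-extending sub-word values as requested. store_sized_value
// below is the corresponding store (no extension needed there).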
81 void MacroAssembler::load_sized_value(Register dst, RegisterOrConstant offs, Register base,
82 size_t size_in_bytes, bool is_signed) {
83 switch (size_in_bytes) {
84 case 8: ld(dst, offs, base); break;
85 case 4: is_signed ? lwa(dst, offs, base) : lwz(dst, offs, base); break;
86 case 2: is_signed ? lha(dst, offs, base) : lhz(dst, offs, base); break;
87 case 1: lbz(dst, offs, base); if (is_signed) extsb(dst, dst); break; // lba doesn't exist :(
88 default: ShouldNotReachHere();
89 }
90 }
91
92 void MacroAssembler::store_sized_value(Register dst, RegisterOrConstant offs, Register base,
93 size_t size_in_bytes) {
94 switch (size_in_bytes) {
95 case 8: std(dst, offs, base); break;
96 case 4: stw(dst, offs, base); break;
97 case 2: sth(dst, offs, base); break;
98 case 1: stb(dst, offs, base); break;
99 default: ShouldNotReachHere();
100 }
101 }
102
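// Pad with nops until the code offset is congruent to `rem' modulo `modulus';
// if that would take more than `max' bytes of padding, emit nothing.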
103 void MacroAssembler::align(int modulus, int max, int rem) {
104 int padding = (rem + modulus - (offset() % modulus)) % modulus;
105 if (padding > max) return;
106 for (int c = (padding >> 2); c > 0; --c) { nop(); }
107 }
108
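// If the next 4-byte slot is the last one of a 64-byte block, emit a nop so
// that an 8-byte prefixed instruction emitted afterwards starts at the block
// boundary instead of crossing it (prefixed instructions must not cross a
// 64-byte boundary).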
109 void MacroAssembler::align_prefix() {
110 if (is_aligned(offset() + BytesPerInstWord, 64)) { nop(); }
111 }
112
113 // Issue instructions that calculate the given address from the global TOC.
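// The hi16/lo16 flags select which half of the addis/addi pair is emitted;
// emit_dummy_addr emits a placeholder offset that is fixed up later via
// patch_calculate_address_from_global_toc_at.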
114 void MacroAssembler::calculate_address_from_global_toc(Register dst, address addr, bool hi16, bool lo16,
115 bool add_relocation, bool emit_dummy_addr) {
116 int offset = -1;
117 if (emit_dummy_addr) {
118 offset = -128; // dummy address
119 } else if (addr != (address)(intptr_t)-1) {
120 offset = MacroAssembler::offset_to_global_toc(addr);
121 }
122
123 if (hi16) {
124 addis(dst, R29_TOC, MacroAssembler::largeoffset_si16_si16_hi(offset));
125 }
126 if (lo16) {
127 if (add_relocation) {
128 // Relocate at the addi to avoid confusion with a load from the method's TOC.
129 relocate(internal_word_Relocation::spec(addr));
130 }
131 addi(dst, dst, MacroAssembler::largeoffset_si16_si16_lo(offset));
132 }
133 }
134
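// Patch an addis/addi pair emitted by calculate_address_from_global_toc so that
// it materializes `addr'. `a' points to the addi (the relocated instruction);
// the matching addis is searched backwards from there, but not below `bound'.
// Returns the address of the addis.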
135 address MacroAssembler::patch_calculate_address_from_global_toc_at(address a, address bound, address addr) {
136 const int offset = MacroAssembler::offset_to_global_toc(addr);
137
138 const address inst2_addr = a;
139 const int inst2 = *(int *)inst2_addr;
140
141 // The relocation points to the second instruction, the addi,
142 // and the addi reads and writes the same register dst.
143 const int dst = inv_rt_field(inst2);
144 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
145
146 // Now, find the preceding addis which writes to dst.
147 int inst1 = 0;
148 address inst1_addr = inst2_addr - BytesPerInstWord;
149 while (inst1_addr >= bound) {
150 inst1 = *(int *) inst1_addr;
151 if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
152 // Stop, found the addis which writes dst.
153 break;
154 }
155 inst1_addr -= BytesPerInstWord;
156 }
157
158 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
159 set_imm((int *)inst1_addr, MacroAssembler::largeoffset_si16_si16_hi(offset));
160 set_imm((int *)inst2_addr, MacroAssembler::largeoffset_si16_si16_lo(offset));
161 return inst1_addr;
162 }
163
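// Decode the address materialized by such an addis/addi pair; `a' again points
// to the addi and `bound' limits the backward search for the addis.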
164 address MacroAssembler::get_address_of_calculate_address_from_global_toc_at(address a, address bound) {
165 const address inst2_addr = a;
166 const int inst2 = *(int *)inst2_addr;
167
168 // The relocation points to the second instruction, the addi,
169 // and the addi reads and writes the same register dst.
170 const int dst = inv_rt_field(inst2);
171 assert(is_addi(inst2) && inv_ra_field(inst2) == dst, "must be addi reading and writing dst");
172
173 // Now, find the preceding addis which writes to dst.
174 int inst1 = 0;
175 address inst1_addr = inst2_addr - BytesPerInstWord;
176 while (inst1_addr >= bound) {
177 inst1 = *(int *) inst1_addr;
178 if (is_addis(inst1) && inv_rt_field(inst1) == dst) {
179 // stop, found the addis which writes dst
180 break;
181 }
182 inst1_addr -= BytesPerInstWord;
183 }
184
185 assert(is_addis(inst1) && inv_ra_field(inst1) == 29 /* R29 */, "source must be global TOC");
186
187 int offset = (get_imm(inst1_addr, 0) << 16) + get_imm(inst2_addr, 0);
188 // -1 is a special case
189 if (offset == -1) {
190 return (address)(intptr_t)-1;
191 } else {
192 return global_toc() + offset;
193 }
194 }
195
196 #ifdef _LP64
197 // Patch compressed oops or klass constants.
198 // Assembler sequence is
199 // 1) compressed oops:
200 // lis rx = const.hi
201 // ori rx = rx | const.lo
202 // 2) compressed klass:
203 // lis rx = const.hi
204 // clrldi rx = rx & 0xFFFFffff // clearMS32b, optional
205 // ori rx = rx | const.lo
206 // The optional clrldi is simply skipped over when patching.
207 address MacroAssembler::patch_set_narrow_oop(address a, address bound, narrowOop data) {
208 assert(UseCompressedOops, "Should only patch compressed oops");
209
210 const address inst2_addr = a;
211 const int inst2 = *(int *)inst2_addr;
212
213 // The relocation points to the second instruction, the ori,
214 // and the ori reads and writes the same register dst.
215 const int dst = inv_rta_field(inst2);
216 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
217 // Now, find the preceding addis which writes to dst.
218 int inst1 = 0;
219 address inst1_addr = inst2_addr - BytesPerInstWord;
220 bool inst1_found = false;
221 while (inst1_addr >= bound) {
222 inst1 = *(int *)inst1_addr;
223 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break; }
224 inst1_addr -= BytesPerInstWord;
225 }
226 assert(inst1_found, "inst is not lis");
227
228 uint32_t data_value = CompressedOops::narrow_oop_value(data);
229 int xc = (data_value >> 16) & 0xffff;
230 int xd = (data_value >> 0) & 0xffff;
231
232 set_imm((int *)inst1_addr, (short)(xc)); // see enc_load_con_narrow_hi/_lo
233 set_imm((int *)inst2_addr, (xd)); // unsigned int
234 return inst1_addr;
235 }
236
237 // Get compressed oop constant.
238 narrowOop MacroAssembler::get_narrow_oop(address a, address bound) {
239 assert(UseCompressedOops, "Should only patch compressed oops");
240
241 const address inst2_addr = a;
242 const int inst2 = *(int *)inst2_addr;
243
244 // The relocation points to the second instruction, the ori,
245 // and the ori reads and writes the same register dst.
246 const int dst = inv_rta_field(inst2);
247 assert(is_ori(inst2) && inv_rs_field(inst2) == dst, "must be ori reading and writing dst");
248 // Now, find the preceding lis which writes to dst.
249 int inst1 = 0;
250 address inst1_addr = inst2_addr - BytesPerInstWord;
251 bool inst1_found = false;
252
253 while (inst1_addr >= bound) {
254 inst1 = *(int *) inst1_addr;
255 if (is_lis(inst1) && inv_rs_field(inst1) == dst) { inst1_found = true; break;}
256 inst1_addr -= BytesPerInstWord;
257 }
258 assert(inst1_found, "inst is not lis");
259
260 uint xl = ((unsigned int) (get_imm(inst2_addr, 0) & 0xffff));
261 uint xh = (((get_imm(inst1_addr, 0)) & 0xffff) << 16);
262
263 return CompressedOops::narrow_oop_cast(xl | xh);
264 }
265 #endif // _LP64
266
267 // Returns true if successful.
268 bool MacroAssembler::load_const_from_method_toc(Register dst, AddressLiteral& a,
269 Register toc, bool fixed_size) {
270 int toc_offset = 0;
271 // Use RelocationHolder::none for the constant pool entry, otherwise
272 // we will end up with a failing NativeCall::verify(x) where x is
273 // the address of the constant pool entry.
274 // FIXME: We should insert relocation information for oops at the constant
275 // pool entries instead of inserting it at the loads; patching of a constant
276 // pool entry should be less expensive.
277 address const_address = address_constant((address)a.value(), RelocationHolder::none);
278 if (const_address == nullptr) { return false; } // allocation failure
279 // Relocate at the pc of the load.
280 relocate(a.rspec());
281 toc_offset = (int)(const_address - code()->consts()->start());
282 ld_largeoffset_unchecked(dst, toc_offset, toc, fixed_size);
283 return true;
284 }
285
286 bool MacroAssembler::is_load_const_from_method_toc_at(address a) {
287 const address inst1_addr = a;
288 const int inst1 = *(int *)inst1_addr;
289
290 // The relocation points to the ld or the addis.
291 return (is_ld(inst1)) ||
292 (is_addis(inst1) && inv_ra_field(inst1) != 0);
293 }
294
295 int MacroAssembler::get_offset_of_load_const_from_method_toc_at(address a) {
296 assert(is_load_const_from_method_toc_at(a), "must be load_const_from_method_toc");
297
298 const address inst1_addr = a;
299 const int inst1 = *(int *)inst1_addr;
300
301 if (is_ld(inst1)) {
302 return inv_d1_field(inst1);
303 } else if (is_addis(inst1)) {
304 const int dst = inv_rt_field(inst1);
305
306 // Now, find the succeeding ld which reads and writes to dst.
307 address inst2_addr = inst1_addr + BytesPerInstWord;
308 int inst2 = 0;
309 while (true) {
310 inst2 = *(int *) inst2_addr;
311 if (is_ld(inst2) && inv_ra_field(inst2) == dst && inv_rt_field(inst2) == dst) {
312 // Stop, found the ld which reads and writes dst.
313 break;
314 }
315 inst2_addr += BytesPerInstWord;
316 }
317 return (inv_d1_field(inst1) << 16) + inv_d1_field(inst2);
318 }
319 ShouldNotReachHere();
320 return 0;
321 }
322
323 // Get the constant from a `load_const' sequence.
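// The decoding below assumes one of the two 5-instruction `load_const' layouts.
// The table shows which halfword of x sits in which instruction slot, as implied
// by the get_imm/set_imm indices used here; the mnemonics of the slots that are
// not inspected are an assumption based on load_const:
//   one-register variant:           two-register variant:
//     0: lis   x >> 48                0: lis   x >> 48
//     1: ori  (x >> 32) & 0xffff      1: lis  (x >> 16) & 0xffff
//     2: sldi (no immediate)          2: ori  (x >> 32) & 0xffff
//     3: oris (x >> 16) & 0xffff      3: ori   x        & 0xffff
//     4: ori   x        & 0xffff      4: merge (no immediate)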
324 long MacroAssembler::get_const(address a) {
325 assert(is_load_const_at(a), "not a load of a constant");
326 const int *p = (const int*) a;
327 unsigned long x = (((unsigned long) (get_imm(a,0) & 0xffff)) << 48);
328 if (is_ori(*(p+1))) {
329 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 32);
330 x |= (((unsigned long) (get_imm(a,3) & 0xffff)) << 16);
331 x |= (((unsigned long) (get_imm(a,4) & 0xffff)));
332 } else if (is_lis(*(p+1))) {
333 x |= (((unsigned long) (get_imm(a,2) & 0xffff)) << 32);
334 x |= (((unsigned long) (get_imm(a,1) & 0xffff)) << 16);
335 x |= (((unsigned long) (get_imm(a,3) & 0xffff)));
336 } else {
337 ShouldNotReachHere();
338 return (long) 0;
339 }
340 return (long) x;
341 }
342
343 // Patch the 64-bit constant of a `load_const' sequence. This is a
344 // low-level procedure; it neither flushes the instruction cache nor
345 // is it MT-safe.
346 void MacroAssembler::patch_const(address a, long x) {
347 assert(is_load_const_at(a), "not a load of a constant");
348 int *p = (int*) a;
349 if (is_ori(*(p+1))) {
350 set_imm(0 + p, (x >> 48) & 0xffff);
351 set_imm(1 + p, (x >> 32) & 0xffff);
352 set_imm(3 + p, (x >> 16) & 0xffff);
353 set_imm(4 + p, x & 0xffff);
354 } else if (is_lis(*(p+1))) {
355 set_imm(0 + p, (x >> 48) & 0xffff);
356 set_imm(2 + p, (x >> 32) & 0xffff);
357 set_imm(1 + p, (x >> 16) & 0xffff);
358 set_imm(3 + p, x & 0xffff);
359 } else {
360 ShouldNotReachHere();
361 }
362 }
363
364 AddressLiteral MacroAssembler::allocate_metadata_address(Metadata* obj) {
365 assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
366 int index = oop_recorder()->allocate_metadata_index(obj);
367 RelocationHolder rspec = metadata_Relocation::spec(index);
368 return AddressLiteral((address)obj, rspec);
369 }
370
371 AddressLiteral MacroAssembler::constant_metadata_address(Metadata* obj) {
372 assert(oop_recorder() != nullptr, "this assembler needs a Recorder");
373 int index = oop_recorder()->find_index(obj);
374 RelocationHolder rspec = metadata_Relocation::spec(index);
375 return AddressLiteral((address)obj, rspec);
376 }
377
378 AddressLiteral MacroAssembler::allocate_oop_address(jobject obj) {
379 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
380 int oop_index = oop_recorder()->allocate_oop_index(obj);
381 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
382 }
383
384 AddressLiteral MacroAssembler::constant_oop_address(jobject obj) {
385 assert(oop_recorder() != nullptr, "this assembler needs an OopRecorder");
386 int oop_index = oop_recorder()->find_index(obj);
387 return AddressLiteral(address(obj), oop_Relocation::spec(oop_index));
388 }
389
390 #ifndef PRODUCT
391 void MacroAssembler::pd_print_patched_instruction(address branch) {
392 Unimplemented(); // TODO: PPC port
393 }
394 #endif // ndef PRODUCT
395
396 // Conditional far branch for destinations encodable in 24+2 bits.
397 void MacroAssembler::bc_far(int boint, int biint, Label& dest, int optimize) {
398
399 // If requested by flag optimize, relocate the bc_far as a
400 // runtime_call and prepare for optimizing it when the code gets
401 // relocated.
402 if (optimize == bc_far_optimize_on_relocate) {
403 relocate(relocInfo::runtime_call_type);
404 }
405
406 // variant 2:
407 //
408 // b!cxx SKIP
409 // bxx DEST
410 // SKIP:
411 //
412
413 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
414 opposite_bcond(inv_boint_bcond(boint)));
415
416 // We emit two branches.
417 // First, a conditional branch which jumps around the far branch.
418 const address not_taken_pc = pc() + 2 * BytesPerInstWord;
419 const address bc_pc = pc();
420 bc(opposite_boint, biint, not_taken_pc);
421
422 const int bc_instr = *(int*)bc_pc;
423 assert(not_taken_pc == (address)inv_bd_field(bc_instr, (intptr_t)bc_pc), "postcondition");
424 assert(opposite_boint == inv_bo_field(bc_instr), "postcondition");
425 assert(boint == add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(bc_instr))),
426 opposite_bcond(inv_boint_bcond(inv_bo_field(bc_instr)))),
427 "postcondition");
428 assert(biint == inv_bi_field(bc_instr), "postcondition");
429
430 // Second, an unconditional far branch which jumps to dest.
431 // Note: target(dest) remembers the current pc (see CodeSection::target)
432 // and returns the current pc if the label is not bound yet; when
433 // the label gets bound, the unconditional far branch will be patched.
434 const address target_pc = target(dest);
435 const address b_pc = pc();
436 b(target_pc);
437
438 assert(not_taken_pc == pc(), "postcondition");
439 assert(dest.is_bound() || target_pc == b_pc, "postcondition");
440 }
441
442 // 1 or 2 instructions
443 void MacroAssembler::bc_far_optimized(int boint, int biint, Label& dest) {
444 if (dest.is_bound() && is_within_range_of_bcxx(target(dest), pc())) {
445 bc(boint, biint, dest);
446 } else {
447 bc_far(boint, biint, dest, MacroAssembler::bc_far_optimize_on_relocate);
448 }
449 }
450
451 bool MacroAssembler::is_bc_far_at(address instruction_addr) {
452 return is_bc_far_variant1_at(instruction_addr) ||
453 is_bc_far_variant2_at(instruction_addr) ||
454 is_bc_far_variant3_at(instruction_addr);
455 }
456
457 address MacroAssembler::get_dest_of_bc_far_at(address instruction_addr) {
458 if (is_bc_far_variant1_at(instruction_addr)) {
459 const address instruction_1_addr = instruction_addr;
460 const int instruction_1 = *(int*)instruction_1_addr;
461 return (address)inv_bd_field(instruction_1, (intptr_t)instruction_1_addr);
462 } else if (is_bc_far_variant2_at(instruction_addr)) {
463 const address instruction_2_addr = instruction_addr + 4;
464 return bxx_destination(instruction_2_addr);
465 } else if (is_bc_far_variant3_at(instruction_addr)) {
466 return instruction_addr + 8;
467 }
468 // variant 4 ???
469 ShouldNotReachHere();
470 return nullptr;
471 }
472 void MacroAssembler::set_dest_of_bc_far_at(address instruction_addr, address dest) {
473
474 if (is_bc_far_variant3_at(instruction_addr)) {
475 // variant 3, far cond branch to the next instruction, already patched to nops:
476 //
477 // nop
478 // endgroup
479 // SKIP/DEST:
480 //
481 return;
482 }
483
484 // first, extract boint and biint from the current branch
485 int boint = 0;
486 int biint = 0;
487
488 ResourceMark rm;
489 const int code_size = 2 * BytesPerInstWord;
490 CodeBuffer buf(instruction_addr, code_size);
491 MacroAssembler masm(&buf);
492 if (is_bc_far_variant2_at(instruction_addr) && dest == instruction_addr + 8) {
493 // Far branch to next instruction: Optimize it by patching nops (produce variant 3).
494 masm.nop();
495 masm.endgroup();
496 } else {
497 if (is_bc_far_variant1_at(instruction_addr)) {
498 // variant 1, the 1st instruction contains the destination address:
499 //
500 // bcxx DEST
501 // nop
502 //
503 const int instruction_1 = *(int*)(instruction_addr);
504 boint = inv_bo_field(instruction_1);
505 biint = inv_bi_field(instruction_1);
506 } else if (is_bc_far_variant2_at(instruction_addr)) {
507 // variant 2, the 2nd instruction contains the destination address:
508 //
509 // b!cxx SKIP
510 // bxx DEST
511 // SKIP:
512 //
513 const int instruction_1 = *(int*)(instruction_addr);
514 boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(inv_bo_field(instruction_1))),
515 opposite_bcond(inv_boint_bcond(inv_bo_field(instruction_1))));
516 biint = inv_bi_field(instruction_1);
517 } else {
518 // variant 4???
519 ShouldNotReachHere();
520 }
521
522 // second, set the new branch destination and optimize the code
523 if (dest != instruction_addr + 4 && // the bc_far is still unbound!
524 masm.is_within_range_of_bcxx(dest, instruction_addr)) {
525 // variant 1:
526 //
527 // bcxx DEST
528 // nop
529 //
530 masm.bc(boint, biint, dest);
531 masm.nop();
532 } else {
533 // variant 2:
534 //
535 // b!cxx SKIP
536 // bxx DEST
537 // SKIP:
538 //
539 const int opposite_boint = add_bhint_to_boint(opposite_bhint(inv_boint_bhint(boint)),
540 opposite_bcond(inv_boint_bcond(boint)));
541 const address not_taken_pc = masm.pc() + 2 * BytesPerInstWord;
542 masm.bc(opposite_boint, biint, not_taken_pc);
543 masm.b(dest);
544 }
545 }
546 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
547 }
548
549 // Emit a patchable (NOT mt-safe) 64-bit absolute call/jump.
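// Both variants emitted below occupy seven instructions, so a call site can
// later be repatched in place between the pc-relative and the absolute form;
// see set_dest_of_bxx64_patchable_at, which simply re-emits the sequence.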
550 void MacroAssembler::bxx64_patchable(address dest, relocInfo::relocType rt, bool link) {
551 // get current pc
552 uint64_t start_pc = (uint64_t) pc();
553
554 const address pc_of_bl = (address) (start_pc + (6*BytesPerInstWord)); // bl is last
555 const address pc_of_b = (address) (start_pc + (0*BytesPerInstWord)); // b is first
556
557 // relocate here
558 if (rt != relocInfo::none) {
559 relocate(rt);
560 }
561
562 if ( ReoptimizeCallSequences &&
563 (( link && is_within_range_of_b(dest, pc_of_bl)) ||
564 (!link && is_within_range_of_b(dest, pc_of_b)))) {
565 // variant 2:
566 // Emit an optimized, pc-relative call/jump.
567
568 if (link) {
569 // some padding
570 nop();
571 nop();
572 nop();
573 nop();
574 nop();
575 nop();
576
577 // do the call
578 assert(pc() == pc_of_bl, "just checking");
579 bl(dest, relocInfo::none);
580 } else {
581 // do the jump
582 assert(pc() == pc_of_b, "just checking");
583 b(dest, relocInfo::none);
584
585 // some padding
586 nop();
587 nop();
588 nop();
589 nop();
590 nop();
591 nop();
592 }
593
594 // Assert that we can identify the emitted call/jump.
595 assert(is_bxx64_patchable_variant2_at((address)start_pc, link),
596 "can't identify emitted call");
597 } else {
598 // variant 1:
599 mr(R0, R11); // spill R11 -> R0.
600
601 // Load the destination address into CTR,
602 // calculate destination relative to global toc.
603 calculate_address_from_global_toc(R11, dest, true, true, false);
604
605 mtctr(R11);
606 mr(R11, R0); // spill R11 <- R0.
607 nop();
608
609 // do the call/jump
610 if (link) {
611 bctrl();
612 } else {
613 bctr();
614 }
615 // Assert that we can identify the emitted call/jump.
616 assert(is_bxx64_patchable_variant1b_at((address)start_pc, link),
617 "can't identify emitted call");
618 }
619
620 // Assert that we can identify the emitted call/jump.
621 assert(is_bxx64_patchable_at((address)start_pc, link),
622 "can't identify emitted call");
623 assert(get_dest_of_bxx64_patchable_at((address)start_pc, link) == dest,
624 "wrong encoding of dest address");
625 }
626
627 // Identify a bxx64_patchable instruction.
628 bool MacroAssembler::is_bxx64_patchable_at(address instruction_addr, bool link) {
629 return is_bxx64_patchable_variant1b_at(instruction_addr, link)
630 //|| is_bxx64_patchable_variant1_at(instruction_addr, link)
631 || is_bxx64_patchable_variant2_at(instruction_addr, link);
632 }
633
634 // Does the call64_patchable instruction use a pc-relative encoding of
635 // the call destination?
636 bool MacroAssembler::is_bxx64_patchable_pcrelative_at(address instruction_addr, bool link) {
637 // variant 2 is pc-relative
638 return is_bxx64_patchable_variant2_at(instruction_addr, link);
639 }
640
641 // Identify variant 1.
642 bool MacroAssembler::is_bxx64_patchable_variant1_at(address instruction_addr, bool link) {
643 unsigned int* instr = (unsigned int*) instruction_addr;
644 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
645 && is_mtctr(instr[5]) // mtctr
646 && is_load_const_at(instruction_addr);
647 }
648
649 // Identify variant 1b: load destination relative to global toc.
650 bool MacroAssembler::is_bxx64_patchable_variant1b_at(address instruction_addr, bool link) {
651 unsigned int* instr = (unsigned int*) instruction_addr;
652 return (link ? is_bctrl(instr[6]) : is_bctr(instr[6])) // bctr[l]
653 && is_mtctr(instr[3]) // mtctr
654 && is_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord, instruction_addr);
655 }
656
657 // Identify variant 2.
658 bool MacroAssembler::is_bxx64_patchable_variant2_at(address instruction_addr, bool link) {
659 unsigned int* instr = (unsigned int*) instruction_addr;
660 if (link) {
661 return is_bl (instr[6]) // bl dest is last
662 && is_nop(instr[0]) // nop
663 && is_nop(instr[1]) // nop
664 && is_nop(instr[2]) // nop
665 && is_nop(instr[3]) // nop
666 && is_nop(instr[4]) // nop
667 && is_nop(instr[5]); // nop
668 } else {
669 return is_b (instr[0]) // b dest is first
670 && is_nop(instr[1]) // nop
671 && is_nop(instr[2]) // nop
672 && is_nop(instr[3]) // nop
673 && is_nop(instr[4]) // nop
674 && is_nop(instr[5]) // nop
675 && is_nop(instr[6]); // nop
676 }
677 }
678
679 // Set dest address of a bxx64_patchable instruction.
680 void MacroAssembler::set_dest_of_bxx64_patchable_at(address instruction_addr, address dest, bool link) {
681 ResourceMark rm;
682 int code_size = MacroAssembler::bxx64_patchable_size;
683 CodeBuffer buf(instruction_addr, code_size);
684 MacroAssembler masm(&buf);
685 masm.bxx64_patchable(dest, relocInfo::none, link);
686 ICache::ppc64_flush_icache_bytes(instruction_addr, code_size);
687 }
688
689 // Get dest address of a bxx64_patchable instruction.
690 address MacroAssembler::get_dest_of_bxx64_patchable_at(address instruction_addr, bool link) {
691 if (is_bxx64_patchable_variant1_at(instruction_addr, link)) {
692 return (address) (unsigned long) get_const(instruction_addr);
693 } else if (is_bxx64_patchable_variant2_at(instruction_addr, link)) {
694 unsigned int* instr = (unsigned int*) instruction_addr;
695 if (link) {
696 const int instr_idx = 6; // bl is last
697 int branchoffset = branch_destination(instr[instr_idx], 0);
698 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
699 } else {
700 const int instr_idx = 0; // b is first
701 int branchoffset = branch_destination(instr[instr_idx], 0);
702 return instruction_addr + branchoffset + instr_idx*BytesPerInstWord;
703 }
704 // Load dest relative to global toc.
705 } else if (is_bxx64_patchable_variant1b_at(instruction_addr, link)) {
706 return get_address_of_calculate_address_from_global_toc_at(instruction_addr + 2*BytesPerInstWord,
707 instruction_addr);
708 } else {
709 ShouldNotReachHere();
710 return nullptr;
711 }
712 }
713
714 void MacroAssembler::clobber_volatile_gprs(Register excluded_register) {
715 const int magic_number = 0x42;
716
717 // Preserve the stack pointer register (R1_SP) and the system thread id register (R13),
718 // even though they're technically volatile.
719 for (int i = 2; i < 13; i++) {
720 Register reg = as_Register(i);
721 if (reg == excluded_register) {
722 continue;
723 }
724
725 li(reg, magic_number);
726 }
727 }
728
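// Overwrite the eight ABI argument save slots above the native minimal frame
// with a magic value; a debugging aid analogous to clobber_volatile_gprs above.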
729 void MacroAssembler::clobber_carg_stack_slots(Register tmp) {
730 const int magic_number = 0x43;
731
732 li(tmp, magic_number);
733 for (int m = 0; m <= 7; m++) {
734 std(tmp, frame::native_abi_minframe_size + m * 8, R1_SP);
735 }
736 }
737
738 // Uses ordering which corresponds to ABI:
739 // _savegpr0_14: std r14,-144(r1)
740 // _savegpr0_15: std r15,-136(r1)
741 // _savegpr0_16: std r16,-128(r1)
742 void MacroAssembler::save_nonvolatile_gprs(Register dst, int offset) {
743 std(R14, offset, dst); offset += 8;
744 std(R15, offset, dst); offset += 8;
745 std(R16, offset, dst); offset += 8;
746 std(R17, offset, dst); offset += 8;
747 std(R18, offset, dst); offset += 8;
748 std(R19, offset, dst); offset += 8;
749 std(R20, offset, dst); offset += 8;
750 std(R21, offset, dst); offset += 8;
751 std(R22, offset, dst); offset += 8;
752 std(R23, offset, dst); offset += 8;
753 std(R24, offset, dst); offset += 8;
754 std(R25, offset, dst); offset += 8;
755 std(R26, offset, dst); offset += 8;
756 std(R27, offset, dst); offset += 8;
757 std(R28, offset, dst); offset += 8;
758 std(R29, offset, dst); offset += 8;
759 std(R30, offset, dst); offset += 8;
760 std(R31, offset, dst); offset += 8;
761
762 stfd(F14, offset, dst); offset += 8;
763 stfd(F15, offset, dst); offset += 8;
764 stfd(F16, offset, dst); offset += 8;
765 stfd(F17, offset, dst); offset += 8;
766 stfd(F18, offset, dst); offset += 8;
767 stfd(F19, offset, dst); offset += 8;
768 stfd(F20, offset, dst); offset += 8;
769 stfd(F21, offset, dst); offset += 8;
770 stfd(F22, offset, dst); offset += 8;
771 stfd(F23, offset, dst); offset += 8;
772 stfd(F24, offset, dst); offset += 8;
773 stfd(F25, offset, dst); offset += 8;
774 stfd(F26, offset, dst); offset += 8;
775 stfd(F27, offset, dst); offset += 8;
776 stfd(F28, offset, dst); offset += 8;
777 stfd(F29, offset, dst); offset += 8;
778 stfd(F30, offset, dst); offset += 8;
779 stfd(F31, offset, dst);
780 }
781
782 // Uses ordering which corresponds to ABI:
783 // _restgpr0_14: ld r14,-144(r1)
784 // _restgpr0_15: ld r15,-136(r1)
785 // _restgpr0_16: ld r16,-128(r1)
786 void MacroAssembler::restore_nonvolatile_gprs(Register src, int offset) {
787 ld(R14, offset, src); offset += 8;
788 ld(R15, offset, src); offset += 8;
789 ld(R16, offset, src); offset += 8;
790 ld(R17, offset, src); offset += 8;
791 ld(R18, offset, src); offset += 8;
792 ld(R19, offset, src); offset += 8;
793 ld(R20, offset, src); offset += 8;
794 ld(R21, offset, src); offset += 8;
795 ld(R22, offset, src); offset += 8;
796 ld(R23, offset, src); offset += 8;
797 ld(R24, offset, src); offset += 8;
798 ld(R25, offset, src); offset += 8;
799 ld(R26, offset, src); offset += 8;
800 ld(R27, offset, src); offset += 8;
801 ld(R28, offset, src); offset += 8;
802 ld(R29, offset, src); offset += 8;
803 ld(R30, offset, src); offset += 8;
804 ld(R31, offset, src); offset += 8;
805
806 // FP registers
807 lfd(F14, offset, src); offset += 8;
808 lfd(F15, offset, src); offset += 8;
809 lfd(F16, offset, src); offset += 8;
810 lfd(F17, offset, src); offset += 8;
811 lfd(F18, offset, src); offset += 8;
812 lfd(F19, offset, src); offset += 8;
813 lfd(F20, offset, src); offset += 8;
814 lfd(F21, offset, src); offset += 8;
815 lfd(F22, offset, src); offset += 8;
816 lfd(F23, offset, src); offset += 8;
817 lfd(F24, offset, src); offset += 8;
818 lfd(F25, offset, src); offset += 8;
819 lfd(F26, offset, src); offset += 8;
820 lfd(F27, offset, src); offset += 8;
821 lfd(F28, offset, src); offset += 8;
822 lfd(F29, offset, src); offset += 8;
823 lfd(F30, offset, src); offset += 8;
824 lfd(F31, offset, src);
825 }
826
827 // For verify_oops.
828 void MacroAssembler::save_volatile_gprs(Register dst, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
829 std(R2, offset, dst); offset += 8;
830 if (include_R3_RET_reg) {
831 std(R3, offset, dst); offset += 8;
832 }
833 std(R4, offset, dst); offset += 8;
834 std(R5, offset, dst); offset += 8;
835 std(R6, offset, dst); offset += 8;
836 std(R7, offset, dst); offset += 8;
837 std(R8, offset, dst); offset += 8;
838 std(R9, offset, dst); offset += 8;
839 std(R10, offset, dst); offset += 8;
840 std(R11, offset, dst); offset += 8;
841 std(R12, offset, dst); offset += 8;
842
843 if (include_fp_regs) {
844 stfd(F0, offset, dst); offset += 8;
845 stfd(F1, offset, dst); offset += 8;
846 stfd(F2, offset, dst); offset += 8;
847 stfd(F3, offset, dst); offset += 8;
848 stfd(F4, offset, dst); offset += 8;
849 stfd(F5, offset, dst); offset += 8;
850 stfd(F6, offset, dst); offset += 8;
851 stfd(F7, offset, dst); offset += 8;
852 stfd(F8, offset, dst); offset += 8;
853 stfd(F9, offset, dst); offset += 8;
854 stfd(F10, offset, dst); offset += 8;
855 stfd(F11, offset, dst); offset += 8;
856 stfd(F12, offset, dst); offset += 8;
857 stfd(F13, offset, dst);
858 }
859 }
860
861 // For verify_oops.
862 void MacroAssembler::restore_volatile_gprs(Register src, int offset, bool include_fp_regs, bool include_R3_RET_reg) {
863 ld(R2, offset, src); offset += 8;
864 if (include_R3_RET_reg) {
865 ld(R3, offset, src); offset += 8;
866 }
867 ld(R4, offset, src); offset += 8;
868 ld(R5, offset, src); offset += 8;
869 ld(R6, offset, src); offset += 8;
870 ld(R7, offset, src); offset += 8;
871 ld(R8, offset, src); offset += 8;
872 ld(R9, offset, src); offset += 8;
873 ld(R10, offset, src); offset += 8;
874 ld(R11, offset, src); offset += 8;
875 ld(R12, offset, src); offset += 8;
876
877 if (include_fp_regs) {
878 lfd(F0, offset, src); offset += 8;
879 lfd(F1, offset, src); offset += 8;
880 lfd(F2, offset, src); offset += 8;
881 lfd(F3, offset, src); offset += 8;
882 lfd(F4, offset, src); offset += 8;
883 lfd(F5, offset, src); offset += 8;
884 lfd(F6, offset, src); offset += 8;
885 lfd(F7, offset, src); offset += 8;
886 lfd(F8, offset, src); offset += 8;
887 lfd(F9, offset, src); offset += 8;
888 lfd(F10, offset, src); offset += 8;
889 lfd(F11, offset, src); offset += 8;
890 lfd(F12, offset, src); offset += 8;
891 lfd(F13, offset, src);
892 }
893 }
894
895 void MacroAssembler::save_LR_CR(Register tmp) {
896 mfcr(tmp);
897 std(tmp, _abi0(cr), R1_SP);
898 mflr(tmp);
899 std(tmp, _abi0(lr), R1_SP);
900 // Tmp must contain lr on exit! (see return_addr and prolog in ppc64.ad)
901 }
902
903 void MacroAssembler::restore_LR_CR(Register tmp) {
904 assert(tmp != R1_SP, "must be distinct");
905 ld(tmp, _abi0(lr), R1_SP);
906 mtlr(tmp);
907 ld(tmp, _abi0(cr), R1_SP);
908 mtcr(tmp);
909 }
910
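// Materialize the current pc in `result' by branching-and-linking to the
// immediately following instruction and reading LR. Returns the pc that ends
// up in LR; as the name says, LR is trashed.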
911 address MacroAssembler::get_PC_trash_LR(Register result) {
912 Label L;
913 bl(L);
914 bind(L);
915 address lr_pc = pc();
916 mflr(result);
917 return lr_pc;
918 }
919
920 void MacroAssembler::resize_frame(Register offset, Register tmp) {
921 #ifdef ASSERT
922 assert_different_registers(offset, tmp, R1_SP);
923 andi_(tmp, offset, frame::alignment_in_bytes-1);
924 asm_assert_eq("resize_frame: unaligned");
925 #endif
926
927 // tmp <- *(SP)
928 ld(tmp, _abi0(callers_sp), R1_SP);
929 // addr <- SP + offset;
930 // *(addr) <- tmp;
931 // SP <- addr
932 stdux(tmp, R1_SP, offset);
933 }
934
935 void MacroAssembler::resize_frame(int offset, Register tmp) {
936 assert(is_simm(offset, 16), "too big an offset");
937 assert_different_registers(tmp, R1_SP);
938 assert((offset & (frame::alignment_in_bytes-1))==0, "resize_frame: unaligned");
939 // tmp <- *(SP)
940 ld(tmp, _abi0(callers_sp), R1_SP);
941 // addr <- SP + offset;
942 // *(addr) <- tmp;
943 // SP <- addr
944 stdu(tmp, offset, R1_SP);
945 }
946
947 void MacroAssembler::resize_frame_absolute(Register addr, Register tmp1, Register tmp2) {
948 // (addr == tmp1) || (addr == tmp2) is allowed here!
949 assert(tmp1 != tmp2, "must be distinct");
950
951 // compute offset w.r.t. current stack pointer
952 // tmp_1 <- addr - SP (!)
953 subf(tmp1, R1_SP, addr);
954
955 // atomically update SP keeping back link.
956 resize_frame(tmp1/* offset */, tmp2/* tmp */);
957 }
958
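// Push a frame of `bytes' bytes (must already be aligned to
// frame::alignment_in_bytes, see the assert). stdux stores the old SP at the
// new stack top and updates R1_SP in a single instruction, which keeps the
// ABI back chain intact.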
959 void MacroAssembler::push_frame(Register bytes, Register tmp) {
960 #ifdef ASSERT
961 assert(bytes != R0, "r0 not allowed here");
962 andi_(R0, bytes, frame::alignment_in_bytes-1);
963 asm_assert_eq("push_frame(Reg, Reg): unaligned");
964 #endif
965 neg(tmp, bytes);
966 stdux(R1_SP, R1_SP, tmp);
967 }
968
969 // Push a frame of size `bytes'.
970 void MacroAssembler::push_frame(unsigned int bytes, Register tmp) {
971 long offset = align_addr(bytes, frame::alignment_in_bytes);
972 if (is_simm(-offset, 16)) {
973 stdu(R1_SP, -offset, R1_SP);
974 } else {
975 load_const_optimized(tmp, -offset);
976 stdux(R1_SP, R1_SP, tmp);
977 }
978 }
979
980 // Push a frame of size `bytes' plus native_abi_reg_args on top.
981 void MacroAssembler::push_frame_reg_args(unsigned int bytes, Register tmp) {
982 push_frame(bytes + frame::native_abi_reg_args_size, tmp);
983 }
984
985 // Set up a new C frame with a spill area for non-volatile GPRs and
986 // additional space for local variables.
987 void MacroAssembler::push_frame_reg_args_nonvolatiles(unsigned int bytes,
988 Register tmp) {
989 push_frame(bytes + frame::native_abi_reg_args_size + frame::spill_nonvolatiles_size, tmp);
990 }
991
992 // Pop current C frame.
993 void MacroAssembler::pop_frame() {
994 ld(R1_SP, _abi0(callers_sp), R1_SP);
995 }
996
997 #if defined(ABI_ELFv2)
998 address MacroAssembler::branch_to(Register r_function_entry, bool and_link) {
999 // TODO(asmundak): make sure the caller uses R12 as the function descriptor
1000 // most of the time.
1001 if (R12 != r_function_entry) {
1002 mr(R12, r_function_entry);
1003 }
1004 mtctr(R12);
1005 // Do a call or a branch.
1006 if (and_link) {
1007 bctrl();
1008 } else {
1009 bctr();
1010 }
1011 _last_calls_return_pc = pc();
1012
1013 return _last_calls_return_pc;
1014 }
1015
1016 // Call a C function via a function descriptor and use full C
1017 // calling conventions. Updates and returns _last_calls_return_pc.
1018 address MacroAssembler::call_c(Register r_function_entry) {
1019 return branch_to(r_function_entry, /*and_link=*/true);
1020 }
1021
1022 // For tail calls: only branch, don't link, so callee returns to caller of this function.
1023 address MacroAssembler::call_c_and_return_to_caller(Register r_function_entry) {
1024 return branch_to(r_function_entry, /*and_link=*/false);
1025 }
1026
1027 address MacroAssembler::call_c(address function_entry, relocInfo::relocType rt) {
1028 load_const(R12, function_entry, R0);
1029 return branch_to(R12, /*and_link=*/true);
1030 }
1031
1032 #else
1033 // Generic version of a call to C function via a function descriptor
1034 // with variable support for C calling conventions (TOC, ENV, etc.).
1035 // Updates and returns _last_calls_return_pc.
1036 address MacroAssembler::branch_to(Register function_descriptor, bool and_link, bool save_toc_before_call,
1037 bool restore_toc_after_call, bool load_toc_of_callee, bool load_env_of_callee) {
1038 // we emit standard ptrgl glue code here
1039 assert((function_descriptor != R0), "function_descriptor cannot be R0");
1040
1041 // retrieve necessary entries from the function descriptor
1042 ld(R0, in_bytes(FunctionDescriptor::entry_offset()), function_descriptor);
1043 mtctr(R0);
1044
1045 if (load_toc_of_callee) {
1046 ld(R2_TOC, in_bytes(FunctionDescriptor::toc_offset()), function_descriptor);
1047 }
1048 if (load_env_of_callee) {
1049 ld(R11, in_bytes(FunctionDescriptor::env_offset()), function_descriptor);
1050 } else if (load_toc_of_callee) {
1051 li(R11, 0);
1052 }
1053
1054 // do a call or a branch
1055 if (and_link) {
1056 bctrl();
1057 } else {
1058 bctr();
1059 }
1060 _last_calls_return_pc = pc();
1061
1062 return _last_calls_return_pc;
1063 }
1064
1065 // Call a C function via a function descriptor and use full C calling
1066 // conventions.
1067 // We don't use the TOC in generated code, so there is no need to save
1068 // and restore its value.
1069 address MacroAssembler::call_c(Register fd) {
1070 return branch_to(fd, /*and_link=*/true,
1071 /*save toc=*/false,
1072 /*restore toc=*/false,
1073 /*load toc=*/true,
1074 /*load env=*/true);
1075 }
1076
1077 address MacroAssembler::call_c_and_return_to_caller(Register fd) {
1078 return branch_to(fd, /*and_link=*/false,
1079 /*save toc=*/false,
1080 /*restore toc=*/false,
1081 /*load toc=*/true,
1082 /*load env=*/true);
1083 }
1084
1085 address MacroAssembler::call_c(const FunctionDescriptor* fd, relocInfo::relocType rt) {
1086 if (rt != relocInfo::none) {
1087 // this call needs to be relocatable
1088 if (!ReoptimizeCallSequences
1089 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1090 || fd == nullptr // support code-size estimation
1091 || !fd->is_friend_function()
1092 || fd->entry() == nullptr) {
1093 // it's not a friend function as defined by class FunctionDescriptor,
1094 // so do a full call-c here.
1095 load_const(R11, (address)fd, R0);
1096
1097 bool has_env = (fd != nullptr && fd->env() != nullptr);
1098 return branch_to(R11, /*and_link=*/true,
1099 /*save toc=*/false,
1100 /*restore toc=*/false,
1101 /*load toc=*/true,
1102 /*load env=*/has_env);
1103 } else {
1104 // It's a friend function. Load the entry point and don't care about
1105 // toc and env. Use an optimizable call instruction, but ensure the
1106 // same code-size as in the case of a non-friend function.
1107 nop();
1108 nop();
1109 nop();
1110 bl64_patchable(fd->entry(), rt);
1111 _last_calls_return_pc = pc();
1112 return _last_calls_return_pc;
1113 }
1114 } else {
1115 // This call does not need to be relocatable, do more aggressive
1116 // optimizations.
1117 if (!ReoptimizeCallSequences
1118 || !fd->is_friend_function()) {
1119 // It's not a friend function as defined by class FunctionDescriptor,
1120 // so do a full call-c here.
1121 load_const(R11, (address)fd, R0);
1122 return branch_to(R11, /*and_link=*/true,
1123 /*save toc=*/false,
1124 /*restore toc=*/false,
1125 /*load toc=*/true,
1126 /*load env=*/true);
1127 } else {
1128 // it's a friend function, load the entry point and don't care about
1129 // toc and env.
1130 address dest = fd->entry();
1131 if (is_within_range_of_b(dest, pc())) {
1132 bl(dest);
1133 } else {
1134 bl64_patchable(dest, rt);
1135 }
1136 _last_calls_return_pc = pc();
1137 return _last_calls_return_pc;
1138 }
1139 }
1140 }
1141
1142 // Call a C function. All constants needed reside in TOC.
1143 //
1144 // Read the address to call from the TOC.
1145 // Read env from TOC, if fd specifies an env.
1146 // Read new TOC from TOC.
1147 address MacroAssembler::call_c_using_toc(const FunctionDescriptor* fd,
1148 relocInfo::relocType rt, Register toc) {
1149 if (!ReoptimizeCallSequences
1150 || (rt != relocInfo::runtime_call_type && rt != relocInfo::none)
1151 || !fd->is_friend_function()) {
1152 // It's not a friend function as defined by class FunctionDescriptor,
1153 // so do a full call-c here.
1154 assert(fd->entry() != nullptr, "function must be linked");
1155
1156 AddressLiteral fd_entry(fd->entry());
1157 bool success = load_const_from_method_toc(R11, fd_entry, toc, /*fixed_size*/ true);
1158 mtctr(R11);
1159 if (fd->env() == nullptr) {
1160 li(R11, 0);
1161 nop();
1162 } else {
1163 AddressLiteral fd_env(fd->env());
1164 success = success && load_const_from_method_toc(R11, fd_env, toc, /*fixed_size*/ true);
1165 }
1166 AddressLiteral fd_toc(fd->toc());
1167 // Set R2_TOC (load from toc)
1168 success = success && load_const_from_method_toc(R2_TOC, fd_toc, toc, /*fixed_size*/ true);
1169 bctrl();
1170 _last_calls_return_pc = pc();
1171 if (!success) { return nullptr; }
1172 } else {
1173 // It's a friend function, load the entry point and don't care about
1174 // toc and env. Use an optimizable call instruction, but ensure the
1175 // same code-size as in the case of a non-friend function.
1176 nop();
1177 bl64_patchable(fd->entry(), rt);
1178 _last_calls_return_pc = pc();
1179 }
1180 return _last_calls_return_pc;
1181 }
1182 #endif // ABI_ELFv2
1183
1184 void MacroAssembler::post_call_nop() {
1185 // Make inline again when loom is always enabled.
1186 if (!Continuations::enabled()) {
1187 return;
1188 }
1189 InlineSkippedInstructionsCounter skipCounter(this);
1190 nop();
1191 }
1192
1193 void MacroAssembler::call_VM_base(Register oop_result,
1194 Register last_java_sp,
1195 address entry_point,
1196 bool check_exceptions) {
1197 BLOCK_COMMENT("call_VM {");
1198 // Determine last_java_sp register.
1199 if (!last_java_sp->is_valid()) {
1200 last_java_sp = R1_SP;
1201 }
1202 set_top_ijava_frame_at_SP_as_last_Java_frame(last_java_sp, R11_scratch1);
1203
1204 // ARG1 must hold thread address.
1205 mr(R3_ARG1, R16_thread);
1206 #if defined(ABI_ELFv2)
1207 address return_pc = call_c(entry_point, relocInfo::none);
1208 #else
1209 address return_pc = call_c((FunctionDescriptor*)entry_point, relocInfo::none);
1210 #endif
1211
1212 reset_last_Java_frame();
1213
1214 // Check for pending exceptions.
1215 if (check_exceptions) {
1216 // We don't check for exceptions here.
1217 ShouldNotReachHere();
1218 }
1219
1220 // Get oop result if there is one and reset the value in the thread.
1221 if (oop_result->is_valid()) {
1222 get_vm_result(oop_result);
1223 }
1224
1225 _last_calls_return_pc = return_pc;
1226 BLOCK_COMMENT("} call_VM");
1227 }
1228
1229 void MacroAssembler::call_VM_leaf_base(address entry_point) {
1230 BLOCK_COMMENT("call_VM_leaf {");
1231 #if defined(ABI_ELFv2)
1232 call_c(entry_point, relocInfo::none);
1233 #else
1234 call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, entry_point), relocInfo::none);
1235 #endif
1236 BLOCK_COMMENT("} call_VM_leaf");
1237 }
1238
1239 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
1240 call_VM_base(oop_result, noreg, entry_point, check_exceptions);
1241 }
1242
1243 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1,
1244 bool check_exceptions) {
1245 // R3_ARG1 is reserved for the thread.
1246 mr_if_needed(R4_ARG2, arg_1);
1247 call_VM(oop_result, entry_point, check_exceptions);
1248 }
1249
1250 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2,
1251 bool check_exceptions) {
1252 // R3_ARG1 is reserved for the thread
1253 mr_if_needed(R4_ARG2, arg_1);
1254 assert(arg_2 != R4_ARG2, "smashed argument");
1255 mr_if_needed(R5_ARG3, arg_2);
1256 call_VM(oop_result, entry_point, check_exceptions);
1257 }
1258
1259 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3,
1260 bool check_exceptions) {
1261 // R3_ARG1 is reserved for the thread
1262 mr_if_needed(R4_ARG2, arg_1);
1263 assert(arg_2 != R4_ARG2, "smashed argument");
1264 mr_if_needed(R5_ARG3, arg_2);
1265 mr_if_needed(R6_ARG4, arg_3);
1266 call_VM(oop_result, entry_point, check_exceptions);
1267 }
1268
1269 void MacroAssembler::call_VM_leaf(address entry_point) {
1270 call_VM_leaf_base(entry_point);
1271 }
1272
1273 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
1274 mr_if_needed(R3_ARG1, arg_1);
1275 call_VM_leaf(entry_point);
1276 }
1277
1278 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
1279 mr_if_needed(R3_ARG1, arg_1);
1280 assert(arg_2 != R3_ARG1, "smashed argument");
1281 mr_if_needed(R4_ARG2, arg_2);
1282 call_VM_leaf(entry_point);
1283 }
1284
1285 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
1286 mr_if_needed(R3_ARG1, arg_1);
1287 assert(arg_2 != R3_ARG1, "smashed argument");
1288 mr_if_needed(R4_ARG2, arg_2);
1289 assert(arg_3 != R3_ARG1 && arg_3 != R4_ARG2, "smashed argument");
1290 mr_if_needed(R5_ARG3, arg_3);
1291 call_VM_leaf(entry_point);
1292 }
1293
1294 // Check whether instruction is a read access to the polling page
1295 // which was emitted by load_from_polling_page(..).
1296 bool MacroAssembler::is_load_from_polling_page(int instruction, void* ucontext,
1297 address* polling_address_ptr) {
1298 if (!is_ld(instruction))
1299 return false; // It's not a ld. Fail.
1300
1301 int rt = inv_rt_field(instruction);
1302 int ra = inv_ra_field(instruction);
1303 int ds = inv_ds_field(instruction);
1304 if (!(ds == 0 && ra != 0 && rt == 0)) {
1305 return false; // It's not a ld(r0, X, ra). Fail.
1306 }
1307
1308 if (!ucontext) {
1309 // Set polling address.
1310 if (polling_address_ptr != nullptr) {
1311 *polling_address_ptr = nullptr;
1312 }
1313 return true; // No ucontext given. Can't check value of ra. Assume true.
1314 }
1315
1316 #ifdef LINUX
1317 // Ucontext given. Check that register ra contains the address of
1318 // the safepoint polling page.
1319 ucontext_t* uc = (ucontext_t*) ucontext;
1320 // Set polling address.
1321 address addr = (address)uc->uc_mcontext.regs->gpr[ra] + (ssize_t)ds;
1322 if (polling_address_ptr != nullptr) {
1323 *polling_address_ptr = addr;
1324 }
1325 return SafepointMechanism::is_poll_address(addr);
1326 #else
1327 // Not on Linux, ucontext must be null.
1328 ShouldNotReachHere();
1329 return false;
1330 #endif
1331 }
1332
1333 void MacroAssembler::bang_stack_with_offset(int offset) {
1334 // When increasing the stack, the old stack pointer will be written
1335 // to the new top of stack according to the PPC64 ABI.
1336 // Therefore, stack banging is not necessary when increasing
1337 // the stack by <= os::vm_page_size() bytes.
1338 // When increasing the stack by a larger amount, this method is
1339 // called repeatedly to bang the intermediate pages.
1340
1341 // Stack grows down, caller passes positive offset.
1342 assert(offset > 0, "must bang with positive offset");
1343
1344 long stdoffset = -offset;
1345
1346 if (is_simm(stdoffset, 16)) {
1347 // Signed 16 bit offset, a simple std is ok.
1348 if (UseLoadInstructionsForStackBangingPPC64) {
1349 ld(R0, (int)(signed short)stdoffset, R1_SP);
1350 } else {
1351 std(R0,(int)(signed short)stdoffset, R1_SP);
1352 }
1353 } else if (is_simm(stdoffset, 31)) {
1354 const int hi = MacroAssembler::largeoffset_si16_si16_hi(stdoffset);
1355 const int lo = MacroAssembler::largeoffset_si16_si16_lo(stdoffset);
1356
1357 Register tmp = R11;
1358 addis(tmp, R1_SP, hi);
1359 if (UseLoadInstructionsForStackBangingPPC64) {
1360 ld(R0, lo, tmp);
1361 } else {
1362 std(R0, lo, tmp);
1363 }
1364 } else {
1365 ShouldNotReachHere();
1366 }
1367 }
1368
1369 // If instruction is a stack bang of the form
1370 // std R0, x(Ry), (see bang_stack_with_offset())
1371 // stdu R1_SP, x(R1_SP), (see push_frame(), resize_frame())
1372 // or stdux R1_SP, Rx, R1_SP (see push_frame(), resize_frame())
1373 // return the banged address. Otherwise, return 0.
1374 address MacroAssembler::get_stack_bang_address(int instruction, void *ucontext) {
1375 #ifdef LINUX
1376 ucontext_t* uc = (ucontext_t*) ucontext;
1377 int rs = inv_rs_field(instruction);
1378 int ra = inv_ra_field(instruction);
1379 if ( (is_ld(instruction) && rs == 0 && UseLoadInstructionsForStackBangingPPC64)
1380 || (is_std(instruction) && rs == 0 && !UseLoadInstructionsForStackBangingPPC64)
1381 || (is_stdu(instruction) && rs == 1)) {
1382 int ds = inv_ds_field(instruction);
1383 // return banged address
1384 return ds+(address)uc->uc_mcontext.regs->gpr[ra];
1385 } else if (is_stdux(instruction) && rs == 1) {
1386 int rb = inv_rb_field(instruction);
1387 address sp = (address)uc->uc_mcontext.regs->gpr[1];
1388 long rb_val = (long)uc->uc_mcontext.regs->gpr[rb];
1389 return ra != 1 || rb_val >= 0 ? nullptr // not a stack bang
1390 : sp + rb_val; // banged address
1391 }
1392 return nullptr; // not a stack bang
1393 #else
1394 // workaround not needed on !LINUX :-)
1395 ShouldNotCallThis();
1396 return nullptr;
1397 #endif
1398 }
1399
1400 void MacroAssembler::reserved_stack_check(Register return_pc) {
1401 // Test if reserved zone needs to be enabled.
1402 Label no_reserved_zone_enabling;
1403
1404 ld_ptr(R0, JavaThread::reserved_stack_activation_offset(), R16_thread);
1405 cmpld(CCR0, R1_SP, R0);
1406 blt_predict_taken(CCR0, no_reserved_zone_enabling);
1407
1408 // Enable reserved zone again, throw stack overflow exception.
1409 push_frame_reg_args(0, R0);
1410 call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), R16_thread);
1411 pop_frame();
1412 mtlr(return_pc);
1413 load_const_optimized(R0, StubRoutines::throw_delayed_StackOverflowError_entry());
1414 mtctr(R0);
1415 bctr();
1416
1417 should_not_reach_here();
1418
1419 bind(no_reserved_zone_enabling);
1420 }
1421
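// Atomic exchange of a doubleword: retry the ldarx/stdcx_ pair until the
// store-conditional succeeds; dest_current_value receives the previous memory
// contents. Note that no memory barriers are emitted here.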
1422 void MacroAssembler::getandsetd(Register dest_current_value, Register exchange_value, Register addr_base,
1423 bool cmpxchgx_hint) {
1424 Label retry;
1425 bind(retry);
1426 ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1427 stdcx_(exchange_value, addr_base);
1428 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1429 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1430 } else {
1431 bne( CCR0, retry); // StXcx_ sets CCR0.
1432 }
1433 }
1434
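// Same retry loop as getandsetd above, but the stored value is
// dest_current_value + inc_value (computed into tmp).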
1435 void MacroAssembler::getandaddd(Register dest_current_value, Register inc_value, Register addr_base,
1436 Register tmp, bool cmpxchgx_hint) {
1437 Label retry;
1438 bind(retry);
1439 ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1440 add(tmp, dest_current_value, inc_value);
1441 stdcx_(tmp, addr_base);
1442 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1443 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1444 } else {
1445 bne( CCR0, retry); // StXcx_ sets CCR0.
1446 }
1447 }
1448
1449 // Word/sub-word atomic helper functions
1450
1451 // Temps and addr_base are killed if size < 4 and processor does not support respective instructions.
1452 // Only signed types are supported with size < 4.
1453 // Atomic add always kills tmp1.
1454 void MacroAssembler::atomic_get_and_modify_generic(Register dest_current_value, Register exchange_value,
1455 Register addr_base, Register tmp1, Register tmp2, Register tmp3,
1456 bool cmpxchgx_hint, bool is_add, int size) {
1457 // Sub-word instructions are available since Power 8.
1458 // For older processors, instruction_type != size holds, and we
1459 // emulate the sub-word instructions by constructing a 4-byte value
1460 // that leaves the other bytes unchanged.
1461 const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1462
1463 Label retry;
1464 Register shift_amount = noreg,
1465 val32 = dest_current_value,
1466 modval = is_add ? tmp1 : exchange_value;
1467
1468 if (instruction_type != size) {
1469 assert_different_registers(tmp1, tmp2, tmp3, dest_current_value, exchange_value, addr_base);
1470 modval = tmp1;
1471 shift_amount = tmp2;
1472 val32 = tmp3;
1473 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1474 #ifdef VM_LITTLE_ENDIAN
1475 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1476 clrrdi(addr_base, addr_base, 2);
1477 #else
1478 xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1479 clrrdi(addr_base, addr_base, 2);
1480 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1481 #endif
1482 }
1483
1484 // atomic emulation loop
1485 bind(retry);
1486
1487 switch (instruction_type) {
1488 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1489 case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1490 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1491 default: ShouldNotReachHere();
1492 }
1493
1494 if (instruction_type != size) {
1495 srw(dest_current_value, val32, shift_amount);
1496 }
1497
1498 if (is_add) { add(modval, dest_current_value, exchange_value); }
1499
1500 if (instruction_type != size) {
1501 // Transform exchange value such that the replacement can be done by one xor instruction.
1502 xorr(modval, dest_current_value, is_add ? modval : exchange_value);
1503 clrldi(modval, modval, (size == 1) ? 56 : 48);
1504 slw(modval, modval, shift_amount);
1505 xorr(modval, val32, modval);
1506 }
1507
1508 switch (instruction_type) {
1509 case 4: stwcx_(modval, addr_base); break;
1510 case 2: sthcx_(modval, addr_base); break;
1511 case 1: stbcx_(modval, addr_base); break;
1512 default: ShouldNotReachHere();
1513 }
1514
1515 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1516 bne_predict_not_taken(CCR0, retry); // StXcx_ sets CCR0.
1517 } else {
1518 bne( CCR0, retry); // StXcx_ sets CCR0.
1519 }
1520
1521 // l?arx zero-extends, but Java wants byte/short values sign-extended.
1522 if (size == 1) {
1523 extsb(dest_current_value, dest_current_value);
1524 } else if (size == 2) {
1525 extsh(dest_current_value, dest_current_value);
1526 }
1527 }
1528
1529 // Temps, addr_base and exchange_value are killed if size < 4 and processor does not support respective instructions.
1530 // Only signed types are supported with size < 4.
1531 void MacroAssembler::cmpxchg_loop_body(ConditionRegister flag, Register dest_current_value,
1532 Register compare_value, Register exchange_value,
1533 Register addr_base, Register tmp1, Register tmp2,
1534 Label &retry, Label &failed, bool cmpxchgx_hint, int size) {
1535 // Sub-word instructions are available since Power 8.
1536 // For older processors, instruction_type != size holds, and we
1537 // emulate the sub-word instructions by constructing a 4-byte value
1538 // that leaves the other bytes unchanged.
1539 const int instruction_type = VM_Version::has_lqarx() ? size : 4;
1540
1541 Register shift_amount = noreg,
1542 val32 = dest_current_value,
1543 modval = exchange_value;
1544
1545 if (instruction_type != size) {
1546 assert_different_registers(tmp1, tmp2, dest_current_value, compare_value, exchange_value, addr_base);
1547 shift_amount = tmp1;
1548 val32 = tmp2;
1549 modval = tmp2;
1550 // Need some preparation: Compute shift amount, align address. Note: shorts must be 2 byte aligned.
1551 #ifdef VM_LITTLE_ENDIAN
1552 rldic(shift_amount, addr_base, 3, 64-5); // (dest & 3) * 8;
1553 clrrdi(addr_base, addr_base, 2);
1554 #else
1555 xori(shift_amount, addr_base, (size == 1) ? 3 : 2);
1556 clrrdi(addr_base, addr_base, 2);
1557 rldic(shift_amount, shift_amount, 3, 64-5); // byte: ((3-dest) & 3) * 8; short: ((1-dest/2) & 1) * 16;
1558 #endif
1559 // Transform exchange value such that the replacement can be done by one xor instruction.
1560 xorr(exchange_value, compare_value, exchange_value);
1561 clrldi(exchange_value, exchange_value, (size == 1) ? 56 : 48);
1562 slw(exchange_value, exchange_value, shift_amount);
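// From here on, exchange_value holds (compare_value ^ exchange_value), already
// masked and shifted into place, so a single xorr against val32 inside the
// retry loop produces the full 4-byte replacement value.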
1563 }
1564
1565 // atomic emulation loop
1566 bind(retry);
1567
1568 switch (instruction_type) {
1569 case 4: lwarx(val32, addr_base, cmpxchgx_hint); break;
1570 case 2: lharx(val32, addr_base, cmpxchgx_hint); break;
1571 case 1: lbarx(val32, addr_base, cmpxchgx_hint); break;
1572 default: ShouldNotReachHere();
1573 }
1574
1575 if (instruction_type != size) {
1576 srw(dest_current_value, val32, shift_amount);
1577 }
1578 if (size == 1) {
1579 extsb(dest_current_value, dest_current_value);
1580 } else if (size == 2) {
1581 extsh(dest_current_value, dest_current_value);
1582 }
1583
1584 cmpw(flag, dest_current_value, compare_value);
1585 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1586 bne_predict_not_taken(flag, failed);
1587 } else {
1588 bne( flag, failed);
1589 }
1590 // branch to done => (flag == ne), (dest_current_value != compare_value)
1591 // fall through => (flag == eq), (dest_current_value == compare_value)
1592
1593 if (instruction_type != size) {
1594 xorr(modval, val32, exchange_value);
1595 }
1596
1597 switch (instruction_type) {
1598 case 4: stwcx_(modval, addr_base); break;
1599 case 2: sthcx_(modval, addr_base); break;
1600 case 1: stbcx_(modval, addr_base); break;
1601 default: ShouldNotReachHere();
1602 }
1603 }
1604
1605 // CmpxchgX sets condition register to cmpX(current, compare).
1606 void MacroAssembler::cmpxchg_generic(ConditionRegister flag, Register dest_current_value,
1607 Register compare_value, Register exchange_value,
1608 Register addr_base, Register tmp1, Register tmp2,
1609 int semantics, bool cmpxchgx_hint,
1610 Register int_flag_success, bool contention_hint, bool weak, int size) {
1611 Label retry;
1612 Label failed;
1613 Label done;
1614
1615 // Save one branch if result is returned via register and
1616 // result register is different from the other ones.
1617 bool use_result_reg = (int_flag_success != noreg);
1618 bool preset_result_reg = (int_flag_success != dest_current_value && int_flag_success != compare_value &&
1619 int_flag_success != exchange_value && int_flag_success != addr_base &&
1620 int_flag_success != tmp1 && int_flag_success != tmp2);
1621 assert(!weak || flag == CCR0, "weak only supported with CCR0");
1622 assert(size == 1 || size == 2 || size == 4, "unsupported");
1623
1624 if (use_result_reg && preset_result_reg) {
1625 li(int_flag_success, 0); // preset (assume cas failed)
1626 }
1627
1628 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1629 if (contention_hint) { // Don't try to reserve if cmp fails.
1630 switch (size) {
1631 case 1: lbz(dest_current_value, 0, addr_base); extsb(dest_current_value, dest_current_value); break;
1632 case 2: lha(dest_current_value, 0, addr_base); break;
1633 case 4: lwz(dest_current_value, 0, addr_base); break;
1634 default: ShouldNotReachHere();
1635 }
1636 cmpw(flag, dest_current_value, compare_value);
1637 bne(flag, failed);
1638 }
1639
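// Note on the memory semantics used below (PPC64 mapping, for reference):
// release() emits lwsync before the update, fence() emits a full sync afterwards,
// and MemBarAcq is realized by isync following the conditional branch on the
// store-conditional result (the branch+isync acquire idiom).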
1640 // release/fence semantics
1641 if (semantics & MemBarRel) {
1642 release();
1643 }
1644
1645 cmpxchg_loop_body(flag, dest_current_value, compare_value, exchange_value, addr_base, tmp1, tmp2,
1646 retry, failed, cmpxchgx_hint, size);
1647 if (!weak || use_result_reg) {
1648 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1649 bne_predict_not_taken(CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1650 } else {
1651 bne( CCR0, weak ? failed : retry); // StXcx_ sets CCR0.
1652 }
1653 }
1654 // fall through => (flag == eq), (dest_current_value == compare_value), (swapped)
1655
1656 // Result in register (must do this at the end because int_flag_success can be the
1657 // same register as one above).
1658 if (use_result_reg) {
1659 li(int_flag_success, 1);
1660 }
1661
1662 if (semantics & MemBarFenceAfter) {
1663 fence();
1664 } else if (semantics & MemBarAcq) {
1665 isync();
1666 }
1667
1668 if (use_result_reg && !preset_result_reg) {
1669 b(done);
1670 }
1671
1672 bind(failed);
1673 if (use_result_reg && !preset_result_reg) {
1674 li(int_flag_success, 0);
1675 }
1676
1677 bind(done);
1678 // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1679 // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1680 }
1681
1682 // Performs atomic compare exchange:
1683 // if (compare_value == *addr_base)
1684 // *addr_base = exchange_value
1685 // int_flag_success = 1;
1686 // else
1687 // int_flag_success = 0;
1688 //
1689 // ConditionRegister flag = cmp(compare_value, *addr_base)
1690 // Register dest_current_value = *addr_base
1691 // Register compare_value Used to compare with value in memory
1692 // Register exchange_value Written to memory if compare_value == *addr_base
1693 // Register addr_base The memory location to compareXChange
1694 // Register int_flag_success Set to 1 if exchange_value was written to *addr_base
1695 //
1696 // To avoid the costly compare-exchange, the value is tested beforehand.
1697 // Several special cases exist to avoid generating unnecessary code.
1698 //
1699 void MacroAssembler::cmpxchgd(ConditionRegister flag,
1700 Register dest_current_value, RegisterOrConstant compare_value, Register exchange_value,
1701 Register addr_base, int semantics, bool cmpxchgx_hint,
1702 Register int_flag_success, Label* failed_ext, bool contention_hint, bool weak) {
1703 Label retry;
1704 Label failed_int;
1705 Label& failed = (failed_ext != nullptr) ? *failed_ext : failed_int;
1706 Label done;
1707
1708 // Save one branch if result is returned via register and result register is different from the other ones.
1709 bool use_result_reg = (int_flag_success!=noreg);
1710 bool preset_result_reg = (int_flag_success!=dest_current_value && int_flag_success!=compare_value.register_or_noreg() &&
1711 int_flag_success!=exchange_value && int_flag_success!=addr_base);
1712 assert(!weak || flag == CCR0, "weak only supported with CCR0");
1713 assert(int_flag_success == noreg || failed_ext == nullptr, "cannot have both");
1714
1715 if (use_result_reg && preset_result_reg) {
1716 li(int_flag_success, 0); // preset (assume cas failed)
1717 }
1718
1719 // Add simple guard in order to reduce risk of starving under high contention (recommended by IBM).
1720 if (contention_hint) { // Don't try to reserve if cmp fails.
1721 ld(dest_current_value, 0, addr_base);
1722 cmpd(flag, compare_value, dest_current_value);
1723 bne(flag, failed);
1724 }
1725
1726 // release/fence semantics
1727 if (semantics & MemBarRel) {
1728 release();
1729 }
1730
1731 // atomic emulation loop
1732 bind(retry);
1733
1734 ldarx(dest_current_value, addr_base, cmpxchgx_hint);
1735 cmpd(flag, compare_value, dest_current_value);
1736 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1737 bne_predict_not_taken(flag, failed);
1738 } else {
1739 bne( flag, failed);
1740 }
1741
1742 stdcx_(exchange_value, addr_base);
1743 if (!weak || use_result_reg || failed_ext) {
1744 if (UseStaticBranchPredictionInCompareAndSwapPPC64) {
1745 bne_predict_not_taken(CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1746 } else {
1747 bne( CCR0, weak ? failed : retry); // stXcx_ sets CCR0
1748 }
1749 }
1750
1751 // result in register (must do this at the end because int_flag_success can be the same register as one above)
1752 if (use_result_reg) {
1753 li(int_flag_success, 1);
1754 }
1755
1756 if (semantics & MemBarFenceAfter) {
1757 fence();
1758 } else if (semantics & MemBarAcq) {
1759 isync();
1760 }
1761
1762 if (use_result_reg && !preset_result_reg) {
1763 b(done);
1764 }
1765
1766 bind(failed_int);
1767 if (use_result_reg && !preset_result_reg) {
1768 li(int_flag_success, 0);
1769 }
1770
1771 bind(done);
1772 // (flag == ne) => (dest_current_value != compare_value), (!swapped)
1773 // (flag == eq) => (dest_current_value == compare_value), ( swapped)
1774 }
1775
1776 // Look up the method for a megamorphic invokeinterface call.
1777 // The target method is determined by <intf_klass, itable_index>.
1778 // The receiver klass is in recv_klass.
1779 // On success, the result will be in method_result, and execution falls through.
1780 // On failure, execution transfers to the given label.
1781 void MacroAssembler::lookup_interface_method(Register recv_klass,
1782 Register intf_klass,
1783 RegisterOrConstant itable_index,
1784 Register method_result,
1785 Register scan_temp,
1786 Register temp2,
1787 Label& L_no_such_interface,
1788 bool return_method) {
1789 assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
1790
1791 // Compute start of first itableOffsetEntry (which is at the end of the vtable).
1792 int vtable_base = in_bytes(Klass::vtable_start_offset());
1793 int itentry_off = in_bytes(itableMethodEntry::method_offset());
1794 int logMEsize = exact_log2(itableMethodEntry::size() * wordSize);
1795 int scan_step = itableOffsetEntry::size() * wordSize;
1796 int log_vte_size= exact_log2(vtableEntry::size_in_bytes());
1797
1798 lwz(scan_temp, in_bytes(Klass::vtable_length_offset()), recv_klass);
1799 // %%% We should store the aligned, prescaled offset in the klassoop.
1800 // Then the next several instructions would fold away.
1801
1802 sldi(scan_temp, scan_temp, log_vte_size);
1803 addi(scan_temp, scan_temp, vtable_base);
1804 add(scan_temp, recv_klass, scan_temp);
1805
1806 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1807 if (return_method) {
1808 if (itable_index.is_register()) {
1809 Register itable_offset = itable_index.as_register();
1810 sldi(method_result, itable_offset, logMEsize);
1811 if (itentry_off) { addi(method_result, method_result, itentry_off); }
1812 add(method_result, method_result, recv_klass);
1813 } else {
1814 long itable_offset = (long)itable_index.as_constant();
1815 // static address, no relocation
1816 add_const_optimized(method_result, recv_klass, (itable_offset << logMEsize) + itentry_off, temp2);
1817 }
1818 }
1819
1820 // for (scan = klass->itable(); scan->interface() != nullptr; scan += scan_step) {
1821 // if (scan->interface() == intf) {
1822 // result = (klass + scan->offset() + itable_index);
1823 // }
1824 // }
1825 Label search, found_method;
1826
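// The code-generation loop below runs twice: peel == 1 emits a peeled first check
// that can branch directly to found_method on a hit, peel == 0 emits the scan loop
// whose test is inverted so that a hit falls through to found_method.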
1827 for (int peel = 1; peel >= 0; peel--) {
1828 // %%%% Could load both offset and interface in one ldx, if they were
1829 // in the opposite order. This would save a load.
1830 ld(temp2, in_bytes(itableOffsetEntry::interface_offset()), scan_temp);
1831
1832 // Check that this entry is non-null. A null entry means that
1833 // the receiver class doesn't implement the interface, and wasn't the
1834 // same as when the caller was compiled.
1835 cmpd(CCR0, temp2, intf_klass);
1836
1837 if (peel) {
1838 beq(CCR0, found_method);
1839 } else {
1840 bne(CCR0, search);
1841 // (invert the test to fall through to found_method...)
1842 }
1843
1844 if (!peel) break;
1845
1846 bind(search);
1847
1848 cmpdi(CCR0, temp2, 0);
1849 beq(CCR0, L_no_such_interface);
1850 addi(scan_temp, scan_temp, scan_step);
1851 }
1852
1853 bind(found_method);
1854
1855 // Got a hit.
1856 if (return_method) {
1857 int ito_offset = in_bytes(itableOffsetEntry::offset_offset());
1858 lwz(scan_temp, ito_offset, scan_temp);
1859 ldx(method_result, scan_temp, method_result);
1860 }
1861 }
1862
1863 // virtual method calling
1864 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1865 RegisterOrConstant vtable_index,
1866 Register method_result) {
1867
1868 assert_different_registers(recv_klass, method_result, vtable_index.register_or_noreg());
1869
1870 const ByteSize base = Klass::vtable_start_offset();
1871 assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1872
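// Address computed below (sketch):
//   R19_method = *(recv_klass + vtable_start_offset + vtable_index * wordSize
//                  + vtableEntry::method_offset())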
1873 if (vtable_index.is_register()) {
1874 sldi(vtable_index.as_register(), vtable_index.as_register(), LogBytesPerWord);
1875 add(recv_klass, vtable_index.as_register(), recv_klass);
1876 } else {
1877 addi(recv_klass, recv_klass, vtable_index.as_constant() << LogBytesPerWord);
1878 }
1879 ld(R19_method, in_bytes(base + vtableEntry::method_offset()), recv_klass);
1880 }
1881
1882 /////////////////////////////////////////// subtype checking ////////////////////////////////////////////
1883 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1884 Register super_klass,
1885 Register temp1_reg,
1886 Register temp2_reg,
1887 Label* L_success,
1888 Label* L_failure,
1889 Label* L_slow_path,
1890 RegisterOrConstant super_check_offset) {
1891
1892 const Register check_cache_offset = temp1_reg;
1893 const Register cached_super = temp2_reg;
1894
1895 assert_different_registers(sub_klass, super_klass, check_cache_offset, cached_super);
1896
1897 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1898 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1899
1900 bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1901 bool need_slow_path = (must_load_sco || super_check_offset.constant_or_zero() == sc_offset);
1902
1903 Label L_fallthrough;
1904 int label_nulls = 0;
1905 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
1906 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
1907 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
1908 assert(label_nulls <= 1 ||
1909 (L_slow_path == &L_fallthrough && label_nulls <= 2 && !need_slow_path),
1910 "at most one null in the batch, usually");
1911
1912 // If the pointers are equal, we are done (e.g., String[] elements).
1913 // This self-check enables sharing of secondary supertype arrays among
1914 // non-primary types such as array-of-interface. Otherwise, each such
1915 // type would need its own customized SSA.
1916 // We move this check to the front of the fast path because many
1917 // type checks are in fact trivially successful in this manner,
1918 // so we get a nicely predicted branch right at the start of the check.
1919 cmpd(CCR0, sub_klass, super_klass);
1920 beq(CCR0, *L_success);
1921
1922 // Check the supertype display:
1923 if (must_load_sco) {
1924 // The super check offset is always positive...
1925 lwz(check_cache_offset, sco_offset, super_klass);
1926 super_check_offset = RegisterOrConstant(check_cache_offset);
1927 // super_check_offset is register.
1928 assert_different_registers(sub_klass, super_klass, cached_super, super_check_offset.as_register());
1929 }
1930 // The loaded value is the offset from KlassOopDesc.
1931
1932 ld(cached_super, super_check_offset, sub_klass);
1933 cmpd(CCR0, cached_super, super_klass);
1934
1935 // This check has worked decisively for primary supers.
1936 // Secondary supers are sought in the super_cache ('super_cache_addr').
1937 // (Secondary supers are interfaces and very deeply nested subtypes.)
1938 // This works in the same check above because of a tricky aliasing
1939 // between the super_cache and the primary super display elements.
1940 // (The 'super_check_addr' can address either, as the case requires.)
1941 // Note that the cache is updated below if it does not help us find
1942 // what we need immediately.
1943 // So if it was a primary super, we can just fail immediately.
1944 // Otherwise, it's the slow path for us (no success at this point).
1945
1946 #define FINAL_JUMP(label) if (&(label) != &L_fallthrough) { b(label); }
1947
1948 if (super_check_offset.is_register()) {
1949 beq(CCR0, *L_success);
1950 cmpwi(CCR0, super_check_offset.as_register(), sc_offset);
1951 if (L_failure == &L_fallthrough) {
1952 beq(CCR0, *L_slow_path);
1953 } else {
1954 bne(CCR0, *L_failure);
1955 FINAL_JUMP(*L_slow_path);
1956 }
1957 } else {
1958 if (super_check_offset.as_constant() == sc_offset) {
1959 // Need a slow path; fast failure is impossible.
1960 if (L_slow_path == &L_fallthrough) {
1961 beq(CCR0, *L_success);
1962 } else {
1963 bne(CCR0, *L_slow_path);
1964 FINAL_JUMP(*L_success);
1965 }
1966 } else {
1967 // No slow path; it's a fast decision.
1968 if (L_failure == &L_fallthrough) {
1969 beq(CCR0, *L_success);
1970 } else {
1971 bne(CCR0, *L_failure);
1972 FINAL_JUMP(*L_success);
1973 }
1974 }
1975 }
1976
1977 bind(L_fallthrough);
1978 #undef FINAL_JUMP
1979 }
1980
1981 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1982 Register super_klass,
1983 Register temp1_reg,
1984 Register temp2_reg,
1985 Label* L_success,
1986 Register result_reg) {
1987 const Register array_ptr = temp1_reg; // current value from cache array
1988 const Register temp = temp2_reg;
1989
1990 assert_different_registers(sub_klass, super_klass, array_ptr, temp);
1991
1992 int source_offset = in_bytes(Klass::secondary_supers_offset());
1993 int target_offset = in_bytes(Klass::secondary_super_cache_offset());
1994
1995 int length_offset = Array<Klass*>::length_offset_in_bytes();
1996 int base_offset = Array<Klass*>::base_offset_in_bytes();
1997
1998 Label hit, loop, failure, fallthru;
1999
2000 ld(array_ptr, source_offset, sub_klass);
2001
2002 // TODO: PPC port: assert(4 == arrayOopDesc::length_length_in_bytes(), "precondition violated.");
2003 lwz(temp, length_offset, array_ptr);
2004 cmpwi(CCR0, temp, 0);
2005 beq(CCR0, result_reg!=noreg ? failure : fallthru); // length 0
2006
2007 mtctr(temp); // load ctr
2008
2009 bind(loop);
2010 // Klass pointers in the table are no longer compressed.
2011 ld(temp, base_offset, array_ptr);
2012 cmpd(CCR0, temp, super_klass);
2013 beq(CCR0, hit);
2014 addi(array_ptr, array_ptr, BytesPerWord);
2015 bdnz(loop);
2016
2017 bind(failure);
2018 if (result_reg!=noreg) li(result_reg, 1); // load non-zero result (indicates a miss)
2019 b(fallthru);
2020
2021 bind(hit);
2022 std(super_klass, target_offset, sub_klass); // save result to cache
2023 if (result_reg != noreg) { li(result_reg, 0); } // load zero result (indicates a hit)
2024 if (L_success != nullptr) { b(*L_success); }
2025 else if (result_reg == noreg) { blr(); } // return with CR0.eq if neither label nor result reg provided
2026
2027 bind(fallthru);
2028 }
2029
2030 // Try fast path, then go to slow one if not successful
2031 void MacroAssembler::check_klass_subtype(Register sub_klass,
2032 Register super_klass,
2033 Register temp1_reg,
2034 Register temp2_reg,
2035 Label& L_success) {
2036 Label L_failure;
2037 check_klass_subtype_fast_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success, &L_failure);
2038 check_klass_subtype_slow_path(sub_klass, super_klass, temp1_reg, temp2_reg, &L_success);
2039 bind(L_failure); // Fallthru if not successful.
2040 }
2041
2042 void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
2043 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
2044
2045 Label L_fallthrough;
2046 if (L_fast_path == nullptr) {
2047 L_fast_path = &L_fallthrough;
2048 } else if (L_slow_path == nullptr) {
2049 L_slow_path = &L_fallthrough;
2050 }
2051
2052 // Fast path check: class is fully initialized
2053 lbz(R0, in_bytes(InstanceKlass::init_state_offset()), klass);
2054 cmpwi(CCR0, R0, InstanceKlass::fully_initialized);
2055 beq(CCR0, *L_fast_path);
2056
2057 // Fast path check: current thread is initializer thread
2058 ld(R0, in_bytes(InstanceKlass::init_thread_offset()), klass);
2059 cmpd(CCR0, thread, R0);
2060 if (L_slow_path == &L_fallthrough) {
2061 beq(CCR0, *L_fast_path);
2062 } else if (L_fast_path == &L_fallthrough) {
2063 bne(CCR0, *L_slow_path);
2064 } else {
2065 Unimplemented();
2066 }
2067
2068 bind(L_fallthrough);
2069 }
2070
2071 RegisterOrConstant MacroAssembler::argument_offset(RegisterOrConstant arg_slot,
2072 Register temp_reg,
2073 int extra_slot_offset) {
2074 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
2075 int stackElementSize = Interpreter::stackElementSize;
2076 int offset = extra_slot_offset * stackElementSize;
2077 if (arg_slot.is_constant()) {
2078 offset += arg_slot.as_constant() * stackElementSize;
2079 return offset;
2080 } else {
2081 assert(temp_reg != noreg, "must specify");
2082 sldi(temp_reg, arg_slot.as_register(), exact_log2(stackElementSize));
2083 if (offset != 0)
2084 addi(temp_reg, temp_reg, offset);
2085 return temp_reg;
2086 }
2087 }
2088
2089 void MacroAssembler::tlab_allocate(
2090 Register obj, // result: pointer to object after successful allocation
2091 Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
2092 int con_size_in_bytes, // object size in bytes if known at compile time
2093 Register t1, // temp register
2094 Label& slow_case // continuation point if fast allocation fails
2095 ) {
2096 // make sure arguments make sense
2097 assert_different_registers(obj, var_size_in_bytes, t1);
2098 assert(0 <= con_size_in_bytes && is_simm16(con_size_in_bytes), "illegal object size");
2099 assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, "object size is not multiple of alignment");
2100
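// Fast-path sketch (illustration only):
//   obj = thread->tlab_top;
//   new_top = obj + size;
//   if (new_top > thread->tlab_end) goto slow_case;
//   thread->tlab_top = new_top;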
2101 const Register new_top = t1;
2102 //verify_tlab(); not implemented
2103
2104 ld(obj, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2105 ld(R0, in_bytes(JavaThread::tlab_end_offset()), R16_thread);
2106 if (var_size_in_bytes == noreg) {
2107 addi(new_top, obj, con_size_in_bytes);
2108 } else {
2109 add(new_top, obj, var_size_in_bytes);
2110 }
2111 cmpld(CCR0, new_top, R0);
2112 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_case);
2113
2114 #ifdef ASSERT
2115 // make sure new free pointer is properly aligned
2116 {
2117 Label L;
2118 andi_(R0, new_top, MinObjAlignmentInBytesMask);
2119 beq(CCR0, L);
2120 stop("updated TLAB free is not properly aligned");
2121 bind(L);
2122 }
2123 #endif // ASSERT
2124
2125 // update the tlab top pointer
2126 std(new_top, in_bytes(JavaThread::tlab_top_offset()), R16_thread);
2127 //verify_tlab(); not implemented
2128 }
2129 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register t1, Register t2) {
2130 unimplemented("incr_allocated_bytes");
2131 }
2132
2133 address MacroAssembler::emit_trampoline_stub(int destination_toc_offset,
2134 int insts_call_instruction_offset, Register Rtoc) {
2135 // Start the stub.
2136 address stub = start_a_stub(64);
2137 if (stub == nullptr) { return nullptr; } // CodeCache full: bail out
2138
2139 // Create a trampoline stub relocation which relates this trampoline stub
2140 // with the call instruction at insts_call_instruction_offset in the
2141 // instructions code-section.
2142 relocate(trampoline_stub_Relocation::spec(code()->insts()->start() + insts_call_instruction_offset));
2143 const int stub_start_offset = offset();
2144
2145 // For java_to_interp stubs we use R11_scratch1 as scratch register
2146 // and in call trampoline stubs we use R12_scratch2. This way we
2147 // can distinguish them (see is_NativeCallTrampolineStub_at()).
2148 Register reg_scratch = R12_scratch2;
2149
2150 // Now, create the trampoline stub's code:
2151 // - load the TOC
2152 // - load the call target from the constant pool
2153 // - call
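// The emitted stub is roughly (sketch, exact instructions depend on the offsets):
//   <materialize TOC in R12 if not passed in>
//   ld    R12, <destination_toc_offset>(TOC)   // possibly addis+ld for large offsets
//   mtctr R12
//   bctr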
2154 if (Rtoc == noreg) {
2155 calculate_address_from_global_toc(reg_scratch, method_toc());
2156 Rtoc = reg_scratch;
2157 }
2158
2159 ld_largeoffset_unchecked(reg_scratch, destination_toc_offset, Rtoc, false);
2160 mtctr(reg_scratch);
2161 bctr();
2162
2163 const address stub_start_addr = addr_at(stub_start_offset);
2164
2165 // Assert that the encoded destination_toc_offset can be identified and that it is correct.
2166 assert(destination_toc_offset == NativeCallTrampolineStub_at(stub_start_addr)->destination_toc_offset(),
2167 "encoded offset into the constant pool must match");
2168 // Trampoline_stub_size should be good.
2169 assert((uint)(offset() - stub_start_offset) <= trampoline_stub_size, "should be good size");
2170 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
2171
2172 // End the stub.
2173 end_a_stub();
2174 return stub;
2175 }
2176
2177 // "The box" is the space on the stack where we copy the object mark.
2178 void MacroAssembler::compiler_fast_lock_object(ConditionRegister flag, Register oop, Register box,
2179 Register temp, Register displaced_header, Register current_header) {
2180 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_lock_lightweight");
2181 assert_different_registers(oop, box, temp, displaced_header, current_header);
2182 Label object_has_monitor;
2183 Label cas_failed;
2184 Label success, failure;
2185
2186 // Load markWord from object into displaced_header.
2187 ld(displaced_header, oopDesc::mark_offset_in_bytes(), oop);
2188
2189 if (DiagnoseSyncOnValueBasedClasses != 0) {
2190 load_klass(temp, oop);
2191 lwz(temp, in_bytes(Klass::access_flags_offset()), temp);
2192 testbitdi(flag, R0, temp, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2193 bne(flag, failure);
2194 }
2195
2196 // Handle existing monitor.
2197 // The object has an existing monitor iff (mark & monitor_value) != 0.
2198 andi_(temp, displaced_header, markWord::monitor_value);
2199 bne(CCR0, object_has_monitor);
2200
2201 if (LockingMode == LM_MONITOR) {
2202 // Set NE to indicate 'failure' -> take slow-path.
2203 crandc(flag, Assembler::equal, flag, Assembler::equal);
2204 b(failure);
2205 } else {
2206 assert(LockingMode == LM_LEGACY, "must be");
2207 // Set displaced_header to be (markWord of object | UNLOCK_VALUE).
2208 ori(displaced_header, displaced_header, markWord::unlocked_value);
2209
2210 // Load Compare Value application register.
2211
2212 // Initialize the box. (Must happen before we update the object mark!)
2213 std(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2214
2215 // Must fence, otherwise, preceding store(s) may float below cmpxchg.
2216 // Compare object markWord with mark and if equal exchange scratch1 with object markWord.
2217 cmpxchgd(/*flag=*/flag,
2218 /*current_value=*/current_header,
2219 /*compare_value=*/displaced_header,
2220 /*exchange_value=*/box,
2221 /*where=*/oop,
2222 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2223 MacroAssembler::cmpxchgx_hint_acquire_lock(),
2224 noreg,
2225 &cas_failed,
2226 /*check without membar and ldarx first*/true);
2227 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2228 // If the compare-and-exchange succeeded, then we found an unlocked
2229 // object and we have now locked it.
2230 b(success);
2231
2232 bind(cas_failed);
2233 // We did not see an unlocked object so try the fast recursive case.
2234
2235 // Check if the owner is self by comparing the value in the markWord of object
2236 // (current_header) with the stack pointer.
2237 sub(current_header, current_header, R1_SP);
2238 load_const_optimized(temp, ~(os::vm_page_size()-1) | markWord::lock_mask_in_place);
2239
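// R0 becomes zero iff (markWord - SP) fits within one page and the lock bits are
// 0b00, i.e. the mark is a stack address in a frame of the current thread:
// the object is stack-locked by us, so this is a recursive lock.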
2240 and_(R0/*==0?*/, current_header, temp);
2241 // If the condition holds, the lock is already owned by the current thread, and hence we can store
2242 // 0 as the displaced header in the box, which indicates that it is a recursive lock.
2243 std(R0/*==0, perhaps*/, BasicLock::displaced_header_offset_in_bytes(), box);
2244
2245 if (flag != CCR0) {
2246 mcrf(flag, CCR0);
2247 }
2248 beq(CCR0, success);
2249 b(failure);
2250 }
2251
2252 // Handle existing monitor.
2253 bind(object_has_monitor);
2254 // The object's monitor m is unlocked iff m->owner is null,
2255 // otherwise m->owner may contain a thread or a stack address.
2256
2257 // Try to CAS m->owner from null to current thread.
2258 addi(temp, displaced_header, in_bytes(ObjectMonitor::owner_offset()) - markWord::monitor_value);
2259 cmpxchgd(/*flag=*/flag,
2260 /*current_value=*/current_header,
2261 /*compare_value=*/(intptr_t)0,
2262 /*exchange_value=*/R16_thread,
2263 /*where=*/temp,
2264 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2265 MacroAssembler::cmpxchgx_hint_acquire_lock());
2266
2267 // Store a non-null value into the box.
2268 std(box, BasicLock::displaced_header_offset_in_bytes(), box);
2269 beq(flag, success);
2270
2271 // Check for recursive locking.
2272 cmpd(flag, current_header, R16_thread);
2273 bne(flag, failure);
2274
2275 // Current thread already owns the lock. Just increment recursions.
2276 Register recursions = displaced_header;
2277 ld(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2278 addi(recursions, recursions, 1);
2279 std(recursions, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), temp);
2280
2281 // flag == EQ indicates success, increment held monitor count
2282 // flag == NE indicates failure
2283 bind(success);
2284 inc_held_monitor_count(temp);
2285 bind(failure);
2286 }
2287
2288 void MacroAssembler::compiler_fast_unlock_object(ConditionRegister flag, Register oop, Register box,
2289 Register temp, Register displaced_header, Register current_header) {
2290 assert(LockingMode != LM_LIGHTWEIGHT, "uses fast_unlock_lightweight");
2291 assert_different_registers(oop, box, temp, displaced_header, current_header);
2292 Label success, failure, object_has_monitor, notRecursive;
2293
2294 if (LockingMode == LM_LEGACY) {
2295 // Find the lock address and load the displaced header from the stack.
2296 ld(displaced_header, BasicLock::displaced_header_offset_in_bytes(), box);
2297
2298 // If the displaced header is 0, we have a recursive unlock.
2299 cmpdi(flag, displaced_header, 0);
2300 beq(flag, success);
2301 }
2302
2303 // Handle existing monitor.
2304 // The object has an existing monitor iff (mark & monitor_value) != 0.
2305 ld(current_header, oopDesc::mark_offset_in_bytes(), oop);
2306 andi_(R0, current_header, markWord::monitor_value);
2307 bne(CCR0, object_has_monitor);
2308
2309 if (LockingMode == LM_MONITOR) {
2310 // Set NE to indicate 'failure' -> take slow-path.
2311 crandc(flag, Assembler::equal, flag, Assembler::equal);
2312 b(failure);
2313 } else {
2314 assert(LockingMode == LM_LEGACY, "must be");
2315 // Check if it is still a lightweight lock, which is true if we see
2316 // the stack address of the basicLock in the markWord of the object.
2317 // Cmpxchg sets flag to cmpd(current_header, box).
2318 cmpxchgd(/*flag=*/flag,
2319 /*current_value=*/current_header,
2320 /*compare_value=*/box,
2321 /*exchange_value=*/displaced_header,
2322 /*where=*/oop,
2323 MacroAssembler::MemBarRel,
2324 MacroAssembler::cmpxchgx_hint_release_lock(),
2325 noreg,
2326 &failure);
2327 assert(oopDesc::mark_offset_in_bytes() == 0, "offset of _mark is not 0");
2328 b(success);
2329 }
2330
2331 // Handle existing monitor.
2332 bind(object_has_monitor);
2333 STATIC_ASSERT(markWord::monitor_value <= INT_MAX);
2334 addi(current_header, current_header, -(int)markWord::monitor_value); // monitor
2335 ld(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2336
2337 // In case of LM_LIGHTWEIGHT, we may reach here with (temp & ObjectMonitor::ANONYMOUS_OWNER) != 0.
2338 // This is handled like owner thread mismatches: We take the slow path.
2339 cmpd(flag, temp, R16_thread);
2340 bne(flag, failure);
2341
2342 ld(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2343
2344 addic_(displaced_header, displaced_header, -1);
2345 blt(CCR0, notRecursive); // Not recursive if negative after decrement.
2346 std(displaced_header, in_bytes(ObjectMonitor::recursions_offset()), current_header);
2347 if (flag == CCR0) { // Otherwise, flag is already EQ, here.
2348 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Set CCR0 EQ
2349 }
2350 b(success);
2351
2352 bind(notRecursive);
2353 ld(temp, in_bytes(ObjectMonitor::EntryList_offset()), current_header);
2354 ld(displaced_header, in_bytes(ObjectMonitor::cxq_offset()), current_header);
2355 orr(temp, temp, displaced_header); // Will be 0 if both are 0.
2356 cmpdi(flag, temp, 0);
2357 bne(flag, failure);
2358 release();
2359 std(temp, in_bytes(ObjectMonitor::owner_offset()), current_header);
2360
2361 // flag == EQ indicates success, decrement held monitor count
2362 // flag == NE indicates failure
2363 bind(success);
2364 dec_held_monitor_count(temp);
2365 bind(failure);
2366 }
2367
2368 void MacroAssembler::compiler_fast_lock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1,
2369 Register tmp2, Register tmp3) {
2370 assert_different_registers(obj, tmp1, tmp2, tmp3);
2371 assert(flag == CCR0, "bad condition register");
2372
2373 // Handle inflated monitor.
2374 Label inflated;
2375 // Finish fast lock successfully. MUST reach to with flag == EQ
2376 Label locked;
2377 // Finish fast lock unsuccessfully. MUST branch to with flag == NE
2378 Label slow_path;
2379
2380 if (DiagnoseSyncOnValueBasedClasses != 0) {
2381 load_klass(tmp1, obj);
2382 lwz(tmp1, in_bytes(Klass::access_flags_offset()), tmp1);
2383 testbitdi(flag, R0, tmp1, exact_log2(JVM_ACC_IS_VALUE_BASED_CLASS));
2384 bne(flag, slow_path);
2385 }
2386
2387 const Register mark = tmp1;
2388 const Register t = tmp3; // Usage of R0 allowed!
2389
2390 { // Lightweight locking
2391
2392 // Push lock to the lock stack and finish successfully. MUST reach to with flag == EQ
2393 Label push;
2394
2395 const Register top = tmp2;
2396
2397 // Check if lock-stack is full.
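// The lock stack is a small, fixed-size per-thread array of oops; 'top' is the
// current byte offset into it. Pushing obj onto it records that the object is
// fast-locked by this thread.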
2398 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2399 cmplwi(flag, top, LockStack::end_offset() - 1);
2400 bgt(flag, slow_path);
2401
2402 // The underflow check is elided. The recursive check will always fail
2403 // when the lock stack is empty because of the _bad_oop_sentinel field.
2404
2405 // Check if recursive.
2406 subi(t, top, oopSize);
2407 ldx(t, R16_thread, t);
2408 cmpd(flag, obj, t);
2409 beq(flag, push);
2410
2411 // Check for monitor (0b10) or locked (0b00).
2412 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2413 andi_(t, mark, markWord::lock_mask_in_place);
2414 cmpldi(flag, t, markWord::unlocked_value);
2415 bgt(flag, inflated);
2416 bne(flag, slow_path);
2417
2418 // Not inflated.
2419
2420 // Try to lock. Transition lock bits 0b01 => 0b00
2421 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a lea");
2422 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow_path, MacroAssembler::MemBarAcq);
2423
2424 bind(push);
2425 // After successful lock, push object on lock-stack.
2426 stdx(obj, R16_thread, top);
2427 addi(top, top, oopSize);
2428 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2429 b(locked);
2430 }
2431
2432 { // Handle inflated monitor.
2433 bind(inflated);
2434
2435 // mark contains the tagged ObjectMonitor*.
2436 const Register tagged_monitor = mark;
2437 const uintptr_t monitor_tag = markWord::monitor_value;
2438 const Register owner_addr = tmp2;
2439
2440 // Compute owner address.
2441 addi(owner_addr, tagged_monitor, in_bytes(ObjectMonitor::owner_offset()) - monitor_tag);
2442
2443 // CAS owner (null => current thread).
2444 cmpxchgd(/*flag=*/flag,
2445 /*current_value=*/t,
2446 /*compare_value=*/(intptr_t)0,
2447 /*exchange_value=*/R16_thread,
2448 /*where=*/owner_addr,
2449 MacroAssembler::MemBarRel | MacroAssembler::MemBarAcq,
2450 MacroAssembler::cmpxchgx_hint_acquire_lock());
2451 beq(flag, locked);
2452
2453 // Check if recursive.
2454 cmpd(flag, t, R16_thread);
2455 bne(flag, slow_path);
2456
2457 // Recursive.
2458 ld(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2459 addi(tmp1, tmp1, 1);
2460 std(tmp1, in_bytes(ObjectMonitor::recursions_offset() - ObjectMonitor::owner_offset()), owner_addr);
2461 }
2462
2463 bind(locked);
2464 inc_held_monitor_count(tmp1);
2465
2466 #ifdef ASSERT
2467 // Check that locked label is reached with flag == EQ.
2468 Label flag_correct;
2469 beq(flag, flag_correct);
2470 stop("Fast Lock Flag != EQ");
2471 #endif
2472 bind(slow_path);
2473 #ifdef ASSERT
2474 // Check that slow_path label is reached with flag == NE.
2475 bne(flag, flag_correct);
2476 stop("Fast Lock Flag != NE");
2477 bind(flag_correct);
2478 #endif
2479 // C2 uses the value of flag (NE vs EQ) to determine the continuation.
2480 }
2481
2482 void MacroAssembler::compiler_fast_unlock_lightweight_object(ConditionRegister flag, Register obj, Register tmp1,
2483 Register tmp2, Register tmp3) {
2484 assert_different_registers(obj, tmp1, tmp2, tmp3);
2485 assert(flag == CCR0, "bad condition register");
2486
2487 // Handle inflated monitor.
2488 Label inflated, inflated_load_monitor;
2489 // Finish fast unlock successfully. MUST reach to with flag == EQ.
2490 Label unlocked;
2491 // Finish fast unlock unsuccessfully. MUST branch to with flag == NE.
2492 Label slow_path;
2493
2494 const Register mark = tmp1;
2495 const Register top = tmp2;
2496 const Register t = tmp3;
2497
2498 { // Lightweight unlock
2499 Label push_and_slow;
2500
2501 // Check if obj is top of lock-stack.
2502 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2503 subi(top, top, oopSize);
2504 ldx(t, R16_thread, top);
2505 cmpd(flag, obj, t);
2506 // Top of lock stack was not obj. Must be monitor.
2507 bne(flag, inflated_load_monitor);
2508
2509 // Pop lock-stack.
2510 DEBUG_ONLY(li(t, 0);)
2511 DEBUG_ONLY(stdx(t, R16_thread, top);)
2512 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2513
2514 // The underflow check is elided. The recursive check will always fail
2515 // when the lock stack is empty because of the _bad_oop_sentinel field.
2516
2517 // Check if recursive.
2518 subi(t, top, oopSize);
2519 ldx(t, R16_thread, t);
2520 cmpd(flag, obj, t);
2521 beq(flag, unlocked);
2522
2523 // Not recursive.
2524
2525 // Check for monitor (0b10).
2526 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2527 andi_(t, mark, markWord::monitor_value);
2528 bne(CCR0, inflated);
2529
2530 #ifdef ASSERT
2531 // Check header not unlocked (0b01).
2532 Label not_unlocked;
2533 andi_(t, mark, markWord::unlocked_value);
2534 beq(CCR0, not_unlocked);
2535 stop("lightweight_unlock already unlocked");
2536 bind(not_unlocked);
2537 #endif
2538
2539 // Try to unlock. Transition lock bits 0b00 => 0b01
2540 atomically_flip_locked_state(/* is_unlock */ true, obj, mark, push_and_slow, MacroAssembler::MemBarRel);
2541 b(unlocked);
2542
2543 bind(push_and_slow);
2544 // Restore lock-stack and handle the unlock in runtime.
2545 DEBUG_ONLY(stdx(obj, R16_thread, top);)
2546 addi(top, top, oopSize);
2547 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
2548 b(slow_path);
2549 }
2550
2551 { // Handle inflated monitor.
2552 bind(inflated_load_monitor);
2553 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
2554 #ifdef ASSERT
2555 andi_(t, mark, markWord::monitor_value);
2556 bne(CCR0, inflated);
2557 stop("Fast Unlock not monitor");
2558 #endif
2559
2560 bind(inflated);
2561
2562 #ifdef ASSERT
2563 Label check_done;
2564 subi(top, top, oopSize);
2565 cmplwi(CCR0, top, in_bytes(JavaThread::lock_stack_base_offset()));
2566 blt(CCR0, check_done);
2567 ldx(t, R16_thread, top);
2568 cmpd(flag, obj, t);
2569 bne(flag, inflated);
2570 stop("Fast Unlock lock on stack");
2571 bind(check_done);
2572 #endif
2573
2574 // mark contains the tagged ObjectMonitor*.
2575 const Register monitor = mark;
2576 const uintptr_t monitor_tag = markWord::monitor_value;
2577
2578 // Untag the monitor.
2579 subi(monitor, mark, monitor_tag);
2580
2581 const Register recursions = tmp2;
2582 Label not_recursive;
2583
2584 // Check if recursive.
2585 ld(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2586 addic_(recursions, recursions, -1);
2587 blt(CCR0, not_recursive);
2588
2589 // Recursive unlock.
2590 std(recursions, in_bytes(ObjectMonitor::recursions_offset()), monitor);
2591 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal);
2592 b(unlocked);
2593
2594 bind(not_recursive);
2595
2596 Label release_;
2597 const Register t2 = tmp2;
2598
2599 // Check if the entry lists are empty.
2600 ld(t, in_bytes(ObjectMonitor::EntryList_offset()), monitor);
2601 ld(t2, in_bytes(ObjectMonitor::cxq_offset()), monitor);
2602 orr(t, t, t2);
2603 cmpdi(flag, t, 0);
2604 beq(flag, release_);
2605
2606 // The owner may be anonymous and we removed the last obj entry in
2607 // the lock-stack. This loses the information about the owner.
2608 // Write the thread to the owner field so the runtime knows the owner.
2609 std(R16_thread, in_bytes(ObjectMonitor::owner_offset()), monitor);
2610 b(slow_path);
2611
2612 bind(release_);
2613 // Set owner to null.
2614 release();
2615 // t contains 0
2616 std(t, in_bytes(ObjectMonitor::owner_offset()), monitor);
2617 }
2618
2619 bind(unlocked);
2620 dec_held_monitor_count(t);
2621
2622 #ifdef ASSERT
2623 // Check that unlocked label is reached with flag == EQ.
2624 Label flag_correct;
2625 beq(flag, flag_correct);
2626 stop("Fast Lock Flag != EQ");
2627 #endif
2628 bind(slow_path);
2629 #ifdef ASSERT
2630 // Check that slow_path label is reached with flag == NE.
2631 bne(flag, flag_correct);
2632 stop("Fast Lock Flag != NE");
2633 bind(flag_correct);
2634 #endif
2635 // C2 uses the value of flag (NE vs EQ) to determine the continuation.
2636 }
2637
2638 void MacroAssembler::safepoint_poll(Label& slow_path, Register temp, bool at_return, bool in_nmethod) {
2639 ld(temp, in_bytes(JavaThread::polling_word_offset()), R16_thread);
2640
2641 if (at_return) {
2642 if (in_nmethod) {
2643 if (UseSIGTRAP) {
2644 // Use Signal Handler.
2645 relocate(relocInfo::poll_return_type);
2646 td(traptoGreaterThanUnsigned, R1_SP, temp);
2647 } else {
2648 cmpld(CCR0, R1_SP, temp);
2649 // Stub may be out of range for short conditional branch.
2650 bc_far_optimized(Assembler::bcondCRbiIs1, bi0(CCR0, Assembler::greater), slow_path);
2651 }
2652 } else { // Not in nmethod.
2653 // Frame still on stack, need to get fp.
2654 Register fp = R0;
2655 ld(fp, _abi0(callers_sp), R1_SP);
2656 cmpld(CCR0, fp, temp);
2657 bgt(CCR0, slow_path);
2658 }
2659 } else { // Normal safepoint poll. Not at return.
2660 assert(!in_nmethod, "should use load_from_polling_page");
2661 andi_(temp, temp, SafepointMechanism::poll_bit());
2662 bne(CCR0, slow_path);
2663 }
2664 }
2665
2666 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2,
2667 MacroAssembler::PreservationLevel preservation_level) {
2668 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2669 bs->resolve_jobject(this, value, tmp1, tmp2, preservation_level);
2670 }
2671
2672 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2,
2673 MacroAssembler::PreservationLevel preservation_level) {
2674 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
2675 bs->resolve_global_jobject(this, value, tmp1, tmp2, preservation_level);
2676 }
2677
2678 // Values for last_Java_pc, and last_Java_sp must comply to the rules
2679 // in frame_ppc.hpp.
2680 void MacroAssembler::set_last_Java_frame(Register last_Java_sp, Register last_Java_pc) {
2681 // Always set last_Java_pc and flags first because once last_Java_sp
2682 // is visible, has_last_Java_frame is true and users will look at the
2683 // rest of the fields. (Note: flags should always be zero before we
2684 // get here so it doesn't need to be set.)
2685
2686 // Verify that last_Java_pc was zeroed on return to Java
2687 asm_assert_mem8_is_zero(in_bytes(JavaThread::last_Java_pc_offset()), R16_thread,
2688 "last_Java_pc not zeroed before leaving Java");
2689
2690 // When returning from calling out from Java mode the frame anchor's
2691 // last_Java_pc will always be set to null. It is set here so that
2692 // if we are doing a call to native (not VM) that we capture the
2693 // known pc and don't have to rely on the native call having a
2694 // standard frame linkage where we can find the pc.
2695 if (last_Java_pc != noreg)
2696 std(last_Java_pc, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2697
2698 // Set last_Java_sp last.
2699 std(last_Java_sp, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2700 }
2701
2702 void MacroAssembler::reset_last_Java_frame(void) {
2703 asm_assert_mem8_isnot_zero(in_bytes(JavaThread::last_Java_sp_offset()),
2704 R16_thread, "SP was not set, still zero");
2705
2706 BLOCK_COMMENT("reset_last_Java_frame {");
2707 li(R0, 0);
2708
2709 // _last_Java_sp = 0
2710 std(R0, in_bytes(JavaThread::last_Java_sp_offset()), R16_thread);
2711
2712 // _last_Java_pc = 0
2713 std(R0, in_bytes(JavaThread::last_Java_pc_offset()), R16_thread);
2714 BLOCK_COMMENT("} reset_last_Java_frame");
2715 }
2716
2717 void MacroAssembler::set_top_ijava_frame_at_SP_as_last_Java_frame(Register sp, Register tmp1) {
2718 assert_different_registers(sp, tmp1);
2719
2720 // sp points to a TOP_IJAVA_FRAME, retrieve frame's PC via
2721 // TOP_IJAVA_FRAME_ABI.
2722 // FIXME: assert that we really have a TOP_IJAVA_FRAME here!
2723 address entry = pc();
2724 load_const_optimized(tmp1, entry);
2725
2726 set_last_Java_frame(/*sp=*/sp, /*pc=*/tmp1);
2727 }
2728
2729 void MacroAssembler::get_vm_result(Register oop_result) {
2730 // Read:
2731 // R16_thread
2732 // R16_thread->in_bytes(JavaThread::vm_result_offset())
2733 //
2734 // Updated:
2735 // oop_result
2736 // R16_thread->in_bytes(JavaThread::vm_result_offset())
2737
2738 ld(oop_result, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2739 li(R0, 0);
2740 std(R0, in_bytes(JavaThread::vm_result_offset()), R16_thread);
2741
2742 verify_oop(oop_result, FILE_AND_LINE);
2743 }
2744
2745 void MacroAssembler::get_vm_result_2(Register metadata_result) {
2746 // Read:
2747 // R16_thread
2748 // R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2749 //
2750 // Updated:
2751 // metadata_result
2752 // R16_thread->in_bytes(JavaThread::vm_result_2_offset())
2753
2754 ld(metadata_result, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2755 li(R0, 0);
2756 std(R0, in_bytes(JavaThread::vm_result_2_offset()), R16_thread);
2757 }
2758
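// Compressed klass pointers (sketch): narrowKlass = (Klass* - base) >> shift, with
// base and shift taken from CompressedKlassPointers; decode_klass_not_null() below
// applies the inverse: Klass* = (narrowKlass << shift) + base.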
2759 Register MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2760 Register current = (src != noreg) ? src : dst; // Klass is in dst if no src provided.
2761 if (CompressedKlassPointers::base() != 0) {
2762 // Use dst as temp if it is free.
2763 sub_const_optimized(dst, current, CompressedKlassPointers::base(), R0);
2764 current = dst;
2765 }
2766 if (CompressedKlassPointers::shift() != 0) {
2767 srdi(dst, current, CompressedKlassPointers::shift());
2768 current = dst;
2769 }
2770 return current;
2771 }
2772
2773 void MacroAssembler::store_klass(Register dst_oop, Register klass, Register ck) {
2774 if (UseCompressedClassPointers) {
2775 Register compressedKlass = encode_klass_not_null(ck, klass);
2776 stw(compressedKlass, oopDesc::klass_offset_in_bytes(), dst_oop);
2777 } else {
2778 std(klass, oopDesc::klass_offset_in_bytes(), dst_oop);
2779 }
2780 }
2781
2782 void MacroAssembler::store_klass_gap(Register dst_oop, Register val) {
2783 if (UseCompressedClassPointers) {
2784 if (val == noreg) {
2785 val = R0;
2786 li(val, 0);
2787 }
2788 stw(val, oopDesc::klass_gap_offset_in_bytes(), dst_oop); // klass gap if compressed
2789 }
2790 }
2791
2792 int MacroAssembler::instr_size_for_decode_klass_not_null() {
2793 static int computed_size = -1;
2794
2795 // Not yet computed?
2796 if (computed_size == -1) {
2797
2798 if (!UseCompressedClassPointers) {
2799 computed_size = 0;
2800 } else {
2801 // Determine by scratch emit.
2802 ResourceMark rm;
2803 int code_size = 8 * BytesPerInstWord;
2804 CodeBuffer cb("decode_klass_not_null scratch buffer", code_size, 0);
2805 MacroAssembler* a = new MacroAssembler(&cb);
2806 a->decode_klass_not_null(R11_scratch1);
2807 computed_size = a->offset();
2808 }
2809 }
2810
2811 return computed_size;
2812 }
2813
2814 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
2815 assert(dst != R0, "Dst reg may not be R0, as R0 is used here.");
2816 if (src == noreg) src = dst;
2817 Register shifted_src = src;
2818 if (CompressedKlassPointers::shift() != 0 ||
2819 (CompressedKlassPointers::base() == 0 && src != dst)) { // Move required.
2820 shifted_src = dst;
2821 sldi(shifted_src, src, CompressedKlassPointers::shift());
2822 }
2823 if (CompressedKlassPointers::base() != 0) {
2824 add_const_optimized(dst, shifted_src, CompressedKlassPointers::base(), R0);
2825 }
2826 }
2827
2828 void MacroAssembler::load_klass(Register dst, Register src) {
2829 if (UseCompressedClassPointers) {
2830 lwz(dst, oopDesc::klass_offset_in_bytes(), src);
2831 // Attention: no null check here!
2832 decode_klass_not_null(dst, dst);
2833 } else {
2834 ld(dst, oopDesc::klass_offset_in_bytes(), src);
2835 }
2836 }
2837
2838 void MacroAssembler::load_klass_check_null(Register dst, Register src, Label* is_null) {
2839 null_check(src, oopDesc::klass_offset_in_bytes(), is_null);
2840 load_klass(dst, src);
2841 }
2842
2843 // ((OopHandle)result).resolve();
2844 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2,
2845 MacroAssembler::PreservationLevel preservation_level) {
2846 access_load_at(T_OBJECT, IN_NATIVE, result, noreg, result, tmp1, tmp2, preservation_level);
2847 }
2848
2849 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2,
2850 MacroAssembler::PreservationLevel preservation_level) {
2851 Label resolved;
2852
2853 // A null weak handle resolves to null.
2854 cmpdi(CCR0, result, 0);
2855 beq(CCR0, resolved);
2856
2857 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, result, noreg, result, tmp1, tmp2,
2858 preservation_level);
2859 bind(resolved);
2860 }
2861
2862 void MacroAssembler::load_method_holder(Register holder, Register method) {
2863 ld(holder, in_bytes(Method::const_offset()), method);
2864 ld(holder, in_bytes(ConstMethod::constants_offset()), holder);
2865 ld(holder, ConstantPool::pool_holder_offset(), holder);
2866 }
2867
2868 // Clear Array
2869 // For very short arrays. tmp == R0 is allowed.
2870 void MacroAssembler::clear_memory_unrolled(Register base_ptr, int cnt_dwords, Register tmp, int offset) {
2871 if (cnt_dwords > 0) { li(tmp, 0); }
2872 for (int i = 0; i < cnt_dwords; ++i) { std(tmp, offset + i * 8, base_ptr); }
2873 }
2874
2875 // Version for constant short array length. Kills base_ptr. tmp == R0 is allowed.
2876 void MacroAssembler::clear_memory_constlen(Register base_ptr, int cnt_dwords, Register tmp) {
2877 if (cnt_dwords < 8) {
2878 clear_memory_unrolled(base_ptr, cnt_dwords, tmp);
2879 return;
2880 }
2881
2882 Label loop;
2883 const long loopcnt = cnt_dwords >> 1,
2884 remainder = cnt_dwords & 1;
2885
2886 li(tmp, loopcnt);
2887 mtctr(tmp);
2888 li(tmp, 0);
2889 bind(loop);
2890 std(tmp, 0, base_ptr);
2891 std(tmp, 8, base_ptr);
2892 addi(base_ptr, base_ptr, 16);
2893 bdnz(loop);
2894 if (remainder) { std(tmp, 0, base_ptr); }
2895 }
2896
2897 // Kills both input registers. tmp == R0 is allowed.
2898 void MacroAssembler::clear_memory_doubleword(Register base_ptr, Register cnt_dwords, Register tmp, long const_cnt) {
2899 // Procedure for large arrays (uses data cache block zero instruction).
2900 Label startloop, fast, fastloop, small_rest, restloop, done;
2901 const int cl_size = VM_Version::L1_data_cache_line_size(),
2902 cl_dwords = cl_size >> 3,
2903 cl_dw_addr_bits = exact_log2(cl_dwords),
2904 dcbz_min = 1, // Min count of dcbz executions, needs to be >0.
2905 min_cnt = ((dcbz_min + 1) << cl_dw_addr_bits) - 1;
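// min_cnt guarantees that, even after aligning base_ptr up to the next cache line
// (which consumes at most cl_dwords - 1 dwords), at least dcbz_min full cache
// lines remain to be cleared with dcbz.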
2906
2907 if (const_cnt >= 0) {
2908 // Constant case.
2909 if (const_cnt < min_cnt) {
2910 clear_memory_constlen(base_ptr, const_cnt, tmp);
2911 return;
2912 }
2913 load_const_optimized(cnt_dwords, const_cnt, tmp);
2914 } else {
2915 // cnt_dwords already loaded in register. Need to check size.
2916 cmpdi(CCR1, cnt_dwords, min_cnt); // Big enough? (ensure >= dcbz_min lines included).
2917 blt(CCR1, small_rest);
2918 }
2919 rldicl_(tmp, base_ptr, 64-3, 64-cl_dw_addr_bits); // Extract dword offset within first cache line.
2920 beq(CCR0, fast); // Already 128byte aligned.
2921
2922 subfic(tmp, tmp, cl_dwords);
2923 mtctr(tmp); // Set ctr to hit 128byte boundary (0<ctr<cl_dwords).
2924 subf(cnt_dwords, tmp, cnt_dwords); // rest.
2925 li(tmp, 0);
2926
2927 bind(startloop); // Clear at the beginning to reach 128byte boundary.
2928 std(tmp, 0, base_ptr); // Clear 8byte aligned block.
2929 addi(base_ptr, base_ptr, 8);
2930 bdnz(startloop);
2931
2932 bind(fast); // Clear 128byte blocks.
2933 srdi(tmp, cnt_dwords, cl_dw_addr_bits); // Loop count for 128byte loop (>0).
2934 andi(cnt_dwords, cnt_dwords, cl_dwords-1); // Rest in dwords.
2935 mtctr(tmp); // Load counter.
2936
2937 bind(fastloop);
2938 dcbz(base_ptr); // Clear 128byte aligned block.
2939 addi(base_ptr, base_ptr, cl_size);
2940 bdnz(fastloop);
2941
2942 bind(small_rest);
2943 cmpdi(CCR0, cnt_dwords, 0); // size 0?
2944 beq(CCR0, done); // rest == 0
2945 li(tmp, 0);
2946 mtctr(cnt_dwords); // Load counter.
2947
2948 bind(restloop); // Clear rest.
2949 std(tmp, 0, base_ptr); // Clear 8byte aligned block.
2950 addi(base_ptr, base_ptr, 8);
2951 bdnz(restloop);
2952
2953 bind(done);
2954 }
2955
2956 /////////////////////////////////////////// String intrinsics ////////////////////////////////////////////
2957
2958 // Helpers for Intrinsic Emitters
2959 //
2960 // Revert the byte order of a 32bit value in a register
2961 // src: 0x44556677
2962 // dst: 0x77665544
2963 // Three steps to obtain the result:
2964 // 1) Rotate src (as doubleword) left 5 bytes. That puts the leftmost byte of the src word
2965 // into the rightmost byte position. Afterwards, everything left of the rightmost byte is cleared.
2966 // This value initializes dst.
2967 // 2) Rotate src (as word) left 3 bytes. That puts the rightmost byte of the src word into the leftmost
2968 // byte position. Furthermore, byte 5 is rotated into byte 6 position where it is supposed to go.
2969 // This value is mask inserted into dst with a [0..23] mask of 1s.
2970 // 3) Rotate src (as word) left 1 byte. That puts byte 6 into byte 5 position.
2971 // This value is mask inserted into dst with a [8..15] mask of 1s.
2972 void MacroAssembler::load_reverse_32(Register dst, Register src) {
2973 assert_different_registers(dst, src);
2974
2975 rldicl(dst, src, (4+1)*8, 56); // Rotate byte 4 into position 7 (rightmost), clear all to the left.
2976 rlwimi(dst, src, 3*8, 0, 23); // Insert byte 5 into position 6, 7 into 4, leave pos 7 alone.
2977 rlwimi(dst, src, 1*8, 8, 15); // Insert byte 6 into position 5, leave the rest alone.
2978 }
2979
2980 // Calculate the column addresses of the crc32 lookup table into distinct registers.
2981 // This loop-invariant calculation is moved out of the loop body, reducing the loop
2982 // body size from 20 to 16 instructions.
2983 // Returns the offset that was used to calculate the address of column tc3.
2984 // Due to register shortage, setting tc3 may overwrite table. With the return offset
2985 // at hand, the original table address can be easily reconstructed.
2986 int MacroAssembler::crc32_table_columns(Register table, Register tc0, Register tc1, Register tc2, Register tc3) {
2987 assert(!VM_Version::has_vpmsumb(), "Vector version should be used instead!");
2988
2989 // Point to 4 byte folding tables (byte-reversed version for Big Endian)
2990 // Layout: See StubRoutines::ppc::generate_crc_constants.
2991 #ifdef VM_LITTLE_ENDIAN
2992 const int ix0 = 3 * CRC32_TABLE_SIZE;
2993 const int ix1 = 2 * CRC32_TABLE_SIZE;
2994 const int ix2 = 1 * CRC32_TABLE_SIZE;
2995 const int ix3 = 0 * CRC32_TABLE_SIZE;
2996 #else
2997 const int ix0 = 1 * CRC32_TABLE_SIZE;
2998 const int ix1 = 2 * CRC32_TABLE_SIZE;
2999 const int ix2 = 3 * CRC32_TABLE_SIZE;
3000 const int ix3 = 4 * CRC32_TABLE_SIZE;
3001 #endif
3002 assert_different_registers(table, tc0, tc1, tc2);
3003 assert(table == tc3, "must be!");
3004
3005 addi(tc0, table, ix0);
3006 addi(tc1, table, ix1);
3007 addi(tc2, table, ix2);
3008 if (ix3 != 0) addi(tc3, table, ix3);
3009
3010 return ix3;
3011 }
3012
3013 /**
3014 * uint32_t crc;
3015 * table[crc & 0xFF] ^ (crc >> 8);
3016 */
3017 void MacroAssembler::fold_byte_crc32(Register crc, Register val, Register table, Register tmp) {
3018 assert_different_registers(crc, table, tmp);
3019 assert_different_registers(val, table);
3020
3021 if (crc == val) { // Must rotate first to use the unmodified value.
3022 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3023 // As we use a word (4-byte) instruction, we have to adapt the mask bit positions.
3024 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3025 } else {
3026 srwi(crc, crc, 8); // Unsigned shift, clear leftmost 8 bits.
3027 rlwinm(tmp, val, 2, 24-2, 31-2); // Insert (rightmost) byte 7 of val, shifted left by 2, into byte 6..7 of tmp, clear the rest.
3028 }
3029 lwzx(tmp, table, tmp);
3030 xorr(crc, crc, tmp);
3031 }
3032
3033 /**
3034 * Emits code to update CRC-32 with a byte value according to constants in table.
3035 *
3036 * @param [in,out]crc Register containing the crc.
3037 * @param [in]val Register containing the byte to fold into the CRC.
3038 * @param [in]table Register containing the table of crc constants.
3039 *
3040 * uint32_t crc;
3041 * val = crc_table[(val ^ crc) & 0xFF];
3042 * crc = val ^ (crc >> 8);
3043 */
3044 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3045 BLOCK_COMMENT("update_byte_crc32:");
3046 xorr(val, val, crc);
3047 fold_byte_crc32(crc, val, table, val);
3048 }
3049
3050 /**
3051 * @param crc register containing existing CRC (32-bit)
3052 * @param buf register pointing to input byte buffer (byte*)
3053 * @param len register containing number of bytes
3054 * @param table register pointing to CRC table
3055 */
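// Equivalent C sketch of the emitted byte loop (illustrative only; the emitted code
// scales the table index by 4 because the table entries are 32 bits wide):
//   while (len-- > 0) {
//     crc = table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);
//   }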
3056 void MacroAssembler::update_byteLoop_crc32(Register crc, Register buf, Register len, Register table,
3057 Register data, bool loopAlignment) {
3058 assert_different_registers(crc, buf, len, table, data);
3059
3060 Label L_mainLoop, L_done;
3061 const int mainLoop_stepping = 1;
3062 const int mainLoop_alignment = loopAlignment ? 32 : 4; // (InputForNewCode > 4 ? InputForNewCode : 32) : 4;
3063
3064 // Process all bytes in a single-byte loop.
3065 clrldi_(len, len, 32); // Enforce 32 bit. Anything to do?
3066 beq(CCR0, L_done);
3067
3068 mtctr(len);
3069 align(mainLoop_alignment);
3070 BIND(L_mainLoop);
3071 lbz(data, 0, buf); // Byte from buffer, zero-extended.
3072 addi(buf, buf, mainLoop_stepping); // Advance buffer position.
3073 update_byte_crc32(crc, data, table);
3074 bdnz(L_mainLoop); // Iterate.
3075
3076 bind(L_done);
3077 }
3078
3079 /**
3080 * Emits code to update CRC-32 with a 4-byte value according to constants in table
3081 * Implementation according to jdk/src/share/native/java/util/zip/zlib-1.2.8/crc32.c
3082 */
3083 // A note on the lookup table address(es):
3084 // The implementation uses 4 table columns (byte-reversed versions for Big Endian).
3085 // To save the effort of adding the column offset to the table address each time
3086 // a table element is looked up, it is possible to pass the pre-calculated
3087 // column addresses.
3088 // Uses R9..R12 as work registers. They must be saved/restored by the caller, if necessary.
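// Equivalent C sketch (slicing-by-4, illustrative only; tc0..tc3 are the column base
// addresses computed by crc32_table_columns, viewed as uint32_t*, and
// next_4_buffer_bytes is a placeholder for the word loaded from the buffer):
//   uint32_t w = crc ^ next_4_buffer_bytes;   // byte-reversed tables handle Big Endian
//   crc = tc0[ w        & 0xff] ^ tc1[(w >>  8) & 0xff]
//       ^ tc2[(w >> 16) & 0xff] ^ tc3[(w >> 24) & 0xff];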
3089 void MacroAssembler::update_1word_crc32(Register crc, Register buf, Register table, int bufDisp, int bufInc,
3090 Register t0, Register t1, Register t2, Register t3,
3091 Register tc0, Register tc1, Register tc2, Register tc3) {
3092 assert_different_registers(crc, t3);
3093
3094 // XOR crc with next four bytes of buffer.
3095 lwz(t3, bufDisp, buf);
3096 if (bufInc != 0) {
3097 addi(buf, buf, bufInc);
3098 }
3099 xorr(t3, t3, crc);
3100
3101 // Chop the combined value (crc ^ data word) into 4 single-byte pieces, each shifted left by 2 bits, to form the table indices.
3102 rlwinm(t0, t3, 2, 24-2, 31-2); // ((t3 >> 0) & 0xff) << 2
3103 rlwinm(t1, t3, 32+(2- 8), 24-2, 31-2); // ((t3 >> 8) & 0xff) << 2
3104 rlwinm(t2, t3, 32+(2-16), 24-2, 31-2); // ((t3 >> 16) & 0xff) << 2
3105 rlwinm(t3, t3, 32+(2-24), 24-2, 31-2); // ((t3 >> 24) & 0xff) << 2
3106
3107 // Use the pre-calculated column addresses.
3108 // Load pre-calculated table values.
3109 lwzx(t0, tc0, t0);
3110 lwzx(t1, tc1, t1);
3111 lwzx(t2, tc2, t2);
3112 lwzx(t3, tc3, t3);
3113
3114 // Calculate new crc from table values.
3115 xorr(t0, t0, t1);
3116 xorr(t2, t2, t3);
3117 xorr(crc, t0, t2); // Now crc contains the final checksum value.
3118 }
3119
3120 /**
3121 * @param crc register containing existing CRC (32-bit)
3122 * @param buf register pointing to input byte buffer (byte*)
3123 * @param len register containing number of bytes
3124 * @param table register pointing to CRC table
3125 *
3126 * Uses R9..R12 as work registers. They must be saved/restored by the caller!
3127 */
3128 void MacroAssembler::kernel_crc32_1word(Register crc, Register buf, Register len, Register table,
3129 Register t0, Register t1, Register t2, Register t3,
3130 Register tc0, Register tc1, Register tc2, Register tc3,
3131 bool invertCRC) {
3132 assert_different_registers(crc, buf, len, table);
3133
3134 Label L_mainLoop, L_tail;
3135 Register tmp = t0;
3136 Register data = t0;
3137 Register tmp2 = t1;
3138 const int mainLoop_stepping = 4;
3139 const int tailLoop_stepping = 1;
3140 const int log_stepping = exact_log2(mainLoop_stepping);
3141 const int mainLoop_alignment = 32; // InputForNewCode > 4 ? InputForNewCode : 32;
3142 const int complexThreshold = 2*mainLoop_stepping;
3143
3144 // Don't test for len <= 0 here. This pathological case should not occur anyway.
3145 // Optimizing for it by adding a test and a branch seems to be a waste of CPU cycles
3146 // for all well-behaved cases. The situation itself is detected and handled correctly
3147 // within update_byteLoop_crc32.
3148 assert(tailLoop_stepping == 1, "check tailLoop_stepping!");
3149
3150 BLOCK_COMMENT("kernel_crc32_1word {");
3151
3152 if (invertCRC) {
3153 nand(crc, crc, crc); // 1s complement of crc
3154 }
3155
3156 // Check for short (<mainLoop_stepping) buffer.
3157 cmpdi(CCR0, len, complexThreshold);
3158 blt(CCR0, L_tail);
3159
3160 // Pre-mainLoop alignment did show a slight (1%) positive effect on performance.
3161 // We leave the code in for reference. Maybe we need alignment when we exploit vector instructions.
3162 {
3163 // Align buf addr to mainLoop_stepping boundary.
3164 neg(tmp2, buf); // Calculate # preLoop iterations for alignment.
3165 rldicl(tmp2, tmp2, 0, 64-log_stepping); // No rotation; keep only the low log_stepping bits (mask with 1s in bits 62..63).
3166
3167 if (complexThreshold > mainLoop_stepping) {
3168 sub(len, len, tmp2); // Remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3169 } else {
3170 sub(tmp, len, tmp2); // Remaining bytes for main loop.
3171 cmpdi(CCR0, tmp, mainLoop_stepping);
3172 blt(CCR0, L_tail); // If less than one mainLoop_stepping is left, do only tail processing.
3173 mr(len, tmp); // remaining bytes for main loop (>=mainLoop_stepping is guaranteed).
3174 }
3175 update_byteLoop_crc32(crc, buf, tmp2, table, data, false);
3176 }
3177
3178 srdi(tmp2, len, log_stepping); // #iterations for mainLoop
3179 andi(len, len, mainLoop_stepping-1); // remaining bytes for tailLoop
3180 mtctr(tmp2);
3181
3182 #ifdef VM_LITTLE_ENDIAN
3183 Register crc_rv = crc;
3184 #else
3185 Register crc_rv = tmp; // Load_reverse needs separate registers to work on.
3186 // Occupies tmp, but frees up crc.
3187 load_reverse_32(crc_rv, crc); // Revert byte order because we are dealing with big-endian data.
3188 tmp = crc;
3189 #endif
3190
3191 int reconstructTableOffset = crc32_table_columns(table, tc0, tc1, tc2, tc3);
3192
3193 align(mainLoop_alignment); // Octoword-aligned loop address. Shows 2% improvement.
3194 BIND(L_mainLoop);
3195 update_1word_crc32(crc_rv, buf, table, 0, mainLoop_stepping, crc_rv, t1, t2, t3, tc0, tc1, tc2, tc3);
3196 bdnz(L_mainLoop);
3197
3198 #ifndef VM_LITTLE_ENDIAN
3199 load_reverse_32(crc, crc_rv); // Revert byte order because we are dealing with big-endian data.
3200 tmp = crc_rv; // tmp uses its original register again.
3201 #endif
3202
3203 // Restore original table address for tailLoop.
3204 if (reconstructTableOffset != 0) {
3205 addi(table, table, -reconstructTableOffset);
3206 }
3207
3208 // Process last few (<complexThreshold) bytes of buffer.
3209 BIND(L_tail);
3210 update_byteLoop_crc32(crc, buf, len, table, data, false);
3211
3212 if (invertCRC) {
3213 nand(crc, crc, crc); // 1s complement of crc
3214 }
3215 BLOCK_COMMENT("} kernel_crc32_1word");
3216 }
3217
3218 /**
3219 * @param crc register containing existing CRC (32-bit)
3220 * @param buf register pointing to input byte buffer (byte*)
3221 * @param len register containing number of bytes
3222 * @param constants register pointing to precomputed constants
3223 * @param t0-t6 temp registers
3224 */
3225 void MacroAssembler::kernel_crc32_vpmsum(Register crc, Register buf, Register len, Register constants,
3226 Register t0, Register t1, Register t2, Register t3,
3227 Register t4, Register t5, Register t6, bool invertCRC) {
3228 assert_different_registers(crc, buf, len, constants);
3229
3230 Label L_tail;
3231
3232 BLOCK_COMMENT("kernel_crc32_vpmsum {");
3233
3234 if (invertCRC) {
3235 nand(crc, crc, crc); // 1s complement of crc
3236 }
3237
3238 // Enforce 32 bit.
3239 clrldi(len, len, 32);
3240
3241 // Align if we have enough bytes for the fast version.
3242 const int alignment = 16,
3243 threshold = 32;
3244 Register prealign = t0;
3245
3246 neg(prealign, buf);
3247 addi(t1, len, -threshold);
3248 andi(prealign, prealign, alignment - 1);
3249 cmpw(CCR0, t1, prealign);
3250 blt(CCR0, L_tail); // len - prealign < threshold?
3251
3252 subf(len, prealign, len);
3253 update_byteLoop_crc32(crc, buf, prealign, constants, t2, false);
3254
3255 // Calculate from first aligned address as far as possible.
3256 addi(constants, constants, CRC32_TABLE_SIZE); // Point to vector constants.
3257 kernel_crc32_vpmsum_aligned(crc, buf, len, constants, t0, t1, t2, t3, t4, t5, t6);
3258 addi(constants, constants, -CRC32_TABLE_SIZE); // Point to table again.
3259
3260 // Remaining bytes.
3261 BIND(L_tail);
3262 update_byteLoop_crc32(crc, buf, len, constants, t2, false);
3263
3264 if (invertCRC) {
3265 nand(crc, crc, crc); // 1s complement of crc
3266 }
3267
3268 BLOCK_COMMENT("} kernel_crc32_vpmsum");
3269 }
3270
3271 /**
3272 * @param crc register containing existing CRC (32-bit)
3273 * @param buf register pointing to input byte buffer (byte*)
3274 * @param len register containing number of bytes (will get updated to remaining bytes)
3275 * @param constants register pointing to CRC table for 128-bit aligned memory
3276 * @param t0-t6 temp registers
3277 */
3278 void MacroAssembler::kernel_crc32_vpmsum_aligned(Register crc, Register buf, Register len, Register constants,
3279 Register t0, Register t1, Register t2, Register t3, Register t4, Register t5, Register t6) {
3280
3281 // Save non-volatile vector registers (frameless).
3282 Register offset = t1;
3283 int offsetInt = 0;
3284 offsetInt -= 16; li(offset, offsetInt); stvx(VR20, offset, R1_SP);
3285 offsetInt -= 16; li(offset, offsetInt); stvx(VR21, offset, R1_SP);
3286 offsetInt -= 16; li(offset, offsetInt); stvx(VR22, offset, R1_SP);
3287 offsetInt -= 16; li(offset, offsetInt); stvx(VR23, offset, R1_SP);
3288 offsetInt -= 16; li(offset, offsetInt); stvx(VR24, offset, R1_SP);
3289 offsetInt -= 16; li(offset, offsetInt); stvx(VR25, offset, R1_SP);
3290 #ifndef VM_LITTLE_ENDIAN
3291 offsetInt -= 16; li(offset, offsetInt); stvx(VR26, offset, R1_SP);
3292 #endif
3293 offsetInt -= 8; std(R14, offsetInt, R1_SP);
3294 offsetInt -= 8; std(R15, offsetInt, R1_SP);
3295
3296 // The implementation uses an inner loop which processes between 256 and 16 * unroll_factor
3297 // bytes per iteration. The basic scheme is:
3298 // lvx: load vector (Big Endian needs reversal)
3299 // vpmsumw: carry-less 32 bit multiplications with constant representing a large CRC shift
3300 // vxor: xor partial results together to get unroll_factor2 vectors
3301
3302 // Outer loop performs the CRC shifts needed to combine the unroll_factor2 vectors.
3303
3304 // Using 16 * unroll_factor / unroll_factor_2 bytes for constants.
3305 const int unroll_factor = CRC32_UNROLL_FACTOR,
3306 unroll_factor2 = CRC32_UNROLL_FACTOR2;
3307
3308 const int outer_consts_size = (unroll_factor2 - 1) * 16,
3309 inner_consts_size = (unroll_factor / unroll_factor2) * 16;
3310
3311 // Support registers.
3312 Register offs[] = { noreg, t0, t1, t2, t3, t4, t5, t6 };
3313 Register num_bytes = R14,
3314 loop_count = R15,
3315 cur_const = crc; // will live in VCRC
3316 // Constant array for outer loop: unroll_factor2 - 1 registers,
3317 // Constant array for inner loop: unroll_factor / unroll_factor2 registers.
3318 VectorRegister consts0[] = { VR16, VR17, VR18, VR19, VR20, VR21, VR22 },
3319 consts1[] = { VR23, VR24 };
3320 // Data register arrays: 2 arrays with unroll_factor2 registers.
3321 VectorRegister data0[] = { VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7 },
3322 data1[] = { VR8, VR9, VR10, VR11, VR12, VR13, VR14, VR15 };
3323
3324 VectorRegister VCRC = data0[0];
3325 VectorRegister Vc = VR25;
3326 VectorRegister swap_bytes = VR26; // Only for Big Endian.
3327
3328 // We have at least 1 iteration (ensured by caller).
3329 Label L_outer_loop, L_inner_loop, L_last;
3330
3331 // If supported set DSCR pre-fetch to deepest.
3332 if (VM_Version::has_mfdscr()) {
3333 load_const_optimized(t0, VM_Version::_dscr_val | 7);
3334 mtdscr(t0);
3335 }
3336
3337 mtvrwz(VCRC, crc); // crc lives in VCRC, now
3338
3339 for (int i = 1; i < unroll_factor2; ++i) {
3340 li(offs[i], 16 * i);
3341 }
3342
3343 // Load consts for outer loop
3344 lvx(consts0[0], constants);
3345 for (int i = 1; i < unroll_factor2 - 1; ++i) {
3346 lvx(consts0[i], offs[i], constants);
3347 }
3348
3349 load_const_optimized(num_bytes, 16 * unroll_factor);
3350
3351 // Reuse data registers outside of the loop.
3352 VectorRegister Vtmp = data1[0];
3353 VectorRegister Vtmp2 = data1[1];
3354 VectorRegister zeroes = data1[2];
3355
3356 vspltisb(Vtmp, 0);
3357 vsldoi(VCRC, Vtmp, VCRC, 8); // 96 bit zeroes, 32 bit CRC.
3358
3359 // Load vector for vpermxor (to xor both 64 bit parts together)
3360 lvsl(Vtmp, buf); // 000102030405060708090a0b0c0d0e0f
3361 vspltisb(Vc, 4);
3362 vsl(Vc, Vtmp, Vc); // 00102030405060708090a0b0c0d0e0f0
3363 xxspltd(Vc->to_vsr(), Vc->to_vsr(), 0);
3364 vor(Vc, Vtmp, Vc); // 001122334455667708192a3b4c5d6e7f
3365
3366 #ifdef VM_LITTLE_ENDIAN
3367 #define BE_swap_bytes(x)
3368 #else
3369 vspltisb(Vtmp2, 0xf);
3370 vxor(swap_bytes, Vtmp, Vtmp2);
3371 #define BE_swap_bytes(x) vperm(x, x, x, swap_bytes)
3372 #endif
3373
3374 cmpd(CCR0, len, num_bytes);
3375 blt(CCR0, L_last);
3376
3377 addi(cur_const, constants, outer_consts_size); // Point to consts for inner loop
3378 load_const_optimized(loop_count, unroll_factor / (2 * unroll_factor2) - 1); // One double-iteration peeled off.
3379
3380 // ********** Main loop start **********
3381 align(32);
3382 bind(L_outer_loop);
3383
3384 // Begin of unrolled first iteration (no xor).
3385 lvx(data1[0], buf);
3386 for (int i = 1; i < unroll_factor2 / 2; ++i) {
3387 lvx(data1[i], offs[i], buf);
3388 }
3389 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3390 lvx(consts1[0], cur_const);
3391 mtctr(loop_count);
3392 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3393 BE_swap_bytes(data1[i]);
3394 if (i == 0) { vxor(data1[0], data1[0], VCRC); } // xor in previous CRC.
3395 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3396 vpmsumw(data0[i], data1[i], consts1[0]);
3397 }
3398 addi(buf, buf, 16 * unroll_factor2);
3399 subf(len, num_bytes, len);
3400 lvx(consts1[1], offs[1], cur_const);
3401 addi(cur_const, cur_const, 32);
3402 // Begin of unrolled second iteration (head).
3403 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3404 BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3405 if (i == 0) { lvx(data1[0], buf); } else { lvx(data1[i], offs[i], buf); }
3406 vpmsumw(data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[0]);
3407 }
3408 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3409 BE_swap_bytes(data1[i]);
3410 lvx(data1[i + unroll_factor2 / 2], offs[i + unroll_factor2 / 2], buf);
3411 vpmsumw(data1[i], data1[i], consts1[1]);
3412 }
3413 addi(buf, buf, 16 * unroll_factor2);
3414
3415 // Generate the most performance-relevant code. The loads and half of the vpmsumw instructions have already been generated.
3416 // Double-iteration allows using the 2 constant registers alternately.
3417 align(32);
3418 bind(L_inner_loop);
3419 for (int j = 1; j < 3; ++j) { // j < unroll_factor / unroll_factor2 - 1 for complete unrolling.
3420 if (j & 1) {
3421 lvx(consts1[0], cur_const);
3422 } else {
3423 lvx(consts1[1], offs[1], cur_const);
3424 addi(cur_const, cur_const, 32);
3425 }
3426 for (int i = 0; i < unroll_factor2; ++i) {
3427 int idx = i + unroll_factor2 / 2, inc = 0; // For modulo-scheduled input.
3428 if (idx >= unroll_factor2) { idx -= unroll_factor2; inc = 1; }
3429 BE_swap_bytes(data1[idx]);
3430 vxor(data0[i], data0[i], data1[i]);
3431 if (i == 0) lvx(data1[0], buf); else lvx(data1[i], offs[i], buf);
3432 vpmsumw(data1[idx], data1[idx], consts1[(j + inc) & 1]);
3433 }
3434 addi(buf, buf, 16 * unroll_factor2);
3435 }
3436 bdnz(L_inner_loop);
3437
3438 addi(cur_const, constants, outer_consts_size); // Reset
3439
3440 // Tail of last iteration (no loads).
3441 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3442 BE_swap_bytes(data1[i + unroll_factor2 / 2]);
3443 vxor(data0[i], data0[i], data1[i]);
3444 vpmsumw(data1[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2], consts1[1]);
3445 }
3446 for (int i = 0; i < unroll_factor2 / 2; ++i) {
3447 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]); // First half of fixup shifts.
3448 vxor(data0[i + unroll_factor2 / 2], data0[i + unroll_factor2 / 2], data1[i + unroll_factor2 / 2]);
3449 }
3450
3451 // Last data register is ok, other ones need fixup shift.
3452 for (int i = unroll_factor2 / 2; i < unroll_factor2 - 1; ++i) {
3453 vpmsumw(data0[i], data0[i], consts0[unroll_factor2 - 2 - i]);
3454 }
3455
3456 // Combine to 128 bit result vector VCRC = data0[0].
3457 for (int i = 1; i < unroll_factor2; i<<=1) {
3458 for (int j = 0; j <= unroll_factor2 - 2*i; j+=2*i) {
3459 vxor(data0[j], data0[j], data0[j+i]);
3460 }
3461 }
3462 cmpd(CCR0, len, num_bytes);
3463 bge(CCR0, L_outer_loop);
3464
3465 // Last chance with lower num_bytes.
3466 bind(L_last);
3467 srdi(loop_count, len, exact_log2(16 * 2 * unroll_factor2)); // Use double-iterations.
3468 // Point behind last const for inner loop.
3469 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3470 sldi(R0, loop_count, exact_log2(16 * 2)); // Bytes of constants to be used.
3471 clrrdi(num_bytes, len, exact_log2(16 * 2 * unroll_factor2));
3472 subf(cur_const, R0, cur_const); // Point to constant to be used first.
3473
3474 addic_(loop_count, loop_count, -1); // One double-iteration peeled off.
3475 bgt(CCR0, L_outer_loop);
3476 // ********** Main loop end **********
3477
3478 // Restore DSCR pre-fetch value.
3479 if (VM_Version::has_mfdscr()) {
3480 load_const_optimized(t0, VM_Version::_dscr_val);
3481 mtdscr(t0);
3482 }
3483
3484 // ********** Simple loop for remaining 16 byte blocks **********
3485 {
3486 Label L_loop, L_done;
3487
3488 srdi_(t0, len, 4); // 16 bytes per iteration
3489 clrldi(len, len, 64-4);
3490 beq(CCR0, L_done);
3491
3492 // Point to const (same as last const for inner loop).
3493 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size - 16);
3494 mtctr(t0);
3495 lvx(Vtmp2, cur_const);
3496
3497 align(32);
3498 bind(L_loop);
3499
3500 lvx(Vtmp, buf);
3501 addi(buf, buf, 16);
3502 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3503 BE_swap_bytes(Vtmp);
3504 vxor(VCRC, VCRC, Vtmp);
3505 vpmsumw(VCRC, VCRC, Vtmp2);
3506 bdnz(L_loop);
3507
3508 bind(L_done);
3509 }
3510 // ********** Simple loop end **********
3511 #undef BE_swap_bytes
3512
3513 // Point to Barrett constants
3514 add_const_optimized(cur_const, constants, outer_consts_size + inner_consts_size);
3515
3516 vspltisb(zeroes, 0);
3517
3518 // Combine to 64 bit result.
3519 vpermxor(VCRC, VCRC, VCRC, Vc); // xor both halves to 64 bit result.
3520
3521 // Reduce to 32 bit CRC: Remainder by multiply-high.
3522 lvx(Vtmp, cur_const);
3523 vsldoi(Vtmp2, zeroes, VCRC, 12); // Extract high 32 bit.
3524 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply by inverse long poly.
3525 vsldoi(Vtmp2, zeroes, Vtmp2, 12); // Extract high 32 bit.
3526 vsldoi(Vtmp, zeroes, Vtmp, 8);
3527 vpmsumd(Vtmp2, Vtmp2, Vtmp); // Multiply quotient by long poly.
3528 vxor(VCRC, VCRC, Vtmp2); // Remainder fits into 32 bit.
3529
3530 // Move result. len is already updated.
3531 vsldoi(VCRC, VCRC, zeroes, 8);
3532 mfvrd(crc, VCRC);
3533
3534 // Restore non-volatile Vector registers (frameless).
3535 offsetInt = 0;
3536 offsetInt -= 16; li(offset, offsetInt); lvx(VR20, offset, R1_SP);
3537 offsetInt -= 16; li(offset, offsetInt); lvx(VR21, offset, R1_SP);
3538 offsetInt -= 16; li(offset, offsetInt); lvx(VR22, offset, R1_SP);
3539 offsetInt -= 16; li(offset, offsetInt); lvx(VR23, offset, R1_SP);
3540 offsetInt -= 16; li(offset, offsetInt); lvx(VR24, offset, R1_SP);
3541 offsetInt -= 16; li(offset, offsetInt); lvx(VR25, offset, R1_SP);
3542 #ifndef VM_LITTLE_ENDIAN
3543 offsetInt -= 16; li(offset, offsetInt); lvx(VR26, offset, R1_SP);
3544 #endif
3545 offsetInt -= 8; ld(R14, offsetInt, R1_SP);
3546 offsetInt -= 8; ld(R15, offsetInt, R1_SP);
3547 }
3548
3549 void MacroAssembler::crc32(Register crc, Register buf, Register len, Register t0, Register t1, Register t2,
3550 Register t3, Register t4, Register t5, Register t6, Register t7, bool is_crc32c) {
3551 load_const_optimized(t0, is_crc32c ? StubRoutines::crc32c_table_addr()
3552 : StubRoutines::crc_table_addr() , R0);
3553
3554 if (VM_Version::has_vpmsumb()) {
3555 kernel_crc32_vpmsum(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, !is_crc32c);
3556 } else {
3557 kernel_crc32_1word(crc, buf, len, t0, t1, t2, t3, t4, t5, t6, t7, t0, !is_crc32c);
3558 }
3559 }
3560
3561 void MacroAssembler::kernel_crc32_singleByteReg(Register crc, Register val, Register table, bool invertCRC) {
3562 assert_different_registers(crc, val, table);
3563
3564 BLOCK_COMMENT("kernel_crc32_singleByteReg:");
3565 if (invertCRC) {
3566 nand(crc, crc, crc); // 1s complement of crc
3567 }
3568
3569 update_byte_crc32(crc, val, table);
3570
3571 if (invertCRC) {
3572 nand(crc, crc, crc); // 1s complement of crc
3573 }
3574 }
3575
3576 // dest_lo += src1 + src2
3577 // dest_hi += carry out of the two additions above
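// Illustrative C sketch of the net effect (assuming unsigned __int128 support):
//   unsigned __int128 sum = (unsigned __int128)dest_lo + src1 + src2;
//   dest_lo  = (uint64_t)sum;
//   dest_hi += (uint64_t)(sum >> 64);   // adds 0, 1 or 2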
3578 void MacroAssembler::add2_with_carry(Register dest_hi,
3579 Register dest_lo,
3580 Register src1, Register src2) {
3581 li(R0, 0);
3582 addc(dest_lo, dest_lo, src1);
3583 adde(dest_hi, dest_hi, R0);
3584 addc(dest_lo, dest_lo, src2);
3585 adde(dest_hi, dest_hi, R0);
3586 }
3587
3588 // Multiply 64 bit by 64 bit first loop.
3589 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart,
3590 Register x_xstart,
3591 Register y, Register y_idx,
3592 Register z,
3593 Register carry,
3594 Register product_high, Register product,
3595 Register idx, Register kdx,
3596 Register tmp) {
3597 // jlong carry, x[], y[], z[];
3598 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3599 // huge_128 product = y[idx] * x[xstart] + carry;
3600 // z[kdx] = (jlong)product;
3601 // carry = (jlong)(product >>> 64);
3602 // }
3603 // z[xstart] = carry;
3604
3605 Label L_first_loop, L_first_loop_exit;
3606 Label L_one_x, L_one_y, L_multiply;
3607
3608 addic_(xstart, xstart, -1);
3609 blt(CCR0, L_one_x); // Special case: length of x is 1.
3610
3611 // Load next two integers of x.
3612 sldi(tmp, xstart, LogBytesPerInt);
3613 ldx(x_xstart, x, tmp);
3614 #ifdef VM_LITTLE_ENDIAN
3615 rldicl(x_xstart, x_xstart, 32, 0);
3616 #endif
3617
3618 align(32, 16);
3619 bind(L_first_loop);
3620
3621 cmpdi(CCR0, idx, 1);
3622 blt(CCR0, L_first_loop_exit);
3623 addi(idx, idx, -2);
3624 beq(CCR0, L_one_y);
3625
3626 // Load next two integers of y.
3627 sldi(tmp, idx, LogBytesPerInt);
3628 ldx(y_idx, y, tmp);
3629 #ifdef VM_LITTLE_ENDIAN
3630 rldicl(y_idx, y_idx, 32, 0);
3631 #endif
3632
3633
3634 bind(L_multiply);
3635 multiply64(product_high, product, x_xstart, y_idx);
3636
3637 li(tmp, 0);
3638 addc(product, product, carry); // Add carry to result.
3639 adde(product_high, product_high, tmp); // Add carry of the last addition.
3640 addi(kdx, kdx, -2);
3641
3642 // Store result.
3643 #ifdef VM_LITTLE_ENDIAN
3644 rldicl(product, product, 32, 0);
3645 #endif
3646 sldi(tmp, kdx, LogBytesPerInt);
3647 stdx(product, z, tmp);
3648 mr_if_needed(carry, product_high);
3649 b(L_first_loop);
3650
3651
3652 bind(L_one_y); // Load one 32 bit portion of y as (0,value).
3653
3654 lwz(y_idx, 0, y);
3655 b(L_multiply);
3656
3657
3658 bind(L_one_x); // Load one 32 bit portion of x as (0,value).
3659
3660 lwz(x_xstart, 0, x);
3661 b(L_first_loop);
3662
3663 bind(L_first_loop_exit);
3664 }
3665
3666 // Multiply 64 bit by 64 bit and add 128 bit.
3667 void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y,
3668 Register z, Register yz_idx,
3669 Register idx, Register carry,
3670 Register product_high, Register product,
3671 Register tmp, int offset) {
3672
3673 // huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
3674 // z[kdx] = (jlong)product;
3675
3676 sldi(tmp, idx, LogBytesPerInt);
3677 if (offset) {
3678 addi(tmp, tmp, offset);
3679 }
3680 ldx(yz_idx, y, tmp);
3681 #ifdef VM_LITTLE_ENDIAN
3682 rldicl(yz_idx, yz_idx, 32, 0);
3683 #endif
3684
3685 multiply64(product_high, product, x_xstart, yz_idx);
3686 ldx(yz_idx, z, tmp);
3687 #ifdef VM_LITTLE_ENDIAN
3688 rldicl(yz_idx, yz_idx, 32, 0);
3689 #endif
3690
3691 add2_with_carry(product_high, product, carry, yz_idx);
3692
3693 sldi(tmp, idx, LogBytesPerInt);
3694 if (offset) {
3695 addi(tmp, tmp, offset);
3696 }
3697 #ifdef VM_LITTLE_ENDIAN
3698 rldicl(product, product, 32, 0);
3699 #endif
3700 stdx(product, z, tmp);
3701 }
3702
3703 // Multiply 128 bit by 128 bit. Unrolled inner loop.
3704 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart,
3705 Register y, Register z,
3706 Register yz_idx, Register idx, Register carry,
3707 Register product_high, Register product,
3708 Register carry2, Register tmp) {
3709
3710 // jlong carry, x[], y[], z[];
3711 // int kdx = ystart+1;
3712 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3713 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
3714 // z[kdx+idx+1] = (jlong)product;
3715 // jlong carry2 = (jlong)(product >>> 64);
3716 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
3717 // z[kdx+idx] = (jlong)product;
3718 // carry = (jlong)(product >>> 64);
3719 // }
3720 // idx += 2;
3721 // if (idx > 0) {
3722 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
3723 // z[kdx+idx] = (jlong)product;
3724 // carry = (jlong)(product >>> 64);
3725 // }
3726
3727 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3728 const Register jdx = R0;
3729
3730 // Scale the index.
3731 srdi_(jdx, idx, 2);
3732 beq(CCR0, L_third_loop_exit);
3733 mtctr(jdx);
3734
3735 align(32, 16);
3736 bind(L_third_loop);
3737
3738 addi(idx, idx, -4);
3739
3740 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 8);
3741 mr_if_needed(carry2, product_high);
3742
3743 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product_high, product, tmp, 0);
3744 mr_if_needed(carry, product_high);
3745 bdnz(L_third_loop);
3746
3747 bind(L_third_loop_exit); // Handle any left-over operand parts.
3748
3749 andi_(idx, idx, 0x3);
3750 beq(CCR0, L_post_third_loop_done);
3751
3752 Label L_check_1;
3753
3754 addic_(idx, idx, -2);
3755 blt(CCR0, L_check_1);
3756
3757 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product_high, product, tmp, 0);
3758 mr_if_needed(carry, product_high);
3759
3760 bind(L_check_1);
3761
3762 addi(idx, idx, 0x2);
3763 andi_(idx, idx, 0x1);
3764 addic_(idx, idx, -1);
3765 blt(CCR0, L_post_third_loop_done);
3766
3767 sldi(tmp, idx, LogBytesPerInt);
3768 lwzx(yz_idx, y, tmp);
3769 multiply64(product_high, product, x_xstart, yz_idx);
3770 lwzx(yz_idx, z, tmp);
3771
3772 add2_with_carry(product_high, product, yz_idx, carry);
3773
3774 sldi(tmp, idx, LogBytesPerInt);
3775 stwx(product, z, tmp);
3776 srdi(product, product, 32);
3777
3778 sldi(product_high, product_high, 32);
3779 orr(product, product, product_high);
3780 mr_if_needed(carry, product);
3781
3782 bind(L_post_third_loop_done);
3783 } // multiply_128_x_128_loop
3784
3785 void MacroAssembler::muladd(Register out, Register in,
3786 Register offset, Register len, Register k,
3787 Register tmp1, Register tmp2, Register carry) {
3788
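// Java-style sketch of what is computed (illustrative only; the emitted code works
// with byte offsets, element indices are used here for readability):
//   long kLong = k & LONG_MASK;
//   long carry = 0;
//   for (int j = len - 1; j >= 0; j--, offset--) {
//     long product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
//     out[offset] = (int)product;
//     carry = product >>> 32;
//   }
//   // The final carry is left in the 'carry' register.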
3789 // Labels
3790 Label LOOP, SKIP;
3791
3792 // Make sure length is positive.
3793 cmpdi (CCR0, len, 0);
3794
3795 // Prepare variables
3796 subi (offset, offset, 4);
3797 li (carry, 0);
3798 ble (CCR0, SKIP);
3799
3800 mtctr (len);
3801 subi (len, len, 1 );
3802 sldi (len, len, 2 );
3803
3804 // Main loop
3805 bind(LOOP);
3806 lwzx (tmp1, len, in );
3807 lwzx (tmp2, offset, out );
3808 mulld (tmp1, tmp1, k );
3809 add (tmp2, carry, tmp2 );
3810 add (tmp2, tmp1, tmp2 );
3811 stwx (tmp2, offset, out );
3812 srdi (carry, tmp2, 32 );
3813 subi (offset, offset, 4 );
3814 subi (len, len, 4 );
3815 bdnz (LOOP);
3816 bind(SKIP);
3817 }
3818
3819 void MacroAssembler::multiply_to_len(Register x, Register xlen,
3820 Register y, Register ylen,
3821 Register z, Register zlen,
3822 Register tmp1, Register tmp2,
3823 Register tmp3, Register tmp4,
3824 Register tmp5, Register tmp6,
3825 Register tmp7, Register tmp8,
3826 Register tmp9, Register tmp10,
3827 Register tmp11, Register tmp12,
3828 Register tmp13) {
3829
3830 ShortBranchVerifier sbv(this);
3831
3832 assert_different_registers(x, xlen, y, ylen, z, zlen,
3833 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3834 assert_different_registers(x, xlen, y, ylen, z, zlen,
3835 tmp1, tmp2, tmp3, tmp4, tmp5, tmp7);
3836 assert_different_registers(x, xlen, y, ylen, z, zlen,
3837 tmp1, tmp2, tmp3, tmp4, tmp5, tmp8);
3838
3839 const Register idx = tmp1;
3840 const Register kdx = tmp2;
3841 const Register xstart = tmp3;
3842
3843 const Register y_idx = tmp4;
3844 const Register carry = tmp5;
3845 const Register product = tmp6;
3846 const Register product_high = tmp7;
3847 const Register x_xstart = tmp8;
3848 const Register tmp = tmp9;
3849
3850 // First Loop.
3851 //
3852 // final static long LONG_MASK = 0xffffffffL;
3853 // int xstart = xlen - 1;
3854 // int ystart = ylen - 1;
3855 // long carry = 0;
3856 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3857 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3858 // z[kdx] = (int)product;
3859 // carry = product >>> 32;
3860 // }
3861 // z[xstart] = (int)carry;
3862
3863 mr_if_needed(idx, ylen); // idx = ylen
3864 mr_if_needed(kdx, zlen); // kdx = xlen + ylen
3865 li(carry, 0); // carry = 0
3866
3867 Label L_done;
3868
3869 addic_(xstart, xlen, -1);
3870 blt(CCR0, L_done);
3871
3872 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z,
3873 carry, product_high, product, idx, kdx, tmp);
3874
3875 Label L_second_loop;
3876
3877 cmpdi(CCR0, kdx, 0);
3878 beq(CCR0, L_second_loop);
3879
3880 Label L_carry;
3881
3882 addic_(kdx, kdx, -1);
3883 beq(CCR0, L_carry);
3884
3885 // Store lower 32 bits of carry.
3886 sldi(tmp, kdx, LogBytesPerInt);
3887 stwx(carry, z, tmp);
3888 srdi(carry, carry, 32);
3889 addi(kdx, kdx, -1);
3890
3891
3892 bind(L_carry);
3893
3894 // Store upper 32 bits of carry.
3895 sldi(tmp, kdx, LogBytesPerInt);
3896 stwx(carry, z, tmp);
3897
3898 // Second and third (nested) loops.
3899 //
3900 // for (int i = xstart-1; i >= 0; i--) { // Second loop
3901 // carry = 0;
3902 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3903 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3904 // (z[k] & LONG_MASK) + carry;
3905 // z[k] = (int)product;
3906 // carry = product >>> 32;
3907 // }
3908 // z[i] = (int)carry;
3909 // }
3910 //
3911 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
3912
3913 bind(L_second_loop);
3914
3915 li(carry, 0); // carry = 0;
3916
3917 addic_(xstart, xstart, -1); // i = xstart-1;
3918 blt(CCR0, L_done);
3919
3920 Register zsave = tmp10;
3921
3922 mr(zsave, z);
3923
3924
3925 Label L_last_x;
3926
3927 sldi(tmp, xstart, LogBytesPerInt);
3928 add(z, z, tmp); // z = z + k - j
3929 addi(z, z, 4);
3930 addic_(xstart, xstart, -1); // i = xstart-1;
3931 blt(CCR0, L_last_x);
3932
3933 sldi(tmp, xstart, LogBytesPerInt);
3934 ldx(x_xstart, x, tmp);
3935 #ifdef VM_LITTLE_ENDIAN
3936 rldicl(x_xstart, x_xstart, 32, 0);
3937 #endif
3938
3939
3940 Label L_third_loop_prologue;
3941
3942 bind(L_third_loop_prologue);
3943
3944 Register xsave = tmp11;
3945 Register xlensave = tmp12;
3946 Register ylensave = tmp13;
3947
3948 mr(xsave, x);
3949 mr(xlensave, xstart);
3950 mr(ylensave, ylen);
3951
3952
3953 multiply_128_x_128_loop(x_xstart, y, z, y_idx, ylen,
3954 carry, product_high, product, x, tmp);
3955
3956 mr(z, zsave);
3957 mr(x, xsave);
3958 mr(xlen, xlensave); // This is the decrement of the loop counter!
3959 mr(ylen, ylensave);
3960
3961 addi(tmp3, xlen, 1);
3962 sldi(tmp, tmp3, LogBytesPerInt);
3963 stwx(carry, z, tmp);
3964 addic_(tmp3, tmp3, -1);
3965 blt(CCR0, L_done);
3966
3967 srdi(carry, carry, 32);
3968 sldi(tmp, tmp3, LogBytesPerInt);
3969 stwx(carry, z, tmp);
3970 b(L_second_loop);
3971
3972 // Next infrequent code is moved outside loops.
3973 bind(L_last_x);
3974
3975 lwz(x_xstart, 0, x);
3976 b(L_third_loop_prologue);
3977
3978 bind(L_done);
3979 } // multiply_to_len
3980
3981 void MacroAssembler::asm_assert(bool check_equal, const char *msg) {
3982 #ifdef ASSERT
3983 Label ok;
3984 if (check_equal) {
3985 beq(CCR0, ok);
3986 } else {
3987 bne(CCR0, ok);
3988 }
3989 stop(msg);
3990 bind(ok);
3991 #endif
3992 }
3993
3994 void MacroAssembler::asm_assert_mems_zero(bool check_equal, int size, int mem_offset,
3995 Register mem_base, const char* msg) {
3996 #ifdef ASSERT
3997 switch (size) {
3998 case 4:
3999 lwz(R0, mem_offset, mem_base);
4000 cmpwi(CCR0, R0, 0);
4001 break;
4002 case 8:
4003 ld(R0, mem_offset, mem_base);
4004 cmpdi(CCR0, R0, 0);
4005 break;
4006 default:
4007 ShouldNotReachHere();
4008 }
4009 asm_assert(check_equal, msg);
4010 #endif // ASSERT
4011 }
4012
4013 void MacroAssembler::verify_coop(Register coop, const char* msg) {
4014 if (!VerifyOops) { return; }
4015 if (UseCompressedOops) { decode_heap_oop(coop); }
4016 verify_oop(coop, msg);
4017 if (UseCompressedOops) { encode_heap_oop(coop, coop); }
4018 }
4019
4020 // READ: oop. KILL: R0. Volatile floats perhaps.
4021 void MacroAssembler::verify_oop(Register oop, const char* msg) {
4022 if (!VerifyOops) {
4023 return;
4024 }
4025
4026 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4027 const Register tmp = R11; // Will be preserved.
4028 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4029
4030 BLOCK_COMMENT("verify_oop {");
4031
4032 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4033
4034 mr_if_needed(R4_ARG2, oop);
4035 save_LR_CR(tmp); // save in old frame
4036 push_frame_reg_args(nbytes_save, tmp);
4037 // load FunctionDescriptor** / entry_address *
4038 load_const_optimized(tmp, fd, R0);
4039 // load FunctionDescriptor* / entry_address
4040 ld(tmp, 0, tmp);
4041 load_const_optimized(R3_ARG1, (address)msg, R0);
4042 // Call destination for its side effect.
4043 call_c(tmp);
4044
4045 pop_frame();
4046 restore_LR_CR(tmp);
4047 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4048
4049 BLOCK_COMMENT("} verify_oop");
4050 }
4051
4052 void MacroAssembler::verify_oop_addr(RegisterOrConstant offs, Register base, const char* msg) {
4053 if (!VerifyOops) {
4054 return;
4055 }
4056
4057 address/* FunctionDescriptor** */fd = StubRoutines::verify_oop_subroutine_entry_address();
4058 const Register tmp = R11; // Will be preserved.
4059 const int nbytes_save = MacroAssembler::num_volatile_regs * 8;
4060 save_volatile_gprs(R1_SP, -nbytes_save); // except R0
4061
4062 ld(R4_ARG2, offs, base);
4063 save_LR_CR(tmp); // save in old frame
4064 push_frame_reg_args(nbytes_save, tmp);
4065 // load FunctionDescriptor** / entry_address *
4066 load_const_optimized(tmp, fd, R0);
4067 // load FunctionDescriptor* / entry_address
4068 ld(tmp, 0, tmp);
4069 load_const_optimized(R3_ARG1, (address)msg, R0);
4070 // Call destination for its side effect.
4071 call_c(tmp);
4072
4073 pop_frame();
4074 restore_LR_CR(tmp);
4075 restore_volatile_gprs(R1_SP, -nbytes_save); // except R0
4076 }
4077
4078 // Call a C-function that prints output.
4079 void MacroAssembler::stop(int type, const char* msg) {
4080 bool msg_present = (msg != nullptr);
4081
4082 #ifndef PRODUCT
4083 block_comment(err_msg("stop(type %d): %s {", type, msg_present ? msg : "null"));
4084 #else
4085 block_comment("stop {");
4086 #endif
4087
4088 if (msg_present) {
4089 type |= stop_msg_present;
4090 }
4091 tdi_unchecked(traptoUnconditional, 0/*reg 0*/, type);
4092 if (msg_present) {
4093 emit_int64((uintptr_t)msg);
4094 }
4095
4096 block_comment("} stop;");
4097 }
4098
4099 #ifndef PRODUCT
4100 // Write pattern 0x0101010101010101 in memory region [low-before, high+after].
4101 // Val, addr are temp registers.
4102 // If low == addr, addr is killed.
4103 // High is preserved.
4104 void MacroAssembler::zap_from_to(Register low, int before, Register high, int after, Register val, Register addr) {
4105 if (!ZapMemory) return;
4106
4107 assert_different_registers(low, val);
4108
4109 BLOCK_COMMENT("zap memory region {");
4110 load_const_optimized(val, 0x0101010101010101);
4111 int size = before + after;
4112 if (low == high && size < 5 && size > 0) {
4113 int offset = -before*BytesPerWord;
4114 for (int i = 0; i < size; ++i) {
4115 std(val, offset, low);
4116 offset += (1*BytesPerWord);
4117 }
4118 } else {
4119 addi(addr, low, -before*BytesPerWord);
4120 assert_different_registers(high, val);
4121 if (after) addi(high, high, after * BytesPerWord);
4122 Label loop;
4123 bind(loop);
4124 std(val, 0, addr);
4125 addi(addr, addr, 8);
4126 cmpd(CCR6, addr, high);
4127 ble(CCR6, loop);
4128 if (after) addi(high, high, -after * BytesPerWord); // Correct back to old value.
4129 }
4130 BLOCK_COMMENT("} zap memory region");
4131 }
4132
4133 #endif // !PRODUCT
4134
4135 void SkipIfEqualZero::skip_to_label_if_equal_zero(MacroAssembler* masm, Register temp,
4136 const bool* flag_addr, Label& label) {
4137 int simm16_offset = masm->load_const_optimized(temp, (address)flag_addr, R0, true);
4138 assert(sizeof(bool) == 1, "PowerPC ABI");
4139 masm->lbz(temp, simm16_offset, temp);
4140 masm->cmpwi(CCR0, temp, 0);
4141 masm->beq(CCR0, label);
4142 }
4143
4144 SkipIfEqualZero::SkipIfEqualZero(MacroAssembler* masm, Register temp, const bool* flag_addr) : _masm(masm), _label() {
4145 skip_to_label_if_equal_zero(masm, temp, flag_addr, _label);
4146 }
4147
4148 SkipIfEqualZero::~SkipIfEqualZero() {
4149 _masm->bind(_label);
4150 }
4151
4152 void MacroAssembler::cache_wb(Address line) {
4153 assert(line.index() == noreg, "index should be noreg");
4154 assert(line.disp() == 0, "displacement should be 0");
4155 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support flush to persistent memory");
4156 // Data Cache Store, not really a flush, so it works like a sync of cache
4157 // line and persistent mem, i.e. copying the cache line to persistent whilst
4158 // not invalidating the cache line.
4159 dcbst(line.base());
4160 }
4161
4162 void MacroAssembler::cache_wbsync(bool is_presync) {
4163 assert(VM_Version::supports_data_cache_line_flush(), "CPU or OS does not support sync related to persistent memory");
4164 // We only need a post sync barrier. Post means _after_ a cache line flush or
4165 // store instruction, pre means a barrier emitted before such an instruction.
4166 if (!is_presync) {
4167 fence();
4168 }
4169 }
4170
4171 void MacroAssembler::push_cont_fastpath() {
4172 Label done;
4173 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4174 cmpld(CCR0, R1_SP, R0);
4175 ble(CCR0, done);
4176 st_ptr(R1_SP, JavaThread::cont_fastpath_offset(), R16_thread);
4177 bind(done);
4178 }
4179
4180 void MacroAssembler::pop_cont_fastpath() {
4181 Label done;
4182 ld_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4183 cmpld(CCR0, R1_SP, R0);
4184 ble(CCR0, done);
4185 li(R0, 0);
4186 st_ptr(R0, JavaThread::cont_fastpath_offset(), R16_thread);
4187 bind(done);
4188 }
4189
4190 // Note: Must preserve CCR0 EQ (invariant).
4191 void MacroAssembler::inc_held_monitor_count(Register tmp) {
4192 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4193 #ifdef ASSERT
4194 Label ok;
4195 cmpdi(CCR0, tmp, 0);
4196 bge_predict_taken(CCR0, ok);
4197 stop("held monitor count is negativ at increment");
4198 bind(ok);
4199 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4200 #endif
4201 addi(tmp, tmp, 1);
4202 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4203 }
4204
4205 // Note: Must preserve CCR0 EQ (invariant).
4206 void MacroAssembler::dec_held_monitor_count(Register tmp) {
4207 ld(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4208 #ifdef ASSERT
4209 Label ok;
4210 cmpdi(CCR0, tmp, 0);
4211 bgt_predict_taken(CCR0, ok);
4212 stop("held monitor count is <= 0 at decrement");
4213 bind(ok);
4214 crorc(CCR0, Assembler::equal, CCR0, Assembler::equal); // Restore CCR0 EQ
4215 #endif
4216 addi(tmp, tmp, -1);
4217 std(tmp, in_bytes(JavaThread::held_monitor_count_offset()), R16_thread);
4218 }
4219
4220 // Function to flip between unlocked and locked state (fast locking).
4221 // Branches to failed if the state is not as expected with CCR0 NE.
4222 // Falls through upon success with CCR0 EQ.
4223 // This requires fewer instructions and registers and is easier to use than the
4224 // cmpxchg based implementation.
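// Pseudo-code sketch of the emitted ldarx/stdcx_ sequence (illustrative only):
//   do {
//     m = load_reserve(&obj->mark);
//     if (is_unlock) { if (m & lock_mask) goto failed; m |= unlocked_value; }  // 0b00 -> 0b01
//     else           { m ^= unlocked_value; if (m & lock_mask) goto failed; }  // 0b01 -> 0b00
//   } while (!store_conditional(&obj->mark, m));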
4225 void MacroAssembler::atomically_flip_locked_state(bool is_unlock, Register obj, Register tmp, Label& failed, int semantics) {
4226 assert_different_registers(obj, tmp, R0);
4227 Label retry;
4228
4229 if (semantics & MemBarRel) {
4230 release();
4231 }
4232
4233 bind(retry);
4234 STATIC_ASSERT(markWord::locked_value == 0); // Or need to change this!
4235 if (!is_unlock) {
4236 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_acquire_lock());
4237 xori(tmp, tmp, markWord::unlocked_value); // flip unlocked bit
4238 andi_(R0, tmp, markWord::lock_mask_in_place);
4239 bne(CCR0, failed); // failed if new header doesn't contain locked_value (which is 0)
4240 } else {
4241 ldarx(tmp, obj, MacroAssembler::cmpxchgx_hint_release_lock());
4242 andi_(R0, tmp, markWord::lock_mask_in_place);
4243 bne(CCR0, failed); // failed if old header doesn't contain locked_value (which is 0)
4244 ori(tmp, tmp, markWord::unlocked_value); // set unlocked bit
4245 }
4246 stdcx_(tmp, obj);
4247 bne(CCR0, retry);
4248
4249 if (semantics & MemBarFenceAfter) {
4250 fence();
4251 } else if (semantics & MemBarAcq) {
4252 isync();
4253 }
4254 }
4255
4256 // Implements lightweight-locking.
4257 //
4258 // - obj: the object to be locked
4259 // - t1, t2: temporary register
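//
// Fast-path sketch (illustrative pseudo-code; top is a byte offset into the lock stack):
//   if (top >= LockStack::end_offset())           goto slow;   // lock-stack full
//   if (lock_stack[top - oopSize] == obj)         goto push;   // recursive
//   if (mark is monitor (0b10) or locked (0b00))  goto slow;
//   flip lock bits 0b01 -> 0b00 (goto slow on failure);
//   push: lock_stack[top] = obj; top += oopSize;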
4260 void MacroAssembler::lightweight_lock(Register obj, Register t1, Register t2, Label& slow) {
4261 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4262 assert_different_registers(obj, t1, t2);
4263
4264 Label push;
4265 const Register top = t1;
4266 const Register mark = t2;
4267 const Register t = R0;
4268
4269 // Check if the lock-stack is full.
4270 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4271 cmplwi(CCR0, top, LockStack::end_offset());
4272 bge(CCR0, slow);
4273
4274 // The underflow check is elided. The recursive check will always fail
4275 // when the lock stack is empty because of the _bad_oop_sentinel field.
4276
4277 // Check for recursion.
4278 subi(t, top, oopSize);
4279 ldx(t, R16_thread, t);
4280 cmpd(CCR0, obj, t);
4281 beq(CCR0, push);
4282
4283 // Check header for monitor (0b10) or locked (0b00).
4284 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4285 xori(t, mark, markWord::unlocked_value);
4286 andi_(t, t, markWord::lock_mask_in_place);
4287 bne(CCR0, slow);
4288
4289 // Try to lock. Transition lock bits 0b01 => 0b00
4290 atomically_flip_locked_state(/* is_unlock */ false, obj, mark, slow, MacroAssembler::MemBarAcq);
4291
4292 bind(push);
4293 // After successful lock, push object on lock-stack
4294 stdx(obj, R16_thread, top);
4295 addi(top, top, oopSize);
4296 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4297 }
4298
4299 // Implements lightweight-unlocking.
4300 //
4301 // - obj: the object to be unlocked
4302 // - t1: temporary register
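//
// Fast-path sketch (illustrative pseudo-code; top is a byte offset into the lock stack):
//   if (lock_stack[top - oopSize] != obj)   goto slow;       // obj must be the top entry
//   top -= oopSize;                                          // pop
//   if (lock_stack[top - oopSize] == obj)   goto unlocked;   // recursive
//   if (mark is monitor (0b10))             { push obj back; goto slow; }
//   flip lock bits 0b00 -> 0b01 (push obj back and goto slow on failure);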
4303 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Label& slow) {
4304 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
4305 assert_different_registers(obj, t1);
4306
4307 #ifdef ASSERT
4308 {
4309 // The following checks rely on the fact that LockStack is only ever modified by
4310 // its owning thread, even if the lock got inflated concurrently; removal of LockStack
4311 // entries after inflation will happen delayed in that case.
4312
4313 // Check for lock-stack underflow.
4314 Label stack_ok;
4315 lwz(t1, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4316 cmplwi(CCR0, t1, LockStack::start_offset());
4317 bge(CCR0, stack_ok);
4318 stop("Lock-stack underflow");
4319 bind(stack_ok);
4320 }
4321 #endif
4322
4323 Label unlocked, push_and_slow;
4324 const Register top = t1;
4325 const Register mark = R0;
4326 Register t = R0;
4327
4328 // Check if obj is top of lock-stack.
4329 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4330 subi(top, top, oopSize);
4331 ldx(t, R16_thread, top);
4332 cmpd(CCR0, obj, t);
4333 bne(CCR0, slow);
4334
4335 // Pop lock-stack.
4336 DEBUG_ONLY(li(t, 0);)
4337 DEBUG_ONLY(stdx(t, R16_thread, top);)
4338 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4339
4340 // The underflow check is elided. The recursive check will always fail
4341 // when the lock stack is empty because of the _bad_oop_sentinel field.
4342
4343 // Check if recursive.
4344 subi(t, top, oopSize);
4345 ldx(t, R16_thread, t);
4346 cmpd(CCR0, obj, t);
4347 beq(CCR0, unlocked);
4348
4349 // Use top as tmp
4350 t = top;
4351
4352 // Not recursive. Check header for monitor (0b10).
4353 ld(mark, oopDesc::mark_offset_in_bytes(), obj);
4354 andi_(t, mark, markWord::monitor_value);
4355 bne(CCR0, push_and_slow);
4356
4357 #ifdef ASSERT
4358 // Check header not unlocked (0b01).
4359 Label not_unlocked;
4360 andi_(t, mark, markWord::unlocked_value);
4361 beq(CCR0, not_unlocked);
4362 stop("lightweight_unlock already unlocked");
4363 bind(not_unlocked);
4364 #endif
4365
4366 // Try to unlock. Transition lock bits 0b00 => 0b01
4367 atomically_flip_locked_state(/* is_unlock */ true, obj, t, push_and_slow, MacroAssembler::MemBarRel);
4368 b(unlocked);
4369
4370 bind(push_and_slow);
4371
4372 // Restore lock-stack and handle the unlock in runtime.
4373 lwz(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4374 DEBUG_ONLY(stdx(obj, R16_thread, top);)
4375 addi(top, top, oopSize);
4376 stw(top, in_bytes(JavaThread::lock_stack_top_offset()), R16_thread);
4377 b(slow);
4378
4379 bind(unlocked);
4380 }