1 /*
2 * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2021, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "precompiled.hpp"
27 #include "jvm.h"
28 #include "asm/assembler.hpp"
29 #include "asm/assembler.inline.hpp"
30 #include "ci/ciEnv.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/cardTableBarrierSet.hpp"
34 #include "gc/shared/cardTable.hpp"
35 #include "gc/shared/collectedHeap.hpp"
36 #include "gc/shared/tlab_globals.hpp"
37 #include "interpreter/bytecodeHistogram.hpp"
38 #include "interpreter/interpreter.hpp"
39 #include "compiler/compileTask.hpp"
40 #include "compiler/disassembler.hpp"
41 #include "memory/resourceArea.hpp"
42 #include "memory/universe.hpp"
43 #include "nativeInst_aarch64.hpp"
44 #include "oops/accessDecorators.hpp"
45 #include "oops/compressedOops.inline.hpp"
46 #include "oops/klass.inline.hpp"
47 #include "runtime/biasedLocking.hpp"
48 #include "runtime/icache.hpp"
49 #include "runtime/interfaceSupport.inline.hpp"
50 #include "runtime/jniHandles.inline.hpp"
51 #include "runtime/objectMonitor.hpp"
52 #include "runtime/sharedRuntime.hpp"
53 #include "runtime/stubRoutines.hpp"
54 #include "runtime/thread.hpp"
55 #include "utilities/globalDefinitions.hpp"
56 #include "utilities/powerOfTwo.hpp"
57 #ifdef COMPILER1
58 #include "c1/c1_LIRAssembler.hpp"
59 #endif
60 #ifdef COMPILER2
61 #include "oops/oop.hpp"
62 #include "opto/compile.hpp"
63 #include "opto/node.hpp"
64 #include "opto/output.hpp"
65 #endif
66
67 #include <sys/types.h>
68
69 #ifdef PRODUCT
70 #define BLOCK_COMMENT(str) /* nothing */
71 #else
72 #define BLOCK_COMMENT(str) block_comment(str)
73 #endif
74 #define STOP(str) stop(str);
75 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
76
77 // Patch any kind of instruction; there may be several instructions.
78 // Return the total length (in bytes) of the instructions.
79 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
80 int instructions = 1;
81 assert((uint64_t)target < (1ull << 48), "48-bit overflow in address constant");
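// Branch and load-literal immediates are encoded as word (4-byte)
// offsets, hence the scaling below.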
82 intptr_t offset = (target - branch) >> 2;
83 unsigned insn = *(unsigned*)branch;
84 if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
85 // Load register (literal)
86 Instruction_aarch64::spatch(branch, 23, 5, offset);
87 } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
88 // Unconditional branch (immediate)
89 Instruction_aarch64::spatch(branch, 25, 0, offset);
90 } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
91 // Conditional branch (immediate)
92 Instruction_aarch64::spatch(branch, 23, 5, offset);
93 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
94 // Compare & branch (immediate)
95 Instruction_aarch64::spatch(branch, 23, 5, offset);
96 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
97 // Test & branch (immediate)
98 Instruction_aarch64::spatch(branch, 18, 5, offset);
99 } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
100 // PC-rel. addressing
101 offset = target-branch;
102 int shift = Instruction_aarch64::extract(insn, 31, 31);
103 if (shift) {
104 uint64_t dest = (uint64_t)target;
105 uint64_t pc_page = (uint64_t)branch >> 12;
106 uint64_t adr_page = (uint64_t)target >> 12;
107 unsigned offset_lo = dest & 0xfff;
108 offset = adr_page - pc_page;
109
110 // We handle 4 types of PC relative addressing
111 // 1 - adrp Rx, target_page
112 // ldr/str Ry, [Rx, #offset_in_page]
113 // 2 - adrp Rx, target_page
114 // add Ry, Rx, #offset_in_page
115 // 3 - adrp Rx, target_page (page aligned reloc, offset == 0)
116 // movk Rx, #imm16<<32
117 // 4 - adrp Rx, target_page (page aligned reloc, offset == 0)
118 // In the first 3 cases we must check that Rx is the same in the adrp and the
119 // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
120 // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
121 // to be followed by a random unrelated ldr/str, add or movk instruction.
122 //
123 unsigned insn2 = ((unsigned*)branch)[1];
124 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
125 Instruction_aarch64::extract(insn, 4, 0) ==
126 Instruction_aarch64::extract(insn2, 9, 5)) {
127 // Load/store register (unsigned immediate)
128 unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
129 Instruction_aarch64::patch(branch + sizeof (unsigned),
130 21, 10, offset_lo >> size);
131 guarantee(((dest >> size) << size) == dest, "misaligned target");
132 instructions = 2;
133 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
134 Instruction_aarch64::extract(insn, 4, 0) ==
135 Instruction_aarch64::extract(insn2, 4, 0)) {
136 // add (immediate)
137 Instruction_aarch64::patch(branch + sizeof (unsigned),
138 21, 10, offset_lo);
139 instructions = 2;
140 } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
141 Instruction_aarch64::extract(insn, 4, 0) ==
142 Instruction_aarch64::extract(insn2, 4, 0)) {
143 // movk #imm16<<32
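// The movk rewrites bits 32-47 of Rx, so the adrp only has to produce the
// low 32 bits of the target; borrowing bits 32-47 from the branch address
// keeps the adrp page delta within range.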
144 Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
145 uintptr_t dest = ((uintptr_t)target & 0xffffffffULL) | ((uintptr_t)branch & 0xffff00000000ULL);
146 uintptr_t pc_page = (uintptr_t)branch >> 12;
147 uintptr_t adr_page = (uintptr_t)dest >> 12;
148 offset = adr_page - pc_page;
149 instructions = 2;
150 }
151 }
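// Split the offset into the adr/adrp immediate fields: the low two bits
// go into immlo (bits 30:29), the rest into immhi (bits 23:5).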
152 int offset_lo = offset & 3;
153 offset >>= 2;
154 Instruction_aarch64::spatch(branch, 23, 5, offset);
155 Instruction_aarch64::patch(branch, 30, 29, offset_lo);
156 } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
157 uint64_t dest = (uint64_t)target;
158 // Move wide constant
159 assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
160 assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
161 Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
162 Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
163 Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
164 assert(target_addr_for_insn(branch) == target, "should be");
165 instructions = 3;
166 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
167 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
168 // nothing to do
169 assert(target == 0, "did not expect to relocate target for polling page load");
170 } else {
171 ShouldNotReachHere();
172 }
173 return instructions * NativeInstruction::instruction_size;
174 }
175
176 int MacroAssembler::patch_oop(address insn_addr, address o) {
177 int instructions;
178 unsigned insn = *(unsigned*)insn_addr;
179 assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
180
181 // OOPs are either narrow (32 bits) or wide (48 bits). We encode
182 // narrow OOPs by setting the upper 16 bits in the first
183 // instruction.
184 if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
185 // Move narrow OOP
186 uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
187 Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
188 Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
189 instructions = 2;
190 } else {
191 // Move wide OOP
192 assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
193 uintptr_t dest = (uintptr_t)o;
194 Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
195 Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
196 Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
197 instructions = 3;
198 }
199 return instructions * NativeInstruction::instruction_size;
200 }
201
202 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
203 // Metadata pointers are either narrow (32 bits) or wide (48 bits).
204 // We encode narrow ones by setting the upper 16 bits in the first
205 // instruction.
206 NativeInstruction *insn = nativeInstruction_at(insn_addr);
207 assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
208 nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
209
210 Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
211 Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
212 return 2 * NativeInstruction::instruction_size;
213 }
214
215 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
216 intptr_t offset = 0;
217 if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
218 // Load register (literal)
219 offset = Instruction_aarch64::sextract(insn, 23, 5);
220 return address(((uint64_t)insn_addr + (offset << 2)));
221 } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
222 // Unconditional branch (immediate)
223 offset = Instruction_aarch64::sextract(insn, 25, 0);
224 } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
225 // Conditional branch (immediate)
226 offset = Instruction_aarch64::sextract(insn, 23, 5);
227 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
228 // Compare & branch (immediate)
229 offset = Instruction_aarch64::sextract(insn, 23, 5);
230 } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
231 // Test & branch (immediate)
232 offset = Instruction_aarch64::sextract(insn, 18, 5);
233 } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
234 // PC-rel. addressing
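// adr/adrp split their immediate across the encoding: immlo lives in
// bits 30:29 and immhi in bits 23:5; reassemble it here.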
235 offset = Instruction_aarch64::extract(insn, 30, 29);
236 offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
237 int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
238 if (shift) {
239 offset <<= shift;
240 uint64_t target_page = ((uint64_t)insn_addr) + offset;
241 target_page &= ((uint64_t)-1) << shift;
242 // Return the target address for the following sequences
243 // 1 - adrp Rx, target_page
244 // ldr/str Ry, [Rx, #offset_in_page]
245 // 2 - adrp Rx, target_page
246 // add Ry, Rx, #offset_in_page
247 // 3 - adrp Rx, target_page (page aligned reloc, offset == 0)
248 // movk Rx, #imm16<<32
249 // 4 - adrp Rx, target_page (page aligned reloc, offset == 0)
250 //
251 // In the first two cases we check that the register is the same and
252 // return the target_page + the offset within the page.
253 // Otherwise we assume it is a page aligned relocation and return
254 // the target page only.
255 //
256 unsigned insn2 = ((unsigned*)insn_addr)[1];
257 if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
258 Instruction_aarch64::extract(insn, 4, 0) ==
259 Instruction_aarch64::extract(insn2, 9, 5)) {
260 // Load/store register (unsigned immediate)
261 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
262 unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
263 return address(target_page + (byte_offset << size));
264 } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
265 Instruction_aarch64::extract(insn, 4, 0) ==
266 Instruction_aarch64::extract(insn2, 4, 0)) {
267 // add (immediate)
268 unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
269 return address(target_page + byte_offset);
270 } else {
271 if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
272 Instruction_aarch64::extract(insn, 4, 0) ==
273 Instruction_aarch64::extract(insn2, 4, 0)) {
274 target_page = (target_page & 0xffffffff) |
275 ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
276 }
277 return (address)target_page;
278 }
279 } else {
280 ShouldNotReachHere();
281 }
282 } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
283 uint32_t *insns = (uint32_t *)insn_addr;
284 // Move wide constant: movz, movk, movk. See movptr().
285 assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
286 assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
287 return address(uint64_t(Instruction_aarch64::extract(insns[0], 20, 5))
288 + (uint64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
289 + (uint64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
290 } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
291 Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
292 return 0;
293 } else {
294 ShouldNotReachHere();
295 }
296 return address(((uint64_t)insn_addr + (offset << 2)));
297 }
298
299 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool acquire, bool in_nmethod) {
300 if (acquire) {
301 lea(rscratch1, Address(rthread, JavaThread::polling_word_offset()));
302 ldar(rscratch1, rscratch1);
303 } else {
304 ldr(rscratch1, Address(rthread, JavaThread::polling_word_offset()));
305 }
306 if (at_return) {
307 // Note that when in_nmethod is set, the stack pointer is incremented before the poll. Therefore,
308 // we may safely use the sp instead to perform the stack watermark check.
309 cmp(in_nmethod ? sp : rfp, rscratch1);
310 br(Assembler::HI, slow_path);
311 } else {
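// Not a poll at a return: just test the poll bit, which is set in the
// polling word while a safepoint or handshake is pending.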
312 tbnz(rscratch1, log2i_exact(SafepointMechanism::poll_bit()), slow_path);
313 }
314 }
315
316 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
317 // we must set sp to zero to clear frame
318 str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
319
320 // must clear fp, so that compiled frames are not confused; it is
321 // possible that we need it only for debugging
322 if (clear_fp) {
323 str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
324 }
325
326 // Always clear the pc because it could have been set by make_walkable()
327 str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
328 }
329
330 // Calls to C land
331 //
332 // When entering C land, the rfp and sp of the last Java frame have to be recorded
333 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
334 // has to be reset to 0. This is required to allow proper stack traversal.
335 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
336 Register last_java_fp,
337 Register last_java_pc,
338 Register scratch) {
339
340 if (last_java_pc->is_valid()) {
341 str(last_java_pc, Address(rthread,
342 JavaThread::frame_anchor_offset()
343 + JavaFrameAnchor::last_Java_pc_offset()));
344 }
345
346 // determine last_java_sp register
347 if (last_java_sp == sp) {
348 mov(scratch, sp);
349 last_java_sp = scratch;
350 } else if (!last_java_sp->is_valid()) {
351 last_java_sp = esp;
352 }
353
354 str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
355
356 // last_java_fp is optional
357 if (last_java_fp->is_valid()) {
358 str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
359 }
360 }
361
362 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
363 Register last_java_fp,
364 address last_java_pc,
365 Register scratch) {
366 assert(last_java_pc != NULL, "must provide a valid PC");
367
368 adr(scratch, last_java_pc);
369 str(scratch, Address(rthread,
370 JavaThread::frame_anchor_offset()
371 + JavaFrameAnchor::last_Java_pc_offset()));
372
373 set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
374 }
375
376 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
377 Register last_java_fp,
378 Label &L,
379 Register scratch) {
380 if (L.is_bound()) {
381 set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
382 } else {
383 InstructionMark im(this);
384 L.add_patch_at(code(), locator());
385 set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
386 }
387 }
388
389 static inline bool target_needs_far_branch(address addr) {
390 // codecache size <= 128M
391 if (!MacroAssembler::far_branches()) {
392 return false;
393 }
394 // codecache size > 240M
395 if (MacroAssembler::codestub_branch_needs_far_jump()) {
396 return true;
397 }
398 // codecache size: 128M..240M
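// In this range only non-nmethod code (stubs, adapters) is expected to be
// reachable with a near branch, since that segment sits between the
// nmethod heaps; everything else may need a far jump.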
399 return !CodeCache::is_non_nmethod(addr);
400 }
401
402 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
403 assert(ReservedCodeCacheSize < 4*G, "branch out of range");
404 assert(CodeCache::find_blob(entry.target()) != NULL,
405 "destination of far call not found in code cache");
406 if (target_needs_far_branch(entry.target())) {
407 uint64_t offset;
408 // We can use ADRP here because we know that the total size of
409 // the code cache cannot exceed 2GB (the ADRP range is 4GB).
410 adrp(tmp, entry, offset);
411 add(tmp, tmp, offset);
412 if (cbuf) cbuf->set_insts_mark();
413 blr(tmp);
414 } else {
415 if (cbuf) cbuf->set_insts_mark();
416 bl(entry);
417 }
418 }
419
420 int MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
421 assert(ReservedCodeCacheSize < 4*G, "branch out of range");
422 assert(CodeCache::find_blob(entry.target()) != NULL,
423 "destination of far call not found in code cache");
424 address start = pc();
425 if (target_needs_far_branch(entry.target())) {
426 uint64_t offset;
427 // We can use ADRP here because we know that the total size of
428 // the code cache cannot exceed 2GB (the ADRP range is 4GB).
429 adrp(tmp, entry, offset);
430 add(tmp, tmp, offset);
431 if (cbuf) cbuf->set_insts_mark();
432 br(tmp);
433 } else {
434 if (cbuf) cbuf->set_insts_mark();
435 b(entry);
436 }
437 return pc() - start;
438 }
439
440 void MacroAssembler::reserved_stack_check() {
441 // testing if reserved zone needs to be enabled
442 Label no_reserved_zone_enabling;
443
444 ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
445 cmp(sp, rscratch1);
446 br(Assembler::LO, no_reserved_zone_enabling);
447
448 enter(); // LR and FP are live.
449 lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
450 mov(c_rarg0, rthread);
451 blr(rscratch1);
452 leave();
453
454 // We have already removed our own frame.
455 // throw_delayed_StackOverflowError will think that it's been
456 // called by our caller.
457 lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
458 br(rscratch1);
459 should_not_reach_here();
460
461 bind(no_reserved_zone_enabling);
462 }
463
464 void MacroAssembler::biased_locking_enter(Register lock_reg,
465 Register obj_reg,
466 Register swap_reg,
467 Register tmp_reg,
468 bool swap_reg_contains_mark,
469 Label& done,
470 Label* slow_case,
471 BiasedLockingCounters* counters) {
472 assert(UseBiasedLocking, "why call this otherwise?");
473 assert_different_registers(lock_reg, obj_reg, swap_reg);
474
475 if (PrintBiasedLockingStatistics && counters == NULL)
476 counters = BiasedLocking::counters();
477
478 assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
479 assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
480 Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes());
481 Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes());
482 Address saved_mark_addr(lock_reg, 0);
483
484 // Biased locking
485 // See whether the lock is currently biased toward our thread and
486 // whether the epoch is still valid
487 // Note that the runtime guarantees sufficient alignment of JavaThread
488 // pointers to allow age to be placed into low bits
489 // First check to see whether biasing is even enabled for this object
490 Label cas_label;
491 if (!swap_reg_contains_mark) {
492 ldr(swap_reg, mark_addr);
493 }
494 andr(tmp_reg, swap_reg, markWord::biased_lock_mask_in_place);
495 cmp(tmp_reg, (u1)markWord::biased_lock_pattern);
496 br(Assembler::NE, cas_label);
497 // The bias pattern is present in the object's header. Need to check
498 // whether the bias owner and the epoch are both still current.
499 load_prototype_header(tmp_reg, obj_reg);
500 orr(tmp_reg, tmp_reg, rthread);
501 eor(tmp_reg, swap_reg, tmp_reg);
502 andr(tmp_reg, tmp_reg, ~((int) markWord::age_mask_in_place));
503 if (counters != NULL) {
504 Label around;
505 cbnz(tmp_reg, around);
506 atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
507 b(done);
508 bind(around);
509 } else {
510 cbz(tmp_reg, done);
511 }
512
513 Label try_revoke_bias;
514 Label try_rebias;
515
516 // At this point we know that the header has the bias pattern and
517 // that we are not the bias owner in the current epoch. We need to
518 // figure out more details about the state of the header in order to
519 // know what operations can be legally performed on the object's
520 // header.
521
522 // If the low three bits in the xor result aren't clear, that means
523 // the prototype header is no longer biased and we have to revoke
524 // the bias on this object.
525 andr(rscratch1, tmp_reg, markWord::biased_lock_mask_in_place);
526 cbnz(rscratch1, try_revoke_bias);
527
528 // Biasing is still enabled for this data type. See whether the
529 // epoch of the current bias is still valid, meaning that the epoch
530 // bits of the mark word are equal to the epoch bits of the
531 // prototype header. (Note that the prototype header's epoch bits
532 // only change at a safepoint.) If not, attempt to rebias the object
533 // toward the current thread. Note that we must be absolutely sure
534 // that the current epoch is invalid in order to do this because
535 // otherwise the manipulations it performs on the mark word are
536 // illegal.
537 andr(rscratch1, tmp_reg, markWord::epoch_mask_in_place);
538 cbnz(rscratch1, try_rebias);
539
540 // The epoch of the current bias is still valid but we know nothing
541 // about the owner; it might be set or it might be clear. Try to
542 // acquire the bias of the object using an atomic operation. If this
543 // fails we will go in to the runtime to revoke the object's bias.
544 // Note that we first construct the presumed unbiased header so we
545 // don't accidentally blow away another thread's valid bias.
546 {
547 Label here;
548 mov(rscratch1, markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
549 andr(swap_reg, swap_reg, rscratch1);
550 orr(tmp_reg, swap_reg, rthread);
551 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
552 // If the biasing toward our thread failed, this means that
553 // another thread succeeded in biasing it toward itself and we
554 // need to revoke that bias. The revocation will occur in the
555 // interpreter runtime in the slow case.
556 bind(here);
557 if (counters != NULL) {
558 atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
559 tmp_reg, rscratch1, rscratch2);
560 }
561 }
562 b(done);
563
564 bind(try_rebias);
565 // At this point we know the epoch has expired, meaning that the
566 // current "bias owner", if any, is actually invalid. Under these
567 // circumstances _only_, we are allowed to use the current header's
568 // value as the comparison value when doing the cas to acquire the
569 // bias in the current epoch. In other words, we allow transfer of
570 // the bias from one thread to another directly in this situation.
571 //
572 // FIXME: due to a lack of registers we currently blow away the age
573 // bits in this situation. Should attempt to preserve them.
574 {
575 Label here;
576 load_prototype_header(tmp_reg, obj_reg);
577 orr(tmp_reg, rthread, tmp_reg);
578 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
579 // If the biasing toward our thread failed, then another thread
580 // succeeded in biasing it toward itself and we need to revoke that
581 // bias. The revocation will occur in the runtime in the slow case.
582 bind(here);
583 if (counters != NULL) {
584 atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
585 tmp_reg, rscratch1, rscratch2);
586 }
587 }
588 b(done);
589
590 bind(try_revoke_bias);
591 // The prototype mark in the klass doesn't have the bias bit set any
592 // more, indicating that objects of this data type are not supposed
593 // to be biased any more. We are going to try to reset the mark of
594 // this object to the prototype value and fall through to the
595 // CAS-based locking scheme. Note that if our CAS fails, it means
596 // that another thread raced us for the privilege of revoking the
597 // bias of this particular object, so it's okay to continue in the
598 // normal locking code.
599 //
600 // FIXME: due to a lack of registers we currently blow away the age
601 // bits in this situation. Should attempt to preserve them.
602 {
603 Label here, nope;
604 load_prototype_header(tmp_reg, obj_reg);
605 cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
606 bind(here);
607
608 // Fall through to the normal CAS-based lock, because no matter what
609 // the result of the above CAS, some thread must have succeeded in
610 // removing the bias bit from the object's header.
611 if (counters != NULL) {
612 atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
613 rscratch1, rscratch2);
614 }
615 bind(nope);
616 }
617
618 bind(cas_label);
619 }
620
621 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
622 assert(UseBiasedLocking, "why call this otherwise?");
623
624 // Check for biased locking unlock case, which is a no-op
625 // Note: we do not have to check the thread ID for two reasons.
626 // First, the interpreter checks for IllegalMonitorStateException at
627 // a higher level. Second, if the bias was revoked while we held the
628 // lock, the object could not be rebiased toward another thread, so
629 // the bias bit would be clear.
630 ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
631 andr(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
632 cmp(temp_reg, (u1)markWord::biased_lock_pattern);
633 br(Assembler::EQ, done);
634 }
635
636 static void pass_arg0(MacroAssembler* masm, Register arg) {
637 if (c_rarg0 != arg ) {
638 masm->mov(c_rarg0, arg);
639 }
640 }
641
642 static void pass_arg1(MacroAssembler* masm, Register arg) {
643 if (c_rarg1 != arg ) {
644 masm->mov(c_rarg1, arg);
645 }
646 }
647
648 static void pass_arg2(MacroAssembler* masm, Register arg) {
649 if (c_rarg2 != arg ) {
650 masm->mov(c_rarg2, arg);
651 }
652 }
653
654 static void pass_arg3(MacroAssembler* masm, Register arg) {
655 if (c_rarg3 != arg ) {
656 masm->mov(c_rarg3, arg);
657 }
658 }
659
660 void MacroAssembler::call_VM_base(Register oop_result,
661 Register java_thread,
662 Register last_java_sp,
663 address entry_point,
664 int number_of_arguments,
665 bool check_exceptions) {
666 // determine java_thread register
667 if (!java_thread->is_valid()) {
668 java_thread = rthread;
669 }
670
671 // determine last_java_sp register
672 if (!last_java_sp->is_valid()) {
673 last_java_sp = esp;
674 }
675
676 // debugging support
677 assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
678 assert(java_thread == rthread, "unexpected register");
679 #ifdef ASSERT
680 // TraceBytecodes does not use r12 but saves it over the call, so don't verify
681 // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
682 #endif // ASSERT
683
684 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
685 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
686
687 // push java thread (becomes first argument of C function)
688
689 mov(c_rarg0, java_thread);
690
691 // set last Java frame before call
692 assert(last_java_sp != rfp, "can't use rfp");
693
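// The label below is bound just after the runtime call (see
// call_VM_leaf_base), so the frame anchor's last_Java_pc ends up holding
// the call's return address.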
694 Label l;
695 set_last_Java_frame(last_java_sp, rfp, l, rscratch1);
696
697 // do the call, remove parameters
698 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
699
700 // lr could be poisoned with PAC signature during throw_pending_exception
701 // if it was tail-call optimized by compiler, since lr is not callee-saved
702 // reload it with proper value
703 adr(lr, l);
704
705 // reset last Java frame
706 // Only interpreter should have to clear fp
707 reset_last_Java_frame(true);
708
709 // C++ interp handles this in the interpreter
710 check_and_handle_popframe(java_thread);
711 check_and_handle_earlyret(java_thread);
712
713 if (check_exceptions) {
714 // check for pending exceptions (java_thread is set upon return)
715 ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
716 Label ok;
717 cbz(rscratch1, ok);
718 lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
719 br(rscratch1);
720 bind(ok);
721 }
722
723 // get oop result if there is one and reset the value in the thread
724 if (oop_result->is_valid()) {
725 get_vm_result(oop_result, java_thread);
726 }
727 }
728
729 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
730 call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
731 }
732
733 // Maybe emit a call via a trampoline. If the code cache is small
734 // trampolines won't be emitted.
735
736 address MacroAssembler::trampoline_call(Address entry, CodeBuffer* cbuf) {
737 assert(JavaThread::current()->is_Compiler_thread(), "just checking");
738 assert(entry.rspec().type() == relocInfo::runtime_call_type
739 || entry.rspec().type() == relocInfo::opt_virtual_call_type
740 || entry.rspec().type() == relocInfo::static_call_type
741 || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
742
743 bool need_trampoline = far_branches();
744 if (!need_trampoline && entry.rspec().type() == relocInfo::runtime_call_type && !CodeCache::contains(entry.target())) {
745 // If it is a runtime call of an address outside small CodeCache,
746 // we need to check whether it is in range.
747 address target = entry.target();
748 assert(target < CodeCache::low_bound() || target >= CodeCache::high_bound(), "target is inside CodeCache");
749 // Case 1: -------T-------L====CodeCache====H-------
750 // ^-------longest branch---|
751 // Case 2: -------L====CodeCache====H-------T-------
752 // |-------longest branch ---^
753 address longest_branch_start = (target < CodeCache::low_bound()) ? CodeCache::high_bound() - NativeInstruction::instruction_size
754 : CodeCache::low_bound();
755 need_trampoline = !reachable_from_branch_at(longest_branch_start, target);
756 }
757
758 // We need a trampoline if branches are far.
759 if (need_trampoline) {
760 bool in_scratch_emit_size = false;
761 #ifdef COMPILER2
762 // We don't want to emit a trampoline if C2 is generating dummy
763 // code during its branch shortening phase.
764 CompileTask* task = ciEnv::current()->task();
765 in_scratch_emit_size =
766 (task != NULL && is_c2_compile(task->comp_level()) &&
767 Compile::current()->output()->in_scratch_emit_size());
768 #endif
769 if (!in_scratch_emit_size) {
770 address stub = emit_trampoline_stub(offset(), entry.target());
771 if (stub == NULL) {
772 postcond(pc() == badAddress);
773 return NULL; // CodeCache is full
774 }
775 }
776 }
777
778 if (cbuf) cbuf->set_insts_mark();
779 relocate(entry.rspec());
780 if (!need_trampoline) {
781 bl(entry.target());
782 } else {
783 bl(pc());
784 }
785 // just need to return a non-null address
786 postcond(pc() != badAddress);
787 return pc();
788 }
789
790
791 // Emit a trampoline stub for a call to a target which is too far away.
792 //
793 // code sequences:
794 //
795 // call-site:
796 // branch-and-link to <destination> or <trampoline stub>
797 //
798 // Related trampoline stub for this call site in the stub section:
799 // load the call target from the constant pool
800 // branch (LR still points to the call site above)
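//
// A sketch of what emit_trampoline_stub below generates:
//   ldr  rscratch1, <dest>     // load the destination from the data word
//   br   rscratch1
//   <dest>: 8-byte destination address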
801
802 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
803 address dest) {
804 // Max stub size: alignment nop, TrampolineStub.
805 address stub = start_a_stub(NativeInstruction::instruction_size
806 + NativeCallTrampolineStub::instruction_size);
807 if (stub == NULL) {
808 return NULL; // CodeBuffer::expand failed
809 }
810
811 // Create a trampoline stub relocation which relates this trampoline stub
812 // with the call instruction at insts_call_instruction_offset in the
813 // instructions code-section.
814 align(wordSize);
815 relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
816 + insts_call_instruction_offset));
817 const int stub_start_offset = offset();
818
819 // Now, create the trampoline stub's code:
820 // - load the call target from the constant pool word emitted below
821 // - branch to it (LR still points to the original call site)
822 Label target;
823 ldr(rscratch1, target);
824 br(rscratch1);
825 bind(target);
826 assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
827 "should be");
828 emit_int64((int64_t)dest);
829
830 const address stub_start_addr = addr_at(stub_start_offset);
831
832 assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
833
834 end_a_stub();
835 return stub_start_addr;
836 }
837
838 void MacroAssembler::emit_static_call_stub() {
839 // CompiledDirectStaticCall::set_to_interpreted knows the
840 // exact layout of this stub.
841
842 isb();
843 mov_metadata(rmethod, (Metadata*)NULL);
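// The NULL metadata above and the zero destination below are placeholders
// that are patched in when the call is resolved (see the layout note above).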
844
845 // Jump to the entry point of the i2c stub.
846 movptr(rscratch1, 0);
847 br(rscratch1);
848 }
849
850 void MacroAssembler::c2bool(Register x) {
851 // implements x == 0 ? 0 : 1
852 // note: must only look at least-significant byte of x
853 // since C-style booleans are stored in one byte
854 // only! (was bug)
855 tst(x, 0xff);
856 cset(x, Assembler::NE);
857 }
858
859 address MacroAssembler::ic_call(address entry, jint method_index) {
860 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
861 // address const_ptr = long_constant((jlong)Universe::non_oop_word());
862 // uintptr_t offset;
863 // ldr_constant(rscratch2, const_ptr);
864 movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
865 return trampoline_call(Address(entry, rh));
866 }
867
868 // Implementation of call_VM versions
869
870 void MacroAssembler::call_VM(Register oop_result,
871 address entry_point,
872 bool check_exceptions) {
873 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
874 }
875
876 void MacroAssembler::call_VM(Register oop_result,
877 address entry_point,
878 Register arg_1,
879 bool check_exceptions) {
880 pass_arg1(this, arg_1);
881 call_VM_helper(oop_result, entry_point, 1, check_exceptions);
882 }
883
884 void MacroAssembler::call_VM(Register oop_result,
885 address entry_point,
886 Register arg_1,
887 Register arg_2,
888 bool check_exceptions) {
889 assert(arg_1 != c_rarg2, "smashed arg");
890 pass_arg2(this, arg_2);
891 pass_arg1(this, arg_1);
892 call_VM_helper(oop_result, entry_point, 2, check_exceptions);
893 }
894
895 void MacroAssembler::call_VM(Register oop_result,
896 address entry_point,
897 Register arg_1,
898 Register arg_2,
899 Register arg_3,
900 bool check_exceptions) {
901 assert(arg_1 != c_rarg3, "smashed arg");
902 assert(arg_2 != c_rarg3, "smashed arg");
903 pass_arg3(this, arg_3);
904
905 assert(arg_1 != c_rarg2, "smashed arg");
906 pass_arg2(this, arg_2);
907
908 pass_arg1(this, arg_1);
909 call_VM_helper(oop_result, entry_point, 3, check_exceptions);
910 }
911
912 void MacroAssembler::call_VM(Register oop_result,
913 Register last_java_sp,
914 address entry_point,
915 int number_of_arguments,
916 bool check_exceptions) {
917 call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
918 }
919
920 void MacroAssembler::call_VM(Register oop_result,
921 Register last_java_sp,
922 address entry_point,
923 Register arg_1,
924 bool check_exceptions) {
925 pass_arg1(this, arg_1);
926 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
927 }
928
929 void MacroAssembler::call_VM(Register oop_result,
930 Register last_java_sp,
931 address entry_point,
932 Register arg_1,
933 Register arg_2,
934 bool check_exceptions) {
935
936 assert(arg_1 != c_rarg2, "smashed arg");
937 pass_arg2(this, arg_2);
938 pass_arg1(this, arg_1);
939 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
940 }
941
942 void MacroAssembler::call_VM(Register oop_result,
943 Register last_java_sp,
944 address entry_point,
945 Register arg_1,
946 Register arg_2,
947 Register arg_3,
948 bool check_exceptions) {
949 assert(arg_1 != c_rarg3, "smashed arg");
950 assert(arg_2 != c_rarg3, "smashed arg");
951 pass_arg3(this, arg_3);
952 assert(arg_1 != c_rarg2, "smashed arg");
953 pass_arg2(this, arg_2);
954 pass_arg1(this, arg_1);
955 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
956 }
957
958
959 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
960 ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
961 str(zr, Address(java_thread, JavaThread::vm_result_offset()));
962 verify_oop(oop_result, "broken oop in call_VM_base");
963 }
964
965 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
966 ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
967 str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
968 }
969
970 void MacroAssembler::align(int modulus) {
971 while (offset() % modulus != 0) nop();
972 }
973
974 // these are no-ops overridden by InterpreterMacroAssembler
975
976 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
977
978 void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
979
980 // Look up the method for a megamorphic invokeinterface call.
981 // The target method is determined by <intf_klass, itable_index>.
982 // The receiver klass is in recv_klass.
983 // On success, the result will be in method_result, and execution falls through.
984 // On failure, execution transfers to the given label.
985 void MacroAssembler::lookup_interface_method(Register recv_klass,
986 Register intf_klass,
987 RegisterOrConstant itable_index,
988 Register method_result,
989 Register scan_temp,
990 Label& L_no_such_interface,
991 bool return_method) {
992 assert_different_registers(recv_klass, intf_klass, scan_temp);
993 assert_different_registers(method_result, intf_klass, scan_temp);
994 assert(recv_klass != method_result || !return_method,
995 "recv_klass can be destroyed when method isn't needed");
996 assert(itable_index.is_constant() || itable_index.as_register() == method_result,
997 "caller must use same register for non-constant itable index as for method");
998
999 // Compute start of first itableOffsetEntry (which is at the end of the vtable)
1000 int vtable_base = in_bytes(Klass::vtable_start_offset());
1001 int itentry_off = itableMethodEntry::method_offset_in_bytes();
1002 int scan_step = itableOffsetEntry::size() * wordSize;
1003 int vte_size = vtableEntry::size_in_bytes();
1004 assert(vte_size == wordSize, "else adjust times_vte_scale");
1005
1006 ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
1007
1008 // %%% Could store the aligned, prescaled offset in the klassoop.
1009 // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
1010 lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
1011 add(scan_temp, scan_temp, vtable_base);
1012
1013 if (return_method) {
1014 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1015 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1016 // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
1017 lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
1018 if (itentry_off)
1019 add(recv_klass, recv_klass, itentry_off);
1020 }
1021
1022 // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1023 // if (scan->interface() == intf) {
1024 // result = (klass + scan->offset() + itable_index);
1025 // }
1026 // }
1027 Label search, found_method;
1028
1029 ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
1030 cmp(intf_klass, method_result);
1031 br(Assembler::EQ, found_method);
1032 bind(search);
1033 // Check that the previous entry is non-null. A null entry means that
1034 // the receiver class doesn't implement the interface, and wasn't the
1035 // same as when the caller was compiled.
1036 cbz(method_result, L_no_such_interface);
1037 if (itableOffsetEntry::interface_offset_in_bytes() != 0) {
1038 add(scan_temp, scan_temp, scan_step);
1039 ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
1040 } else {
1041 ldr(method_result, Address(pre(scan_temp, scan_step)));
1042 }
1043 cmp(intf_klass, method_result);
1044 br(Assembler::NE, search);
1045
1046 bind(found_method);
1047
1048 // Got a hit.
1049 if (return_method) {
1050 ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
1051 ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
1052 }
1053 }
1054
1055 // virtual method calling
1056 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1057 RegisterOrConstant vtable_index,
1058 Register method_result) {
1059 const int base = in_bytes(Klass::vtable_start_offset());
1060 assert(vtableEntry::size() * wordSize == 8,
1061 "adjust the scaling in the code below");
1062 int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
1063
1064 if (vtable_index.is_register()) {
1065 lea(method_result, Address(recv_klass,
1066 vtable_index.as_register(),
1067 Address::lsl(LogBytesPerWord)));
1068 ldr(method_result, Address(method_result, vtable_offset_in_bytes));
1069 } else {
1070 vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
1071 ldr(method_result,
1072 form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
1073 }
1074 }
1075
1076 void MacroAssembler::check_klass_subtype(Register sub_klass,
1077 Register super_klass,
1078 Register temp_reg,
1079 Label& L_success) {
1080 Label L_failure;
1081 check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
1082 check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
1083 bind(L_failure);
1084 }
1085
1086
1087 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1088 Register super_klass,
1089 Register temp_reg,
1090 Label* L_success,
1091 Label* L_failure,
1092 Label* L_slow_path,
1093 RegisterOrConstant super_check_offset) {
1094 assert_different_registers(sub_klass, super_klass, temp_reg);
1095 bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1096 if (super_check_offset.is_register()) {
1097 assert_different_registers(sub_klass, super_klass,
1098 super_check_offset.as_register());
1099 } else if (must_load_sco) {
1100 assert(temp_reg != noreg, "supply either a temp or a register offset");
1101 }
1102
1103 Label L_fallthrough;
1104 int label_nulls = 0;
1105 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
1106 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
1107 if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1108 assert(label_nulls <= 1, "at most one NULL in the batch");
1109
1110 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1111 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1112 Address super_check_offset_addr(super_klass, sco_offset);
1113
1114 // Hacked jmp, which may only be used just before L_fallthrough.
1115 #define final_jmp(label) \
1116 if (&(label) == &L_fallthrough) { /*do nothing*/ } \
1117 else b(label) /*omit semi*/
1118
1119 // If the pointers are equal, we are done (e.g., String[] elements).
1120 // This self-check enables sharing of secondary supertype arrays among
1121 // non-primary types such as array-of-interface. Otherwise, each such
1122 // type would need its own customized SSA.
1123 // We move this check to the front of the fast path because many
1124 // type checks are in fact trivially successful in this manner,
1125 // so we get a nicely predicted branch right at the start of the check.
1126 cmp(sub_klass, super_klass);
1127 br(Assembler::EQ, *L_success);
1128
1129 // Check the supertype display:
1130 if (must_load_sco) {
1131 ldrw(temp_reg, super_check_offset_addr);
1132 super_check_offset = RegisterOrConstant(temp_reg);
1133 }
1134 Address super_check_addr(sub_klass, super_check_offset);
1135 ldr(rscratch1, super_check_addr);
1136 cmp(super_klass, rscratch1); // load displayed supertype
1137
1138 // This check has worked decisively for primary supers.
1139 // Secondary supers are sought in the super_cache ('super_cache_addr').
1140 // (Secondary supers are interfaces and very deeply nested subtypes.)
1141 // This works in the same check above because of a tricky aliasing
1142 // between the super_cache and the primary super display elements.
1143 // (The 'super_check_addr' can address either, as the case requires.)
1144 // Note that the cache is updated below if it does not help us find
1145 // what we need immediately.
1146 // So if it was a primary super, we can just fail immediately.
1147 // Otherwise, it's the slow path for us (no success at this point).
1148
1149 if (super_check_offset.is_register()) {
1150 br(Assembler::EQ, *L_success);
1151 subs(zr, super_check_offset.as_register(), sc_offset);
1152 if (L_failure == &L_fallthrough) {
1153 br(Assembler::EQ, *L_slow_path);
1154 } else {
1155 br(Assembler::NE, *L_failure);
1156 final_jmp(*L_slow_path);
1157 }
1158 } else if (super_check_offset.as_constant() == sc_offset) {
1159 // Need a slow path; fast failure is impossible.
1160 if (L_slow_path == &L_fallthrough) {
1161 br(Assembler::EQ, *L_success);
1162 } else {
1163 br(Assembler::NE, *L_slow_path);
1164 final_jmp(*L_success);
1165 }
1166 } else {
1167 // No slow path; it's a fast decision.
1168 if (L_failure == &L_fallthrough) {
1169 br(Assembler::EQ, *L_success);
1170 } else {
1171 br(Assembler::NE, *L_failure);
1172 final_jmp(*L_success);
1173 }
1174 }
1175
1176 bind(L_fallthrough);
1177
1178 #undef final_jmp
1179 }
1180
1181 // These two are taken from x86, but they look generally useful
1182
1183 // scans count pointer-sized words at [addr] for an occurrence of value,
1184 // generic
1185 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1186 Register scratch) {
1187 Label Lloop, Lexit;
1188 cbz(count, Lexit);
1189 bind(Lloop);
1190 ldr(scratch, post(addr, wordSize));
1191 cmp(value, scratch);
1192 br(EQ, Lexit);
1193 sub(count, count, 1);
1194 cbnz(count, Lloop);
1195 bind(Lexit);
1196 }
1197
1198 // scans count 4-byte words at [addr] for an occurrence of value,
1199 // generic
1200 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1201 Register scratch) {
1202 Label Lloop, Lexit;
1203 cbz(count, Lexit);
1204 bind(Lloop);
1205 ldrw(scratch, post(addr, wordSize));
1206 cmpw(value, scratch);
1207 br(EQ, Lexit);
1208 sub(count, count, 1);
1209 cbnz(count, Lloop);
1210 bind(Lexit);
1211 }
1212
1213 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1214 Register super_klass,
1215 Register temp_reg,
1216 Register temp2_reg,
1217 Label* L_success,
1218 Label* L_failure,
1219 bool set_cond_codes) {
1220 assert_different_registers(sub_klass, super_klass, temp_reg);
1221 if (temp2_reg != noreg)
1222 assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1223 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1224
1225 Label L_fallthrough;
1226 int label_nulls = 0;
1227 if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
1228 if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
1229 assert(label_nulls <= 1, "at most one NULL in the batch");
1230
1231 // a couple of useful fields in sub_klass:
1232 int ss_offset = in_bytes(Klass::secondary_supers_offset());
1233 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1234 Address secondary_supers_addr(sub_klass, ss_offset);
1235 Address super_cache_addr( sub_klass, sc_offset);
1236
1237 BLOCK_COMMENT("check_klass_subtype_slow_path");
1238
1239 // Do a linear scan of the secondary super-klass chain.
1240 // This code is rarely used, so simplicity is a virtue here.
1241 // The repne_scan instruction uses fixed registers, which we must spill.
1242 // Don't worry too much about pre-existing connections with the input regs.
1243
1244 assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1245 assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1246
1247 RegSet pushed_registers;
1248 if (!IS_A_TEMP(r2)) pushed_registers += r2;
1249 if (!IS_A_TEMP(r5)) pushed_registers += r5;
1250
1251 if (super_klass != r0) {
1252 if (!IS_A_TEMP(r0)) pushed_registers += r0;
1253 }
1254
1255 push(pushed_registers, sp);
1256
1257 // Get super_klass value into r0 (even if it was in r5 or r2).
1258 if (super_klass != r0) {
1259 mov(r0, super_klass);
1260 }
1261
1262 #ifndef PRODUCT
1263 mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1264 Address pst_counter_addr(rscratch2);
1265 ldr(rscratch1, pst_counter_addr);
1266 add(rscratch1, rscratch1, 1);
1267 str(rscratch1, pst_counter_addr);
1268 #endif //PRODUCT
1269
1270 // We will consult the secondary-super array.
1271 ldr(r5, secondary_supers_addr);
1272 // Load the array length.
1273 ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1274 // Skip to start of data.
1275 add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1276
1277 cmp(sp, zr); // Clear Z flag; SP is never zero
1278 // Scan R2 words at [R5] for an occurrence of R0.
1279 // Set NZ/Z based on last compare.
1280 repne_scan(r5, r0, r2, rscratch1);
1281
1282 // Unspill the temp. registers:
1283 pop(pushed_registers, sp);
1284
1285 br(Assembler::NE, *L_failure);
1286
1287 // Success. Cache the super we found and proceed in triumph.
1288 str(super_klass, super_cache_addr);
1289
1290 if (L_success != &L_fallthrough) {
1291 b(*L_success);
1292 }
1293
1294 #undef IS_A_TEMP
1295
1296 bind(L_fallthrough);
1297 }
1298
1299 void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
1300 assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
1301 assert_different_registers(klass, rthread, scratch);
1302
1303 Label L_fallthrough, L_tmp;
1304 if (L_fast_path == NULL) {
1305 L_fast_path = &L_fallthrough;
1306 } else if (L_slow_path == NULL) {
1307 L_slow_path = &L_fallthrough;
1308 }
1309 // Fast path check: class is fully initialized
1310 ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
1311 subs(zr, scratch, InstanceKlass::fully_initialized);
1312 br(Assembler::EQ, *L_fast_path);
1313
1314 // Fast path check: current thread is initializer thread
1315 ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
1316 cmp(rthread, scratch);
1317
1318 if (L_slow_path == &L_fallthrough) {
1319 br(Assembler::EQ, *L_fast_path);
1320 bind(*L_slow_path);
1321 } else if (L_fast_path == &L_fallthrough) {
1322 br(Assembler::NE, *L_slow_path);
1323 bind(*L_fast_path);
1324 } else {
1325 Unimplemented();
1326 }
1327 }
1328
1329 void MacroAssembler::verify_oop(Register reg, const char* s) {
1330 if (!VerifyOops) return;
1331
1332 // Pass register number to verify_oop_subroutine
1333 const char* b = NULL;
1334 {
1335 ResourceMark rm;
1336 stringStream ss;
1337 ss.print("verify_oop: %s: %s", reg->name(), s);
1338 b = code_string(ss.as_string());
1339 }
1340 BLOCK_COMMENT("verify_oop {");
1341
1342 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1343 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1344
1345 mov(r0, reg);
1346 movptr(rscratch1, (uintptr_t)(address)b);
1347
1348 // call indirectly to solve generation ordering problem
1349 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1350 ldr(rscratch2, Address(rscratch2));
1351 blr(rscratch2);
1352
1353 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1354 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1355
1356 BLOCK_COMMENT("} verify_oop");
1357 }
1358
1359 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1360 if (!VerifyOops) return;
1361
1362 const char* b = NULL;
1363 {
1364 ResourceMark rm;
1365 stringStream ss;
1366 ss.print("verify_oop_addr: %s", s);
1367 b = code_string(ss.as_string());
1368 }
1369 BLOCK_COMMENT("verify_oop_addr {");
1370
1371 stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1372 stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1373
1374 // addr may contain sp so we will have to adjust it based on the
1375 // pushes that we just did.
1376 if (addr.uses(sp)) {
1377 lea(r0, addr);
1378 ldr(r0, Address(r0, 4 * wordSize));
1379 } else {
1380 ldr(r0, addr);
1381 }
1382 movptr(rscratch1, (uintptr_t)(address)b);
1383
1384 // call indirectly to solve generation ordering problem
1385 lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1386 ldr(rscratch2, Address(rscratch2));
1387 blr(rscratch2);
1388
1389 ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1390 ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1391
1392 BLOCK_COMMENT("} verify_oop_addr");
1393 }
1394
1395 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1396 int extra_slot_offset) {
1397 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1398 int stackElementSize = Interpreter::stackElementSize;
1399 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1400 #ifdef ASSERT
1401 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1402 assert(offset1 - offset == stackElementSize, "correct arithmetic");
1403 #endif
1404 if (arg_slot.is_constant()) {
1405 return Address(esp, arg_slot.as_constant() * stackElementSize
1406 + offset);
1407 } else {
1408 add(rscratch1, esp, arg_slot.as_register(),
1409 ext::uxtx, exact_log2(stackElementSize));
1410 return Address(rscratch1, offset);
1411 }
1412 }
1413
1414 void MacroAssembler::call_VM_leaf_base(address entry_point,
1415 int number_of_arguments,
1416 Label *retaddr) {
1417 Label E, L;
1418
1419 stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1420
1421 mov(rscratch1, entry_point);
1422 blr(rscratch1);
1423 if (retaddr)
1424 bind(*retaddr);
1425
1426 ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1427 }
1428
1429 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1430 call_VM_leaf_base(entry_point, number_of_arguments);
1431 }
1432
1433 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1434 pass_arg0(this, arg_0);
1435 call_VM_leaf_base(entry_point, 1);
1436 }
1437
1438 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1439 pass_arg0(this, arg_0);
1440 pass_arg1(this, arg_1);
1441 call_VM_leaf_base(entry_point, 2);
1442 }
1443
1444 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1445 Register arg_1, Register arg_2) {
1446 pass_arg0(this, arg_0);
1447 pass_arg1(this, arg_1);
1448 pass_arg2(this, arg_2);
1449 call_VM_leaf_base(entry_point, 3);
1450 }
1451
1452 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1453 pass_arg0(this, arg_0);
1454 MacroAssembler::call_VM_leaf_base(entry_point, 1);
1455 }
1456
1457 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1458
1459 assert(arg_0 != c_rarg1, "smashed arg");
1460 pass_arg1(this, arg_1);
1461 pass_arg0(this, arg_0);
1462 MacroAssembler::call_VM_leaf_base(entry_point, 2);
1463 }
1464
1465 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1466 assert(arg_0 != c_rarg2, "smashed arg");
1467 assert(arg_1 != c_rarg2, "smashed arg");
1468 pass_arg2(this, arg_2);
1469 assert(arg_0 != c_rarg1, "smashed arg");
1470 pass_arg1(this, arg_1);
1471 pass_arg0(this, arg_0);
1472 MacroAssembler::call_VM_leaf_base(entry_point, 3);
1473 }
1474
1475 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1476 assert(arg_0 != c_rarg3, "smashed arg");
1477 assert(arg_1 != c_rarg3, "smashed arg");
1478 assert(arg_2 != c_rarg3, "smashed arg");
1479 pass_arg3(this, arg_3);
1480 assert(arg_0 != c_rarg2, "smashed arg");
1481 assert(arg_1 != c_rarg2, "smashed arg");
1482 pass_arg2(this, arg_2);
1483 assert(arg_0 != c_rarg1, "smashed arg");
1484 pass_arg1(this, arg_1);
1485 pass_arg0(this, arg_0);
1486 MacroAssembler::call_VM_leaf_base(entry_point, 4);
1487 }
1488
1489 void MacroAssembler::null_check(Register reg, int offset) {
1490 if (needs_explicit_null_check(offset)) {
1491 // provoke OS NULL exception if reg = NULL by
1492 // accessing M[reg] w/o changing any registers
1493 // NOTE: this is plenty to provoke a segv
1494 ldr(zr, Address(reg));
1495 } else {
1496 // nothing to do, (later) access of M[reg + offset]
1497 // will provoke OS NULL exception if reg = NULL
1498 }
1499 }
1500
1501 // MacroAssembler protected routines needed to implement
1502 // public methods
1503
1504 void MacroAssembler::mov(Register r, Address dest) {
1505 code_section()->relocate(pc(), dest.rspec());
1506 uint64_t imm64 = (uint64_t)dest.target();
1507 movptr(r, imm64);
1508 }
1509
1510 // Move a constant pointer into r. In AArch64 mode the virtual
1511 // address space is 48 bits in size, so we only need three
1512 // instructions to create a patchable instruction sequence that can
1513 // reach anywhere.
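// For example (illustrative only), movptr(r10, 0x0000123456789abcULL) expands to
// roughly:
//   movz r10, #0x9abc
//   movk r10, #0x5678, lsl #16
//   movk r10, #0x1234, lsl #32
// matching the three-instruction sequence emitted below.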
1514 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1515 #ifndef PRODUCT
1516 {
1517 char buffer[64];
1518 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, (uint64_t)imm64);
1519 block_comment(buffer);
1520 }
1521 #endif
1522 assert(imm64 < (1ull << 48), "48-bit overflow in address constant");
1523 movz(r, imm64 & 0xffff);
1524 imm64 >>= 16;
1525 movk(r, imm64 & 0xffff, 16);
1526 imm64 >>= 16;
1527 movk(r, imm64 & 0xffff, 32);
1528 }
1529
1530 // Macro to mov replicated immediate to vector register.
1531 // imm64: only the lower 8/16/32 bits are considered for B/H/S type. That is,
1532 // the upper 56/48/32 bits must be zeros for B/H/S type.
1533 // Vd will get the following values for different arrangements in T
1534 // imm64 == hex 000000gh T8B: Vd = ghghghghghghghgh
1535 // imm64 == hex 000000gh T16B: Vd = ghghghghghghghghghghghghghghghgh
1536 // imm64 == hex 0000efgh T4H: Vd = efghefghefghefgh
1537 // imm64 == hex 0000efgh T8H: Vd = efghefghefghefghefghefghefghefgh
1538 // imm64 == hex abcdefgh T2S: Vd = abcdefghabcdefgh
1539 // imm64 == hex abcdefgh T4S: Vd = abcdefghabcdefghabcdefghabcdefgh
1540 // imm64 == hex abcdefgh T1D: Vd = 00000000abcdefgh
1541 // imm64 == hex abcdefgh T2D: Vd = 00000000abcdefgh00000000abcdefgh
1542 // Clobbers rscratch1
1543 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, uint64_t imm64) {
1544 assert(T != T1Q, "unsupported");
1545 if (T == T1D || T == T2D) {
1546 int imm = operand_valid_for_movi_immediate(imm64, T);
1547 if (-1 != imm) {
1548 movi(Vd, T, imm);
1549 } else {
1550 mov(rscratch1, imm64);
1551 dup(Vd, T, rscratch1);
1552 }
1553 return;
1554 }
1555
1556 #ifdef ASSERT
1557 if (T == T8B || T == T16B) assert((imm64 & ~0xff) == 0, "extraneous bits (T8B/T16B)");
1558 if (T == T4H || T == T8H) assert((imm64 & ~0xffff) == 0, "extraneous bits (T4H/T8H)");
1559 if (T == T2S || T == T4S) assert((imm64 & ~0xffffffff) == 0, "extraneous bits (T2S/T4S)");
1560 #endif
1561 int shift = operand_valid_for_movi_immediate(imm64, T);
1562 uint32_t imm32 = imm64 & 0xffffffffULL;
1563 if (shift >= 0) {
1564 movi(Vd, T, (imm32 >> shift) & 0xff, shift);
1565 } else {
1566 movw(rscratch1, imm32);
1567 dup(Vd, T, rscratch1);
1568 }
1569 }
1570
1571 void MacroAssembler::mov_immediate64(Register dst, uint64_t imm64)
1572 {
1573 #ifndef PRODUCT
1574 {
1575 char buffer[64];
1576 snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1577 block_comment(buffer);
1578 }
1579 #endif
1580 if (operand_valid_for_logical_immediate(false, imm64)) {
1581 orr(dst, zr, imm64);
1582 } else {
1583 // we can use a combination of MOVZ or MOVN with
1584 // MOVK to build up the constant
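// For instance (an illustrative case, not exhaustive), 0x00000000ffff1234 has two
// zero halfwords, so the code below emits MOVZ dst, #0x1234 followed by
// MOVK dst, #0xffff, lsl #16.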
1585 uint64_t imm_h[4];
1586 int zero_count = 0;
1587 int neg_count = 0;
1588 int i;
1589 for (i = 0; i < 4; i++) {
1590 imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1591 if (imm_h[i] == 0) {
1592 zero_count++;
1593 } else if (imm_h[i] == 0xffffL) {
1594 neg_count++;
1595 }
1596 }
1597 if (zero_count == 4) {
1598 // one MOVZ will do
1599 movz(dst, 0);
1600 } else if (neg_count == 4) {
1601 // one MOVN will do
1602 movn(dst, 0);
1603 } else if (zero_count == 3) {
1604 for (i = 0; i < 4; i++) {
1605 if (imm_h[i] != 0L) {
1606 movz(dst, (uint32_t)imm_h[i], (i << 4));
1607 break;
1608 }
1609 }
1610 } else if (neg_count == 3) {
1611 // one MOVN will do
1612 for (int i = 0; i < 4; i++) {
1613 if (imm_h[i] != 0xffffL) {
1614 movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1615 break;
1616 }
1617 }
1618 } else if (zero_count == 2) {
1619 // one MOVZ and one MOVK will do
1620 for (i = 0; i < 3; i++) {
1621 if (imm_h[i] != 0L) {
1622 movz(dst, (uint32_t)imm_h[i], (i << 4));
1623 i++;
1624 break;
1625 }
1626 }
1627 for (;i < 4; i++) {
1628 if (imm_h[i] != 0L) {
1629 movk(dst, (uint32_t)imm_h[i], (i << 4));
1630 }
1631 }
1632 } else if (neg_count == 2) {
1633 // one MOVN and one MOVK will do
1634 for (i = 0; i < 4; i++) {
1635 if (imm_h[i] != 0xffffL) {
1636 movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1637 i++;
1638 break;
1639 }
1640 }
1641 for (;i < 4; i++) {
1642 if (imm_h[i] != 0xffffL) {
1643 movk(dst, (uint32_t)imm_h[i], (i << 4));
1644 }
1645 }
1646 } else if (zero_count == 1) {
1647 // one MOVZ and two MOVKs will do
1648 for (i = 0; i < 4; i++) {
1649 if (imm_h[i] != 0L) {
1650 movz(dst, (uint32_t)imm_h[i], (i << 4));
1651 i++;
1652 break;
1653 }
1654 }
1655 for (;i < 4; i++) {
1656 if (imm_h[i] != 0x0L) {
1657 movk(dst, (uint32_t)imm_h[i], (i << 4));
1658 }
1659 }
1660 } else if (neg_count == 1) {
1661 // one MOVN and two MOVKs will do
1662 for (i = 0; i < 4; i++) {
1663 if (imm_h[i] != 0xffffL) {
1664 movn(dst, (uint32_t)imm_h[i] ^ 0xffffL, (i << 4));
1665 i++;
1666 break;
1667 }
1668 }
1669 for (;i < 4; i++) {
1670 if (imm_h[i] != 0xffffL) {
1671 movk(dst, (uint32_t)imm_h[i], (i << 4));
1672 }
1673 }
1674 } else {
1675 // use a MOVZ and 3 MOVKs (makes it easier to debug)
1676 movz(dst, (uint32_t)imm_h[0], 0);
1677 for (i = 1; i < 4; i++) {
1678 movk(dst, (uint32_t)imm_h[i], (i << 4));
1679 }
1680 }
1681 }
1682 }
1683
1684 void MacroAssembler::mov_immediate32(Register dst, uint32_t imm32)
1685 {
1686 #ifndef PRODUCT
1687 {
1688 char buffer[64];
1689 snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1690 block_comment(buffer);
1691 }
1692 #endif
1693 if (operand_valid_for_logical_immediate(true, imm32)) {
1694 orrw(dst, zr, imm32);
1695 } else {
1696 // we can use MOVZ, MOVN or two calls to MOVK to build up the
1697 // constant
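// For example (illustrative), 0x12340000 has a zero low halfword, so a single
// MOVZW dst, #0x1234, lsl #16 suffices; 0x1234ffff takes the MOVNW path instead.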
1698 uint32_t imm_h[2];
1699 imm_h[0] = imm32 & 0xffff;
1700 imm_h[1] = ((imm32 >> 16) & 0xffff);
1701 if (imm_h[0] == 0) {
1702 movzw(dst, imm_h[1], 16);
1703 } else if (imm_h[0] == 0xffff) {
1704 movnw(dst, imm_h[1] ^ 0xffff, 16);
1705 } else if (imm_h[1] == 0) {
1706 movzw(dst, imm_h[0], 0);
1707 } else if (imm_h[1] == 0xffff) {
1708 movnw(dst, imm_h[0] ^ 0xffff, 0);
1709 } else {
1710 // use a MOVZ and MOVK (makes it easier to debug)
1711 movzw(dst, imm_h[0], 0);
1712 movkw(dst, imm_h[1], 16);
1713 }
1714 }
1715 }
1716
1717 // Form an address from base + offset in Rd. Rd may or may
1718 // not actually be used: you must use the Address that is returned.
1719 // It is up to you to ensure that the shift provided matches the size
1720 // of your data.
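// For instance (a sketch), form_address(r10, r2, 0x10008, 3) cannot use the scaled
// 12-bit immediate form, so the code below adds 0x10000 to r2 into r10 and returns
// Address(r10, 8).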
1721 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset, int shift) {
1722 if (Address::offset_ok_for_immed(byte_offset, shift))
1723 // It fits; no need for any heroics
1724 return Address(base, byte_offset);
1725
1726 // Don't do anything clever with negative or misaligned offsets
1727 unsigned mask = (1 << shift) - 1;
1728 if (byte_offset < 0 || byte_offset & mask) {
1729 mov(Rd, byte_offset);
1730 add(Rd, base, Rd);
1731 return Address(Rd);
1732 }
1733
1734 // See if we can do this with two 12-bit offsets
1735 {
1736 uint64_t word_offset = byte_offset >> shift;
1737 uint64_t masked_offset = word_offset & 0xfff000;
1738 if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
1739 && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1740 add(Rd, base, masked_offset << shift);
1741 word_offset -= masked_offset;
1742 return Address(Rd, word_offset << shift);
1743 }
1744 }
1745
1746 // Do it the hard way
1747 mov(Rd, byte_offset);
1748 add(Rd, base, Rd);
1749 return Address(Rd);
1750 }
1751
1752 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1753 if (UseLSE) {
1754 mov(tmp, 1);
1755 ldadd(Assembler::word, tmp, zr, counter_addr);
1756 return;
1757 }
1758 Label retry_load;
1759 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1760 prfm(Address(counter_addr), PSTL1STRM);
1761 bind(retry_load);
1762 // flush and load exclusive from the memory location
1763 ldxrw(tmp, counter_addr);
1764 addw(tmp, tmp, 1);
1765 // if we store+flush with no intervening write tmp will be zero
1766 stxrw(tmp2, tmp, counter_addr);
1767 cbnzw(tmp2, retry_load);
1768 }
1769
1770
1771 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1772 bool want_remainder, Register scratch)
1773 {
1774 // Full implementation of Java idiv and irem. The function
1775 // returns the (pc) offset of the div instruction - may be needed
1776 // for implicit exceptions.
1777 //
1778 // constraint : ra/rb =/= scratch
1779 // normal case
1780 //
1781 // input : ra: dividend
1782 // rb: divisor
1783 //
1784 // result: either
1785 // quotient (= ra idiv rb)
1786 // remainder (= ra irem rb)
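// Arithmetic sketch: with ra = -7 and rb = 2 the quotient is -3 and the remainder
// computed by msubw is -7 - (-3 * 2) = -1, matching Java semantics where the
// remainder takes the sign of the dividend.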
1787
1788 assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1789
1790 int idivl_offset = offset();
1791 if (! want_remainder) {
1792 sdivw(result, ra, rb);
1793 } else {
1794 sdivw(scratch, ra, rb);
1795 Assembler::msubw(result, scratch, rb, ra);
1796 }
1797
1798 return idivl_offset;
1799 }
1800
1801 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1802 bool want_remainder, Register scratch)
1803 {
1804 // Full implementation of Java ldiv and lrem. The function
1805 // returns the (pc) offset of the div instruction - may be needed
1806 // for implicit exceptions.
1807 //
1808 // constraint : ra/rb =/= scratch
1809 // normal case
1810 //
1811 // input : ra: dividend
1812 // rb: divisor
1813 //
1814 // result: either
1815 // quotient (= ra idiv rb)
1816 // remainder (= ra irem rb)
1817
1818 assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1819
1820 int idivq_offset = offset();
1821 if (! want_remainder) {
1822 sdiv(result, ra, rb);
1823 } else {
1824 sdiv(scratch, ra, rb);
1825 Assembler::msub(result, scratch, rb, ra);
1826 }
1827
1828 return idivq_offset;
1829 }
1830
1831 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1832 address prev = pc() - NativeMembar::instruction_size;
1833 address last = code()->last_insn();
1834 if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1835 NativeMembar *bar = NativeMembar_at(prev);
1836 // We are merging two memory barrier instructions. On AArch64 we
1837 // can do this simply by ORing them together.
1838 bar->set_kind(bar->get_kind() | order_constraint);
1839 BLOCK_COMMENT("merged membar");
1840 } else {
1841 code()->set_last_insn(pc());
1842 dmb(Assembler::barrier(order_constraint));
1843 }
1844 }
1845
1846 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1847 if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1848 merge_ldst(rt, adr, size_in_bytes, is_store);
1849 code()->clear_last_insn();
1850 return true;
1851 } else {
1852 assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
1853 const uint64_t mask = size_in_bytes - 1;
1854 if (adr.getMode() == Address::base_plus_offset &&
1855 (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1856 code()->set_last_insn(pc());
1857 }
1858 return false;
1859 }
1860 }
1861
1862 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1863 // We always try to merge two adjacent loads into one ldp.
1864 if (!try_merge_ldst(Rx, adr, 8, false)) {
1865 Assembler::ldr(Rx, adr);
1866 }
1867 }
1868
1869 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1870 // We always try to merge two adjacent loads into one ldp.
1871 if (!try_merge_ldst(Rw, adr, 4, false)) {
1872 Assembler::ldrw(Rw, adr);
1873 }
1874 }
1875
1876 void MacroAssembler::str(Register Rx, const Address &adr) {
1877 // We always try to merge two adjacent stores into one stp.
1878 if (!try_merge_ldst(Rx, adr, 8, true)) {
1879 Assembler::str(Rx, adr);
1880 }
1881 }
1882
1883 void MacroAssembler::strw(Register Rw, const Address &adr) {
1884 // We always try to merge two adjacent stores into one stp.
1885 if (!try_merge_ldst(Rw, adr, 4, true)) {
1886 Assembler::strw(Rw, adr);
1887 }
1888 }
1889
1890 // MacroAssembler routines found actually to be needed
1891
1892 void MacroAssembler::push(Register src)
1893 {
1894 str(src, Address(pre(esp, -1 * wordSize)));
1895 }
1896
1897 void MacroAssembler::pop(Register dst)
1898 {
1899 ldr(dst, Address(post(esp, 1 * wordSize)));
1900 }
1901
1902 // Note: load_unsigned_short used to be called load_unsigned_word.
1903 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1904 int off = offset();
1905 ldrh(dst, src);
1906 return off;
1907 }
1908
1909 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1910 int off = offset();
1911 ldrb(dst, src);
1912 return off;
1913 }
1914
1915 int MacroAssembler::load_signed_short(Register dst, Address src) {
1916 int off = offset();
1917 ldrsh(dst, src);
1918 return off;
1919 }
1920
1921 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1922 int off = offset();
1923 ldrsb(dst, src);
1924 return off;
1925 }
1926
1927 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1928 int off = offset();
1929 ldrshw(dst, src);
1930 return off;
1931 }
1932
1933 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1934 int off = offset();
1935 ldrsbw(dst, src);
1936 return off;
1937 }
1938
1939 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1940 switch (size_in_bytes) {
1941 case 8: ldr(dst, src); break;
1942 case 4: ldrw(dst, src); break;
1943 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1944 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1945 default: ShouldNotReachHere();
1946 }
1947 }
1948
1949 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1950 switch (size_in_bytes) {
1951 case 8: str(src, dst); break;
1952 case 4: strw(src, dst); break;
1953 case 2: strh(src, dst); break;
1954 case 1: strb(src, dst); break;
1955 default: ShouldNotReachHere();
1956 }
1957 }
1958
1959 void MacroAssembler::decrementw(Register reg, int value)
1960 {
1961 if (value < 0) { incrementw(reg, -value); return; }
1962 if (value == 0) { return; }
1963 if (value < (1 << 12)) { subw(reg, reg, value); return; }
1964 /* else */ {
1965 guarantee(reg != rscratch2, "invalid dst for register decrement");
1966 movw(rscratch2, (unsigned)value);
1967 subw(reg, reg, rscratch2);
1968 }
1969 }
1970
1971 void MacroAssembler::decrement(Register reg, int value)
1972 {
1973 if (value < 0) { increment(reg, -value); return; }
1974 if (value == 0) { return; }
1975 if (value < (1 << 12)) { sub(reg, reg, value); return; }
1976 /* else */ {
1977 assert(reg != rscratch2, "invalid dst for register decrement");
1978 mov(rscratch2, (uint64_t)value);
1979 sub(reg, reg, rscratch2);
1980 }
1981 }
1982
1983 void MacroAssembler::decrementw(Address dst, int value)
1984 {
1985 assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1986 if (dst.getMode() == Address::literal) {
1987 assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1988 lea(rscratch2, dst);
1989 dst = Address(rscratch2);
1990 }
1991 ldrw(rscratch1, dst);
1992 decrementw(rscratch1, value);
1993 strw(rscratch1, dst);
1994 }
1995
1996 void MacroAssembler::decrement(Address dst, int value)
1997 {
1998 assert(!dst.uses(rscratch1), "invalid address for decrement");
1999 if (dst.getMode() == Address::literal) {
2000 assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2001 lea(rscratch2, dst);
2002 dst = Address(rscratch2);
2003 }
2004 ldr(rscratch1, dst);
2005 decrement(rscratch1, value);
2006 str(rscratch1, dst);
2007 }
2008
2009 void MacroAssembler::incrementw(Register reg, int value)
2010 {
2011 if (value < 0) { decrementw(reg, -value); return; }
2012 if (value == 0) { return; }
2013 if (value < (1 << 12)) { addw(reg, reg, value); return; }
2014 /* else */ {
2015 assert(reg != rscratch2, "invalid dst for register increment");
2016 movw(rscratch2, (unsigned)value);
2017 addw(reg, reg, rscratch2);
2018 }
2019 }
2020
2021 void MacroAssembler::increment(Register reg, int value)
2022 {
2023 if (value < 0) { decrement(reg, -value); return; }
2024 if (value == 0) { return; }
2025 if (value < (1 << 12)) { add(reg, reg, value); return; }
2026 /* else */ {
2027 assert(reg != rscratch2, "invalid dst for register increment");
2028 movw(rscratch2, (unsigned)value);
2029 add(reg, reg, rscratch2);
2030 }
2031 }
2032
2033 void MacroAssembler::incrementw(Address dst, int value)
2034 {
2035 assert(!dst.uses(rscratch1), "invalid dst for address increment");
2036 if (dst.getMode() == Address::literal) {
2037 assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2038 lea(rscratch2, dst);
2039 dst = Address(rscratch2);
2040 }
2041 ldrw(rscratch1, dst);
2042 incrementw(rscratch1, value);
2043 strw(rscratch1, dst);
2044 }
2045
2046 void MacroAssembler::increment(Address dst, int value)
2047 {
2048 assert(!dst.uses(rscratch1), "invalid dst for address increment");
2049 if (dst.getMode() == Address::literal) {
2050 assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2051 lea(rscratch2, dst);
2052 dst = Address(rscratch2);
2053 }
2054 ldr(rscratch1, dst);
2055 increment(rscratch1, value);
2056 str(rscratch1, dst);
2057 }
2058
2059 // Push lots of registers in the bit set supplied. Don't push sp.
2060 // Return the number of words pushed
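// For example (illustrative), a bitset selecting r0..r3 is stored as two stp pairs
// below the current stack pointer and the function returns 4.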
2061 int MacroAssembler::push(unsigned int bitset, Register stack) {
2062 int words_pushed = 0;
2063
2064 // Scan bitset to accumulate register pairs
2065 unsigned char regs[32];
2066 int count = 0;
2067 for (int reg = 0; reg <= 30; reg++) {
2068 if (1 & bitset)
2069 regs[count++] = reg;
2070 bitset >>= 1;
2071 }
2072 regs[count++] = zr->encoding_nocheck();
2073 count &= ~1; // Only push an even number of regs
2074
2075 if (count) {
2076 stp(as_Register(regs[0]), as_Register(regs[1]),
2077 Address(pre(stack, -count * wordSize)));
2078 words_pushed += 2;
2079 }
2080 for (int i = 2; i < count; i += 2) {
2081 stp(as_Register(regs[i]), as_Register(regs[i+1]),
2082 Address(stack, i * wordSize));
2083 words_pushed += 2;
2084 }
2085
2086 assert(words_pushed == count, "oops, pushed != count");
2087
2088 return count;
2089 }
2090
2091 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2092 int words_pushed = 0;
2093
2094 // Scan bitset to accumulate register pairs
2095 unsigned char regs[32];
2096 int count = 0;
2097 for (int reg = 0; reg <= 30; reg++) {
2098 if (1 & bitset)
2099 regs[count++] = reg;
2100 bitset >>= 1;
2101 }
2102 regs[count++] = zr->encoding_nocheck();
2103 count &= ~1;
2104
2105 for (int i = 2; i < count; i += 2) {
2106 ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2107 Address(stack, i * wordSize));
2108 words_pushed += 2;
2109 }
2110 if (count) {
2111 ldp(as_Register(regs[0]), as_Register(regs[1]),
2112 Address(post(stack, count * wordSize)));
2113 words_pushed += 2;
2114 }
2115
2116 assert(words_pushed == count, "oops, pushed != count");
2117
2118 return count;
2119 }
2120
2121 // Push lots of FP/SIMD registers in the bit set supplied.
2122 // Return the number of dwords pushed
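// For example (illustrative, non-SVE case), a bitset selecting v0..v2 rounds up to
// four 128-bit slots, stores the three registers with stpq/strq, and returns 6 dwords.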
2123 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
2124 int words_pushed = 0;
2125 bool use_sve = false;
2126 int sve_vector_size_in_bytes = 0;
2127
2128 #ifdef COMPILER2
2129 use_sve = Matcher::supports_scalable_vector();
2130 sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2131 #endif
2132
2133 // Scan bitset to accumulate register pairs
2134 unsigned char regs[32];
2135 int count = 0;
2136 for (int reg = 0; reg <= 31; reg++) {
2137 if (1 & bitset)
2138 regs[count++] = reg;
2139 bitset >>= 1;
2140 }
2141
2142 if (count == 0) {
2143 return 0;
2144 }
2145
2146 // SVE
2147 if (use_sve && sve_vector_size_in_bytes > 16) {
2148 sub(stack, stack, sve_vector_size_in_bytes * count);
2149 for (int i = 0; i < count; i++) {
2150 sve_str(as_FloatRegister(regs[i]), Address(stack, i));
2151 }
2152 return count * sve_vector_size_in_bytes / 8;
2153 }
2154
2155 // NEON
2156 if (count == 1) {
2157 strq(as_FloatRegister(regs[0]), Address(pre(stack, -wordSize * 2)));
2158 return 2;
2159 }
2160
2161 bool odd = (count & 1) == 1;
2162 int push_slots = count + (odd ? 1 : 0);
2163
2164 // Always pushing full 128 bit registers.
2165 stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -push_slots * wordSize * 2)));
2166 words_pushed += 2;
2167
2168 for (int i = 2; i + 1 < count; i += 2) {
2169 stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2170 words_pushed += 2;
2171 }
2172
2173 if (odd) {
2174 strq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
2175 words_pushed++;
2176 }
2177
2178 assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2179 return count * 2;
2180 }
2181
2182 // Return the number of dwords popped
2183 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
2184 int words_pushed = 0;
2185 bool use_sve = false;
2186 int sve_vector_size_in_bytes = 0;
2187
2188 #ifdef COMPILER2
2189 use_sve = Matcher::supports_scalable_vector();
2190 sve_vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2191 #endif
2192 // Scan bitset to accumulate register pairs
2193 unsigned char regs[32];
2194 int count = 0;
2195 for (int reg = 0; reg <= 31; reg++) {
2196 if (1 & bitset)
2197 regs[count++] = reg;
2198 bitset >>= 1;
2199 }
2200
2201 if (count == 0) {
2202 return 0;
2203 }
2204
2205 // SVE
2206 if (use_sve && sve_vector_size_in_bytes > 16) {
2207 for (int i = count - 1; i >= 0; i--) {
2208 sve_ldr(as_FloatRegister(regs[i]), Address(stack, i));
2209 }
2210 add(stack, stack, sve_vector_size_in_bytes * count);
2211 return count * sve_vector_size_in_bytes / 8;
2212 }
2213
2214 // NEON
2215 if (count == 1) {
2216 ldrq(as_FloatRegister(regs[0]), Address(post(stack, wordSize * 2)));
2217 return 2;
2218 }
2219
2220 bool odd = (count & 1) == 1;
2221 int push_slots = count + (odd ? 1 : 0);
2222
2223 if (odd) {
2224 ldrq(as_FloatRegister(regs[count - 1]), Address(stack, (count - 1) * wordSize * 2));
2225 words_pushed++;
2226 }
2227
2228 for (int i = 2; i + 1 < count; i += 2) {
2229 ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2230 words_pushed += 2;
2231 }
2232
2233 ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, push_slots * wordSize * 2)));
2234 words_pushed += 2;
2235
2236 assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2237
2238 return count * 2;
2239 }
2240
2241 #ifdef ASSERT
2242 void MacroAssembler::verify_heapbase(const char* msg) {
2243 #if 0
2244 assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2245 assert (Universe::heap() != NULL, "java heap should be initialized");
2246 if (!UseCompressedOops || Universe::ptr_base() == NULL) {
2247 // rheapbase is allocated as general register
2248 return;
2249 }
2250 if (CheckCompressedOops) {
2251 Label ok;
2252 push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2253 cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2254 br(Assembler::EQ, ok);
2255 stop(msg);
2256 bind(ok);
2257 pop(1 << rscratch1->encoding(), sp);
2258 }
2259 #endif
2260 }
2261 #endif
2262
2263 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2264 Label done, not_weak;
2265 cbz(value, done); // Use NULL as-is.
2266
2267 STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
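  // Note (an observation about current callers, not enforced here): the tag test below
  // reads r0, so this code assumes the incoming jobject is passed in r0 (value == r0).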
2268 tbz(r0, 0, not_weak); // Test for jweak tag.
2269
2270 // Resolve jweak.
2271 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2272 Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2273 verify_oop(value);
2274 b(done);
2275
2276 bind(not_weak);
2277 // Resolve (untagged) jobject.
2278 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2279 verify_oop(value);
2280 bind(done);
2281 }
2282
2283 void MacroAssembler::stop(const char* msg) {
2284 BLOCK_COMMENT(msg);
2285 dcps1(0xdeae);
2286 emit_int64((uintptr_t)msg);
2287 }
2288
2289 void MacroAssembler::unimplemented(const char* what) {
2290 const char* buf = NULL;
2291 {
2292 ResourceMark rm;
2293 stringStream ss;
2294 ss.print("unimplemented: %s", what);
2295 buf = code_string(ss.as_string());
2296 }
2297 stop(buf);
2298 }
2299
2300 // If a constant does not fit in an immediate field, generate some
2301 // number of MOV instructions and then perform the operation.
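// For example (illustrative), adding the constant 0x101001 does not fit a 12-bit
// (optionally shifted) immediate but is below 1 << 24, so it is split into two
// instructions: one for 0x101000 and one for 0x001.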
2302 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, uint64_t imm,
2303 add_sub_imm_insn insn1,
2304 add_sub_reg_insn insn2,
2305 bool is32) {
2306 assert(Rd != zr, "Rd = zr and not setting flags?");
2307 bool fits = operand_valid_for_add_sub_immediate(is32 ? (int32_t)imm : imm);
2308 if (fits) {
2309 (this->*insn1)(Rd, Rn, imm);
2310 } else {
2311 if (uabs(imm) < (1 << 24)) {
2312 (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2313 (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2314 } else {
2315 assert_different_registers(Rd, Rn);
2316 mov(Rd, imm);
2317 (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2318 }
2319 }
2320 }
2321
2322 // Separate version which sets the flags. Optimisations are more restricted
2323 // because we must set the flags correctly.
2324 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, uint64_t imm,
2325 add_sub_imm_insn insn1,
2326 add_sub_reg_insn insn2,
2327 bool is32) {
2328 bool fits = operand_valid_for_add_sub_immediate(is32 ? (int32_t)imm : imm);
2329 if (fits) {
2330 (this->*insn1)(Rd, Rn, imm);
2331 } else {
2332 assert_different_registers(Rd, Rn);
2333 assert(Rd != zr, "overflow in immediate operand");
2334 mov(Rd, imm);
2335 (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2336 }
2337 }
2338
2339
2340 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2341 if (increment.is_register()) {
2342 add(Rd, Rn, increment.as_register());
2343 } else {
2344 add(Rd, Rn, increment.as_constant());
2345 }
2346 }
2347
2348 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2349 if (increment.is_register()) {
2350 addw(Rd, Rn, increment.as_register());
2351 } else {
2352 addw(Rd, Rn, increment.as_constant());
2353 }
2354 }
2355
2356 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2357 if (decrement.is_register()) {
2358 sub(Rd, Rn, decrement.as_register());
2359 } else {
2360 sub(Rd, Rn, decrement.as_constant());
2361 }
2362 }
2363
2364 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2365 if (decrement.is_register()) {
2366 subw(Rd, Rn, decrement.as_register());
2367 } else {
2368 subw(Rd, Rn, decrement.as_constant());
2369 }
2370 }
2371
2372 void MacroAssembler::reinit_heapbase()
2373 {
2374 if (UseCompressedOops) {
2375 if (Universe::is_fully_initialized()) {
2376 mov(rheapbase, CompressedOops::ptrs_base());
2377 } else {
2378 lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2379 ldr(rheapbase, Address(rheapbase));
2380 }
2381 }
2382 }
2383
2384 // this simulates the behaviour of the x86 cmpxchg instruction using a
2385 // load linked/store conditional pair. we use the acquire/release
2386 // versions of these instructions so that we flush pending writes as
2387 // per Java semantics.
2388
2389 // n.b the x86 version assumes the old value to be compared against is
2390 // in rax and updates rax with the value located in memory if the
2391 // cmpxchg fails. we supply a register for the old value explicitly
2392
2393 // the aarch64 load linked/store conditional instructions do not
2394 // accept an offset. so, unlike x86, we must provide a plain register
2395 // to identify the memory word to be compared/exchanged rather than a
2396 // register+offset Address.
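// A typical call therefore first forms a plain register address, e.g. (a sketch, not
// taken from a particular caller; obj and offset are placeholders):
//   lea(rscratch2, Address(obj, offset));
//   cmpxchgptr(expected, new_val, rscratch2, rscratch1, succeeded, &failed);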
2397
2398 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2399 Label &succeed, Label *fail) {
2400 // oldv holds comparison value
2401 // newv holds value to write in exchange
2402 // addr identifies memory word to compare against/update
2403 if (UseLSE) {
2404 mov(tmp, oldv);
2405 casal(Assembler::xword, oldv, newv, addr);
2406 cmp(tmp, oldv);
2407 br(Assembler::EQ, succeed);
2408 membar(AnyAny);
2409 } else {
2410 Label retry_load, nope;
2411 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2412 prfm(Address(addr), PSTL1STRM);
2413 bind(retry_load);
2414 // flush and load exclusive from the memory location
2415 // and fail if it is not what we expect
2416 ldaxr(tmp, addr);
2417 cmp(tmp, oldv);
2418 br(Assembler::NE, nope);
2419 // if we store+flush with no intervening write tmp will be zero
2420 stlxr(tmp, newv, addr);
2421 cbzw(tmp, succeed);
2422 // retry so we only ever return after a load fails to compare;
2423 // this ensures we don't return a stale value after a failed write.
2424 b(retry_load);
2425 // if the memory word differs we return it in oldv and signal a fail
2426 bind(nope);
2427 membar(AnyAny);
2428 mov(oldv, tmp);
2429 }
2430 if (fail)
2431 b(*fail);
2432 }
2433
2434 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2435 Label &succeed, Label *fail) {
2436 assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2437 cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2438 }
2439
2440 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2441 Label &succeed, Label *fail) {
2442 // oldv holds comparison value
2443 // newv holds value to write in exchange
2444 // addr identifies memory word to compare against/update
2445 // tmp returns 0/1 for success/failure
2446 if (UseLSE) {
2447 mov(tmp, oldv);
2448 casal(Assembler::word, oldv, newv, addr);
2449 cmp(tmp, oldv);
2450 br(Assembler::EQ, succeed);
2451 membar(AnyAny);
2452 } else {
2453 Label retry_load, nope;
2454 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2455 prfm(Address(addr), PSTL1STRM);
2456 bind(retry_load);
2457 // flush and load exclusive from the memory location
2458 // and fail if it is not what we expect
2459 ldaxrw(tmp, addr);
2460 cmp(tmp, oldv);
2461 br(Assembler::NE, nope);
2462 // if we store+flush with no intervening write tmp will be zero
2463 stlxrw(tmp, newv, addr);
2464 cbzw(tmp, succeed);
2465 // retry so we only ever return after a load fails to compare;
2466 // this ensures we don't return a stale value after a failed write.
2467 b(retry_load);
2468 // if the memory word differs we return it in oldv and signal a fail
2469 bind(nope);
2470 membar(AnyAny);
2471 mov(oldv, tmp);
2472 }
2473 if (fail)
2474 b(*fail);
2475 }
2476
2477 // A generic CAS; success or failure is in the EQ flag. A weak CAS
2478 // doesn't retry and may fail spuriously. If the oldval is wanted,
2479 // pass a register for the result; otherwise pass noreg.
2480
2481 // Clobbers rscratch1
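// Typical use (a sketch; register names and the failure label are placeholders):
//   cmpxchg(addr_reg, expected_reg, new_reg, Assembler::xword,
//           /*acquire*/ true, /*release*/ true, /*weak*/ false, noreg);
//   br(Assembler::NE, failed);   // the EQ flag holds the result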
2482 void MacroAssembler::cmpxchg(Register addr, Register expected,
2483 Register new_val,
2484 enum operand_size size,
2485 bool acquire, bool release,
2486 bool weak,
2487 Register result) {
2488 if (result == noreg) result = rscratch1;
2489 BLOCK_COMMENT("cmpxchg {");
2490 if (UseLSE) {
2491 mov(result, expected);
2492 lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2493 compare_eq(result, expected, size);
2494 #ifdef ASSERT
2495 // Poison rscratch1 which is written on !UseLSE branch
2496 mov(rscratch1, 0x1f1f1f1f1f1f1f1f);
2497 #endif
2498 } else {
2499 Label retry_load, done;
2500 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2501 prfm(Address(addr), PSTL1STRM);
2502 bind(retry_load);
2503 load_exclusive(result, addr, size, acquire);
2504 compare_eq(result, expected, size);
2505 br(Assembler::NE, done);
2506 store_exclusive(rscratch1, new_val, addr, size, release);
2507 if (weak) {
2508 cmpw(rscratch1, 0u); // If the store fails, return NE to our caller.
2509 } else {
2510 cbnzw(rscratch1, retry_load);
2511 }
2512 bind(done);
2513 }
2514 BLOCK_COMMENT("} cmpxchg");
2515 }
2516
2517 // A generic comparison. Only compares for equality, clobbers rscratch1.
2518 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2519 if (size == xword) {
2520 cmp(rm, rn);
2521 } else if (size == word) {
2522 cmpw(rm, rn);
2523 } else if (size == halfword) {
2524 eorw(rscratch1, rm, rn);
2525 ands(zr, rscratch1, 0xffff);
2526 } else if (size == byte) {
2527 eorw(rscratch1, rm, rn);
2528 ands(zr, rscratch1, 0xff);
2529 } else {
2530 ShouldNotReachHere();
2531 }
2532 }
2533
2534
2535 static bool different(Register a, RegisterOrConstant b, Register c) {
2536 if (b.is_constant())
2537 return a != c;
2538 else
2539 return a != b.as_register() && a != c && b.as_register() != c;
2540 }
2541
2542 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz) \
2543 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2544 if (UseLSE) { \
2545 prev = prev->is_valid() ? prev : zr; \
2546 if (incr.is_register()) { \
2547 AOP(sz, incr.as_register(), prev, addr); \
2548 } else { \
2549 mov(rscratch2, incr.as_constant()); \
2550 AOP(sz, rscratch2, prev, addr); \
2551 } \
2552 return; \
2553 } \
2554 Register result = rscratch2; \
2555 if (prev->is_valid()) \
2556 result = different(prev, incr, addr) ? prev : rscratch2; \
2557 \
2558 Label retry_load; \
2559 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \
2560 prfm(Address(addr), PSTL1STRM); \
2561 bind(retry_load); \
2562 LDXR(result, addr); \
2563 OP(rscratch1, result, incr); \
2564 STXR(rscratch2, rscratch1, addr); \
2565 cbnzw(rscratch2, retry_load); \
2566 if (prev->is_valid() && prev != result) { \
2567 IOP(prev, rscratch1, incr); \
2568 } \
2569 }
2570
2571 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2572 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2573 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2574 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2575
2576 #undef ATOMIC_OP
2577
2578 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz) \
2579 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2580 if (UseLSE) { \
2581 prev = prev->is_valid() ? prev : zr; \
2582 AOP(sz, newv, prev, addr); \
2583 return; \
2584 } \
2585 Register result = rscratch2; \
2586 if (prev->is_valid()) \
2587 result = different(prev, newv, addr) ? prev : rscratch2; \
2588 \
2589 Label retry_load; \
2590 if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH)) \
2591 prfm(Address(addr), PSTL1STRM); \
2592 bind(retry_load); \
2593 LDXR(result, addr); \
2594 STXR(rscratch1, newv, addr); \
2595 cbnzw(rscratch1, retry_load); \
2596 if (prev->is_valid() && prev != result) \
2597 mov(prev, result); \
2598 }
2599
2600 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2601 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2602 ATOMIC_XCHG(xchgl, swpl, ldxr, stlxr, Assembler::xword)
2603 ATOMIC_XCHG(xchglw, swpl, ldxrw, stlxrw, Assembler::word)
2604 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2605 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2606
2607 #undef ATOMIC_XCHG
2608
2609 #ifndef PRODUCT
2610 extern "C" void findpc(intptr_t x);
2611 #endif
2612
2613 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2614 {
2615 // In order to get locks to work, we need to fake an in_VM state
2616 if (ShowMessageBoxOnError ) {
2617 JavaThread* thread = JavaThread::current();
2618 JavaThreadState saved_state = thread->thread_state();
2619 thread->set_thread_state(_thread_in_vm);
2620 #ifndef PRODUCT
2621 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2622 ttyLocker ttyl;
2623 BytecodeCounter::print();
2624 }
2625 #endif
2626 if (os::message_box(msg, "Execution stopped, print registers?")) {
2627 ttyLocker ttyl;
2628 tty->print_cr(" pc = 0x%016" PRIx64, pc);
2629 #ifndef PRODUCT
2630 tty->cr();
2631 findpc(pc);
2632 tty->cr();
2633 #endif
2634 tty->print_cr(" r0 = 0x%016" PRIx64, regs[0]);
2635 tty->print_cr(" r1 = 0x%016" PRIx64, regs[1]);
2636 tty->print_cr(" r2 = 0x%016" PRIx64, regs[2]);
2637 tty->print_cr(" r3 = 0x%016" PRIx64, regs[3]);
2638 tty->print_cr(" r4 = 0x%016" PRIx64, regs[4]);
2639 tty->print_cr(" r5 = 0x%016" PRIx64, regs[5]);
2640 tty->print_cr(" r6 = 0x%016" PRIx64, regs[6]);
2641 tty->print_cr(" r7 = 0x%016" PRIx64, regs[7]);
2642 tty->print_cr(" r8 = 0x%016" PRIx64, regs[8]);
2643 tty->print_cr(" r9 = 0x%016" PRIx64, regs[9]);
2644 tty->print_cr("r10 = 0x%016" PRIx64, regs[10]);
2645 tty->print_cr("r11 = 0x%016" PRIx64, regs[11]);
2646 tty->print_cr("r12 = 0x%016" PRIx64, regs[12]);
2647 tty->print_cr("r13 = 0x%016" PRIx64, regs[13]);
2648 tty->print_cr("r14 = 0x%016" PRIx64, regs[14]);
2649 tty->print_cr("r15 = 0x%016" PRIx64, regs[15]);
2650 tty->print_cr("r16 = 0x%016" PRIx64, regs[16]);
2651 tty->print_cr("r17 = 0x%016" PRIx64, regs[17]);
2652 tty->print_cr("r18 = 0x%016" PRIx64, regs[18]);
2653 tty->print_cr("r19 = 0x%016" PRIx64, regs[19]);
2654 tty->print_cr("r20 = 0x%016" PRIx64, regs[20]);
2655 tty->print_cr("r21 = 0x%016" PRIx64, regs[21]);
2656 tty->print_cr("r22 = 0x%016" PRIx64, regs[22]);
2657 tty->print_cr("r23 = 0x%016" PRIx64, regs[23]);
2658 tty->print_cr("r24 = 0x%016" PRIx64, regs[24]);
2659 tty->print_cr("r25 = 0x%016" PRIx64, regs[25]);
2660 tty->print_cr("r26 = 0x%016" PRIx64, regs[26]);
2661 tty->print_cr("r27 = 0x%016" PRIx64, regs[27]);
2662 tty->print_cr("r28 = 0x%016" PRIx64, regs[28]);
2663 tty->print_cr("r30 = 0x%016" PRIx64, regs[30]);
2664 tty->print_cr("r31 = 0x%016" PRIx64, regs[31]);
2665 BREAKPOINT;
2666 }
2667 }
2668 fatal("DEBUG MESSAGE: %s", msg);
2669 }
2670
2671 RegSet MacroAssembler::call_clobbered_registers() {
2672 RegSet regs = RegSet::range(r0, r17) - RegSet::of(rscratch1, rscratch2);
2673 #ifndef R18_RESERVED
2674 regs += r18_tls;
2675 #endif
2676 return regs;
2677 }
2678
2679 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
2680 int step = 4 * wordSize;
2681 push(call_clobbered_registers() - exclude, sp);
2682 sub(sp, sp, step);
2683 mov(rscratch1, -step);
2684 // Push v0-v7, v16-v31.
2685 for (int i = 31; i>= 4; i -= 4) {
2686 if (i <= v7->encoding() || i >= v16->encoding())
2687 st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2688 as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2689 }
2690 st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2691 as_FloatRegister(3), T1D, Address(sp));
2692 }
2693
2694 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
2695 for (int i = 0; i < 32; i += 4) {
2696 if (i <= v7->encoding() || i >= v16->encoding())
2697 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2698 as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2699 }
2700
2701 reinitialize_ptrue();
2702
2703 pop(call_clobbered_registers() - exclude, sp);
2704 }
2705
2706 void MacroAssembler::push_CPU_state(bool save_vectors, bool use_sve,
2707 int sve_vector_size_in_bytes) {
2708 push(RegSet::range(r0, r29), sp); // integer registers except lr & sp
2709 if (save_vectors && use_sve && sve_vector_size_in_bytes > 16) {
2710 sub(sp, sp, sve_vector_size_in_bytes * FloatRegisterImpl::number_of_registers);
2711 for (int i = 0; i < FloatRegisterImpl::number_of_registers; i++) {
2712 sve_str(as_FloatRegister(i), Address(sp, i));
2713 }
2714 } else {
2715 int step = (save_vectors ? 8 : 4) * wordSize;
2716 mov(rscratch1, -step);
2717 sub(sp, sp, step);
2718 for (int i = 28; i >= 4; i -= 4) {
2719 st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2720 as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2721 }
2722 st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2723 }
2724 }
2725
2726 void MacroAssembler::pop_CPU_state(bool restore_vectors, bool use_sve,
2727 int sve_vector_size_in_bytes) {
2728 if (restore_vectors && use_sve && sve_vector_size_in_bytes > 16) {
2729 for (int i = FloatRegisterImpl::number_of_registers - 1; i >= 0; i--) {
2730 sve_ldr(as_FloatRegister(i), Address(sp, i));
2731 }
2732 add(sp, sp, sve_vector_size_in_bytes * FloatRegisterImpl::number_of_registers);
2733 } else {
2734 int step = (restore_vectors ? 8 : 4) * wordSize;
2735 for (int i = 0; i <= 28; i += 4)
2736 ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2737 as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2738 }
2739
2740 if (restore_vectors) {
2741 reinitialize_ptrue();
2742 }
2743
2744 // integer registers except lr & sp
2745 pop(RegSet::range(r0, r17), sp);
2746 #ifdef R18_RESERVED
2747 ldp(zr, r19, Address(post(sp, 2 * wordSize)));
2748 pop(RegSet::range(r20, r29), sp);
2749 #else
2750 pop(RegSet::range(r18_tls, r29), sp);
2751 #endif
2752 }
2753
2754 /**
2755 * Helpers for multiply_to_len().
2756 */
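// add2_with_carry computes, in effect, the 128-bit sum
//   final_dest_hi:dest_lo = dest_hi:dest_lo + src1 + src2,
// propagating the carries out of the low word into the high word.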
2757 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2758 Register src1, Register src2) {
2759 adds(dest_lo, dest_lo, src1);
2760 adc(dest_hi, dest_hi, zr);
2761 adds(dest_lo, dest_lo, src2);
2762 adc(final_dest_hi, dest_hi, zr);
2763 }
2764
2765 // Generate an address from (r + r1 extend offset). "size" is the
2766 // size of the operand. The result may be in rscratch2.
2767 Address MacroAssembler::offsetted_address(Register r, Register r1,
2768 Address::extend ext, int offset, int size) {
2769 if (offset || (ext.shift() % size != 0)) {
2770 lea(rscratch2, Address(r, r1, ext));
2771 return Address(rscratch2, offset);
2772 } else {
2773 return Address(r, r1, ext);
2774 }
2775 }
2776
2777 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2778 {
2779 assert(offset >= 0, "spill to negative address?");
2780 // Offset reachable ?
2781 // Not aligned - 9 bits signed offset
2782 // Aligned - 12 bits unsigned offset shifted
2783 Register base = sp;
2784 if ((offset & (size-1)) && offset >= (1<<8)) {
2785 add(tmp, base, offset & ((1<<12)-1));
2786 base = tmp;
2787 offset &= -1u<<12;
2788 }
2789
2790 if (offset >= (1<<12) * size) {
2791 add(tmp, base, offset & (((1<<12)-1)<<12));
2792 base = tmp;
2793 offset &= ~(((1<<12)-1)<<12);
2794 }
2795
2796 return Address(base, offset);
2797 }
2798
2799 Address MacroAssembler::sve_spill_address(int sve_reg_size_in_bytes, int offset, Register tmp) {
2800 assert(offset >= 0, "spill to negative address?");
2801
2802 Register base = sp;
2803
2804 // An immediate offset in the range 0 to 255 which is multiplied
2805 // by the current vector or predicate register size in bytes.
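  // For example (illustrative), with a 32-byte SVE vector a byte offset of 64 is
  // encoded as Address(sp, 2), i.e. two vector-register-sized units.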
2806 if (offset % sve_reg_size_in_bytes == 0 && offset < ((1<<8)*sve_reg_size_in_bytes)) {
2807 return Address(base, offset / sve_reg_size_in_bytes);
2808 }
2809
2810 add(tmp, base, offset);
2811 return Address(tmp);
2812 }
2813
2814 // Checks whether offset is aligned.
2815 // Returns true if it is, else false.
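// For example (illustrative), with size == 8, base == sp and AvoidUnalignedAccesses
// set, offsets 16 and 24 pass the check (16 is 16-byte aligned) while 8 and 16 do not.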
2816 bool MacroAssembler::merge_alignment_check(Register base,
2817 size_t size,
2818 int64_t cur_offset,
2819 int64_t prev_offset) const {
2820 if (AvoidUnalignedAccesses) {
2821 if (base == sp) {
2822 // Checks whether the low offset is aligned to a pair of registers.
2823 int64_t pair_mask = size * 2 - 1;
2824 int64_t offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2825 return (offset & pair_mask) == 0;
2826 } else { // If base is not sp, we can't guarantee the access is aligned.
2827 return false;
2828 }
2829 } else {
2830 int64_t mask = size - 1;
2831 // Load/store pair instruction only supports element size aligned offset.
2832 return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2833 }
2834 }
2835
2836 // Checks whether current and previous loads/stores can be merged.
2837 // Returns true if it can be merged, else false.
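// For example (illustrative), "ldr x2, [sp, #16]" immediately followed by
// "ldr x3, [sp, #24]" can be merged into "ldp x2, x3, [sp, #16]".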
2838 bool MacroAssembler::ldst_can_merge(Register rt,
2839 const Address &adr,
2840 size_t cur_size_in_bytes,
2841 bool is_store) const {
2842 address prev = pc() - NativeInstruction::instruction_size;
2843 address last = code()->last_insn();
2844
2845 if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2846 return false;
2847 }
2848
2849 if (adr.getMode() != Address::base_plus_offset || prev != last) {
2850 return false;
2851 }
2852
2853 NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2854 size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2855
2856 assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2857 assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2858
2859 if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2860 return false;
2861 }
2862
2863 int64_t max_offset = 63 * prev_size_in_bytes;
2864 int64_t min_offset = -64 * prev_size_in_bytes;
2865
2866 assert(prev_ldst->is_not_pre_post_index(), "merging pre-index or post-index accesses is not supported.");
2867
2868 // Only same base can be merged.
2869 if (adr.base() != prev_ldst->base()) {
2870 return false;
2871 }
2872
2873 int64_t cur_offset = adr.offset();
2874 int64_t prev_offset = prev_ldst->offset();
2875 size_t diff = abs(cur_offset - prev_offset);
2876 if (diff != prev_size_in_bytes) {
2877 return false;
2878 }
2879
2880 // The following cases cannot be merged:
2881 // ldr x2, [x2, #8]
2882 // ldr x3, [x2, #16]
2883 // or:
2884 // ldr x2, [x3, #8]
2885 // ldr x2, [x3, #16]
2886 // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2887 if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2888 return false;
2889 }
2890
2891 int64_t low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2892 // Offset range must be in ldp/stp instruction's range.
2893 if (low_offset > max_offset || low_offset < min_offset) {
2894 return false;
2895 }
2896
2897 if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2898 return true;
2899 }
2900
2901 return false;
2902 }
2903
2904 // Merge current load/store with previous load/store into ldp/stp.
2905 void MacroAssembler::merge_ldst(Register rt,
2906 const Address &adr,
2907 size_t cur_size_in_bytes,
2908 bool is_store) {
2909
2910 assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");
2911
2912 Register rt_low, rt_high;
2913 address prev = pc() - NativeInstruction::instruction_size;
2914 NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2915
2916 int64_t offset;
2917
2918 if (adr.offset() < prev_ldst->offset()) {
2919 offset = adr.offset();
2920 rt_low = rt;
2921 rt_high = prev_ldst->target();
2922 } else {
2923 offset = prev_ldst->offset();
2924 rt_low = prev_ldst->target();
2925 rt_high = rt;
2926 }
2927
2928 Address adr_p = Address(prev_ldst->base(), offset);
2929 // Overwrite previous generated binary.
2930 code_section()->set_end(prev);
2931
2932 const size_t sz = prev_ldst->size_in_bytes();
2933 assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2934 if (!is_store) {
2935 BLOCK_COMMENT("merged ldr pair");
2936 if (sz == 8) {
2937 ldp(rt_low, rt_high, adr_p);
2938 } else {
2939 ldpw(rt_low, rt_high, adr_p);
2940 }
2941 } else {
2942 BLOCK_COMMENT("merged str pair");
2943 if (sz == 8) {
2944 stp(rt_low, rt_high, adr_p);
2945 } else {
2946 stpw(rt_low, rt_high, adr_p);
2947 }
2948 }
2949 }
2950
2951 /**
2952 * Multiply 64 bit by 64 bit first loop.
2953 */
2954 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2955 Register y, Register y_idx, Register z,
2956 Register carry, Register product,
2957 Register idx, Register kdx) {
2958 //
2959 // jlong carry, x[], y[], z[];
2960 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2961 // huge_128 product = y[idx] * x[xstart] + carry;
2962 // z[kdx] = (jlong)product;
2963 // carry = (jlong)(product >>> 64);
2964 // }
2965 // z[xstart] = carry;
2966 //
2967
2968 Label L_first_loop, L_first_loop_exit;
2969 Label L_one_x, L_one_y, L_multiply;
2970
2971 subsw(xstart, xstart, 1);
2972 br(Assembler::MI, L_one_x);
2973
2974 lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2975 ldr(x_xstart, Address(rscratch1));
2976 ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2977
2978 bind(L_first_loop);
2979 subsw(idx, idx, 1);
2980 br(Assembler::MI, L_first_loop_exit);
2981 subsw(idx, idx, 1);
2982 br(Assembler::MI, L_one_y);
2983 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2984 ldr(y_idx, Address(rscratch1));
2985 ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2986 bind(L_multiply);
2987
2988 // AArch64 has a multiply-accumulate instruction that we can't use
2989 // here because it has no way to process carries, so we have to use
2990 // separate add and adc instructions. Bah.
2991 umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2992 mul(product, x_xstart, y_idx);
2993 adds(product, product, carry);
2994 adc(carry, rscratch1, zr); // x_xstart * y_idx + carry -> carry:product
2995
2996 subw(kdx, kdx, 2);
2997 ror(product, product, 32); // back to big-endian
2998 str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2999
3000 b(L_first_loop);
3001
3002 bind(L_one_y);
3003 ldrw(y_idx, Address(y, 0));
3004 b(L_multiply);
3005
3006 bind(L_one_x);
3007 ldrw(x_xstart, Address(x, 0));
3008 b(L_first_loop);
3009
3010 bind(L_first_loop_exit);
3011 }
3012
3013 /**
3014 * Multiply 128 bit by 128 bit. Unrolled inner loop.
3015 *
3016 */
3017 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
3018 Register carry, Register carry2,
3019 Register idx, Register jdx,
3020 Register yz_idx1, Register yz_idx2,
3021 Register tmp, Register tmp3, Register tmp4,
3022 Register tmp6, Register product_hi) {
3023
3024 // jlong carry, x[], y[], z[];
3025 // int kdx = ystart+1;
3026 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
3027 // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
3028 // jlong carry2 = (jlong)(tmp3 >>> 64);
3029 // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2;
3030 // carry = (jlong)(tmp4 >>> 64);
3031 // z[kdx+idx+1] = (jlong)tmp3;
3032 // z[kdx+idx] = (jlong)tmp4;
3033 // }
3034 // idx += 2;
3035 // if (idx > 0) {
3036 // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
3037 // z[kdx+idx] = (jlong)yz_idx1;
3038 // carry = (jlong)(yz_idx1 >>> 64);
3039 // }
3040 //
3041
3042 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
3043
3044 lsrw(jdx, idx, 2);
3045
3046 bind(L_third_loop);
3047
3048 subsw(jdx, jdx, 1);
3049 br(Assembler::MI, L_third_loop_exit);
3050 subw(idx, idx, 4);
3051
3052 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3053
3054 ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
3055
3056 lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3057
3058 ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
3059 ror(yz_idx2, yz_idx2, 32);
3060
3061 ldp(rscratch2, rscratch1, Address(tmp6, 0));
3062
3063 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
3064 umulh(tmp4, product_hi, yz_idx1);
3065
3066 ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
3067 ror(rscratch2, rscratch2, 32);
3068
3069 mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp
3070 umulh(carry2, product_hi, yz_idx2);
3071
3072 // propagate sum of both multiplications into carry:tmp4:tmp3
3073 adds(tmp3, tmp3, carry);
3074 adc(tmp4, tmp4, zr);
3075 adds(tmp3, tmp3, rscratch1);
3076 adcs(tmp4, tmp4, tmp);
3077 adc(carry, carry2, zr);
3078 adds(tmp4, tmp4, rscratch2);
3079 adc(carry, carry, zr);
3080
3081 ror(tmp3, tmp3, 32); // convert little-endian to big-endian
3082 ror(tmp4, tmp4, 32);
3083 stp(tmp4, tmp3, Address(tmp6, 0));
3084
3085 b(L_third_loop);
3086 bind (L_third_loop_exit);
3087
3088 andw (idx, idx, 0x3);
3089 cbz(idx, L_post_third_loop_done);
3090
3091 Label L_check_1;
3092 subsw(idx, idx, 2);
3093 br(Assembler::MI, L_check_1);
3094
3095 lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3096 ldr(yz_idx1, Address(rscratch1, 0));
3097 ror(yz_idx1, yz_idx1, 32);
3098 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
3099 umulh(tmp4, product_hi, yz_idx1);
3100 lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3101 ldr(yz_idx2, Address(rscratch1, 0));
3102 ror(yz_idx2, yz_idx2, 32);
3103
3104 add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
3105
3106 ror(tmp3, tmp3, 32);
3107 str(tmp3, Address(rscratch1, 0));
3108
3109 bind (L_check_1);
3110
3111 andw (idx, idx, 0x1);
3112 subsw(idx, idx, 1);
3113 br(Assembler::MI, L_post_third_loop_done);
3114 ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3115 mul(tmp3, tmp4, product_hi); // tmp4 * product_hi -> carry2:tmp3
3116 umulh(carry2, tmp4, product_hi);
3117 ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3118
3119 add2_with_carry(carry2, tmp3, tmp4, carry);
3120
3121 strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3122 extr(carry, carry2, tmp3, 32);
3123
3124 bind(L_post_third_loop_done);
3125 }
3126
3127 /**
3128 * Code for BigInteger::multiplyToLen() intrinsic.
3129 *
3130 * r0: x
3131 * r1: xlen
3132 * r2: y
3133 * r3: ylen
3134 * r4: z
3135 * r5: zlen
3136 * r10: tmp1
3137 * r11: tmp2
3138 * r12: tmp3
3139 * r13: tmp4
3140 * r14: tmp5
3141 * r15: tmp6
3142 * r16: tmp7
3143 *
3144 */
3145 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3146 Register z, Register zlen,
3147 Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3148 Register tmp5, Register tmp6, Register product_hi) {
3149
3150 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3151
3152 const Register idx = tmp1;
3153 const Register kdx = tmp2;
3154 const Register xstart = tmp3;
3155
3156 const Register y_idx = tmp4;
3157 const Register carry = tmp5;
3158 const Register product = xlen;
3159 const Register x_xstart = zlen; // reuse register
3160
3161 // First Loop.
3162 //
3163 // final static long LONG_MASK = 0xffffffffL;
3164 // int xstart = xlen - 1;
3165 // int ystart = ylen - 1;
3166 // long carry = 0;
3167 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3168 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3169 // z[kdx] = (int)product;
3170 // carry = product >>> 32;
3171 // }
3172 // z[xstart] = (int)carry;
3173 //
3174
3175 movw(idx, ylen); // idx = ylen;
3176 movw(kdx, zlen); // kdx = xlen+ylen;
3177 mov(carry, zr); // carry = 0;
3178
3179 Label L_done;
3180
3181 movw(xstart, xlen);
3182 subsw(xstart, xstart, 1);
3183 br(Assembler::MI, L_done);
3184
3185 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3186
3187 Label L_second_loop;
3188 cbzw(kdx, L_second_loop);
3189
3190 Label L_carry;
3191 subw(kdx, kdx, 1);
3192 cbzw(kdx, L_carry);
3193
3194 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3195 lsr(carry, carry, 32);
3196 subw(kdx, kdx, 1);
3197
3198 bind(L_carry);
3199 strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3200
3201 // Second and third (nested) loops.
3202 //
3203 // for (int i = xstart-1; i >= 0; i--) { // Second loop
3204 // carry = 0;
3205 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3206 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3207 // (z[k] & LONG_MASK) + carry;
3208 // z[k] = (int)product;
3209 // carry = product >>> 32;
3210 // }
3211 // z[i] = (int)carry;
3212 // }
3213 //
3214 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3215
3216 const Register jdx = tmp1;
3217
3218 bind(L_second_loop);
3219 mov(carry, zr); // carry = 0;
3220 movw(jdx, ylen); // j = ystart+1
3221
3222 subsw(xstart, xstart, 1); // i = xstart-1;
3223 br(Assembler::MI, L_done);
3224
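  // Spill z, ylen, x and xstart to the stack across the inner loop below;
  // four words are reserved here and the values are reloaded afterwards,
  // with the old xstart flowing back in as xlen.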
3225 str(z, Address(pre(sp, -4 * wordSize)));
3226
3227 Label L_last_x;
3228 lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3229 subsw(xstart, xstart, 1); // i = xstart-1;
3230 br(Assembler::MI, L_last_x);
3231
3232 lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3233 ldr(product_hi, Address(rscratch1));
3234 ror(product_hi, product_hi, 32); // convert big-endian to little-endian
3235
3236 Label L_third_loop_prologue;
3237 bind(L_third_loop_prologue);
3238
3239 str(ylen, Address(sp, wordSize));
3240 stp(x, xstart, Address(sp, 2 * wordSize));
3241 multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3242 tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3243 ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3244 ldp(x, xlen, Address(post(sp, 2 * wordSize))); // copy old xstart -> xlen
3245
3246 addw(tmp3, xlen, 1);
3247 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3248 subsw(tmp3, tmp3, 1);
3249 br(Assembler::MI, L_done);
3250
3251 lsr(carry, carry, 32);
3252 strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3253 b(L_second_loop);
3254
3255 // Next infrequent code is moved outside loops.
3256 bind(L_last_x);
3257 ldrw(product_hi, Address(x, 0));
3258 b(L_third_loop_prologue);
3259
3260 bind(L_done);
3261 }
3262
3263 // Code for BigInteger::mulAdd intrinsic
3264 // out = r0
3265 // in = r1
3266 // offset = r2 (already out.length-offset)
3267 // len = r3
3268 // k = r4
3269 //
3270 // pseudo code from java implementation:
3271 // carry = 0;
3272 // offset = out.length-offset - 1;
3273 // for (int j=len-1; j >= 0; j--) {
3274 // product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3275 // out[offset--] = (int)product;
3276 // carry = product >>> 32;
3277 // }
3278 // return (int)carry;
3279 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3280 Register len, Register k) {
3281 Label LOOP, END;
3282 // pre-loop
3283   cmp(len, zr); // cmp, not cbz/cbnz: the condition is used twice below => fewer branches
3284 csel(out, zr, out, Assembler::EQ);
3285 br(Assembler::EQ, END);
3286 add(in, in, len, LSL, 2); // in[j+1] address
3287 add(offset, out, offset, LSL, 2); // out[offset + 1] address
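  // in and offset now point one element past the last element to process, so
  // the loop below can walk backwards using pre-decrement addressing.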
3288 mov(out, zr); // used to keep carry now
3289 BIND(LOOP);
3290 ldrw(rscratch1, Address(pre(in, -4)));
3291 madd(rscratch1, rscratch1, k, out);
3292 ldrw(rscratch2, Address(pre(offset, -4)));
3293 add(rscratch1, rscratch1, rscratch2);
3294 strw(rscratch1, Address(offset));
3295 lsr(out, rscratch1, 32);
3296 subs(len, len, 1);
3297 br(Assembler::NE, LOOP);
3298 BIND(END);
3299 }
3300
3301 /**
3302 * Emits code to update CRC-32 with a byte value according to constants in table
3303 *
3304 * @param [in,out]crc Register containing the crc.
3305 * @param [in]val Register containing the byte to fold into the CRC.
3306 * @param [in]table Register containing the table of crc constants.
3307 *
3308 * uint32_t crc;
3309 * val = crc_table[(val ^ crc) & 0xFF];
3310 * crc = val ^ (crc >> 8);
3311 *
3312 */
3313 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3314 eor(val, val, crc);
3315 andr(val, val, 0xff);
3316 ldrw(val, Address(table, val, Address::lsl(2)));
3317 eor(crc, val, crc, Assembler::LSR, 8);
3318 }
3319
3320 /**
3321 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3322 *
3323 * @param [in,out]crc Register containing the crc.
3324  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3325 * @param [in]table0 Register containing table 0 of crc constants.
3326 * @param [in]table1 Register containing table 1 of crc constants.
3327 * @param [in]table2 Register containing table 2 of crc constants.
3328 * @param [in]table3 Register containing table 3 of crc constants.
3329 *
3330 * uint32_t crc;
3331 * v = crc ^ v
3332 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3333 *
3334 */
3335 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3336 Register table0, Register table1, Register table2, Register table3,
3337 bool upper) {
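  // When upper is true, fold in the high 32 bits of the 64-bit value in v;
  // otherwise fold in the low 32 bits. The shifted eor both mixes in crc and
  // moves the selected word into the low half of v.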
3338 eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3339 uxtb(tmp, v);
3340 ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3341 ubfx(tmp, v, 8, 8);
3342 ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3343 eor(crc, crc, tmp);
3344 ubfx(tmp, v, 16, 8);
3345 ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3346 eor(crc, crc, tmp);
3347 ubfx(tmp, v, 24, 8);
3348 ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3349 eor(crc, crc, tmp);
3350 }
3351
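// Computes a CRC-32 over [buf, buf + len) using the ARMv8 CRC32 instructions.
// The bulk of the data is consumed 64 bytes per iteration, with loads
// interleaved between the crc32x instructions, followed by 32-byte, 4-byte
// and 1-byte tail loops. crc is bit-inverted on entry and again on exit, as
// in the usual zlib-style CRC-32 convention.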
3352 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3353 Register len, Register tmp0, Register tmp1, Register tmp2,
3354 Register tmp3) {
3355 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3356 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3357
3358 mvnw(crc, crc);
3359
3360 subs(len, len, 128);
3361 br(Assembler::GE, CRC_by64_pre);
3362 BIND(CRC_less64);
3363 adds(len, len, 128-32);
3364 br(Assembler::GE, CRC_by32_loop);
3365 BIND(CRC_less32);
3366 adds(len, len, 32-4);
3367 br(Assembler::GE, CRC_by4_loop);
3368 adds(len, len, 4);
3369 br(Assembler::GT, CRC_by1_loop);
3370 b(L_exit);
3371
3372 BIND(CRC_by32_loop);
3373 ldp(tmp0, tmp1, Address(post(buf, 16)));
3374 subs(len, len, 32);
3375 crc32x(crc, crc, tmp0);
3376 ldr(tmp2, Address(post(buf, 8)));
3377 crc32x(crc, crc, tmp1);
3378 ldr(tmp3, Address(post(buf, 8)));
3379 crc32x(crc, crc, tmp2);
3380 crc32x(crc, crc, tmp3);
3381 br(Assembler::GE, CRC_by32_loop);
3382 cmn(len, (u1)32);
3383 br(Assembler::NE, CRC_less32);
3384 b(L_exit);
3385
3386 BIND(CRC_by4_loop);
3387 ldrw(tmp0, Address(post(buf, 4)));
3388 subs(len, len, 4);
3389 crc32w(crc, crc, tmp0);
3390 br(Assembler::GE, CRC_by4_loop);
3391 adds(len, len, 4);
3392 br(Assembler::LE, L_exit);
3393 BIND(CRC_by1_loop);
3394 ldrb(tmp0, Address(post(buf, 1)));
3395 subs(len, len, 1);
3396 crc32b(crc, crc, tmp0);
3397 br(Assembler::GT, CRC_by1_loop);
3398 b(L_exit);
3399
3400 BIND(CRC_by64_pre);
3401 sub(buf, buf, 8);
3402 ldp(tmp0, tmp1, Address(buf, 8));
3403 crc32x(crc, crc, tmp0);
3404 ldr(tmp2, Address(buf, 24));
3405 crc32x(crc, crc, tmp1);
3406 ldr(tmp3, Address(buf, 32));
3407 crc32x(crc, crc, tmp2);
3408 ldr(tmp0, Address(buf, 40));
3409 crc32x(crc, crc, tmp3);
3410 ldr(tmp1, Address(buf, 48));
3411 crc32x(crc, crc, tmp0);
3412 ldr(tmp2, Address(buf, 56));
3413 crc32x(crc, crc, tmp1);
3414 ldr(tmp3, Address(pre(buf, 64)));
3415
3416 b(CRC_by64_loop);
3417
3418 align(CodeEntryAlignment);
3419 BIND(CRC_by64_loop);
3420 subs(len, len, 64);
3421 crc32x(crc, crc, tmp2);
3422 ldr(tmp0, Address(buf, 8));
3423 crc32x(crc, crc, tmp3);
3424 ldr(tmp1, Address(buf, 16));
3425 crc32x(crc, crc, tmp0);
3426 ldr(tmp2, Address(buf, 24));
3427 crc32x(crc, crc, tmp1);
3428 ldr(tmp3, Address(buf, 32));
3429 crc32x(crc, crc, tmp2);
3430 ldr(tmp0, Address(buf, 40));
3431 crc32x(crc, crc, tmp3);
3432 ldr(tmp1, Address(buf, 48));
3433 crc32x(crc, crc, tmp0);
3434 ldr(tmp2, Address(buf, 56));
3435 crc32x(crc, crc, tmp1);
3436 ldr(tmp3, Address(pre(buf, 64)));
3437 br(Assembler::GE, CRC_by64_loop);
3438
3439 // post-loop
3440 crc32x(crc, crc, tmp2);
3441 crc32x(crc, crc, tmp3);
3442
3443 sub(len, len, 64);
3444 add(buf, buf, 8);
3445 cmn(len, (u1)128);
3446 br(Assembler::NE, CRC_less64);
3447 BIND(L_exit);
3448 mvnw(crc, crc);
3449 }
3450
3451 /**
3452 * @param crc register containing existing CRC (32-bit)
3453 * @param buf register pointing to input byte buffer (byte*)
3454 * @param len register containing number of bytes
3455  * @param table0..table3 registers that will contain the addresses of the CRC tables
3456  * @param tmp, tmp2, tmp3 scratch registers
3457 */
3458 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3459 Register table0, Register table1, Register table2, Register table3,
3460 Register tmp, Register tmp2, Register tmp3) {
3461 Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3462 uint64_t offset;
3463
3464 if (UseCRC32) {
3465 kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3466 return;
3467 }
3468
3469 mvnw(crc, crc);
3470
3471 adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3472 if (offset) add(table0, table0, offset);
3473 add(table1, table0, 1*256*sizeof(juint));
3474 add(table2, table0, 2*256*sizeof(juint));
3475 add(table3, table0, 3*256*sizeof(juint));
3476
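  // When Neon is available and at least 64 bytes remain, fold 32 bytes per
  // iteration into the two 128-bit accumulators v0/v1 using carry-less
  // multiplies (pmull/pmull2) against constants kept after the four CRC
  // tables, then reduce the folded accumulators with the table-driven word
  // updates. Any remaining tail falls through to the scalar loops below.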
3477 if (UseNeon) {
3478 cmp(len, (u1)64);
3479 br(Assembler::LT, L_by16);
3480 eor(v16, T16B, v16, v16);
3481
3482 Label L_fold;
3483
3484 add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3485
3486 ld1(v0, v1, T2D, post(buf, 32));
3487 ld1r(v4, T2D, post(tmp, 8));
3488 ld1r(v5, T2D, post(tmp, 8));
3489 ld1r(v6, T2D, post(tmp, 8));
3490 ld1r(v7, T2D, post(tmp, 8));
3491 mov(v16, T4S, 0, crc);
3492
3493 eor(v0, T16B, v0, v16);
3494 sub(len, len, 64);
3495
3496 BIND(L_fold);
3497 pmull(v22, T8H, v0, v5, T8B);
3498 pmull(v20, T8H, v0, v7, T8B);
3499 pmull(v23, T8H, v0, v4, T8B);
3500 pmull(v21, T8H, v0, v6, T8B);
3501
3502 pmull2(v18, T8H, v0, v5, T16B);
3503 pmull2(v16, T8H, v0, v7, T16B);
3504 pmull2(v19, T8H, v0, v4, T16B);
3505 pmull2(v17, T8H, v0, v6, T16B);
3506
3507 uzp1(v24, T8H, v20, v22);
3508 uzp2(v25, T8H, v20, v22);
3509 eor(v20, T16B, v24, v25);
3510
3511 uzp1(v26, T8H, v16, v18);
3512 uzp2(v27, T8H, v16, v18);
3513 eor(v16, T16B, v26, v27);
3514
3515 ushll2(v22, T4S, v20, T8H, 8);
3516 ushll(v20, T4S, v20, T4H, 8);
3517
3518 ushll2(v18, T4S, v16, T8H, 8);
3519 ushll(v16, T4S, v16, T4H, 8);
3520
3521 eor(v22, T16B, v23, v22);
3522 eor(v18, T16B, v19, v18);
3523 eor(v20, T16B, v21, v20);
3524 eor(v16, T16B, v17, v16);
3525
3526 uzp1(v17, T2D, v16, v20);
3527 uzp2(v21, T2D, v16, v20);
3528 eor(v17, T16B, v17, v21);
3529
3530 ushll2(v20, T2D, v17, T4S, 16);
3531 ushll(v16, T2D, v17, T2S, 16);
3532
3533 eor(v20, T16B, v20, v22);
3534 eor(v16, T16B, v16, v18);
3535
3536 uzp1(v17, T2D, v20, v16);
3537 uzp2(v21, T2D, v20, v16);
3538 eor(v28, T16B, v17, v21);
3539
3540 pmull(v22, T8H, v1, v5, T8B);
3541 pmull(v20, T8H, v1, v7, T8B);
3542 pmull(v23, T8H, v1, v4, T8B);
3543 pmull(v21, T8H, v1, v6, T8B);
3544
3545 pmull2(v18, T8H, v1, v5, T16B);
3546 pmull2(v16, T8H, v1, v7, T16B);
3547 pmull2(v19, T8H, v1, v4, T16B);
3548 pmull2(v17, T8H, v1, v6, T16B);
3549
3550 ld1(v0, v1, T2D, post(buf, 32));
3551
3552 uzp1(v24, T8H, v20, v22);
3553 uzp2(v25, T8H, v20, v22);
3554 eor(v20, T16B, v24, v25);
3555
3556 uzp1(v26, T8H, v16, v18);
3557 uzp2(v27, T8H, v16, v18);
3558 eor(v16, T16B, v26, v27);
3559
3560 ushll2(v22, T4S, v20, T8H, 8);
3561 ushll(v20, T4S, v20, T4H, 8);
3562
3563 ushll2(v18, T4S, v16, T8H, 8);
3564 ushll(v16, T4S, v16, T4H, 8);
3565
3566 eor(v22, T16B, v23, v22);
3567 eor(v18, T16B, v19, v18);
3568 eor(v20, T16B, v21, v20);
3569 eor(v16, T16B, v17, v16);
3570
3571 uzp1(v17, T2D, v16, v20);
3572 uzp2(v21, T2D, v16, v20);
3573 eor(v16, T16B, v17, v21);
3574
3575 ushll2(v20, T2D, v16, T4S, 16);
3576 ushll(v16, T2D, v16, T2S, 16);
3577
3578 eor(v20, T16B, v22, v20);
3579 eor(v16, T16B, v16, v18);
3580
3581 uzp1(v17, T2D, v20, v16);
3582 uzp2(v21, T2D, v20, v16);
3583 eor(v20, T16B, v17, v21);
3584
3585 shl(v16, T2D, v28, 1);
3586 shl(v17, T2D, v20, 1);
3587
3588 eor(v0, T16B, v0, v16);
3589 eor(v1, T16B, v1, v17);
3590
3591 subs(len, len, 32);
3592 br(Assembler::GE, L_fold);
3593
3594 mov(crc, 0);
3595 mov(tmp, v0, T1D, 0);
3596 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3597 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3598 mov(tmp, v0, T1D, 1);
3599 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3600 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3601 mov(tmp, v1, T1D, 0);
3602 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3603 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3604 mov(tmp, v1, T1D, 1);
3605 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3606 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3607
3608 add(len, len, 32);
3609 }
3610
3611 BIND(L_by16);
3612 subs(len, len, 16);
3613 br(Assembler::GE, L_by16_loop);
3614 adds(len, len, 16-4);
3615 br(Assembler::GE, L_by4_loop);
3616 adds(len, len, 4);
3617 br(Assembler::GT, L_by1_loop);
3618 b(L_exit);
3619
3620 BIND(L_by4_loop);
3621 ldrw(tmp, Address(post(buf, 4)));
3622 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3623 subs(len, len, 4);
3624 br(Assembler::GE, L_by4_loop);
3625 adds(len, len, 4);
3626 br(Assembler::LE, L_exit);
3627 BIND(L_by1_loop);
3628 subs(len, len, 1);
3629 ldrb(tmp, Address(post(buf, 1)));
3630 update_byte_crc32(crc, tmp, table0);
3631 br(Assembler::GT, L_by1_loop);
3632 b(L_exit);
3633
3634 align(CodeEntryAlignment);
3635 BIND(L_by16_loop);
3636 subs(len, len, 16);
3637 ldp(tmp, tmp3, Address(post(buf, 16)));
3638 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3639 update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3640 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3641 update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3642 br(Assembler::GE, L_by16_loop);
3643 adds(len, len, 16-4);
3644 br(Assembler::GE, L_by4_loop);
3645 adds(len, len, 4);
3646 br(Assembler::GT, L_by1_loop);
3647 BIND(L_exit);
3648 mvnw(crc, crc);
3649 }
3650
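// CRC-32C variant of the routine above: same structure, but using the crc32c*
// (Castagnoli polynomial) instructions. Note that, unlike the CRC-32 kernel,
// crc is not bit-inverted on entry or exit here.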
3651 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3652 Register len, Register tmp0, Register tmp1, Register tmp2,
3653 Register tmp3) {
3654 Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3655 assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3656
3657 subs(len, len, 128);
3658 br(Assembler::GE, CRC_by64_pre);
3659 BIND(CRC_less64);
3660 adds(len, len, 128-32);
3661 br(Assembler::GE, CRC_by32_loop);
3662 BIND(CRC_less32);
3663 adds(len, len, 32-4);
3664 br(Assembler::GE, CRC_by4_loop);
3665 adds(len, len, 4);
3666 br(Assembler::GT, CRC_by1_loop);
3667 b(L_exit);
3668
3669 BIND(CRC_by32_loop);
3670 ldp(tmp0, tmp1, Address(post(buf, 16)));
3671 subs(len, len, 32);
3672 crc32cx(crc, crc, tmp0);
3673 ldr(tmp2, Address(post(buf, 8)));
3674 crc32cx(crc, crc, tmp1);
3675 ldr(tmp3, Address(post(buf, 8)));
3676 crc32cx(crc, crc, tmp2);
3677 crc32cx(crc, crc, tmp3);
3678 br(Assembler::GE, CRC_by32_loop);
3679 cmn(len, (u1)32);
3680 br(Assembler::NE, CRC_less32);
3681 b(L_exit);
3682
3683 BIND(CRC_by4_loop);
3684 ldrw(tmp0, Address(post(buf, 4)));
3685 subs(len, len, 4);
3686 crc32cw(crc, crc, tmp0);
3687 br(Assembler::GE, CRC_by4_loop);
3688 adds(len, len, 4);
3689 br(Assembler::LE, L_exit);
3690 BIND(CRC_by1_loop);
3691 ldrb(tmp0, Address(post(buf, 1)));
3692 subs(len, len, 1);
3693 crc32cb(crc, crc, tmp0);
3694 br(Assembler::GT, CRC_by1_loop);
3695 b(L_exit);
3696
3697 BIND(CRC_by64_pre);
3698 sub(buf, buf, 8);
3699 ldp(tmp0, tmp1, Address(buf, 8));
3700 crc32cx(crc, crc, tmp0);
3701 ldr(tmp2, Address(buf, 24));
3702 crc32cx(crc, crc, tmp1);
3703 ldr(tmp3, Address(buf, 32));
3704 crc32cx(crc, crc, tmp2);
3705 ldr(tmp0, Address(buf, 40));
3706 crc32cx(crc, crc, tmp3);
3707 ldr(tmp1, Address(buf, 48));
3708 crc32cx(crc, crc, tmp0);
3709 ldr(tmp2, Address(buf, 56));
3710 crc32cx(crc, crc, tmp1);
3711 ldr(tmp3, Address(pre(buf, 64)));
3712
3713 b(CRC_by64_loop);
3714
3715 align(CodeEntryAlignment);
3716 BIND(CRC_by64_loop);
3717 subs(len, len, 64);
3718 crc32cx(crc, crc, tmp2);
3719 ldr(tmp0, Address(buf, 8));
3720 crc32cx(crc, crc, tmp3);
3721 ldr(tmp1, Address(buf, 16));
3722 crc32cx(crc, crc, tmp0);
3723 ldr(tmp2, Address(buf, 24));
3724 crc32cx(crc, crc, tmp1);
3725 ldr(tmp3, Address(buf, 32));
3726 crc32cx(crc, crc, tmp2);
3727 ldr(tmp0, Address(buf, 40));
3728 crc32cx(crc, crc, tmp3);
3729 ldr(tmp1, Address(buf, 48));
3730 crc32cx(crc, crc, tmp0);
3731 ldr(tmp2, Address(buf, 56));
3732 crc32cx(crc, crc, tmp1);
3733 ldr(tmp3, Address(pre(buf, 64)));
3734 br(Assembler::GE, CRC_by64_loop);
3735
3736 // post-loop
3737 crc32cx(crc, crc, tmp2);
3738 crc32cx(crc, crc, tmp3);
3739
3740 sub(len, len, 64);
3741 add(buf, buf, 8);
3742 cmn(len, (u1)128);
3743 br(Assembler::NE, CRC_less64);
3744 BIND(L_exit);
3745 }
3746
3747 /**
3748 * @param crc register containing existing CRC (32-bit)
3749 * @param buf register pointing to input byte buffer (byte*)
3750 * @param len register containing number of bytes
3751  * @param table0..table3 registers that will contain the addresses of the CRC tables
3752  * @param tmp, tmp2, tmp3 scratch registers
3753 */
3754 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3755 Register table0, Register table1, Register table2, Register table3,
3756 Register tmp, Register tmp2, Register tmp3) {
3757 kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3758 }
3759
3760
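// SkipIfEqual loads the given bool flag and emits a branch that skips all code
// generated between construction and destruction of the guard when the flag is
// zero; the destructor binds the branch target. Note that the value argument
// is not consulted by this implementation.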
3761 SkipIfEqual::SkipIfEqual(
3762 MacroAssembler* masm, const bool* flag_addr, bool value) {
3763 _masm = masm;
3764 uint64_t offset;
3765 _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3766 _masm->ldrb(rscratch1, Address(rscratch1, offset));
3767 _masm->cbzw(rscratch1, _label);
3768 }
3769
3770 SkipIfEqual::~SkipIfEqual() {
3771 _masm->bind(_label);
3772 }
3773
3774 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3775 Address adr;
3776 switch(dst.getMode()) {
3777 case Address::base_plus_offset:
3778 // This is the expected mode, although we allow all the other
3779 // forms below.
3780 adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3781 break;
3782 default:
3783 lea(rscratch2, dst);
3784 adr = Address(rscratch2);
3785 break;
3786 }
3787 ldr(rscratch1, adr);
3788 add(rscratch1, rscratch1, src);
3789 str(rscratch1, adr);
3790 }
3791
3792 void MacroAssembler::cmpptr(Register src1, Address src2) {
3793 uint64_t offset;
3794 adrp(rscratch1, src2, offset);
3795 ldr(rscratch1, Address(rscratch1, offset));
3796 cmp(src1, rscratch1);
3797 }
3798
3799 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3800 cmp(obj1, obj2);
3801 }
3802
3803 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
3804 load_method_holder(rresult, rmethod);
3805 ldr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
3806 }
3807
3808 void MacroAssembler::load_method_holder(Register holder, Register method) {
3809 ldr(holder, Address(method, Method::const_offset())); // ConstMethod*
3810 ldr(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool*
3811 ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
3812 }
3813
3814 // Loads the obj's Klass* into dst.
3815 // src and dst must be distinct registers
3816 // Preserves all registers (incl src, rscratch1 and rscratch2), but clobbers condition flags
3817 void MacroAssembler::load_nklass(Register dst, Register src) {
3818 assert(UseCompressedClassPointers, "expects UseCompressedClassPointers");
3819
3820 if (!UseCompactObjectHeaders) {
3821 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3822 return;
3823 }
3824
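  // With compact object headers the narrow Klass* is kept in the upper bits
  // of the mark word. If the object is monitor-locked, the mark word has been
  // displaced into the ObjectMonitor, so fetch the header from there first;
  // otherwise the mark word loaded from the object is used directly.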
3825 Label fast;
3826
3827 // Check if we can take the (common) fast path, if obj is unlocked.
3828 ldr(dst, Address(src, oopDesc::mark_offset_in_bytes()));
3829 tbz(dst, exact_log2(markWord::monitor_value), fast);
3830
3831 // Fetch displaced header
3832 ldr(dst, Address(dst, OM_OFFSET_NO_MONITOR_VALUE_TAG(header)));
3833
3834 // Fast-path: shift and decode Klass*.
3835 bind(fast);
3836 lsr(dst, dst, markWord::klass_shift);
3837 }
3838
3839 void MacroAssembler::load_klass(Register dst, Register src, bool null_check_src) {
3840 if (null_check_src) {
3841 if (UseCompactObjectHeaders) {
3842 null_check(src, oopDesc::mark_offset_in_bytes());
3843 } else {
3844 null_check(src, oopDesc::klass_offset_in_bytes());
3845 }
3846 }
3847
3848 if (UseCompressedClassPointers) {
3849 if (UseCompactObjectHeaders) {
3850 load_nklass(dst, src);
3851 } else {
3852 ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3853 }
3854 decode_klass_not_null(dst);
3855 } else {
3856 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3857 }
3858 }
3859
3860 // ((OopHandle)result).resolve();
3861 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3862 // OopHandle::resolve is an indirection.
3863 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3864 }
3865
3866 // ((WeakHandle)result).resolve();
3867 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
3868 assert_different_registers(rresult, rtmp);
3869 Label resolved;
3870
3871 // A null weak handle resolves to null.
3872 cbz(rresult, resolved);
3873
3874 // Only 64 bit platforms support GCs that require a tmp register
3875 // Only IN_HEAP loads require a thread_tmp register
3876 // WeakHandle::resolve is an indirection like jweak.
3877 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3878 rresult, Address(rresult), rtmp, /*tmp_thread*/noreg);
3879 bind(resolved);
3880 }
3881
3882 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3883 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3884 ldr(dst, Address(rmethod, Method::const_offset()));
3885 ldr(dst, Address(dst, ConstMethod::constants_offset()));
3886 ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3887 ldr(dst, Address(dst, mirror_offset));
3888 resolve_oop_handle(dst, tmp);
3889 }
3890
3891 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3892 assert_different_registers(oop, trial_klass, tmp);
3893 if (UseCompressedClassPointers) {
3894 if (UseCompactObjectHeaders) {
3895 load_nklass(tmp, oop);
3896 } else {
3897 ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3898 }
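    // With a null encoding base, decoding is just a shift, so compare against
    // the shifted narrow value without decoding tmp.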
3899 if (CompressedKlassPointers::base() == NULL) {
3900 cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
3901 return;
3902 } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3903 && CompressedKlassPointers::shift() == 0) {
3904 // Only the bottom 32 bits matter
3905 cmpw(trial_klass, tmp);
3906 return;
3907 }
3908 decode_klass_not_null(tmp);
3909 } else {
3910 ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3911 }
3912 cmp(trial_klass, tmp);
3913 }
3914
3915 void MacroAssembler::store_klass(Register dst, Register src) {
3916   // FIXME: Should this be a store release? Concurrent GCs assume
3917   // the klass length is valid if the klass field is not null.
3918 if (UseCompressedClassPointers) {
3919 encode_klass_not_null(src);
3920 strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3921 } else {
3922 str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3923 }
3924 }
3925
3926 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3927 if (UseCompressedClassPointers) {
3928 // Store to klass gap in destination
3929 strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3930 }
3931 }
3932
3933 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3934 load_klass(dst, src);
3935 ldr(dst, Address(dst, Klass::prototype_header_offset()));
3936 }
3937
3938 // Algorithm must match CompressedOops::encode.
3939 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3940 #ifdef ASSERT
3941 verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3942 #endif
3943 verify_oop(s, "broken oop in encode_heap_oop");
3944 if (CompressedOops::base() == NULL) {
3945 if (CompressedOops::shift() != 0) {
3946 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3947 lsr(d, s, LogMinObjAlignmentInBytes);
3948 } else {
3949 mov(d, s);
3950 }
3951 } else {
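    // A null oop lies below the heap base, so the subtraction borrows and the
    // csel (HS, i.e. unsigned >=) replaces the result with zero; null thus
    // encodes to 0 without needing a branch.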
3952 subs(d, s, rheapbase);
3953 csel(d, d, zr, Assembler::HS);
3954 lsr(d, d, LogMinObjAlignmentInBytes);
3955
3956 /* Old algorithm: is this any worse?
3957 Label nonnull;
3958 cbnz(r, nonnull);
3959 sub(r, r, rheapbase);
3960 bind(nonnull);
3961 lsr(r, r, LogMinObjAlignmentInBytes);
3962 */
3963 }
3964 }
3965
3966 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3967 #ifdef ASSERT
3968 verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3969 if (CheckCompressedOops) {
3970 Label ok;
3971 cbnz(r, ok);
3972 stop("null oop passed to encode_heap_oop_not_null");
3973 bind(ok);
3974 }
3975 #endif
3976 verify_oop(r, "broken oop in encode_heap_oop_not_null");
3977 if (CompressedOops::base() != NULL) {
3978 sub(r, r, rheapbase);
3979 }
3980 if (CompressedOops::shift() != 0) {
3981 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3982 lsr(r, r, LogMinObjAlignmentInBytes);
3983 }
3984 }
3985
3986 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3987 #ifdef ASSERT
3988 verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3989 if (CheckCompressedOops) {
3990 Label ok;
3991 cbnz(src, ok);
3992 stop("null oop passed to encode_heap_oop_not_null2");
3993 bind(ok);
3994 }
3995 #endif
3996 verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3997
3998 Register data = src;
3999 if (CompressedOops::base() != NULL) {
4000 sub(dst, src, rheapbase);
4001 data = dst;
4002 }
4003 if (CompressedOops::shift() != 0) {
4004 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4005 lsr(dst, data, LogMinObjAlignmentInBytes);
4006 data = dst;
4007 }
4008 if (data == src)
4009 mov(dst, src);
4010 }
4011
4012 void MacroAssembler::decode_heap_oop(Register d, Register s) {
4013 #ifdef ASSERT
4014 verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
4015 #endif
4016 if (CompressedOops::base() == NULL) {
4017 if (CompressedOops::shift() != 0 || d != s) {
4018 lsl(d, s, CompressedOops::shift());
4019 }
4020 } else {
4021 Label done;
4022 if (d != s)
4023 mov(d, s);
4024 cbz(s, done);
4025 add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
4026 bind(done);
4027 }
4028 verify_oop(d, "broken oop in decode_heap_oop");
4029 }
4030
4031 void MacroAssembler::decode_heap_oop_not_null(Register r) {
4032 assert (UseCompressedOops, "should only be used for compressed headers");
4033 assert (Universe::heap() != NULL, "java heap should be initialized");
4034 // Cannot assert, unverified entry point counts instructions (see .ad file)
4035 // vtableStubs also counts instructions in pd_code_size_limit.
4036 // Also do not verify_oop as this is called by verify_oop.
4037 if (CompressedOops::shift() != 0) {
4038 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4039 if (CompressedOops::base() != NULL) {
4040 add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
4041 } else {
4042 add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
4043 }
4044 } else {
4045 assert (CompressedOops::base() == NULL, "sanity");
4046 }
4047 }
4048
4049 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
4050 assert (UseCompressedOops, "should only be used for compressed headers");
4051 assert (Universe::heap() != NULL, "java heap should be initialized");
4052 // Cannot assert, unverified entry point counts instructions (see .ad file)
4053 // vtableStubs also counts instructions in pd_code_size_limit.
4054 // Also do not verify_oop as this is called by verify_oop.
4055 if (CompressedOops::shift() != 0) {
4056 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4057 if (CompressedOops::base() != NULL) {
4058 add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
4059 } else {
4060 add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
4061 }
4062 } else {
4063 assert (CompressedOops::base() == NULL, "sanity");
4064 if (dst != src) {
4065 mov(dst, src);
4066 }
4067 }
4068 }
4069
4070 MacroAssembler::KlassDecodeMode MacroAssembler::_klass_decode_mode(KlassDecodeNone);
4071
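// Selects (and caches) how narrow Klass* values are encoded and decoded:
//  - KlassDecodeZero: the encoding base is null, so decoding is just a shift;
//  - KlassDecodeXor:  the base is representable as a logical immediate and is
//    aligned to the klass range, so decoding is an eor with the base plus a
//    shift;
//  - KlassDecodeMovk: otherwise the shifted base must occupy only bits 32..47,
//    so decoding can rebuild it with a single movk.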
4072 MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode() {
4073 assert(UseCompressedClassPointers, "not using compressed class pointers");
4074 assert(Metaspace::initialized(), "metaspace not initialized yet");
4075
4076 if (_klass_decode_mode != KlassDecodeNone) {
4077 return _klass_decode_mode;
4078 }
4079
4080 assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift()
4081 || 0 == CompressedKlassPointers::shift(), "decode alg wrong");
4082
4083 if (CompressedKlassPointers::base() == NULL) {
4084 return (_klass_decode_mode = KlassDecodeZero);
4085 }
4086
4087 if (operand_valid_for_logical_immediate(
4088 /*is32*/false, (uint64_t)CompressedKlassPointers::base())) {
4089 const uint64_t range_mask =
4090 (1ULL << log2i(CompressedKlassPointers::range())) - 1;
4091 if (((uint64_t)CompressedKlassPointers::base() & range_mask) == 0) {
4092 return (_klass_decode_mode = KlassDecodeXor);
4093 }
4094 }
4095
4096 const uint64_t shifted_base =
4097 (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
4098 guarantee((shifted_base & 0xffff0000ffffffff) == 0,
4099 "compressed class base bad alignment");
4100
4101 return (_klass_decode_mode = KlassDecodeMovk);
4102 }
4103
4104 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
4105 switch (klass_decode_mode()) {
4106 case KlassDecodeZero:
4107 if (CompressedKlassPointers::shift() != 0) {
4108 lsr(dst, src, LogKlassAlignmentInBytes);
4109 } else {
4110 if (dst != src) mov(dst, src);
4111 }
4112 break;
4113
4114 case KlassDecodeXor:
4115 if (CompressedKlassPointers::shift() != 0) {
4116 eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4117 lsr(dst, dst, LogKlassAlignmentInBytes);
4118 } else {
4119 eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4120 }
4121 break;
4122
4123 case KlassDecodeMovk:
4124 if (CompressedKlassPointers::shift() != 0) {
4125 ubfx(dst, src, LogKlassAlignmentInBytes, 32);
4126 } else {
4127 movw(dst, src);
4128 }
4129 break;
4130
4131 case KlassDecodeNone:
4132 ShouldNotReachHere();
4133 break;
4134 }
4135 }
4136
4137 void MacroAssembler::encode_klass_not_null(Register r) {
4138 encode_klass_not_null(r, r);
4139 }
4140
4141 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
4142 assert (UseCompressedClassPointers, "should only be used for compressed headers");
4143
4144 switch (klass_decode_mode()) {
4145 case KlassDecodeZero:
4146 if (CompressedKlassPointers::shift() != 0) {
4147 lsl(dst, src, LogKlassAlignmentInBytes);
4148 } else {
4149 if (dst != src) mov(dst, src);
4150 }
4151 break;
4152
4153 case KlassDecodeXor:
4154 if (CompressedKlassPointers::shift() != 0) {
4155 lsl(dst, src, LogKlassAlignmentInBytes);
4156 eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
4157 } else {
4158 eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4159 }
4160 break;
4161
4162 case KlassDecodeMovk: {
4163 const uint64_t shifted_base =
4164 (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
4165
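    // klass_decode_mode() guarantees that the shifted base has bits set only
    // in [32, 48), so a single movk of the high half rebuilds the base on top
    // of the 32-bit narrow value.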
4166 if (dst != src) movw(dst, src);
4167 movk(dst, shifted_base >> 32, 32);
4168
4169 if (CompressedKlassPointers::shift() != 0) {
4170 lsl(dst, dst, LogKlassAlignmentInBytes);
4171 }
4172
4173 break;
4174 }
4175
4176 case KlassDecodeNone:
4177 ShouldNotReachHere();
4178 break;
4179 }
4180 }
4181
4182 void MacroAssembler::decode_klass_not_null(Register r) {
4183 decode_klass_not_null(r, r);
4184 }
4185
4186 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4187 #ifdef ASSERT
4188 {
4189 ThreadInVMfromUnknown tiv;
4190 assert (UseCompressedOops, "should only be used for compressed oops");
4191 assert (Universe::heap() != NULL, "java heap should be initialized");
4192 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4193 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4194 }
4195 #endif
4196 int oop_index = oop_recorder()->find_index(obj);
4197 InstructionMark im(this);
4198 RelocationHolder rspec = oop_Relocation::spec(oop_index);
4199 code_section()->relocate(inst_mark(), rspec);
4200 movz(dst, 0xDEAD, 16);
4201 movk(dst, 0xBEEF);
4202 }
4203
4204 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4205 assert (UseCompressedClassPointers, "should only be used for compressed headers");
4206 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4207 int index = oop_recorder()->find_index(k);
4208 assert(! Universe::heap()->is_in(k), "should not be an oop");
4209
4210 InstructionMark im(this);
4211 RelocationHolder rspec = metadata_Relocation::spec(index);
4212 code_section()->relocate(inst_mark(), rspec);
4213 narrowKlass nk = CompressedKlassPointers::encode(k);
4214 movz(dst, (nk >> 16), 16);
4215 movk(dst, nk & 0xffff);
4216 }
4217
4218 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4219 Register dst, Address src,
4220 Register tmp1, Register thread_tmp) {
4221 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4222 decorators = AccessInternal::decorator_fixup(decorators);
4223 bool as_raw = (decorators & AS_RAW) != 0;
4224 if (as_raw) {
4225 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4226 } else {
4227 bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4228 }
4229 }
4230
4231 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4232 Address dst, Register src,
4233 Register tmp1, Register thread_tmp) {
4234 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4235 decorators = AccessInternal::decorator_fixup(decorators);
4236 bool as_raw = (decorators & AS_RAW) != 0;
4237 if (as_raw) {
4238 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4239 } else {
4240 bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4241 }
4242 }
4243
4244 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4245 Register thread_tmp, DecoratorSet decorators) {
4246 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4247 }
4248
4249 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4250 Register thread_tmp, DecoratorSet decorators) {
4251 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4252 }
4253
4254 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4255 Register thread_tmp, DecoratorSet decorators) {
4256 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4257 }
4258
4259 // Used for storing NULLs.
4260 void MacroAssembler::store_heap_oop_null(Address dst) {
4261 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4262 }
4263
4264 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4265 assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4266 int index = oop_recorder()->allocate_metadata_index(obj);
4267 RelocationHolder rspec = metadata_Relocation::spec(index);
4268 return Address((address)obj, rspec);
4269 }
4270
4271 // Move an oop into a register. immediate is true if we want
4272 // immediate instructions and nmethod entry barriers are not enabled,
4273 // i.e. we are not going to patch this instruction while the code is being
4274 // executed by another thread.
4275 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4276 int oop_index;
4277 if (obj == NULL) {
4278 oop_index = oop_recorder()->allocate_oop_index(obj);
4279 } else {
4280 #ifdef ASSERT
4281 {
4282 ThreadInVMfromUnknown tiv;
4283 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4284 }
4285 #endif
4286 oop_index = oop_recorder()->find_index(obj);
4287 }
4288 RelocationHolder rspec = oop_Relocation::spec(oop_index);
4289
4290   // nmethod entry barriers necessitate using the constant pool. They have to be
4291   // ordered with respect to oop accesses.
4292 // Using immediate literals would necessitate ISBs.
4293 if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL || !immediate) {
4294 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4295 ldr_constant(dst, Address(dummy, rspec));
4296 } else
4297 mov(dst, Address((address)obj, rspec));
4298
4299 }
4300
4301 // Move a metadata address into a register.
4302 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4303 int oop_index;
4304 if (obj == NULL) {
4305 oop_index = oop_recorder()->allocate_metadata_index(obj);
4306 } else {
4307 oop_index = oop_recorder()->find_index(obj);
4308 }
4309 RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4310 mov(dst, Address((address)obj, rspec));
4311 }
4312
4313 Address MacroAssembler::constant_oop_address(jobject obj) {
4314 #ifdef ASSERT
4315 {
4316 ThreadInVMfromUnknown tiv;
4317 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4318 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
4319 }
4320 #endif
4321 int oop_index = oop_recorder()->find_index(obj);
4322 return Address((address)obj, oop_Relocation::spec(oop_index));
4323 }
4324
4325 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4326 void MacroAssembler::tlab_allocate(Register obj,
4327 Register var_size_in_bytes,
4328 int con_size_in_bytes,
4329 Register t1,
4330 Register t2,
4331 Label& slow_case) {
4332 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4333 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4334 }
4335
4336 // Defines obj, preserves var_size_in_bytes
4337 void MacroAssembler::eden_allocate(Register obj,
4338 Register var_size_in_bytes,
4339 int con_size_in_bytes,
4340 Register t1,
4341 Label& slow_case) {
4342 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4343 bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4344 }
4345
4346 void MacroAssembler::verify_tlab() {
4347 #ifdef ASSERT
4348 if (UseTLAB && VerifyOops) {
4349 Label next, ok;
4350
4351 stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4352
4353 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4354 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4355 cmp(rscratch2, rscratch1);
4356 br(Assembler::HS, next);
4357 STOP("assert(top >= start)");
4358 should_not_reach_here();
4359
4360 bind(next);
4361 ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4362 ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4363 cmp(rscratch2, rscratch1);
4364 br(Assembler::HS, ok);
4365 STOP("assert(top <= end)");
4366 should_not_reach_here();
4367
4368 bind(ok);
4369 ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4370 }
4371 #endif
4372 }
4373
4374 // Writes to successive stack pages until the given offset is reached, to check
4375 // for stack overflow + shadow pages. This clobbers tmp.
4376 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4377 assert_different_registers(tmp, size, rscratch1);
4378 mov(tmp, sp);
4379 // Bang stack for total size given plus shadow page size.
4380 // Bang one page at a time because large size can bang beyond yellow and
4381 // red zones.
4382 Label loop;
4383 mov(rscratch1, os::vm_page_size());
4384 bind(loop);
4385 lea(tmp, Address(tmp, -os::vm_page_size()));
4386 subsw(size, size, rscratch1);
4387 str(size, Address(tmp));
4388 br(Assembler::GT, loop);
4389
4390 // Bang down shadow pages too.
4391 // At this point, (tmp-0) is the last address touched, so don't
4392 // touch it again. (It was touched as (tmp-pagesize) but then tmp
4393 // was post-decremented.) Skip this address by starting at i=1, and
4394 // touch a few more pages below. N.B. It is important to touch all
4395 // the way down to and including i=StackShadowPages.
4396 for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4397     // this could be any sized move, but it can serve as a debugging crumb,
4398     // so the bigger the better.
4399 lea(tmp, Address(tmp, -os::vm_page_size()));
4400 str(size, Address(tmp));
4401 }
4402 }
4403
4404 // Move the address of the polling page into dest.
4405 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
4406 ldr(dest, Address(rthread, JavaThread::polling_page_offset()));
4407 }
4408
4409 // Read the polling page. The address of the polling page must
4410 // already be in r.
4411 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4412 address mark;
4413 {
4414 InstructionMark im(this);
4415 code_section()->relocate(inst_mark(), rtype);
4416 ldrw(zr, Address(r, 0));
4417 mark = inst_mark();
4418 }
4419 verify_cross_modify_fence_not_required();
4420 return mark;
4421 }
4422
4423 void MacroAssembler::adrp(Register reg1, const Address &dest, uint64_t &byte_offset) {
4424 relocInfo::relocType rtype = dest.rspec().reloc()->type();
4425 uint64_t low_page = (uint64_t)CodeCache::low_bound() >> 12;
4426 uint64_t high_page = (uint64_t)(CodeCache::high_bound()-1) >> 12;
4427 uint64_t dest_page = (uint64_t)dest.target() >> 12;
4428 int64_t offset_low = dest_page - low_page;
4429 int64_t offset_high = dest_page - high_page;
4430
4431 assert(is_valid_AArch64_address(dest.target()), "bad address");
4432 assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4433
4434 InstructionMark im(this);
4435 code_section()->relocate(inst_mark(), dest.rspec());
4436 // 8143067: Ensure that the adrp can reach the dest from anywhere within
4437 // the code cache so that if it is relocated we know it will still reach
4438 if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4439 _adrp(reg1, dest.target());
4440 } else {
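    // Out of adrp range: form an address whose low 32 bits match the target
    // and whose bits 32..47 come from the current pc, so that the adrp itself
    // is always reachable, then overwrite bits 32..47 with the target's value
    // using movk.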
4441 uint64_t target = (uint64_t)dest.target();
4442 uint64_t adrp_target
4443 = (target & 0xffffffffULL) | ((uint64_t)pc() & 0xffff00000000ULL);
4444
4445 _adrp(reg1, (address)adrp_target);
4446 movk(reg1, target >> 32, 32);
4447 }
4448 byte_offset = (uint64_t)dest.target() & 0xfff;
4449 }
4450
4451 void MacroAssembler::load_byte_map_base(Register reg) {
4452 CardTable::CardValue* byte_map_base =
4453 ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4454
4455 // Strictly speaking the byte_map_base isn't an address at all, and it might
4456 // even be negative. It is thus materialised as a constant.
4457 mov(reg, (uint64_t)byte_map_base);
4458 }
4459
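// Builds a stack frame of the given size, which must include the two words for
// the saved rfp/lr pair. Small frames drop sp in one step and then store
// rfp/lr at the top of the frame; larger frames push rfp/lr first and then
// lower sp, via a scratch register when the adjustment exceeds the immediate
// range of sub. remove_frame() below undoes this in the matching order.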
4460 void MacroAssembler::build_frame(int framesize) {
4461 assert(framesize >= 2 * wordSize, "framesize must include space for FP/LR");
4462 assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4463 if (framesize < ((1 << 9) + 2 * wordSize)) {
4464 sub(sp, sp, framesize);
4465 stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4466 if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4467 } else {
4468 stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4469 if (PreserveFramePointer) mov(rfp, sp);
4470 if (framesize < ((1 << 12) + 2 * wordSize))
4471 sub(sp, sp, framesize - 2 * wordSize);
4472 else {
4473 mov(rscratch1, framesize - 2 * wordSize);
4474 sub(sp, sp, rscratch1);
4475 }
4476 }
4477 verify_cross_modify_fence_not_required();
4478 }
4479
4480 void MacroAssembler::remove_frame(int framesize) {
4481 assert(framesize >= 2 * wordSize, "framesize must include space for FP/LR");
4482 assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4483 if (framesize < ((1 << 9) + 2 * wordSize)) {
4484 ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4485 add(sp, sp, framesize);
4486 } else {
4487 if (framesize < ((1 << 12) + 2 * wordSize))
4488 add(sp, sp, framesize - 2 * wordSize);
4489 else {
4490 mov(rscratch1, framesize - 2 * wordSize);
4491 add(sp, sp, rscratch1);
4492 }
4493 ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4494 }
4495 }
4496
4497
4498 // This method checks whether the provided byte array contains a byte with the highest bit set.
4499 address MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
4500   // The simple and most common case, a small aligned array that is not at the
4501   // end of a memory page, is handled here. All other cases are in the stub.
4502 Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
4503 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4504 assert_different_registers(ary1, len, result);
4505
4506 cmpw(len, 0);
4507 br(LE, SET_RESULT);
4508 cmpw(len, 4 * wordSize);
4509 br(GE, STUB_LONG); // size > 32 then go to stub
4510
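  // Check whether reading 4 words starting at ary1 could run past the end of
  // the page: shift the address so that only its within-page offset survives
  // in the top bits, add the similarly shifted access size, and a carry out
  // means the read would reach the page boundary, in which case use the stub.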
4511 int shift = 64 - exact_log2(os::vm_page_size());
4512 lsl(rscratch1, ary1, shift);
4513 mov(rscratch2, (size_t)(4 * wordSize) << shift);
4514 adds(rscratch2, rscratch1, rscratch2); // At end of page?
4515 br(CS, STUB); // at the end of page then go to stub
4516 subs(len, len, wordSize);
4517 br(LT, END);
4518
4519 BIND(LOOP);
4520 ldr(rscratch1, Address(post(ary1, wordSize)));
4521 tst(rscratch1, UPPER_BIT_MASK);
4522 br(NE, SET_RESULT);
4523 subs(len, len, wordSize);
4524 br(GE, LOOP);
4525 cmpw(len, -wordSize);
4526 br(EQ, SET_RESULT);
4527
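  // Fewer than wordSize bytes remain (len is negative here). Load one more
  // word, which may read past the logical end of the array but stays within
  // the page thanks to the check above, then shift out the bytes that are not
  // part of the array before testing the sign bits.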
4528 BIND(END);
4529 ldr(result, Address(ary1));
4530 sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
4531 lslv(result, result, len);
4532 tst(result, UPPER_BIT_MASK);
4533 b(SET_RESULT);
4534
4535 BIND(STUB);
4536 RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
4537 assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
4538 address tpc1 = trampoline_call(has_neg);
4539 if (tpc1 == NULL) {
4540 DEBUG_ONLY(reset_labels(STUB_LONG, SET_RESULT, DONE));
4541 postcond(pc() == badAddress);
4542 return NULL;
4543 }
4544 b(DONE);
4545
4546 BIND(STUB_LONG);
4547 RuntimeAddress has_neg_long = RuntimeAddress(StubRoutines::aarch64::has_negatives_long());
4548 assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
4549 address tpc2 = trampoline_call(has_neg_long);
4550 if (tpc2 == NULL) {
4551 DEBUG_ONLY(reset_labels(SET_RESULT, DONE));
4552 postcond(pc() == badAddress);
4553 return NULL;
4554 }
4555 b(DONE);
4556
4557 BIND(SET_RESULT);
4558 cset(result, NE); // set true or false
4559
4560 BIND(DONE);
4561 postcond(pc() != badAddress);
4562 return pc();
4563 }
4564
4565 // Clobbers: rscratch1, rscratch2, rflags
4566 // May also clobber v0-v7 when (!UseSimpleArrayEquals && UseSIMDForArrayEquals)
4567 address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
4568 Register tmp4, Register tmp5, Register result,
4569 Register cnt1, int elem_size) {
4570 Label DONE, SAME;
4571 Register tmp1 = rscratch1;
4572 Register tmp2 = rscratch2;
4573 Register cnt2 = tmp2; // cnt2 only used in array length compare
4574 int elem_per_word = wordSize/elem_size;
4575 int log_elem_size = exact_log2(elem_size);
4576 int length_offset = arrayOopDesc::length_offset_in_bytes();
4577 int base_offset
4578 = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
4579 int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
4580
4581 assert(elem_size == 1 || elem_size == 2, "must be char or byte");
4582 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
4583
4584 #ifndef PRODUCT
4585 {
4586 const char kind = (elem_size == 2) ? 'U' : 'L';
4587 char comment[64];
4588 snprintf(comment, sizeof comment, "array_equals%c{", kind);
4589 BLOCK_COMMENT(comment);
4590 }
4591 #endif
4592
4593 // if (a1 == a2)
4594 // return true;
4595 cmpoop(a1, a2); // May have read barriers for a1 and a2.
4596 br(EQ, SAME);
4597
4598 if (UseSimpleArrayEquals) {
4599 Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
4600 // if (a1 == null || a2 == null)
4601 // return false;
4602     // a1 & a2 == 0 means (some pointer is null) or
4603     // (very rare, or even probably impossible, pointer values),
4604     // so we can save one branch in most cases
4605 tst(a1, a2);
4606 mov(result, false);
4607 br(EQ, A_MIGHT_BE_NULL);
4608 // if (a1.length != a2.length)
4609 // return false;
4610 bind(A_IS_NOT_NULL);
4611 ldrw(cnt1, Address(a1, length_offset));
4612 ldrw(cnt2, Address(a2, length_offset));
4613 eorw(tmp5, cnt1, cnt2);
4614 cbnzw(tmp5, DONE);
4615 lea(a1, Address(a1, base_offset));
4616 lea(a2, Address(a2, base_offset));
4617 // Check for short strings, i.e. smaller than wordSize.
4618 subs(cnt1, cnt1, elem_per_word);
4619 br(Assembler::LT, SHORT);
4620 // Main 8 byte comparison loop.
4621 bind(NEXT_WORD); {
4622 ldr(tmp1, Address(post(a1, wordSize)));
4623 ldr(tmp2, Address(post(a2, wordSize)));
4624 subs(cnt1, cnt1, elem_per_word);
4625 eor(tmp5, tmp1, tmp2);
4626 cbnz(tmp5, DONE);
4627 } br(GT, NEXT_WORD);
4628 // Last longword. In the case where length == 4 we compare the
4629 // same longword twice, but that's still faster than another
4630 // conditional branch.
4631 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
4632 // length == 4.
4633 if (log_elem_size > 0)
4634 lsl(cnt1, cnt1, log_elem_size);
4635 ldr(tmp3, Address(a1, cnt1));
4636 ldr(tmp4, Address(a2, cnt1));
4637 eor(tmp5, tmp3, tmp4);
4638 cbnz(tmp5, DONE);
4639 b(SAME);
4640 bind(A_MIGHT_BE_NULL);
4641 // in case both a1 and a2 are not-null, proceed with loads
4642 cbz(a1, DONE);
4643 cbz(a2, DONE);
4644 b(A_IS_NOT_NULL);
4645 bind(SHORT);
4646
4647 tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
4648 {
4649 ldrw(tmp1, Address(post(a1, 4)));
4650 ldrw(tmp2, Address(post(a2, 4)));
4651 eorw(tmp5, tmp1, tmp2);
4652 cbnzw(tmp5, DONE);
4653 }
4654 bind(TAIL03);
4655 tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
4656 {
4657 ldrh(tmp3, Address(post(a1, 2)));
4658 ldrh(tmp4, Address(post(a2, 2)));
4659 eorw(tmp5, tmp3, tmp4);
4660 cbnzw(tmp5, DONE);
4661 }
4662 bind(TAIL01);
4663 if (elem_size == 1) { // Only needed when comparing byte arrays.
4664 tbz(cnt1, 0, SAME); // 0-1 bytes left.
4665 {
4666 ldrb(tmp1, a1);
4667 ldrb(tmp2, a2);
4668 eorw(tmp5, tmp1, tmp2);
4669 cbnzw(tmp5, DONE);
4670 }
4671 }
4672 } else {
4673 Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB,
4674 CSET_EQ, LAST_CHECK;
4675 mov(result, false);
4676 cbz(a1, DONE);
4677 ldrw(cnt1, Address(a1, length_offset));
4678 cbz(a2, DONE);
4679 ldrw(cnt2, Address(a2, length_offset));
4680 // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
4681 // faster to perform another branch before comparing a1 and a2
4682 cmp(cnt1, (u1)elem_per_word);
4683 br(LE, SHORT); // short or same
4684 ldr(tmp3, Address(pre(a1, base_offset)));
4685 subs(zr, cnt1, stubBytesThreshold);
4686 br(GE, STUB);
4687 ldr(tmp4, Address(pre(a2, base_offset)));
4688 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
4689 cmp(cnt2, cnt1);
4690 br(NE, DONE);
4691
4692 // Main 16 byte comparison loop with 2 exits
4693 bind(NEXT_DWORD); {
4694 ldr(tmp1, Address(pre(a1, wordSize)));
4695 ldr(tmp2, Address(pre(a2, wordSize)));
4696 subs(cnt1, cnt1, 2 * elem_per_word);
4697 br(LE, TAIL);
4698 eor(tmp4, tmp3, tmp4);
4699 cbnz(tmp4, DONE);
4700 ldr(tmp3, Address(pre(a1, wordSize)));
4701 ldr(tmp4, Address(pre(a2, wordSize)));
4702 cmp(cnt1, (u1)elem_per_word);
4703 br(LE, TAIL2);
4704 cmp(tmp1, tmp2);
4705 } br(EQ, NEXT_DWORD);
4706 b(DONE);
4707
4708 bind(TAIL);
4709 eor(tmp4, tmp3, tmp4);
4710 eor(tmp2, tmp1, tmp2);
4711 lslv(tmp2, tmp2, tmp5);
4712 orr(tmp5, tmp4, tmp2);
4713 cmp(tmp5, zr);
4714 b(CSET_EQ);
4715
4716 bind(TAIL2);
4717 eor(tmp2, tmp1, tmp2);
4718 cbnz(tmp2, DONE);
4719 b(LAST_CHECK);
4720
4721 bind(STUB);
4722 ldr(tmp4, Address(pre(a2, base_offset)));
4723 cmp(cnt2, cnt1);
4724 br(NE, DONE);
4725 if (elem_size == 2) { // convert to byte counter
4726 lsl(cnt1, cnt1, 1);
4727 }
4728 eor(tmp5, tmp3, tmp4);
4729 cbnz(tmp5, DONE);
4730 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
4731 assert(stub.target() != NULL, "array_equals_long stub has not been generated");
4732 address tpc = trampoline_call(stub);
4733 if (tpc == NULL) {
4734 DEBUG_ONLY(reset_labels(SHORT, LAST_CHECK, CSET_EQ, SAME, DONE));
4735 postcond(pc() == badAddress);
4736 return NULL;
4737 }
4738 b(DONE);
4739
4740 // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
4741 // so, if a2 == null => return false(0), else return true, so we can return a2
4742 mov(result, a2);
4743 b(DONE);
4744 bind(SHORT);
4745 cmp(cnt2, cnt1);
4746 br(NE, DONE);
4747 cbz(cnt1, SAME);
4748 sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
4749 ldr(tmp3, Address(a1, base_offset));
4750 ldr(tmp4, Address(a2, base_offset));
4751 bind(LAST_CHECK);
4752 eor(tmp4, tmp3, tmp4);
4753 lslv(tmp5, tmp4, tmp5);
4754 cmp(tmp5, zr);
4755 bind(CSET_EQ);
4756 cset(result, EQ);
4757 b(DONE);
4758 }
4759
4760 bind(SAME);
4761 mov(result, true);
4762 // That's it.
4763 bind(DONE);
4764
4765 BLOCK_COMMENT("} array_equals");
4766 postcond(pc() != badAddress);
4767 return pc();
4768 }
4769
4770 // Compare Strings
4771
4772 // For Strings we're passed the address of the first characters in a1
4773 // and a2 and the length in cnt1.
4774 // elem_size is the element size in bytes: either 1 or 2.
4775 // There are two implementations. For arrays >= 8 bytes, all
4776 // comparisons (including the final one, which may overlap) are
4777 // performed 8 bytes at a time. For strings < 8 bytes, we compare a
4778 // halfword, then a short, and then a byte.
4779
4780 void MacroAssembler::string_equals(Register a1, Register a2,
4781 Register result, Register cnt1, int elem_size)
4782 {
4783 Label SAME, DONE, SHORT, NEXT_WORD;
4784 Register tmp1 = rscratch1;
4785 Register tmp2 = rscratch2;
4786 Register cnt2 = tmp2; // cnt2 only used in array length compare
4787
4788   assert(elem_size == 1 || elem_size == 2, "must be 1 or 2 bytes");
4789 assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
4790
4791 #ifndef PRODUCT
4792 {
4793 const char kind = (elem_size == 2) ? 'U' : 'L';
4794 char comment[64];
4795 snprintf(comment, sizeof comment, "{string_equals%c", kind);
4796 BLOCK_COMMENT(comment);
4797 }
4798 #endif
4799
4800 mov(result, false);
4801
4802 // Check for short strings, i.e. smaller than wordSize.
4803 subs(cnt1, cnt1, wordSize);
4804 br(Assembler::LT, SHORT);
4805 // Main 8 byte comparison loop.
4806 bind(NEXT_WORD); {
4807 ldr(tmp1, Address(post(a1, wordSize)));
4808 ldr(tmp2, Address(post(a2, wordSize)));
4809 subs(cnt1, cnt1, wordSize);
4810 eor(tmp1, tmp1, tmp2);
4811 cbnz(tmp1, DONE);
4812 } br(GT, NEXT_WORD);
4813 // Last longword. In the case where length == 4 we compare the
4814 // same longword twice, but that's still faster than another
4815 // conditional branch.
4816 // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
4817 // length == 4.
4818 ldr(tmp1, Address(a1, cnt1));
4819 ldr(tmp2, Address(a2, cnt1));
4820 eor(tmp2, tmp1, tmp2);
4821 cbnz(tmp2, DONE);
4822 b(SAME);
4823
4824 bind(SHORT);
4825 Label TAIL03, TAIL01;
4826
4827 tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
4828 {
4829 ldrw(tmp1, Address(post(a1, 4)));
4830 ldrw(tmp2, Address(post(a2, 4)));
4831 eorw(tmp1, tmp1, tmp2);
4832 cbnzw(tmp1, DONE);
4833 }
4834 bind(TAIL03);
4835 tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
4836 {
4837 ldrh(tmp1, Address(post(a1, 2)));
4838 ldrh(tmp2, Address(post(a2, 2)));
4839 eorw(tmp1, tmp1, tmp2);
4840 cbnzw(tmp1, DONE);
4841 }
4842 bind(TAIL01);
4843 if (elem_size == 1) { // Only needed when comparing 1-byte elements
4844 tbz(cnt1, 0, SAME); // 0-1 bytes left.
4845 {
4846 ldrb(tmp1, a1);
4847 ldrb(tmp2, a2);
4848 eorw(tmp1, tmp1, tmp2);
4849 cbnzw(tmp1, DONE);
4850 }
4851 }
4852 // Arrays are equal.
4853 bind(SAME);
4854 mov(result, true);
4855
4856 // That's it.
4857 bind(DONE);
4858 BLOCK_COMMENT("} string_equals");
4859 }
4860
4861
4862 // The size of the blocks erased by the zero_blocks stub. We must
4863 // handle anything smaller than this ourselves in zero_words().
4864 const int MacroAssembler::zero_words_block_size = 8;
4865
4866 // zero_words() is used by C2 ClearArray patterns and by
4867 // C1_MacroAssembler. It is as small as possible, handling small word
4868 // counts locally and delegating anything larger to the zero_blocks
4869 // stub. It is expanded many times in compiled code, so it is
4870 // important to keep it short.
4871
4872 // ptr: Address of a buffer to be zeroed.
4873 // cnt: Count in HeapWords.
4874 //
4875 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
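//
// A rough C sketch of the emitted logic (illustrative only; assumes the
// default zero_words_block_size of 8, and zero_blocks() here is a
// hypothetical stand-in for the stub call):
//
//   if (cnt >= zero_words_block_size)
//     cnt = zero_blocks(ptr, cnt);   // stub zeroes whole blocks, updates ptr/cnt
//   // tail: fewer than 8 words remain
//   if (cnt & 4) { ptr[0] = ptr[1] = ptr[2] = ptr[3] = 0; ptr += 4; }
//   if (cnt & 2) { ptr[0] = ptr[1] = 0; ptr += 2; }
//   if (cnt & 1) { ptr[0] = 0; }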
4876 address MacroAssembler::zero_words(Register ptr, Register cnt)
4877 {
4878 assert(is_power_of_2(zero_words_block_size), "adjust this");
4879
4880 BLOCK_COMMENT("zero_words {");
4881 assert(ptr == r10 && cnt == r11, "mismatch in register usage");
4884
4885 subs(rscratch1, cnt, zero_words_block_size);
4886 Label around;
4887 br(LO, around);
4888 {
4889 RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
4890 assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
4891 // Make sure this is a C2 compilation. C1 allocates space only for
4892 // trampoline stubs generated by Call LIR ops, and in any case it
4893 // makes sense for a C1 compilation task to proceed as quickly as
4894 // possible.
4895 CompileTask* task;
4896 if (StubRoutines::aarch64::complete()
4897 && Thread::current()->is_Compiler_thread()
4898 && (task = ciEnv::current()->task())
4899 && is_c2_compile(task->comp_level())) {
4900 address tpc = trampoline_call(zero_blocks);
4901 if (tpc == NULL) {
4902 DEBUG_ONLY(reset_labels(around));
4903 return NULL;
4904 }
4905 } else {
4906 far_call(zero_blocks);
4907 }
4908 }
4909 bind(around);
4910
4911 // We have a few words left to do. zero_blocks has adjusted r10 and r11
4912 // for us.
4913 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
4914 Label l;
4915 tbz(cnt, exact_log2(i), l);
4916 for (int j = 0; j < i; j += 2) {
4917 stp(zr, zr, post(ptr, 2 * BytesPerWord));
4918 }
4919 bind(l);
4920 }
4921 {
4922 Label l;
4923 tbz(cnt, 0, l);
4924 str(zr, Address(ptr));
4925 bind(l);
4926 }
4927
4928 BLOCK_COMMENT("} zero_words");
4929 return pc();
4930 }
4931
4932 // base: Address of a buffer to be zeroed, 8-byte aligned.
4933 // cnt: Immediate count in HeapWords.
4934 //
4935 // r10, r11, rscratch1, and rscratch2 are clobbered.
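//
// A rough sketch of the strategy (illustrative only):
//
//   if (cnt <= BlockZeroingLowLimit / BytesPerWord) {
//     // emit straight-line stp/str stores, with a small loop that zeroes
//     // 16 words per iteration when cnt >= 16
//   } else {
//     r10 = base; r11 = cnt;
//     zero_words(r10, r11);          // register variant above, may call the stub
//   }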
4936 address MacroAssembler::zero_words(Register base, uint64_t cnt)
4937 {
4938 assert(wordSize <= BlockZeroingLowLimit,
4939 "increase BlockZeroingLowLimit");
4940 address result = nullptr;
4941 if (cnt <= (uint64_t)BlockZeroingLowLimit / BytesPerWord) {
4942 #ifndef PRODUCT
4943 {
4944 char buf[64];
4945 snprintf(buf, sizeof buf, "zero_words (count = %" PRIu64 ") {", cnt);
4946 BLOCK_COMMENT(buf);
4947 }
4948 #endif
4949 if (cnt >= 16) {
4950 uint64_t loops = cnt/16;
4951 if (loops > 1) {
4952 mov(rscratch2, loops - 1);
4953 }
4954 {
4955 Label loop;
4956 bind(loop);
4957 for (int i = 0; i < 16; i += 2) {
4958 stp(zr, zr, Address(base, i * BytesPerWord));
4959 }
4960 add(base, base, 16 * BytesPerWord);
4961 if (loops > 1) {
4962 subs(rscratch2, rscratch2, 1);
4963 br(GE, loop);
4964 }
4965 }
4966 }
4967 cnt %= 16;
4968 int i = cnt & 1; // store any odd word to start
4969 if (i) str(zr, Address(base));
4970 for (; i < (int)cnt; i += 2) {
4971 stp(zr, zr, Address(base, i * wordSize));
4972 }
4973 BLOCK_COMMENT("} zero_words");
4974 result = pc();
4975 } else {
4976 mov(r10, base); mov(r11, cnt);
4977 result = zero_words(r10, r11);
4978 }
4979 return result;
4980 }
4981
4982 // Zero blocks of memory by using DC ZVA.
4983 //
4984 // Aligns the base address first sufficiently for DC ZVA, then uses
4985 // DC ZVA repeatedly for every full block. cnt is the size to be
4986 // zeroed in HeapWords. Returns the count of words left to be zeroed
4987 // in cnt.
4988 //
4989 // NOTE: This is intended to be used in the zero_blocks() stub. If
4990 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
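//
// A rough C sketch of the emitted logic (illustrative only; the real code
// branches into an unrolled stp table to do the pre-alignment stores, and
// dc_zva() here is a hypothetical stand-in for the DC ZVA instruction):
//
//   if (base & 0xf) return;                           // caller handles unaligned base
//   size_t fill = (-(uintptr_t)base) & (zva_length - 1);  // bytes to reach ZVA alignment
//   memset(base, 0, fill); base += fill; cnt -= fill >> 3;
//   while (cnt >= (zva_length >> 3)) {
//     dc_zva(base);                                   // zero one zva_length-byte block
//     base += zva_length; cnt -= zva_length >> 3;
//   }
//   // cnt words are left for the caller to zero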
4991 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
4992 Register tmp = rscratch1;
4993 Register tmp2 = rscratch2;
4994 int zva_length = VM_Version::zva_length();
4995 Label initial_table_end, loop_zva;
4996 Label fini;
4997
4998 // Base must be 16-byte aligned. If not, just return and let the caller handle it.
4999 tst(base, 0x0f);
5000 br(Assembler::NE, fini);
5001 // Align base with ZVA length.
5002 neg(tmp, base);
5003 andr(tmp, tmp, zva_length - 1);
5004
5005 // tmp: the number of bytes to be filled to align the base with ZVA length.
5006 add(base, base, tmp);
5007 sub(cnt, cnt, tmp, Assembler::ASR, 3);
5008 adr(tmp2, initial_table_end);
5009 sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5010 br(tmp2);
5011
5012 for (int i = -zva_length + 16; i < 0; i += 16)
5013 stp(zr, zr, Address(base, i));
5014 bind(initial_table_end);
5015
5016 sub(cnt, cnt, zva_length >> 3);
5017 bind(loop_zva);
5018 dc(Assembler::ZVA, base);
5019 subs(cnt, cnt, zva_length >> 3);
5020 add(base, base, zva_length);
5021 br(Assembler::GE, loop_zva);
5022 add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5023 bind(fini);
5024 }
5025
5026 // base: Address of a buffer to be filled, 8-byte aligned.
5027 // cnt: Count in 8-byte units.
5028 // value: Value to be filled with.
5029 // base will point to the end of the buffer after filling.
5030 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5031 {
5032 // Algorithm:
5033 //
5034 // scratch1 = cnt & 7;
5035 // cnt -= scratch1;
5036 // p += scratch1;
5037 // switch (scratch1) {
5038 // do {
5039 // cnt -= 8;
5040 // p[-8] = v;
5041 // case 7:
5042 // p[-7] = v;
5043 // case 6:
5044 // p[-6] = v;
5045 // // ...
5046 // case 1:
5047 // p[-1] = v;
5048 // case 0:
5049 // p += 8;
5050 // } while (cnt);
5051 // }
5052
5053 assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5054
5055 Label fini, skip, entry, loop;
5056 const int unroll = 8; // Number of stp instructions we'll unroll
5057
5058 cbz(cnt, fini);
5059 tbz(base, 3, skip);
5060 str(value, Address(post(base, 8)));
5061 sub(cnt, cnt, 1);
5062 bind(skip);
5063
5064 andr(rscratch1, cnt, (unroll-1) * 2);
5065 sub(cnt, cnt, rscratch1);
5066 add(base, base, rscratch1, Assembler::LSL, 3);
5067 adr(rscratch2, entry);
5068 sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5069 br(rscratch2);
5070
5071 bind(loop);
5072 add(base, base, unroll * 16);
5073 for (int i = -unroll; i < 0; i++)
5074 stp(value, value, Address(base, i * 16));
5075 bind(entry);
5076 subs(cnt, cnt, unroll * 2);
5077 br(Assembler::GE, loop);
5078
5079 tbz(cnt, 0, fini);
5080 str(value, Address(post(base, 8)));
5081 bind(fini);
5082 }
5083
5084 // Intrinsic for
5085 //
5086 // - sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray
5087 // return the number of characters copied.
5088 // - java/lang/StringUTF16.compress
5089 // return zero (0) if copy fails, otherwise 'len'.
5090 //
5091 // This version always returns the number of characters copied, and does not
5092 // clobber the 'len' register. A successful copy will complete with the post-
5093 // condition: 'res' == 'len', while an unsuccessful copy will exit with the
5094 // post-condition: 0 <= 'res' < 'len'.
5095 //
5096 // NOTE: Attempts to use 'ld2' (and 'umaxv' in the ISO part) have proven to
5097 // degrade performance (on Ampere Altra - Neoverse N1), to an extent
5098 // beyond the acceptable, even though the footprint would be smaller.
5099 // Using 'umaxv' in the ASCII-case comes with a small penalty but does
5100 // avoid additional bloat.
5101 //
5102 // Clobbers: src, dst, res, rscratch1, rscratch2, rflags
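//
// A rough scalar sketch of the contract (illustrative only; the generated
// code vectorizes 32 and then 8 chars at a time before falling back to a
// scalar tail like this):
//
//   int encode(const jchar *src, jbyte *dst, int len, bool ascii) {
//     int limit = ascii ? 0x80 : 0x100;
//     int i = 0;
//     for (; i < len && src[i] < limit; i++)
//       dst[i] = (jbyte)src[i];
//     return i;                       // == len iff every char fit
//   }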
5103 void MacroAssembler::encode_iso_array(Register src, Register dst,
5104 Register len, Register res, bool ascii,
5105 FloatRegister vtmp0, FloatRegister vtmp1,
5106 FloatRegister vtmp2, FloatRegister vtmp3,
5107 FloatRegister vtmp4, FloatRegister vtmp5)
5108 {
5109 Register cnt = res;
5110 Register max = rscratch1;
5111 Register chk = rscratch2;
5112
5113 prfm(Address(src), PLDL1STRM);
5114 movw(cnt, len);
5115
5116 #define ASCII(insn) do { if (ascii) { insn; } } while (0)
5117
5118 Label LOOP_32, DONE_32, FAIL_32;
5119
5120 BIND(LOOP_32);
5121 {
5122 cmpw(cnt, 32);
5123 br(LT, DONE_32);
5124 ld1(vtmp0, vtmp1, vtmp2, vtmp3, T8H, Address(post(src, 64)));
5125 // Extract lower bytes.
5126 FloatRegister vlo0 = vtmp4;
5127 FloatRegister vlo1 = vtmp5;
5128 uzp1(vlo0, T16B, vtmp0, vtmp1);
5129 uzp1(vlo1, T16B, vtmp2, vtmp3);
5130 // Merge bits...
5131 orr(vtmp0, T16B, vtmp0, vtmp1);
5132 orr(vtmp2, T16B, vtmp2, vtmp3);
5133 // Extract merged upper bytes.
5134 FloatRegister vhix = vtmp0;
5135 uzp2(vhix, T16B, vtmp0, vtmp2);
5136 // ISO-check on hi-parts (all zero).
5137 // ASCII-check on lo-parts (no sign).
5138 FloatRegister vlox = vtmp1; // Merge lower bytes.
5139 ASCII(orr(vlox, T16B, vlo0, vlo1));
5140 umov(chk, vhix, D, 1); ASCII(cmlt(vlox, T16B, vlox));
5141 fmovd(max, vhix); ASCII(umaxv(vlox, T16B, vlox));
5142 orr(chk, chk, max); ASCII(umov(max, vlox, B, 0));
5143 ASCII(orr(chk, chk, max));
5144 cbnz(chk, FAIL_32);
5145 subw(cnt, cnt, 32);
5146 st1(vlo0, vlo1, T16B, Address(post(dst, 32)));
5147 b(LOOP_32);
5148 }
5149 BIND(FAIL_32);
5150 sub(src, src, 64);
5151 BIND(DONE_32);
5152
5153 Label LOOP_8, SKIP_8;
5154
5155 BIND(LOOP_8);
5156 {
5157 cmpw(cnt, 8);
5158 br(LT, SKIP_8);
5159 FloatRegister vhi = vtmp0;
5160 FloatRegister vlo = vtmp1;
5161 ld1(vtmp3, T8H, src);
5162 uzp1(vlo, T16B, vtmp3, vtmp3);
5163 uzp2(vhi, T16B, vtmp3, vtmp3);
5164 // ISO-check on hi-parts (all zero).
5165 // ASCII-check on lo-parts (no sign).
5166 ASCII(cmlt(vtmp2, T16B, vlo));
5167 fmovd(chk, vhi); ASCII(umaxv(vtmp2, T16B, vtmp2));
5168 ASCII(umov(max, vtmp2, B, 0));
5169 ASCII(orr(chk, chk, max));
5170 cbnz(chk, SKIP_8);
5171
5172 strd(vlo, Address(post(dst, 8)));
5173 subw(cnt, cnt, 8);
5174 add(src, src, 16);
5175 b(LOOP_8);
5176 }
5177 BIND(SKIP_8);
5178
5179 #undef ASCII
5180
5181 Label LOOP, DONE;
5182
5183 cbz(cnt, DONE);
5184 BIND(LOOP);
5185 {
5186 Register chr = rscratch1;
5187 ldrh(chr, Address(post(src, 2)));
5188 tst(chr, ascii ? 0xff80 : 0xff00);
5189 br(NE, DONE);
5190 strb(chr, Address(post(dst, 1)));
5191 subs(cnt, cnt, 1);
5192 br(GT, LOOP);
5193 }
5194 BIND(DONE);
5195 // Return index where we stopped.
5196 subw(res, len, cnt);
5197 }
5198
5199 // Inflate byte[] array to char[].
5200 // Clobbers: src, dst, len, rflags, rscratch1, v0-v6
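//
// A rough scalar sketch (illustrative only; the generated code widens 8
// bytes at a time with zip1 against a zero register and hands very long
// arrays to the large_byte_array_inflate stub):
//
//   void inflate(const jbyte *src, jchar *dst, int len) {
//     for (int i = 0; i < len; i++)
//       dst[i] = (jchar)(src[i] & 0xff);   // zero-extend each byte
//   }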
5201 address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5202 FloatRegister vtmp1, FloatRegister vtmp2,
5203 FloatRegister vtmp3, Register tmp4) {
5204 Label big, done, after_init, to_stub;
5205
5206 assert_different_registers(src, dst, len, tmp4, rscratch1);
5207
5208 fmovd(vtmp1, 0.0);
5209 lsrw(tmp4, len, 3);
5210 bind(after_init);
5211 cbnzw(tmp4, big);
5212 // Short string: less than 8 bytes.
5213 {
5214 Label loop, tiny;
5215
5216 cmpw(len, 4);
5217 br(LT, tiny);
5218 // Use SIMD to do 4 bytes.
5219 ldrs(vtmp2, post(src, 4));
5220 zip1(vtmp3, T8B, vtmp2, vtmp1);
5221 subw(len, len, 4);
5222 strd(vtmp3, post(dst, 8));
5223
5224 cbzw(len, done);
5225
5226 // Do the remaining bytes one at a time.
5227 bind(loop);
5228 ldrb(tmp4, post(src, 1));
5229 strh(tmp4, post(dst, 2));
5230 subw(len, len, 1);
5231
5232 bind(tiny);
5233 cbnz(len, loop);
5234
5235 b(done);
5236 }
5237
5238 if (SoftwarePrefetchHintDistance >= 0) {
5239 bind(to_stub);
5240 RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5241 assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5242 address tpc = trampoline_call(stub);
5243 if (tpc == NULL) {
5244 DEBUG_ONLY(reset_labels(big, done));
5245 postcond(pc() == badAddress);
5246 return NULL;
5247 }
5248 b(after_init);
5249 }
5250
5251 // Unpack the bytes 8 at a time.
5252 bind(big);
5253 {
5254 Label loop, around, loop_last, loop_start;
5255
5256 if (SoftwarePrefetchHintDistance >= 0) {
5257 const int large_loop_threshold = (64 + 16)/8;
5258 ldrd(vtmp2, post(src, 8));
5259 andw(len, len, 7);
5260 cmp(tmp4, (u1)large_loop_threshold);
5261 br(GE, to_stub);
5262 b(loop_start);
5263
5264 bind(loop);
5265 ldrd(vtmp2, post(src, 8));
5266 bind(loop_start);
5267 subs(tmp4, tmp4, 1);
5268 br(EQ, loop_last);
5269 zip1(vtmp2, T16B, vtmp2, vtmp1);
5270 ldrd(vtmp3, post(src, 8));
5271 st1(vtmp2, T8H, post(dst, 16));
5272 subs(tmp4, tmp4, 1);
5273 zip1(vtmp3, T16B, vtmp3, vtmp1);
5274 st1(vtmp3, T8H, post(dst, 16));
5275 br(NE, loop);
5276 b(around);
5277 bind(loop_last);
5278 zip1(vtmp2, T16B, vtmp2, vtmp1);
5279 st1(vtmp2, T8H, post(dst, 16));
5280 bind(around);
5281 cbz(len, done);
5282 } else {
5283 andw(len, len, 7);
5284 bind(loop);
5285 ldrd(vtmp2, post(src, 8));
5286 sub(tmp4, tmp4, 1);
5287 zip1(vtmp3, T16B, vtmp2, vtmp1);
5288 st1(vtmp3, T8H, post(dst, 16));
5289 cbnz(tmp4, loop);
5290 }
5291 }
5292
5293 // Do the tail of up to 8 bytes.
5294 add(src, src, len);
5295 ldrd(vtmp3, Address(src, -8));
5296 add(dst, dst, len, ext::uxtw, 1);
5297 zip1(vtmp3, T16B, vtmp3, vtmp1);
5298 strq(vtmp3, Address(dst, -16));
5299
5300 bind(done);
5301 postcond(pc() != badAddress);
5302 return pc();
5303 }
5304
5305 // Compress char[] array to byte[].
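// A rough sketch of the contract (illustrative only): encode_iso_array()
// returns the number of chars actually copied, and the csel below turns
// that into 'len' on full success and 0 on any failure:
//
//   res = (copied == len) ? len : 0;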
5306 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5307 Register res,
5308 FloatRegister tmp0, FloatRegister tmp1,
5309 FloatRegister tmp2, FloatRegister tmp3,
5310 FloatRegister tmp4, FloatRegister tmp5) {
5311 encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5);
5312 // Adjust result: res == len ? len : 0
5313 cmp(len, res);
5314 csel(res, res, zr, EQ);
5315 }
5316
5317 // get_thread() can be called anywhere inside generated code so we
5318 // need to save whatever non-callee save context might get clobbered
5319 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5320 // the call setup code.
5321 //
5322 // On Linux, aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5323 // On other systems, the helper is an ordinary C function that may clobber any caller-saved register.
5324 //
5325 void MacroAssembler::get_thread(Register dst) {
5326 RegSet saved_regs =
5327 LINUX_ONLY(RegSet::range(r0, r1) + lr - dst)
5328 NOT_LINUX (RegSet::range(r0, r17) + lr - dst);
5329
5330 push(saved_regs, sp);
5331
5332 mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5333 blr(lr);
5334 if (dst != c_rarg0) {
5335 mov(dst, c_rarg0);
5336 }
5337
5338 pop(saved_regs, sp);
5339 }
5340
5341 void MacroAssembler::cache_wb(Address line) {
5342 assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
5343 assert(line.index() == noreg, "index should be noreg");
5344 assert(line.offset() == 0, "offset should be 0");
5345 // would like to assert this
5346 // assert(line._ext.shift == 0, "shift should be zero");
5347 if (VM_Version::features() & VM_Version::CPU_DCPOP) {
5348 // writeback using clear virtual address to point of persistence
5349 dc(Assembler::CVAP, line.base());
5350 } else {
5351 // no need to generate anything as Unsafe.writebackMemory should
5352 // never invoke this stub
5353 }
5354 }
5355
5356 void MacroAssembler::cache_wbsync(bool is_pre) {
5357 // we only need a barrier post sync
5358 if (!is_pre) {
5359 membar(Assembler::AnyAny);
5360 }
5361 }
5362
5363 void MacroAssembler::verify_sve_vector_length() {
5364 // Make sure that native code does not change SVE vector length.
5365 if (!UseSVE) return;
5366 Label verify_ok;
5367 movw(rscratch1, zr);
5368 sve_inc(rscratch1, B);
5369 subsw(zr, rscratch1, VM_Version::get_initial_sve_vector_length());
5370 br(EQ, verify_ok);
5371 stop("Error: SVE vector length has changed since jvm startup");
5372 bind(verify_ok);
5373 }
5374
5375 void MacroAssembler::verify_ptrue() {
5376 Label verify_ok;
5377 if (!UseSVE) {
5378 return;
5379 }
5380 sve_cntp(rscratch1, B, ptrue, ptrue); // get true elements count.
5381 sve_dec(rscratch1, B);
5382 cbz(rscratch1, verify_ok);
5383 stop("Error: the preserved predicate register (p7) elements are not all true");
5384 bind(verify_ok);
5385 }
5386
5387 void MacroAssembler::safepoint_isb() {
5388 isb();
5389 #ifndef PRODUCT
5390 if (VerifyCrossModifyFence) {
5391 // Clear the thread state.
5392 strb(zr, Address(rthread, in_bytes(JavaThread::requires_cross_modify_fence_offset())));
5393 }
5394 #endif
5395 }
5396
5397 #ifndef PRODUCT
5398 void MacroAssembler::verify_cross_modify_fence_not_required() {
5399 if (VerifyCrossModifyFence) {
5400 // Check if thread needs a cross modify fence.
5401 ldrb(rscratch1, Address(rthread, in_bytes(JavaThread::requires_cross_modify_fence_offset())));
5402 Label fence_not_required;
5403 cbz(rscratch1, fence_not_required);
5404 // If it does then fail.
5405 lea(rscratch1, CAST_FROM_FN_PTR(address, JavaThread::verify_cross_modify_fence_failure));
5406 mov(c_rarg0, rthread);
5407 blr(rscratch1);
5408 bind(fence_not_required);
5409 }
5410 }
5411 #endif
5412
5413 void MacroAssembler::spin_wait() {
5414 for (int i = 0; i < VM_Version::spin_wait_desc().inst_count(); ++i) {
5415 switch (VM_Version::spin_wait_desc().inst()) {
5416 case SpinWait::NOP:
5417 nop();
5418 break;
5419 case SpinWait::ISB:
5420 isb();
5421 break;
5422 case SpinWait::YIELD:
5423 yield();
5424 break;
5425 default:
5426 ShouldNotReachHere();
5427 }
5428 }
5429 }
5430
5431 // Implements lightweight-locking.
5432 //
5433 // - obj: the object to be locked
5434 // - t1, t2, t3: temporary registers, will be destroyed
5435 // - slow: branched to if locking fails; the absolute offset may be larger than 32KB (imm14 encoding).
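//
// A rough sketch of the fast path (illustrative only; the lock_stack and
// mark-word operations are shorthand for the loads/stores emitted below):
//
//   if (lock_stack is full)                          goto slow;
//   if (top of lock_stack == obj)                    goto push;  // recursion
//   if (mark has the monitor bit (0b10) set)         goto slow;
//   if (!CAS(obj->mark, mark | 0b01, mark & ~0b01))  goto slow;  // 0b01 -> 0b00
//  push:
//   lock_stack.push(obj);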
5436 void MacroAssembler::lightweight_lock(Register obj, Register t1, Register t2, Register t3, Label& slow) {
5437 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
5438 assert_different_registers(obj, t1, t2, t3, rscratch1);
5439
5440 Label push;
5441 const Register top = t1;
5442 const Register mark = t2;
5443 const Register t = t3;
5444
5445 // Preload the markWord. It is important that this is the first
5446 // instruction emitted as it is part of C1's null check semantics.
5447 ldr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
5448
5449 // Check if the lock-stack is full.
5450 ldrw(top, Address(rthread, JavaThread::lock_stack_top_offset()));
5451 cmpw(top, (unsigned)LockStack::end_offset());
5452 br(Assembler::GE, slow);
5453
5454 // Check for recursion.
5455 subw(t, top, oopSize);
5456 ldr(t, Address(rthread, t));
5457 cmp(obj, t);
5458 br(Assembler::EQ, push);
5459
5460 // Check header for monitor (0b10).
5461 tst(mark, markWord::monitor_value);
5462 br(Assembler::NE, slow);
5463
5464 // Try to lock. Transition lock bits 0b01 => 0b00
5465 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
5466 orr(mark, mark, markWord::unlocked_value);
5467 eor(t, mark, markWord::unlocked_value);
5468 cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::xword,
5469 /*acquire*/ true, /*release*/ false, /*weak*/ false, noreg);
5470 br(Assembler::NE, slow);
5471
5472 bind(push);
5473 // After successful lock, push object on lock-stack.
5474 str(obj, Address(rthread, top));
5475 addw(top, top, oopSize);
5476 strw(top, Address(rthread, JavaThread::lock_stack_top_offset()));
5477 }
5478
5479 // Implements lightweight-unlocking.
5480 //
5481 // - obj: the object to be unlocked
5482 // - t1, t2, t3: temporary registers
5483 // - slow: branched to if unlocking fails; the absolute offset may be larger than 32KB (imm14 encoding).
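//
// A rough sketch of the fast path (illustrative only; shorthand as above):
//
//   if (top of lock_stack != obj)               goto slow;
//   lock_stack.pop();
//   if (new top of lock_stack == obj)           return;      // recursion
//   if (mark has the monitor bit (0b10) set)    { lock_stack.push(obj); goto slow; }
//   if (!CAS(obj->mark, mark, mark | 0b01))     { lock_stack.push(obj); goto slow; }  // 0b00 -> 0b01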
5484 void MacroAssembler::lightweight_unlock(Register obj, Register t1, Register t2, Register t3, Label& slow) {
5485 assert(LockingMode == LM_LIGHTWEIGHT, "only used with new lightweight locking");
5486 // cmpxchg clobbers rscratch1.
5487 assert_different_registers(obj, t1, t2, t3, rscratch1);
5488
5489 #ifdef ASSERT
5490 {
5491 // Check for lock-stack underflow.
5492 Label stack_ok;
5493 ldrw(t1, Address(rthread, JavaThread::lock_stack_top_offset()));
5494 cmpw(t1, (unsigned)LockStack::start_offset());
5495 br(Assembler::GE, stack_ok);
5496 STOP("Lock-stack underflow");
5497 bind(stack_ok);
5498 }
5499 #endif
5500
5501 Label unlocked, push_and_slow;
5502 const Register top = t1;
5503 const Register mark = t2;
5504 const Register t = t3;
5505
5506 // Check if obj is top of lock-stack.
5507 ldrw(top, Address(rthread, JavaThread::lock_stack_top_offset()));
5508 subw(top, top, oopSize);
5509 ldr(t, Address(rthread, top));
5510 cmp(obj, t);
5511 br(Assembler::NE, slow);
5512
5513 // Pop lock-stack.
5514 DEBUG_ONLY(str(zr, Address(rthread, top));)
5515 strw(top, Address(rthread, JavaThread::lock_stack_top_offset()));
5516
5517 // Check if recursive.
5518 subw(t, top, oopSize);
5519 ldr(t, Address(rthread, t));
5520 cmp(obj, t);
5521 br(Assembler::EQ, unlocked);
5522
5523 // Not recursive. Check header for monitor (0b10).
5524 ldr(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
5525 tbnz(mark, log2i_exact(markWord::monitor_value), push_and_slow);
5526
5527 #ifdef ASSERT
5528 // Check header not unlocked (0b01).
5529 Label not_unlocked;
5530 tbz(mark, log2i_exact(markWord::unlocked_value), not_unlocked);
5531 stop("lightweight_unlock already unlocked");
5532 bind(not_unlocked);
5533 #endif
5534
5535 // Try to unlock. Transition lock bits 0b00 => 0b01
5536 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
5537 orr(t, mark, markWord::unlocked_value);
5538 cmpxchg(obj, mark, t, Assembler::xword,
5539 /*acquire*/ false, /*release*/ true, /*weak*/ false, noreg);
5540 br(Assembler::EQ, unlocked);
5541
5542 bind(push_and_slow);
5543 // Restore lock-stack and handle the unlock in runtime.
5544 DEBUG_ONLY(str(obj, Address(rthread, top));)
5545 addw(top, top, oopSize);
5546 strw(top, Address(rthread, JavaThread::lock_stack_top_offset()));
5547 b(slow);
5548
5549 bind(unlocked);
5550 }