1 /*
2 * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
4 * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 *
7 * This code is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 only, as
9 * published by the Free Software Foundation.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 *
25 */
26
27 #include "asm/assembler.hpp"
28 #include "asm/assembler.inline.hpp"
29 #include "code/compiledIC.hpp"
30 #include "compiler/disassembler.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/cardTable.hpp"
34 #include "gc/shared/cardTableBarrierSet.hpp"
35 #include "gc/shared/collectedHeap.hpp"
36 #include "interpreter/bytecodeHistogram.hpp"
37 #include "interpreter/interpreter.hpp"
38 #include "interpreter/interpreterRuntime.hpp"
39 #include "memory/resourceArea.hpp"
40 #include "memory/universe.hpp"
41 #include "oops/accessDecorators.hpp"
42 #include "oops/compressedKlass.inline.hpp"
43 #include "oops/compressedOops.inline.hpp"
44 #include "oops/klass.inline.hpp"
45 #include "oops/oop.hpp"
46 #include "runtime/interfaceSupport.inline.hpp"
47 #include "runtime/javaThread.hpp"
48 #include "runtime/jniHandles.inline.hpp"
49 #include "runtime/sharedRuntime.hpp"
50 #include "runtime/stubRoutines.hpp"
51 #include "utilities/globalDefinitions.hpp"
52 #include "utilities/powerOfTwo.hpp"
53 #ifdef COMPILER2
54 #include "opto/compile.hpp"
55 #include "opto/node.hpp"
56 #include "opto/output.hpp"
57 #endif
58
59 #ifdef PRODUCT
60 #define BLOCK_COMMENT(str) /* nothing */
61 #else
62 #define BLOCK_COMMENT(str) block_comment(str)
63 #endif
64 #define STOP(str) stop(str);
65 #define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
66
67
68
69 Register MacroAssembler::extract_rs1(address instr) {
70 assert_cond(instr != nullptr);
71 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
72 }
73
74 Register MacroAssembler::extract_rs2(address instr) {
75 assert_cond(instr != nullptr);
76 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
77 }
78
79 Register MacroAssembler::extract_rd(address instr) {
80 assert_cond(instr != nullptr);
81 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
82 }
83
84 uint32_t MacroAssembler::extract_opcode(address instr) {
85 assert_cond(instr != nullptr);
86 return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
87 }
88
89 uint32_t MacroAssembler::extract_funct3(address instr) {
90 assert_cond(instr != nullptr);
91 return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
92 }
93
94 bool MacroAssembler::is_pc_relative_at(address instr) {
95 // auipc + jalr
96 // auipc + addi
97 // auipc + load
  // auipc + float_load
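  //
  // Illustrative shape of one matched pattern (not emitted here):
  //   auipc t0, hi20          // t0 = pc + (hi20 << 12)
  //   addi  t0, t0, lo12      // t0 = target address
  // The data-dependency check below ties the two instructions together, so an unrelated
  // auipc/addi pair is not mistaken for a single pc-relative sequence.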
99 return (is_auipc_at(instr)) &&
100 (is_addi_at(instr + MacroAssembler::instruction_size) ||
101 is_jalr_at(instr + MacroAssembler::instruction_size) ||
102 is_load_at(instr + MacroAssembler::instruction_size) ||
103 is_float_load_at(instr + MacroAssembler::instruction_size)) &&
104 check_pc_relative_data_dependency(instr);
105 }
106
// i.e. ld(Rd, Label)
108 bool MacroAssembler::is_load_pc_relative_at(address instr) {
109 return is_auipc_at(instr) && // auipc
110 is_ld_at(instr + MacroAssembler::instruction_size) && // ld
111 check_load_pc_relative_data_dependency(instr);
112 }
113
114 bool MacroAssembler::is_movptr1_at(address instr) {
115 return is_lui_at(instr) && // Lui
116 is_addi_at(instr + MacroAssembler::instruction_size) && // Addi
117 is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 11) && // Slli Rd, Rs, 11
118 is_addi_at(instr + MacroAssembler::instruction_size * 3) && // Addi
119 is_slli_shift_at(instr + MacroAssembler::instruction_size * 4, 6) && // Slli Rd, Rs, 6
120 (is_addi_at(instr + MacroAssembler::instruction_size * 5) ||
121 is_jalr_at(instr + MacroAssembler::instruction_size * 5) ||
122 is_load_at(instr + MacroAssembler::instruction_size * 5)) && // Addi/Jalr/Load
123 check_movptr1_data_dependency(instr);
124 }
125
126 bool MacroAssembler::is_movptr2_at(address instr) {
127 return is_lui_at(instr) && // lui
128 is_lui_at(instr + MacroAssembler::instruction_size) && // lui
129 is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 18) && // slli Rd, Rs, 18
130 is_add_at(instr + MacroAssembler::instruction_size * 3) &&
131 (is_addi_at(instr + MacroAssembler::instruction_size * 4) ||
132 is_jalr_at(instr + MacroAssembler::instruction_size * 4) ||
133 is_load_at(instr + MacroAssembler::instruction_size * 4)) && // Addi/Jalr/Load
134 check_movptr2_data_dependency(instr);
135 }
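
// Note: the two matchers above simply mirror the fixed shapes emitted by movptr1/movptr2:
// movptr1 is lui; addi; slli 11; addi; slli 6 followed by an addi/jalr/load, while
// movptr2 is lui; lui; slli 18; add followed by an addi/jalr/load, with the
// data-dependency checks confirming that the pieces really feed into each other.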
136
137 bool MacroAssembler::is_li16u_at(address instr) {
138 return is_lui_at(instr) && // lui
139 is_srli_at(instr + MacroAssembler::instruction_size) && // srli
140 check_li16u_data_dependency(instr);
141 }
142
143 bool MacroAssembler::is_li32_at(address instr) {
144 return is_lui_at(instr) && // lui
145 is_addiw_at(instr + MacroAssembler::instruction_size) && // addiw
146 check_li32_data_dependency(instr);
147 }
148
149 bool MacroAssembler::is_lwu_to_zr(address instr) {
150 assert_cond(instr != nullptr);
151 return (extract_opcode(instr) == 0b0000011 &&
152 extract_funct3(instr) == 0b110 &&
153 extract_rd(instr) == zr); // zr
154 }
155
156 uint32_t MacroAssembler::get_membar_kind(address addr) {
157 assert_cond(addr != nullptr);
158 assert(is_membar(addr), "no membar found");
159
160 uint32_t insn = Bytes::get_native_u4(addr);
161
162 uint32_t predecessor = Assembler::extract(insn, 27, 24);
163 uint32_t successor = Assembler::extract(insn, 23, 20);
164
165 return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
166 }
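
// Note on the encoding used above and below: in the RISC-V fence instruction the
// predecessor set lives in bits [27:24] and the successor set in bits [23:20] (one bit
// each for device input, device output, memory reads and memory writes), which is why
// both helpers extract and patch exactly those two 4-bit fields.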
167
168 void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
169 assert_cond(addr != nullptr);
170 assert(is_membar(addr), "no membar found");
171
172 uint32_t predecessor = 0;
173 uint32_t successor = 0;
174
175 MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);
176
177 uint32_t insn = Bytes::get_native_u4(addr);
178 address pInsn = (address) &insn;
179 Assembler::patch(pInsn, 27, 24, predecessor);
180 Assembler::patch(pInsn, 23, 20, successor);
181
182 address membar = addr;
183 Assembler::sd_instr(membar, insn);
184 }
185
186 static void pass_arg0(MacroAssembler* masm, Register arg) {
187 if (c_rarg0 != arg) {
188 masm->mv(c_rarg0, arg);
189 }
190 }
191
192 static void pass_arg1(MacroAssembler* masm, Register arg) {
193 if (c_rarg1 != arg) {
194 masm->mv(c_rarg1, arg);
195 }
196 }
197
198 static void pass_arg2(MacroAssembler* masm, Register arg) {
199 if (c_rarg2 != arg) {
200 masm->mv(c_rarg2, arg);
201 }
202 }
203
204 static void pass_arg3(MacroAssembler* masm, Register arg) {
205 if (c_rarg3 != arg) {
206 masm->mv(c_rarg3, arg);
207 }
208 }
209
210 void MacroAssembler::push_cont_fastpath(Register java_thread) {
211 if (!Continuations::enabled()) return;
212 Label done;
213 ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
214 bleu(sp, t0, done);
215 sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
216 bind(done);
217 }
218
219 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
220 if (!Continuations::enabled()) return;
221 Label done;
222 ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
223 bltu(sp, t0, done);
224 sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
225 bind(done);
226 }
227
228 int MacroAssembler::align(int modulus, int extra_offset) {
229 CompressibleScope scope(this);
230 intptr_t before = offset();
231 while ((offset() + extra_offset) % modulus != 0) { nop(); }
232 return (int)(offset() - before);
233 }
234
235 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
236 call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
237 }
238
239 // Implementation of call_VM versions
240
241 void MacroAssembler::call_VM(Register oop_result,
242 address entry_point,
243 bool check_exceptions) {
244 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
245 }
246
247 void MacroAssembler::call_VM(Register oop_result,
248 address entry_point,
249 Register arg_1,
250 bool check_exceptions) {
251 pass_arg1(this, arg_1);
252 call_VM_helper(oop_result, entry_point, 1, check_exceptions);
253 }
254
255 void MacroAssembler::call_VM(Register oop_result,
256 address entry_point,
257 Register arg_1,
258 Register arg_2,
259 bool check_exceptions) {
260 assert_different_registers(arg_1, c_rarg2);
261 pass_arg2(this, arg_2);
262 pass_arg1(this, arg_1);
263 call_VM_helper(oop_result, entry_point, 2, check_exceptions);
264 }
265
266 void MacroAssembler::call_VM(Register oop_result,
267 address entry_point,
268 Register arg_1,
269 Register arg_2,
270 Register arg_3,
271 bool check_exceptions) {
272 assert_different_registers(arg_1, c_rarg2, c_rarg3);
273 assert_different_registers(arg_2, c_rarg3);
274 pass_arg3(this, arg_3);
275
276 pass_arg2(this, arg_2);
277
278 pass_arg1(this, arg_1);
279 call_VM_helper(oop_result, entry_point, 3, check_exceptions);
280 }
281
282 void MacroAssembler::call_VM(Register oop_result,
283 Register last_java_sp,
284 address entry_point,
285 int number_of_arguments,
286 bool check_exceptions) {
287 call_VM_base(oop_result, xthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
288 }
289
290 void MacroAssembler::call_VM(Register oop_result,
291 Register last_java_sp,
292 address entry_point,
293 Register arg_1,
294 bool check_exceptions) {
295 pass_arg1(this, arg_1);
296 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
297 }
298
299 void MacroAssembler::call_VM(Register oop_result,
300 Register last_java_sp,
301 address entry_point,
302 Register arg_1,
303 Register arg_2,
304 bool check_exceptions) {
305
306 assert_different_registers(arg_1, c_rarg2);
307 pass_arg2(this, arg_2);
308 pass_arg1(this, arg_1);
309 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
310 }
311
312 void MacroAssembler::call_VM(Register oop_result,
313 Register last_java_sp,
314 address entry_point,
315 Register arg_1,
316 Register arg_2,
317 Register arg_3,
318 bool check_exceptions) {
319 assert_different_registers(arg_1, c_rarg2, c_rarg3);
320 assert_different_registers(arg_2, c_rarg3);
321 pass_arg3(this, arg_3);
322 pass_arg2(this, arg_2);
323 pass_arg1(this, arg_1);
324 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
325 }
326
327 void MacroAssembler::post_call_nop() {
328 assert(!in_compressible_scope(), "Must be");
329 assert_alignment(pc());
330 if (!Continuations::enabled()) {
331 return;
332 }
333 relocate(post_call_nop_Relocation::spec());
334 InlineSkippedInstructionsCounter skipCounter(this);
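  // The three instructions below (nop, then the lui + addiw pair emitted by li32) form
  // a fixed-size window at the post-call site. Targeting the zero register keeps them
  // architectural no-ops; the relocation above is what lets the runtime find this spot
  // again.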
335 nop();
336 li32(zr, 0);
337 }
338
339 // these are no-ops overridden by InterpreterMacroAssembler
340 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
341 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
342
343 // Calls to C land
344 //
// When entering C land, the fp & esp of the last Java frame have to be recorded
346 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
347 // has to be reset to 0. This is required to allow proper stack traversal.
348 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
349 Register last_java_fp,
350 Register last_java_pc) {
351
352 if (last_java_pc->is_valid()) {
353 sd(last_java_pc, Address(xthread,
354 JavaThread::frame_anchor_offset() +
355 JavaFrameAnchor::last_Java_pc_offset()));
356 }
357
358 // determine last_java_sp register
359 if (!last_java_sp->is_valid()) {
360 last_java_sp = esp;
361 }
362
363 // last_java_fp is optional
364 if (last_java_fp->is_valid()) {
365 sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
366 }
367
368 // We must set sp last.
369 sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
370
371 }
372
373 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
374 Register last_java_fp,
375 address last_java_pc,
376 Register tmp) {
377 assert(last_java_pc != nullptr, "must provide a valid PC");
378
379 la(tmp, last_java_pc);
380 sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
381
382 set_last_Java_frame(last_java_sp, last_java_fp, noreg);
383 }
384
385 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
386 Register last_java_fp,
387 Label &L,
388 Register tmp) {
389 if (L.is_bound()) {
390 set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
391 } else {
392 L.add_patch_at(code(), locator());
393 IncompressibleScope scope(this); // the label address will be patched back.
394 set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
395 }
396 }
397
398 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
399 // we must set sp to zero to clear frame
400 sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
401
402 // must clear fp, so that compiled frames are not confused; it is
403 // possible that we need it only for debugging
404 if (clear_fp) {
405 sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
406 }
407
408 // Always clear the pc because it could have been set by make_walkable()
409 sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
410 }
411
412 static bool is_preemptable(address entry_point) {
413 return entry_point == CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter);
414 }
415
416 void MacroAssembler::call_VM_base(Register oop_result,
417 Register java_thread,
418 Register last_java_sp,
419 address entry_point,
420 int number_of_arguments,
421 bool check_exceptions) {
422 // determine java_thread register
423 if (!java_thread->is_valid()) {
424 java_thread = xthread;
425 }
426 // determine last_java_sp register
427 if (!last_java_sp->is_valid()) {
428 last_java_sp = esp;
429 }
430
431 // debugging support
432 assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
433 assert(java_thread == xthread, "unexpected register");
434
435 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
436 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
437
438 // push java thread (becomes first argument of C function)
439 mv(c_rarg0, java_thread);
440
441 // set last Java frame before call
442 assert(last_java_sp != fp, "can't use fp");
443
444 Label l;
445 if (is_preemptable(entry_point)) {
446 // skip setting last_pc since we already set it to desired value.
447 set_last_Java_frame(last_java_sp, fp, noreg);
448 } else {
449 set_last_Java_frame(last_java_sp, fp, l, t0);
450 }
451
452 // do the call, remove parameters
453 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
454
455 // reset last Java frame
456 // Only interpreter should have to clear fp
457 reset_last_Java_frame(true);
458
459 // C++ interp handles this in the interpreter
460 check_and_handle_popframe(java_thread);
461 check_and_handle_earlyret(java_thread);
462
463 if (check_exceptions) {
464 // check for pending exceptions (java_thread is set upon return)
465 ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
466 Label ok;
467 beqz(t0, ok);
468 j(RuntimeAddress(StubRoutines::forward_exception_entry()));
469 bind(ok);
470 }
471
472 // get oop result if there is one and reset the value in the thread
473 if (oop_result->is_valid()) {
474 get_vm_result_oop(oop_result, java_thread);
475 }
476 }
477
478 void MacroAssembler::get_vm_result_oop(Register oop_result, Register java_thread) {
479 ld(oop_result, Address(java_thread, JavaThread::vm_result_oop_offset()));
480 sd(zr, Address(java_thread, JavaThread::vm_result_oop_offset()));
481 verify_oop_msg(oop_result, "broken oop in call_VM_base");
482 }
483
484 void MacroAssembler::get_vm_result_metadata(Register metadata_result, Register java_thread) {
485 ld(metadata_result, Address(java_thread, JavaThread::vm_result_metadata_offset()));
486 sd(zr, Address(java_thread, JavaThread::vm_result_metadata_offset()));
487 }
488
489 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
490 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
491 assert_different_registers(klass, xthread, tmp);
492
493 Label L_fallthrough, L_tmp;
494 if (L_fast_path == nullptr) {
495 L_fast_path = &L_fallthrough;
496 } else if (L_slow_path == nullptr) {
497 L_slow_path = &L_fallthrough;
498 }
499
500 // Fast path check: class is fully initialized
501 lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
502 membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
503 sub(tmp, tmp, InstanceKlass::fully_initialized);
504 beqz(tmp, *L_fast_path);
505
506 // Fast path check: current thread is initializer thread
507 ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
508
509 if (L_slow_path == &L_fallthrough) {
510 beq(xthread, tmp, *L_fast_path);
511 bind(*L_slow_path);
512 } else if (L_fast_path == &L_fallthrough) {
513 bne(xthread, tmp, *L_slow_path);
514 bind(*L_fast_path);
515 } else {
516 Unimplemented();
517 }
518 }
519
520 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
521 if (!VerifyOops) { return; }
522
523 // Pass register number to verify_oop_subroutine
524 const char* b = nullptr;
525 {
526 ResourceMark rm;
527 stringStream ss;
528 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
529 b = code_string(ss.as_string());
530 }
531 BLOCK_COMMENT("verify_oop {");
532
533 push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
534
535 mv(c_rarg0, reg); // c_rarg0 : x10
536 {
537 // The length of the instruction sequence emitted should not depend
538 // on the address of the char buffer so that the size of mach nodes for
539 // scratch emit and normal emit matches.
540 IncompressibleScope scope(this); // Fixed length
541 movptr(t0, (address) b);
542 }
543
544 // Call indirectly to solve generation ordering problem
545 ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
546 jalr(t1);
547
548 pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
549
550 BLOCK_COMMENT("} verify_oop");
551 }
552
553 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
554 if (!VerifyOops) {
555 return;
556 }
557
558 const char* b = nullptr;
559 {
560 ResourceMark rm;
561 stringStream ss;
562 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
563 b = code_string(ss.as_string());
564 }
565 BLOCK_COMMENT("verify_oop_addr {");
566
567 push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
568
569 if (addr.uses(sp)) {
570 la(x10, addr);
571 ld(x10, Address(x10, 4 * wordSize));
572 } else {
573 ld(x10, addr);
574 }
575
576 {
577 // The length of the instruction sequence emitted should not depend
578 // on the address of the char buffer so that the size of mach nodes for
579 // scratch emit and normal emit matches.
580 IncompressibleScope scope(this); // Fixed length
581 movptr(t0, (address) b);
582 }
583
584 // Call indirectly to solve generation ordering problem
585 ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
586 jalr(t1);
587
588 pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
589
590 BLOCK_COMMENT("} verify_oop_addr");
591 }
592
593 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
594 int extra_slot_offset) {
595 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
596 int stackElementSize = Interpreter::stackElementSize;
597 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
598 #ifdef ASSERT
599 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
600 assert(offset1 - offset == stackElementSize, "correct arithmetic");
601 #endif
602 if (arg_slot.is_constant()) {
603 return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
604 } else {
605 assert_different_registers(t0, arg_slot.as_register());
606 shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
607 return Address(t0, offset);
608 }
609 }
610
611 #ifndef PRODUCT
612 extern "C" void findpc(intptr_t x);
613 #endif
614
615 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
616 {
  // In order to get locks to work, we need to fake an in_VM state
618 if (ShowMessageBoxOnError) {
619 JavaThread* thread = JavaThread::current();
620 JavaThreadState saved_state = thread->thread_state();
621 thread->set_thread_state(_thread_in_vm);
622 #ifndef PRODUCT
623 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
624 ttyLocker ttyl;
625 BytecodeCounter::print();
626 }
627 #endif
628 if (os::message_box(msg, "Execution stopped, print registers?")) {
629 ttyLocker ttyl;
630 tty->print_cr(" pc = 0x%016lx", pc);
631 #ifndef PRODUCT
632 tty->cr();
633 findpc(pc);
634 tty->cr();
635 #endif
636 tty->print_cr(" x0 = 0x%016lx", regs[0]);
637 tty->print_cr(" x1 = 0x%016lx", regs[1]);
638 tty->print_cr(" x2 = 0x%016lx", regs[2]);
639 tty->print_cr(" x3 = 0x%016lx", regs[3]);
640 tty->print_cr(" x4 = 0x%016lx", regs[4]);
641 tty->print_cr(" x5 = 0x%016lx", regs[5]);
642 tty->print_cr(" x6 = 0x%016lx", regs[6]);
643 tty->print_cr(" x7 = 0x%016lx", regs[7]);
644 tty->print_cr(" x8 = 0x%016lx", regs[8]);
645 tty->print_cr(" x9 = 0x%016lx", regs[9]);
646 tty->print_cr("x10 = 0x%016lx", regs[10]);
647 tty->print_cr("x11 = 0x%016lx", regs[11]);
648 tty->print_cr("x12 = 0x%016lx", regs[12]);
649 tty->print_cr("x13 = 0x%016lx", regs[13]);
650 tty->print_cr("x14 = 0x%016lx", regs[14]);
651 tty->print_cr("x15 = 0x%016lx", regs[15]);
652 tty->print_cr("x16 = 0x%016lx", regs[16]);
653 tty->print_cr("x17 = 0x%016lx", regs[17]);
654 tty->print_cr("x18 = 0x%016lx", regs[18]);
655 tty->print_cr("x19 = 0x%016lx", regs[19]);
656 tty->print_cr("x20 = 0x%016lx", regs[20]);
657 tty->print_cr("x21 = 0x%016lx", regs[21]);
658 tty->print_cr("x22 = 0x%016lx", regs[22]);
659 tty->print_cr("x23 = 0x%016lx", regs[23]);
660 tty->print_cr("x24 = 0x%016lx", regs[24]);
661 tty->print_cr("x25 = 0x%016lx", regs[25]);
662 tty->print_cr("x26 = 0x%016lx", regs[26]);
663 tty->print_cr("x27 = 0x%016lx", regs[27]);
664 tty->print_cr("x28 = 0x%016lx", regs[28]);
      tty->print_cr("x29 = 0x%016lx", regs[29]);
      tty->print_cr("x30 = 0x%016lx", regs[30]);
666 tty->print_cr("x31 = 0x%016lx", regs[31]);
667 BREAKPOINT;
668 }
669 }
670 fatal("DEBUG MESSAGE: %s", msg);
671 }
672
673 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
674 assert_different_registers(value, tmp1, tmp2);
675 Label done, tagged, weak_tagged;
676
677 beqz(value, done); // Use null as-is.
678 // Test for tag.
679 andi(tmp1, value, JNIHandles::tag_mask);
680 bnez(tmp1, tagged);
681
682 // Resolve local handle
683 access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
684 verify_oop(value);
685 j(done);
686
687 bind(tagged);
688 // Test for jweak tag.
689 STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
690 test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
691 bnez(tmp1, weak_tagged);
692
693 // Resolve global handle
694 access_load_at(T_OBJECT, IN_NATIVE, value,
695 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
696 verify_oop(value);
697 j(done);
698
699 bind(weak_tagged);
700 // Resolve jweak.
701 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
702 Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
703 verify_oop(value);
704
705 bind(done);
706 }
707
708 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
709 assert_different_registers(value, tmp1, tmp2);
710 Label done;
711
712 beqz(value, done); // Use null as-is.
713
714 #ifdef ASSERT
715 {
716 STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
717 Label valid_global_tag;
718 test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
719 bnez(tmp1, valid_global_tag);
720 stop("non global jobject using resolve_global_jobject");
721 bind(valid_global_tag);
722 }
723 #endif
724
725 // Resolve global handle
726 access_load_at(T_OBJECT, IN_NATIVE, value,
727 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
728 verify_oop(value);
729
730 bind(done);
731 }
732
733 void MacroAssembler::stop(const char* msg) {
734 BLOCK_COMMENT(msg);
735 illegal_instruction(Assembler::csr::time);
736 emit_int64((uintptr_t)msg);
737 }
738
739 void MacroAssembler::unimplemented(const char* what) {
740 const char* buf = nullptr;
741 {
742 ResourceMark rm;
743 stringStream ss;
744 ss.print("unimplemented: %s", what);
745 buf = code_string(ss.as_string());
746 }
747 stop(buf);
748 }
749
750 void MacroAssembler::emit_static_call_stub() {
751 IncompressibleScope scope(this); // Fixed length: see CompiledDirectCall::to_interp_stub_size().
752 // CompiledDirectCall::set_to_interpreted knows the
753 // exact layout of this stub.
754
755 mov_metadata(xmethod, (Metadata*)nullptr);
756
757 // Jump to the entry point of the c2i stub.
758 int32_t offset = 0;
759 movptr2(t1, 0, offset, t0); // lui + lui + slli + add
760 jr(t1, offset);
761 }
762
763 void MacroAssembler::call_VM_leaf_base(address entry_point,
764 int number_of_arguments,
765 Label *retaddr) {
766 int32_t offset = 0;
767 push_reg(RegSet::of(t1, xmethod), sp); // push << t1 & xmethod >> to sp
768 movptr(t1, entry_point, offset, t0);
769 jalr(t1, offset);
770 if (retaddr != nullptr) {
771 bind(*retaddr);
772 }
773 pop_reg(RegSet::of(t1, xmethod), sp); // pop << t1 & xmethod >> from sp
774 }
775
776 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
777 call_VM_leaf_base(entry_point, number_of_arguments);
778 }
779
780 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
781 pass_arg0(this, arg_0);
782 call_VM_leaf_base(entry_point, 1);
783 }
784
785 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
786 assert_different_registers(arg_1, c_rarg0);
787 pass_arg0(this, arg_0);
788 pass_arg1(this, arg_1);
789 call_VM_leaf_base(entry_point, 2);
790 }
791
792 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
793 Register arg_1, Register arg_2) {
794 assert_different_registers(arg_1, c_rarg0);
795 assert_different_registers(arg_2, c_rarg0, c_rarg1);
796 pass_arg0(this, arg_0);
797 pass_arg1(this, arg_1);
798 pass_arg2(this, arg_2);
799 call_VM_leaf_base(entry_point, 3);
800 }
801
802 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
803 pass_arg0(this, arg_0);
804 MacroAssembler::call_VM_leaf_base(entry_point, 1);
805 }
806
807 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
808
809 assert_different_registers(arg_0, c_rarg1);
810 pass_arg1(this, arg_1);
811 pass_arg0(this, arg_0);
812 MacroAssembler::call_VM_leaf_base(entry_point, 2);
813 }
814
815 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
816 assert_different_registers(arg_0, c_rarg1, c_rarg2);
817 assert_different_registers(arg_1, c_rarg2);
818 pass_arg2(this, arg_2);
819 pass_arg1(this, arg_1);
820 pass_arg0(this, arg_0);
821 MacroAssembler::call_VM_leaf_base(entry_point, 3);
822 }
823
824 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
825 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
826 assert_different_registers(arg_1, c_rarg2, c_rarg3);
827 assert_different_registers(arg_2, c_rarg3);
828
829 pass_arg3(this, arg_3);
830 pass_arg2(this, arg_2);
831 pass_arg1(this, arg_1);
832 pass_arg0(this, arg_0);
833 MacroAssembler::call_VM_leaf_base(entry_point, 4);
834 }
835
836 void MacroAssembler::la(Register Rd, const address addr) {
837 int32_t offset;
838 la(Rd, addr, offset);
839 addi(Rd, Rd, offset);
840 }
841
842 void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
843 int64_t distance = addr - pc();
844 assert(is_valid_32bit_offset(distance), "Must be");
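  // The +0x800 below rounds the upper 20 bits so that the sign-extended low 12 bits
  // bring the address back. Worked example (illustrative): distance = 0x12345fff gives
  // an auipc adding 0x12346000 to pc and offset = -1, and
  // pc + 0x12346000 - 1 == pc + 0x12345fff.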
845 auipc(Rd, (int32_t)distance + 0x800);
846 offset = ((int32_t)distance << 20) >> 20;
847 }
848
849 // Materialize with auipc + addi sequence if adr is a literal
850 // address inside code cache. Emit a movptr sequence otherwise.
851 void MacroAssembler::la(Register Rd, const Address &adr) {
852 switch (adr.getMode()) {
853 case Address::literal: {
854 relocInfo::relocType rtype = adr.rspec().reloc()->type();
855 if (rtype == relocInfo::none) {
856 mv(Rd, (intptr_t)(adr.target()));
857 } else {
858 if (CodeCache::contains(adr.target())) {
859 relocate(adr.rspec(), [&] {
860 la(Rd, adr.target());
861 });
862 } else {
863 relocate(adr.rspec(), [&] {
864 movptr(Rd, adr.target());
865 });
866 }
867 }
868 break;
869 }
870 case Address::base_plus_offset: {
871 Address new_adr = legitimize_address(Rd, adr);
872 if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
873 addi(Rd, new_adr.base(), new_adr.offset());
874 }
875 break;
876 }
877 default:
878 ShouldNotReachHere();
879 }
880 }
881
882 void MacroAssembler::la(Register Rd, Label &label) {
883 IncompressibleScope scope(this); // the label address may be patched back.
884 wrap_label(Rd, label, &MacroAssembler::la);
885 }
886
887 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
888 lui(Rd, (uint32_t)imm << 12);
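  // imm now occupies bits [27:12], so bit 15 can never reach the sign position;
  // shifting right by 12 therefore leaves the plain zero-extended 16-bit value in Rd.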
889 srli(Rd, Rd, 12);
890 }
891
892 void MacroAssembler::li32(Register Rd, int32_t imm) {
893 // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
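  // Worked example (illustrative): imm = 0x12345fff splits into lower = -1 (the
  // sign-extended low 12 bits) and upper = 0x12346000, so the pair below becomes
  //   lui   Rd, 0x12346000   (0x12346 into bits [31:12])
  //   addiw Rd, Rd, -1       -> 0x12345fff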
894 int64_t upper = imm, lower = imm;
895 lower = (imm << 20) >> 20;
896 upper -= lower;
897 upper = (int32_t)upper;
898 // lui Rd, imm[31:12] + imm[11]
899 lui(Rd, upper);
900 addiw(Rd, Rd, lower);
901 }
902
903 void MacroAssembler::li(Register Rd, int64_t imm) {
904 // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
905 // li -> c.li
906 if (do_compress() && (is_simm6(imm) && Rd != x0)) {
907 c_li(Rd, imm);
908 return;
909 }
910
911 int shift = 12;
912 int64_t upper = imm, lower = imm;
913 // Split imm to a lower 12-bit sign-extended part and the remainder,
914 // because addi will sign-extend the lower imm.
915 lower = ((int32_t)imm << 20) >> 20;
916 upper -= lower;
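  // Sketch of the two paths below: a value that does not fit in 32 bits has its
  // remainder shifted right past its trailing zero bits, materialized recursively,
  // shifted back left, with the sign-extended 12-bit 'lower' added last; a 32-bit
  // value reduces to lui and/or addiw.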
917
918 // Test whether imm is a 32-bit integer.
919 if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
920 (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
921 while (((upper >> shift) & 1) == 0) { shift++; }
922 upper >>= shift;
923 li(Rd, upper);
924 slli(Rd, Rd, shift);
925 if (lower != 0) {
926 addi(Rd, Rd, lower);
927 }
928 } else {
929 // 32-bit integer
930 Register hi_Rd = zr;
931 if (upper != 0) {
932 lui(Rd, (int32_t)upper);
933 hi_Rd = Rd;
934 }
935 if (lower != 0 || hi_Rd == zr) {
936 addiw(Rd, hi_Rd, lower);
937 }
938 }
939 }
940
941 void MacroAssembler::j(const address dest, Register temp) {
942 assert(CodeCache::contains(dest), "Must be");
943 assert_cond(dest != nullptr);
944 int64_t distance = dest - pc();
945
946 // We can't patch C, i.e. if Label wasn't bound we need to patch this jump.
947 IncompressibleScope scope(this);
948 if (is_simm21(distance) && ((distance % 2) == 0)) {
949 Assembler::jal(x0, distance);
950 } else {
951 assert(temp != noreg && temp != x0, "Expecting a register");
952 assert(temp != x1 && temp != x5, "temp register must not be x1/x5.");
953 int32_t offset = 0;
954 la(temp, dest, offset);
955 jr(temp, offset);
956 }
957 }
958
959 void MacroAssembler::j(const Address &dest, Register temp) {
960 switch (dest.getMode()) {
961 case Address::literal: {
962 if (CodeCache::contains(dest.target())) {
963 far_jump(dest, temp);
964 } else {
965 relocate(dest.rspec(), [&] {
966 int32_t offset;
967 movptr(temp, dest.target(), offset);
968 jr(temp, offset);
969 });
970 }
971 break;
972 }
973 case Address::base_plus_offset: {
974 int32_t offset = ((int32_t)dest.offset() << 20) >> 20;
975 la(temp, Address(dest.base(), dest.offset() - offset));
976 jr(temp, offset);
977 break;
978 }
979 default:
980 ShouldNotReachHere();
981 }
982 }
983
984 void MacroAssembler::j(Label &lab, Register temp) {
985 assert_different_registers(x0, temp);
986 if (lab.is_bound()) {
987 MacroAssembler::j(target(lab), temp);
988 } else {
989 lab.add_patch_at(code(), locator());
990 MacroAssembler::j(pc(), temp);
991 }
992 }
993
994 void MacroAssembler::jr(Register Rd, int32_t offset) {
995 assert(Rd != noreg, "expecting a register");
996 assert(Rd != x1 && Rd != x5, "Rd register must not be x1/x5.");
997 Assembler::jalr(x0, Rd, offset);
998 }
999
1000 void MacroAssembler::call(const address dest, Register temp) {
1001 assert_cond(dest != nullptr);
1002 assert(temp != noreg, "expecting a register");
1003 assert(temp != x5, "temp register must not be x5.");
1004 int32_t offset = 0;
1005 la(temp, dest, offset);
1006 jalr(temp, offset);
1007 }
1008
1009 void MacroAssembler::jalr(Register Rs, int32_t offset) {
1010 assert(Rs != noreg, "expecting a register");
1011 assert(Rs != x5, "Rs register must not be x5.");
1012 Assembler::jalr(x1, Rs, offset);
1013 }
1014
1015 void MacroAssembler::rt_call(address dest, Register tmp) {
1016 assert(tmp != x5, "tmp register must not be x5.");
1017 RuntimeAddress target(dest);
1018 if (CodeCache::contains(dest)) {
1019 far_call(target, tmp);
1020 } else {
1021 relocate(target.rspec(), [&] {
1022 int32_t offset;
1023 movptr(tmp, target.target(), offset);
1024 jalr(tmp, offset);
1025 });
1026 }
1027 }
1028
1029 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
1030 if (L.is_bound()) {
1031 (this->*insn)(Rt, target(L));
1032 } else {
1033 L.add_patch_at(code(), locator());
1034 (this->*insn)(Rt, pc());
1035 }
1036 }
1037
1038 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
1039 compare_and_branch_insn insn,
1040 compare_and_branch_label_insn neg_insn, bool is_far) {
1041 if (is_far) {
1042 Label done;
1043 (this->*neg_insn)(r1, r2, done, /* is_far */ false);
1044 j(L);
1045 bind(done);
1046 } else {
1047 if (L.is_bound()) {
1048 (this->*insn)(r1, r2, target(L));
1049 } else {
1050 L.add_patch_at(code(), locator());
1051 (this->*insn)(r1, r2, pc());
1052 }
1053 }
1054 }
1055
1056 #define INSN(NAME, NEG_INSN) \
1057 void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) { \
1058 wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far); \
1059 }
1060
1061 INSN(beq, bne);
1062 INSN(bne, beq);
1063 INSN(blt, bge);
1064 INSN(bge, blt);
1065 INSN(bltu, bgeu);
1066 INSN(bgeu, bltu);
1067
1068 #undef INSN
1069
1070 #define INSN(NAME) \
1071 void MacroAssembler::NAME##z(Register Rs, const address dest) { \
1072 NAME(Rs, zr, dest); \
1073 } \
1074 void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) { \
1075 NAME(Rs, zr, l, is_far); \
1076 } \
1077
1078 INSN(beq);
1079 INSN(bne);
1080 INSN(blt);
1081 INSN(ble);
1082 INSN(bge);
1083 INSN(bgt);
1084
1085 #undef INSN
1086
1087 #define INSN(NAME, NEG_INSN) \
1088 void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) { \
1089 NEG_INSN(Rt, Rs, dest); \
1090 } \
1091 void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) { \
1092 NEG_INSN(Rt, Rs, l, is_far); \
1093 }
1094
1095 INSN(bgt, blt);
1096 INSN(ble, bge);
1097 INSN(bgtu, bltu);
1098 INSN(bleu, bgeu);
1099
1100 #undef INSN
1101
1102 // cmov
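//
// With Zicond the helpers below build a branchless select out of czero. Illustrative
// expansion for cmov_eq, where c = cmp1 ^ cmp2 is zero exactly when the move should
// happen:
//   czero_eqz(dst, dst, c);   // dst = (c == 0) ? 0   : dst
//   czero_nez(c,   src, c);   // c   = (c == 0) ? src : 0
//   orr(dst, dst, c);         // dst = (c == 0) ? src : dst
// The other conditions swap the czero variants or the slt/sltu operands accordingly.
// Without Zicond each helper falls back to a conditional branch around a single mv.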
1103 void MacroAssembler::cmov_eq(Register cmp1, Register cmp2, Register dst, Register src) {
1104 if (UseZicond) {
1105 xorr(t0, cmp1, cmp2);
1106 czero_eqz(dst, dst, t0);
1107 czero_nez(t0 , src, t0);
1108 orr(dst, dst, t0);
1109 return;
1110 }
1111 Label no_set;
1112 bne(cmp1, cmp2, no_set);
1113 mv(dst, src);
1114 bind(no_set);
1115 }
1116
1117 void MacroAssembler::cmov_ne(Register cmp1, Register cmp2, Register dst, Register src) {
1118 if (UseZicond) {
1119 xorr(t0, cmp1, cmp2);
1120 czero_nez(dst, dst, t0);
1121 czero_eqz(t0 , src, t0);
1122 orr(dst, dst, t0);
1123 return;
1124 }
1125 Label no_set;
1126 beq(cmp1, cmp2, no_set);
1127 mv(dst, src);
1128 bind(no_set);
1129 }
1130
1131 void MacroAssembler::cmov_le(Register cmp1, Register cmp2, Register dst, Register src) {
1132 if (UseZicond) {
1133 slt(t0, cmp2, cmp1);
1134 czero_eqz(dst, dst, t0);
1135 czero_nez(t0, src, t0);
1136 orr(dst, dst, t0);
1137 return;
1138 }
1139 Label no_set;
1140 bgt(cmp1, cmp2, no_set);
1141 mv(dst, src);
1142 bind(no_set);
1143 }
1144
1145 void MacroAssembler::cmov_leu(Register cmp1, Register cmp2, Register dst, Register src) {
1146 if (UseZicond) {
1147 sltu(t0, cmp2, cmp1);
1148 czero_eqz(dst, dst, t0);
1149 czero_nez(t0, src, t0);
1150 orr(dst, dst, t0);
1151 return;
1152 }
1153 Label no_set;
1154 bgtu(cmp1, cmp2, no_set);
1155 mv(dst, src);
1156 bind(no_set);
1157 }
1158
1159 void MacroAssembler::cmov_ge(Register cmp1, Register cmp2, Register dst, Register src) {
1160 if (UseZicond) {
1161 slt(t0, cmp1, cmp2);
1162 czero_eqz(dst, dst, t0);
1163 czero_nez(t0, src, t0);
1164 orr(dst, dst, t0);
1165 return;
1166 }
1167 Label no_set;
1168 blt(cmp1, cmp2, no_set);
1169 mv(dst, src);
1170 bind(no_set);
1171 }
1172
1173 void MacroAssembler::cmov_geu(Register cmp1, Register cmp2, Register dst, Register src) {
1174 if (UseZicond) {
1175 sltu(t0, cmp1, cmp2);
1176 czero_eqz(dst, dst, t0);
1177 czero_nez(t0, src, t0);
1178 orr(dst, dst, t0);
1179 return;
1180 }
1181 Label no_set;
1182 bltu(cmp1, cmp2, no_set);
1183 mv(dst, src);
1184 bind(no_set);
1185 }
1186
1187 void MacroAssembler::cmov_lt(Register cmp1, Register cmp2, Register dst, Register src) {
1188 if (UseZicond) {
1189 slt(t0, cmp1, cmp2);
1190 czero_nez(dst, dst, t0);
1191 czero_eqz(t0, src, t0);
1192 orr(dst, dst, t0);
1193 return;
1194 }
1195 Label no_set;
1196 bge(cmp1, cmp2, no_set);
1197 mv(dst, src);
1198 bind(no_set);
1199 }
1200
1201 void MacroAssembler::cmov_ltu(Register cmp1, Register cmp2, Register dst, Register src) {
1202 if (UseZicond) {
1203 sltu(t0, cmp1, cmp2);
1204 czero_nez(dst, dst, t0);
1205 czero_eqz(t0, src, t0);
1206 orr(dst, dst, t0);
1207 return;
1208 }
1209 Label no_set;
1210 bgeu(cmp1, cmp2, no_set);
1211 mv(dst, src);
1212 bind(no_set);
1213 }
1214
1215 void MacroAssembler::cmov_gt(Register cmp1, Register cmp2, Register dst, Register src) {
1216 if (UseZicond) {
1217 slt(t0, cmp2, cmp1);
1218 czero_nez(dst, dst, t0);
1219 czero_eqz(t0, src, t0);
1220 orr(dst, dst, t0);
1221 return;
1222 }
1223 Label no_set;
1224 ble(cmp1, cmp2, no_set);
1225 mv(dst, src);
1226 bind(no_set);
1227 }
1228
1229 void MacroAssembler::cmov_gtu(Register cmp1, Register cmp2, Register dst, Register src) {
1230 if (UseZicond) {
1231 sltu(t0, cmp2, cmp1);
1232 czero_nez(dst, dst, t0);
1233 czero_eqz(t0, src, t0);
1234 orr(dst, dst, t0);
1235 return;
1236 }
1237 Label no_set;
1238 bleu(cmp1, cmp2, no_set);
1239 mv(dst, src);
1240 bind(no_set);
1241 }
1242
1243 // ----------- cmove, compare float -----------
1244 //
// For CmpF/D + CMoveI/L, the ordered cases are straightforward, so only the
// behaviour of the unordered cases is listed below.
1247 //
1248 // Set dst (CMoveI (Binary cop (CmpF/D op1 op2)) (Binary dst src))
1249 // (If one or both inputs to the compare are NaN, then)
1250 // 1. (op1 lt op2) => true => CMove: dst = src
1251 // 2. (op1 le op2) => true => CMove: dst = src
1252 // 3. (op1 gt op2) => false => CMove: dst = dst
1253 // 4. (op1 ge op2) => false => CMove: dst = dst
1254 // 5. (op1 eq op2) => false => CMove: dst = dst
1255 // 6. (op1 ne op2) => true => CMove: dst = src
1256
1257 void MacroAssembler::cmov_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1258 if (UseZicond) {
1259 if (is_single) {
1260 feq_s(t0, cmp1, cmp2);
1261 } else {
1262 feq_d(t0, cmp1, cmp2);
1263 }
1264 czero_nez(dst, dst, t0);
1265 czero_eqz(t0 , src, t0);
1266 orr(dst, dst, t0);
1267 return;
1268 }
1269 Label no_set;
1270 if (is_single) {
1271 // jump if cmp1 != cmp2, including the case of NaN
1272 // fallthrough (i.e. move src to dst) if cmp1 == cmp2
1273 float_bne(cmp1, cmp2, no_set);
1274 } else {
1275 double_bne(cmp1, cmp2, no_set);
1276 }
1277 mv(dst, src);
1278 bind(no_set);
1279 }
1280
1281 void MacroAssembler::cmov_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1282 if (UseZicond) {
1283 if (is_single) {
1284 feq_s(t0, cmp1, cmp2);
1285 } else {
1286 feq_d(t0, cmp1, cmp2);
1287 }
1288 czero_eqz(dst, dst, t0);
1289 czero_nez(t0 , src, t0);
1290 orr(dst, dst, t0);
1291 return;
1292 }
1293 Label no_set;
1294 if (is_single) {
1295 // jump if cmp1 == cmp2
1296 // fallthrough (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1297 float_beq(cmp1, cmp2, no_set);
1298 } else {
1299 double_beq(cmp1, cmp2, no_set);
1300 }
1301 mv(dst, src);
1302 bind(no_set);
1303 }
1304
1305 void MacroAssembler::cmov_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1306 if (UseZicond) {
1307 if (is_single) {
1308 flt_s(t0, cmp2, cmp1);
1309 } else {
1310 flt_d(t0, cmp2, cmp1);
1311 }
1312 czero_eqz(dst, dst, t0);
1313 czero_nez(t0 , src, t0);
1314 orr(dst, dst, t0);
1315 return;
1316 }
1317 Label no_set;
1318 if (is_single) {
1319 // jump if cmp1 > cmp2
1320 // fallthrough (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1321 float_bgt(cmp1, cmp2, no_set);
1322 } else {
1323 double_bgt(cmp1, cmp2, no_set);
1324 }
1325 mv(dst, src);
1326 bind(no_set);
1327 }
1328
1329 void MacroAssembler::cmov_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1330 if (UseZicond) {
1331 if (is_single) {
1332 fle_s(t0, cmp2, cmp1);
1333 } else {
1334 fle_d(t0, cmp2, cmp1);
1335 }
1336 czero_nez(dst, dst, t0);
1337 czero_eqz(t0 , src, t0);
1338 orr(dst, dst, t0);
1339 return;
1340 }
1341 Label no_set;
1342 if (is_single) {
1343 // jump if cmp1 < cmp2 or either is NaN
1344 // fallthrough (i.e. move src to dst) if cmp1 >= cmp2
1345 float_blt(cmp1, cmp2, no_set, false, true);
1346 } else {
1347 double_blt(cmp1, cmp2, no_set, false, true);
1348 }
1349 mv(dst, src);
1350 bind(no_set);
1351 }
1352
1353 void MacroAssembler::cmov_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1354 if (UseZicond) {
1355 if (is_single) {
1356 fle_s(t0, cmp2, cmp1);
1357 } else {
1358 fle_d(t0, cmp2, cmp1);
1359 }
1360 czero_eqz(dst, dst, t0);
1361 czero_nez(t0 , src, t0);
1362 orr(dst, dst, t0);
1363 return;
1364 }
1365 Label no_set;
1366 if (is_single) {
1367 // jump if cmp1 >= cmp2
1368 // fallthrough (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1369 float_bge(cmp1, cmp2, no_set);
1370 } else {
1371 double_bge(cmp1, cmp2, no_set);
1372 }
1373 mv(dst, src);
1374 bind(no_set);
1375 }
1376
1377 void MacroAssembler::cmov_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1378 if (UseZicond) {
1379 if (is_single) {
1380 flt_s(t0, cmp2, cmp1);
1381 } else {
1382 flt_d(t0, cmp2, cmp1);
1383 }
1384 czero_nez(dst, dst, t0);
1385 czero_eqz(t0 , src, t0);
1386 orr(dst, dst, t0);
1387 return;
1388 }
1389 Label no_set;
1390 if (is_single) {
1391 // jump if cmp1 <= cmp2 or either is NaN
1392 // fallthrough (i.e. move src to dst) if cmp1 > cmp2
1393 float_ble(cmp1, cmp2, no_set, false, true);
1394 } else {
1395 double_ble(cmp1, cmp2, no_set, false, true);
1396 }
1397 mv(dst, src);
1398 bind(no_set);
1399 }
1400
1401 // Float compare branch instructions
1402
1403 #define INSN(NAME, FLOATCMP, BRANCH) \
1404 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) { \
1405 FLOATCMP##_s(t0, Rs1, Rs2); \
1406 BRANCH(t0, l, is_far); \
1407 } \
1408 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) { \
1409 FLOATCMP##_d(t0, Rs1, Rs2); \
1410 BRANCH(t0, l, is_far); \
1411 }
1412
1413 INSN(beq, feq, bnez);
1414 INSN(bne, feq, beqz);
1415
1416 #undef INSN
1417
1418
1419 #define INSN(NAME, FLOATCMP1, FLOATCMP2) \
1420 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1421 bool is_far, bool is_unordered) { \
1422 if (is_unordered) { \
1423 /* jump if either source is NaN or condition is expected */ \
1424 FLOATCMP2##_s(t0, Rs2, Rs1); \
1425 beqz(t0, l, is_far); \
1426 } else { \
1427 /* jump if no NaN in source and condition is expected */ \
1428 FLOATCMP1##_s(t0, Rs1, Rs2); \
1429 bnez(t0, l, is_far); \
1430 } \
1431 } \
1432 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1433 bool is_far, bool is_unordered) { \
1434 if (is_unordered) { \
1435 /* jump if either source is NaN or condition is expected */ \
1436 FLOATCMP2##_d(t0, Rs2, Rs1); \
1437 beqz(t0, l, is_far); \
1438 } else { \
1439 /* jump if no NaN in source and condition is expected */ \
1440 FLOATCMP1##_d(t0, Rs1, Rs2); \
1441 bnez(t0, l, is_far); \
1442 } \
1443 }
1444
1445 INSN(ble, fle, flt);
1446 INSN(blt, flt, fle);
1447
1448 #undef INSN
1449
1450 #define INSN(NAME, CMP) \
1451 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1452 bool is_far, bool is_unordered) { \
1453 float_##CMP(Rs2, Rs1, l, is_far, is_unordered); \
1454 } \
1455 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1456 bool is_far, bool is_unordered) { \
1457 double_##CMP(Rs2, Rs1, l, is_far, is_unordered); \
1458 }
1459
1460 INSN(bgt, blt);
1461 INSN(bge, ble);
1462
1463 #undef INSN
1464
1465 void MacroAssembler::csrr(Register Rd, unsigned csr) {
1466 // These three are specified in zicntr and are unused.
1467 // Before adding use-cases add the appropriate hwprobe and flag.
1468 assert(csr != CSR_INSTRET && csr != CSR_CYCLE && csr != CSR_TIME,
1469 "Not intended for use without enabling zicntr.");
1470 csrrs(Rd, csr, x0);
1471 }
1472
1473 #define INSN(NAME, OPFUN) \
1474 void MacroAssembler::NAME(unsigned csr, Register Rs) { \
1475 OPFUN(x0, csr, Rs); \
1476 }
1477
1478 INSN(csrw, csrrw);
1479 INSN(csrs, csrrs);
1480 INSN(csrc, csrrc);
1481
1482 #undef INSN
1483
1484 #define INSN(NAME, OPFUN) \
1485 void MacroAssembler::NAME(unsigned csr, unsigned imm) { \
1486 OPFUN(x0, csr, imm); \
1487 }
1488
1489 INSN(csrwi, csrrwi);
1490 INSN(csrsi, csrrsi);
1491 INSN(csrci, csrrci);
1492
1493 #undef INSN
1494
1495 #define INSN(NAME, CSR) \
1496 void MacroAssembler::NAME(Register Rd, Register Rs) { \
1497 csrrw(Rd, CSR, Rs); \
1498 }
1499
1500 INSN(fscsr, CSR_FCSR);
1501 INSN(fsrm, CSR_FRM);
1502 INSN(fsflags, CSR_FFLAGS);
1503
1504 #undef INSN
1505
1506 #define INSN(NAME) \
1507 void MacroAssembler::NAME(Register Rs) { \
1508 NAME(x0, Rs); \
1509 }
1510
1511 INSN(fscsr);
1512 INSN(fsrm);
1513 INSN(fsflags);
1514
1515 #undef INSN
1516
1517 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
1518 guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1519 csrrwi(Rd, CSR_FRM, imm);
1520 }
1521
1522 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1523 csrrwi(Rd, CSR_FFLAGS, imm);
1524 }
1525
1526 #define INSN(NAME) \
1527 void MacroAssembler::NAME(unsigned imm) { \
1528 NAME(x0, imm); \
1529 }
1530
1531 INSN(fsrmi);
1532 INSN(fsflagsi);
1533
1534 #undef INSN
1535
1536 void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
1537 if (RestoreMXCSROnJNICalls) {
1538 Label skip_fsrmi;
1539 frrm(tmp);
1540 // Set FRM to the state we need. We do want Round to Nearest.
1541 // We don't want non-IEEE rounding modes.
1542 guarantee(RoundingMode::rne == 0, "must be");
1543 beqz(tmp, skip_fsrmi); // Only reset FRM if it's wrong
1544 fsrmi(RoundingMode::rne);
1545 bind(skip_fsrmi);
1546 }
1547 }
1548
1549 void MacroAssembler::push_reg(Register Rs)
1550 {
1551 subi(esp, esp, wordSize);
1552 sd(Rs, Address(esp, 0));
1553 }
1554
1555 void MacroAssembler::pop_reg(Register Rd)
1556 {
1557 ld(Rd, Address(esp, 0));
1558 addi(esp, esp, wordSize);
1559 }
1560
1561 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1562 int count = 0;
  // Scan the bitset to accumulate register numbers, highest-numbered register first
1564 for (int reg = 31; reg >= 0; reg--) {
1565 if ((1U << 31) & bitset) {
1566 regs[count++] = reg;
1567 }
1568 bitset <<= 1;
1569 }
1570 return count;
1571 }
1572
1573 // Push integer registers in the bitset supplied. Don't push sp.
1574 // Return the number of words pushed
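// Because bitset_to_regs() collects the registers highest-numbered first, the loop
// below ends up storing them in ascending register-number order from the low end of
// the reserved area; for an odd count, the extra alignment word is the lowest slot.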
1575 int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
1576 DEBUG_ONLY(int words_pushed = 0;)
1577 unsigned char regs[32];
1578 int count = bitset_to_regs(bitset, regs);
1579 // reserve one slot to align for odd count
1580 int offset = is_even(count) ? 0 : wordSize;
1581
1582 if (count) {
1583 sub(stack, stack, count * wordSize + offset);
1584 }
1585 for (int i = count - 1; i >= 0; i--) {
1586 sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1587 DEBUG_ONLY(words_pushed++;)
1588 }
1589
1590 assert(words_pushed == count, "oops, pushed != count");
1591
1592 return count;
1593 }
1594
1595 int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
1596 DEBUG_ONLY(int words_popped = 0;)
1597 unsigned char regs[32];
1598 int count = bitset_to_regs(bitset, regs);
1599 // reserve one slot to align for odd count
1600 int offset = is_even(count) ? 0 : wordSize;
1601
1602 for (int i = count - 1; i >= 0; i--) {
1603 ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1604 DEBUG_ONLY(words_popped++;)
1605 }
1606
1607 if (count) {
1608 add(stack, stack, count * wordSize + offset);
1609 }
1610 assert(words_popped == count, "oops, popped != count");
1611
1612 return count;
1613 }
1614
1615 // Push floating-point registers in the bitset supplied.
1616 // Return the number of words pushed
1617 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
1618 DEBUG_ONLY(int words_pushed = 0;)
1619 unsigned char regs[32];
1620 int count = bitset_to_regs(bitset, regs);
1621 int push_slots = count + (count & 1);
1622
1623 if (count) {
1624 subi(stack, stack, push_slots * wordSize);
1625 }
1626
1627 for (int i = count - 1; i >= 0; i--) {
1628 fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
1629 DEBUG_ONLY(words_pushed++;)
1630 }
1631
1632 assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
1633
1634 return count;
1635 }
1636
1637 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
1638 DEBUG_ONLY(int words_popped = 0;)
1639 unsigned char regs[32];
1640 int count = bitset_to_regs(bitset, regs);
1641 int pop_slots = count + (count & 1);
1642
1643 for (int i = count - 1; i >= 0; i--) {
1644 fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
1645 DEBUG_ONLY(words_popped++;)
1646 }
1647
1648 if (count) {
1649 addi(stack, stack, pop_slots * wordSize);
1650 }
1651
1652 assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
1653
1654 return count;
1655 }
1656
1657 /**
1658 * Emits code to update CRC-32 with a byte value according to constants in table
1659 *
1660 * @param [in,out]crc Register containing the crc.
1661 * @param [in]val Register containing the byte to fold into the CRC.
1662 * @param [in]table Register containing the table of crc constants.
1663 *
1664 * uint32_t crc;
1665 * val = crc_table[(val ^ crc) & 0xFF];
1666 * crc = val ^ (crc >> 8);
1667 *
1668 */
1669 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
1670 assert_different_registers(crc, val, table);
1671
1672 xorr(val, val, crc);
1673 zext(val, val, 8);
1674 shadd(val, val, table, val, 2);
1675 lwu(val, Address(val));
1676 srli(crc, crc, 8);
1677 xorr(crc, val, crc);
1678 }
1679
1680 /**
1681 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
1682 *
1683 * @param [in,out]crc Register containing the crc.
1684 * @param [in]v Register containing the 32-bit to fold into the CRC.
1685 * @param [in]table0 Register containing table 0 of crc constants.
1686 * @param [in]table1 Register containing table 1 of crc constants.
1687 * @param [in]table2 Register containing table 2 of crc constants.
1688 * @param [in]table3 Register containing table 3 of crc constants.
1689 *
1690 * uint32_t crc;
1691 * v = crc ^ v
1692 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
1693 *
1694 */
1695 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp1, Register tmp2, Register tmp3,
1696 Register table0, Register table1, Register table2, Register table3, bool upper) {
1697 assert_different_registers(crc, v, tmp1, tmp2, tmp3, table0, table1, table2, table3);
1698
1699 if (upper)
1700 srli(v, v, 32);
1701 xorr(v, v, crc);
1702
1703 zext(tmp1, v, 8);
1704 shadd(tmp1, tmp1, table3, tmp2, 2);
1705 lwu(crc, Address(tmp1));
1706
1707 slli(tmp1, v, 16);
1708 slli(tmp3, v, 8);
1709
1710 srliw(tmp1, tmp1, 24);
1711 srliw(tmp3, tmp3, 24);
1712
1713 shadd(tmp1, tmp1, table2, tmp1, 2);
1714 lwu(tmp2, Address(tmp1));
1715
1716 shadd(tmp3, tmp3, table1, tmp3, 2);
1717 xorr(crc, crc, tmp2);
1718
1719 lwu(tmp2, Address(tmp3));
  // It is more efficient to use 'srli' instead of 'srliw' when it is not necessary to clear the upper bits
1721 if (upper)
1722 srli(tmp1, v, 24);
1723 else
1724 srliw(tmp1, v, 24);
1725
1726 // no need to clear bits other than lowest two
1727 shadd(tmp1, tmp1, table0, tmp1, 2);
1728 xorr(crc, crc, tmp2);
1729 lwu(tmp2, Address(tmp1));
1730 xorr(crc, crc, tmp2);
1731 }
1732
1733
1734 #ifdef COMPILER2
// This improvement (vectorization) is based on java.base/share/native/libzip/zlib/zcrc32.c.
// It was produced in the following steps:
//   1. in zcrc32.c, change N to 16 and adjust the related code,
//   2. re-generate the tables needed; we use tables of (N == 16, W == 4),
//   3. finally, vectorize the code (the original implementation in zcrc32.c is scalar only).
// The new tables for the vector version are placed after table3.
1741 void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register len,
1742 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
1743 Register table0, Register table3) {
1744 assert_different_registers(t1, crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp5, table0, table3);
1745 const int N = 16, W = 4;
1746 const int64_t single_table_size = 256;
1747 const Register blks = tmp2;
1748 const Register tmpTable = tmp3, tableN16 = tmp4;
1749 const VectorRegister vcrc = v4, vword = v8, vtmp = v12;
1750 Label VectorLoop;
1751 Label LastBlock;
1752
1753 add(tableN16, table3, 1 * single_table_size * sizeof(juint), tmp1);
1754 mv(tmp5, 0xff);
1755
1756 if (MaxVectorSize == 16) {
1757 vsetivli(zr, N, Assembler::e32, Assembler::m4, Assembler::ma, Assembler::ta);
1758 } else if (MaxVectorSize == 32) {
1759 vsetivli(zr, N, Assembler::e32, Assembler::m2, Assembler::ma, Assembler::ta);
1760 } else {
1761 assert(MaxVectorSize > 32, "sanity");
1762 vsetivli(zr, N, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta);
1763 }
1764
1765 vmv_v_x(vcrc, zr);
1766 vmv_s_x(vcrc, crc);
1767
1768 // multiple of 64
1769 srli(blks, len, 6);
1770 slli(t1, blks, 6);
1771 sub(len, len, t1);
1772 subi(blks, blks, 1);
1773 blez(blks, LastBlock);
1774
1775 bind(VectorLoop);
1776 {
1777 mv(tmpTable, tableN16);
1778
1779 vle32_v(vword, buf);
1780 vxor_vv(vword, vword, vcrc);
1781
1782 addi(buf, buf, N*4);
1783
1784 vand_vx(vtmp, vword, tmp5);
1785 vsll_vi(vtmp, vtmp, 2);
1786 vluxei32_v(vcrc, tmpTable, vtmp);
1787
1788 mv(tmp1, 1);
1789 for (int k = 1; k < W; k++) {
1790 addi(tmpTable, tmpTable, single_table_size*4);
1791
1792 slli(t1, tmp1, 3);
1793 vsrl_vx(vtmp, vword, t1);
1794
1795 vand_vx(vtmp, vtmp, tmp5);
1796 vsll_vi(vtmp, vtmp, 2);
1797 vluxei32_v(vtmp, tmpTable, vtmp);
1798
1799 vxor_vv(vcrc, vcrc, vtmp);
1800
1801 addi(tmp1, tmp1, 1);
1802 }
1803
1804 subi(blks, blks, 1);
1805 bgtz(blks, VectorLoop);
1806 }
1807
1808 bind(LastBlock);
1809 {
1810 vle32_v(vtmp, buf);
1811 vxor_vv(vcrc, vcrc, vtmp);
1812 mv(crc, zr);
1813 for (int i = 0; i < N; i++) {
1814 vmv_x_s(tmp2, vcrc);
1815 // vmv_x_s sign-extends the SEW-bit element to XLEN, but we need it zero-extended here.
1816 zext(tmp2, tmp2, 32);
1817 vslidedown_vi(vcrc, vcrc, 1);
1818 xorr(crc, crc, tmp2);
1819 for (int j = 0; j < W; j++) {
1820 andr(t1, crc, tmp5);
1821 shadd(t1, t1, table0, tmp1, 2);
1822 lwu(t1, Address(t1, 0));
1823 srli(tmp2, crc, 8);
1824 xorr(crc, tmp2, t1);
1825 }
1826 }
1827 addi(buf, buf, N*4);
1828 }
1829 }
1830
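// The vclmul helpers below implement the standard 128-bit CRC folding step from the
// PCLMULQDQ paper referenced further down: with the accumulator x held as two 64-bit
// elements (x_lo, x_hi) and a constant pair t = (t_lo, t_hi), each step computes
//     x' = next_16_data_bytes ^ clmul128(x_lo, t_lo) ^ clmul128(x_hi, t_hi)
// where clmul128 is a 64x64 -> 128-bit carry-less multiply. vclmul_vv/vclmulh_vv produce
// the low/high 64 bits of both products, and vredxor_vs plus vslideup/vslidedown
// recombine them into the two halves of x'.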
1831 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16(VectorRegister vx, VectorRegister vt,
1832 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1833 Register buf, Register tmp, const int STEP) {
1834 assert_different_registers(vx, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1835 vclmul_vv(vtmp1, vx, vt);
1836 vclmulh_vv(vtmp2, vx, vt);
1837 vle64_v(vtmp4, buf); addi(buf, buf, STEP);
1838 // low parts
1839 vredxor_vs(vtmp3, vtmp1, vtmp4);
1840 // high parts
1841 vslidedown_vi(vx, vtmp4, 1);
1842 vredxor_vs(vtmp1, vtmp2, vx);
1843 // merge low and high back
1844 vslideup_vi(vx, vtmp1, 1);
1845 vmv_x_s(tmp, vtmp3);
1846 vmv_s_x(vx, tmp);
1847 }
1848
1849 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_2(VectorRegister vx, VectorRegister vy, VectorRegister vt,
1850 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1851 Register tmp) {
1852 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1853 vclmul_vv(vtmp1, vx, vt);
1854 vclmulh_vv(vtmp2, vx, vt);
1855 // low parts
1856 vredxor_vs(vtmp3, vtmp1, vy);
1857 // high parts
1858 vslidedown_vi(vtmp4, vy, 1);
1859 vredxor_vs(vtmp1, vtmp2, vtmp4);
1860 // merge low and high back
1861 vslideup_vi(vx, vtmp1, 1);
1862 vmv_x_s(tmp, vtmp3);
1863 vmv_s_x(vx, tmp);
1864 }
1865
1866 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_3(VectorRegister vx, VectorRegister vy, VectorRegister vt,
1867 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1868 Register tmp) {
1869 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1870 vclmul_vv(vtmp1, vx, vt);
1871 vclmulh_vv(vtmp2, vx, vt);
1872 // low parts
1873 vredxor_vs(vtmp3, vtmp1, vy);
1874 // high parts
1875 vslidedown_vi(vtmp4, vy, 1);
1876 vredxor_vs(vtmp1, vtmp2, vtmp4);
1877 // merge low and high back
1878 vslideup_vi(vy, vtmp1, 1);
1879 vmv_x_s(tmp, vtmp3);
1880 vmv_s_x(vy, tmp);
1881 }
1882
1883 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_16(Register crc, Register buf, Register len,
1884 Register vclmul_table, Register tmp1, Register tmp2) {
1885 assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
1886 assert(MaxVectorSize == 16, "sanity");
1887
1888 const int TABLE_STEP = 16;
1889 const int STEP = 16;
1890 const int LOOP_STEP = 128;
1891 const int N = 2;
1892
1893 Register loop_step = t1;
1894
1895 // ======== preparation ========
1896
1897 mv(loop_step, LOOP_STEP);
1898 sub(len, len, loop_step);
1899
1900 vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
1901 vle64_v(v0, buf); addi(buf, buf, STEP);
1902 vle64_v(v1, buf); addi(buf, buf, STEP);
1903 vle64_v(v2, buf); addi(buf, buf, STEP);
1904 vle64_v(v3, buf); addi(buf, buf, STEP);
1905 vle64_v(v4, buf); addi(buf, buf, STEP);
1906 vle64_v(v5, buf); addi(buf, buf, STEP);
1907 vle64_v(v6, buf); addi(buf, buf, STEP);
1908 vle64_v(v7, buf); addi(buf, buf, STEP);
1909
1910 vmv_v_x(v31, zr);
1911 vsetivli(zr, 1, Assembler::e32, Assembler::m1, Assembler::mu, Assembler::tu);
1912 vmv_s_x(v31, crc);
1913 vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
1914 vxor_vv(v0, v0, v31);
1915
1916 // load table
1917 vle64_v(v31, vclmul_table);
1918
1919 Label L_16_bytes_loop;
1920 j(L_16_bytes_loop);
1921
1922
1923 // ======== folding 128 bytes in data buffer per round ========
1924
1925 align(OptoLoopAlignment);
1926 bind(L_16_bytes_loop);
1927 {
1928 crc32_vclmul_fold_16_bytes_vectorsize_16(v0, v31, v8, v9, v10, v11, buf, tmp2, STEP);
1929 crc32_vclmul_fold_16_bytes_vectorsize_16(v1, v31, v12, v13, v14, v15, buf, tmp2, STEP);
1930 crc32_vclmul_fold_16_bytes_vectorsize_16(v2, v31, v16, v17, v18, v19, buf, tmp2, STEP);
1931 crc32_vclmul_fold_16_bytes_vectorsize_16(v3, v31, v20, v21, v22, v23, buf, tmp2, STEP);
1932 crc32_vclmul_fold_16_bytes_vectorsize_16(v4, v31, v24, v25, v26, v27, buf, tmp2, STEP);
1933 crc32_vclmul_fold_16_bytes_vectorsize_16(v5, v31, v8, v9, v10, v11, buf, tmp2, STEP);
1934 crc32_vclmul_fold_16_bytes_vectorsize_16(v6, v31, v12, v13, v14, v15, buf, tmp2, STEP);
1935 crc32_vclmul_fold_16_bytes_vectorsize_16(v7, v31, v16, v17, v18, v19, buf, tmp2, STEP);
1936 }
1937 sub(len, len, loop_step);
1938 bge(len, loop_step, L_16_bytes_loop);
1939
1940
1941 // ======== folding into 64 bytes from 128 bytes in register ========
1942
1943 // load table
1944 addi(vclmul_table, vclmul_table, TABLE_STEP);
1945 vle64_v(v31, vclmul_table);
1946
1947 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v0, v4, v31, v8, v9, v10, v11, tmp2);
1948 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v1, v5, v31, v12, v13, v14, v15, tmp2);
1949 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v2, v6, v31, v16, v17, v18, v19, tmp2);
1950 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v3, v7, v31, v20, v21, v22, v23, tmp2);
1951
1952
1953 // ======== folding into 16 bytes from 64 bytes in register ========
1954
1955 addi(vclmul_table, vclmul_table, TABLE_STEP);
1956 vle64_v(v31, vclmul_table);
1957 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v0, v3, v31, v8, v9, v10, v11, tmp2);
1958
1959 addi(vclmul_table, vclmul_table, TABLE_STEP);
1960 vle64_v(v31, vclmul_table);
1961 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v1, v3, v31, v12, v13, v14, v15, tmp2);
1962
1963 addi(vclmul_table, vclmul_table, TABLE_STEP);
1964 vle64_v(v31, vclmul_table);
1965 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v2, v3, v31, v16, v17, v18, v19, tmp2);
1968
1969
1970 // ======== final: move result to scalar registers ========
1971
1972 vmv_x_s(tmp1, v3);
1973 vslidedown_vi(v1, v3, 1);
1974 vmv_x_s(tmp2, v1);
1975 }
1976
1977 void MacroAssembler::crc32_vclmul_fold_to_16_bytes_vectorsize_32(VectorRegister vx, VectorRegister vy, VectorRegister vt,
1978 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4) {
1979 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1980 vclmul_vv(vtmp1, vx, vt);
1981 vclmulh_vv(vtmp2, vx, vt);
1982 // low parts
1983 vredxor_vs(vtmp3, vtmp1, vy);
1984 // high parts
1985 vslidedown_vi(vtmp4, vy, 1);
1986 vredxor_vs(vtmp1, vtmp2, vtmp4);
1987 // merge low and high back
1988 vslideup_vi(vy, vtmp1, 1);
1989 vmv_x_s(t1, vtmp3);
1990 vmv_s_x(vy, t1);
1991 }
1992
1993 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_32(Register crc, Register buf, Register len,
1994 Register vclmul_table, Register tmp1, Register tmp2) {
1995 assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
1996 assert(MaxVectorSize >= 32, "sanity");
1997
1998 // utility: load table
1999 #define CRC32_VCLMUL_LOAD_TABLE(vt, rt, vtmp, rtmp) \
2000 vid_v(vtmp); \
2001 mv(rtmp, 2); \
2002 vremu_vx(vtmp, vtmp, rtmp); \
2003 vsll_vi(vtmp, vtmp, 3); \
2004 vluxei64_v(vt, rt, vtmp);
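// Note: the macro above broadcasts one pair of 64-bit folding constants across the whole
// vector group: vid/vremu/vsll compute the byte offsets 0, 8, 0, 8, ..., so the indexed
// load vluxei64_v places constant[0] in the even elements and constant[1] in the odd
// elements of vt.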
2005
2006 const int TABLE_STEP = 16;
2007 const int STEP = 128; // 128 bytes per round
2008 const int N = 2 * 8; // 2: 128-bits/64-bits, 8: 8 pairs of double 64-bits
2009
2010 Register step = tmp2;
2011
2012
2013 // ======== preparation ========
2014
2015 mv(step, STEP);
2016 sub(len, len, step); // 2 rounds of folding with carry-less multiplication
2017
2018 vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2019 // load data
2020 vle64_v(v4, buf);
2021 add(buf, buf, step);
2022
2023 // load table
2024 CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2025 // load mask,
2026 // v28 should already contain: 0, 8, 0, 8, ...
2027 vmseq_vi(v2, v28, 0);
2028 // now, v2 should contain: 101010...
2029 vmnand_mm(v1, v2, v2);
2030 // now, v1 should contain: 010101...
2031
2032 // initial crc
2033 vmv_v_x(v24, zr);
2034 vsetivli(zr, 1, Assembler::e32, Assembler::m4, Assembler::mu, Assembler::tu);
2035 vmv_s_x(v24, crc);
2036 vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2037 vxor_vv(v4, v4, v24);
2038
2039 Label L_128_bytes_loop;
2040 j(L_128_bytes_loop);
2041
2042
2043 // ======== folding 128 bytes in data buffer per round ========
2044
2045 align(OptoLoopAlignment);
2046 bind(L_128_bytes_loop);
2047 {
2048 // v4: data
2049 // v4: buf, reused
2050 // v8: table
2051 // v12: lows
2052 // v16: highs
2053 // v20: low_slides
2054 // v24: high_slides
2055 vclmul_vv(v12, v4, v8);
2056 vclmulh_vv(v16, v4, v8);
2057 vle64_v(v4, buf);
2058 add(buf, buf, step);
2059 // lows
2060 vslidedown_vi(v20, v12, 1);
2061 vmand_mm(v0, v2, v2);
2062 vxor_vv(v12, v12, v20, v0_t);
2063 // with buf data
2064 vxor_vv(v4, v4, v12, v0_t);
2065
2066 // highs
2067 vslideup_vi(v24, v16, 1);
2068 vmand_mm(v0, v1, v1);
2069 vxor_vv(v16, v16, v24, v0_t);
2070 // with buf data
2071 vxor_vv(v4, v4, v16, v0_t);
2072 }
2073 sub(len, len, step);
2074 bge(len, step, L_128_bytes_loop);
2075
2076
2077 // ======== folding into 64 bytes from 128 bytes in register ========
2078
2079 // load table
2080 addi(vclmul_table, vclmul_table, TABLE_STEP);
2081 CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2082
2083 // v4: data, first (low) part, N/2 of 64-bits
2084 // v20: data, second (high) part, N/2 of 64-bits
2085 // v8: table
2086 // v10: lows
2087 // v12: highs
2088 // v14: low_slides
2089 // v16: high_slides
2090
2091 // high part
2092 vslidedown_vi(v20, v4, N/2);
2093
2094 vsetivli(zr, N/2, Assembler::e64, Assembler::m2, Assembler::mu, Assembler::tu);
2095
2096 vclmul_vv(v10, v4, v8);
2097 vclmulh_vv(v12, v4, v8);
2098
2099 // lows
2100 vslidedown_vi(v14, v10, 1);
2101 vmand_mm(v0, v2, v2);
2102 vxor_vv(v10, v10, v14, v0_t);
2103 // with data part 2
2104 vxor_vv(v4, v20, v10, v0_t);
2105
2106 // highs
2107 vslideup_vi(v16, v12, 1);
2108 vmand_mm(v0, v1, v1);
2109 vxor_vv(v12, v12, v16, v0_t);
2110 // with data part 2
2111 vxor_vv(v4, v20, v12, v0_t);
2112
2113
2114 // ======== folding into 16 bytes from 64 bytes in register ========
2115
2116 // v4: data, first part, 2 of 64-bits
2117 // v16: data, second part, 2 of 64-bits
2118 // v18: data, third part, 2 of 64-bits
2119 // v20: data, fourth part, 2 of 64-bits
2120 // v8: table
2121
2122 vslidedown_vi(v16, v4, 2);
2123 vslidedown_vi(v18, v4, 4);
2124 vslidedown_vi(v20, v4, 6);
2125
2126 vsetivli(zr, 2, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2127
2128 addi(vclmul_table, vclmul_table, TABLE_STEP);
2129 vle64_v(v8, vclmul_table);
2130 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v4, v20, v8, v28, v29, v30, v31);
2131
2132 addi(vclmul_table, vclmul_table, TABLE_STEP);
2133 vle64_v(v8, vclmul_table);
2134 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v16, v20, v8, v28, v29, v30, v31);
2135
2136 addi(vclmul_table, vclmul_table, TABLE_STEP);
2137 vle64_v(v8, vclmul_table);
2138 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v18, v20, v8, v28, v29, v30, v31);
2139
2140
2141 // ======== final: move result to scalar registers ========
2142
2143 vmv_x_s(tmp1, v20);
2144 vslidedown_vi(v4, v20, 1);
2145 vmv_x_s(tmp2, v4);
2146
2147 #undef CRC32_VCLMUL_LOAD_TABLE
2148 }
2149
2150 // For more details of the algorithm, please check the paper:
2151 //   "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel"
2152 //
2153 // Please also refer to the corresponding aarch64 and x86 implementations.
2154 //
2155 // As the riscv carry-less multiplication is a bit different from that of the other platforms,
2156 // the implementation itself also differs a bit from the others.
2157
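// The folding constants are assumed to be laid out immediately after the 8 CRC tables
// (4 scalar + 4 plain-vector tables of 256 juints each) referenced by crc_table_addr();
// that is what the `table_num * single_table_size * sizeof(juint)` offset below relies on.
// After the vector fold, the remaining 128 bits (returned in tmp1/tmp2) are reduced to a
// 32-bit CRC with four word-at-a-time table lookups.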
2158 void MacroAssembler::kernel_crc32_vclmul_fold(Register crc, Register buf, Register len,
2159 Register table0, Register table1, Register table2, Register table3,
2160 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
2161 const int64_t single_table_size = 256;
2162 const int64_t table_num = 8; // 4 for scalar, 4 for plain vector
2163 const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2164 Register vclmul_table = tmp3;
2165
2166 la(vclmul_table, table_addr);
2167 add(vclmul_table, vclmul_table, table_num * single_table_size * sizeof(juint), tmp1);
2168 la(table0, table_addr);
2169
2170 if (MaxVectorSize == 16) {
2171 kernel_crc32_vclmul_fold_vectorsize_16(crc, buf, len, vclmul_table, tmp1, tmp2);
2172 } else {
2173 kernel_crc32_vclmul_fold_vectorsize_32(crc, buf, len, vclmul_table, tmp1, tmp2);
2174 }
2175
2176 mv(crc, zr);
2177 update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2178 update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2179 update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2180 update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2181 }
2182
2183 #endif // COMPILER2
2184
2185 /**
2186 * @param crc register containing existing CRC (32-bit)
2187 * @param buf register pointing to input byte buffer (byte*)
2188 * @param len register containing number of bytes
2189 * @param table0..table3 registers that will contain the addresses of the CRC tables
2190 * @param tmp1..tmp6      scratch registers
2191 */
2192 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2193 Register table0, Register table1, Register table2, Register table3,
2194 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) {
2195 assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2196 Label L_vector_entry,
2197 L_unroll_loop,
2198 L_by4_loop_entry, L_by4_loop,
2199 L_by1_loop, L_exit, L_skip1, L_skip2;
2200
2201 const int64_t single_table_size = 256;
2202 const int64_t unroll = 16;
2203 const int64_t unroll_words = unroll*wordSize;
2204
2205 // tmp5 = 0xffffffff
2206 notr(tmp5, zr);
2207 srli(tmp5, tmp5, 32);
2208
2209 andn(crc, tmp5, crc);
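// crc = 0xffffffff & ~crc: CRC-32 works on the bit-inverted register, which is inverted
// back by the matching andn at L_exit.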
2210
2211 const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2212 la(table0, table_addr);
2213 add(table1, table0, 1 * single_table_size * sizeof(juint), tmp1);
2214 add(table2, table0, 2 * single_table_size * sizeof(juint), tmp1);
2215 add(table3, table2, 1 * single_table_size * sizeof(juint), tmp1);
2216
2217 // Ensure basic 4-byte alignment of input byte buffer
2218 mv(tmp1, 4);
2219 blt(len, tmp1, L_by1_loop);
2220 test_bit(tmp1, buf, 0);
2221 beqz(tmp1, L_skip1);
2222 subiw(len, len, 1);
2223 lbu(tmp1, Address(buf));
2224 addi(buf, buf, 1);
2225 update_byte_crc32(crc, tmp1, table0);
2226 bind(L_skip1);
2227 test_bit(tmp1, buf, 1);
2228 beqz(tmp1, L_skip2);
2229 subiw(len, len, 2);
2230 lhu(tmp1, Address(buf));
2231 addi(buf, buf, 2);
2232 zext(tmp2, tmp1, 8);
2233 update_byte_crc32(crc, tmp2, table0);
2234 srli(tmp2, tmp1, 8);
2235 update_byte_crc32(crc, tmp2, table0);
2236 bind(L_skip2);
2237
2238 #ifdef COMPILER2
2239 if (UseRVV) {
2240 const int64_t tmp_limit =
2241 UseZvbc ? 128 * 3 // 3 rounds of folding with carry-less multiplication
2242 : MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
2243 mv(tmp1, tmp_limit);
2244 bge(len, tmp1, L_vector_entry);
2245 }
2246 #endif // COMPILER2
2247
2248 mv(tmp1, unroll_words);
2249 blt(len, tmp1, L_by4_loop_entry);
2250
2251 const Register loop_buf_end = tmp3;
2252
2253 align(CodeEntryAlignment);
2254 // Entry for L_unroll_loop
2255 add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2256 andi(len, len, unroll_words - 1); // len = (len % unroll_words)
2257 sub(loop_buf_end, loop_buf_end, len);
2258 bind(L_unroll_loop);
2259 for (int i = 0; i < unroll; i++) {
2260 ld(tmp1, Address(buf, i*wordSize));
2261 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2262 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, true);
2263 }
2264
2265 addi(buf, buf, unroll_words);
2266 blt(buf, loop_buf_end, L_unroll_loop);
2267
2268 bind(L_by4_loop_entry);
2269 mv(tmp1, 4);
2270 blt(len, tmp1, L_by1_loop);
2271 add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2272 andi(len, len, 3);
2273 sub(loop_buf_end, loop_buf_end, len);
2274 bind(L_by4_loop);
2275 lwu(tmp1, Address(buf));
2276 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2277 addi(buf, buf, 4);
2278 blt(buf, loop_buf_end, L_by4_loop);
2279
2280 bind(L_by1_loop);
2281 beqz(len, L_exit);
2282
2283 subiw(len, len, 1);
2284 lbu(tmp1, Address(buf));
2285 update_byte_crc32(crc, tmp1, table0);
2286 beqz(len, L_exit);
2287
2288 subiw(len, len, 1);
2289 lbu(tmp1, Address(buf, 1));
2290 update_byte_crc32(crc, tmp1, table0);
2291 beqz(len, L_exit);
2292
2293 subiw(len, len, 1);
2294 lbu(tmp1, Address(buf, 2));
2295 update_byte_crc32(crc, tmp1, table0);
2296
2297 #ifdef COMPILER2
2298 // Put the vector code here, otherwise an "offset is too large" error occurs.
2299 if (UseRVV) {
2300 // We only need to jump to the exit when UseRVV == true; this is a jump from the end of the `L_by1_loop` block.
2301 j(L_exit);
2302
2303 bind(L_vector_entry);
2304 if (UseZvbc) { // carry-less multiplication
2305 kernel_crc32_vclmul_fold(crc, buf, len,
2306 table0, table1, table2, table3,
2307 tmp1, tmp2, tmp3, tmp4, tmp6);
2308 } else { // plain vector instructions
2309 vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
2310 }
2311
2312 bgtz(len, L_by4_loop_entry);
2313 }
2314 #endif // COMPILER2
2315
2316 bind(L_exit);
2317 andn(crc, tmp5, crc);
2318 }
2319
2320 #ifdef COMPILER2
2321 // Push vector registers in the bitset supplied.
2322 // Return the number of words pushed
2323 int MacroAssembler::push_v(unsigned int bitset, Register stack) {
2324 int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2325
2326 // Scan bitset to accumulate register pairs
2327 unsigned char regs[32];
2328 int count = bitset_to_regs(bitset, regs);
2329
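// vs1r_v is a whole vector register store (vl1r_v is the matching load in pop_v below),
// so no vsetvli is required here; each register gets its own vector_size_in_bytes slot
// carved out below the current stack pointer.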
2330 for (int i = 0; i < count; i++) {
2331 sub(stack, stack, vector_size_in_bytes);
2332 vs1r_v(as_VectorRegister(regs[i]), stack);
2333 }
2334
2335 return count * vector_size_in_bytes / wordSize;
2336 }
2337
2338 int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
2339 int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2340
2341 // Scan bitset to accumulate register pairs
2342 unsigned char regs[32];
2343 int count = bitset_to_regs(bitset, regs);
2344
2345 for (int i = count - 1; i >= 0; i--) {
2346 vl1r_v(as_VectorRegister(regs[i]), stack);
2347 add(stack, stack, vector_size_in_bytes);
2348 }
2349
2350 return count * vector_size_in_bytes / wordSize;
2351 }
2352 #endif // COMPILER2
2353
2354 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
2355 // Push integer registers x7, x10-x17, x28-x31.
2356 push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2357
2358 // Push float registers f0-f7, f10-f17, f28-f31.
2359 subi(sp, sp, wordSize * 20);
2360 int offset = 0;
2361 for (int i = 0; i < 32; i++) {
2362 if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2363 fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2364 }
2365 }
2366 }
2367
2368 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
2369 int offset = 0;
2370 for (int i = 0; i < 32; i++) {
2371 if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2372 fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2373 }
2374 }
2375 addi(sp, sp, wordSize * 20);
2376
2377 pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2378 }
2379
2380 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
2381 // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2382 push_reg(RegSet::range(x5, x31), sp);
2383
2384 // float registers
2385 subi(sp, sp, 32 * wordSize);
2386 for (int i = 0; i < 32; i++) {
2387 fsd(as_FloatRegister(i), Address(sp, i * wordSize));
2388 }
2389
2390 // vector registers
2391 if (save_vectors) {
2392 sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
2393 vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2394 for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2395 add(t0, sp, vector_size_in_bytes * i);
2396 vse64_v(as_VectorRegister(i), t0);
2397 }
2398 }
2399 }
2400
2401 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
2402 // vector registers
2403 if (restore_vectors) {
2404 vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2405 for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2406 vle64_v(as_VectorRegister(i), sp);
2407 add(sp, sp, vector_size_in_bytes * 8);
2408 }
2409 }
2410
2411 // float registers
2412 for (int i = 0; i < 32; i++) {
2413 fld(as_FloatRegister(i), Address(sp, i * wordSize));
2414 }
2415 addi(sp, sp, 32 * wordSize);
2416
2417 // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2418 pop_reg(RegSet::range(x5, x31), sp);
2419 }
2420
2421 static int patch_offset_in_jal(address branch, int64_t offset) {
2422 assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
2423 "offset (%ld) is too large to be patched in one jal instruction!\n", offset);
2424 Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1); // offset[20] ==> branch[31]
2425 Assembler::patch(branch, 30, 21, (offset >> 1) & 0x3ff); // offset[10:1] ==> branch[30:21]
2426 Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1); // offset[11] ==> branch[20]
2427 Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff); // offset[19:12] ==> branch[19:12]
2428 return MacroAssembler::instruction_size; // only one instruction
2429 }
2430
2431 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
2432 assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
2433 "offset (%ld) is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n", offset);
2434 Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1); // offset[12] ==> branch[31]
2435 Assembler::patch(branch, 30, 25, (offset >> 5) & 0x3f); // offset[10:5] ==> branch[30:25]
2436 Assembler::patch(branch, 7, 7, (offset >> 11) & 0x1); // offset[11] ==> branch[7]
2437 Assembler::patch(branch, 11, 8, (offset >> 1) & 0xf); // offset[4:1] ==> branch[11:8]
2438 return MacroAssembler::instruction_size; // only one instruction
2439 }
2440
2441 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
2442 const int PC_RELATIVE_INSTRUCTION_NUM = 2; // auipc, addi/jalr/load
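// The second instruction sign-extends its 12-bit immediate, so the auipc immediate is
// biased by +0x800 before the upper 20 bits are taken; the two parts then sum back to `offset`.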
2443 Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff); // Auipc. offset[31:12] ==> branch[31:12]
2444 Assembler::patch(branch + 4, 31, 20, offset & 0xfff); // Addi/Jalr/Load. offset[11:0] ==> branch[31:20]
2445 return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
2446 }
2447
2448 static int patch_addr_in_movptr1(address branch, address target) {
2449 int32_t lower = ((intptr_t)target << 35) >> 35;
2450 int64_t upper = ((intptr_t)target - lower) >> 29;
2451 Assembler::patch(branch + 0, 31, 12, upper & 0xfffff); // Lui. target[48:29] + target[28] ==> branch[31:12]
2452 Assembler::patch(branch + 4, 31, 20, (lower >> 17) & 0xfff); // Addi. target[28:17] ==> branch[31:20]
2453 Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff); // Addi. target[16: 6] ==> branch[31:20]
2454 Assembler::patch(branch + 20, 31, 20, lower & 0x3f); // Addi/Jalr/Load. target[ 5: 0] ==> branch[31:20]
2455 return MacroAssembler::movptr1_instruction_size;
2456 }
2457
2458 static int patch_addr_in_movptr2(address instruction_address, address target) {
2459 uintptr_t addr = (uintptr_t)target;
2460
2461 assert(addr < (1ull << 48), "48-bit overflow in address constant");
2462 unsigned int upper18 = (addr >> 30ull);
2463 int lower30 = (addr & 0x3fffffffu);
2464 int low12 = (lower30 << 20) >> 20;
2465 int mid18 = ((lower30 - low12) >> 12);
2466
2467 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
2468 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18 & 0xfffff)); // Lui
2469 // Slli
2470 // Add
2471 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff); // Addi/Jalr/Load
2472
2473 assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");
2474
2475 return MacroAssembler::movptr2_instruction_size;
2476 }
2477
2478 static int patch_imm_in_li16u(address branch, uint16_t target) {
2479 Assembler::patch(branch, 31, 12, target); // patch lui only
2480 return MacroAssembler::instruction_size;
2481 }
2482
2483 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
2484 const int LI32_INSTRUCTIONS_NUM = 2; // lui + addiw
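// The 32-bit value is split into a lui part and a sign-extended 12-bit addiw part; when
// bit 11 of `lower` is set, `upper -= lower` below bumps the lui value by one page to
// compensate for the 0x1000 that addiw's sign extension will subtract.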
2485 int64_t upper = (intptr_t)target;
2486 int32_t lower = (((int32_t)target) << 20) >> 20;
2487 upper -= lower;
2488 upper = (int32_t)upper;
2489 Assembler::patch(branch + 0, 31, 12, (upper >> 12) & 0xfffff); // Lui.
2490 Assembler::patch(branch + 4, 31, 20, lower & 0xfff); // Addiw.
2491 return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
2492 }
2493
2494 static long get_offset_of_jal(address insn_addr) {
2495 assert_cond(insn_addr != nullptr);
2496 long offset = 0;
2497 unsigned insn = Assembler::ld_instr(insn_addr);
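// J-type immediate layout: insn[31] = offset[20], insn[30:21] = offset[10:1],
// insn[20] = offset[11], insn[19:12] = offset[19:12]; `val` below holds insn[31:12].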
2498 long val = (long)Assembler::sextract(insn, 31, 12);
2499 offset |= ((val >> 19) & 0x1) << 20;
2500 offset |= (val & 0xff) << 12;
2501 offset |= ((val >> 8) & 0x1) << 11;
2502 offset |= ((val >> 9) & 0x3ff) << 1;
2503 offset = (offset << 43) >> 43;
2504 return offset;
2505 }
2506
2507 static long get_offset_of_conditional_branch(address insn_addr) {
2508 long offset = 0;
2509 assert_cond(insn_addr != nullptr);
2510 unsigned insn = Assembler::ld_instr(insn_addr);
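// B-type immediate layout: insn[31] = offset[12], insn[30:25] = offset[10:5],
// insn[11:8] = offset[4:1], insn[7] = offset[11].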
2511 offset = (long)Assembler::sextract(insn, 31, 31);
2512 offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
2513 offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
2514 offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
2515 offset = (offset << 41) >> 41;
2516 return offset;
2517 }
2518
2519 static long get_offset_of_pc_relative(address insn_addr) {
2520 long offset = 0;
2521 assert_cond(insn_addr != nullptr);
2522 offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12; // Auipc.
2523 offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)); // Addi/Jalr/Load.
2524 offset = (offset << 32) >> 32;
2525 return offset;
2526 }
2527
2528 static address get_target_of_movptr1(address insn_addr) {
2529 assert_cond(insn_addr != nullptr);
2530 intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
2531 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17; // Addi.
2532 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6; // Addi.
2533 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20)); // Addi/Jalr/Load.
2534 return (address) target_address;
2535 }
2536
2537 static address get_target_of_movptr2(address insn_addr) {
2538 assert_cond(insn_addr != nullptr);
2539 int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 0), 31, 12)) & 0xfffff); // Lui
2540 int32_t mid18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 1), 31, 12)) & 0xfffff); // Lui
2541 // 2 // Slli
2542 // 3 // Add
2543 int32_t low12 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 4), 31, 20))); // Addi/Jalr/Load.
2544 address ret = (address)(((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12);
2545 return ret;
2546 }
2547
2548 address MacroAssembler::get_target_of_li32(address insn_addr) {
2549 assert_cond(insn_addr != nullptr);
2550 intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
2551 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)); // Addiw.
2552 return (address)target_address;
2553 }
2554
2555 // Patch any kind of instruction; there may be several instructions.
2556 // Return the total length (in bytes) of the instructions.
2557 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
2558 assert_cond(instruction_address != nullptr);
2559 int64_t offset = target - instruction_address;
2560 if (MacroAssembler::is_jal_at(instruction_address)) { // jal
2561 return patch_offset_in_jal(instruction_address, offset);
2562 } else if (MacroAssembler::is_branch_at(instruction_address)) { // beq/bge/bgeu/blt/bltu/bne
2563 return patch_offset_in_conditional_branch(instruction_address, offset);
2564 } else if (MacroAssembler::is_pc_relative_at(instruction_address)) { // auipc, addi/jalr/load
2565 return patch_offset_in_pc_relative(instruction_address, offset);
2566 } else if (MacroAssembler::is_movptr1_at(instruction_address)) { // movptr1
2567 return patch_addr_in_movptr1(instruction_address, target);
2568 } else if (MacroAssembler::is_movptr2_at(instruction_address)) { // movptr2
2569 return patch_addr_in_movptr2(instruction_address, target);
2570 } else if (MacroAssembler::is_li32_at(instruction_address)) { // li32
2571 int64_t imm = (intptr_t)target;
2572 return patch_imm_in_li32(instruction_address, (int32_t)imm);
2573 } else if (MacroAssembler::is_li16u_at(instruction_address)) {
2574 int64_t imm = (intptr_t)target;
2575 return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
2576 } else {
2577 #ifdef ASSERT
2578 tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
2579 Assembler::ld_instr(instruction_address), p2i(instruction_address));
2580 Disassembler::decode(instruction_address - 16, instruction_address + 16);
2581 #endif
2582 ShouldNotReachHere();
2583 return -1;
2584 }
2585 }
2586
2587 address MacroAssembler::target_addr_for_insn(address insn_addr) {
2588 long offset = 0;
2589 assert_cond(insn_addr != nullptr);
2590 if (MacroAssembler::is_jal_at(insn_addr)) { // jal
2591 offset = get_offset_of_jal(insn_addr);
2592 } else if (MacroAssembler::is_branch_at(insn_addr)) { // beq/bge/bgeu/blt/bltu/bne
2593 offset = get_offset_of_conditional_branch(insn_addr);
2594 } else if (MacroAssembler::is_pc_relative_at(insn_addr)) { // auipc, addi/jalr/load
2595 offset = get_offset_of_pc_relative(insn_addr);
2596 } else if (MacroAssembler::is_movptr1_at(insn_addr)) { // movptr1
2597 return get_target_of_movptr1(insn_addr);
2598 } else if (MacroAssembler::is_movptr2_at(insn_addr)) { // movptr2
2599 return get_target_of_movptr2(insn_addr);
2600 } else if (MacroAssembler::is_li32_at(insn_addr)) { // li32
2601 return get_target_of_li32(insn_addr);
2602 } else {
2603 ShouldNotReachHere();
2604 }
2605 return address(((uintptr_t)insn_addr + offset));
2606 }
2607
2608 int MacroAssembler::patch_oop(address insn_addr, address o) {
2609 // OOPs are either narrow (32 bits) or wide (48 bits). We encode
2610 // narrow OOPs by setting the upper 16 bits in the first
2611 // instruction.
2612 if (MacroAssembler::is_li32_at(insn_addr)) {
2613 // Move narrow OOP
2614 uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
2615 return patch_imm_in_li32(insn_addr, (int32_t)n);
2616 } else if (MacroAssembler::is_movptr1_at(insn_addr)) {
2617 // Move wide OOP
2618 return patch_addr_in_movptr1(insn_addr, o);
2619 } else if (MacroAssembler::is_movptr2_at(insn_addr)) {
2620 // Move wide OOP
2621 return patch_addr_in_movptr2(insn_addr, o);
2622 }
2623 ShouldNotReachHere();
2624 return -1;
2625 }
2626
2627 void MacroAssembler::reinit_heapbase() {
2628 if (UseCompressedOops) {
2629 if (Universe::is_fully_initialized()) {
2630 mv(xheapbase, CompressedOops::base());
2631 } else {
2632 ld(xheapbase, ExternalAddress(CompressedOops::base_addr()));
2633 }
2634 }
2635 }
2636
2637 void MacroAssembler::movptr(Register Rd, const Address &addr, Register temp) {
2638 assert(addr.getMode() == Address::literal, "must be applied to a literal address");
2639 relocate(addr.rspec(), [&] {
2640 movptr(Rd, addr.target(), temp);
2641 });
2642 }
2643
2644 void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
2645 int offset = 0;
2646 movptr(Rd, addr, offset, temp);
2647 addi(Rd, Rd, offset);
2648 }
2649
2650 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
2651 uint64_t uimm64 = (uint64_t)addr;
2652 #ifndef PRODUCT
2653 {
2654 char buffer[64];
2655 os::snprintf_checked(buffer, sizeof(buffer), "0x%" PRIx64, uimm64);
2656 block_comment(buffer);
2657 }
2658 #endif
2659 assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
2660
2661 if (temp == noreg) {
2662 movptr1(Rd, uimm64, offset);
2663 } else {
2664 movptr2(Rd, uimm64, offset, temp);
2665 }
2666 }
2667
2668 void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
2669 // Load upper 31 bits
2670 //
2671 // If bit 11 of `lower` is 0, this is straightforward to understand.
2672 // If bit 11 of `lower` is 1, it is a bit tricky; to help understand,
2673 // imagine dividing both `upper` and `lower` into 2 parts respectively, i.e.
2674 // [upper_20, upper_12] and [lower_20, lower_12]; they are the same just before
2675 // `lower = (lower << 52) >> 52;`.
2676 // After `upper -= lower;`,
2677 // upper_20' = upper_20 - (-1) == upper_20 + 1
2678 // upper_12 = 0x000
2679 // After `lui(Rd, upper);`, `Rd` = upper_20' << 12
2680 // Also divide `Rd` into 2 parts [Rd_20, Rd_12],
2681 // Rd_20 == upper_20'
2682 // Rd_12 == 0x000
2683 // After `addi(Rd, Rd, lower);`,
2684 // Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
2685 // Rd_12 = lower_12
2686 // So, finally Rd == [upper_20, lower_12]
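// In other words, the 48-bit constant is materialized in three slices:
//   bits [47:17] via lui + addi (31 bits),
//   bits [16: 6] via slli(11) + addi (11 bits),
//   bits [ 5: 0] via slli(6), with the low 6 bits returned in `offset` for the
//   consuming jalr/ld/addi.
// For an illustrative value such as imm64 == 0x0000765432|10fedc, the final `offset`
// would be imm64 & 0x3f == 0x1c.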
2687 int64_t imm = imm64 >> 17;
2688 int64_t upper = imm, lower = imm;
2689 lower = (lower << 52) >> 52;
2690 upper -= lower;
2691 upper = (int32_t)upper;
2692 lui(Rd, upper);
2693 addi(Rd, Rd, lower);
2694
2695 // Load the remaining 17 bits.
2696 slli(Rd, Rd, 11);
2697 addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
2698 slli(Rd, Rd, 6);
2699
2700 // This offset will be used by following jalr/ld.
2701 offset = imm64 & 0x3f;
2702 }
2703
2704 void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
2705 assert_different_registers(Rd, tmp, noreg);
2706
2707 // addr: [upper18, lower30[mid18, lower12]]
2708
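// That is, the 48-bit address is split as 18 + 18 + 12 bits: tmp receives bits [47:30]
// via lui + slli(18); Rd receives bits [29:12] via lui, pre-compensated for the sign
// extension of the low 12 bits; the two are added, and the low 12 bits are returned in
// `offset` for the consuming jalr/ld/addi.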
2709 int64_t upper18 = addr >> 18;
2710 lui(tmp, upper18);
2711
2712 int64_t lower30 = addr & 0x3fffffff;
2713 int64_t mid18 = lower30, lower12 = lower30;
2714 lower12 = (lower12 << 52) >> 52;
2715 // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
2716 // please refer to movptr1 above.
2717 mid18 -= (int32_t)lower12;
2718 lui(Rd, mid18);
2719
2720 slli(tmp, tmp, 18);
2721 add(Rd, Rd, tmp);
2722
2723 offset = lower12;
2724 }
2725
2726 // floating point imm move
2727 bool MacroAssembler::can_hf_imm_load(short imm) {
2728 jshort h_bits = (jshort)imm;
2729 if (h_bits == 0) {
2730 return true;
2731 }
2732 return can_zfa_zli_half_float(imm);
2733 }
2734
2735 bool MacroAssembler::can_fp_imm_load(float imm) {
2736 jint f_bits = jint_cast(imm);
2737 if (f_bits == 0) {
2738 return true;
2739 }
2740 return can_zfa_zli_float(imm);
2741 }
2742
2743 bool MacroAssembler::can_dp_imm_load(double imm) {
2744 julong d_bits = julong_cast(imm);
2745 if (d_bits == 0) {
2746 return true;
2747 }
2748 return can_zfa_zli_double(imm);
2749 }
2750
2751 void MacroAssembler::fli_h(FloatRegister Rd, short imm) {
2752 jshort h_bits = (jshort)imm;
2753 if (h_bits == 0) {
2754 fmv_h_x(Rd, zr);
2755 return;
2756 }
2757 int Rs = zfa_zli_lookup_half_float(h_bits);
2758 assert(Rs != -1, "Must be");
2759 _fli_h(Rd, Rs);
2760 }
2761
2762 void MacroAssembler::fli_s(FloatRegister Rd, float imm) {
2763 jint f_bits = jint_cast(imm);
2764 if (f_bits == 0) {
2765 fmv_w_x(Rd, zr);
2766 return;
2767 }
2768 int Rs = zfa_zli_lookup_float(f_bits);
2769 assert(Rs != -1, "Must be");
2770 _fli_s(Rd, Rs);
2771 }
2772
2773 void MacroAssembler::fli_d(FloatRegister Rd, double imm) {
2774 uint64_t d_bits = (uint64_t)julong_cast(imm);
2775 if (d_bits == 0) {
2776 fmv_d_x(Rd, zr);
2777 return;
2778 }
2779 int Rs = zfa_zli_lookup_double(d_bits);
2780 assert(Rs != -1, "Must be");
2781 _fli_d(Rd, Rs);
2782 }
2783
2784 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register tmp) {
2785 if (is_simm12(increment)) {
2786 addi(Rd, Rn, increment);
2787 } else {
2788 assert_different_registers(Rn, tmp);
2789 mv(tmp, increment);
2790 add(Rd, Rn, tmp);
2791 }
2792 }
2793
2794 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register tmp) {
2795 add(Rd, Rn, -decrement, tmp);
2796 }
2797
2798 void MacroAssembler::addw(Register Rd, Register Rn, int64_t increment, Register tmp) {
2799 if (is_simm12(increment)) {
2800 addiw(Rd, Rn, increment);
2801 } else {
2802 assert_different_registers(Rn, tmp);
2803 mv(tmp, increment);
2804 addw(Rd, Rn, tmp);
2805 }
2806 }
2807
2808 void MacroAssembler::subw(Register Rd, Register Rn, int64_t decrement, Register tmp) {
2809 addw(Rd, Rn, -decrement, tmp);
2810 }
2811
2812 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
2813 andr(Rd, Rs1, Rs2);
2814 sext(Rd, Rd, 32);
2815 }
2816
2817 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
2818 orr(Rd, Rs1, Rs2);
2819 sext(Rd, Rd, 32);
2820 }
2821
2822 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
2823 xorr(Rd, Rs1, Rs2);
2824 sext(Rd, Rd, 32);
2825 }
2826
2827 // Rd = Rs1 & (~Rs2)
2828 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
2829 if (UseZbb) {
2830 Assembler::andn(Rd, Rs1, Rs2);
2831 return;
2832 }
2833
2834 notr(Rd, Rs2);
2835 andr(Rd, Rs1, Rd);
2836 }
2837
2838 // Rd = Rs1 | (~Rs2)
2839 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
2840 if (UseZbb) {
2841 Assembler::orn(Rd, Rs1, Rs2);
2842 return;
2843 }
2844
2845 notr(Rd, Rs2);
2846 orr(Rd, Rs1, Rd);
2847 }
2848
2849 // Note: load_unsigned_short used to be called load_unsigned_word.
2850 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2851 int off = offset();
2852 lhu(dst, src);
2853 return off;
2854 }
2855
2856 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2857 int off = offset();
2858 lbu(dst, src);
2859 return off;
2860 }
2861
2862 int MacroAssembler::load_signed_short(Register dst, Address src) {
2863 int off = offset();
2864 lh(dst, src);
2865 return off;
2866 }
2867
2868 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2869 int off = offset();
2870 lb(dst, src);
2871 return off;
2872 }
2873
2874 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
2875 switch (size_in_bytes) {
2876 case 8: ld(dst, src); break;
2877 case 4: is_signed ? lw(dst, src) : lwu(dst, src); break;
2878 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2879 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2880 default: ShouldNotReachHere();
2881 }
2882 }
2883
2884 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
2885 switch (size_in_bytes) {
2886 case 8: sd(src, dst); break;
2887 case 4: sw(src, dst); break;
2888 case 2: sh(src, dst); break;
2889 case 1: sb(src, dst); break;
2890 default: ShouldNotReachHere();
2891 }
2892 }
2893
2894 // granularity is 1 OR 2 bytes per load. dst and src.base() allowed to be the same register
2895 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2896 if (granularity != 1 && granularity != 2) {
2897 ShouldNotReachHere();
2898 }
2899 if (AvoidUnalignedAccesses && (granularity != 2)) {
2900 assert_different_registers(dst, tmp);
2901 assert_different_registers(tmp, src.base());
2902 is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
2903 slli(tmp, tmp, 8);
2904 lbu(dst, src);
2905 add(dst, dst, tmp);
2906 } else {
2907 is_signed ? lh(dst, src) : lhu(dst, src);
2908 }
2909 }
2910
2911 // granularity is 1, 2 OR 4 bytes per load, if granularity 2 or 4 then dst and src.base() allowed to be the same register
2912 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2913 if (AvoidUnalignedAccesses && (granularity != 4)) {
2914 switch(granularity) {
2915 case 1:
2916 assert_different_registers(dst, tmp, src.base());
2917 lbu(dst, src);
2918 lbu(tmp, Address(src.base(), src.offset() + 1));
2919 slli(tmp, tmp, 8);
2920 add(dst, dst, tmp);
2921 lbu(tmp, Address(src.base(), src.offset() + 2));
2922 slli(tmp, tmp, 16);
2923 add(dst, dst, tmp);
2924 is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
2925 slli(tmp, tmp, 24);
2926 add(dst, dst, tmp);
2927 break;
2928 case 2:
2929 assert_different_registers(dst, tmp);
2930 assert_different_registers(tmp, src.base());
2931 is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
2932 slli(tmp, tmp, 16);
2933 lhu(dst, src);
2934 add(dst, dst, tmp);
2935 break;
2936 default:
2937 ShouldNotReachHere();
2938 }
2939 } else {
2940 is_signed ? lw(dst, src) : lwu(dst, src);
2941 }
2942 }
2943
2944 // granularity is 1, 2, 4 or 8 bytes per load, if granularity 4 or 8 then dst and src.base() allowed to be same register
2945 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
2946 if (AvoidUnalignedAccesses && (granularity != 8)) {
2947 switch(granularity){
2948 case 1:
2949 assert_different_registers(dst, tmp, src.base());
2950 lbu(dst, src);
2951 lbu(tmp, Address(src.base(), src.offset() + 1));
2952 slli(tmp, tmp, 8);
2953 add(dst, dst, tmp);
2954 lbu(tmp, Address(src.base(), src.offset() + 2));
2955 slli(tmp, tmp, 16);
2956 add(dst, dst, tmp);
2957 lbu(tmp, Address(src.base(), src.offset() + 3));
2958 slli(tmp, tmp, 24);
2959 add(dst, dst, tmp);
2960 lbu(tmp, Address(src.base(), src.offset() + 4));
2961 slli(tmp, tmp, 32);
2962 add(dst, dst, tmp);
2963 lbu(tmp, Address(src.base(), src.offset() + 5));
2964 slli(tmp, tmp, 40);
2965 add(dst, dst, tmp);
2966 lbu(tmp, Address(src.base(), src.offset() + 6));
2967 slli(tmp, tmp, 48);
2968 add(dst, dst, tmp);
2969 lbu(tmp, Address(src.base(), src.offset() + 7));
2970 slli(tmp, tmp, 56);
2971 add(dst, dst, tmp);
2972 break;
2973 case 2:
2974 assert_different_registers(dst, tmp, src.base());
2975 lhu(dst, src);
2976 lhu(tmp, Address(src.base(), src.offset() + 2));
2977 slli(tmp, tmp, 16);
2978 add(dst, dst, tmp);
2979 lhu(tmp, Address(src.base(), src.offset() + 4));
2980 slli(tmp, tmp, 32);
2981 add(dst, dst, tmp);
2982 lhu(tmp, Address(src.base(), src.offset() + 6));
2983 slli(tmp, tmp, 48);
2984 add(dst, dst, tmp);
2985 break;
2986 case 4:
2987 assert_different_registers(dst, tmp);
2988 assert_different_registers(tmp, src.base());
2989 lwu(tmp, Address(src.base(), src.offset() + 4));
2990 slli(tmp, tmp, 32);
2991 lwu(dst, src);
2992 add(dst, dst, tmp);
2993 break;
2994 default:
2995 ShouldNotReachHere();
2996 }
2997 } else {
2998 ld(dst, src);
2999 }
3000 }
3001
3002 // reverse bytes in lower word, sign-extend
3003 // Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
3004 void MacroAssembler::revbw(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3005 if (UseZbb) {
3006 rev8(Rd, Rs);
3007 srai(Rd, Rd, 32);
3008 return;
3009 }
3010 assert_different_registers(Rs, tmp1, tmp2);
3011 assert_different_registers(Rd, tmp1, tmp2);
3012 zext(tmp1, Rs, 8);
3013 slli(tmp1, tmp1, 8);
3014 for (int step = 8; step < 24; step += 8) {
3015 srli(tmp2, Rs, step);
3016 zext(tmp2, tmp2, 8);
3017 orr(tmp1, tmp1, tmp2);
3018 slli(tmp1, tmp1, 8);
3019 }
3020 srli(Rd, Rs, 24);
3021 zext(Rd, Rd, 8);
3022 orr(Rd, tmp1, Rd);
3023 sext(Rd, Rd, 32);
3024 }
3025
3026 // reverse bytes in doubleword
3027 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56]
3028 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3029 if (UseZbb) {
3030 rev8(Rd, Rs);
3031 return;
3032 }
3033 assert_different_registers(Rs, tmp1, tmp2);
3034 assert_different_registers(Rd, tmp1, tmp2);
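// Without Zbb: accumulate the bytes of Rs from least to most significant into tmp1,
// shifting the accumulator left by 8 bits per step, so the lowest byte of Rs ends up in
// the highest byte of Rd; the top byte of Rs is ORed in last.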
3035 zext(tmp1, Rs, 8);
3036 slli(tmp1, tmp1, 8);
3037 for (int step = 8; step < 56; step += 8) {
3038 srli(tmp2, Rs, step);
3039 zext(tmp2, tmp2, 8);
3040 orr(tmp1, tmp1, tmp2);
3041 slli(tmp1, tmp1, 8);
3042 }
3043 srli(Rd, Rs, 56);
3044 orr(Rd, tmp1, Rd);
3045 }
3046
3047 // rotate right with shift bits
3048 void MacroAssembler::ror(Register dst, Register src, Register shift, Register tmp)
3049 {
3050 if (UseZbb) {
3051 rorr(dst, src, shift);
3052 return;
3053 }
3054
3055 assert_different_registers(dst, tmp);
3056 assert_different_registers(src, tmp);
3057
3058 mv(tmp, 64);
3059 sub(tmp, tmp, shift);
3060 sll(tmp, src, tmp);
3061 srl(dst, src, shift);
3062 orr(dst, dst, tmp);
3063 }
3064
3065 // rotate right with shift bits
3066 void MacroAssembler::ror(Register dst, Register src, uint32_t shift, Register tmp)
3067 {
3068 if (UseZbb) {
3069 rori(dst, src, shift);
3070 return;
3071 }
3072
3073 assert_different_registers(dst, tmp);
3074 assert_different_registers(src, tmp);
3075 assert(shift < 64, "shift amount must be < 64");
3076 slli(tmp, src, 64 - shift);
3077 srli(dst, src, shift);
3078 orr(dst, dst, tmp);
3079 }
3080
3081 // rotate left with shift bits, 32-bit version
3082 void MacroAssembler::rolw(Register dst, Register src, uint32_t shift, Register tmp) {
3083 if (UseZbb) {
3084 // no roliw available
3085 roriw(dst, src, 32 - shift);
3086 return;
3087 }
3088
3089 assert_different_registers(dst, tmp);
3090 assert_different_registers(src, tmp);
3091 assert(shift < 32, "shift amount must be < 32");
3092 srliw(tmp, src, 32 - shift);
3093 slliw(dst, src, shift);
3094 orr(dst, dst, tmp);
3095 }
3096
3097 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
3098 ld(tmp1, adr);
3099 if (src.is_register()) {
3100 orr(tmp1, tmp1, src.as_register());
3101 } else {
3102 if (is_simm12(src.as_constant())) {
3103 ori(tmp1, tmp1, src.as_constant());
3104 } else {
3105 assert_different_registers(tmp1, tmp2);
3106 mv(tmp2, src.as_constant());
3107 orr(tmp1, tmp1, tmp2);
3108 }
3109 }
3110 sd(tmp1, adr);
3111 }
3112
3113 void MacroAssembler::cmp_klass_compressed(Register oop, Register trial_klass, Register tmp, Label &L, bool equal) {
3114 if (UseCompactObjectHeaders) {
3115 load_narrow_klass_compact(tmp, oop);
3116 } else if (UseCompressedClassPointers) {
3117 lwu(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3118 } else {
3119 ld(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3120 }
3121 if (equal) {
3122 beq(trial_klass, tmp, L);
3123 } else {
3124 bne(trial_klass, tmp, L);
3125 }
3126 }
3127
3128 // Move an oop into a register.
3129 void MacroAssembler::movoop(Register dst, jobject obj) {
3130 int oop_index;
3131 if (obj == nullptr) {
3132 oop_index = oop_recorder()->allocate_oop_index(obj);
3133 } else {
3134 #ifdef ASSERT
3135 {
3136 ThreadInVMfromUnknown tiv;
3137 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3138 }
3139 #endif
3140 oop_index = oop_recorder()->find_index(obj);
3141 }
3142 RelocationHolder rspec = oop_Relocation::spec(oop_index);
3143
3144 if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
3145 movptr(dst, Address((address)obj, rspec));
3146 } else {
3147 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3148 ld(dst, Address(dummy, rspec));
3149 }
3150 }
3151
3152 // Move a metadata address into a register.
3153 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3154 assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
3155 int oop_index;
3156 if (obj == nullptr) {
3157 oop_index = oop_recorder()->allocate_metadata_index(obj);
3158 } else {
3159 oop_index = oop_recorder()->find_index(obj);
3160 }
3161 RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3162 movptr(dst, Address((address)obj, rspec));
3163 }
3164
3165 // Writes to successive stack pages until the given offset is reached, in order to
3166 // check for stack overflow + shadow pages. This clobbers tmp.
3167 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
3168 assert_different_registers(tmp, size, t0);
3169 // Bang stack for total size given plus shadow page size.
3170 // Bang one page at a time because large size can bang beyond yellow and
3171 // red zones.
3172 mv(t0, (int)os::vm_page_size());
3173 Label loop;
3174 bind(loop);
3175 sub(tmp, sp, t0);
3176 subw(size, size, t0);
3177 sd(size, Address(tmp));
3178 bgtz(size, loop);
3179
3180 // Bang down shadow pages too.
3181 // At this point, (tmp-0) is the last address touched, so don't
3182 // touch it again. (It was touched as (tmp-pagesize) but then tmp
3183 // was post-decremented.) Skip this address by starting at i=1, and
3184 // touch a few more pages below. N.B. It is important to touch all
3185 // the way down to and including i=StackShadowPages.
3186 for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
3187 // This could be any sized move, but since it can serve as a debugging crumb,
3188 // the bigger the better.
3189 sub(tmp, tmp, (int)os::vm_page_size());
3190 sd(size, Address(tmp, 0));
3191 }
3192 }
3193
3194 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
3195 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3196 ld(dst, Address(xmethod, Method::const_offset()));
3197 ld(dst, Address(dst, ConstMethod::constants_offset()));
3198 ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
3199 ld(dst, Address(dst, mirror_offset));
3200 resolve_oop_handle(dst, tmp1, tmp2);
3201 }
3202
3203 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
3204 // OopHandle::resolve is an indirection.
3205 assert_different_registers(result, tmp1, tmp2);
3206 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
3207 }
3208
3209 // ((WeakHandle)result).resolve()
3210 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
3211 assert_different_registers(result, tmp1, tmp2);
3212 Label resolved;
3213
3214 // A null weak handle resolves to null.
3215 beqz(result, resolved);
3216
3217 // Only 64 bit platforms support GCs that require a tmp register
3218 // Only IN_HEAP loads require a thread_tmp register
3219 // WeakHandle::resolve is an indirection like jweak.
3220 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3221 result, Address(result), tmp1, tmp2);
3222 bind(resolved);
3223 }
3224
3225 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3226 Register dst, Address src,
3227 Register tmp1, Register tmp2) {
3228 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3229 decorators = AccessInternal::decorator_fixup(decorators, type);
3230 bool as_raw = (decorators & AS_RAW) != 0;
3231 if (as_raw) {
3232 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
3233 } else {
3234 bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
3235 }
3236 }
3237
3238 void MacroAssembler::null_check(Register reg, int offset) {
3239 if (needs_explicit_null_check(offset)) {
3240 // provoke OS null exception if reg is null by
3241 // accessing M[reg] w/o changing any registers
3242 // NOTE: this is plenty to provoke a segv
3243 ld(zr, Address(reg, 0));
3244 } else {
3245 // nothing to do, (later) access of M[reg + offset]
3246 // will provoke OS null exception if reg is null
3247 }
3248 }
3249
3250 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3251 Address dst, Register val,
3252 Register tmp1, Register tmp2, Register tmp3) {
3253 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3254 decorators = AccessInternal::decorator_fixup(decorators, type);
3255 bool as_raw = (decorators & AS_RAW) != 0;
3256 if (as_raw) {
3257 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3258 } else {
3259 bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3260 }
3261 }
3262
3263 // Algorithm must match CompressedOops::encode.
3264 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3265 verify_oop_msg(s, "broken oop in encode_heap_oop");
3266 if (CompressedOops::base() == nullptr) {
3267 if (CompressedOops::shift() != 0) {
3268 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3269 srli(d, s, LogMinObjAlignmentInBytes);
3270 } else {
3271 mv(d, s);
3272 }
3273 } else {
3274 Label notNull;
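// A null oop (s == 0) yields a negative difference below and is mapped to the canonical
// narrow null (0); non-null heap oops are assumed to lie at or above the heap base.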
3275 sub(d, s, xheapbase);
3276 bgez(d, notNull);
3277 mv(d, zr);
3278 bind(notNull);
3279 if (CompressedOops::shift() != 0) {
3280 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3281 srli(d, d, CompressedOops::shift());
3282 }
3283 }
3284 }
3285
3286 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3287 #ifdef ASSERT
3288 if (CheckCompressedOops) {
3289 Label ok;
3290 bnez(r, ok);
3291 stop("null oop passed to encode_heap_oop_not_null");
3292 bind(ok);
3293 }
3294 #endif
3295 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
3296 if (CompressedOops::base() != nullptr) {
3297 sub(r, r, xheapbase);
3298 }
3299 if (CompressedOops::shift() != 0) {
3300 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3301 srli(r, r, LogMinObjAlignmentInBytes);
3302 }
3303 }
3304
3305 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3306 #ifdef ASSERT
3307 if (CheckCompressedOops) {
3308 Label ok;
3309 bnez(src, ok);
3310 stop("null oop passed to encode_heap_oop_not_null2");
3311 bind(ok);
3312 }
3313 #endif
3314 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
3315
3316 Register data = src;
3317 if (CompressedOops::base() != nullptr) {
3318 sub(dst, src, xheapbase);
3319 data = dst;
3320 }
3321 if (CompressedOops::shift() != 0) {
3322 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3323 srli(dst, data, LogMinObjAlignmentInBytes);
3324 data = dst;
3325 }
3326 if (data == src) {
3327 mv(dst, src);
3328 }
3329 }
3330
3331 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3332 assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
3333 ld(dst, Address(src, oopDesc::mark_offset_in_bytes()));
3334 srli(dst, dst, markWord::klass_shift);
3335 }
3336
3337 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
3338 assert_different_registers(dst, tmp);
3339 assert_different_registers(src, tmp);
3340 if (UseCompactObjectHeaders) {
3341 load_narrow_klass_compact(dst, src);
3342 decode_klass_not_null(dst, tmp);
3343 } else if (UseCompressedClassPointers) {
3344 lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3345 decode_klass_not_null(dst, tmp);
3346 } else {
3347 ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3348 }
3349 }
3350
3351 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
3352 // FIXME: Should this be a store release? Concurrent GCs assume
3353 // the klass length is valid if the klass field is not null.
3354 assert(!UseCompactObjectHeaders, "not with compact headers");
3355 if (UseCompressedClassPointers) {
3356 encode_klass_not_null(src, tmp);
3357 sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3358 } else {
3359 sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3360 }
3361 }
3362
3363 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3364 assert(!UseCompactObjectHeaders, "not with compact headers");
3365 if (UseCompressedClassPointers) {
3366 // Store to klass gap in destination
3367 sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3368 }
3369 }
3370
3371 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
3372 assert_different_registers(r, tmp);
3373 decode_klass_not_null(r, r, tmp);
3374 }
3375
3376 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
3377 assert(UseCompressedClassPointers, "should only be used for compressed headers");
3378 assert_different_registers(dst, tmp);
3379 assert_different_registers(src, tmp);
3380
3381 if (CompressedKlassPointers::base() == nullptr) {
3382 if (CompressedKlassPointers::shift() != 0) {
3383 slli(dst, src, CompressedKlassPointers::shift());
3384 } else {
3385 mv(dst, src);
3386 }
3387 return;
3388 }
3389
3390 Register xbase = tmp;
3391
3392 mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3393
3394 if (CompressedKlassPointers::shift() != 0) {
3395 // dst = (src << shift) + xbase
3396 shadd(dst, src, xbase, dst /* temporary, dst != xbase */, CompressedKlassPointers::shift());
3397 } else {
3398 add(dst, xbase, src);
3399 }
3400 }
3401
3402 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
3403 assert_different_registers(r, tmp);
3404 encode_klass_not_null(r, r, tmp);
3405 }
3406
3407 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
3408 assert(UseCompressedClassPointers, "should only be used for compressed headers");
3409
3410 if (CompressedKlassPointers::base() == nullptr) {
3411 if (CompressedKlassPointers::shift() != 0) {
3412 srli(dst, src, CompressedKlassPointers::shift());
3413 } else {
3414 mv(dst, src);
3415 }
3416 return;
3417 }
3418
3419 if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
3420 CompressedKlassPointers::shift() == 0) {
3421 zext(dst, src, 32);
3422 return;
3423 }
3424
3425 Register xbase = dst;
3426 if (dst == src) {
3427 xbase = tmp;
3428 }
3429
3430 assert_different_registers(src, xbase);
3431 mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3432 sub(dst, src, xbase);
3433 if (CompressedKlassPointers::shift() != 0) {
3434 srli(dst, dst, CompressedKlassPointers::shift());
3435 }
3436 }
3437
3438 void MacroAssembler::decode_heap_oop_not_null(Register r) {
3439 decode_heap_oop_not_null(r, r);
3440 }
3441
3442 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3443 assert(UseCompressedOops, "should only be used for compressed headers");
3444 assert(Universe::heap() != nullptr, "java heap should be initialized");
3445 // Cannot assert here: the unverified entry point counts instructions (see .ad file),
3446 // and vtableStubs also count instructions in pd_code_size_limit.
3447 // Also do not verify_oop, as this is called by verify_oop.
3448 if (CompressedOops::shift() != 0) {
3449 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3450 slli(dst, src, LogMinObjAlignmentInBytes);
3451 if (CompressedOops::base() != nullptr) {
3452 add(dst, xheapbase, dst);
3453 }
3454 } else {
3455 assert(CompressedOops::base() == nullptr, "sanity");
3456 mv(dst, src);
3457 }
3458 }
3459
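// Null-preserving decode: a zero narrow oop stays null; otherwise
// d = CompressedOops::base() + (s << LogMinObjAlignmentInBytes) (just a shift when the base is null).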
3460 void MacroAssembler::decode_heap_oop(Register d, Register s) {
3461 if (CompressedOops::base() == nullptr) {
3462 if (CompressedOops::shift() != 0 || d != s) {
3463 slli(d, s, CompressedOops::shift());
3464 }
3465 } else {
3466 Label done;
3467 mv(d, s);
3468 beqz(s, done);
3469 shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
3470 bind(done);
3471 }
3472 verify_oop_msg(d, "broken oop in decode_heap_oop");
3473 }
3474
3475 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
3476 Register tmp2, Register tmp3, DecoratorSet decorators) {
3477 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
3478 }
3479
3480 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
3481 Register tmp2, DecoratorSet decorators) {
3482 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
3483 }
3484
3485 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
3486 Register tmp2, DecoratorSet decorators) {
3487 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
3488 }
3489
3490 // Used for storing nulls.
3491 void MacroAssembler::store_heap_oop_null(Address dst) {
3492 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
3493 }
3494
3495 // Look up the method for a megamorphic invokeinterface call.
3496 // The target method is determined by <intf_klass, itable_index>.
3497 // The receiver klass is in recv_klass.
3498 // On success, the result will be in method_result, and execution falls through.
3499 // On failure, execution transfers to the given label.
3500 void MacroAssembler::lookup_interface_method(Register recv_klass,
3501 Register intf_klass,
3502 RegisterOrConstant itable_index,
3503 Register method_result,
3504 Register scan_tmp,
3505 Label& L_no_such_interface,
3506 bool return_method) {
3507 assert_different_registers(recv_klass, intf_klass, scan_tmp);
3508 assert_different_registers(method_result, intf_klass, scan_tmp);
3509 assert(recv_klass != method_result || !return_method,
3510 "recv_klass can be destroyed when method isn't needed");
3511 assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3512 "caller must use same register for non-constant itable index as for method");
3513
3514 // Compute start of first itableOffsetEntry (which is at the end of the vtable).
3515 int vtable_base = in_bytes(Klass::vtable_start_offset());
3516 int itentry_off = in_bytes(itableMethodEntry::method_offset());
3517 int scan_step = itableOffsetEntry::size() * wordSize;
3518 int vte_size = vtableEntry::size_in_bytes();
3519 assert(vte_size == wordSize, "else adjust times_vte_scale");
3520
3521 lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
3522
3523 // Could store the aligned, prescaled offset in the klass.
3524 shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
3525 add(scan_tmp, scan_tmp, vtable_base);
3526
3527 if (return_method) {
3528 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3529 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3530 if (itable_index.is_register()) {
3531 slli(t0, itable_index.as_register(), 3);
3532 } else {
3533 mv(t0, itable_index.as_constant() << 3);
3534 }
3535 add(recv_klass, recv_klass, t0);
3536 if (itentry_off) {
3537 add(recv_klass, recv_klass, itentry_off);
3538 }
3539 }
3540
3541 Label search, found_method;
3542
3543 ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3544 beq(intf_klass, method_result, found_method);
3545 bind(search);
3546 // Check that the previous entry is non-null. A null entry means that
3547 // the receiver class doesn't implement the interface, and wasn't the
3548 // same as when the caller was compiled.
3549 beqz(method_result, L_no_such_interface, /* is_far */ true);
3550 addi(scan_tmp, scan_tmp, scan_step);
3551 ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3552 bne(intf_klass, method_result, search);
3553
3554 bind(found_method);
3555
3556 // Got a hit.
3557 if (return_method) {
3558 lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
3559 add(method_result, recv_klass, scan_tmp);
3560 ld(method_result, Address(method_result));
3561 }
3562 }
3563
3564 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
3565 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
3566 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
3567 // The target method is determined by <holder_klass, itable_index>.
3568 // The receiver klass is in recv_klass.
3569 // On success, the result will be in method_result, and execution falls through.
3570 // On failure, execution transfers to the given label.
3571 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
3572 Register holder_klass,
3573 Register resolved_klass,
3574 Register method_result,
3575 Register temp_itbl_klass,
3576 Register scan_temp,
3577 int itable_index,
3578 Label& L_no_such_interface) {
3579 // 'method_result' is only used as output register at the very end of this method.
3580 // Until then we can reuse it as 'holder_offset'.
3581 Register holder_offset = method_result;
3582 assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
3583
3584 int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
3585 int scan_step = itableOffsetEntry::size() * wordSize;
3586 int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
3587 int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
3588 int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
3589 const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
3590
3591 Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
3592
3593 lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3594 add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
3595 // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
3596 // + sizeof(vtableEntry) * (recv_klass->_vtable_len);
3597 // scan_temp = &(itable[0]._interface)
3598 // temp_itbl_klass = itable[0]._interface;
3599 shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
3600 ld(temp_itbl_klass, Address(scan_temp));
3601 mv(holder_offset, zr);
3602
3603 // Initial checks:
3604 // - if (holder_klass != resolved_klass), go to "scan for resolved"
3605 // - if (itable[0] == holder_klass), shortcut to "holder found"
3606 // - if (itable[0] == 0), no such interface
3607 bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
3608 beq(holder_klass, temp_itbl_klass, L_holder_found);
3609 beqz(temp_itbl_klass, L_no_such_interface);
3610
3611 // Loop: Look for holder_klass record in itable
3612 // do {
3613 // temp_itbl_klass = *(scan_temp += scan_step);
3614 // if (temp_itbl_klass == holder_klass) {
3615 // goto L_holder_found; // Found!
3616 // }
3617 // } while (temp_itbl_klass != 0);
3618 // goto L_no_such_interface // Not found.
3619 Label L_search_holder;
3620 bind(L_search_holder);
3621 add(scan_temp, scan_temp, scan_step);
3622 ld(temp_itbl_klass, Address(scan_temp));
3623 beq(holder_klass, temp_itbl_klass, L_holder_found);
3624 bnez(temp_itbl_klass, L_search_holder);
3625
3626 j(L_no_such_interface);
3627
3628 // Loop: Look for resolved_class record in itable
3629 // while (true) {
3630 // temp_itbl_klass = *(scan_temp += scan_step);
3631 // if (temp_itbl_klass == 0) {
3632 // goto L_no_such_interface;
3633 // }
3634 // if (temp_itbl_klass == resolved_klass) {
3635 // goto L_resolved_found; // Found!
3636 // }
3637 // if (temp_itbl_klass == holder_klass) {
3638 // holder_offset = scan_temp;
3639 // }
3640 // }
3641 //
3642 Label L_loop_search_resolved;
3643 bind(L_loop_search_resolved);
3644 add(scan_temp, scan_temp, scan_step);
3645 ld(temp_itbl_klass, Address(scan_temp));
3646 bind(L_loop_search_resolved_entry);
3647 beqz(temp_itbl_klass, L_no_such_interface);
3648 beq(resolved_klass, temp_itbl_klass, L_resolved_found);
3649 bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
3650 mv(holder_offset, scan_temp);
3651 j(L_loop_search_resolved);
3652
3653 // See if we already have a holder klass. If not, go and scan for it.
3654 bind(L_resolved_found);
3655 beqz(holder_offset, L_search_holder);
3656 mv(scan_temp, holder_offset);
3657
3658 // Finally, scan_temp contains holder_klass vtable offset
3659 bind(L_holder_found);
3660 lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
3661 add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
3662 - vtable_start_offset_bytes - ioffset_bytes); // subtract offsets to restore the original value of recv_klass
3663 add(method_result, recv_klass, method_result);
3664 ld(method_result, Address(method_result));
3665 }
3666
3667 // virtual method calling
3668 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3669 RegisterOrConstant vtable_index,
3670 Register method_result) {
3671 const ByteSize base = Klass::vtable_start_offset();
3672 assert(vtableEntry::size() * wordSize == 8,
3673 "adjust the scaling in the code below");
3674 int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
3675
3676 if (vtable_index.is_register()) {
3677 shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
3678 ld(method_result, Address(method_result, vtable_offset_in_bytes));
3679 } else {
3680 vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
3681 ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
3682 }
3683 }
3684
3685 void MacroAssembler::membar(uint32_t order_constraint) {
3686 if (UseZtso && ((order_constraint & StoreLoad) != StoreLoad)) {
3687 // Under TSO the only reordering the hardware may perform is a store being
3688 // reordered after a later load (StoreLoad). A barrier that does not require
3689 // StoreLoad ordering is therefore a no-op here and can be elided.
3690 BLOCK_COMMENT("elided tso membar");
3691 return;
3692 }
3693
3694 address prev = pc() - MacroAssembler::instruction_size;
3695 address last = code()->last_insn();
3696
3697 if (last != nullptr && is_membar(last) && prev == last) {
3698 // We are merging two memory barrier instructions. On RISCV we
3699 // can do this simply by ORing them together.
3700 set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
3701 BLOCK_COMMENT("merged membar");
3702 return;
3703 }
3704
3705 code()->set_last_insn(pc());
3706 uint32_t predecessor = 0;
3707 uint32_t successor = 0;
3708 membar_mask_to_pred_succ(order_constraint, predecessor, successor);
3709 fence(predecessor, successor);
3710 }
3711
3712 void MacroAssembler::cmodx_fence() {
3713 BLOCK_COMMENT("cmodx fence");
3714 if (VM_Version::supports_fencei_barrier()) {
3715 Assembler::fencei();
3716 }
3717 }
3718
3719 // Form an address from base + byte_offset in Rd. Rd may or may not
3720 // actually be used: you must use the Address that is returned. It
3721 // is up to you to ensure that the byte_offset provided matches the
3722 // size of your data.
3723 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
3724 if (is_simm12(byte_offset)) { // 12: imm in range 2^12
3725 return Address(base, byte_offset);
3726 }
3727
3728 assert_different_registers(Rd, base, noreg);
3729
3730 // Do it the hard way
3731 mv(Rd, byte_offset);
3732 add(Rd, base, Rd);
3733 return Address(Rd);
3734 }
3735
3736 void MacroAssembler::check_klass_subtype(Register sub_klass,
3737 Register super_klass,
3738 Register tmp_reg,
3739 Label& L_success) {
3740 Label L_failure;
3741 check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
3742 check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
3743 bind(L_failure);
3744 }
3745
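// Poll for a safepoint by inspecting the thread-local polling word. At method return the
// polling word also serves as the stack watermark, so we compare it against sp (or fp when
// not in an nmethod); otherwise we test the poll bit directly.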
3746 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod, Register tmp_reg) {
3747 ld(tmp_reg, Address(xthread, JavaThread::polling_word_offset()));
3748 if (at_return) {
3749 bgtu(in_nmethod ? sp : fp, tmp_reg, slow_path, /* is_far */ true);
3750 } else {
3751 test_bit(tmp_reg, tmp_reg, exact_log2(SafepointMechanism::poll_bit()));
3752 bnez(tmp_reg, slow_path, /* is_far */ true);
3753 }
3754 }
3755
3756 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
3757 Label &succeed, Label *fail) {
3758 assert_different_registers(addr, tmp, t0);
3759 assert_different_registers(newv, tmp, t0);
3760 assert_different_registers(oldv, tmp, t0);
3761
3762 // oldv holds comparison value
3763 // newv holds value to write in exchange
3764 // addr identifies memory word to compare against/update
3765 if (UseZacas) {
3766 mv(tmp, oldv);
3767 atomic_cas(tmp, newv, addr, Assembler::int64, Assembler::aq, Assembler::rl);
3768 beq(tmp, oldv, succeed);
3769 } else {
3770 Label retry_load, nope;
3771 bind(retry_load);
3772 // Load reserved from the memory location
3773 load_reserved(tmp, addr, int64, Assembler::aqrl);
3774 // Fail and exit if it is not what we expect
3775 bne(tmp, oldv, nope);
3776 // If the store conditional succeeds, tmp will be zero
3777 store_conditional(tmp, newv, addr, int64, Assembler::rl);
3778 beqz(tmp, succeed);
3779 // Retry only when the store conditional failed
3780 j(retry_load);
3781
3782 bind(nope);
3783 }
3784
3785 // neither amocas nor lr/sc have an implied barrier in the failing case
3786 membar(AnyAny);
3787
3788 mv(oldv, tmp);
3789 if (fail != nullptr) {
3790 j(*fail);
3791 }
3792 }
3793
3794 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
3795 Label &succeed, Label *fail) {
3796 assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
3797 cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
3798 }
3799
3800 void MacroAssembler::load_reserved(Register dst,
3801 Register addr,
3802 Assembler::operand_size size,
3803 Assembler::Aqrl acquire) {
3804 switch (size) {
3805 case int64:
3806 lr_d(dst, addr, acquire);
3807 break;
3808 case int32:
3809 lr_w(dst, addr, acquire);
3810 break;
3811 case uint32:
3812 lr_w(dst, addr, acquire);
3813 zext(dst, dst, 32);
3814 break;
3815 default:
3816 ShouldNotReachHere();
3817 }
3818 }
3819
3820 void MacroAssembler::store_conditional(Register dst,
3821 Register new_val,
3822 Register addr,
3823 Assembler::operand_size size,
3824 Assembler::Aqrl release) {
3825 switch (size) {
3826 case int64:
3827 sc_d(dst, addr, new_val, release);
3828 break;
3829 case int32:
3830 case uint32:
3831 sc_w(dst, addr, new_val, release);
3832 break;
3833 default:
3834 ShouldNotReachHere();
3835 }
3836 }
3837
3838
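// Compute the pieces needed for a sub-word CAS on a 4-byte-aligned word:
//   shift        = (addr & 3) * 8   -- bit position of the narrow value within the word
//   aligned_addr = addr & ~3        -- the enclosing aligned word
//   mask         = (0xff or 0xffff) << shift
// expected and new_val are shifted into position and masked so they can later be merged
// with the untouched bytes of the word.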
3839 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val,
3840 Assembler::operand_size size,
3841 Register shift, Register mask, Register aligned_addr) {
3842 assert(size == int8 || size == int16, "unsupported operand size");
3843
3844 andi(shift, addr, 3);
3845 slli(shift, shift, 3);
3846
3847 andi(aligned_addr, addr, ~3);
3848
3849 if (size == int8) {
3850 mv(mask, 0xff);
3851 } else {
3852 // size == int16 case
3853 mv(mask, -1);
3854 zext(mask, mask, 16);
3855 }
3856 sll(mask, mask, shift);
3857
3858 sll(expected, expected, shift);
3859 andr(expected, expected, mask);
3860
3861 sll(new_val, new_val, shift);
3862 andr(new_val, new_val, mask);
3863 }
3864
3865 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
3866 // It implements compare-and-swap of byte/boolean/char/short values via lr.w/sc.w or amocas.w,
3867 // which must operate on 4-byte-aligned addresses.
3868 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
3869 Register new_val,
3870 Assembler::operand_size size,
3871 Assembler::Aqrl acquire, Assembler::Aqrl release,
3872 Register result, bool result_as_bool,
3873 Register tmp1, Register tmp2, Register tmp3) {
3874 assert(!(UseZacas && UseZabha), "Use amocas");
3875 assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
3876
3877 Register scratch0 = t0, aligned_addr = t1;
3878 Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
3879
3880 cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
3881
3882 Label retry, fail, done;
3883
3884 if (UseZacas) {
3885 lw(result, aligned_addr);
3886
3887 bind(retry); // amocas loads the current value into result
3888 notr(scratch1, mask);
3889
3890 andr(scratch0, result, scratch1); // scratch0 = word - cas bits
3891 orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
3892 bne(result, scratch1, fail); // cas bits differ, cas failed
3893
3894 // result is the same as expected, use as expected value.
3895
3896 // scratch0 is still = word - cas bits
3897 // Or in the new value to create complete new value.
3898 orr(scratch0, scratch0, new_val);
3899
3900 mv(scratch1, result); // save our expected value
3901 atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
3902 bne(scratch1, result, retry);
3903 } else {
3904 notr(scratch1, mask);
3905 bind(retry);
3906
3907 load_reserved(result, aligned_addr, operand_size::int32, acquire);
3908 andr(scratch0, result, mask);
3909 bne(scratch0, expected, fail);
3910
3911 andr(scratch0, result, scratch1); // scratch1 is ~mask
3912 orr(scratch0, scratch0, new_val);
3913 store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
3914 bnez(scratch0, retry);
3915 }
3916
3917 if (result_as_bool) {
3918 mv(result, 1);
3919 j(done);
3920
3921 bind(fail);
3922 mv(result, zr);
3923
3924 bind(done);
3925 } else {
3926 bind(fail);
3927
3928 andr(scratch0, result, mask);
3929 srl(result, scratch0, shift);
3930
3931 if (size == int8) {
3932 sext(result, result, 8);
3933 } else {
3934 // size == int16 case
3935 sext(result, result, 16);
3936 }
3937 }
3938 }
3939
3940 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, used to implement
3941 // weak CAS operations. The major difference is that it simply fails when the store
3942 // conditional fails, instead of retrying.
3943 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
3944 Register new_val,
3945 Assembler::operand_size size,
3946 Assembler::Aqrl acquire, Assembler::Aqrl release,
3947 Register result,
3948 Register tmp1, Register tmp2, Register tmp3) {
3949 assert(!(UseZacas && UseZabha), "Use amocas");
3950 assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
3951
3952 Register scratch0 = t0, aligned_addr = t1;
3953 Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
3954
3955 cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
3956
3957 Label fail, done;
3958
3959 if (UseZacas) {
3960 lw(result, aligned_addr);
3961
3962 notr(scratch1, mask);
3963
3964 andr(scratch0, result, scratch1); // scratch0 = word - cas bits
3965 orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
3966 bne(result, scratch1, fail); // cas bits differ, cas failed
3967
3968 // result is the same as expected, use as expected value.
3969
3970 // scratch0 is still = word - cas bits
3971 // Or in the new value to create complete new value.
3972 orr(scratch0, scratch0, new_val);
3973
3974 mv(scratch1, result); // save our expected value
3975 atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
3976 bne(scratch1, result, fail); // This weak, so just bail-out.
3977 } else {
3978 notr(scratch1, mask);
3979
3980 load_reserved(result, aligned_addr, operand_size::int32, acquire);
3981 andr(scratch0, result, mask);
3982 bne(scratch0, expected, fail);
3983
3984 andr(scratch0, result, scratch1); // scratch1 is ~mask
3985 orr(scratch0, scratch0, new_val);
3986 store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
3987 bnez(scratch0, fail);
3988 }
3989
3990 // Success
3991 mv(result, 1);
3992 j(done);
3993
3994 // Fail
3995 bind(fail);
3996 mv(result, zr);
3997
3998 bind(done);
3999 }
4000
4001 void MacroAssembler::cmpxchg(Register addr, Register expected,
4002 Register new_val,
4003 Assembler::operand_size size,
4004 Assembler::Aqrl acquire, Assembler::Aqrl release,
4005 Register result, bool result_as_bool) {
4006 assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4007 assert_different_registers(addr, t0);
4008 assert_different_registers(expected, t0);
4009 assert_different_registers(new_val, t0);
4010
4011 // NOTE:
4012 // Register _result_ may be the same register as _new_val_ or _expected_.
4013 // Hence do NOT use _result_ until after 'cas'.
4014 //
4015 // Register _expected_ may be the same register as _new_val_ and is assumed to be preserved.
4016 // Hence do NOT change _expected_ or _new_val_.
4017 //
4018 // Having _expected_ and _new_val_ in the same register makes for a very puzzling cas.
4019 //
4020 // TODO: Address these issues.
4021
4022 if (UseZacas) {
4023 if (result_as_bool) {
4024 mv(t0, expected);
4025 atomic_cas(t0, new_val, addr, size, acquire, release);
4026 xorr(t0, t0, expected);
4027 seqz(result, t0);
4028 } else {
4029 mv(t0, expected);
4030 atomic_cas(t0, new_val, addr, size, acquire, release);
4031 mv(result, t0);
4032 }
4033 return;
4034 }
4035
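// LR/SC loop: retry only when the store-conditional fails; a value mismatch
// exits immediately via ne_done.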
4036 Label retry_load, done, ne_done;
4037 bind(retry_load);
4038 load_reserved(t0, addr, size, acquire);
4039 bne(t0, expected, ne_done);
4040 store_conditional(t0, new_val, addr, size, release);
4041 bnez(t0, retry_load);
4042
4043 // equal, succeed
4044 if (result_as_bool) {
4045 mv(result, 1);
4046 } else {
4047 mv(result, expected);
4048 }
4049 j(done);
4050
4051 // not equal, failed
4052 bind(ne_done);
4053 if (result_as_bool) {
4054 mv(result, zr);
4055 } else {
4056 mv(result, t0);
4057 }
4058
4059 bind(done);
4060 }
4061
4062 void MacroAssembler::weak_cmpxchg(Register addr, Register expected,
4063 Register new_val,
4064 Assembler::operand_size size,
4065 Assembler::Aqrl acquire, Assembler::Aqrl release,
4066 Register result) {
4067 assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4068 assert_different_registers(addr, t0);
4069 assert_different_registers(expected, t0);
4070 assert_different_registers(new_val, t0);
4071
4072 if (UseZacas) {
4073 cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
4074 return;
4075 }
4076
4077 Label fail, done;
4078 load_reserved(t0, addr, size, acquire);
4079 bne(t0, expected, fail);
4080 store_conditional(t0, new_val, addr, size, release);
4081 bnez(t0, fail);
4082
4083 // Success
4084 mv(result, 1);
4085 j(done);
4086
4087 // Fail
4088 bind(fail);
4089 mv(result, zr);
4090
4091 bind(done);
4092 }
4093
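// Expands to atomic_add/addw/addal/addalw, implemented with amoadd.d/amoadd.w and the
// requested acquire/release bits; a constant increment is first materialized in t0.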
4094 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE) \
4095 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
4096 prev = prev->is_valid() ? prev : zr; \
4097 if (incr.is_register()) { \
4098 AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE)); \
4099 } else { \
4100 mv(t0, incr.as_constant()); \
4101 AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE)); \
4102 } \
4103 return; \
4104 }
4105
4106 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
4107 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
4108 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
4109 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
4110
4111 #undef ATOMIC_OP
4112
4113 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE) \
4114 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
4115 prev = prev->is_valid() ? prev : zr; \
4116 AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE)); \
4117 return; \
4118 }
4119
4120 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
4121 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
4122 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
4123 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
4124
4125 #undef ATOMIC_XCHG
4126
4127 #define ATOMIC_XCHGU(OP1, OP2) \
4128 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) { \
4129 atomic_##OP2(prev, newv, addr); \
4130 zext(prev, prev, 32); \
4131 return; \
4132 }
4133
4134 ATOMIC_XCHGU(xchgwu, xchgw)
4135 ATOMIC_XCHGU(xchgalwu, xchgalw)
4136
4137 #undef ATOMIC_XCHGU
4138
4139 void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr,
4140 Assembler::operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
4141 switch (size) {
4142 case int64:
4143 amocas_d(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4144 break;
4145 case int32:
4146 amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4147 break;
4148 case uint32:
4149 amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4150 zext(prev, prev, 32);
4151 break;
4152 case int16:
4153 amocas_h(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4154 break;
4155 case int8:
4156 amocas_b(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4157 break;
4158 default:
4159 ShouldNotReachHere();
4160 }
4161 }
4162
4163 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
4164 assert(CodeCache::contains(entry.target()),
4165 "destination of far jump not found in code cache");
4166 assert(entry.rspec().type() == relocInfo::external_word_type
4167 || entry.rspec().type() == relocInfo::runtime_call_type
4168 || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4169 // Fixed length: see MacroAssembler::far_branch_size()
4170 // We can use auipc + jr here because we know that the total size of
4171 // the code cache cannot exceed 2Gb.
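// The 32-bit pc-relative distance is split into an auipc part and a sign-extended
// 12-bit jr offset; the +0x800 compensates for that sign extension.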
4172 relocate(entry.rspec(), [&] {
4173 int64_t distance = entry.target() - pc();
4174 int32_t offset = ((int32_t)distance << 20) >> 20;
4175 assert(is_valid_32bit_offset(distance), "Far jump using wrong instructions.");
4176 auipc(tmp, (int32_t)distance + 0x800);
4177 jr(tmp, offset);
4178 });
4179 }
4180
4181 void MacroAssembler::far_call(const Address &entry, Register tmp) {
4182 assert(tmp != x5, "tmp register must not be x5.");
4183 assert(CodeCache::contains(entry.target()),
4184 "destination of far call not found in code cache");
4185 assert(entry.rspec().type() == relocInfo::external_word_type
4186 || entry.rspec().type() == relocInfo::runtime_call_type
4187 || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4188 // Fixed length: see MacroAssembler::far_branch_size()
4189 // We can use auipc + jalr here because we know that the total size of
4190 // the code cache cannot exceed 2Gb.
4191 relocate(entry.rspec(), [&] {
4192 int64_t distance = entry.target() - pc();
4193 int32_t offset = ((int32_t)distance << 20) >> 20;
4194 assert(is_valid_32bit_offset(distance), "Far call using wrong instructions.");
4195 auipc(tmp, (int32_t)distance + 0x800);
4196 jalr(tmp, offset);
4197 });
4198 }
4199
4200 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4201 Register super_klass,
4202 Register tmp_reg,
4203 Label* L_success,
4204 Label* L_failure,
4205 Label* L_slow_path,
4206 Register super_check_offset) {
4207 assert_different_registers(sub_klass, super_klass, tmp_reg, super_check_offset);
4208 bool must_load_sco = !super_check_offset->is_valid();
4209 if (must_load_sco) {
4210 assert(tmp_reg != noreg, "supply either a temp or a register offset");
4211 }
4212
4213 Label L_fallthrough;
4214 int label_nulls = 0;
4215 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4216 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4217 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
4218 assert(label_nulls <= 1, "at most one null in batch");
4219
4220 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4221 int sco_offset = in_bytes(Klass::super_check_offset_offset());
4222 Address super_check_offset_addr(super_klass, sco_offset);
4223
4224 // Hacked jmp, which may only be used just before L_fallthrough.
4225 #define final_jmp(label) \
4226 if (&(label) == &L_fallthrough) { /*do nothing*/ } \
4227 else j(label) /*omit semi*/
4228
4229 // If the pointers are equal, we are done (e.g., String[] elements).
4230 // This self-check enables sharing of secondary supertype arrays among
4231 // non-primary types such as array-of-interface. Otherwise, each such
4232 // type would need its own customized SSA.
4233 // We move this check to the front of the fast path because many
4234 // type checks are in fact trivially successful in this manner,
4235 // so we get a nicely predicted branch right at the start of the check.
4236 beq(sub_klass, super_klass, *L_success);
4237
4238 // Check the supertype display:
4239 if (must_load_sco) {
4240 lwu(tmp_reg, super_check_offset_addr);
4241 super_check_offset = tmp_reg;
4242 }
4243 add(t0, sub_klass, super_check_offset);
4244 Address super_check_addr(t0);
4245 ld(t0, super_check_addr); // load displayed supertype
4246 beq(super_klass, t0, *L_success);
4247
4248 // This check has worked decisively for primary supers.
4249 // Secondary supers are sought in the super_cache ('super_cache_addr').
4250 // (Secondary supers are interfaces and very deeply nested subtypes.)
4251 // This works in the same check above because of a tricky aliasing
4252 // between the super_cache and the primary super display elements.
4253 // (The 'super_check_addr' can address either, as the case requires.)
4254 // Note that the cache is updated below if it does not help us find
4255 // what we need immediately.
4256 // So if it was a primary super, we can just fail immediately.
4257 // Otherwise, it's the slow path for us (no success at this point).
4258
4259 mv(t1, sc_offset);
4260 if (L_failure == &L_fallthrough) {
4261 beq(super_check_offset, t1, *L_slow_path);
4262 } else {
4263 bne(super_check_offset, t1, *L_failure, /* is_far */ true);
4264 final_jmp(*L_slow_path);
4265 }
4266
4267 bind(L_fallthrough);
4268
4269 #undef final_jmp
4270 }
4271
4272 // Scans 'count' pointer-sized words at [addr] for an occurrence of 'value';
4273 // generic.
4274 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
4275 Register tmp) {
4276 Label Lloop, Lexit;
4277 beqz(count, Lexit);
4278 bind(Lloop);
4279 ld(tmp, addr);
4280 beq(value, tmp, Lexit);
4281 addi(addr, addr, wordSize);
4282 subi(count, count, 1);
4283 bnez(count, Lloop);
4284 bind(Lexit);
4285 }
4286
4287 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
4288 Register super_klass,
4289 Register tmp1_reg,
4290 Register tmp2_reg,
4291 Label* L_success,
4292 Label* L_failure,
4293 bool set_cond_codes) {
4294 assert_different_registers(sub_klass, super_klass, tmp1_reg);
4295 if (tmp2_reg != noreg) {
4296 assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
4297 }
4298 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
4299
4300 Label L_fallthrough;
4301 int label_nulls = 0;
4302 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4303 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4304
4305 assert(label_nulls <= 1, "at most one null in the batch");
4306
4307 // A couple of useful fields in sub_klass:
4308 int ss_offset = in_bytes(Klass::secondary_supers_offset());
4309 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4310 Address secondary_supers_addr(sub_klass, ss_offset);
4311 Address super_cache_addr( sub_klass, sc_offset);
4312
4313 BLOCK_COMMENT("check_klass_subtype_slow_path");
4314
4315 // Do a linear scan of the secondary super-klass chain.
4316 // This code is rarely used, so simplicity is a virtue here.
4317 // The repne_scan helper is used here with fixed registers, which we must spill.
4318 // Don't worry too much about pre-existing connections with the input regs.
4319
4320 assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
4321 assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
4322
4323 RegSet pushed_registers;
4324 if (!IS_A_TEMP(x12)) {
4325 pushed_registers += x12;
4326 }
4327 if (!IS_A_TEMP(x15)) {
4328 pushed_registers += x15;
4329 }
4330
4331 if (super_klass != x10) {
4332 if (!IS_A_TEMP(x10)) {
4333 pushed_registers += x10;
4334 }
4335 }
4336
4337 push_reg(pushed_registers, sp);
4338
4339 // Get super_klass value into x10 (even if it was in x15 or x12)
4340 mv(x10, super_klass);
4341
4342 #ifndef PRODUCT
4343 incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
4344 #endif // PRODUCT
4345
4346 // We will consult the secondary-super array.
4347 ld(x15, secondary_supers_addr);
4348 // Load the array length.
4349 lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
4350 // Skip to start of data.
4351 addi(x15, x15, Array<Klass*>::base_offset_in_bytes());
4352
4353 // Set t0 to an obvious invalid value, falling through by default
4354 mv(t0, -1);
4355 // Scan X12 words at [X15] for an occurrence of X10.
4356 repne_scan(x15, x10, x12, t0);
4357
4358 // pop will restore x10, so we should use a temp register to keep its value
4359 mv(t1, x10);
4360
4361 // Unspill the temp registers:
4362 pop_reg(pushed_registers, sp);
4363
4364 bne(t1, t0, *L_failure);
4365
4366 // Success. Cache the super we found and proceed in triumph.
4367 if (UseSecondarySupersCache) {
4368 sd(super_klass, super_cache_addr);
4369 }
4370
4371 if (L_success != &L_fallthrough) {
4372 j(*L_success);
4373 }
4374
4375 #undef IS_A_TEMP
4376
4377 bind(L_fallthrough);
4378 }
4379
4380 // population_count variant for running without the CPOP
4381 // instruction, which was introduced with Zbb extension.
4382 void MacroAssembler::population_count(Register dst, Register src,
4383 Register tmp1, Register tmp2) {
4384 if (UsePopCountInstruction) {
4385 cpop(dst, src);
4386 } else {
4387 assert_different_registers(src, tmp1, tmp2);
4388 assert_different_registers(dst, tmp1, tmp2);
4389 Label loop, done;
4390
4391 mv(tmp1, src);
4392 // dst = 0;
4393 // while(tmp1 != 0) {
4394 // dst++;
4395 // tmp1 &= (tmp1 - 1);
4396 // }
4397 mv(dst, zr);
4398 beqz(tmp1, done);
4399 {
4400 bind(loop);
4401 addi(dst, dst, 1);
4402 subi(tmp2, tmp1, 1);
4403 andr(tmp1, tmp1, tmp2);
4404 bnez(tmp1, loop);
4405 }
4406 bind(done);
4407 }
4408 }
4409
4410 // If Register r is invalid, take the next register from
4411 // available_regs and add it to regs_to_push.
4412 Register MacroAssembler::allocate_if_noreg(Register r,
4413 RegSetIterator<Register> &available_regs,
4414 RegSet &regs_to_push) {
4415 if (!r->is_valid()) {
4416 r = *available_regs++;
4417 regs_to_push += r;
4418 }
4419 return r;
4420 }
4421
4422 // check_klass_subtype_slow_path_table() looks for super_klass in the
4423 // hash table belonging to sub_klass, branching to L_success or
4424 // L_failure as appropriate. This is essentially a shim which
4425 // allocates registers as necessary then calls
4426 // lookup_secondary_supers_table() to do the work. Any of the tmp
4427 // regs may be noreg, in which case this logic chooses some
4428 // registers and pushes/pops them around the call.
4429 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
4430 Register super_klass,
4431 Register tmp1_reg,
4432 Register tmp2_reg,
4433 Label* L_success,
4434 Label* L_failure,
4435 bool set_cond_codes) {
4436 RegSet tmps = RegSet::of(tmp1_reg, tmp2_reg);
4437
4438 assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg);
4439
4440 Label L_fallthrough;
4441 int label_nulls = 0;
4442 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4443 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4444 assert(label_nulls <= 1, "at most one null in the batch");
4445
4446 BLOCK_COMMENT("check_klass_subtype_slow_path");
4447
4448 RegSet caller_save_regs = RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31);
4449 RegSetIterator<Register> available_regs = (caller_save_regs - tmps - sub_klass - super_klass).begin();
4450
4451 RegSet pushed_regs;
4452
4453 tmp1_reg = allocate_if_noreg(tmp1_reg, available_regs, pushed_regs);
4454 tmp2_reg = allocate_if_noreg(tmp2_reg, available_regs, pushed_regs);
4455
4456 Register tmp3_reg = noreg, tmp4_reg = noreg, result_reg = noreg;
4457
4458 tmp3_reg = allocate_if_noreg(tmp3_reg, available_regs, pushed_regs);
4459 tmp4_reg = allocate_if_noreg(tmp4_reg, available_regs, pushed_regs);
4460 result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
4461
4462 push_reg(pushed_regs, sp);
4463
4464 lookup_secondary_supers_table_var(sub_klass,
4465 super_klass,
4466 result_reg,
4467 tmp1_reg, tmp2_reg, tmp3_reg,
4468 tmp4_reg, nullptr);
4469
4470 // Move the result to t1 as we are about to unspill the tmp registers.
4471 mv(t1, result_reg);
4472
4473 // Unspill the tmp. registers:
4474 pop_reg(pushed_regs, sp);
4475
4476 // NB! Callers may assume that, when set_cond_codes is true, this
4477 // code sets tmp2_reg to a nonzero value.
4478 if (set_cond_codes) {
4479 mv(tmp2_reg, 1);
4480 }
4481
4482 bnez(t1, *L_failure);
4483
4484 if (L_success != &L_fallthrough) {
4485 j(*L_success);
4486 }
4487
4488 bind(L_fallthrough);
4489 }
4490
4491 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4492 Register super_klass,
4493 Register tmp1_reg,
4494 Register tmp2_reg,
4495 Label* L_success,
4496 Label* L_failure,
4497 bool set_cond_codes) {
4498 if (UseSecondarySupersTable) {
4499 check_klass_subtype_slow_path_table
4500 (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4501 } else {
4502 check_klass_subtype_slow_path_linear
4503 (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4504 }
4505 }
4506
4507 // Ensure that the inline code and the stub are using the same registers
4508 // as we need to call the stub from inline code when there is a collision
4509 // in the hashed lookup in the secondary supers array.
4510 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length, \
4511 r_array_index, r_sub_klass, result, r_bitmap) \
4512 do { \
4513 assert(r_super_klass == x10 && \
4514 r_array_base == x11 && \
4515 r_array_length == x12 && \
4516 (r_array_index == x13 || r_array_index == noreg) && \
4517 (r_sub_klass == x14 || r_sub_klass == noreg) && \
4518 (result == x15 || result == noreg) && \
4519 (r_bitmap == x16 || r_bitmap == noreg), "registers must match riscv.ad"); \
4520 } while(0)
4521
4522 bool MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
4523 Register r_super_klass,
4524 Register result,
4525 Register tmp1,
4526 Register tmp2,
4527 Register tmp3,
4528 Register tmp4,
4529 u1 super_klass_slot,
4530 bool stub_is_near) {
4531 assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
4532
4533 Label L_fallthrough;
4534
4535 BLOCK_COMMENT("lookup_secondary_supers_table {");
4536
4537 const Register
4538 r_array_base = tmp1, // x11
4539 r_array_length = tmp2, // x12
4540 r_array_index = tmp3, // x13
4541 r_bitmap = tmp4; // x16
4542
4543 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4544 r_array_index, r_sub_klass, result, r_bitmap);
4545
4546 u1 bit = super_klass_slot;
4547
4548 // Initialize result value to 1 which means mismatch.
4549 mv(result, 1);
4550
4551 ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4552
4553 // First check the bitmap to see if super_klass might be present. If
4554 // the bit is zero, we are certain that super_klass is not one of
4555 // the secondary supers.
4556 test_bit(t0, r_bitmap, bit);
4557 beqz(t0, L_fallthrough);
4558
4559 // Get the first array index that can contain super_klass into r_array_index.
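// Shifting the bitmap left so that 'bit' becomes the top bit and then taking the
// population count yields the number of set bits at or below 'bit', i.e. the 1-based
// index of this slot's entry in the packed secondary-supers array.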
4560 if (bit != 0) {
4561 slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - bit));
4562 population_count(r_array_index, r_array_index, tmp1, tmp2);
4563 } else {
4564 mv(r_array_index, (u1)1);
4565 }
4566
4567 // We will consult the secondary-super array.
4568 ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4569
4570 // The value i in r_array_index is >= 1, so even though r_array_base
4571 // points to the length, we don't need to adjust it to point to the data.
4572 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4573 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4574
4575 shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
4576 ld(result, Address(result));
4577 xorr(result, result, r_super_klass);
4578 beqz(result, L_fallthrough); // Found a match
4579
4580 // Is there another entry to check? Consult the bitmap.
4581 test_bit(t0, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
4582 beqz(t0, L_fallthrough);
4583
4584 // Linear probe.
4585 if (bit != 0) {
4586 ror(r_bitmap, r_bitmap, bit);
4587 }
4588
4589 // The slot we just inspected is at secondary_supers[r_array_index - 1].
4590 // The next slot to be inspected, by the stub we're about to call,
4591 // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
4592 // have been checked.
4593 rt_call(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
4594
4595 BLOCK_COMMENT("} lookup_secondary_supers_table");
4596
4597 bind(L_fallthrough);
4598
4599 if (VerifySecondarySupers) {
4600 verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
4601 result, tmp1, tmp2, tmp3); // x15, x11, x12, x13
4602 }
4603 return true;
4604 }
4605
4606 // At runtime, return 0 in result if r_super_klass is a superclass of
4607 // r_sub_klass, otherwise return nonzero. Use this version of
4608 // lookup_secondary_supers_table() if you don't know ahead of time
4609 // which superclass will be searched for. Used by interpreter and
4610 // runtime stubs. It is larger and has somewhat greater latency than
4611 // the version above, which takes a constant super_klass_slot.
4612 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
4613 Register r_super_klass,
4614 Register result,
4615 Register tmp1,
4616 Register tmp2,
4617 Register tmp3,
4618 Register tmp4,
4619 Label *L_success) {
4620 assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
4621
4622 Label L_fallthrough;
4623
4624 BLOCK_COMMENT("lookup_secondary_supers_table {");
4625
4626 const Register
4627 r_array_index = tmp3,
4628 r_bitmap = tmp4,
4629 slot = t1;
4630
4631 lbu(slot, Address(r_super_klass, Klass::hash_slot_offset()));
4632
4633 // Make sure that result is nonzero if the test below misses.
4634 mv(result, 1);
4635
4636 ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4637
4638 // First check the bitmap to see if super_klass might be present. If
4639 // the bit is zero, we are certain that super_klass is not one of
4640 // the secondary supers.
4641
4642 // This next instruction is equivalent to:
4643 // mv(tmp_reg, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
4644 // sub(r_array_index, slot, tmp_reg);
4645 xori(r_array_index, slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
4646 sll(r_array_index, r_bitmap, r_array_index);
4647 test_bit(t0, r_array_index, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1);
4648 beqz(t0, L_fallthrough);
4649
4650 // Get the first array index that can contain super_klass into r_array_index.
4651 population_count(r_array_index, r_array_index, tmp1, tmp2);
4652
4653 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
4654
4655 const Register
4656 r_array_base = tmp1,
4657 r_array_length = tmp2;
4658
4659 // The value i in r_array_index is >= 1, so even though r_array_base
4660 // points to the length, we don't need to adjust it to point to the data.
4661 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4662 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4663
4664 // We will consult the secondary-super array.
4665 ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4666
4667 shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
4668 ld(result, Address(result));
4669 xorr(result, result, r_super_klass);
4670 beqz(result, L_success ? *L_success : L_fallthrough); // Found a match
4671
4672 // Is there another entry to check? Consult the bitmap.
4673 ror(r_bitmap, r_bitmap, slot);
4674 test_bit(t0, r_bitmap, 1);
4675 beqz(t0, L_fallthrough);
4676
4677 // The slot we just inspected is at secondary_supers[r_array_index - 1].
4678 // The next slot to be inspected, by the logic we're about to call,
4679 // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
4680 // have been checked.
4681 lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
4682 r_bitmap, result, r_array_length, false /*is_stub*/);
4683
4684 BLOCK_COMMENT("} lookup_secondary_supers_table");
4685
4686 bind(L_fallthrough);
4687
4688 if (VerifySecondarySupers) {
4689 verify_secondary_supers_table(r_sub_klass, r_super_klass,
4690 result, tmp1, tmp2, tmp3);
4691 }
4692
4693 if (L_success) {
4694 beqz(result, *L_success);
4695 }
4696 }
4697
4698 // Called by code generated by check_klass_subtype_slow_path
4699 // above. This is called when there is a collision in the hashed
4700 // lookup in the secondary supers array.
4701 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
4702 Register r_array_base,
4703 Register r_array_index,
4704 Register r_bitmap,
4705 Register result,
4706 Register tmp,
4707 bool is_stub) {
4708 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp, result, t0);
4709
4710 const Register
4711 r_array_length = tmp,
4712 r_sub_klass = noreg; // unused
4713
4714 if (is_stub) {
4715 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4716 r_array_index, r_sub_klass, result, r_bitmap);
4717 }
4718
4719 Label L_matched, L_fallthrough, L_bitmap_full;
4720
4721 // Initialize result value to 1 which means mismatch.
4722 mv(result, 1);
4723
4724 // Load the array length.
4725 lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4726 // And adjust the array base to point to the data.
4727 // NB! Effectively increments current slot index by 1.
4728 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
4729 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
4730
4731 // Check if bitmap is SECONDARY_SUPERS_BITMAP_FULL
4732 assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
4733 subw(t0, r_array_length, Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
4734 bgtz(t0, L_bitmap_full);
4735
4736 // NB! Our caller has checked bits 0 and 1 in the bitmap. The
4737 // current slot (at secondary_supers[r_array_index]) has not yet
4738 // been inspected, and r_array_index may be out of bounds if we
4739 // wrapped around the end of the array.
4740
4741 { // This is conventional linear probing, but instead of terminating
4742 // when a null entry is found in the table, we maintain a bitmap
4743 // in which a 0 indicates missing entries.
4744 // As long as the bitmap is not completely full,
4745 // array_length == popcount(bitmap). The array_length check above
4746 // guarantees there are 0s in the bitmap, so the loop eventually
4747 // terminates.
4748 Label L_loop;
4749 bind(L_loop);
4750
4751 // Check for wraparound.
4752 Label skip;
4753 blt(r_array_index, r_array_length, skip);
4754 mv(r_array_index, zr);
4755 bind(skip);
4756
4757 shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
4758 ld(t0, Address(t0));
4759 beq(t0, r_super_klass, L_matched);
4760
4761 test_bit(t0, r_bitmap, 2); // look-ahead check (Bit 2); result is non-zero
4762 beqz(t0, L_fallthrough);
4763
4764 ror(r_bitmap, r_bitmap, 1);
4765 addi(r_array_index, r_array_index, 1);
4766 j(L_loop);
4767 }
4768
4769 { // Degenerate case: more than 64 secondary supers.
4770 // FIXME: We could do something smarter here, maybe a vectorized
4771 // comparison or a binary search, but is that worth any added
4772 // complexity?
4773 bind(L_bitmap_full);
4774 repne_scan(r_array_base, r_super_klass, r_array_length, t0);
4775 bne(r_super_klass, t0, L_fallthrough);
4776 }
4777
4778 bind(L_matched);
4779 mv(result, zr);
4780
4781 bind(L_fallthrough);
4782 }
4783
4784 // Make sure that the hashed lookup and a linear scan agree.
4785 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
4786 Register r_super_klass,
4787 Register result,
4788 Register tmp1,
4789 Register tmp2,
4790 Register tmp3) {
4791 assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0, t1);
4792
4793 const Register
4794 r_array_base = tmp1, // X11
4795 r_array_length = tmp2, // X12
4796 r_array_index = noreg, // unused
4797 r_bitmap = noreg; // unused
4798
4799 BLOCK_COMMENT("verify_secondary_supers_table {");
4800
4801 // We will consult the secondary-super array.
4802 ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4803
4804 // Load the array length.
4805 lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4806 // And adjust the array base to point to the data.
4807 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
4808
4809 repne_scan(r_array_base, r_super_klass, r_array_length, t0);
4810 Label failed;
4811 mv(tmp3, 1);
4812 bne(r_super_klass, t0, failed);
4813 mv(tmp3, zr);
4814 bind(failed);
4815
4816 snez(result, result); // normalize result to 0/1 for comparison
4817
4818 Label passed;
4819 beq(tmp3, result, passed);
4820 {
4821 mv(x10, r_super_klass);
4822 mv(x11, r_sub_klass);
4823 mv(x12, tmp3);
4824 mv(x13, result);
4825 mv(x14, (address)("mismatch"));
4826 rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
4827 should_not_reach_here();
4828 }
4829 bind(passed);
4830
4831 BLOCK_COMMENT("} verify_secondary_supers_table");
4832 }
4833
4834 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
4835 void MacroAssembler::tlab_allocate(Register obj,
4836 Register var_size_in_bytes,
4837 int con_size_in_bytes,
4838 Register tmp1,
4839 Register tmp2,
4840 Label& slow_case,
4841 bool is_far) {
4842 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4843 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
4844 }
4845
4846 // get_thread() can be called anywhere inside generated code so we
4847 // need to save whatever non-callee save context might get clobbered
4848 // by the call to Thread::current() or, indeed, the call setup code.
4849 void MacroAssembler::get_thread(Register thread) {
4850 // save all call-clobbered regs except thread
4851 RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
4852 RegSet::range(x28, x31) + ra - thread;
4853 push_reg(saved_regs, sp);
4854
4855 mv(t1, CAST_FROM_FN_PTR(address, Thread::current));
4856 jalr(t1);
4857 if (thread != c_rarg0) {
4858 mv(thread, c_rarg0);
4859 }
4860
4861 // restore pushed registers
4862 pop_reg(saved_regs, sp);
4863 }
4864
4865 void MacroAssembler::load_byte_map_base(Register reg) {
4866 CardTable::CardValue* byte_map_base =
4867 ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4868 mv(reg, (uint64_t)byte_map_base);
4869 }
4870
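// Frame layout produced by build_frame (and undone by remove_frame):
//   sp + framesize - wordSize     : saved ra
//   sp + framesize - 2 * wordSize : saved fp
// With PreserveFramePointer, fp is set to the incoming sp (sp + framesize).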
4871 void MacroAssembler::build_frame(int framesize) {
4872 assert(framesize >= 2, "framesize must include space for FP/RA");
4873 assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4874 sub(sp, sp, framesize);
4875 sd(fp, Address(sp, framesize - 2 * wordSize));
4876 sd(ra, Address(sp, framesize - wordSize));
4877 if (PreserveFramePointer) { add(fp, sp, framesize); }
4878 }
4879
4880 void MacroAssembler::remove_frame(int framesize) {
4881 assert(framesize >= 2, "framesize must include space for FP/RA");
4882 assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4883 ld(fp, Address(sp, framesize - 2 * wordSize));
4884 ld(ra, Address(sp, framesize - wordSize));
4885 add(sp, sp, framesize);
4886 }
4887
4888 void MacroAssembler::reserved_stack_check() {
4889 // testing if reserved zone needs to be enabled
4890 Label no_reserved_zone_enabling;
4891
4892 ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
4893 bltu(sp, t0, no_reserved_zone_enabling);
4894
4895 enter(); // RA and FP are live.
4896 mv(c_rarg0, xthread);
4897 rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
4898 leave();
4899
4900 // We have already removed our own frame.
4901 // throw_delayed_StackOverflowError will think that it's been
4902 // called by our caller.
4903 j(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
4904 should_not_reach_here();
4905
4906 bind(no_reserved_zone_enabling);
4907 }
4908
4909 // Move the address of the polling page into dest.
4910 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
4911 ld(dest, Address(xthread, JavaThread::polling_page_offset()));
4912 }
4913
4914 // Read the polling page. The address of the polling page must
4915 // already be in r.
4916 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
4917 relocate(rtype, [&] {
4918 lwu(zr, Address(r, offset));
4919 });
4920 }
4921
4922 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4923 #ifdef ASSERT
4924 {
4925 ThreadInVMfromUnknown tiv;
4926 assert (UseCompressedOops, "should only be used for compressed oops");
4927 assert (Universe::heap() != nullptr, "java heap should be initialized");
4928 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4929 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4930 }
4931 #endif
4932 int oop_index = oop_recorder()->find_index(obj);
4933 relocate(oop_Relocation::spec(oop_index), [&] {
4934 li32(dst, 0xDEADBEEF);
4935 });
4936 zext(dst, dst, 32);
4937 }
4938
4939 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4940 assert (UseCompressedClassPointers, "should only be used for compressed headers");
4941 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4942 int index = oop_recorder()->find_index(k);
4943 assert(!Universe::heap()->is_in(k), "should not be an oop");
4944
4945 narrowKlass nk = CompressedKlassPointers::encode(k);
4946 relocate(metadata_Relocation::spec(index), [&] {
4947 li32(dst, nk);
4948 });
4949 zext(dst, dst, 32);
4950 }
4951
4952 address MacroAssembler::reloc_call(Address entry, Register tmp) {
4953 assert(entry.rspec().type() == relocInfo::runtime_call_type ||
4954 entry.rspec().type() == relocInfo::opt_virtual_call_type ||
4955 entry.rspec().type() == relocInfo::static_call_type ||
4956 entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
4957
4958 address target = entry.target();
4959
4960 if (!in_scratch_emit_size()) {
4961 address stub = emit_reloc_call_address_stub(offset(), target);
4962 if (stub == nullptr) {
4963 postcond(pc() == badAddress);
4964 return nullptr; // CodeCache is full
4965 }
4966 }
4967
4968 address call_pc = pc();
4969 #ifdef ASSERT
4970 if (entry.rspec().type() != relocInfo::runtime_call_type) {
4971 assert_alignment(call_pc);
4972 }
4973 #endif
4974
4975 // The relocation created while emitting the stub will ensure this
4976 // call instruction is subsequently patched to call the stub.
4977 relocate(entry.rspec(), [&] {
4978 auipc(tmp, 0);
4979 ld(tmp, Address(tmp, 0));
4980 jalr(tmp);
4981 });
4982
4983 postcond(pc() != badAddress);
4984 return call_pc;
4985 }
4986
4987 address MacroAssembler::ic_call(address entry, jint method_index) {
4988 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
4989 assert(!in_compressible_scope(), "Must be");
4990 movptr(t0, (address)Universe::non_oop_word(), t1);
4991 assert_cond(entry != nullptr);
4992 return reloc_call(Address(entry, rh));
4993 }
4994
4995 int MacroAssembler::ic_check_size() {
4996   // No compressed instructions; ic_check() is emitted in an IncompressibleScope.
4997 return (MacroAssembler::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
4998 far_branch_size() + (UseCompactObjectHeaders ? MacroAssembler::instruction_size * 1 : 0);
4999 }
5000
5001 int MacroAssembler::ic_check(int end_alignment) {
5002 IncompressibleScope scope(this);
5003 Register receiver = j_rarg0;
5004 Register data = t0;
5005
5006 Register tmp1 = t1; // scratch
5007   // t2 is saved by the caller across calls, so it should already have been saved before this check.
5008   // Hence we can clobber it here.
5009 Register tmp2 = t2;
5010
5011 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
5012 // before the inline cache check, so we don't have to execute any nop instructions when dispatching
5013 // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
5014   // before the inline cache check here, and not after.
5015 align(end_alignment, ic_check_size());
5016 int uep_offset = offset();
5017
5018 if (UseCompactObjectHeaders) {
5019 load_narrow_klass_compact(tmp1, receiver);
5020 lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5021 } else if (UseCompressedClassPointers) {
5022 lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
5023 lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5024 } else {
5025 ld(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
5026 ld(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5027 }
5028
5029 Label ic_hit;
5030 beq(tmp1, tmp2, ic_hit);
5031   // Note, far_jump is not fixed size.
5032   // If this ever generates a movptr, the alignment/size will be off.
5033 far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
5034 bind(ic_hit);
5035
5036 assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
5037 return uep_offset;
5038 }
5039
5040 // Emit an address stub for a call to a target which is too far away.
5041 // Note that we only put the target address of the call in the stub.
5042 //
5043 // code sequences:
5044 //
5045 // call-site:
5046 // load target address from stub
5047 // jump-and-link target address
5048 //
5049 // Related address stub for this call site in the stub section:
5050 // alignment nop
5051 // target address
5052
5053 address MacroAssembler::emit_reloc_call_address_stub(int insts_call_instruction_offset, address dest) {
5054 address stub = start_a_stub(max_reloc_call_address_stub_size());
5055 if (stub == nullptr) {
5056 return nullptr; // CodeBuffer::expand failed
5057 }
5058
5059 // We are always 4-byte aligned here.
5060 assert_alignment(pc());
5061
5062   // Make sure the address of the destination is 8-byte aligned.
5063 align(wordSize, 0);
5064
5065 RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
5066 insts_call_instruction_offset);
5067 const int stub_start_offset = offset();
5068 relocate(rh, [&] {
5069 assert(offset() - stub_start_offset == 0,
5070 "%ld - %ld == %ld : should be", (long)offset(), (long)stub_start_offset, (long)0);
5071 assert(offset() % wordSize == 0, "bad alignment");
5072 emit_int64((int64_t)dest);
5073 });
5074
5075 const address stub_start_addr = addr_at(stub_start_offset);
5076 end_a_stub();
5077
5078 return stub_start_addr;
5079 }
5080
5081 int MacroAssembler::max_reloc_call_address_stub_size() {
5082 // Max stub size: alignment nop, target address.
5083 return 1 * MacroAssembler::instruction_size + wordSize;
5084 }
5085
5086 int MacroAssembler::static_call_stub_size() {
5087 // (lui, addi, slli, addi, slli, addi) + (lui + lui + slli + add) + jalr
5088 return 11 * MacroAssembler::instruction_size;
5089 }
5090
5091 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
5092 switch (dst.getMode()) {
5093 case Address::base_plus_offset:
5094 // This is the expected mode, although we allow all the other
5095 // forms below.
5096 return form_address(tmp, dst.base(), dst.offset());
5097 default:
5098 la(tmp, dst);
5099 return Address(tmp);
5100 }
5101 }
5102
5103 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5104 assert(((dst.getMode() == Address::base_plus_offset &&
5105 is_simm12(dst.offset())) || is_simm12(value)),
5106 "invalid value and address mode combination");
5107 Address adr = add_memory_helper(dst, tmp2);
5108 assert(!adr.uses(tmp1), "invalid dst for address increment");
5109 ld(tmp1, adr);
5110 add(tmp1, tmp1, value, tmp2);
5111 sd(tmp1, adr);
5112 }
5113
5114 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5115 assert(((dst.getMode() == Address::base_plus_offset &&
5116 is_simm12(dst.offset())) || is_simm12(value)),
5117 "invalid value and address mode combination");
5118 Address adr = add_memory_helper(dst, tmp2);
5119 assert(!adr.uses(tmp1), "invalid dst for address increment");
5120 lwu(tmp1, adr);
5121 addw(tmp1, tmp1, value, tmp2);
5122 sw(tmp1, adr);
5123 }
5124
5125 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5126 assert(((dst.getMode() == Address::base_plus_offset &&
5127 is_simm12(dst.offset())) || is_simm12(value)),
5128 "invalid value and address mode combination");
5129 Address adr = add_memory_helper(dst, tmp2);
5130 assert(!adr.uses(tmp1), "invalid dst for address decrement");
5131 ld(tmp1, adr);
5132 sub(tmp1, tmp1, value, tmp2);
5133 sd(tmp1, adr);
5134 }
5135
5136 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5137 assert(((dst.getMode() == Address::base_plus_offset &&
5138 is_simm12(dst.offset())) || is_simm12(value)),
5139 "invalid value and address mode combination");
5140 Address adr = add_memory_helper(dst, tmp2);
5141 assert(!adr.uses(tmp1), "invalid dst for address decrement");
5142 lwu(tmp1, adr);
5143 subw(tmp1, tmp1, value, tmp2);
5144 sw(tmp1, adr);
5145 }
5146
5147 void MacroAssembler::cmpptr(Register src1, const Address &src2, Label& equal, Register tmp) {
5148 assert_different_registers(src1, tmp);
5149 assert(src2.getMode() == Address::literal, "must be applied to a literal address");
5150 ld(tmp, src2);
5151 beq(src1, tmp, equal);
5152 }
5153
5154 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
5155 load_method_holder(result, method);
5156 ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
5157 }
5158
5159 void MacroAssembler::load_method_holder(Register holder, Register method) {
5160 ld(holder, Address(method, Method::const_offset())); // ConstMethod*
5161 ld(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool*
5162 ld(holder, Address(holder, ConstantPool::pool_holder_offset())); // InstanceKlass*
5163 }
5164
5165 // string indexof
5166 // compute index by trailing zeros
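// For example, in the Latin1 case: if the lowest set match bit left by
// compute_match_mask() is bit 23 of match_mask, then trailing_zeros = 23,
// tmp = 23 >> LogBitsPerByte = 2, so haystack advances by 2 bytes and
// result is incremented by 2 (the match is the third char of this word).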
5167 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
5168 Register match_mask, Register result,
5169 Register ch2, Register tmp,
5170 bool haystack_isL) {
5171 int haystack_chr_shift = haystack_isL ? 0 : 1;
5172 srl(match_mask, match_mask, trailing_zeros);
5173 srli(match_mask, match_mask, 1);
5174 srli(tmp, trailing_zeros, LogBitsPerByte);
5175 if (!haystack_isL) andi(tmp, tmp, 0xE);
5176 add(haystack, haystack, tmp);
5177 ld(ch2, Address(haystack));
5178 if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
5179 add(result, result, tmp);
5180 }
5181
5182 // string indexof
5183 // Find the pattern element in src and compute the match mask;
5184 // only the first occurrence of 0x80/0x8000 at the low bits is the valid match index.
5185 // The match mask patterns and corresponding indices look like:
5186 // - 0x8080808080808080 (Latin1)
5187 // - 7 6 5 4 3 2 1 0 (match index)
5188 // - 0x8000800080008000 (UTF16)
5189 // - 3 2 1 0 (match index)
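// A small worked sketch, assuming the Latin1 masks mask1 = 0x0101010101010101
// and mask2 = 0x7f7f7f7f7f7f7f7f: after the xor, v = src ^ pattern has a 0x00
// byte exactly where a char matches, and the sequence below computes
//   match_mask = (v - mask1) & ~(v | mask2)
// which sets 0x80 in each matching byte. Bytes above a match can also pick up
// a spurious 0x80 from the borrow, which is why only the lowest set 0x80/0x8000
// is a valid match index.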
5190 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
5191 Register mask1, Register mask2) {
5192 xorr(src, pattern, src);
5193 sub(match_mask, src, mask1);
5194 orr(src, src, mask2);
5195 notr(src, src);
5196 andr(match_mask, match_mask, src);
5197 }
5198
5199 #ifdef COMPILER2
5200 // Code for BigInteger::mulAdd intrinsic
5201 // out = x10
5202 // in = x11
5203 // offset = x12 (already out.length-offset)
5204 // len = x13
5205 // k = x14
5206 // tmp = x28
5207 //
5208 // pseudo code from java implementation:
5209 // long kLong = k & LONG_MASK;
5210 // carry = 0;
5211 // offset = out.length-offset - 1;
5212 // for (int j = len - 1; j >= 0; j--) {
5213 // product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
5214 // out[offset--] = (int)product;
5215 // carry = product >>> 32;
5216 // }
5217 // return (int)carry;
5218 void MacroAssembler::mul_add(Register out, Register in, Register offset,
5219 Register len, Register k, Register tmp) {
5220 Label L_tail_loop, L_unroll, L_end;
5221 mv(tmp, out);
5222 mv(out, zr);
5223 blez(len, L_end);
5224 zext(k, k, 32);
5225 slliw(t0, offset, LogBytesPerInt);
5226 add(offset, tmp, t0);
5227 slliw(t0, len, LogBytesPerInt);
5228 add(in, in, t0);
5229
5230 const int unroll = 8;
5231 mv(tmp, unroll);
5232 blt(len, tmp, L_tail_loop);
5233 bind(L_unroll);
5234 for (int i = 0; i < unroll; i++) {
5235 subi(in, in, BytesPerInt);
5236 lwu(t0, Address(in, 0));
5237 mul(t1, t0, k);
5238 add(t0, t1, out);
5239 subi(offset, offset, BytesPerInt);
5240 lwu(t1, Address(offset, 0));
5241 add(t0, t0, t1);
5242 sw(t0, Address(offset, 0));
5243 srli(out, t0, 32);
5244 }
5245 subw(len, len, tmp);
5246 bge(len, tmp, L_unroll);
5247
5248 bind(L_tail_loop);
5249 blez(len, L_end);
5250 subi(in, in, BytesPerInt);
5251 lwu(t0, Address(in, 0));
5252 mul(t1, t0, k);
5253 add(t0, t1, out);
5254 subi(offset, offset, BytesPerInt);
5255 lwu(t1, Address(offset, 0));
5256 add(t0, t0, t1);
5257 sw(t0, Address(offset, 0));
5258 srli(out, t0, 32);
5259 subiw(len, len, 1);
5260 j(L_tail_loop);
5261
5262 bind(L_end);
5263 }
5264
5265 // Multiply and multiply-accumulate unsigned 64-bit registers.
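// wide_mul:  prod_hi:prod_lo  = n * m   (full 128-bit unsigned product)
// wide_madd: sum_hi:sum_lo   += n * m   (128-bit accumulate)
// For example, n = 0xFFFFFFFFFFFFFFFF and m = 2 yield prod_lo = 0xFFFFFFFFFFFFFFFE
// and prod_hi = 1.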
5266 void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
5267 assert_different_registers(prod_lo, prod_hi);
5268
5269 mul(prod_lo, n, m);
5270 mulhu(prod_hi, n, m);
5271 }
5272
5273 void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
5274 Register m, Register tmp1, Register tmp2) {
5275 assert_different_registers(sum_lo, sum_hi);
5276 assert_different_registers(sum_hi, tmp2);
5277
5278 wide_mul(tmp1, tmp2, n, m);
5279 cad(sum_lo, sum_lo, tmp1, tmp1); // Add tmp1 to sum_lo with carry output to tmp1
5280 adc(sum_hi, sum_hi, tmp2, tmp1); // Add tmp2 with carry to sum_hi
5281 }
5282
5283 // Add two unsigned inputs and output the carry.
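// For example, src1 = 0xFFFFFFFFFFFFFFFF and src2 = 1 give dst = 0 and carry = 1;
// the carry is derived from the unsigned comparison dst < src2.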
5284 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
5285 {
5286 assert_different_registers(dst, carry);
5287 assert_different_registers(dst, src2);
5288 add(dst, src1, src2);
5289 sltu(carry, dst, src2);
5290 }
5291
5292 // Add two inputs with carry.
5293 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
5294 assert_different_registers(dst, carry);
5295 add(dst, src1, src2);
5296 add(dst, dst, carry);
5297 }
5298
5299 // Add two unsigned inputs with carry and output the carry.
5300 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
5301 assert_different_registers(dst, src2);
5302 adc(dst, src1, src2, carry);
5303 sltu(carry, dst, src2);
5304 }
5305
5306 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
5307 Register src1, Register src2, Register carry) {
5308 cad(dest_lo, dest_lo, src1, carry);
5309 add(dest_hi, dest_hi, carry);
5310 cad(dest_lo, dest_lo, src2, carry);
5311 add(final_dest_hi, dest_hi, carry);
5312 }
5313
5314 /**
5315  * Multiply 64 bit by 64 bit, first loop.
5316 */
5317 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5318 Register y, Register y_idx, Register z,
5319 Register carry, Register product,
5320 Register idx, Register kdx) {
5321 //
5322 // jlong carry, x[], y[], z[];
5323 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5324 // huge_128 product = y[idx] * x[xstart] + carry;
5325 // z[kdx] = (jlong)product;
5326 // carry = (jlong)(product >>> 64);
5327 // }
5328 // z[xstart] = carry;
5329 //
5330
5331 Label L_first_loop, L_first_loop_exit;
5332 Label L_one_x, L_one_y, L_multiply;
5333
5334 subiw(xstart, xstart, 1);
5335 bltz(xstart, L_one_x);
5336
5337 shadd(t0, xstart, x, t0, LogBytesPerInt);
5338 ld(x_xstart, Address(t0, 0));
5339 ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
5340
5341 bind(L_first_loop);
5342 subiw(idx, idx, 1);
5343 bltz(idx, L_first_loop_exit);
5344 subiw(idx, idx, 1);
5345 bltz(idx, L_one_y);
5346
5347 shadd(t0, idx, y, t0, LogBytesPerInt);
5348 ld(y_idx, Address(t0, 0));
5349 ror(y_idx, y_idx, 32); // convert big-endian to little-endian
5350 bind(L_multiply);
5351
5352 mulhu(t0, x_xstart, y_idx);
5353 mul(product, x_xstart, y_idx);
5354 cad(product, product, carry, t1);
5355 adc(carry, t0, zr, t1);
5356
5357 subiw(kdx, kdx, 2);
5358 ror(product, product, 32); // back to big-endian
5359 shadd(t0, kdx, z, t0, LogBytesPerInt);
5360 sd(product, Address(t0, 0));
5361
5362 j(L_first_loop);
5363
5364 bind(L_one_y);
5365 lwu(y_idx, Address(y, 0));
5366 j(L_multiply);
5367
5368 bind(L_one_x);
5369 lwu(x_xstart, Address(x, 0));
5370 j(L_first_loop);
5371
5372 bind(L_first_loop_exit);
5373 }
5374
5375 /**
5376 * Multiply 128 bit by 128 bit. Unrolled inner loop.
5377 *
5378 */
5379 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
5380 Register carry, Register carry2,
5381 Register idx, Register jdx,
5382 Register yz_idx1, Register yz_idx2,
5383 Register tmp, Register tmp3, Register tmp4,
5384 Register tmp6, Register product_hi) {
5385 // jlong carry, x[], y[], z[];
5386 // int kdx = xstart+1;
5387 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5388 // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
5389 // jlong carry2 = (jlong)(tmp3 >>> 64);
5390 // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2;
5391 // carry = (jlong)(tmp4 >>> 64);
5392 // z[kdx+idx+1] = (jlong)tmp3;
5393 // z[kdx+idx] = (jlong)tmp4;
5394 // }
5395 // idx += 2;
5396 // if (idx > 0) {
5397 // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
5398 // z[kdx+idx] = (jlong)yz_idx1;
5399 // carry = (jlong)(yz_idx1 >>> 64);
5400 // }
5401 //
5402
5403 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5404
5405 srliw(jdx, idx, 2);
5406
5407 bind(L_third_loop);
5408
5409 subw(jdx, jdx, 1);
5410 bltz(jdx, L_third_loop_exit);
5411 subw(idx, idx, 4);
5412
5413 shadd(t0, idx, y, t0, LogBytesPerInt);
5414 ld(yz_idx2, Address(t0, 0));
5415 ld(yz_idx1, Address(t0, wordSize));
5416
5417 shadd(tmp6, idx, z, t0, LogBytesPerInt);
5418
5419 ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
5420 ror(yz_idx2, yz_idx2, 32);
5421
5422 ld(t1, Address(tmp6, 0));
5423 ld(t0, Address(tmp6, wordSize));
5424
5425 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
5426 mulhu(tmp4, product_hi, yz_idx1);
5427
5428 ror(t0, t0, 32, tmp); // convert big-endian to little-endian
5429 ror(t1, t1, 32, tmp);
5430
5431 mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp
5432 mulhu(carry2, product_hi, yz_idx2);
5433
5434 cad(tmp3, tmp3, carry, carry);
5435 adc(tmp4, tmp4, zr, carry);
5436 cad(tmp3, tmp3, t0, t0);
5437 cadc(tmp4, tmp4, tmp, t0);
5438 adc(carry, carry2, zr, t0);
5439 cad(tmp4, tmp4, t1, carry2);
5440 adc(carry, carry, zr, carry2);
5441
5442 ror(tmp3, tmp3, 32); // convert little-endian to big-endian
5443 ror(tmp4, tmp4, 32);
5444 sd(tmp4, Address(tmp6, 0));
5445 sd(tmp3, Address(tmp6, wordSize));
5446
5447 j(L_third_loop);
5448
5449 bind(L_third_loop_exit);
5450
5451 andi(idx, idx, 0x3);
5452 beqz(idx, L_post_third_loop_done);
5453
5454 Label L_check_1;
5455 subiw(idx, idx, 2);
5456 bltz(idx, L_check_1);
5457
5458 shadd(t0, idx, y, t0, LogBytesPerInt);
5459 ld(yz_idx1, Address(t0, 0));
5460 ror(yz_idx1, yz_idx1, 32);
5461
5462 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
5463 mulhu(tmp4, product_hi, yz_idx1);
5464
5465 shadd(t0, idx, z, t0, LogBytesPerInt);
5466 ld(yz_idx2, Address(t0, 0));
5467 ror(yz_idx2, yz_idx2, 32, tmp);
5468
5469 add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
5470
5471 ror(tmp3, tmp3, 32, tmp);
5472 sd(tmp3, Address(t0, 0));
5473
5474 bind(L_check_1);
5475
5476 andi(idx, idx, 0x1);
5477 subiw(idx, idx, 1);
5478 bltz(idx, L_post_third_loop_done);
5479 shadd(t0, idx, y, t0, LogBytesPerInt);
5480 lwu(tmp4, Address(t0, 0));
5481 mul(tmp3, tmp4, product_hi); // tmp4 * product_hi -> carry2:tmp3
5482 mulhu(carry2, tmp4, product_hi);
5483
5484 shadd(t0, idx, z, t0, LogBytesPerInt);
5485 lwu(tmp4, Address(t0, 0));
5486
5487 add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
5488
5489 shadd(t0, idx, z, t0, LogBytesPerInt);
5490 sw(tmp3, Address(t0, 0));
5491
5492 slli(t0, carry2, 32);
5493 srli(carry, tmp3, 32);
5494 orr(carry, carry, t0);
5495
5496 bind(L_post_third_loop_done);
5497 }
5498
5499 /**
5500 * Code for BigInteger::multiplyToLen() intrinsic.
5501 *
5502 * x10: x
5503 * x11: xlen
5504 * x12: y
5505 * x13: ylen
5506 * x14: z
5507 * x15: tmp0
5508 * x16: tmp1
5509 * x17: tmp2
5510 * x7: tmp3
5511 * x28: tmp4
5512 * x29: tmp5
5513 * x30: tmp6
5514 * x31: tmp7
5515 */
5516 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
5517 Register z, Register tmp0,
5518 Register tmp1, Register tmp2, Register tmp3, Register tmp4,
5519 Register tmp5, Register tmp6, Register product_hi) {
5520 assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5521
5522 const Register idx = tmp1;
5523 const Register kdx = tmp2;
5524 const Register xstart = tmp3;
5525
5526 const Register y_idx = tmp4;
5527 const Register carry = tmp5;
5528 const Register product = xlen;
5529 const Register x_xstart = tmp0;
5530 const Register jdx = tmp1;
5531
5532 mv(idx, ylen); // idx = ylen;
5533 addw(kdx, xlen, ylen); // kdx = xlen+ylen;
5534 mv(carry, zr); // carry = 0;
5535
5536 Label L_done;
5537 subiw(xstart, xlen, 1);
5538 bltz(xstart, L_done);
5539
5540 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
5541
5542 Label L_second_loop_aligned;
5543 beqz(kdx, L_second_loop_aligned);
5544
5545 Label L_carry;
5546 subiw(kdx, kdx, 1);
5547 beqz(kdx, L_carry);
5548
5549 shadd(t0, kdx, z, t0, LogBytesPerInt);
5550 sw(carry, Address(t0, 0));
5551 srli(carry, carry, 32);
5552 subiw(kdx, kdx, 1);
5553
5554 bind(L_carry);
5555 shadd(t0, kdx, z, t0, LogBytesPerInt);
5556 sw(carry, Address(t0, 0));
5557
5558 // Second and third (nested) loops.
5559 //
5560 // for (int i = xstart-1; i >= 0; i--) { // Second loop
5561 // carry = 0;
5562 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5563 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5564 // (z[k] & LONG_MASK) + carry;
5565 // z[k] = (int)product;
5566 // carry = product >>> 32;
5567 // }
5568 // z[i] = (int)carry;
5569 // }
5570 //
5571 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
5572
5573 bind(L_second_loop_aligned);
5574 mv(carry, zr); // carry = 0;
5575 mv(jdx, ylen); // j = ystart+1
5576
5577 subiw(xstart, xstart, 1); // i = xstart-1;
5578 bltz(xstart, L_done);
5579
5580 subi(sp, sp, 4 * wordSize);
5581 sd(z, Address(sp, 0));
5582
5583 Label L_last_x;
5584 shadd(t0, xstart, z, t0, LogBytesPerInt);
5585 addi(z, t0, 4);
5586 subiw(xstart, xstart, 1); // i = xstart-1;
5587 bltz(xstart, L_last_x);
5588
5589 shadd(t0, xstart, x, t0, LogBytesPerInt);
5590 ld(product_hi, Address(t0, 0));
5591 ror(product_hi, product_hi, 32); // convert big-endian to little-endian
5592
5593 Label L_third_loop_prologue;
5594 bind(L_third_loop_prologue);
5595
5596 sd(ylen, Address(sp, wordSize));
5597 sd(x, Address(sp, 2 * wordSize));
5598 sd(xstart, Address(sp, 3 * wordSize));
5599 multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
5600 tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
5601 ld(z, Address(sp, 0));
5602 ld(ylen, Address(sp, wordSize));
5603 ld(x, Address(sp, 2 * wordSize));
5604 ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
5605 addi(sp, sp, 4 * wordSize);
5606
5607 addiw(tmp3, xlen, 1);
5608 shadd(t0, tmp3, z, t0, LogBytesPerInt);
5609 sw(carry, Address(t0, 0));
5610
5611 subiw(tmp3, tmp3, 1);
5612 bltz(tmp3, L_done);
5613
5614 srli(carry, carry, 32);
5615 shadd(t0, tmp3, z, t0, LogBytesPerInt);
5616 sw(carry, Address(t0, 0));
5617 j(L_second_loop_aligned);
5618
5619 // Next infrequent code is moved outside loops.
5620 bind(L_last_x);
5621 lwu(product_hi, Address(x, 0));
5622 j(L_third_loop_prologue);
5623
5624 bind(L_done);
5625 }
5626 #endif
5627
5628 // Count the bits of trailing zero chars from lsb to msb until the first non-zero
5629 // char is seen. For the LL case, shift by 8 bits at a time as there is only one byte
5630 // per char. For the other cases, shift by 16 bits at a time.
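// For example, in the LL case Rs = 0x0000000041000000 has three trailing zero
// bytes, so Rd = 24; in the UTF-16 case Rs = 0x0041000000000000 has three
// trailing zero 16-bit chars, so Rd = 48.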
5631 void MacroAssembler::ctzc_bits(Register Rd, Register Rs, bool isLL,
5632 Register tmp1, Register tmp2) {
5633 int step = isLL ? 8 : 16;
5634 if (UseZbb) {
5635 ctz(Rd, Rs);
5636 andi(Rd, Rd, -step);
5637 return;
5638 }
5639
5640 assert_different_registers(Rd, tmp1, tmp2);
5641 Label Loop;
5642 mv(tmp2, Rs);
5643 mv(Rd, -step);
5644
5645 bind(Loop);
5646 addi(Rd, Rd, step);
5647 zext(tmp1, tmp2, step);
5648 srli(tmp2, tmp2, step);
5649 beqz(tmp1, Loop);
5650 }
5651
5652 // This routine reads 4 adjacent bytes from the lower half of the source register
5653 // and inflates them into the destination register, for example:
5654 // Rs: A7A6A5A4A3A2A1A0
5655 // Rd: 00A300A200A100A0
5656 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5657 assert_different_registers(Rd, Rs, tmp1, tmp2);
5658
5659 mv(tmp1, 0xFF000000); // first byte mask at lower word
5660 andr(Rd, Rs, tmp1);
5661 for (int i = 0; i < 2; i++) {
5662 slli(Rd, Rd, wordSize);
5663 srli(tmp1, tmp1, wordSize);
5664 andr(tmp2, Rs, tmp1);
5665 orr(Rd, Rd, tmp2);
5666 }
5667 slli(Rd, Rd, wordSize);
5668 zext(tmp2, Rs, 8); // last byte mask at lower word
5669 orr(Rd, Rd, tmp2);
5670 }
5671
5672 // This routine reads 4 adjacent bytes from the upper half of the source register
5673 // and inflates them into the destination register, for example:
5674 // Rs: A7A6A5A4A3A2A1A0
5675 // Rd: 00A700A600A500A4
5676 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5677 assert_different_registers(Rd, Rs, tmp1, tmp2);
5678 srli(Rs, Rs, 32); // only upper 32 bits are needed
5679 inflate_lo32(Rd, Rs, tmp1, tmp2);
5680 }
5681
5682 // The size of the blocks erased by the zero_blocks stub. We must
5683 // handle anything smaller than this ourselves in zero_words().
5684 const int MacroAssembler::zero_words_block_size = 8;
5685
5686 // zero_words() is used by C2 ClearArray patterns. It is as small as
5687 // possible, handling small word counts locally and delegating
5688 // anything larger to the zero_blocks stub. It is expanded many times
5689 // in compiled code, so it is important to keep it short.
5690
5691 // ptr: Address of a buffer to be zeroed.
5692 // cnt: Count in HeapWords.
5693 //
5694 // ptr, cnt, t1, and t0 are clobbered.
5695 address MacroAssembler::zero_words(Register ptr, Register cnt) {
5696 assert(is_power_of_2(zero_words_block_size), "adjust this");
5697 assert(ptr == x28 && cnt == x29, "mismatch in register usage");
5698 assert_different_registers(cnt, t0, t1);
5699
5700 BLOCK_COMMENT("zero_words {");
5701
5702 mv(t0, zero_words_block_size);
5703 Label around, done, done16;
5704 bltu(cnt, t0, around);
5705 {
5706 RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
5707 assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
5708 if (StubRoutines::riscv::complete()) {
5709 address tpc = reloc_call(zero_blocks);
5710 if (tpc == nullptr) {
5711 DEBUG_ONLY(reset_labels(around));
5712 postcond(pc() == badAddress);
5713 return nullptr;
5714 }
5715 } else {
5716 // Clobbers t1
5717 rt_call(zero_blocks.target());
5718 }
5719 }
5720 bind(around);
5721 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5722 Label l;
5723 test_bit(t0, cnt, exact_log2(i));
5724 beqz(t0, l);
5725 for (int j = 0; j < i; j++) {
5726 sd(zr, Address(ptr, j * wordSize));
5727 }
5728 addi(ptr, ptr, i * wordSize);
5729 bind(l);
5730 }
5731 {
5732 Label l;
5733 test_bit(t0, cnt, 0);
5734 beqz(t0, l);
5735 sd(zr, Address(ptr, 0));
5736 bind(l);
5737 }
5738
5739 BLOCK_COMMENT("} zero_words");
5740 postcond(pc() != badAddress);
5741 return pc();
5742 }
5743
5744 #define SmallArraySize (18 * BytesPerLong)
5745
5746 // base: Address of a buffer to be zeroed, 8 bytes aligned.
5747 // cnt: Immediate count in HeapWords.
5748 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
5749 assert_different_registers(base, t0, t1);
5750
5751 BLOCK_COMMENT("zero_words {");
5752
5753 if (cnt <= SmallArraySize / BytesPerLong) {
5754 for (int i = 0; i < (int)cnt; i++) {
5755 sd(zr, Address(base, i * wordSize));
5756 }
5757 } else {
5758     const int unroll = 8; // Number of sd(zr, adr) instructions we'll unroll
5759 int remainder = cnt % unroll;
5760 for (int i = 0; i < remainder; i++) {
5761 sd(zr, Address(base, i * wordSize));
5762 }
5763
5764 Label loop;
5765 Register cnt_reg = t0;
5766 Register loop_base = t1;
5767 cnt = cnt - remainder;
5768 mv(cnt_reg, cnt);
5769 addi(loop_base, base, remainder * wordSize);
5770 bind(loop);
5771 sub(cnt_reg, cnt_reg, unroll);
5772 for (int i = 0; i < unroll; i++) {
5773 sd(zr, Address(loop_base, i * wordSize));
5774 }
5775 addi(loop_base, loop_base, unroll * wordSize);
5776 bnez(cnt_reg, loop);
5777 }
5778
5779 BLOCK_COMMENT("} zero_words");
5780 }
5781
5782 // base: Address of a buffer to be filled, 8 bytes aligned.
5783 // cnt: Count in 8-byte unit.
5784 // value: Value to be filled with.
5785 // base will point to the end of the buffer after filling.
5786 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
5787 // Algorithm:
5788 //
5789 // t0 = cnt & 7
5790 // cnt -= t0
5791 // p += t0
5792 // switch (t0):
5793 // switch start:
5794 // do while cnt
5795 // cnt -= 8
5796 // p[-8] = value
5797 // case 7:
5798 // p[-7] = value
5799 // case 6:
5800 // p[-6] = value
5801 // // ...
5802 // case 1:
5803 // p[-1] = value
5804 // case 0:
5805 // p += 8
5806 // do-while end
5807 // switch end
5808
5809 assert_different_registers(base, cnt, value, t0, t1);
5810
5811 Label fini, skip, entry, loop;
5812 const int unroll = 8; // Number of sd instructions we'll unroll
5813
5814 beqz(cnt, fini);
5815
5816 andi(t0, cnt, unroll - 1);
5817 sub(cnt, cnt, t0);
5818 shadd(base, t0, base, t1, 3);
5819 la(t1, entry);
5820 slli(t0, t0, 2);
5821 sub(t1, t1, t0);
5822 jr(t1);
5823
5824 bind(loop);
5825 addi(base, base, unroll * wordSize);
5826 {
5827 IncompressibleScope scope(this); // Fixed length
5828 for (int i = -unroll; i < 0; i++) {
5829 sd(value, Address(base, i * 8));
5830 }
5831 }
5832 bind(entry);
5833 subi(cnt, cnt, unroll);
5834 bgez(cnt, loop);
5835
5836 bind(fini);
5837 }
5838
5839 // Zero blocks of memory by using CBO.ZERO.
5840 //
5841 // Aligns the base address first sufficiently for CBO.ZERO, then uses
5842 // CBO.ZERO repeatedly for every full block. cnt is the size to be
5843 // zeroed in HeapWords. Returns the count of words left to be zeroed
5844 // in cnt.
5845 //
5846 // NOTE: This is intended to be used in the zero_blocks() stub. If
5847 // you want to use it elsewhere, note that cnt must be >= zicboz_block_size.
5848 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
5849 int zicboz_block_size = VM_Version::zicboz_block_size.value();
5850 Label initial_table_end, loop;
5851
5852 // Align base with cache line size.
5853 neg(tmp1, base);
5854 andi(tmp1, tmp1, zicboz_block_size - 1);
5855
5856 // tmp1: the number of bytes to be filled to align the base with cache line size.
5857 add(base, base, tmp1);
5858 srai(tmp2, tmp1, 3);
5859 sub(cnt, cnt, tmp2);
5860 srli(tmp2, tmp1, 1);
5861 la(tmp1, initial_table_end);
5862 sub(tmp2, tmp1, tmp2);
5863 jr(tmp2);
5864 for (int i = -zicboz_block_size + wordSize; i < 0; i += wordSize) {
5865 sd(zr, Address(base, i));
5866 }
5867 bind(initial_table_end);
5868
5869 mv(tmp1, zicboz_block_size / wordSize);
5870 bind(loop);
5871 cbo_zero(base);
5872 sub(cnt, cnt, tmp1);
5873 addi(base, base, zicboz_block_size);
5874 bge(cnt, tmp1, loop);
5875 }
5876
5877 // java.lang.Math.round(float a)
5878 // Returns the closest int to the argument, with ties rounding to positive infinity.
5879 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
5880   // This instruction sequence provides a performance improvement on all tested devices;
5881   // don't change it without re-verification.
5882 Label done;
5883 mv(t0, jint_cast(0.5f));
5884 fmv_w_x(ftmp, t0);
5885
5886 // dst = 0 if NaN
5887 feq_s(t0, src, src); // replacing fclass with feq as performance optimization
5888 mv(dst, zr);
5889 beqz(t0, done);
5890
5891 // dst = (src + 0.5f) rounded down towards negative infinity
5892 // Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
5893 // RDN is required for fadd_s, RNE gives incorrect results:
5894 // --------------------------------------------------------------------
5895 // fadd.s rne (src + 0.5f): src = 8388609.000000 ftmp = 8388610.000000
5896 // fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
5897 // --------------------------------------------------------------------
5898 // fadd.s rdn (src + 0.5f): src = 8388609.000000 ftmp = 8388609.000000
5899 // fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
5900 // --------------------------------------------------------------------
5901 fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
5902 fcvt_w_s(dst, ftmp, RoundingMode::rdn);
5903
5904 bind(done);
5905 }
5906
5907 // java.lang.Math.round(double a)
5908 // Returns the closest long to the argument, with ties rounding to positive infinity.
5909 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
5910   // This instruction sequence provides a performance improvement on all tested devices;
5911   // don't change it without re-verification.
5912 Label done;
5913 mv(t0, julong_cast(0.5));
5914 fmv_d_x(ftmp, t0);
5915
5916 // dst = 0 if NaN
5917 feq_d(t0, src, src); // replacing fclass with feq as performance optimization
5918 mv(dst, zr);
5919 beqz(t0, done);
5920
5921 // dst = (src + 0.5) rounded down towards negative infinity
5922 fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
5923 fcvt_l_d(dst, ftmp, RoundingMode::rdn);
5924
5925 bind(done);
5926 }
5927
5928 // Helper routine processing the slow path of NaN when converting float to float16
5929 void MacroAssembler::float_to_float16_NaN(Register dst, FloatRegister src,
5930 Register tmp1, Register tmp2) {
5931 fmv_x_w(dst, src);
5932
5933 // Float (32 bits)
5934 // Bit: 31 30 to 23 22 to 0
5935 // +---+------------------+-----------------------------+
5936 // | S | Exponent | Mantissa (Fraction) |
5937 // +---+------------------+-----------------------------+
5938 // 1 bit 8 bits 23 bits
5939 //
5940 // Float (16 bits)
5941 // Bit: 15 14 to 10 9 to 0
5942 // +---+----------------+------------------+
5943 // | S | Exponent | Mantissa |
5944 // +---+----------------+------------------+
5945 // 1 bit 5 bits 10 bits
5946 const int fp_sign_bits = 1;
5947 const int fp32_bits = 32;
5948 const int fp32_exponent_bits = 8;
5949 const int fp32_mantissa_1st_part_bits = 10;
5950 const int fp32_mantissa_2nd_part_bits = 9;
5951 const int fp32_mantissa_3rd_part_bits = 4;
5952 const int fp16_exponent_bits = 5;
5953 const int fp16_mantissa_bits = 10;
5954
5955 // preserve the sign bit and exponent, clear mantissa.
5956 srai(tmp2, dst, fp32_bits - fp_sign_bits - fp16_exponent_bits);
5957 slli(tmp2, tmp2, fp16_mantissa_bits);
5958
5959   // Preserve the high-order bit of the float NaN in the
5960   // binary16 result NaN (tenth bit); OR the remaining
5961   // bits into the lower 9 bits of the binary16 significand.
5962 // | (doppel & 0x007f_e000) >> 13 // 10 bits
5963 // | (doppel & 0x0000_1ff0) >> 4 // 9 bits
5964 // | (doppel & 0x0000_000f)); // 4 bits
5965 //
5966 // Check j.l.Float.floatToFloat16 for more information.
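  // The extra 32 in left_shift below accounts for the float bits occupying only
  // the low 32 bits of the 64-bit register: e.g. for the first part,
  // left_shift = 1 + 8 + 32 = 41 and right_shift = 41 + 9 + 4 = 54, so the
  // slli/srli pair extracts float bits [22:13], i.e. (bits & 0x007f_e000) >> 13.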
5967 // 10 bits
5968 int left_shift = fp_sign_bits + fp32_exponent_bits + 32;
5969 int right_shift = left_shift + fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits;
5970 slli(tmp1, dst, left_shift);
5971 srli(tmp1, tmp1, right_shift);
5972 orr(tmp2, tmp2, tmp1);
5973 // 9 bits
5974 left_shift += fp32_mantissa_1st_part_bits;
5975 right_shift = left_shift + fp32_mantissa_3rd_part_bits;
5976 slli(tmp1, dst, left_shift);
5977 srli(tmp1, tmp1, right_shift);
5978 orr(tmp2, tmp2, tmp1);
5979 // 4 bits
5980 andi(tmp1, dst, 0xf);
5981 orr(dst, tmp2, tmp1);
5982 }
5983
5984 #define FCVT_SAFE(FLOATCVT, FLOATSIG) \
5985 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) { \
5986 Label done; \
5987 assert_different_registers(dst, tmp); \
5988 fclass_##FLOATSIG(tmp, src); \
5989 mv(dst, zr); \
5990 /* check if src is NaN */ \
5991 andi(tmp, tmp, FClassBits::nan); \
5992 bnez(tmp, done); \
5993 FLOATCVT(dst, src); \
5994 bind(done); \
5995 }
5996
5997 FCVT_SAFE(fcvt_w_s, s);
5998 FCVT_SAFE(fcvt_l_s, s);
5999 FCVT_SAFE(fcvt_w_d, d);
6000 FCVT_SAFE(fcvt_l_d, d);
6001
6002 #undef FCVT_SAFE
6003
6004 #define FCMP(FLOATTYPE, FLOATSIG) \
6005 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1, \
6006 FloatRegister Rs2, int unordered_result) { \
6007 Label Ldone; \
6008 if (unordered_result < 0) { \
6009 /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */ \
6010 /* installs 1 if gt else 0 */ \
6011 flt_##FLOATSIG(result, Rs2, Rs1); \
6012 /* Rs1 > Rs2, install 1 */ \
6013 bgtz(result, Ldone); \
6014 feq_##FLOATSIG(result, Rs1, Rs2); \
6015 subi(result, result, 1); \
6016 /* Rs1 = Rs2, install 0 */ \
6017 /* NaN or Rs1 < Rs2, install -1 */ \
6018 bind(Ldone); \
6019 } else { \
6020 /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */ \
6021 /* installs 1 if gt or unordered else 0 */ \
6022 flt_##FLOATSIG(result, Rs1, Rs2); \
6023 /* Rs1 < Rs2, install -1 */ \
6024 bgtz(result, Ldone); \
6025 feq_##FLOATSIG(result, Rs1, Rs2); \
6026 subi(result, result, 1); \
6027 /* Rs1 = Rs2, install 0 */ \
6028 /* NaN or Rs1 > Rs2, install 1 */ \
6029 bind(Ldone); \
6030 neg(result, result); \
6031 } \
6032 }
6033
6034 FCMP(float, s);
6035 FCMP(double, d);
6036
6037 #undef FCMP
6038
6039 // Zero words; len is in bytes
6040 // Destroys all registers except addr
6041 // len must be a nonzero multiple of wordSize
6042 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
6043 assert_different_registers(addr, len, tmp, t0, t1);
6044
6045 #ifdef ASSERT
6046 {
6047 Label L;
6048 andi(t0, len, BytesPerWord - 1);
6049 beqz(t0, L);
6050 stop("len is not a multiple of BytesPerWord");
6051 bind(L);
6052 }
6053 #endif // ASSERT
6054
6055 #ifndef PRODUCT
6056 block_comment("zero memory");
6057 #endif // PRODUCT
6058
6059 Label loop;
6060 Label entry;
6061
6062 // Algorithm:
6063 //
6064 // t0 = cnt & 7
6065 // cnt -= t0
6066 // p += t0
6067 // switch (t0) {
6068 // do {
6069 // cnt -= 8
6070 // p[-8] = 0
6071 // case 7:
6072 // p[-7] = 0
6073 // case 6:
6074 // p[-6] = 0
6075 // ...
6076 // case 1:
6077 // p[-1] = 0
6078 // case 0:
6079 // p += 8
6080 // } while (cnt)
6081 // }
6082
6083 const int unroll = 8; // Number of sd(zr) instructions we'll unroll
6084
6085 srli(len, len, LogBytesPerWord);
6086 andi(t0, len, unroll - 1); // t0 = cnt % unroll
6087 sub(len, len, t0); // cnt -= unroll
6088 // tmp always points to the end of the region we're about to zero
6089 shadd(tmp, t0, addr, t1, LogBytesPerWord);
6090 la(t1, entry);
6091 slli(t0, t0, 2);
6092 sub(t1, t1, t0);
6093 jr(t1);
6094
6095 bind(loop);
6096 sub(len, len, unroll);
6097 {
6098 IncompressibleScope scope(this); // Fixed length
6099 for (int i = -unroll; i < 0; i++) {
6100 sd(zr, Address(tmp, i * wordSize));
6101 }
6102 }
6103 bind(entry);
6104 add(tmp, tmp, unroll * wordSize);
6105 bnez(len, loop);
6106 }
6107
6108 // shift left by shamt and add
6109 // Rd = (Rs1 << shamt) + Rs2
6110 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
6111 if (UseZba) {
6112 if (shamt == 1) {
6113 sh1add(Rd, Rs1, Rs2);
6114 return;
6115 } else if (shamt == 2) {
6116 sh2add(Rd, Rs1, Rs2);
6117 return;
6118 } else if (shamt == 3) {
6119 sh3add(Rd, Rs1, Rs2);
6120 return;
6121 }
6122 }
6123
6124 if (shamt != 0) {
6125 assert_different_registers(Rs2, tmp);
6126 slli(tmp, Rs1, shamt);
6127 add(Rd, Rs2, tmp);
6128 } else {
6129 add(Rd, Rs1, Rs2);
6130 }
6131 }
6132
6133 void MacroAssembler::zext(Register dst, Register src, int bits) {
6134 switch (bits) {
6135 case 32:
6136 if (UseZba) {
6137 zext_w(dst, src);
6138 return;
6139 }
6140 break;
6141 case 16:
6142 if (UseZbb) {
6143 zext_h(dst, src);
6144 return;
6145 }
6146 break;
6147 case 8:
6148 zext_b(dst, src);
6149 return;
6150 default:
6151 break;
6152 }
6153
6154 slli(dst, src, XLEN - bits);
6155 srli(dst, dst, XLEN - bits);
6156 }
6157
6158 void MacroAssembler::sext(Register dst, Register src, int bits) {
6159 switch (bits) {
6160 case 32:
6161 sext_w(dst, src);
6162 return;
6163 case 16:
6164 if (UseZbb) {
6165 sext_h(dst, src);
6166 return;
6167 }
6168 break;
6169 case 8:
6170 if (UseZbb) {
6171 sext_b(dst, src);
6172 return;
6173 }
6174 break;
6175 default:
6176 break;
6177 }
6178
6179 slli(dst, src, XLEN - bits);
6180 srai(dst, dst, XLEN - bits);
6181 }
6182
6183 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
6184 Register tmp, bool is_signed) {
6185 if (src1 == src2) {
6186 mv(dst, zr);
6187 return;
6188 }
6189 Label done;
6190 Register left = src1;
6191 Register right = src2;
6192 if (dst == src1) {
6193 assert_different_registers(dst, src2, tmp);
6194 mv(tmp, src1);
6195 left = tmp;
6196 } else if (dst == src2) {
6197 assert_different_registers(dst, src1, tmp);
6198 mv(tmp, src2);
6199 right = tmp;
6200 }
6201
6202 // installs 1 if gt else 0
6203 if (is_signed) {
6204 slt(dst, right, left);
6205 } else {
6206 sltu(dst, right, left);
6207 }
6208 bnez(dst, done);
6209 if (is_signed) {
6210 slt(dst, left, right);
6211 } else {
6212 sltu(dst, left, right);
6213 }
6214   // dst = -1 if lt; else if eq, dst = 0
6215 neg(dst, dst);
6216 bind(done);
6217 }
6218
6219 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
6220 {
6221 cmp_x2i(dst, src1, src2, tmp);
6222 }
6223
6224 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
6225 cmp_x2i(dst, src1, src2, tmp, false);
6226 }
6227
6228 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
6229 cmp_x2i(dst, src1, src2, tmp, false);
6230 }
6231
6232 // The java_calling_convention describes stack locations as ideal slots on
6233 // a frame with no ABI restrictions. Since we must observe ABI restrictions
6234 // (like the placement of the register window), the slots must be biased by
6235 // the following value.
6236 static int reg2offset_in(VMReg r) {
6237 // Account for saved fp and ra
6238 // This should really be in_preserve_stack_slots
6239 return r->reg2stack() * VMRegImpl::stack_slot_size;
6240 }
6241
6242 static int reg2offset_out(VMReg r) {
6243 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
6244 }
6245
6246 // The C ABI specifies:
6247 // "integer scalars narrower than XLEN bits are widened according to the sign
6248 // of their type up to 32 bits, then sign-extended to XLEN bits."
6249 // Applies for both passed in register and stack.
6250 //
6251 // Java uses 32-bit stack slots; jint, jshort, jchar and jbyte each use one slot.
6252 // Native uses 64-bit stack slots for all integer scalar types.
6253 //
6254 // lw loads the Java stack slot and sign-extends it, and
6255 // sd stores this widened integer into a 64-bit native stack slot.
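// For example, a jint value of -1 arrives in a 32-bit Java slot as 0xFFFFFFFF;
// after the lw/sd pair (or the sext for the register-to-register case) the
// native side sees the sign-extended 64-bit value 0xFFFFFFFFFFFFFFFF.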
6256 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
6257 if (src.first()->is_stack()) {
6258 if (dst.first()->is_stack()) {
6259 // stack to stack
6260 lw(tmp, Address(fp, reg2offset_in(src.first())));
6261 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6262 } else {
6263 // stack to reg
6264 lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6265 }
6266 } else if (dst.first()->is_stack()) {
6267 // reg to stack
6268 sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6269 } else {
6270 if (dst.first() != src.first()) {
6271 sext(dst.first()->as_Register(), src.first()->as_Register(), 32);
6272 }
6273 }
6274 }
6275
6276 // An oop arg. Must pass a handle, not the oop itself.
6277 void MacroAssembler::object_move(OopMap* map,
6278 int oop_handle_offset,
6279 int framesize_in_slots,
6280 VMRegPair src,
6281 VMRegPair dst,
6282 bool is_receiver,
6283 int* receiver_offset) {
6284 assert_cond(map != nullptr && receiver_offset != nullptr);
6285
6286   // Must pass a handle. First figure out the location we use as a handle.
6287 Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
6288
6289   // See if the oop is null; if it is, we need no handle.
6290
6291 if (src.first()->is_stack()) {
6292 // Oop is already on the stack as an argument
6293 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
6294 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
6295 if (is_receiver) {
6296 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
6297 }
6298
6299 ld(t0, Address(fp, reg2offset_in(src.first())));
6300 la(rHandle, Address(fp, reg2offset_in(src.first())));
6301 // conditionally move a null
6302 Label notZero1;
6303 bnez(t0, notZero1);
6304 mv(rHandle, zr);
6305 bind(notZero1);
6306 } else {
6307
6308     // The oop is in a register; we must store it to the space we reserve
6309     // on the stack for oop handles and pass a handle if the oop is non-null.
6310
6311 const Register rOop = src.first()->as_Register();
6312 int oop_slot = -1;
6313 if (rOop == j_rarg0) {
6314 oop_slot = 0;
6315 } else if (rOop == j_rarg1) {
6316 oop_slot = 1;
6317 } else if (rOop == j_rarg2) {
6318 oop_slot = 2;
6319 } else if (rOop == j_rarg3) {
6320 oop_slot = 3;
6321 } else if (rOop == j_rarg4) {
6322 oop_slot = 4;
6323 } else if (rOop == j_rarg5) {
6324 oop_slot = 5;
6325 } else if (rOop == j_rarg6) {
6326 oop_slot = 6;
6327 } else {
6328 assert(rOop == j_rarg7, "wrong register");
6329 oop_slot = 7;
6330 }
6331
6332 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
6333 int offset = oop_slot * VMRegImpl::stack_slot_size;
6334
6335 map->set_oop(VMRegImpl::stack2reg(oop_slot));
6336 // Store oop in handle area, may be null
6337 sd(rOop, Address(sp, offset));
6338 if (is_receiver) {
6339 *receiver_offset = offset;
6340 }
6341
6342     // rOop may be the same as rHandle.
6343 if (rOop == rHandle) {
6344 Label isZero;
6345 beqz(rOop, isZero);
6346 la(rHandle, Address(sp, offset));
6347 bind(isZero);
6348 } else {
6349 Label notZero2;
6350 la(rHandle, Address(sp, offset));
6351 bnez(rOop, notZero2);
6352 mv(rHandle, zr);
6353 bind(notZero2);
6354 }
6355 }
6356
6357   // If the arg is on the stack then place it; otherwise it is already in the correct register.
6358 if (dst.first()->is_stack()) {
6359 sd(rHandle, Address(sp, reg2offset_out(dst.first())));
6360 }
6361 }
6362
6363 // A float arg may have to do a float-reg to int-reg conversion.
6364 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
6365 assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6366 (src.first()->is_reg() && dst.first()->is_reg()) ||
6367 (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6368 if (src.first()->is_stack()) {
6369 if (dst.first()->is_stack()) {
6370 lwu(tmp, Address(fp, reg2offset_in(src.first())));
6371 sw(tmp, Address(sp, reg2offset_out(dst.first())));
6372 } else if (dst.first()->is_Register()) {
6373 lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6374 } else {
6375 ShouldNotReachHere();
6376 }
6377 } else if (src.first() != dst.first()) {
6378 if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6379 fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6380 } else {
6381 ShouldNotReachHere();
6382 }
6383 }
6384 }
6385
6386 // A long move
6387 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
6388 if (src.first()->is_stack()) {
6389 if (dst.first()->is_stack()) {
6390 // stack to stack
6391 ld(tmp, Address(fp, reg2offset_in(src.first())));
6392 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6393 } else {
6394 // stack to reg
6395 ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6396 }
6397 } else if (dst.first()->is_stack()) {
6398 // reg to stack
6399 sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6400 } else {
6401 if (dst.first() != src.first()) {
6402 mv(dst.first()->as_Register(), src.first()->as_Register());
6403 }
6404 }
6405 }
6406
6407 // A double move
6408 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
6409 assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6410 (src.first()->is_reg() && dst.first()->is_reg()) ||
6411 (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6412 if (src.first()->is_stack()) {
6413 if (dst.first()->is_stack()) {
6414 ld(tmp, Address(fp, reg2offset_in(src.first())));
6415 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6416     } else if (dst.first()->is_Register()) {
6417 ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6418 } else {
6419 ShouldNotReachHere();
6420 }
6421 } else if (src.first() != dst.first()) {
6422 if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6423 fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6424 } else {
6425 ShouldNotReachHere();
6426 }
6427 }
6428 }
6429
6430 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
6431 assert(bit_pos < 64, "invalid bit range");
6432 if (UseZbs) {
6433 bexti(Rd, Rs, bit_pos);
6434 return;
6435 }
6436 int64_t imm = (int64_t)(1UL << bit_pos);
6437 if (is_simm12(imm)) {
6438 andi(Rd, Rs, imm);
6439 } else {
6440 srli(Rd, Rs, bit_pos);
6441 andi(Rd, Rd, 1);
6442 }
6443 }
6444
6445 // Implements lightweight-locking.
6446 //
6447 // - obj: the object to be locked
6448 // - tmp1, tmp2, tmp3: temporary registers, will be destroyed
6449 // - slow: branched to if locking fails
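// The low two mark-word bits encode the lock state: 0b01 unlocked, 0b00
// fast-locked, 0b10 an ObjectMonitor is installed. The fast path below handles
// recursion via the lock-stack and otherwise only transitions 0b01 => 0b00;
// anything else is sent to the slow path.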
6450 void MacroAssembler::lightweight_lock(Register basic_lock, Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6451 assert_different_registers(basic_lock, obj, tmp1, tmp2, tmp3, t0);
6452
6453 Label push;
6454 const Register top = tmp1;
6455 const Register mark = tmp2;
6456 const Register t = tmp3;
6457
6458 // Preload the markWord. It is important that this is the first
6459 // instruction emitted as it is part of C1's null check semantics.
6460 ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6461
6462 if (UseObjectMonitorTable) {
6463 // Clear cache in case fast locking succeeds or we need to take the slow-path.
6464 sd(zr, Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))));
6465 }
6466
6467 if (DiagnoseSyncOnValueBasedClasses != 0) {
6468 load_klass(tmp1, obj);
6469 lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
6470 test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
6471 bnez(tmp1, slow, /* is_far */ true);
6472 }
6473
6474 // Check if the lock-stack is full.
6475 lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6476 mv(t, (unsigned)LockStack::end_offset());
6477 bge(top, t, slow, /* is_far */ true);
6478
6479 // Check for recursion.
6480 add(t, xthread, top);
6481 ld(t, Address(t, -oopSize));
6482 beq(obj, t, push);
6483
6484 // Check header for monitor (0b10).
6485 test_bit(t, mark, exact_log2(markWord::monitor_value));
6486 bnez(t, slow, /* is_far */ true);
6487
6488 // Try to lock. Transition lock-bits 0b01 => 0b00
6489 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
6490 ori(mark, mark, markWord::unlocked_value);
6491 xori(t, mark, markWord::unlocked_value);
6492 cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6493 /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
6494 bne(mark, t, slow, /* is_far */ true);
6495
6496 bind(push);
6497 // After successful lock, push object on lock-stack.
6498 add(t, xthread, top);
6499 sd(obj, Address(t));
6500 addiw(top, top, oopSize);
6501 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6502 }
6503
6504 // Implements lightweight-unlocking.
6505 //
6506 // - obj: the object to be unlocked
6507 // - tmp1, tmp2, tmp3: temporary registers
6508 // - slow: branched to if unlocking fails
6509 void MacroAssembler::lightweight_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6510 assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
6511
6512 #ifdef ASSERT
6513 {
6514 // Check for lock-stack underflow.
6515 Label stack_ok;
6516 lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
6517 mv(tmp2, (unsigned)LockStack::start_offset());
6518 bge(tmp1, tmp2, stack_ok);
6519 STOP("Lock-stack underflow");
6520 bind(stack_ok);
6521 }
6522 #endif
6523
6524 Label unlocked, push_and_slow;
6525 const Register top = tmp1;
6526 const Register mark = tmp2;
6527 const Register t = tmp3;
6528
6529 // Check if obj is top of lock-stack.
6530 lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6531 subiw(top, top, oopSize);
6532 add(t, xthread, top);
6533 ld(t, Address(t));
6534 bne(obj, t, slow, /* is_far */ true);
6535
6536 // Pop lock-stack.
6537 DEBUG_ONLY(add(t, xthread, top);)
6538 DEBUG_ONLY(sd(zr, Address(t));)
6539 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6540
6541 // Check if recursive.
6542 add(t, xthread, top);
6543 ld(t, Address(t, -oopSize));
6544 beq(obj, t, unlocked);
6545
6546 // Not recursive. Check header for monitor (0b10).
6547 ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6548 test_bit(t, mark, exact_log2(markWord::monitor_value));
6549 bnez(t, push_and_slow);
6550
6551 #ifdef ASSERT
6552 // Check header not unlocked (0b01).
6553 Label not_unlocked;
6554 test_bit(t, mark, exact_log2(markWord::unlocked_value));
6555 beqz(t, not_unlocked);
6556 stop("lightweight_unlock already unlocked");
6557 bind(not_unlocked);
6558 #endif
6559
6560 // Try to unlock. Transition lock bits 0b00 => 0b01
6561 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
6562 ori(t, mark, markWord::unlocked_value);
6563 cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6564 /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
6565 beq(mark, t, unlocked);
6566
6567 bind(push_and_slow);
6568 // Restore lock-stack and handle the unlock in runtime.
6569 DEBUG_ONLY(add(t, xthread, top);)
6570 DEBUG_ONLY(sd(obj, Address(t));)
6571 addiw(top, top, oopSize);
6572 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6573 j(slow);
6574
6575 bind(unlocked);
6576 }