1 /*
2 * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
4 * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 *
7 * This code is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 only, as
9 * published by the Free Software Foundation.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 *
25 */
26
27 #include "asm/assembler.hpp"
28 #include "asm/assembler.inline.hpp"
29 #include "code/compiledIC.hpp"
30 #include "compiler/disassembler.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/cardTable.hpp"
34 #include "gc/shared/cardTableBarrierSet.hpp"
35 #include "gc/shared/collectedHeap.hpp"
36 #include "interpreter/bytecodeHistogram.hpp"
37 #include "interpreter/interpreter.hpp"
38 #include "interpreter/interpreterRuntime.hpp"
39 #include "memory/resourceArea.hpp"
40 #include "memory/universe.hpp"
41 #include "oops/accessDecorators.hpp"
42 #include "oops/compressedKlass.inline.hpp"
43 #include "oops/compressedOops.inline.hpp"
44 #include "oops/klass.inline.hpp"
45 #include "oops/oop.hpp"
46 #include "runtime/interfaceSupport.inline.hpp"
47 #include "runtime/javaThread.hpp"
48 #include "runtime/jniHandles.inline.hpp"
49 #include "runtime/sharedRuntime.hpp"
50 #include "runtime/stubRoutines.hpp"
51 #include "utilities/globalDefinitions.hpp"
52 #include "utilities/powerOfTwo.hpp"
53 #ifdef COMPILER2
54 #include "opto/compile.hpp"
55 #include "opto/node.hpp"
56 #include "opto/output.hpp"
57 #endif
58
59 #ifdef PRODUCT
60 #define BLOCK_COMMENT(str) /* nothing */
61 #else
62 #define BLOCK_COMMENT(str) block_comment(str)
63 #endif
64 #define STOP(str) stop(str);
65 #define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
66
67
68
69 Register MacroAssembler::extract_rs1(address instr) {
70 assert_cond(instr != nullptr);
71 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
72 }
73
74 Register MacroAssembler::extract_rs2(address instr) {
75 assert_cond(instr != nullptr);
76 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
77 }
78
79 Register MacroAssembler::extract_rd(address instr) {
80 assert_cond(instr != nullptr);
81 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
82 }
83
84 uint32_t MacroAssembler::extract_opcode(address instr) {
85 assert_cond(instr != nullptr);
86 return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
87 }
88
89 uint32_t MacroAssembler::extract_funct3(address instr) {
90 assert_cond(instr != nullptr);
91 return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
92 }
93
94 bool MacroAssembler::is_pc_relative_at(address instr) {
95 // auipc + jalr
96 // auipc + addi
97 // auipc + load
  // auipc + float load
99 return (is_auipc_at(instr)) &&
100 (is_addi_at(instr + MacroAssembler::instruction_size) ||
101 is_jalr_at(instr + MacroAssembler::instruction_size) ||
102 is_load_at(instr + MacroAssembler::instruction_size) ||
103 is_float_load_at(instr + MacroAssembler::instruction_size)) &&
104 check_pc_relative_data_dependency(instr);
105 }
106
// i.e. ld(Rd, Label)
108 bool MacroAssembler::is_load_pc_relative_at(address instr) {
109 return is_auipc_at(instr) && // auipc
110 is_ld_at(instr + MacroAssembler::instruction_size) && // ld
111 check_load_pc_relative_data_dependency(instr);
112 }
113
114 bool MacroAssembler::is_movptr1_at(address instr) {
115 return is_lui_at(instr) && // Lui
116 is_addi_at(instr + MacroAssembler::instruction_size) && // Addi
117 is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 11) && // Slli Rd, Rs, 11
118 is_addi_at(instr + MacroAssembler::instruction_size * 3) && // Addi
119 is_slli_shift_at(instr + MacroAssembler::instruction_size * 4, 6) && // Slli Rd, Rs, 6
120 (is_addi_at(instr + MacroAssembler::instruction_size * 5) ||
121 is_jalr_at(instr + MacroAssembler::instruction_size * 5) ||
122 is_load_at(instr + MacroAssembler::instruction_size * 5)) && // Addi/Jalr/Load
123 check_movptr1_data_dependency(instr);
124 }
125
126 bool MacroAssembler::is_movptr2_at(address instr) {
127 return is_lui_at(instr) && // lui
128 is_lui_at(instr + MacroAssembler::instruction_size) && // lui
129 is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 18) && // slli Rd, Rs, 18
130 is_add_at(instr + MacroAssembler::instruction_size * 3) &&
131 (is_addi_at(instr + MacroAssembler::instruction_size * 4) ||
132 is_jalr_at(instr + MacroAssembler::instruction_size * 4) ||
133 is_load_at(instr + MacroAssembler::instruction_size * 4)) && // Addi/Jalr/Load
134 check_movptr2_data_dependency(instr);
135 }
136
137 bool MacroAssembler::is_li16u_at(address instr) {
138 return is_lui_at(instr) && // lui
139 is_srli_at(instr + MacroAssembler::instruction_size) && // srli
140 check_li16u_data_dependency(instr);
141 }
142
143 bool MacroAssembler::is_li32_at(address instr) {
144 return is_lui_at(instr) && // lui
145 is_addiw_at(instr + MacroAssembler::instruction_size) && // addiw
146 check_li32_data_dependency(instr);
147 }
148
149 bool MacroAssembler::is_lwu_to_zr(address instr) {
150 assert_cond(instr != nullptr);
151 return (extract_opcode(instr) == 0b0000011 &&
152 extract_funct3(instr) == 0b110 &&
153 extract_rd(instr) == zr); // zr
154 }
155
156 uint32_t MacroAssembler::get_membar_kind(address addr) {
157 assert_cond(addr != nullptr);
158 assert(is_membar(addr), "no membar found");
159
160 uint32_t insn = Bytes::get_native_u4(addr);
161
162 uint32_t predecessor = Assembler::extract(insn, 27, 24);
163 uint32_t successor = Assembler::extract(insn, 23, 20);
164
165 return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
166 }
167
168 void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
169 assert_cond(addr != nullptr);
170 assert(is_membar(addr), "no membar found");
171
172 uint32_t predecessor = 0;
173 uint32_t successor = 0;
174
175 MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);
176
177 uint32_t insn = Bytes::get_native_u4(addr);
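  // FENCE instruction encoding: the predecessor set lives in bits [27:24] and
  // the successor set in bits [23:20]; patch both fields in place.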
178 address pInsn = (address) &insn;
179 Assembler::patch(pInsn, 27, 24, predecessor);
180 Assembler::patch(pInsn, 23, 20, successor);
181
182 address membar = addr;
183 Assembler::sd_instr(membar, insn);
184 }
185
186 static void pass_arg0(MacroAssembler* masm, Register arg) {
187 if (c_rarg0 != arg) {
188 masm->mv(c_rarg0, arg);
189 }
190 }
191
192 static void pass_arg1(MacroAssembler* masm, Register arg) {
193 if (c_rarg1 != arg) {
194 masm->mv(c_rarg1, arg);
195 }
196 }
197
198 static void pass_arg2(MacroAssembler* masm, Register arg) {
199 if (c_rarg2 != arg) {
200 masm->mv(c_rarg2, arg);
201 }
202 }
203
204 static void pass_arg3(MacroAssembler* masm, Register arg) {
205 if (c_rarg3 != arg) {
206 masm->mv(c_rarg3, arg);
207 }
208 }
209
210 void MacroAssembler::push_cont_fastpath(Register java_thread) {
211 if (!Continuations::enabled()) return;
212 Label done;
213 ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
214 bleu(sp, t0, done);
215 sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
216 bind(done);
217 }
218
219 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
220 if (!Continuations::enabled()) return;
221 Label done;
222 ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
223 bltu(sp, t0, done);
224 sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
225 bind(done);
226 }
227
228 int MacroAssembler::align(int modulus, int extra_offset) {
229 CompressibleScope scope(this);
230 intptr_t before = offset();
231 while ((offset() + extra_offset) % modulus != 0) { nop(); }
232 return (int)(offset() - before);
233 }
234
235 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
236 call_VM_base(oop_result, noreg, noreg, nullptr, entry_point, number_of_arguments, check_exceptions);
237 }
238
239 // Implementation of call_VM versions
240
241 void MacroAssembler::call_VM(Register oop_result,
242 address entry_point,
243 bool check_exceptions) {
244 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
245 }
246
247 void MacroAssembler::call_VM(Register oop_result,
248 address entry_point,
249 Register arg_1,
250 bool check_exceptions) {
251 pass_arg1(this, arg_1);
252 call_VM_helper(oop_result, entry_point, 1, check_exceptions);
253 }
254
255 void MacroAssembler::call_VM(Register oop_result,
256 address entry_point,
257 Register arg_1,
258 Register arg_2,
259 bool check_exceptions) {
260 assert_different_registers(arg_1, c_rarg2);
261 pass_arg2(this, arg_2);
262 pass_arg1(this, arg_1);
263 call_VM_helper(oop_result, entry_point, 2, check_exceptions);
264 }
265
266 void MacroAssembler::call_VM(Register oop_result,
267 address entry_point,
268 Register arg_1,
269 Register arg_2,
270 Register arg_3,
271 bool check_exceptions) {
272 assert_different_registers(arg_1, c_rarg2, c_rarg3);
273 assert_different_registers(arg_2, c_rarg3);
274 pass_arg3(this, arg_3);
275
276 pass_arg2(this, arg_2);
277
278 pass_arg1(this, arg_1);
279 call_VM_helper(oop_result, entry_point, 3, check_exceptions);
280 }
281
282 void MacroAssembler::call_VM(Register oop_result,
283 Register last_java_sp,
284 address entry_point,
285 int number_of_arguments,
286 bool check_exceptions) {
287 call_VM_base(oop_result, xthread, last_java_sp, nullptr, entry_point, number_of_arguments, check_exceptions);
288 }
289
290 void MacroAssembler::call_VM(Register oop_result,
291 Register last_java_sp,
292 address entry_point,
293 Register arg_1,
294 bool check_exceptions) {
295 pass_arg1(this, arg_1);
296 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
297 }
298
299 void MacroAssembler::call_VM(Register oop_result,
300 Register last_java_sp,
301 address entry_point,
302 Register arg_1,
303 Register arg_2,
304 bool check_exceptions) {
305
306 assert_different_registers(arg_1, c_rarg2);
307 pass_arg2(this, arg_2);
308 pass_arg1(this, arg_1);
309 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
310 }
311
312 void MacroAssembler::call_VM(Register oop_result,
313 Register last_java_sp,
314 address entry_point,
315 Register arg_1,
316 Register arg_2,
317 Register arg_3,
318 bool check_exceptions) {
319 assert_different_registers(arg_1, c_rarg2, c_rarg3);
320 assert_different_registers(arg_2, c_rarg3);
321 pass_arg3(this, arg_3);
322 pass_arg2(this, arg_2);
323 pass_arg1(this, arg_1);
324 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
325 }
326
327 void MacroAssembler::post_call_nop() {
328 assert(!in_compressible_scope(), "Must be");
329 assert_alignment(pc());
330 if (!Continuations::enabled()) {
331 return;
332 }
333 relocate(post_call_nop_Relocation::spec());
334 InlineSkippedInstructionsCounter skipCounter(this);
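  // Emit one nop plus li32(zr, 0): instructions that execute as nops but whose
  // immediate fields can later be patched with data (see NativePostCallNop).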
335 nop();
336 li32(zr, 0);
337 }
338
339 // these are no-ops overridden by InterpreterMacroAssembler
340 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
341 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
342
343 // Calls to C land
344 //
// When entering C land, the fp & esp of the last Java frame have to be recorded
346 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
347 // has to be reset to 0. This is required to allow proper stack traversal.
348 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
349 Register last_java_fp,
350 Register last_java_pc) {
351
352 if (last_java_pc->is_valid()) {
353 sd(last_java_pc, Address(xthread,
354 JavaThread::frame_anchor_offset() +
355 JavaFrameAnchor::last_Java_pc_offset()));
356 }
357
358 // determine last_java_sp register
359 if (!last_java_sp->is_valid()) {
360 last_java_sp = esp;
361 }
362
363 // last_java_fp is optional
364 if (last_java_fp->is_valid()) {
365 sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
366 }
367
368 // We must set sp last.
369 sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
370
371 }
372
373 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
374 Register last_java_fp,
375 address last_java_pc,
376 Register tmp) {
377 assert(last_java_pc != nullptr, "must provide a valid PC");
378
379 la(tmp, last_java_pc);
380 sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
381
382 set_last_Java_frame(last_java_sp, last_java_fp, noreg);
383 }
384
385 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
386 Register last_java_fp,
387 Label &L,
388 Register tmp) {
389 if (L.is_bound()) {
390 set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
391 } else {
392 L.add_patch_at(code(), locator());
393 IncompressibleScope scope(this); // the label address will be patched back.
394 set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
395 }
396 }
397
398 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
399 // we must set sp to zero to clear frame
400 sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
401
402 // must clear fp, so that compiled frames are not confused; it is
403 // possible that we need it only for debugging
404 if (clear_fp) {
405 sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
406 }
407
408 // Always clear the pc because it could have been set by make_walkable()
409 sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
410 }
411
412 void MacroAssembler::call_VM_base(Register oop_result,
413 Register java_thread,
414 Register last_java_sp,
415 Label* return_pc,
416 address entry_point,
417 int number_of_arguments,
418 bool check_exceptions) {
419 // determine java_thread register
420 if (!java_thread->is_valid()) {
421 java_thread = xthread;
422 }
423
424 // determine last_java_sp register
425 if (!last_java_sp->is_valid()) {
426 last_java_sp = esp;
427 }
428
429 // debugging support
430 assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
431 assert(java_thread == xthread, "unexpected register");
432
433 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
434 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
435
436 // push java thread (becomes first argument of C function)
437 mv(c_rarg0, java_thread);
438
439 // set last Java frame before call
440 assert(last_java_sp != fp, "can't use fp");
441
442 Label l;
443 set_last_Java_frame(last_java_sp, fp, return_pc != nullptr ? *return_pc : l, t0);
444
445 // do the call, remove parameters
446 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
447
448 // reset last Java frame
449 // Only interpreter should have to clear fp
450 reset_last_Java_frame(true);
451
452 // C++ interp handles this in the interpreter
453 check_and_handle_popframe(java_thread);
454 check_and_handle_earlyret(java_thread);
455
456 if (check_exceptions) {
457 // check for pending exceptions (java_thread is set upon return)
458 ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
459 Label ok;
460 beqz(t0, ok);
461 j(RuntimeAddress(StubRoutines::forward_exception_entry()));
462 bind(ok);
463 }
464
465 // get oop result if there is one and reset the value in the thread
466 if (oop_result->is_valid()) {
467 get_vm_result_oop(oop_result, java_thread);
468 }
469 }
470
471 void MacroAssembler::get_vm_result_oop(Register oop_result, Register java_thread) {
472 ld(oop_result, Address(java_thread, JavaThread::vm_result_oop_offset()));
473 sd(zr, Address(java_thread, JavaThread::vm_result_oop_offset()));
474 verify_oop_msg(oop_result, "broken oop in call_VM_base");
475 }
476
477 void MacroAssembler::get_vm_result_metadata(Register metadata_result, Register java_thread) {
478 ld(metadata_result, Address(java_thread, JavaThread::vm_result_metadata_offset()));
479 sd(zr, Address(java_thread, JavaThread::vm_result_metadata_offset()));
480 }
481
482 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
483 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
484 assert_different_registers(klass, xthread, tmp);
485
486 Label L_fallthrough, L_tmp;
487 if (L_fast_path == nullptr) {
488 L_fast_path = &L_fallthrough;
489 } else if (L_slow_path == nullptr) {
490 L_slow_path = &L_fallthrough;
491 }
492
493 // Fast path check: class is fully initialized
494 lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
495 membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
496 sub(tmp, tmp, InstanceKlass::fully_initialized);
497 beqz(tmp, *L_fast_path);
498
499 // Fast path check: current thread is initializer thread
500 ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
501
502 if (L_slow_path == &L_fallthrough) {
503 beq(xthread, tmp, *L_fast_path);
504 bind(*L_slow_path);
505 } else if (L_fast_path == &L_fallthrough) {
506 bne(xthread, tmp, *L_slow_path);
507 bind(*L_fast_path);
508 } else {
509 Unimplemented();
510 }
511 }
512
513 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
514 if (!VerifyOops) { return; }
515
516 // Pass register number to verify_oop_subroutine
517 const char* b = nullptr;
518 {
519 ResourceMark rm;
520 stringStream ss;
521 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
522 b = code_string(ss.as_string());
523 }
524 BLOCK_COMMENT("verify_oop {");
525
526 push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
527
528 mv(c_rarg0, reg); // c_rarg0 : x10
529 {
530 // The length of the instruction sequence emitted should not depend
531 // on the address of the char buffer so that the size of mach nodes for
532 // scratch emit and normal emit matches.
533 IncompressibleScope scope(this); // Fixed length
534 movptr(t0, (address) b);
535 }
536
537 // Call indirectly to solve generation ordering problem
538 ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
539 jalr(t1);
540
541 pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
542
543 BLOCK_COMMENT("} verify_oop");
544 }
545
546 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
547 if (!VerifyOops) {
548 return;
549 }
550
551 const char* b = nullptr;
552 {
553 ResourceMark rm;
554 stringStream ss;
555 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
556 b = code_string(ss.as_string());
557 }
558 BLOCK_COMMENT("verify_oop_addr {");
559
560 push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
561
562 if (addr.uses(sp)) {
563 la(x10, addr);
564 ld(x10, Address(x10, 4 * wordSize));
565 } else {
566 ld(x10, addr);
567 }
568
569 {
570 // The length of the instruction sequence emitted should not depend
571 // on the address of the char buffer so that the size of mach nodes for
572 // scratch emit and normal emit matches.
573 IncompressibleScope scope(this); // Fixed length
574 movptr(t0, (address) b);
575 }
576
577 // Call indirectly to solve generation ordering problem
578 ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
579 jalr(t1);
580
581 pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
582
583 BLOCK_COMMENT("} verify_oop_addr");
584 }
585
586 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
587 int extra_slot_offset) {
588 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
589 int stackElementSize = Interpreter::stackElementSize;
590 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
591 #ifdef ASSERT
592 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
593 assert(offset1 - offset == stackElementSize, "correct arithmetic");
594 #endif
595 if (arg_slot.is_constant()) {
596 return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
597 } else {
598 assert_different_registers(t0, arg_slot.as_register());
599 shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
600 return Address(t0, offset);
601 }
602 }
603
604 #ifndef PRODUCT
605 extern "C" void findpc(intptr_t x);
606 #endif
607
608 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
609 {
  // In order to get locks to work, we need to fake an in_VM state
611 if (ShowMessageBoxOnError) {
612 JavaThread* thread = JavaThread::current();
613 JavaThreadState saved_state = thread->thread_state();
614 thread->set_thread_state(_thread_in_vm);
615 #ifndef PRODUCT
616 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
617 ttyLocker ttyl;
618 BytecodeCounter::print();
619 }
620 #endif
621 if (os::message_box(msg, "Execution stopped, print registers?")) {
622 ttyLocker ttyl;
623 tty->print_cr(" pc = 0x%016lx", pc);
624 #ifndef PRODUCT
625 tty->cr();
626 findpc(pc);
627 tty->cr();
628 #endif
629 tty->print_cr(" x0 = 0x%016lx", regs[0]);
630 tty->print_cr(" x1 = 0x%016lx", regs[1]);
631 tty->print_cr(" x2 = 0x%016lx", regs[2]);
632 tty->print_cr(" x3 = 0x%016lx", regs[3]);
633 tty->print_cr(" x4 = 0x%016lx", regs[4]);
634 tty->print_cr(" x5 = 0x%016lx", regs[5]);
635 tty->print_cr(" x6 = 0x%016lx", regs[6]);
636 tty->print_cr(" x7 = 0x%016lx", regs[7]);
637 tty->print_cr(" x8 = 0x%016lx", regs[8]);
638 tty->print_cr(" x9 = 0x%016lx", regs[9]);
639 tty->print_cr("x10 = 0x%016lx", regs[10]);
640 tty->print_cr("x11 = 0x%016lx", regs[11]);
641 tty->print_cr("x12 = 0x%016lx", regs[12]);
642 tty->print_cr("x13 = 0x%016lx", regs[13]);
643 tty->print_cr("x14 = 0x%016lx", regs[14]);
644 tty->print_cr("x15 = 0x%016lx", regs[15]);
645 tty->print_cr("x16 = 0x%016lx", regs[16]);
646 tty->print_cr("x17 = 0x%016lx", regs[17]);
647 tty->print_cr("x18 = 0x%016lx", regs[18]);
648 tty->print_cr("x19 = 0x%016lx", regs[19]);
649 tty->print_cr("x20 = 0x%016lx", regs[20]);
650 tty->print_cr("x21 = 0x%016lx", regs[21]);
651 tty->print_cr("x22 = 0x%016lx", regs[22]);
652 tty->print_cr("x23 = 0x%016lx", regs[23]);
653 tty->print_cr("x24 = 0x%016lx", regs[24]);
654 tty->print_cr("x25 = 0x%016lx", regs[25]);
655 tty->print_cr("x26 = 0x%016lx", regs[26]);
656 tty->print_cr("x27 = 0x%016lx", regs[27]);
657 tty->print_cr("x28 = 0x%016lx", regs[28]);
      tty->print_cr("x29 = 0x%016lx", regs[29]);
      tty->print_cr("x30 = 0x%016lx", regs[30]);
659 tty->print_cr("x31 = 0x%016lx", regs[31]);
660 BREAKPOINT;
661 }
662 }
663 fatal("DEBUG MESSAGE: %s", msg);
664 }
665
666 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
667 assert_different_registers(value, tmp1, tmp2);
668 Label done, tagged, weak_tagged;
669
670 beqz(value, done); // Use null as-is.
671 // Test for tag.
672 andi(tmp1, value, JNIHandles::tag_mask);
673 bnez(tmp1, tagged);
674
675 // Resolve local handle
676 access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
677 verify_oop(value);
678 j(done);
679
680 bind(tagged);
681 // Test for jweak tag.
682 STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
683 test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
684 bnez(tmp1, weak_tagged);
685
686 // Resolve global handle
687 access_load_at(T_OBJECT, IN_NATIVE, value,
688 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
689 verify_oop(value);
690 j(done);
691
692 bind(weak_tagged);
693 // Resolve jweak.
694 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
695 Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
696 verify_oop(value);
697
698 bind(done);
699 }
700
701 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
702 assert_different_registers(value, tmp1, tmp2);
703 Label done;
704
705 beqz(value, done); // Use null as-is.
706
707 #ifdef ASSERT
708 {
709 STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
710 Label valid_global_tag;
711 test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
712 bnez(tmp1, valid_global_tag);
713 stop("non global jobject using resolve_global_jobject");
714 bind(valid_global_tag);
715 }
716 #endif
717
718 // Resolve global handle
719 access_load_at(T_OBJECT, IN_NATIVE, value,
720 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
721 verify_oop(value);
722
723 bind(done);
724 }
725
726 void MacroAssembler::stop(const char* msg) {
727 BLOCK_COMMENT(msg);
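  // The illegal instruction traps to the fault handler, which can recover the
  // message from the 64-bit pointer embedded right after it.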
728 illegal_instruction(Assembler::csr::time);
729 emit_int64((uintptr_t)msg);
730 }
731
732 void MacroAssembler::unimplemented(const char* what) {
733 const char* buf = nullptr;
734 {
735 ResourceMark rm;
736 stringStream ss;
737 ss.print("unimplemented: %s", what);
738 buf = code_string(ss.as_string());
739 }
740 stop(buf);
741 }
742
743 void MacroAssembler::emit_static_call_stub() {
744 IncompressibleScope scope(this); // Fixed length: see CompiledDirectCall::to_interp_stub_size().
745 // CompiledDirectCall::set_to_interpreted knows the
746 // exact layout of this stub.
747
748 mov_metadata(xmethod, (Metadata*)nullptr);
749
750 // Jump to the entry point of the c2i stub.
751 int32_t offset = 0;
752 movptr2(t1, 0, offset, t0); // lui + lui + slli + add
753 jr(t1, offset);
754 }
755
756 void MacroAssembler::call_VM_leaf_base(address entry_point,
757 int number_of_arguments,
758 Label *retaddr) {
759 int32_t offset = 0;
760 push_reg(RegSet::of(t1, xmethod), sp); // push << t1 & xmethod >> to sp
761 movptr(t1, entry_point, offset, t0);
762 jalr(t1, offset);
763 if (retaddr != nullptr) {
764 bind(*retaddr);
765 }
766 pop_reg(RegSet::of(t1, xmethod), sp); // pop << t1 & xmethod >> from sp
767 }
768
769 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
770 call_VM_leaf_base(entry_point, number_of_arguments);
771 }
772
773 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
774 pass_arg0(this, arg_0);
775 call_VM_leaf_base(entry_point, 1);
776 }
777
778 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
779 assert_different_registers(arg_1, c_rarg0);
780 pass_arg0(this, arg_0);
781 pass_arg1(this, arg_1);
782 call_VM_leaf_base(entry_point, 2);
783 }
784
785 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
786 Register arg_1, Register arg_2) {
787 assert_different_registers(arg_1, c_rarg0);
788 assert_different_registers(arg_2, c_rarg0, c_rarg1);
789 pass_arg0(this, arg_0);
790 pass_arg1(this, arg_1);
791 pass_arg2(this, arg_2);
792 call_VM_leaf_base(entry_point, 3);
793 }
794
795 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
796 pass_arg0(this, arg_0);
797 MacroAssembler::call_VM_leaf_base(entry_point, 1);
798 }
799
800 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
801
802 assert_different_registers(arg_0, c_rarg1);
803 pass_arg1(this, arg_1);
804 pass_arg0(this, arg_0);
805 MacroAssembler::call_VM_leaf_base(entry_point, 2);
806 }
807
808 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
809 assert_different_registers(arg_0, c_rarg1, c_rarg2);
810 assert_different_registers(arg_1, c_rarg2);
811 pass_arg2(this, arg_2);
812 pass_arg1(this, arg_1);
813 pass_arg0(this, arg_0);
814 MacroAssembler::call_VM_leaf_base(entry_point, 3);
815 }
816
817 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
818 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
819 assert_different_registers(arg_1, c_rarg2, c_rarg3);
820 assert_different_registers(arg_2, c_rarg3);
821
822 pass_arg3(this, arg_3);
823 pass_arg2(this, arg_2);
824 pass_arg1(this, arg_1);
825 pass_arg0(this, arg_0);
826 MacroAssembler::call_VM_leaf_base(entry_point, 4);
827 }
828
829 void MacroAssembler::la(Register Rd, const address addr) {
830 int32_t offset;
831 la(Rd, addr, offset);
832 addi(Rd, Rd, offset);
833 }
834
835 void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
836 int64_t distance = addr - pc();
837 assert(is_valid_32bit_offset(distance), "Must be");
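  // auipc materializes the upper 20 bits of the distance; the +0x800 rounds it so
  // that the sign-extended low 12 bits (returned in 'offset') can be added back
  // by a following addi/load/store.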
838 auipc(Rd, (int32_t)distance + 0x800);
839 offset = ((int32_t)distance << 20) >> 20;
840 }
841
842 // Materialize with auipc + addi sequence if adr is a literal
843 // address inside code cache. Emit a movptr sequence otherwise.
844 void MacroAssembler::la(Register Rd, const Address &adr) {
845 switch (adr.getMode()) {
846 case Address::literal: {
847 relocInfo::relocType rtype = adr.rspec().reloc()->type();
848 if (rtype == relocInfo::none) {
849 mv(Rd, (intptr_t)(adr.target()));
850 } else {
851 if (CodeCache::contains(adr.target())) {
852 relocate(adr.rspec(), [&] {
853 la(Rd, adr.target());
854 });
855 } else {
856 relocate(adr.rspec(), [&] {
857 movptr(Rd, adr.target());
858 });
859 }
860 }
861 break;
862 }
863 case Address::base_plus_offset: {
864 Address new_adr = legitimize_address(Rd, adr);
865 if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
866 addi(Rd, new_adr.base(), new_adr.offset());
867 }
868 break;
869 }
870 default:
871 ShouldNotReachHere();
872 }
873 }
874
875 void MacroAssembler::la(Register Rd, Label &label) {
876 IncompressibleScope scope(this); // the label address may be patched back.
877 wrap_label(Rd, label, &MacroAssembler::la);
878 }
879
880 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
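  // lui places the 16-bit immediate in bits [27:12] of Rd; the srli shifts it back
  // down, leaving a zero-extended (never sign-extended) value.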
881 lui(Rd, (uint32_t)imm << 12);
882 srli(Rd, Rd, 12);
883 }
884
885 void MacroAssembler::li32(Register Rd, int32_t imm) {
886 // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
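  // e.g. imm = 0x12345fff: lower = sign-extended imm[11:0] = -1, upper = 0x12346000;
  // lui materializes the upper bits and addiw adds the signed low 12 bits back.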
887 int64_t upper = imm, lower = imm;
888 lower = (imm << 20) >> 20;
889 upper -= lower;
890 upper = (int32_t)upper;
891 // lui Rd, imm[31:12] + imm[11]
892 lui(Rd, upper);
893 addiw(Rd, Rd, lower);
894 }
895
896 void MacroAssembler::li(Register Rd, int64_t imm) {
897 // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
898 // li -> c.li
899 if (do_compress() && (is_simm6(imm) && Rd != x0)) {
900 c_li(Rd, imm);
901 return;
902 }
903
904 int shift = 12;
905 int64_t upper = imm, lower = imm;
906 // Split imm to a lower 12-bit sign-extended part and the remainder,
907 // because addi will sign-extend the lower imm.
908 lower = ((int32_t)imm << 20) >> 20;
909 upper -= lower;
910
911 // Test whether imm is a 32-bit integer.
912 if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
913 (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
914 while (((upper >> shift) & 1) == 0) { shift++; }
915 upper >>= shift;
916 li(Rd, upper);
917 slli(Rd, Rd, shift);
918 if (lower != 0) {
919 addi(Rd, Rd, lower);
920 }
921 } else {
922 // 32-bit integer
923 Register hi_Rd = zr;
924 if (upper != 0) {
925 lui(Rd, (int32_t)upper);
926 hi_Rd = Rd;
927 }
928 if (lower != 0 || hi_Rd == zr) {
929 addiw(Rd, hi_Rd, lower);
930 }
931 }
932 }
933
934 void MacroAssembler::j(const address dest, Register temp) {
935 assert(CodeCache::contains(dest), "Must be");
936 assert_cond(dest != nullptr);
937 int64_t distance = dest - pc();
938
  // We can't patch compressed instructions, i.e. if the Label wasn't bound yet
  // this jump may need to be patched later.
940 IncompressibleScope scope(this);
941 if (is_simm21(distance) && ((distance % 2) == 0)) {
942 Assembler::jal(x0, distance);
943 } else {
944 assert(temp != noreg && temp != x0, "Expecting a register");
945 assert(temp != x1 && temp != x5, "temp register must not be x1/x5.");
946 int32_t offset = 0;
947 la(temp, dest, offset);
948 jr(temp, offset);
949 }
950 }
951
952 void MacroAssembler::j(const Address &dest, Register temp) {
953 switch (dest.getMode()) {
954 case Address::literal: {
955 if (CodeCache::contains(dest.target())) {
956 far_jump(dest, temp);
957 } else {
958 relocate(dest.rspec(), [&] {
959 int32_t offset;
960 movptr(temp, dest.target(), offset);
961 jr(temp, offset);
962 });
963 }
964 break;
965 }
966 case Address::base_plus_offset: {
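      // Split the offset: the signed low 12 bits go into the jalr immediate,
      // the remainder is materialized by la.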
967 int32_t offset = ((int32_t)dest.offset() << 20) >> 20;
968 la(temp, Address(dest.base(), dest.offset() - offset));
969 jr(temp, offset);
970 break;
971 }
972 default:
973 ShouldNotReachHere();
974 }
975 }
976
977 void MacroAssembler::j(Label &lab, Register temp) {
978 assert_different_registers(x0, temp);
979 if (lab.is_bound()) {
980 MacroAssembler::j(target(lab), temp);
981 } else {
982 lab.add_patch_at(code(), locator());
983 MacroAssembler::j(pc(), temp);
984 }
985 }
986
987 void MacroAssembler::jr(Register Rd, int32_t offset) {
988 assert(Rd != noreg, "expecting a register");
989 assert(Rd != x1 && Rd != x5, "Rd register must not be x1/x5.");
990 Assembler::jalr(x0, Rd, offset);
991 }
992
993 void MacroAssembler::call(const address dest, Register temp) {
994 assert_cond(dest != nullptr);
995 assert(temp != noreg, "expecting a register");
996 assert(temp != x5, "temp register must not be x5.");
997 int32_t offset = 0;
998 la(temp, dest, offset);
999 jalr(temp, offset);
1000 }
1001
1002 void MacroAssembler::jalr(Register Rs, int32_t offset) {
1003 assert(Rs != noreg, "expecting a register");
1004 assert(Rs != x5, "Rs register must not be x5.");
1005 Assembler::jalr(x1, Rs, offset);
1006 }
1007
1008 void MacroAssembler::rt_call(address dest, Register tmp) {
1009 assert(tmp != x5, "tmp register must not be x5.");
1010 RuntimeAddress target(dest);
1011 if (CodeCache::contains(dest)) {
1012 far_call(target, tmp);
1013 } else {
1014 relocate(target.rspec(), [&] {
1015 int32_t offset;
1016 movptr(tmp, target.target(), offset);
1017 jalr(tmp, offset);
1018 });
1019 }
1020 }
1021
1022 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
1023 if (L.is_bound()) {
1024 (this->*insn)(Rt, target(L));
1025 } else {
1026 L.add_patch_at(code(), locator());
1027 (this->*insn)(Rt, pc());
1028 }
1029 }
1030
1031 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
1032 compare_and_branch_insn insn,
1033 compare_and_branch_label_insn neg_insn, bool is_far) {
1034 if (is_far) {
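    // Far branch: branch over an unconditional jump using the negated condition,
    // since a plain conditional branch has a much shorter reach.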
1035 Label done;
1036 (this->*neg_insn)(r1, r2, done, /* is_far */ false);
1037 j(L);
1038 bind(done);
1039 } else {
1040 if (L.is_bound()) {
1041 (this->*insn)(r1, r2, target(L));
1042 } else {
1043 L.add_patch_at(code(), locator());
1044 (this->*insn)(r1, r2, pc());
1045 }
1046 }
1047 }
1048
1049 #define INSN(NAME, NEG_INSN) \
1050 void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) { \
1051 wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far); \
1052 }
1053
1054 INSN(beq, bne);
1055 INSN(bne, beq);
1056 INSN(blt, bge);
1057 INSN(bge, blt);
1058 INSN(bltu, bgeu);
1059 INSN(bgeu, bltu);
1060
1061 #undef INSN
1062
1063 #define INSN(NAME) \
1064 void MacroAssembler::NAME##z(Register Rs, const address dest) { \
1065 NAME(Rs, zr, dest); \
1066 } \
1067 void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) { \
1068 NAME(Rs, zr, l, is_far); \
1069 } \
1070
1071 INSN(beq);
1072 INSN(bne);
1073 INSN(blt);
1074 INSN(ble);
1075 INSN(bge);
1076 INSN(bgt);
1077
1078 #undef INSN
1079
1080 #define INSN(NAME, NEG_INSN) \
1081 void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) { \
1082 NEG_INSN(Rt, Rs, dest); \
1083 } \
1084 void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) { \
1085 NEG_INSN(Rt, Rs, l, is_far); \
1086 }
1087
1088 INSN(bgt, blt);
1089 INSN(ble, bge);
1090 INSN(bgtu, bltu);
1091 INSN(bleu, bgeu);
1092
1093 #undef INSN
1094
1095 // cmov
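// With Zicond, each cmov is branch-free: czero.eqz/czero.nez zero out either the
// old dst or src depending on the condition in t0, and the final or keeps the
// surviving value. Without Zicond, fall back to a short branch over the mv.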
1096 void MacroAssembler::cmov_eq(Register cmp1, Register cmp2, Register dst, Register src) {
1097 if (UseZicond) {
1098 xorr(t0, cmp1, cmp2);
1099 czero_eqz(dst, dst, t0);
1100 czero_nez(t0 , src, t0);
1101 orr(dst, dst, t0);
1102 return;
1103 }
1104 Label no_set;
1105 bne(cmp1, cmp2, no_set);
1106 mv(dst, src);
1107 bind(no_set);
1108 }
1109
1110 void MacroAssembler::cmov_ne(Register cmp1, Register cmp2, Register dst, Register src) {
1111 if (UseZicond) {
1112 xorr(t0, cmp1, cmp2);
1113 czero_nez(dst, dst, t0);
1114 czero_eqz(t0 , src, t0);
1115 orr(dst, dst, t0);
1116 return;
1117 }
1118 Label no_set;
1119 beq(cmp1, cmp2, no_set);
1120 mv(dst, src);
1121 bind(no_set);
1122 }
1123
1124 void MacroAssembler::cmov_le(Register cmp1, Register cmp2, Register dst, Register src) {
1125 if (UseZicond) {
1126 slt(t0, cmp2, cmp1);
1127 czero_eqz(dst, dst, t0);
1128 czero_nez(t0, src, t0);
1129 orr(dst, dst, t0);
1130 return;
1131 }
1132 Label no_set;
1133 bgt(cmp1, cmp2, no_set);
1134 mv(dst, src);
1135 bind(no_set);
1136 }
1137
1138 void MacroAssembler::cmov_leu(Register cmp1, Register cmp2, Register dst, Register src) {
1139 if (UseZicond) {
1140 sltu(t0, cmp2, cmp1);
1141 czero_eqz(dst, dst, t0);
1142 czero_nez(t0, src, t0);
1143 orr(dst, dst, t0);
1144 return;
1145 }
1146 Label no_set;
1147 bgtu(cmp1, cmp2, no_set);
1148 mv(dst, src);
1149 bind(no_set);
1150 }
1151
1152 void MacroAssembler::cmov_ge(Register cmp1, Register cmp2, Register dst, Register src) {
1153 if (UseZicond) {
1154 slt(t0, cmp1, cmp2);
1155 czero_eqz(dst, dst, t0);
1156 czero_nez(t0, src, t0);
1157 orr(dst, dst, t0);
1158 return;
1159 }
1160 Label no_set;
1161 blt(cmp1, cmp2, no_set);
1162 mv(dst, src);
1163 bind(no_set);
1164 }
1165
1166 void MacroAssembler::cmov_geu(Register cmp1, Register cmp2, Register dst, Register src) {
1167 if (UseZicond) {
1168 sltu(t0, cmp1, cmp2);
1169 czero_eqz(dst, dst, t0);
1170 czero_nez(t0, src, t0);
1171 orr(dst, dst, t0);
1172 return;
1173 }
1174 Label no_set;
1175 bltu(cmp1, cmp2, no_set);
1176 mv(dst, src);
1177 bind(no_set);
1178 }
1179
1180 void MacroAssembler::cmov_lt(Register cmp1, Register cmp2, Register dst, Register src) {
1181 if (UseZicond) {
1182 slt(t0, cmp1, cmp2);
1183 czero_nez(dst, dst, t0);
1184 czero_eqz(t0, src, t0);
1185 orr(dst, dst, t0);
1186 return;
1187 }
1188 Label no_set;
1189 bge(cmp1, cmp2, no_set);
1190 mv(dst, src);
1191 bind(no_set);
1192 }
1193
1194 void MacroAssembler::cmov_ltu(Register cmp1, Register cmp2, Register dst, Register src) {
1195 if (UseZicond) {
1196 sltu(t0, cmp1, cmp2);
1197 czero_nez(dst, dst, t0);
1198 czero_eqz(t0, src, t0);
1199 orr(dst, dst, t0);
1200 return;
1201 }
1202 Label no_set;
1203 bgeu(cmp1, cmp2, no_set);
1204 mv(dst, src);
1205 bind(no_set);
1206 }
1207
1208 void MacroAssembler::cmov_gt(Register cmp1, Register cmp2, Register dst, Register src) {
1209 if (UseZicond) {
1210 slt(t0, cmp2, cmp1);
1211 czero_nez(dst, dst, t0);
1212 czero_eqz(t0, src, t0);
1213 orr(dst, dst, t0);
1214 return;
1215 }
1216 Label no_set;
1217 ble(cmp1, cmp2, no_set);
1218 mv(dst, src);
1219 bind(no_set);
1220 }
1221
1222 void MacroAssembler::cmov_gtu(Register cmp1, Register cmp2, Register dst, Register src) {
1223 if (UseZicond) {
1224 sltu(t0, cmp2, cmp1);
1225 czero_nez(dst, dst, t0);
1226 czero_eqz(t0, src, t0);
1227 orr(dst, dst, t0);
1228 return;
1229 }
1230 Label no_set;
1231 bleu(cmp1, cmp2, no_set);
1232 mv(dst, src);
1233 bind(no_set);
1234 }
1235
1236 // ----------- cmove, compare float -----------
1237 //
// For CmpF/D + CMoveI/L, the ordered cases are straightforward,
// so only the behaviour of the unordered cases is listed below.
1240 //
1241 // Set dst (CMoveI (Binary cop (CmpF/D op1 op2)) (Binary dst src))
1242 // (If one or both inputs to the compare are NaN, then)
1243 // 1. (op1 lt op2) => true => CMove: dst = src
1244 // 2. (op1 le op2) => true => CMove: dst = src
1245 // 3. (op1 gt op2) => false => CMove: dst = dst
1246 // 4. (op1 ge op2) => false => CMove: dst = dst
1247 // 5. (op1 eq op2) => false => CMove: dst = dst
1248 // 6. (op1 ne op2) => true => CMove: dst = src
1249
1250 void MacroAssembler::cmov_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1251 if (UseZicond) {
1252 if (is_single) {
1253 feq_s(t0, cmp1, cmp2);
1254 } else {
1255 feq_d(t0, cmp1, cmp2);
1256 }
1257 czero_nez(dst, dst, t0);
1258 czero_eqz(t0 , src, t0);
1259 orr(dst, dst, t0);
1260 return;
1261 }
1262 Label no_set;
1263 if (is_single) {
1264 // jump if cmp1 != cmp2, including the case of NaN
1265 // fallthrough (i.e. move src to dst) if cmp1 == cmp2
1266 float_bne(cmp1, cmp2, no_set);
1267 } else {
1268 double_bne(cmp1, cmp2, no_set);
1269 }
1270 mv(dst, src);
1271 bind(no_set);
1272 }
1273
1274 void MacroAssembler::cmov_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1275 if (UseZicond) {
1276 if (is_single) {
1277 feq_s(t0, cmp1, cmp2);
1278 } else {
1279 feq_d(t0, cmp1, cmp2);
1280 }
1281 czero_eqz(dst, dst, t0);
1282 czero_nez(t0 , src, t0);
1283 orr(dst, dst, t0);
1284 return;
1285 }
1286 Label no_set;
1287 if (is_single) {
1288 // jump if cmp1 == cmp2
1289 // fallthrough (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1290 float_beq(cmp1, cmp2, no_set);
1291 } else {
1292 double_beq(cmp1, cmp2, no_set);
1293 }
1294 mv(dst, src);
1295 bind(no_set);
1296 }
1297
1298 void MacroAssembler::cmov_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1299 if (UseZicond) {
1300 if (is_single) {
1301 flt_s(t0, cmp2, cmp1);
1302 } else {
1303 flt_d(t0, cmp2, cmp1);
1304 }
1305 czero_eqz(dst, dst, t0);
1306 czero_nez(t0 , src, t0);
1307 orr(dst, dst, t0);
1308 return;
1309 }
1310 Label no_set;
1311 if (is_single) {
1312 // jump if cmp1 > cmp2
1313 // fallthrough (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1314 float_bgt(cmp1, cmp2, no_set);
1315 } else {
1316 double_bgt(cmp1, cmp2, no_set);
1317 }
1318 mv(dst, src);
1319 bind(no_set);
1320 }
1321
1322 void MacroAssembler::cmov_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1323 if (UseZicond) {
1324 if (is_single) {
1325 fle_s(t0, cmp2, cmp1);
1326 } else {
1327 fle_d(t0, cmp2, cmp1);
1328 }
1329 czero_nez(dst, dst, t0);
1330 czero_eqz(t0 , src, t0);
1331 orr(dst, dst, t0);
1332 return;
1333 }
1334 Label no_set;
1335 if (is_single) {
1336 // jump if cmp1 < cmp2 or either is NaN
1337 // fallthrough (i.e. move src to dst) if cmp1 >= cmp2
1338 float_blt(cmp1, cmp2, no_set, false, true);
1339 } else {
1340 double_blt(cmp1, cmp2, no_set, false, true);
1341 }
1342 mv(dst, src);
1343 bind(no_set);
1344 }
1345
1346 void MacroAssembler::cmov_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1347 if (UseZicond) {
1348 if (is_single) {
1349 fle_s(t0, cmp2, cmp1);
1350 } else {
1351 fle_d(t0, cmp2, cmp1);
1352 }
1353 czero_eqz(dst, dst, t0);
1354 czero_nez(t0 , src, t0);
1355 orr(dst, dst, t0);
1356 return;
1357 }
1358 Label no_set;
1359 if (is_single) {
1360 // jump if cmp1 >= cmp2
1361 // fallthrough (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1362 float_bge(cmp1, cmp2, no_set);
1363 } else {
1364 double_bge(cmp1, cmp2, no_set);
1365 }
1366 mv(dst, src);
1367 bind(no_set);
1368 }
1369
1370 void MacroAssembler::cmov_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1371 if (UseZicond) {
1372 if (is_single) {
1373 flt_s(t0, cmp2, cmp1);
1374 } else {
1375 flt_d(t0, cmp2, cmp1);
1376 }
1377 czero_nez(dst, dst, t0);
1378 czero_eqz(t0 , src, t0);
1379 orr(dst, dst, t0);
1380 return;
1381 }
1382 Label no_set;
1383 if (is_single) {
1384 // jump if cmp1 <= cmp2 or either is NaN
1385 // fallthrough (i.e. move src to dst) if cmp1 > cmp2
1386 float_ble(cmp1, cmp2, no_set, false, true);
1387 } else {
1388 double_ble(cmp1, cmp2, no_set, false, true);
1389 }
1390 mv(dst, src);
1391 bind(no_set);
1392 }
1393
1394 // Float compare branch instructions
1395
1396 #define INSN(NAME, FLOATCMP, BRANCH) \
1397 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) { \
1398 FLOATCMP##_s(t0, Rs1, Rs2); \
1399 BRANCH(t0, l, is_far); \
1400 } \
1401 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) { \
1402 FLOATCMP##_d(t0, Rs1, Rs2); \
1403 BRANCH(t0, l, is_far); \
1404 }
1405
1406 INSN(beq, feq, bnez);
1407 INSN(bne, feq, beqz);
1408
1409 #undef INSN
1410
1411
1412 #define INSN(NAME, FLOATCMP1, FLOATCMP2) \
1413 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1414 bool is_far, bool is_unordered) { \
1415 if (is_unordered) { \
1416 /* jump if either source is NaN or condition is expected */ \
1417 FLOATCMP2##_s(t0, Rs2, Rs1); \
1418 beqz(t0, l, is_far); \
1419 } else { \
1420 /* jump if no NaN in source and condition is expected */ \
1421 FLOATCMP1##_s(t0, Rs1, Rs2); \
1422 bnez(t0, l, is_far); \
1423 } \
1424 } \
1425 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1426 bool is_far, bool is_unordered) { \
1427 if (is_unordered) { \
1428 /* jump if either source is NaN or condition is expected */ \
1429 FLOATCMP2##_d(t0, Rs2, Rs1); \
1430 beqz(t0, l, is_far); \
1431 } else { \
1432 /* jump if no NaN in source and condition is expected */ \
1433 FLOATCMP1##_d(t0, Rs1, Rs2); \
1434 bnez(t0, l, is_far); \
1435 } \
1436 }
1437
1438 INSN(ble, fle, flt);
1439 INSN(blt, flt, fle);
1440
1441 #undef INSN
1442
1443 #define INSN(NAME, CMP) \
1444 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1445 bool is_far, bool is_unordered) { \
1446 float_##CMP(Rs2, Rs1, l, is_far, is_unordered); \
1447 } \
1448 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1449 bool is_far, bool is_unordered) { \
1450 double_##CMP(Rs2, Rs1, l, is_far, is_unordered); \
1451 }
1452
1453 INSN(bgt, blt);
1454 INSN(bge, ble);
1455
1456 #undef INSN
1457
1458 void MacroAssembler::csrr(Register Rd, unsigned csr) {
1459 // These three are specified in zicntr and are unused.
1460 // Before adding use-cases add the appropriate hwprobe and flag.
1461 assert(csr != CSR_INSTRET && csr != CSR_CYCLE && csr != CSR_TIME,
1462 "Not intended for use without enabling zicntr.");
1463 csrrs(Rd, csr, x0);
1464 }
1465
1466 #define INSN(NAME, OPFUN) \
1467 void MacroAssembler::NAME(unsigned csr, Register Rs) { \
1468 OPFUN(x0, csr, Rs); \
1469 }
1470
1471 INSN(csrw, csrrw);
1472 INSN(csrs, csrrs);
1473 INSN(csrc, csrrc);
1474
1475 #undef INSN
1476
1477 #define INSN(NAME, OPFUN) \
1478 void MacroAssembler::NAME(unsigned csr, unsigned imm) { \
1479 OPFUN(x0, csr, imm); \
1480 }
1481
1482 INSN(csrwi, csrrwi);
1483 INSN(csrsi, csrrsi);
1484 INSN(csrci, csrrci);
1485
1486 #undef INSN
1487
1488 #define INSN(NAME, CSR) \
1489 void MacroAssembler::NAME(Register Rd, Register Rs) { \
1490 csrrw(Rd, CSR, Rs); \
1491 }
1492
1493 INSN(fscsr, CSR_FCSR);
1494 INSN(fsrm, CSR_FRM);
1495 INSN(fsflags, CSR_FFLAGS);
1496
1497 #undef INSN
1498
1499 #define INSN(NAME) \
1500 void MacroAssembler::NAME(Register Rs) { \
1501 NAME(x0, Rs); \
1502 }
1503
1504 INSN(fscsr);
1505 INSN(fsrm);
1506 INSN(fsflags);
1507
1508 #undef INSN
1509
1510 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
1511 guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1512 csrrwi(Rd, CSR_FRM, imm);
1513 }
1514
1515 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1516 csrrwi(Rd, CSR_FFLAGS, imm);
1517 }
1518
1519 #define INSN(NAME) \
1520 void MacroAssembler::NAME(unsigned imm) { \
1521 NAME(x0, imm); \
1522 }
1523
1524 INSN(fsrmi);
1525 INSN(fsflagsi);
1526
1527 #undef INSN
1528
1529 void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
1530 if (RestoreMXCSROnJNICalls) {
1531 Label skip_fsrmi;
1532 frrm(tmp);
1533 // Set FRM to the state we need. We do want Round to Nearest.
1534 // We don't want non-IEEE rounding modes.
1535 guarantee(RoundingMode::rne == 0, "must be");
1536 beqz(tmp, skip_fsrmi); // Only reset FRM if it's wrong
1537 fsrmi(RoundingMode::rne);
1538 bind(skip_fsrmi);
1539 }
1540 }
1541
1542 void MacroAssembler::push_reg(Register Rs)
1543 {
1544 subi(esp, esp, wordSize);
1545 sd(Rs, Address(esp, 0));
1546 }
1547
1548 void MacroAssembler::pop_reg(Register Rd)
1549 {
1550 ld(Rd, Address(esp, 0));
1551 addi(esp, esp, wordSize);
1552 }
1553
1554 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1555 int count = 0;
  // Scan bitset to accumulate register numbers (highest register first)
1557 for (int reg = 31; reg >= 0; reg--) {
1558 if ((1U << 31) & bitset) {
1559 regs[count++] = reg;
1560 }
1561 bitset <<= 1;
1562 }
1563 return count;
1564 }
1565
1566 // Push integer registers in the bitset supplied. Don't push sp.
1567 // Return the number of words pushed
1568 int MacroAssembler::push_reg(unsigned int bitset, Register stack) {
1569 DEBUG_ONLY(int words_pushed = 0;)
1570 unsigned char regs[32];
1571 int count = bitset_to_regs(bitset, regs);
1572 // reserve one slot to align for odd count
1573 int offset = is_even(count) ? 0 : wordSize;
1574
1575 if (count) {
1576 sub(stack, stack, count * wordSize + offset);
1577 }
1578 for (int i = count - 1; i >= 0; i--) {
1579 sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1580 DEBUG_ONLY(words_pushed++;)
1581 }
1582
1583 assert(words_pushed == count, "oops, pushed != count");
1584
1585 return count;
1586 }
1587
1588 int MacroAssembler::pop_reg(unsigned int bitset, Register stack) {
1589 DEBUG_ONLY(int words_popped = 0;)
1590 unsigned char regs[32];
1591 int count = bitset_to_regs(bitset, regs);
1592 // reserve one slot to align for odd count
1593 int offset = is_even(count) ? 0 : wordSize;
1594
1595 for (int i = count - 1; i >= 0; i--) {
1596 ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1597 DEBUG_ONLY(words_popped++;)
1598 }
1599
1600 if (count) {
1601 add(stack, stack, count * wordSize + offset);
1602 }
1603 assert(words_popped == count, "oops, popped != count");
1604
1605 return count;
1606 }
1607
1608 // Push floating-point registers in the bitset supplied.
1609 // Return the number of words pushed
1610 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
1611 DEBUG_ONLY(int words_pushed = 0;)
1612 unsigned char regs[32];
1613 int count = bitset_to_regs(bitset, regs);
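  // Round the slot count up to an even number to keep the stack 16-byte aligned.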
1614 int push_slots = count + (count & 1);
1615
1616 if (count) {
1617 subi(stack, stack, push_slots * wordSize);
1618 }
1619
1620 for (int i = count - 1; i >= 0; i--) {
1621 fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
1622 DEBUG_ONLY(words_pushed++;)
1623 }
1624
1625 assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
1626
1627 return count;
1628 }
1629
1630 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
1631 DEBUG_ONLY(int words_popped = 0;)
1632 unsigned char regs[32];
1633 int count = bitset_to_regs(bitset, regs);
1634 int pop_slots = count + (count & 1);
1635
1636 for (int i = count - 1; i >= 0; i--) {
1637 fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
1638 DEBUG_ONLY(words_popped++;)
1639 }
1640
1641 if (count) {
1642 addi(stack, stack, pop_slots * wordSize);
1643 }
1644
1645 assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
1646
1647 return count;
1648 }
1649
1650 /**
1651 * Emits code to update CRC-32 with a byte value according to constants in table
1652 *
1653 * @param [in,out]crc Register containing the crc.
1654 * @param [in]val Register containing the byte to fold into the CRC.
1655 * @param [in]table Register containing the table of crc constants.
1656 *
1657 * uint32_t crc;
1658 * val = crc_table[(val ^ crc) & 0xFF];
1659 * crc = val ^ (crc >> 8);
1660 *
1661 */
1662 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
1663 assert_different_registers(crc, val, table);
1664
1665 xorr(val, val, crc);
1666 zext(val, val, 8);
1667 shadd(val, val, table, val, 2);
1668 lwu(val, Address(val));
1669 srli(crc, crc, 8);
1670 xorr(crc, val, crc);
1671 }
1672
1673 /**
1674 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
1675 *
1676 * @param [in,out]crc Register containing the crc.
1677 * @param [in]v Register containing the 32-bit to fold into the CRC.
1678 * @param [in]table0 Register containing table 0 of crc constants.
1679 * @param [in]table1 Register containing table 1 of crc constants.
1680 * @param [in]table2 Register containing table 2 of crc constants.
1681 * @param [in]table3 Register containing table 3 of crc constants.
1682 *
1683 * uint32_t crc;
1684 * v = crc ^ v
1685 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
1686 *
1687 */
1688 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp1, Register tmp2, Register tmp3,
1689 Register table0, Register table1, Register table2, Register table3, bool upper) {
1690 assert_different_registers(crc, v, tmp1, tmp2, tmp3, table0, table1, table2, table3);
1691
1692 if (upper)
1693 srli(v, v, 32);
1694 xorr(v, v, crc);
1695
1696 zext(tmp1, v, 8);
1697 shadd(tmp1, tmp1, table3, tmp2, 2);
1698 lwu(crc, Address(tmp1));
1699
1700 slli(tmp1, v, 16);
1701 slli(tmp3, v, 8);
1702
1703 srliw(tmp1, tmp1, 24);
1704 srliw(tmp3, tmp3, 24);
1705
1706 shadd(tmp1, tmp1, table2, tmp1, 2);
1707 lwu(tmp2, Address(tmp1));
1708
1709 shadd(tmp3, tmp3, table1, tmp3, 2);
1710 xorr(crc, crc, tmp2);
1711
1712 lwu(tmp2, Address(tmp3));
  // Using 'srli' instead of 'srliw' is cheaper when the upper bits do not need to be cleared
1714 if (upper)
1715 srli(tmp1, v, 24);
1716 else
1717 srliw(tmp1, v, 24);
1718
1719 // no need to clear bits other than lowest two
1720 shadd(tmp1, tmp1, table0, tmp1, 2);
1721 xorr(crc, crc, tmp2);
1722 lwu(tmp2, Address(tmp1));
1723 xorr(crc, crc, tmp2);
1724 }
1725
1726
1727 #ifdef COMPILER2
1728 // This improvement (vectorization) is based on java.base/share/native/libzip/zlib/zcrc32.c.
// To make it, the following steps were taken:
//  1. in zcrc32.c, modify N to 16 and the related code,
//  2. re-generate the needed tables; we use tables for (N == 16, W == 4),
//  3. finally vectorize the code (the original implementation in zcrc32.c is scalar only).
// The new tables for the vector version come after table3.
1734 void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register len,
1735 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
1736 Register table0, Register table3) {
1737 assert_different_registers(t1, crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp5, table0, table3);
1738 const int N = 16, W = 4;
1739 const int64_t single_table_size = 256;
1740 const Register blks = tmp2;
1741 const Register tmpTable = tmp3, tableN16 = tmp4;
1742 const VectorRegister vcrc = v4, vword = v8, vtmp = v12;
1743 Label VectorLoop;
1744 Label LastBlock;
1745
1746 add(tableN16, table3, 1 * single_table_size * sizeof(juint), tmp1);
1747 mv(tmp5, 0xff);
1748
1749 if (MaxVectorSize == 16) {
1750 vsetivli(zr, N, Assembler::e32, Assembler::m4, Assembler::ma, Assembler::ta);
1751 } else if (MaxVectorSize == 32) {
1752 vsetivli(zr, N, Assembler::e32, Assembler::m2, Assembler::ma, Assembler::ta);
1753 } else {
1754 assert(MaxVectorSize > 32, "sanity");
1755 vsetivli(zr, N, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta);
1756 }
1757
1758 vmv_v_x(vcrc, zr);
1759 vmv_s_x(vcrc, crc);
1760
  // blks = number of 64-byte blocks, len = remaining tail bytes;
  // the last 64-byte block is handled separately in LastBlock.
1762 srli(blks, len, 6);
1763 slli(t1, blks, 6);
1764 sub(len, len, t1);
1765 subi(blks, blks, 1);
1766 blez(blks, LastBlock);
1767
1768 bind(VectorLoop);
1769 {
1770 mv(tmpTable, tableN16);
1771
1772 vle32_v(vword, buf);
1773 vxor_vv(vword, vword, vcrc);
1774
1775 addi(buf, buf, N*4);
1776
1777 vand_vx(vtmp, vword, tmp5);
1778 vsll_vi(vtmp, vtmp, 2);
1779 vluxei32_v(vcrc, tmpTable, vtmp);
1780
1781 mv(tmp1, 1);
1782 for (int k = 1; k < W; k++) {
1783 addi(tmpTable, tmpTable, single_table_size*4);
1784
1785 slli(t1, tmp1, 3);
1786 vsrl_vx(vtmp, vword, t1);
1787
1788 vand_vx(vtmp, vtmp, tmp5);
1789 vsll_vi(vtmp, vtmp, 2);
1790 vluxei32_v(vtmp, tmpTable, vtmp);
1791
1792 vxor_vv(vcrc, vcrc, vtmp);
1793
1794 addi(tmp1, tmp1, 1);
1795 }
1796
1797 subi(blks, blks, 1);
1798 bgtz(blks, VectorLoop);
1799 }
1800
1801 bind(LastBlock);
1802 {
1803 vle32_v(vtmp, buf);
1804 vxor_vv(vcrc, vcrc, vtmp);
1805 mv(crc, zr);
1806 for (int i = 0; i < N; i++) {
1807 vmv_x_s(tmp2, vcrc);
      // vmv_x_s sign-extends the SEW-bit element to XLEN, but we need it zero-extended here.
1809 zext(tmp2, tmp2, 32);
1810 vslidedown_vi(vcrc, vcrc, 1);
1811 xorr(crc, crc, tmp2);
1812 for (int j = 0; j < W; j++) {
1813 andr(t1, crc, tmp5);
1814 shadd(t1, t1, table0, tmp1, 2);
1815 lwu(t1, Address(t1, 0));
1816 srli(tmp2, crc, 8);
1817 xorr(crc, tmp2, t1);
1818 }
1819 }
1820 addi(buf, buf, N*4);
1821 }
1822 }
1823
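// Folding helper for MaxVectorSize == 16 (two 64-bit lanes per vector register):
// carry-less multiply vx by the constants in vt, xor-reduce the low-half and
// high-half products, fold them into the next 16 bytes loaded from buf, and
// leave the merged 128-bit result in vx. buf is advanced by STEP bytes.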
1824 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16(VectorRegister vx, VectorRegister vt,
1825 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1826 Register buf, Register tmp, const int STEP) {
1827 assert_different_registers(vx, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1828 vclmul_vv(vtmp1, vx, vt);
1829 vclmulh_vv(vtmp2, vx, vt);
1830 vle64_v(vtmp4, buf); addi(buf, buf, STEP);
1831 // low parts
1832 vredxor_vs(vtmp3, vtmp1, vtmp4);
1833 // high parts
1834 vslidedown_vi(vx, vtmp4, 1);
1835 vredxor_vs(vtmp1, vtmp2, vx);
1836 // merge low and high back
1837 vslideup_vi(vx, vtmp1, 1);
1838 vmv_x_s(tmp, vtmp3);
1839 vmv_s_x(vx, tmp);
1840 }
1841
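// Same folding step as above, except that the data to fold into is already held
// in vy (no buffer load); the merged 128-bit result is left in vx.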
1842 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_2(VectorRegister vx, VectorRegister vy, VectorRegister vt,
1843 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1844 Register tmp) {
1845 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1846 vclmul_vv(vtmp1, vx, vt);
1847 vclmulh_vv(vtmp2, vx, vt);
1848 // low parts
1849 vredxor_vs(vtmp3, vtmp1, vy);
1850 // high parts
1851 vslidedown_vi(vtmp4, vy, 1);
1852 vredxor_vs(vtmp1, vtmp2, vtmp4);
1853 // merge low and high back
1854 vslideup_vi(vx, vtmp1, 1);
1855 vmv_x_s(tmp, vtmp3);
1856 vmv_s_x(vx, tmp);
1857 }
1858
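// Identical to the _2 variant above, except that the merged result is written
// into vy, so successive folds accumulate into a single register.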
1859 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_3(VectorRegister vx, VectorRegister vy, VectorRegister vt,
1860 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
1861 Register tmp) {
1862 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1863 vclmul_vv(vtmp1, vx, vt);
1864 vclmulh_vv(vtmp2, vx, vt);
1865 // low parts
1866 vredxor_vs(vtmp3, vtmp1, vy);
1867 // high parts
1868 vslidedown_vi(vtmp4, vy, 1);
1869 vredxor_vs(vtmp1, vtmp2, vtmp4);
1870 // merge low and high back
1871 vslideup_vi(vy, vtmp1, 1);
1872 vmv_x_s(tmp, vtmp3);
1873 vmv_s_x(vy, tmp);
1874 }
1875
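// Carry-less-multiplication CRC32 folding for MaxVectorSize == 16: load the first
// 128 bytes into v0..v7 and xor the incoming crc into v0, fold 128 data bytes per
// round in the main loop, then fold the 128 bytes held in registers down to 64 and
// finally to 16 bytes. The remaining 16 bytes are left in tmp1 (low 8 bytes) and
// tmp2 (high 8 bytes) for the scalar tail in kernel_crc32_vclmul_fold.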
1876 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_16(Register crc, Register buf, Register len,
1877 Register vclmul_table, Register tmp1, Register tmp2) {
1878 assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
1879 assert(MaxVectorSize == 16, "sanity");
1880
1881 const int TABLE_STEP = 16;
1882 const int STEP = 16;
1883 const int LOOP_STEP = 128;
1884 const int N = 2;
1885
1886 Register loop_step = t1;
1887
1888 // ======== preparation ========
1889
1890 mv(loop_step, LOOP_STEP);
1891 sub(len, len, loop_step);
1892
1893 vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
1894 vle64_v(v0, buf); addi(buf, buf, STEP);
1895 vle64_v(v1, buf); addi(buf, buf, STEP);
1896 vle64_v(v2, buf); addi(buf, buf, STEP);
1897 vle64_v(v3, buf); addi(buf, buf, STEP);
1898 vle64_v(v4, buf); addi(buf, buf, STEP);
1899 vle64_v(v5, buf); addi(buf, buf, STEP);
1900 vle64_v(v6, buf); addi(buf, buf, STEP);
1901 vle64_v(v7, buf); addi(buf, buf, STEP);
1902
1903 vmv_v_x(v31, zr);
1904 vsetivli(zr, 1, Assembler::e32, Assembler::m1, Assembler::mu, Assembler::tu);
1905 vmv_s_x(v31, crc);
1906 vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
1907 vxor_vv(v0, v0, v31);
1908
1909 // load table
1910 vle64_v(v31, vclmul_table);
1911
1912 Label L_16_bytes_loop;
1913 j(L_16_bytes_loop);
1914
1915
1916 // ======== folding 128 bytes in data buffer per round ========
1917
1918 align(OptoLoopAlignment);
1919 bind(L_16_bytes_loop);
1920 {
1921 crc32_vclmul_fold_16_bytes_vectorsize_16(v0, v31, v8, v9, v10, v11, buf, tmp2, STEP);
1922 crc32_vclmul_fold_16_bytes_vectorsize_16(v1, v31, v12, v13, v14, v15, buf, tmp2, STEP);
1923 crc32_vclmul_fold_16_bytes_vectorsize_16(v2, v31, v16, v17, v18, v19, buf, tmp2, STEP);
1924 crc32_vclmul_fold_16_bytes_vectorsize_16(v3, v31, v20, v21, v22, v23, buf, tmp2, STEP);
1925 crc32_vclmul_fold_16_bytes_vectorsize_16(v4, v31, v24, v25, v26, v27, buf, tmp2, STEP);
1926 crc32_vclmul_fold_16_bytes_vectorsize_16(v5, v31, v8, v9, v10, v11, buf, tmp2, STEP);
1927 crc32_vclmul_fold_16_bytes_vectorsize_16(v6, v31, v12, v13, v14, v15, buf, tmp2, STEP);
1928 crc32_vclmul_fold_16_bytes_vectorsize_16(v7, v31, v16, v17, v18, v19, buf, tmp2, STEP);
1929 }
1930 sub(len, len, loop_step);
1931 bge(len, loop_step, L_16_bytes_loop);
1932
1933
1934 // ======== folding into 64 bytes from 128 bytes in register ========
1935
1936 // load table
1937 addi(vclmul_table, vclmul_table, TABLE_STEP);
1938 vle64_v(v31, vclmul_table);
1939
1940 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v0, v4, v31, v8, v9, v10, v11, tmp2);
1941 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v1, v5, v31, v12, v13, v14, v15, tmp2);
1942 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v2, v6, v31, v16, v17, v18, v19, tmp2);
1943 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v3, v7, v31, v20, v21, v22, v23, tmp2);
1944
1945
1946 // ======== folding into 16 bytes from 64 bytes in register ========
1947
1948 addi(vclmul_table, vclmul_table, TABLE_STEP);
1949 vle64_v(v31, vclmul_table);
1950 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v0, v3, v31, v8, v9, v10, v11, tmp2);
1951
1952 addi(vclmul_table, vclmul_table, TABLE_STEP);
1953 vle64_v(v31, vclmul_table);
1954 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v1, v3, v31, v12, v13, v14, v15, tmp2);
1955
1956 addi(vclmul_table, vclmul_table, TABLE_STEP);
1957 vle64_v(v31, vclmul_table);
1958 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v2, v3, v31, v16, v17, v18, v19, tmp2);
1959
1960 #undef FOLD_2_VCLMUL_3
1961
1962
1963 // ======== final: move result to scalar registers ========
1964
1965 vmv_x_s(tmp1, v3);
1966 vslidedown_vi(v1, v3, 1);
1967 vmv_x_s(tmp2, v1);
1968 }
1969
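// Folding helper for the MaxVectorSize >= 32 path (two 64-bit lanes): carry-less
// multiply vx by the constants in vt, xor-reduce the low-half and high-half
// products into vy, and leave the merged 128-bit result in vy; t1 is used as a
// scratch register.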
1970 void MacroAssembler::crc32_vclmul_fold_to_16_bytes_vectorsize_32(VectorRegister vx, VectorRegister vy, VectorRegister vt,
1971 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4) {
1972 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
1973 vclmul_vv(vtmp1, vx, vt);
1974 vclmulh_vv(vtmp2, vx, vt);
1975 // low parts
1976 vredxor_vs(vtmp3, vtmp1, vy);
1977 // high parts
1978 vslidedown_vi(vtmp4, vy, 1);
1979 vredxor_vs(vtmp1, vtmp2, vtmp4);
1980 // merge low and high back
1981 vslideup_vi(vy, vtmp1, 1);
1982 vmv_x_s(t1, vtmp3);
1983 vmv_s_x(vy, t1);
1984 }
1985
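// Carry-less-multiplication CRC32 folding for MaxVectorSize >= 32: the data is
// processed as 16 64-bit elements per vector group, and masked lane operations
// (masks v1/v2 set up below) combine the low and high products in place. Each
// round of the main loop folds 128 data bytes; the register contents are then
// folded from 128 bytes down to 64 and finally to 16 bytes, which are left in
// tmp1 and tmp2 for the scalar tail in kernel_crc32_vclmul_fold.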
1986 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_32(Register crc, Register buf, Register len,
1987 Register vclmul_table, Register tmp1, Register tmp2) {
1988 assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
1989 assert(MaxVectorSize >= 32, "sanity");
1990
1991 // utility: load table
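// The macro builds the index vector 0, 8, 0, 8, ... (element index modulo 2, scaled
// by 8) and performs an indexed load, so that every even/odd pair of 64-bit lanes
// receives the low and high half of the same 16-byte table entry at rt.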
1992 #define CRC32_VCLMUL_LOAD_TABLE(vt, rt, vtmp, rtmp) \
1993 vid_v(vtmp); \
1994 mv(rtmp, 2); \
1995 vremu_vx(vtmp, vtmp, rtmp); \
1996 vsll_vi(vtmp, vtmp, 3); \
1997 vluxei64_v(vt, rt, vtmp);
1998
1999 const int TABLE_STEP = 16;
2000 const int STEP = 128; // 128 bytes per round
2001 const int N = 2 * 8; // 2: 128-bits/64-bits, 8: 8 pairs of double 64-bits
2002
2003 Register step = tmp2;
2004
2005
2006 // ======== preparation ========
2007
2008 mv(step, STEP);
2009 sub(len, len, step); // 2 rounds of folding with carry-less multiplication
2010
2011 vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2012 // load data
2013 vle64_v(v4, buf);
2014 add(buf, buf, step);
2015
2016 // load table
2017 CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2018 // load mask;
2019 // v28 should already contain: 0, 8, 0, 8, ...
2020 vmseq_vi(v2, v28, 0);
2021 // now, v2 should contain: 101010...
2022 vmnand_mm(v1, v2, v2);
2023 // now, v1 should contain: 010101...
2024
2025 // initial crc
2026 vmv_v_x(v24, zr);
2027 vsetivli(zr, 1, Assembler::e32, Assembler::m4, Assembler::mu, Assembler::tu);
2028 vmv_s_x(v24, crc);
2029 vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2030 vxor_vv(v4, v4, v24);
2031
2032 Label L_128_bytes_loop;
2033 j(L_128_bytes_loop);
2034
2035
2036 // ======== folding 128 bytes in data buffer per round ========
2037
2038 align(OptoLoopAlignment);
2039 bind(L_128_bytes_loop);
2040 {
2041 // v4: data
2042 // v4: reused below to load the next block from buf
2043 // v8: table
2044 // v12: lows
2045 // v16: highs
2046 // v20: low_slides
2047 // v24: high_slides
2048 vclmul_vv(v12, v4, v8);
2049 vclmulh_vv(v16, v4, v8);
2050 vle64_v(v4, buf);
2051 add(buf, buf, step);
2052 // lows
2053 vslidedown_vi(v20, v12, 1);
2054 vmand_mm(v0, v2, v2);
2055 vxor_vv(v12, v12, v20, v0_t);
2056 // with buf data
2057 vxor_vv(v4, v4, v12, v0_t);
2058
2059 // highs
2060 vslideup_vi(v24, v16, 1);
2061 vmand_mm(v0, v1, v1);
2062 vxor_vv(v16, v16, v24, v0_t);
2063 // with buf data
2064 vxor_vv(v4, v4, v16, v0_t);
2065 }
2066 sub(len, len, step);
2067 bge(len, step, L_128_bytes_loop);
2068
2069
2070 // ======== folding into 64 bytes from 128 bytes in register ========
2071
2072 // load table
2073 addi(vclmul_table, vclmul_table, TABLE_STEP);
2074 CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2075
2076 // v4: data, first (low) part, N/2 of 64-bits
2077 // v20: data, second (high) part, N/2 of 64-bits
2078 // v8: table
2079 // v10: lows
2080 // v12: highs
2081 // v14: low_slides
2082 // v16: high_slides
2083
2084 // high part
2085 vslidedown_vi(v20, v4, N/2);
2086
2087 vsetivli(zr, N/2, Assembler::e64, Assembler::m2, Assembler::mu, Assembler::tu);
2088
2089 vclmul_vv(v10, v4, v8);
2090 vclmulh_vv(v12, v4, v8);
2091
2092 // lows
2093 vslidedown_vi(v14, v10, 1);
2094 vmand_mm(v0, v2, v2);
2095 vxor_vv(v10, v10, v14, v0_t);
2096 // with data part 2
2097 vxor_vv(v4, v20, v10, v0_t);
2098
2099 // highs
2100 vslideup_vi(v16, v12, 1);
2101 vmand_mm(v0, v1, v1);
2102 vxor_vv(v12, v12, v16, v0_t);
2103 // with data part 2
2104 vxor_vv(v4, v20, v12, v0_t);
2105
2106
2107 // ======== folding into 16 bytes from 64 bytes in register ========
2108
2109 // v4: data, first part, 2 of 64-bits
2110 // v16: data, second part, 2 of 64-bits
2111 // v18: data, third part, 2 of 64-bits
2112 // v20: data, fourth part, 2 of 64-bits
2113 // v8: table
2114
2115 vslidedown_vi(v16, v4, 2);
2116 vslidedown_vi(v18, v4, 4);
2117 vslidedown_vi(v20, v4, 6);
2118
2119 vsetivli(zr, 2, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2120
2121 addi(vclmul_table, vclmul_table, TABLE_STEP);
2122 vle64_v(v8, vclmul_table);
2123 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v4, v20, v8, v28, v29, v30, v31);
2124
2125 addi(vclmul_table, vclmul_table, TABLE_STEP);
2126 vle64_v(v8, vclmul_table);
2127 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v16, v20, v8, v28, v29, v30, v31);
2128
2129 addi(vclmul_table, vclmul_table, TABLE_STEP);
2130 vle64_v(v8, vclmul_table);
2131 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v18, v20, v8, v28, v29, v30, v31);
2132
2133
2134 // ======== final: move result to scalar registers ========
2135
2136 vmv_x_s(tmp1, v20);
2137 vslidedown_vi(v4, v20, 1);
2138 vmv_x_s(tmp2, v4);
2139
2140 #undef CRC32_VCLMUL_LOAD_TABLE
2141 }
2142
2143 // For more details of the algorithm, please check the paper:
2144 // "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel"
2145 //
2146 // Please also refer to the corresponding code in the aarch64 and x86 ports.
2147 //
2148 // As carry-less multiplication on riscv is a bit different from the other platforms,
2149 // the implementation itself is also a bit different from theirs.
2150
2151 void MacroAssembler::kernel_crc32_vclmul_fold(Register crc, Register buf, Register len,
2152 Register table0, Register table1, Register table2, Register table3,
2153 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
2154 const int64_t single_table_size = 256;
2155 const int64_t table_num = 8; // 4 for scalar, 4 for plain vector
2156 const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2157 Register vclmul_table = tmp3;
2158
2159 la(vclmul_table, table_addr);
2160 add(vclmul_table, vclmul_table, table_num * single_table_size * sizeof(juint), tmp1);
2161 la(table0, table_addr);
2162
2163 if (MaxVectorSize == 16) {
2164 kernel_crc32_vclmul_fold_vectorsize_16(crc, buf, len, vclmul_table, tmp1, tmp2);
2165 } else {
2166 kernel_crc32_vclmul_fold_vectorsize_32(crc, buf, len, vclmul_table, tmp1, tmp2);
2167 }
2168
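// The vector folding above leaves the final 16 bytes of partially reduced data in
// tmp1 (low 8 bytes) and tmp2 (high 8 bytes); reduce them to the 32-bit CRC with
// the scalar table-driven word updates below.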
2169 mv(crc, zr);
2170 update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2171 update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2172 update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2173 update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2174 }
2175
2176 #endif // COMPILER2
2177
2178 /**
2179 * @param crc register containing existing CRC (32-bit)
2180 * @param buf register pointing to input byte buffer (byte*)
2181 * @param len register containing number of bytes
2182 * @param table register that will contain address of CRC table
2183 * @param tmp scratch registers
2184 */
2185 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2186 Register table0, Register table1, Register table2, Register table3,
2187 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) {
2188 assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2189 Label L_vector_entry,
2190 L_unroll_loop,
2191 L_by4_loop_entry, L_by4_loop,
2192 L_by1_loop, L_exit, L_skip1, L_skip2;
2193
2194 const int64_t single_table_size = 256;
2195 const int64_t unroll = 16;
2196 const int64_t unroll_words = unroll*wordSize;
2197
2198 // tmp5 = 0xffffffff
2199 notr(tmp5, zr);
2200 srli(tmp5, tmp5, 32);
2201
2202 andn(crc, tmp5, crc);
2203
2204 const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2205 la(table0, table_addr);
2206 add(table1, table0, 1 * single_table_size * sizeof(juint), tmp1);
2207 add(table2, table0, 2 * single_table_size * sizeof(juint), tmp1);
2208 add(table3, table2, 1 * single_table_size * sizeof(juint), tmp1);
2209
2210 // Ensure basic 4-byte alignment of input byte buffer
2211 mv(tmp1, 4);
2212 blt(len, tmp1, L_by1_loop);
2213 test_bit(tmp1, buf, 0);
2214 beqz(tmp1, L_skip1);
2215 subiw(len, len, 1);
2216 lbu(tmp1, Address(buf));
2217 addi(buf, buf, 1);
2218 update_byte_crc32(crc, tmp1, table0);
2219 bind(L_skip1);
2220 test_bit(tmp1, buf, 1);
2221 beqz(tmp1, L_skip2);
2222 subiw(len, len, 2);
2223 lhu(tmp1, Address(buf));
2224 addi(buf, buf, 2);
2225 zext(tmp2, tmp1, 8);
2226 update_byte_crc32(crc, tmp2, table0);
2227 srli(tmp2, tmp1, 8);
2228 update_byte_crc32(crc, tmp2, table0);
2229 bind(L_skip2);
2230
2231 #ifdef COMPILER2
2232 if (UseRVV) {
2233 const int64_t tmp_limit =
2234 UseZvbc ? 128 * 3 // 3 rounds of folding with carry-less multiplication
2235 : MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
2236 mv(tmp1, tmp_limit);
2237 bge(len, tmp1, L_vector_entry);
2238 }
2239 #endif // COMPILER2
2240
2241 mv(tmp1, unroll_words);
2242 blt(len, tmp1, L_by4_loop_entry);
2243
2244 const Register loop_buf_end = tmp3;
2245
2246 align(CodeEntryAlignment);
2247 // Entry for L_unroll_loop
2248 add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2249 andi(len, len, unroll_words - 1); // len = (len % unroll_words)
2250 sub(loop_buf_end, loop_buf_end, len);
2251 bind(L_unroll_loop);
2252 for (int i = 0; i < unroll; i++) {
2253 ld(tmp1, Address(buf, i*wordSize));
2254 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2255 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, true);
2256 }
2257
2258 addi(buf, buf, unroll_words);
2259 blt(buf, loop_buf_end, L_unroll_loop);
2260
2261 bind(L_by4_loop_entry);
2262 mv(tmp1, 4);
2263 blt(len, tmp1, L_by1_loop);
2264 add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2265 andi(len, len, 3);
2266 sub(loop_buf_end, loop_buf_end, len);
2267 bind(L_by4_loop);
2268 lwu(tmp1, Address(buf));
2269 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2270 addi(buf, buf, 4);
2271 blt(buf, loop_buf_end, L_by4_loop);
2272
2273 bind(L_by1_loop);
2274 beqz(len, L_exit);
2275
2276 subiw(len, len, 1);
2277 lbu(tmp1, Address(buf));
2278 update_byte_crc32(crc, tmp1, table0);
2279 beqz(len, L_exit);
2280
2281 subiw(len, len, 1);
2282 lbu(tmp1, Address(buf, 1));
2283 update_byte_crc32(crc, tmp1, table0);
2284 beqz(len, L_exit);
2285
2286 subiw(len, len, 1);
2287 lbu(tmp1, Address(buf, 2));
2288 update_byte_crc32(crc, tmp1, table0);
2289
2290 #ifdef COMPILER2
2291 // Put the vector code here, otherwise an "offset is too large" error occurs.
2292 if (UseRVV) {
2293 // Only need to jump to L_exit when UseRVV == true; this is the jump from the end of block `L_by1_loop`.
2294 j(L_exit);
2295
2296 bind(L_vector_entry);
2297 if (UseZvbc) { // carry-less multiplication
2298 kernel_crc32_vclmul_fold(crc, buf, len,
2299 table0, table1, table2, table3,
2300 tmp1, tmp2, tmp3, tmp4, tmp6);
2301 } else { // plain vector instructions
2302 vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
2303 }
2304
2305 bgtz(len, L_by4_loop_entry);
2306 }
2307 #endif // COMPILER2
2308
2309 bind(L_exit);
2310 andn(crc, tmp5, crc);
2311 }
2312
2313 #ifdef COMPILER2
2314 // Push vector registers in the bitset supplied.
2315 // Return the number of words pushed
2316 int MacroAssembler::push_v(unsigned int bitset, Register stack) {
2317 int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2318
2319 // Scan bitset to accumulate register pairs
2320 unsigned char regs[32];
2321 int count = bitset_to_regs(bitset, regs);
2322
2323 for (int i = 0; i < count; i++) {
2324 sub(stack, stack, vector_size_in_bytes);
2325 vs1r_v(as_VectorRegister(regs[i]), stack);
2326 }
2327
2328 return count * vector_size_in_bytes / wordSize;
2329 }
2330
2331 int MacroAssembler::pop_v(unsigned int bitset, Register stack) {
2332 int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2333
2334 // Scan bitset to accumulate register pairs
2335 unsigned char regs[32];
2336 int count = bitset_to_regs(bitset, regs);
2337
2338 for (int i = count - 1; i >= 0; i--) {
2339 vl1r_v(as_VectorRegister(regs[i]), stack);
2340 add(stack, stack, vector_size_in_bytes);
2341 }
2342
2343 return count * vector_size_in_bytes / wordSize;
2344 }
2345 #endif // COMPILER2
2346
2347 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
2348 // Push integer registers x7, x10-x17, x28-x31.
2349 push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2350
2351 // Push float registers f0-f7, f10-f17, f28-f31.
2352 subi(sp, sp, wordSize * 20);
2353 int offset = 0;
2354 for (int i = 0; i < 32; i++) {
2355 if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2356 fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2357 }
2358 }
2359 }
2360
2361 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
2362 int offset = 0;
2363 for (int i = 0; i < 32; i++) {
2364 if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2365 fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2366 }
2367 }
2368 addi(sp, sp, wordSize * 20);
2369
2370 pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2371 }
2372
2373 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
2374 // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2375 push_reg(RegSet::range(x5, x31), sp);
2376
2377 // float registers
2378 subi(sp, sp, 32 * wordSize);
2379 for (int i = 0; i < 32; i++) {
2380 fsd(as_FloatRegister(i), Address(sp, i * wordSize));
2381 }
2382
2383 // vector registers
2384 if (save_vectors) {
2385 sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
2386 vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2387 for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2388 add(t0, sp, vector_size_in_bytes * i);
2389 vse64_v(as_VectorRegister(i), t0);
2390 }
2391 }
2392 }
2393
2394 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
2395 // vector registers
2396 if (restore_vectors) {
2397 vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2398 for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2399 vle64_v(as_VectorRegister(i), sp);
2400 add(sp, sp, vector_size_in_bytes * 8);
2401 }
2402 }
2403
2404 // float registers
2405 for (int i = 0; i < 32; i++) {
2406 fld(as_FloatRegister(i), Address(sp, i * wordSize));
2407 }
2408 addi(sp, sp, 32 * wordSize);
2409
2410 // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2411 pop_reg(RegSet::range(x5, x31), sp);
2412 }
2413
2414 static int patch_offset_in_jal(address branch, int64_t offset) {
2415 assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
2416 "offset (%ld) is too large to be patched in one jal instruction!\n", offset);
2417 Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1); // offset[20] ==> branch[31]
2418 Assembler::patch(branch, 30, 21, (offset >> 1) & 0x3ff); // offset[10:1] ==> branch[30:21]
2419 Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1); // offset[11] ==> branch[20]
2420 Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff); // offset[19:12] ==> branch[19:12]
2421 return MacroAssembler::instruction_size; // only one instruction
2422 }
2423
2424 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
2425 assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
2426 "offset (%ld) is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n", offset);
2427 Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1); // offset[12] ==> branch[31]
2428 Assembler::patch(branch, 30, 25, (offset >> 5) & 0x3f); // offset[10:5] ==> branch[30:25]
2429 Assembler::patch(branch, 7, 7, (offset >> 11) & 0x1); // offset[11] ==> branch[7]
2430 Assembler::patch(branch, 11, 8, (offset >> 1) & 0xf); // offset[4:1] ==> branch[11:8]
2431 return MacroAssembler::instruction_size; // only one instruction
2432 }
2433
2434 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
2435 const int PC_RELATIVE_INSTRUCTION_NUM = 2; // auipc, addi/jalr/load
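// The following addi/jalr/load sign-extends its 12-bit immediate, so bias the
// auipc immediate by 0x800 to compensate when offset[11] is set.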
2436 Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff); // Auipc. offset[31:12] ==> branch[31:12]
2437 Assembler::patch(branch + 4, 31, 20, offset & 0xfff); // Addi/Jalr/Load. offset[11:0] ==> branch[31:20]
2438 return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
2439 }
2440
2441 static int patch_addr_in_movptr1(address branch, address target) {
2442 int32_t lower = ((intptr_t)target << 35) >> 35;
2443 int64_t upper = ((intptr_t)target - lower) >> 29;
2444 Assembler::patch(branch + 0, 31, 12, upper & 0xfffff); // Lui. target[48:29] + target[28] ==> branch[31:12]
2445 Assembler::patch(branch + 4, 31, 20, (lower >> 17) & 0xfff); // Addi. target[28:17] ==> branch[31:20]
2446 Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff); // Addi. target[16: 6] ==> branch[31:20]
2447 Assembler::patch(branch + 20, 31, 20, lower & 0x3f); // Addi/Jalr/Load. target[ 5: 0] ==> branch[31:20]
2448 return MacroAssembler::movptr1_instruction_size;
2449 }
2450
2451 static int patch_addr_in_movptr2(address instruction_address, address target) {
2452 uintptr_t addr = (uintptr_t)target;
2453
2454 assert(addr < (1ull << 48), "48-bit overflow in address constant");
2455 unsigned int upper18 = (addr >> 30ull);
2456 int lower30 = (addr & 0x3fffffffu);
2457 int low12 = (lower30 << 20) >> 20;
2458 int mid18 = ((lower30 - low12) >> 12);
2459
2460 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
2461 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18 & 0xfffff)); // Lui
2462 // Slli
2463 // Add
2464 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff); // Addi/Jalr/Load
2465
2466 assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");
2467
2468 return MacroAssembler::movptr2_instruction_size;
2469 }
2470
2471 static int patch_imm_in_li16u(address branch, uint16_t target) {
2472 Assembler::patch(branch, 31, 12, target); // patch lui only
2473 return MacroAssembler::instruction_size;
2474 }
2475
2476 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
2477 const int LI32_INSTRUCTIONS_NUM = 2; // lui + addiw
2478 int64_t upper = (intptr_t)target;
2479 int32_t lower = (((int32_t)target) << 20) >> 20;
2480 upper -= lower;
2481 upper = (int32_t)upper;
2482 Assembler::patch(branch + 0, 31, 12, (upper >> 12) & 0xfffff); // Lui.
2483 Assembler::patch(branch + 4, 31, 20, lower & 0xfff); // Addiw.
2484 return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
2485 }
2486
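// Reassemble the J-type immediate (imm[20|10:1|11|19:12], taken from instruction
// bits 31..12) and sign-extend it to 21 bits.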
2487 static long get_offset_of_jal(address insn_addr) {
2488 assert_cond(insn_addr != nullptr);
2489 long offset = 0;
2490 unsigned insn = Assembler::ld_instr(insn_addr);
2491 long val = (long)Assembler::sextract(insn, 31, 12);
2492 offset |= ((val >> 19) & 0x1) << 20;
2493 offset |= (val & 0xff) << 12;
2494 offset |= ((val >> 8) & 0x1) << 11;
2495 offset |= ((val >> 9) & 0x3ff) << 1;
2496 offset = (offset << 43) >> 43;
2497 return offset;
2498 }
2499
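// Reassemble the B-type immediate (imm[12|10:5] from bits 31..25, imm[4:1|11]
// from bits 11..7) and sign-extend it to 13 bits.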
2500 static long get_offset_of_conditional_branch(address insn_addr) {
2501 long offset = 0;
2502 assert_cond(insn_addr != nullptr);
2503 unsigned insn = Assembler::ld_instr(insn_addr);
2504 offset = (long)Assembler::sextract(insn, 31, 31);
2505 offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
2506 offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
2507 offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
2508 offset = (offset << 41) >> 41;
2509 return offset;
2510 }
2511
2512 static long get_offset_of_pc_relative(address insn_addr) {
2513 long offset = 0;
2514 assert_cond(insn_addr != nullptr);
2515 offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12; // Auipc.
2516 offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)); // Addi/Jalr/Load.
2517 offset = (offset << 32) >> 32;
2518 return offset;
2519 }
2520
2521 static address get_target_of_movptr1(address insn_addr) {
2522 assert_cond(insn_addr != nullptr);
2523 intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
2524 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17; // Addi.
2525 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6; // Addi.
2526 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20)); // Addi/Jalr/Load.
2527 return (address) target_address;
2528 }
2529
2530 static address get_target_of_movptr2(address insn_addr) {
2531 assert_cond(insn_addr != nullptr);
2532 int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 0), 31, 12)) & 0xfffff); // Lui
2533 int32_t mid18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 1), 31, 12)) & 0xfffff); // Lui
2534 // 2 // Slli
2535 // 3 // Add
2536 int32_t low12 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 4), 31, 20))); // Addi/Jalr/Load.
2537 address ret = (address)(((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12);
2538 return ret;
2539 }
2540
2541 address MacroAssembler::get_target_of_li32(address insn_addr) {
2542 assert_cond(insn_addr != nullptr);
2543 intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
2544 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)); // Addiw.
2545 return (address)target_address;
2546 }
2547
2548 // Patch any kind of instruction; there may be several instructions.
2549 // Return the total length (in bytes) of the instructions.
2550 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
2551 assert_cond(instruction_address != nullptr);
2552 int64_t offset = target - instruction_address;
2553 if (MacroAssembler::is_jal_at(instruction_address)) { // jal
2554 return patch_offset_in_jal(instruction_address, offset);
2555 } else if (MacroAssembler::is_branch_at(instruction_address)) { // beq/bge/bgeu/blt/bltu/bne
2556 return patch_offset_in_conditional_branch(instruction_address, offset);
2557 } else if (MacroAssembler::is_pc_relative_at(instruction_address)) { // auipc, addi/jalr/load
2558 return patch_offset_in_pc_relative(instruction_address, offset);
2559 } else if (MacroAssembler::is_movptr1_at(instruction_address)) { // movptr1
2560 return patch_addr_in_movptr1(instruction_address, target);
2561 } else if (MacroAssembler::is_movptr2_at(instruction_address)) { // movptr2
2562 return patch_addr_in_movptr2(instruction_address, target);
2563 } else if (MacroAssembler::is_li32_at(instruction_address)) { // li32
2564 int64_t imm = (intptr_t)target;
2565 return patch_imm_in_li32(instruction_address, (int32_t)imm);
2566 } else if (MacroAssembler::is_li16u_at(instruction_address)) {
2567 int64_t imm = (intptr_t)target;
2568 return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
2569 } else {
2570 #ifdef ASSERT
2571 tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
2572 Assembler::ld_instr(instruction_address), p2i(instruction_address));
2573 Disassembler::decode(instruction_address - 16, instruction_address + 16);
2574 #endif
2575 ShouldNotReachHere();
2576 return -1;
2577 }
2578 }
2579
2580 address MacroAssembler::target_addr_for_insn(address insn_addr) {
2581 long offset = 0;
2582 assert_cond(insn_addr != nullptr);
2583 if (MacroAssembler::is_jal_at(insn_addr)) { // jal
2584 offset = get_offset_of_jal(insn_addr);
2585 } else if (MacroAssembler::is_branch_at(insn_addr)) { // beq/bge/bgeu/blt/bltu/bne
2586 offset = get_offset_of_conditional_branch(insn_addr);
2587 } else if (MacroAssembler::is_pc_relative_at(insn_addr)) { // auipc, addi/jalr/load
2588 offset = get_offset_of_pc_relative(insn_addr);
2589 } else if (MacroAssembler::is_movptr1_at(insn_addr)) { // movptr1
2590 return get_target_of_movptr1(insn_addr);
2591 } else if (MacroAssembler::is_movptr2_at(insn_addr)) { // movptr2
2592 return get_target_of_movptr2(insn_addr);
2593 } else if (MacroAssembler::is_li32_at(insn_addr)) { // li32
2594 return get_target_of_li32(insn_addr);
2595 } else {
2596 ShouldNotReachHere();
2597 }
2598 return address(((uintptr_t)insn_addr + offset));
2599 }
2600
2601 int MacroAssembler::patch_oop(address insn_addr, address o) {
2602 // OOPs are either narrow (32 bits) or wide (48 bits). We encode
2603 // narrow OOPs by setting the upper 16 bits in the first
2604 // instruction.
2605 if (MacroAssembler::is_li32_at(insn_addr)) {
2606 // Move narrow OOP
2607 uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
2608 return patch_imm_in_li32(insn_addr, (int32_t)n);
2609 } else if (MacroAssembler::is_movptr1_at(insn_addr)) {
2610 // Move wide OOP
2611 return patch_addr_in_movptr1(insn_addr, o);
2612 } else if (MacroAssembler::is_movptr2_at(insn_addr)) {
2613 // Move wide OOP
2614 return patch_addr_in_movptr2(insn_addr, o);
2615 }
2616 ShouldNotReachHere();
2617 return -1;
2618 }
2619
2620 void MacroAssembler::reinit_heapbase() {
2621 if (UseCompressedOops) {
2622 if (Universe::is_fully_initialized()) {
2623 mv(xheapbase, CompressedOops::base());
2624 } else {
2625 ld(xheapbase, ExternalAddress(CompressedOops::base_addr()));
2626 }
2627 }
2628 }
2629
2630 void MacroAssembler::movptr(Register Rd, const Address &addr, Register temp) {
2631 assert(addr.getMode() == Address::literal, "must be applied to a literal address");
2632 relocate(addr.rspec(), [&] {
2633 movptr(Rd, addr.target(), temp);
2634 });
2635 }
2636
2637 void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
2638 int offset = 0;
2639 movptr(Rd, addr, offset, temp);
2640 addi(Rd, Rd, offset);
2641 }
2642
2643 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
2644 uint64_t uimm64 = (uint64_t)addr;
2645 #ifndef PRODUCT
2646 {
2647 char buffer[64];
2648 os::snprintf_checked(buffer, sizeof(buffer), "0x%" PRIx64, uimm64);
2649 block_comment(buffer);
2650 }
2651 #endif
2652 assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
2653
2654 if (temp == noreg) {
2655 movptr1(Rd, uimm64, offset);
2656 } else {
2657 movptr2(Rd, uimm64, offset, temp);
2658 }
2659 }
2660
2661 void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
2662 // Load upper 31 bits
2663 //
2664 // If the 11th bit of `lower` is 0, it is straightforward to understand.
2665 // If the 11th bit of `lower` is 1, it is a bit tricky. To help understand,
2666 // imagine dividing both `upper` and `lower` into 2 parts respectively, i.e.
2667 // [upper_20, upper_12] and [lower_20, lower_12]; they are the same just before
2668 // `lower = (lower << 52) >> 52;`.
2669 // After `upper -= lower;`,
2670 // upper_20' = upper_20 - (-1) == upper_20 + 1
2671 // upper_12 = 0x000
2672 // After `lui(Rd, upper);`, `Rd` = upper_20' << 12
2673 // Also divide `Rd` into 2 parts [Rd_20, Rd_12],
2674 // Rd_20 == upper_20'
2675 // Rd_12 == 0x000
2676 // After `addi(Rd, Rd, lower);`,
2677 // Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
2678 // Rd_12 = lower_12
2679 // So, finally Rd == [upper_20, lower_12]
2680 int64_t imm = imm64 >> 17;
2681 int64_t upper = imm, lower = imm;
2682 lower = (lower << 52) >> 52;
2683 upper -= lower;
2684 upper = (int32_t)upper;
2685 lui(Rd, upper);
2686 addi(Rd, Rd, lower);
2687
2688 // Load the remaining 17 bits.
2689 slli(Rd, Rd, 11);
2690 addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
2691 slli(Rd, Rd, 6);
2692
2693 // This offset will be used by following jalr/ld.
2694 offset = imm64 & 0x3f;
2695 }
2696
2697 void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
2698 assert_different_registers(Rd, tmp, noreg);
2699
2700 // addr: [upper18, lower30[mid18, lower12]]
2701
2702 int64_t upper18 = addr >> 18;
2703 lui(tmp, upper18);
2704
2705 int64_t lower30 = addr & 0x3fffffff;
2706 int64_t mid18 = lower30, lower12 = lower30;
2707 lower12 = (lower12 << 52) >> 52;
2708 // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
2709 // please refer to movptr1 above.
2710 mid18 -= (int32_t)lower12;
2711 lui(Rd, mid18);
2712
2713 slli(tmp, tmp, 18);
2714 add(Rd, Rd, tmp);
2715
2716 offset = lower12;
2717 }
2718
2719 // floating point imm move
2720 bool MacroAssembler::can_hf_imm_load(short imm) {
2721 jshort h_bits = (jshort)imm;
2722 if (h_bits == 0) {
2723 return true;
2724 }
2725 return can_zfa_zli_half_float(imm);
2726 }
2727
2728 bool MacroAssembler::can_fp_imm_load(float imm) {
2729 jint f_bits = jint_cast(imm);
2730 if (f_bits == 0) {
2731 return true;
2732 }
2733 return can_zfa_zli_float(imm);
2734 }
2735
2736 bool MacroAssembler::can_dp_imm_load(double imm) {
2737 julong d_bits = julong_cast(imm);
2738 if (d_bits == 0) {
2739 return true;
2740 }
2741 return can_zfa_zli_double(imm);
2742 }
2743
2744 void MacroAssembler::fli_h(FloatRegister Rd, short imm) {
2745 jshort h_bits = (jshort)imm;
2746 if (h_bits == 0) {
2747 fmv_h_x(Rd, zr);
2748 return;
2749 }
2750 int Rs = zfa_zli_lookup_half_float(h_bits);
2751 assert(Rs != -1, "Must be");
2752 _fli_h(Rd, Rs);
2753 }
2754
2755 void MacroAssembler::fli_s(FloatRegister Rd, float imm) {
2756 jint f_bits = jint_cast(imm);
2757 if (f_bits == 0) {
2758 fmv_w_x(Rd, zr);
2759 return;
2760 }
2761 int Rs = zfa_zli_lookup_float(f_bits);
2762 assert(Rs != -1, "Must be");
2763 _fli_s(Rd, Rs);
2764 }
2765
2766 void MacroAssembler::fli_d(FloatRegister Rd, double imm) {
2767 uint64_t d_bits = (uint64_t)julong_cast(imm);
2768 if (d_bits == 0) {
2769 fmv_d_x(Rd, zr);
2770 return;
2771 }
2772 int Rs = zfa_zli_lookup_double(d_bits);
2773 assert(Rs != -1, "Must be");
2774 _fli_d(Rd, Rs);
2775 }
2776
2777 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register tmp) {
2778 if (is_simm12(increment)) {
2779 addi(Rd, Rn, increment);
2780 } else {
2781 assert_different_registers(Rn, tmp);
2782 mv(tmp, increment);
2783 add(Rd, Rn, tmp);
2784 }
2785 }
2786
2787 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register tmp) {
2788 add(Rd, Rn, -decrement, tmp);
2789 }
2790
2791 void MacroAssembler::addw(Register Rd, Register Rn, int64_t increment, Register tmp) {
2792 if (is_simm12(increment)) {
2793 addiw(Rd, Rn, increment);
2794 } else {
2795 assert_different_registers(Rn, tmp);
2796 mv(tmp, increment);
2797 addw(Rd, Rn, tmp);
2798 }
2799 }
2800
2801 void MacroAssembler::subw(Register Rd, Register Rn, int64_t decrement, Register tmp) {
2802 addw(Rd, Rn, -decrement, tmp);
2803 }
2804
2805 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
2806 andr(Rd, Rs1, Rs2);
2807 sext(Rd, Rd, 32);
2808 }
2809
2810 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
2811 orr(Rd, Rs1, Rs2);
2812 sext(Rd, Rd, 32);
2813 }
2814
2815 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
2816 xorr(Rd, Rs1, Rs2);
2817 sext(Rd, Rd, 32);
2818 }
2819
2820 // Rd = Rs1 & (~Rs2)
2821 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
2822 if (UseZbb) {
2823 Assembler::andn(Rd, Rs1, Rs2);
2824 return;
2825 }
2826
2827 notr(Rd, Rs2);
2828 andr(Rd, Rs1, Rd);
2829 }
2830
2831 // Rd = Rs1 | (~Rs2)
2832 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
2833 if (UseZbb) {
2834 Assembler::orn(Rd, Rs1, Rs2);
2835 return;
2836 }
2837
2838 notr(Rd, Rs2);
2839 orr(Rd, Rs1, Rd);
2840 }
2841
2842 // Note: load_unsigned_short used to be called load_unsigned_word.
2843 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
2844 int off = offset();
2845 lhu(dst, src);
2846 return off;
2847 }
2848
2849 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
2850 int off = offset();
2851 lbu(dst, src);
2852 return off;
2853 }
2854
2855 int MacroAssembler::load_signed_short(Register dst, Address src) {
2856 int off = offset();
2857 lh(dst, src);
2858 return off;
2859 }
2860
2861 int MacroAssembler::load_signed_byte(Register dst, Address src) {
2862 int off = offset();
2863 lb(dst, src);
2864 return off;
2865 }
2866
2867 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
2868 switch (size_in_bytes) {
2869 case 8: ld(dst, src); break;
2870 case 4: is_signed ? lw(dst, src) : lwu(dst, src); break;
2871 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
2872 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
2873 default: ShouldNotReachHere();
2874 }
2875 }
2876
2877 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
2878 switch (size_in_bytes) {
2879 case 8: sd(src, dst); break;
2880 case 4: sw(src, dst); break;
2881 case 2: sh(src, dst); break;
2882 case 1: sb(src, dst); break;
2883 default: ShouldNotReachHere();
2884 }
2885 }
2886
2887 // granularity is 1 OR 2 bytes per load. dst and src.base() allowed to be the same register
2888 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2889 if (granularity != 1 && granularity != 2) {
2890 ShouldNotReachHere();
2891 }
2892 if (AvoidUnalignedAccesses && (granularity != 2)) {
2893 assert_different_registers(dst, tmp);
2894 assert_different_registers(tmp, src.base());
2895 is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
2896 slli(tmp, tmp, 8);
2897 lbu(dst, src);
2898 add(dst, dst, tmp);
2899 } else {
2900 is_signed ? lh(dst, src) : lhu(dst, src);
2901 }
2902 }
2903
2904 // granularity is 1, 2 OR 4 bytes per load, if granularity 2 or 4 then dst and src.base() allowed to be the same register
2905 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
2906 if (AvoidUnalignedAccesses && (granularity != 4)) {
2907 switch(granularity) {
2908 case 1:
2909 assert_different_registers(dst, tmp, src.base());
2910 lbu(dst, src);
2911 lbu(tmp, Address(src.base(), src.offset() + 1));
2912 slli(tmp, tmp, 8);
2913 add(dst, dst, tmp);
2914 lbu(tmp, Address(src.base(), src.offset() + 2));
2915 slli(tmp, tmp, 16);
2916 add(dst, dst, tmp);
2917 is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
2918 slli(tmp, tmp, 24);
2919 add(dst, dst, tmp);
2920 break;
2921 case 2:
2922 assert_different_registers(dst, tmp);
2923 assert_different_registers(tmp, src.base());
2924 is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
2925 slli(tmp, tmp, 16);
2926 lhu(dst, src);
2927 add(dst, dst, tmp);
2928 break;
2929 default:
2930 ShouldNotReachHere();
2931 }
2932 } else {
2933 is_signed ? lw(dst, src) : lwu(dst, src);
2934 }
2935 }
2936
2937 // granularity is 1, 2, 4 or 8 bytes per load, if granularity 4 or 8 then dst and src.base() allowed to be same register
2938 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
2939 if (AvoidUnalignedAccesses && (granularity != 8)) {
2940 switch(granularity){
2941 case 1:
2942 assert_different_registers(dst, tmp, src.base());
2943 lbu(dst, src);
2944 lbu(tmp, Address(src.base(), src.offset() + 1));
2945 slli(tmp, tmp, 8);
2946 add(dst, dst, tmp);
2947 lbu(tmp, Address(src.base(), src.offset() + 2));
2948 slli(tmp, tmp, 16);
2949 add(dst, dst, tmp);
2950 lbu(tmp, Address(src.base(), src.offset() + 3));
2951 slli(tmp, tmp, 24);
2952 add(dst, dst, tmp);
2953 lbu(tmp, Address(src.base(), src.offset() + 4));
2954 slli(tmp, tmp, 32);
2955 add(dst, dst, tmp);
2956 lbu(tmp, Address(src.base(), src.offset() + 5));
2957 slli(tmp, tmp, 40);
2958 add(dst, dst, tmp);
2959 lbu(tmp, Address(src.base(), src.offset() + 6));
2960 slli(tmp, tmp, 48);
2961 add(dst, dst, tmp);
2962 lbu(tmp, Address(src.base(), src.offset() + 7));
2963 slli(tmp, tmp, 56);
2964 add(dst, dst, tmp);
2965 break;
2966 case 2:
2967 assert_different_registers(dst, tmp, src.base());
2968 lhu(dst, src);
2969 lhu(tmp, Address(src.base(), src.offset() + 2));
2970 slli(tmp, tmp, 16);
2971 add(dst, dst, tmp);
2972 lhu(tmp, Address(src.base(), src.offset() + 4));
2973 slli(tmp, tmp, 32);
2974 add(dst, dst, tmp);
2975 lhu(tmp, Address(src.base(), src.offset() + 6));
2976 slli(tmp, tmp, 48);
2977 add(dst, dst, tmp);
2978 break;
2979 case 4:
2980 assert_different_registers(dst, tmp);
2981 assert_different_registers(tmp, src.base());
2982 lwu(tmp, Address(src.base(), src.offset() + 4));
2983 slli(tmp, tmp, 32);
2984 lwu(dst, src);
2985 add(dst, dst, tmp);
2986 break;
2987 default:
2988 ShouldNotReachHere();
2989 }
2990 } else {
2991 ld(dst, src);
2992 }
2993 }
2994
2995 // reverse bytes in lower word, sign-extend
2996 // Rd[31:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
2997 void MacroAssembler::revbw(Register Rd, Register Rs, Register tmp1, Register tmp2) {
2998 if (UseZbb) {
2999 rev8(Rd, Rs);
3000 srai(Rd, Rd, 32);
3001 return;
3002 }
3003 assert_different_registers(Rs, tmp1, tmp2);
3004 assert_different_registers(Rd, tmp1, tmp2);
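// Without Zbb: gather the source bytes from least- to most-significant while
// shifting the accumulator left by 8 bits each step, so they end up reversed.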
3005 zext(tmp1, Rs, 8);
3006 slli(tmp1, tmp1, 8);
3007 for (int step = 8; step < 24; step += 8) {
3008 srli(tmp2, Rs, step);
3009 zext(tmp2, tmp2, 8);
3010 orr(tmp1, tmp1, tmp2);
3011 slli(tmp1, tmp1, 8);
3012 }
3013 srli(Rd, Rs, 24);
3014 zext(Rd, Rd, 8);
3015 orr(Rd, tmp1, Rd);
3016 sext(Rd, Rd, 32);
3017 }
3018
3019 // reverse bytes in doubleword
3020 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47:40] Rs[55:48] Rs[63:56]
3021 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3022 if (UseZbb) {
3023 rev8(Rd, Rs);
3024 return;
3025 }
3026 assert_different_registers(Rs, tmp1, tmp2);
3027 assert_different_registers(Rd, tmp1, tmp2);
3028 zext(tmp1, Rs, 8);
3029 slli(tmp1, tmp1, 8);
3030 for (int step = 8; step < 56; step += 8) {
3031 srli(tmp2, Rs, step);
3032 zext(tmp2, tmp2, 8);
3033 orr(tmp1, tmp1, tmp2);
3034 slli(tmp1, tmp1, 8);
3035 }
3036 srli(Rd, Rs, 56);
3037 orr(Rd, tmp1, Rd);
3038 }
3039
3040 // rotate right with shift bits
3041 void MacroAssembler::ror(Register dst, Register src, Register shift, Register tmp)
3042 {
3043 if (UseZbb) {
3044 rorr(dst, src, shift);
3045 return;
3046 }
3047
3048 assert_different_registers(dst, tmp);
3049 assert_different_registers(src, tmp);
3050
3051 mv(tmp, 64);
3052 sub(tmp, tmp, shift);
3053 sll(tmp, src, tmp);
3054 srl(dst, src, shift);
3055 orr(dst, dst, tmp);
3056 }
3057
3058 // rotate right with shift bits
3059 void MacroAssembler::ror(Register dst, Register src, uint32_t shift, Register tmp)
3060 {
3061 if (UseZbb) {
3062 rori(dst, src, shift);
3063 return;
3064 }
3065
3066 assert_different_registers(dst, tmp);
3067 assert_different_registers(src, tmp);
3068 assert(shift < 64, "shift amount must be < 64");
3069 slli(tmp, src, 64 - shift);
3070 srli(dst, src, shift);
3071 orr(dst, dst, tmp);
3072 }
3073
3074 // rotate left with shift bits, 32-bit version
3075 void MacroAssembler::rolw(Register dst, Register src, uint32_t shift, Register tmp) {
3076 if (UseZbb) {
3077 // no roliw available
3078 roriw(dst, src, 32 - shift);
3079 return;
3080 }
3081
3082 assert_different_registers(dst, tmp);
3083 assert_different_registers(src, tmp);
3084 assert(shift < 32, "shift amount must be < 32");
3085 srliw(tmp, src, 32 - shift);
3086 slliw(dst, src, shift);
3087 orr(dst, dst, tmp);
3088 }
3089
3090 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
3091 ld(tmp1, adr);
3092 if (src.is_register()) {
3093 orr(tmp1, tmp1, src.as_register());
3094 } else {
3095 if (is_simm12(src.as_constant())) {
3096 ori(tmp1, tmp1, src.as_constant());
3097 } else {
3098 assert_different_registers(tmp1, tmp2);
3099 mv(tmp2, src.as_constant());
3100 orr(tmp1, tmp1, tmp2);
3101 }
3102 }
3103 sd(tmp1, adr);
3104 }
3105
3106 void MacroAssembler::cmp_klass_compressed(Register oop, Register trial_klass, Register tmp, Label &L, bool equal) {
3107 if (UseCompactObjectHeaders) {
3108 load_narrow_klass_compact(tmp, oop);
3109 } else if (UseCompressedClassPointers) {
3110 lwu(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3111 } else {
3112 ld(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3113 }
3114 if (equal) {
3115 beq(trial_klass, tmp, L);
3116 } else {
3117 bne(trial_klass, tmp, L);
3118 }
3119 }
3120
3121 // Move an oop into a register.
3122 void MacroAssembler::movoop(Register dst, jobject obj) {
3123 int oop_index;
3124 if (obj == nullptr) {
3125 oop_index = oop_recorder()->allocate_oop_index(obj);
3126 } else {
3127 #ifdef ASSERT
3128 {
3129 ThreadInVMfromUnknown tiv;
3130 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3131 }
3132 #endif
3133 oop_index = oop_recorder()->find_index(obj);
3134 }
3135 RelocationHolder rspec = oop_Relocation::spec(oop_index);
3136
3137 if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
3138 movptr(dst, Address((address)obj, rspec));
3139 } else {
3140 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3141 ld(dst, Address(dummy, rspec));
3142 }
3143 }
3144
3145 // Move a metadata address into a register.
3146 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3147 assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
3148 int oop_index;
3149 if (obj == nullptr) {
3150 oop_index = oop_recorder()->allocate_metadata_index(obj);
3151 } else {
3152 oop_index = oop_recorder()->find_index(obj);
3153 }
3154 RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3155 movptr(dst, Address((address)obj, rspec));
3156 }
3157
3158 // Writes successive pages to the stack until the given offset is reached, in order to
3159 // check for stack overflow plus shadow pages. This clobbers tmp.
3160 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
3161 assert_different_registers(tmp, size, t0);
3162 // Bang stack for total size given plus shadow page size.
3163 // Bang one page at a time because large size can bang beyond yellow and
3164 // red zones.
3165 mv(t0, (int)os::vm_page_size());
3166 Label loop;
3167 bind(loop);
3168 sub(tmp, sp, t0);
3169 subw(size, size, t0);
3170 sd(size, Address(tmp));
3171 bgtz(size, loop);
3172
3173 // Bang down shadow pages too.
3174 // At this point, (tmp-0) is the last address touched, so don't
3175 // touch it again. (It was touched as (tmp-pagesize) but then tmp
3176 // was post-decremented.) Skip this address by starting at i=1, and
3177 // touch a few more pages below. N.B. It is important to touch all
3178 // the way down to and including i=StackShadowPages.
3179 for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
3180 // This could be any sized move, but since it can serve as a debugging crumb,
3181 // the bigger the better.
3182 sub(tmp, tmp, (int)os::vm_page_size());
3183 sd(size, Address(tmp, 0));
3184 }
3185 }
3186
3187 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
3188 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3189 ld(dst, Address(xmethod, Method::const_offset()));
3190 ld(dst, Address(dst, ConstMethod::constants_offset()));
3191 ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
3192 ld(dst, Address(dst, mirror_offset));
3193 resolve_oop_handle(dst, tmp1, tmp2);
3194 }
3195
3196 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
3197 // OopHandle::resolve is an indirection.
3198 assert_different_registers(result, tmp1, tmp2);
3199 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
3200 }
3201
3202 // ((WeakHandle)result).resolve()
3203 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
3204 assert_different_registers(result, tmp1, tmp2);
3205 Label resolved;
3206
3207 // A null weak handle resolves to null.
3208 beqz(result, resolved);
3209
3210 // Only 64 bit platforms support GCs that require a tmp register
3211 // Only IN_HEAP loads require a thread_tmp register
3212 // WeakHandle::resolve is an indirection like jweak.
3213 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3214 result, Address(result), tmp1, tmp2);
3215 bind(resolved);
3216 }
3217
3218 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3219 Register dst, Address src,
3220 Register tmp1, Register tmp2) {
3221 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3222 decorators = AccessInternal::decorator_fixup(decorators, type);
3223 bool as_raw = (decorators & AS_RAW) != 0;
3224 if (as_raw) {
3225 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
3226 } else {
3227 bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
3228 }
3229 }
3230
3231 void MacroAssembler::null_check(Register reg, int offset) {
3232 if (needs_explicit_null_check(offset)) {
3233 // provoke OS null exception if reg is null by
3234 // accessing M[reg] w/o changing any registers
3235 // NOTE: this is plenty to provoke a segv
3236 ld(zr, Address(reg, 0));
3237 } else {
3238 // nothing to do, (later) access of M[reg + offset]
3239 // will provoke OS null exception if reg is null
3240 }
3241 }
3242
3243 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3244 Address dst, Register val,
3245 Register tmp1, Register tmp2, Register tmp3) {
3246 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3247 decorators = AccessInternal::decorator_fixup(decorators, type);
3248 bool as_raw = (decorators & AS_RAW) != 0;
3249 if (as_raw) {
3250 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3251 } else {
3252 bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3253 }
3254 }
3255
3256 // Algorithm must match CompressedOops::encode.
3257 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3258 verify_oop_msg(s, "broken oop in encode_heap_oop");
3259 if (CompressedOops::base() == nullptr) {
3260 if (CompressedOops::shift() != 0) {
3261 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3262 srli(d, s, LogMinObjAlignmentInBytes);
3263 } else {
3264 mv(d, s);
3265 }
3266 } else {
3267 Label notNull;
3268 sub(d, s, xheapbase);
3269 bgez(d, notNull);
3270 mv(d, zr);
3271 bind(notNull);
3272 if (CompressedOops::shift() != 0) {
3273 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3274 srli(d, d, CompressedOops::shift());
3275 }
3276 }
3277 }
3278
3279 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3280 #ifdef ASSERT
3281 if (CheckCompressedOops) {
3282 Label ok;
3283 bnez(r, ok);
3284 stop("null oop passed to encode_heap_oop_not_null");
3285 bind(ok);
3286 }
3287 #endif
3288 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
3289 if (CompressedOops::base() != nullptr) {
3290 sub(r, r, xheapbase);
3291 }
3292 if (CompressedOops::shift() != 0) {
3293 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3294 srli(r, r, LogMinObjAlignmentInBytes);
3295 }
3296 }
3297
3298 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3299 #ifdef ASSERT
3300 if (CheckCompressedOops) {
3301 Label ok;
3302 bnez(src, ok);
3303 stop("null oop passed to encode_heap_oop_not_null2");
3304 bind(ok);
3305 }
3306 #endif
3307 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
3308
3309 Register data = src;
3310 if (CompressedOops::base() != nullptr) {
3311 sub(dst, src, xheapbase);
3312 data = dst;
3313 }
3314 if (CompressedOops::shift() != 0) {
3315 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3316 srli(dst, data, LogMinObjAlignmentInBytes);
3317 data = dst;
3318 }
3319 if (data == src) {
3320 mv(dst, src);
3321 }
3322 }
3323
3324 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3325 assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
3326 ld(dst, Address(src, oopDesc::mark_offset_in_bytes()));
3327 srli(dst, dst, markWord::klass_shift);
3328 }
3329
3330 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
3331 assert_different_registers(dst, tmp);
3332 assert_different_registers(src, tmp);
3333 if (UseCompactObjectHeaders) {
3334 load_narrow_klass_compact(dst, src);
3335 decode_klass_not_null(dst, tmp);
3336 } else if (UseCompressedClassPointers) {
3337 lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3338 decode_klass_not_null(dst, tmp);
3339 } else {
3340 ld(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3341 }
3342 }
3343
3344 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
3345 // FIXME: Should this be a store release? Concurrent GCs assume the
3346 // klass length is valid if the klass field is not null.
3347 assert(!UseCompactObjectHeaders, "not with compact headers");
3348 if (UseCompressedClassPointers) {
3349 encode_klass_not_null(src, tmp);
3350 sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3351 } else {
3352 sd(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3353 }
3354 }
3355
3356 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3357 assert(!UseCompactObjectHeaders, "not with compact headers");
3358 if (UseCompressedClassPointers) {
3359 // Store to klass gap in destination
3360 sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3361 }
3362 }
3363
3364 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
3365 assert_different_registers(r, tmp);
3366 decode_klass_not_null(r, r, tmp);
3367 }
3368
3369 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
3370 assert(UseCompressedClassPointers, "should only be used for compressed headers");
3371 assert_different_registers(dst, tmp);
3372 assert_different_registers(src, tmp);
3373
3374 if (CompressedKlassPointers::base() == nullptr) {
3375 if (CompressedKlassPointers::shift() != 0) {
3376 slli(dst, src, CompressedKlassPointers::shift());
3377 } else {
3378 mv(dst, src);
3379 }
3380 return;
3381 }
3382
3383 Register xbase = tmp;
3384
3385 mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3386
3387 if (CompressedKlassPointers::shift() != 0) {
3388 // dst = (src << shift) + xbase
3389 shadd(dst, src, xbase, dst /* temporary, dst != xbase */, CompressedKlassPointers::shift());
3390 } else {
3391 add(dst, xbase, src);
3392 }
3393 }
3394
3395 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
3396 assert_different_registers(r, tmp);
3397 encode_klass_not_null(r, r, tmp);
3398 }
3399
3400 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
3401 assert(UseCompressedClassPointers, "should only be used for compressed headers");
3402
3403 if (CompressedKlassPointers::base() == nullptr) {
3404 if (CompressedKlassPointers::shift() != 0) {
3405 srli(dst, src, CompressedKlassPointers::shift());
3406 } else {
3407 mv(dst, src);
3408 }
3409 return;
3410 }
3411
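// If the base has no bits set in its low 32 bits and there is no shift,
// encoding reduces to taking the low 32 bits of the klass pointer.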
3412 if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
3413 CompressedKlassPointers::shift() == 0) {
3414 zext(dst, src, 32);
3415 return;
3416 }
3417
3418 Register xbase = dst;
3419 if (dst == src) {
3420 xbase = tmp;
3421 }
3422
3423 assert_different_registers(src, xbase);
3424 mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3425 sub(dst, src, xbase);
3426 if (CompressedKlassPointers::shift() != 0) {
3427 srli(dst, dst, CompressedKlassPointers::shift());
3428 }
3429 }
3430
3431 void MacroAssembler::decode_heap_oop_not_null(Register r) {
3432 decode_heap_oop_not_null(r, r);
3433 }
3434
3435 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3436 assert(UseCompressedOops, "should only be used for compressed headers");
3437 assert(Universe::heap() != nullptr, "java heap should be initialized");
3438 // Cannot assert, unverified entry point counts instructions (see .ad file)
3439 // vtableStubs also counts instructions in pd_code_size_limit.
3440 // Also do not verify_oop as this is called by verify_oop.
3441 if (CompressedOops::shift() != 0) {
3442 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3443 slli(dst, src, LogMinObjAlignmentInBytes);
3444 if (CompressedOops::base() != nullptr) {
3445 add(dst, xheapbase, dst);
3446 }
3447 } else {
3448 assert(CompressedOops::base() == nullptr, "sanity");
3449 mv(dst, src);
3450 }
3451 }
3452
3453 void MacroAssembler::decode_heap_oop(Register d, Register s) {
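// d = (base == nullptr) ? (s << shift)
//                       : (s == 0 ? 0 : base + (s << shift))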
3454 if (CompressedOops::base() == nullptr) {
3455 if (CompressedOops::shift() != 0 || d != s) {
3456 slli(d, s, CompressedOops::shift());
3457 }
3458 } else {
3459 Label done;
3460 mv(d, s);
3461 beqz(s, done);
3462 shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
3463 bind(done);
3464 }
3465 verify_oop_msg(d, "broken oop in decode_heap_oop");
3466 }
3467
3468 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
3469 Register tmp2, Register tmp3, DecoratorSet decorators) {
3470 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
3471 }
3472
3473 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
3474 Register tmp2, DecoratorSet decorators) {
3475 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
3476 }
3477
3478 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
3479 Register tmp2, DecoratorSet decorators) {
3480 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
3481 }
3482
3483 // Used for storing nulls.
3484 void MacroAssembler::store_heap_oop_null(Address dst) {
3485 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
3486 }
3487
3488 // Look up the method for a megamorphic invokeinterface call.
3489 // The target method is determined by <intf_klass, itable_index>.
3490 // The receiver klass is in recv_klass.
3491 // On success, the result will be in method_result, and execution falls through.
3492 // On failure, execution transfers to the given label.
3493 void MacroAssembler::lookup_interface_method(Register recv_klass,
3494 Register intf_klass,
3495 RegisterOrConstant itable_index,
3496 Register method_result,
3497 Register scan_tmp,
3498 Label& L_no_such_interface,
3499 bool return_method) {
3500 assert_different_registers(recv_klass, intf_klass, scan_tmp);
3501 assert_different_registers(method_result, intf_klass, scan_tmp);
3502 assert(recv_klass != method_result || !return_method,
3503 "recv_klass can be destroyed when method isn't needed");
3504 assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3505 "caller must use same register for non-constant itable index as for method");
3506
3507 // Compute start of first itableOffsetEntry (which is at the end of the vtable).
3508 int vtable_base = in_bytes(Klass::vtable_start_offset());
3509 int itentry_off = in_bytes(itableMethodEntry::method_offset());
3510 int scan_step = itableOffsetEntry::size() * wordSize;
3511 int vte_size = vtableEntry::size_in_bytes();
3512 assert(vte_size == wordSize, "else adjust times_vte_scale");
3513
3514 lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
3515
3516 // Could store the aligned, prescaled offset in the klass.
3517 shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
3518 add(scan_tmp, scan_tmp, vtable_base);
3519
3520 if (return_method) {
3521 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3522 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3523 if (itable_index.is_register()) {
3524 slli(t0, itable_index.as_register(), 3);
3525 } else {
3526 mv(t0, itable_index.as_constant() << 3);
3527 }
3528 add(recv_klass, recv_klass, t0);
3529 if (itentry_off) {
3530 add(recv_klass, recv_klass, itentry_off);
3531 }
3532 }
3533
3534 Label search, found_method;
3535
3536 ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3537 beq(intf_klass, method_result, found_method);
3538 bind(search);
3539 // Check that the previous entry is non-null. A null entry means that
3540 // the receiver class doesn't implement the interface, and wasn't the
3541 // same as when the caller was compiled.
3542 beqz(method_result, L_no_such_interface, /* is_far */ true);
3543 addi(scan_tmp, scan_tmp, scan_step);
3544 ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3545 bne(intf_klass, method_result, search);
3546
3547 bind(found_method);
3548
3549 // Got a hit.
3550 if (return_method) {
3551 lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
3552 add(method_result, recv_klass, scan_tmp);
3553 ld(method_result, Address(method_result));
3554 }
3555 }
3556
3557 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
3558 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
3559 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
3560 // The target method is determined by <holder_klass, itable_index>.
3561 // The receiver klass is in recv_klass.
3562 // On success, the result will be in method_result, and execution falls through.
3563 // On failure, execution transfers to the given label.
3564 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
3565 Register holder_klass,
3566 Register resolved_klass,
3567 Register method_result,
3568 Register temp_itbl_klass,
3569 Register scan_temp,
3570 int itable_index,
3571 Label& L_no_such_interface) {
3572 // 'method_result' is only used as output register at the very end of this method.
3573 // Until then we can reuse it as 'holder_offset'.
3574 Register holder_offset = method_result;
3575 assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
3576
3577 int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
3578 int scan_step = itableOffsetEntry::size() * wordSize;
3579 int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
3580 int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
3581 int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
3582 const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
3583
3584 Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
3585
3586 lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
3587 add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
3588 // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
3589 // + sizeof(vtableEntry) * (recv_klass->_vtable_len);
3590 // scan_temp = &(itable[0]._interface)
3591 // temp_itbl_klass = itable[0]._interface;
3592 shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
3593 ld(temp_itbl_klass, Address(scan_temp));
3594 mv(holder_offset, zr);
3595
3596 // Initial checks:
3597 // - if (holder_klass != resolved_klass), go to "scan for resolved"
3598 // - if (itable[0] == holder_klass), shortcut to "holder found"
3599 // - if (itable[0] == 0), no such interface
3600 bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
3601 beq(holder_klass, temp_itbl_klass, L_holder_found);
3602 beqz(temp_itbl_klass, L_no_such_interface);
3603
3604 // Loop: Look for holder_klass record in itable
3605 // do {
3606 // temp_itbl_klass = *(scan_temp += scan_step);
3607 // if (temp_itbl_klass == holder_klass) {
3608 // goto L_holder_found; // Found!
3609 // }
3610 // } while (temp_itbl_klass != 0);
3611 // goto L_no_such_interface // Not found.
3612 Label L_search_holder;
3613 bind(L_search_holder);
3614 add(scan_temp, scan_temp, scan_step);
3615 ld(temp_itbl_klass, Address(scan_temp));
3616 beq(holder_klass, temp_itbl_klass, L_holder_found);
3617 bnez(temp_itbl_klass, L_search_holder);
3618
3619 j(L_no_such_interface);
3620
3621 // Loop: Look for resolved_class record in itable
3622 // while (true) {
3623 // temp_itbl_klass = *(scan_temp += scan_step);
3624 // if (temp_itbl_klass == 0) {
3625 // goto L_no_such_interface;
3626 // }
3627 // if (temp_itbl_klass == resolved_klass) {
3628 // goto L_resolved_found; // Found!
3629 // }
3630 // if (temp_itbl_klass == holder_klass) {
3631 // holder_offset = scan_temp;
3632 // }
3633 // }
3634 //
3635 Label L_loop_search_resolved;
3636 bind(L_loop_search_resolved);
3637 add(scan_temp, scan_temp, scan_step);
3638 ld(temp_itbl_klass, Address(scan_temp));
3639 bind(L_loop_search_resolved_entry);
3640 beqz(temp_itbl_klass, L_no_such_interface);
3641 beq(resolved_klass, temp_itbl_klass, L_resolved_found);
3642 bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
3643 mv(holder_offset, scan_temp);
3644 j(L_loop_search_resolved);
3645
3646 // See if we already have a holder klass. If not, go and scan for it.
3647 bind(L_resolved_found);
3648 beqz(holder_offset, L_search_holder);
3649 mv(scan_temp, holder_offset);
3650
3651 // Finally, scan_temp contains holder_klass vtable offset
3652 bind(L_holder_found);
3653 lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
3654 add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
3655 - vtable_start_offset_bytes - ioffset_bytes); // subtract offsets to restore the original value of recv_klass
3656 add(method_result, recv_klass, method_result);
3657 ld(method_result, Address(method_result));
3658 }
3659
3660 // virtual method calling
3661 void MacroAssembler::lookup_virtual_method(Register recv_klass,
3662 RegisterOrConstant vtable_index,
3663 Register method_result) {
3664 const ByteSize base = Klass::vtable_start_offset();
3665 assert(vtableEntry::size() * wordSize == 8,
3666 "adjust the scaling in the code below");
3667 int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
3668
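// method_result = *(recv_klass + vtable_start_offset + vtable_index * wordSize + method_offset)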
3669 if (vtable_index.is_register()) {
3670 shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
3671 ld(method_result, Address(method_result, vtable_offset_in_bytes));
3672 } else {
3673 vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
3674 ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
3675 }
3676 }
3677
3678 void MacroAssembler::membar(uint32_t order_constraint) {
3679 if (UseZtso && ((order_constraint & StoreLoad) != StoreLoad)) {
3680 // Under TSO the only reordering the hardware performs is letting a later
3681 // load pass an earlier store (StoreLoad). Only a fence that orders
3682 // StoreLoad must therefore be emitted; all other barriers can be elided.
3683 BLOCK_COMMENT("elided tso membar");
3684 return;
3685 }
3686
3687 address prev = pc() - MacroAssembler::instruction_size;
3688 address last = code()->last_insn();
3689
3690 if (last != nullptr && is_membar(last) && prev == last) {
3691 // We are merging two memory barrier instructions. On RISCV we
3692 // can do this simply by ORing them together.
3693 set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
3694 BLOCK_COMMENT("merged membar");
3695 return;
3696 }
3697
3698 code()->set_last_insn(pc());
3699 uint32_t predecessor = 0;
3700 uint32_t successor = 0;
3701 membar_mask_to_pred_succ(order_constraint, predecessor, successor);
3702 fence(predecessor, successor);
3703 }
3704
3705 void MacroAssembler::cmodx_fence() {
3706 BLOCK_COMMENT("cmodx fence");
3707 if (VM_Version::supports_fencei_barrier()) {
3708 Assembler::fencei();
3709 }
3710 }
3711
3712 // Form an address from base + offset in Rd. Rd may or may not
3713 // actually be used: you must use the Address that is returned. It
3714 // is up to you to ensure that the shift provided matches the size
3715 // of your data.
3716 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
3717 if (is_simm12(byte_offset)) { // fits in a signed 12-bit immediate
3718 return Address(base, byte_offset);
3719 }
3720
3721 assert_different_registers(Rd, base, noreg);
3722
3723 // Do it the hard way
3724 mv(Rd, byte_offset);
3725 add(Rd, base, Rd);
3726 return Address(Rd);
3727 }
3728
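// Branches to L_success if sub_klass is a subtype of super_klass; otherwise
// falls through (L_failure is bound immediately after the slow path).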
3729 void MacroAssembler::check_klass_subtype(Register sub_klass,
3730 Register super_klass,
3731 Register tmp_reg,
3732 Label& L_success) {
3733 Label L_failure;
3734 check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
3735 check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
3736 bind(L_failure);
3737 }
3738
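// At a return site the polling word doubles as the stack watermark: take the
// slow path if sp (fp when not in an nmethod) is above it. Elsewhere just
// test whether the poll bit is armed.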
3739 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod, Register tmp_reg) {
3740 ld(tmp_reg, Address(xthread, JavaThread::polling_word_offset()));
3741 if (at_return) {
3742 bgtu(in_nmethod ? sp : fp, tmp_reg, slow_path, /* is_far */ true);
3743 } else {
3744 test_bit(tmp_reg, tmp_reg, exact_log2(SafepointMechanism::poll_bit()));
3745 bnez(tmp_reg, slow_path, /* is_far */ true);
3746 }
3747 }
3748
3749 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
3750 Label &succeed, Label *fail) {
3751 assert_different_registers(addr, tmp, t0);
3752 assert_different_registers(newv, tmp, t0);
3753 assert_different_registers(oldv, tmp, t0);
3754
3755 // oldv holds comparison value
3756 // newv holds value to write in exchange
3757 // addr identifies memory word to compare against/update
3758 if (UseZacas) {
3759 mv(tmp, oldv);
3760 atomic_cas(tmp, newv, addr, Assembler::int64, Assembler::aq, Assembler::rl);
3761 beq(tmp, oldv, succeed);
3762 } else {
3763 Label retry_load, nope;
3764 bind(retry_load);
3765 // Load reserved from the memory location
3766 load_reserved(tmp, addr, int64, Assembler::aqrl);
3767 // Fail and exit if it is not what we expect
3768 bne(tmp, oldv, nope);
3769 // If the store conditional succeeds, tmp will be zero
3770 store_conditional(tmp, newv, addr, int64, Assembler::rl);
3771 beqz(tmp, succeed);
3772 // Retry only when the store conditional failed
3773 j(retry_load);
3774
3775 bind(nope);
3776 }
3777
3778 // neither amocas nor lr/sc have an implied barrier in the failing case
3779 membar(AnyAny);
3780
3781 mv(oldv, tmp);
3782 if (fail != nullptr) {
3783 j(*fail);
3784 }
3785 }
3786
3787 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
3788 Label &succeed, Label *fail) {
3789 assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
3790 cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
3791 }
3792
3793 void MacroAssembler::load_reserved(Register dst,
3794 Register addr,
3795 Assembler::operand_size size,
3796 Assembler::Aqrl acquire) {
3797 switch (size) {
3798 case int64:
3799 lr_d(dst, addr, acquire);
3800 break;
3801 case int32:
3802 lr_w(dst, addr, acquire);
3803 break;
3804 case uint32:
3805 lr_w(dst, addr, acquire);
3806 zext(dst, dst, 32);
3807 break;
3808 default:
3809 ShouldNotReachHere();
3810 }
3811 }
3812
3813 void MacroAssembler::store_conditional(Register dst,
3814 Register new_val,
3815 Register addr,
3816 Assembler::operand_size size,
3817 Assembler::Aqrl release) {
3818 switch (size) {
3819 case int64:
3820 sc_d(dst, addr, new_val, release);
3821 break;
3822 case int32:
3823 case uint32:
3824 sc_w(dst, addr, new_val, release);
3825 break;
3826 default:
3827 ShouldNotReachHere();
3828 }
3829 }
3830
3831
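// Compute the shift, mask and 4-byte-aligned address needed to emulate a
// byte/short CAS on a 32-bit word: 'expected' and 'new_val' are pre-shifted
// into position within the aligned word and masked.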
3832 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val,
3833 Assembler::operand_size size,
3834 Register shift, Register mask, Register aligned_addr) {
3835 assert(size == int8 || size == int16, "unsupported operand size");
3836
3837 andi(shift, addr, 3);
3838 slli(shift, shift, 3);
3839
3840 andi(aligned_addr, addr, ~3);
3841
3842 if (size == int8) {
3843 mv(mask, 0xff);
3844 } else {
3845 // size == int16 case
3846 mv(mask, -1);
3847 zext(mask, mask, 16);
3848 }
3849 sll(mask, mask, shift);
3850
3851 sll(expected, expected, shift);
3852 andr(expected, expected, mask);
3853
3854 sll(new_val, new_val, shift);
3855 andr(new_val, new_val, mask);
3856 }
3857
3858 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
3859 // It's designed to implement compare-and-swap of byte/boolean/char/short via lr.w/sc.w or amocas.w,
3860 // which are forced to work on a 4-byte-aligned address.
3861 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
3862 Register new_val,
3863 Assembler::operand_size size,
3864 Assembler::Aqrl acquire, Assembler::Aqrl release,
3865 Register result, bool result_as_bool,
3866 Register tmp1, Register tmp2, Register tmp3) {
3867 assert(!(UseZacas && UseZabha), "Use amocas");
3868 assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
3869
3870 Register scratch0 = t0, aligned_addr = t1;
3871 Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
3872
3873 cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
3874
3875 Label retry, fail, done;
3876
3877 if (UseZacas) {
3878 lw(result, aligned_addr);
3879
3880 bind(retry); // amocas loads the current value into result
3881 notr(scratch1, mask);
3882
3883 andr(scratch0, result, scratch1); // scratch0 = word - cas bits
3884 orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
3885 bne(result, scratch1, fail); // cas bits differ, cas failed
3886
3887 // result is the same as expected, use as expected value.
3888
3889 // scratch0 is still = word - cas bits
3890 // Or in the new value to create complete new value.
3891 orr(scratch0, scratch0, new_val);
3892
3893 mv(scratch1, result); // save our expected value
3894 atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
3895 bne(scratch1, result, retry);
3896 } else {
3897 notr(scratch1, mask);
3898 bind(retry);
3899
3900 load_reserved(result, aligned_addr, operand_size::int32, acquire);
3901 andr(scratch0, result, mask);
3902 bne(scratch0, expected, fail);
3903
3904 andr(scratch0, result, scratch1); // scratch1 is ~mask
3905 orr(scratch0, scratch0, new_val);
3906 store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
3907 bnez(scratch0, retry);
3908 }
3909
3910 if (result_as_bool) {
3911 mv(result, 1);
3912 j(done);
3913
3914 bind(fail);
3915 mv(result, zr);
3916
3917 bind(done);
3918 } else {
3919 bind(fail);
3920
3921 andr(scratch0, result, mask);
3922 srl(result, scratch0, shift);
3923
3924 if (size == int8) {
3925 sext(result, result, 8);
3926 } else {
3927 // size == int16 case
3928 sext(result, result, 16);
3929 }
3930 }
3931 }
3932
3933 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, used to
3934 // implement the weak CAS operations. The major difference is that it simply fails
3935 // when the store conditional fails, instead of retrying.
3936 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
3937 Register new_val,
3938 Assembler::operand_size size,
3939 Assembler::Aqrl acquire, Assembler::Aqrl release,
3940 Register result,
3941 Register tmp1, Register tmp2, Register tmp3) {
3942 assert(!(UseZacas && UseZabha), "Use amocas");
3943 assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
3944
3945 Register scratch0 = t0, aligned_addr = t1;
3946 Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
3947
3948 cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
3949
3950 Label fail, done;
3951
3952 if (UseZacas) {
3953 lw(result, aligned_addr);
3954
3955 notr(scratch1, mask);
3956
3957 andr(scratch0, result, scratch1); // scratch0 = word - cas bits
3958 orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
3959 bne(result, scratch1, fail); // cas bits differ, cas failed
3960
3961 // result is the same as expected, use as expected value.
3962
3963 // scratch0 is still = word - cas bits
3964 // Or in the new value to create complete new value.
3965 orr(scratch0, scratch0, new_val);
3966
3967 mv(scratch1, result); // save our expected value
3968 atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
3969 bne(scratch1, result, fail); // This is the weak variant, so just bail out.
3970 } else {
3971 notr(scratch1, mask);
3972
3973 load_reserved(result, aligned_addr, operand_size::int32, acquire);
3974 andr(scratch0, result, mask);
3975 bne(scratch0, expected, fail);
3976
3977 andr(scratch0, result, scratch1); // scratch1 is ~mask
3978 orr(scratch0, scratch0, new_val);
3979 store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
3980 bnez(scratch0, fail);
3981 }
3982
3983 // Success
3984 mv(result, 1);
3985 j(done);
3986
3987 // Fail
3988 bind(fail);
3989 mv(result, zr);
3990
3991 bind(done);
3992 }
3993
3994 void MacroAssembler::cmpxchg(Register addr, Register expected,
3995 Register new_val,
3996 Assembler::operand_size size,
3997 Assembler::Aqrl acquire, Assembler::Aqrl release,
3998 Register result, bool result_as_bool) {
3999 assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4000 assert_different_registers(addr, t0);
4001 assert_different_registers(expected, t0);
4002 assert_different_registers(new_val, t0);
4003
4004 // NOTE:
4005 // Register _result_ may be the same register as _new_val_ or _expected_.
4006 // Hence do NOT use _result_ until after 'cas'.
4007 //
4008 // Register _expected_ may be the same register as _new_val_ and is assumed to be preserved.
4009 // Hence do NOT change _expected_ or _new_val_.
4010 //
4011 // Having _expected_ and _new_val_ be the same register makes for a very puzzling cas.
4012 //
4013 // TODO: Address these issues.
4014
4015 if (UseZacas) {
4016 if (result_as_bool) {
4017 mv(t0, expected);
4018 atomic_cas(t0, new_val, addr, size, acquire, release);
4019 xorr(t0, t0, expected);
4020 seqz(result, t0);
4021 } else {
4022 mv(t0, expected);
4023 atomic_cas(t0, new_val, addr, size, acquire, release);
4024 mv(result, t0);
4025 }
4026 return;
4027 }
4028
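// LR/SC retry loop:
// do {
//   t0 = *addr;                       // load reserved
//   if (t0 != expected) goto ne_done; // compare failed
//   t0 = sc(addr, new_val);           // 0 on success
// } while (t0 != 0);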
4029 Label retry_load, done, ne_done;
4030 bind(retry_load);
4031 load_reserved(t0, addr, size, acquire);
4032 bne(t0, expected, ne_done);
4033 store_conditional(t0, new_val, addr, size, release);
4034 bnez(t0, retry_load);
4035
4036 // equal, succeed
4037 if (result_as_bool) {
4038 mv(result, 1);
4039 } else {
4040 mv(result, expected);
4041 }
4042 j(done);
4043
4044 // not equal, failed
4045 bind(ne_done);
4046 if (result_as_bool) {
4047 mv(result, zr);
4048 } else {
4049 mv(result, t0);
4050 }
4051
4052 bind(done);
4053 }
4054
4055 void MacroAssembler::weak_cmpxchg(Register addr, Register expected,
4056 Register new_val,
4057 Assembler::operand_size size,
4058 Assembler::Aqrl acquire, Assembler::Aqrl release,
4059 Register result) {
4060 assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4061 assert_different_registers(addr, t0);
4062 assert_different_registers(expected, t0);
4063 assert_different_registers(new_val, t0);
4064
4065 if (UseZacas) {
4066 cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
4067 return;
4068 }
4069
4070 Label fail, done;
4071 load_reserved(t0, addr, size, acquire);
4072 bne(t0, expected, fail);
4073 store_conditional(t0, new_val, addr, size, release);
4074 bnez(t0, fail);
4075
4076 // Success
4077 mv(result, 1);
4078 j(done);
4079
4080 // Fail
4081 bind(fail);
4082 mv(result, zr);
4083
4084 bind(done);
4085 }
4086
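// Generate atomic_add/addw/addal/addalw: a single AMO add of 'incr' to [addr]
// that returns the previous value in 'prev' (zr when the caller passes an
// invalid register).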
4087 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE) \
4088 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
4089 prev = prev->is_valid() ? prev : zr; \
4090 if (incr.is_register()) { \
4091 AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE)); \
4092 } else { \
4093 mv(t0, incr.as_constant()); \
4094 AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE)); \
4095 } \
4096 return; \
4097 }
4098
4099 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
4100 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
4101 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
4102 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
4103
4104 #undef ATOMIC_OP
4105
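// Generate atomic_xchg/xchgw/xchgal/xchgalw: a single AMO swap that stores
// 'newv' to [addr] and returns the previous value in 'prev'.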
4106 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE) \
4107 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
4108 prev = prev->is_valid() ? prev : zr; \
4109 AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE)); \
4110 return; \
4111 }
4112
4113 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
4114 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
4115 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
4116 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
4117
4118 #undef ATOMIC_XCHG
4119
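// Unsigned 32-bit exchange variants: perform the word exchange and
// zero-extend the previous value.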
4120 #define ATOMIC_XCHGU(OP1, OP2) \
4121 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) { \
4122 atomic_##OP2(prev, newv, addr); \
4123 zext(prev, prev, 32); \
4124 return; \
4125 }
4126
4127 ATOMIC_XCHGU(xchgwu, xchgw)
4128 ATOMIC_XCHGU(xchgalwu, xchgalw)
4129
4130 #undef ATOMIC_XCHGU
4131
4132 void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr,
4133 Assembler::operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
4134 switch (size) {
4135 case int64:
4136 amocas_d(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4137 break;
4138 case int32:
4139 amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4140 break;
4141 case uint32:
4142 amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4143 zext(prev, prev, 32);
4144 break;
4145 case int16:
4146 amocas_h(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4147 break;
4148 case int8:
4149 amocas_b(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4150 break;
4151 default:
4152 ShouldNotReachHere();
4153 }
4154 }
4155
4156 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
4157 assert(CodeCache::contains(entry.target()),
4158 "destination of far jump not found in code cache");
4159 assert(entry.rspec().type() == relocInfo::external_word_type
4160 || entry.rspec().type() == relocInfo::runtime_call_type
4161 || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4162 // Fixed length: see MacroAssembler::far_branch_size()
4163 // We can use auipc + jr here because we know that the total size of
4164 // the code cache cannot exceed 2Gb.
4165 relocate(entry.rspec(), [&] {
4166 int64_t distance = entry.target() - pc();
4167 int32_t offset = ((int32_t)distance << 20) >> 20;
4168 assert(is_valid_32bit_offset(distance), "Far jump using wrong instructions.");
4169 auipc(tmp, (int32_t)distance + 0x800);
4170 jr(tmp, offset);
4171 });
4172 }
4173
4174 void MacroAssembler::far_call(const Address &entry, Register tmp) {
4175 assert(tmp != x5, "tmp register must not be x5.");
4176 assert(CodeCache::contains(entry.target()),
4177 "destination of far call not found in code cache");
4178 assert(entry.rspec().type() == relocInfo::external_word_type
4179 || entry.rspec().type() == relocInfo::runtime_call_type
4180 || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4181 // Fixed length: see MacroAssembler::far_branch_size()
4182 // We can use auipc + jalr here because we know that the total size of
4183 // the code cache cannot exceed 2Gb.
4184 relocate(entry.rspec(), [&] {
4185 int64_t distance = entry.target() - pc();
4186 int32_t offset = ((int32_t)distance << 20) >> 20;
4187 assert(is_valid_32bit_offset(distance), "Far call using wrong instructions.");
4188 auipc(tmp, (int32_t)distance + 0x800);
4189 jalr(tmp, offset);
4190 });
4191 }
4192
4193 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4194 Register super_klass,
4195 Register tmp_reg,
4196 Label* L_success,
4197 Label* L_failure,
4198 Label* L_slow_path,
4199 Register super_check_offset) {
4200 assert_different_registers(sub_klass, super_klass, tmp_reg, super_check_offset);
4201 bool must_load_sco = !super_check_offset->is_valid();
4202 if (must_load_sco) {
4203 assert(tmp_reg != noreg, "supply either a temp or a register offset");
4204 }
4205
4206 Label L_fallthrough;
4207 int label_nulls = 0;
4208 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4209 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4210 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
4211 assert(label_nulls <= 1, "at most one null in batch");
4212
4213 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4214 int sco_offset = in_bytes(Klass::super_check_offset_offset());
4215 Address super_check_offset_addr(super_klass, sco_offset);
4216
4217 // Hacked jmp, which may only be used just before L_fallthrough.
4218 #define final_jmp(label) \
4219 if (&(label) == &L_fallthrough) { /*do nothing*/ } \
4220 else j(label) /*omit semi*/
4221
4222 // If the pointers are equal, we are done (e.g., String[] elements).
4223 // This self-check enables sharing of secondary supertype arrays among
4224 // non-primary types such as array-of-interface. Otherwise, each such
4225 // type would need its own customized SSA.
4226 // We move this check to the front of the fast path because many
4227 // type checks are in fact trivially successful in this manner,
4228 // so we get a nicely predicted branch right at the start of the check.
4229 beq(sub_klass, super_klass, *L_success);
4230
4231 // Check the supertype display:
4232 if (must_load_sco) {
4233 lwu(tmp_reg, super_check_offset_addr);
4234 super_check_offset = tmp_reg;
4235 }
4236 add(t0, sub_klass, super_check_offset);
4237 Address super_check_addr(t0);
4238 ld(t0, super_check_addr); // load displayed supertype
4239 beq(super_klass, t0, *L_success);
4240
4241 // This check has worked decisively for primary supers.
4242 // Secondary supers are sought in the super_cache ('super_cache_addr').
4243 // (Secondary supers are interfaces and very deeply nested subtypes.)
4244 // This works in the same check above because of a tricky aliasing
4245 // between the super_cache and the primary super display elements.
4246 // (The 'super_check_addr' can address either, as the case requires.)
4247 // Note that the cache is updated below if it does not help us find
4248 // what we need immediately.
4249 // So if it was a primary super, we can just fail immediately.
4250 // Otherwise, it's the slow path for us (no success at this point).
4251
4252 mv(t1, sc_offset);
4253 if (L_failure == &L_fallthrough) {
4254 beq(super_check_offset, t1, *L_slow_path);
4255 } else {
4256 bne(super_check_offset, t1, *L_failure, /* is_far */ true);
4257 final_jmp(*L_slow_path);
4258 }
4259
4260 bind(L_fallthrough);
4261
4262 #undef final_jmp
4263 }
4264
4265 // Scans 'count' pointer-sized words at [addr] for an occurrence of 'value';
4266 // generic version.
4267 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
4268 Register tmp) {
4269 Label Lloop, Lexit;
4270 beqz(count, Lexit);
4271 bind(Lloop);
4272 ld(tmp, addr);
4273 beq(value, tmp, Lexit);
4274 addi(addr, addr, wordSize);
4275 subi(count, count, 1);
4276 bnez(count, Lloop);
4277 bind(Lexit);
4278 }
4279
4280 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
4281 Register super_klass,
4282 Register tmp1_reg,
4283 Register tmp2_reg,
4284 Label* L_success,
4285 Label* L_failure,
4286 bool set_cond_codes) {
4287 assert_different_registers(sub_klass, super_klass, tmp1_reg);
4288 if (tmp2_reg != noreg) {
4289 assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
4290 }
4291 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
4292
4293 Label L_fallthrough;
4294 int label_nulls = 0;
4295 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4296 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4297
4298 assert(label_nulls <= 1, "at most one null in the batch");
4299
4300 // A couple of useful fields in sub_klass:
4301 int ss_offset = in_bytes(Klass::secondary_supers_offset());
4302 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4303 Address secondary_supers_addr(sub_klass, ss_offset);
4304 Address super_cache_addr( sub_klass, sc_offset);
4305
4306 BLOCK_COMMENT("check_klass_subtype_slow_path");
4307
4308 // Do a linear scan of the secondary super-klass chain.
4309 // This code is rarely used, so simplicity is a virtue here.
4310 // The repne_scan instruction uses fixed registers, which we must spill.
4311 // Don't worry too much about pre-existing connections with the input regs.
4312
4313 assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
4314 assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
4315
4316 RegSet pushed_registers;
4317 if (!IS_A_TEMP(x12)) {
4318 pushed_registers += x12;
4319 }
4320 if (!IS_A_TEMP(x15)) {
4321 pushed_registers += x15;
4322 }
4323
4324 if (super_klass != x10) {
4325 if (!IS_A_TEMP(x10)) {
4326 pushed_registers += x10;
4327 }
4328 }
4329
4330 push_reg(pushed_registers, sp);
4331
4332 // Get super_klass value into x10 (even if it was in x15 or x12)
4333 mv(x10, super_klass);
4334
4335 #ifndef PRODUCT
4336 incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
4337 #endif // PRODUCT
4338
4339 // We will consult the secondary-super array.
4340 ld(x15, secondary_supers_addr);
4341 // Load the array length.
4342 lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
4343 // Skip to start of data.
4344 addi(x15, x15, Array<Klass*>::base_offset_in_bytes());
4345
4346 // Set t0 to an obvious invalid value, falling through by default
4347 mv(t0, -1);
4348 // Scan X12 words at [X15] for an occurrence of X10.
4349 repne_scan(x15, x10, x12, t0);
4350
4351 // pop will restore x10, so we should use a temp register to keep its value
4352 mv(t1, x10);
4353
4354 // Unspill the temp registers:
4355 pop_reg(pushed_registers, sp);
4356
4357 bne(t1, t0, *L_failure);
4358
4359 // Success. Cache the super we found and proceed in triumph.
4360 if (UseSecondarySupersCache) {
4361 sd(super_klass, super_cache_addr);
4362 }
4363
4364 if (L_success != &L_fallthrough) {
4365 j(*L_success);
4366 }
4367
4368 #undef IS_A_TEMP
4369
4370 bind(L_fallthrough);
4371 }
4372
4373 // population_count variant for running without the CPOP
4374 // instruction, which was introduced with Zbb extension.
4375 void MacroAssembler::population_count(Register dst, Register src,
4376 Register tmp1, Register tmp2) {
4377 if (UsePopCountInstruction) {
4378 cpop(dst, src);
4379 } else {
4380 assert_different_registers(src, tmp1, tmp2);
4381 assert_different_registers(dst, tmp1, tmp2);
4382 Label loop, done;
4383
4384 mv(tmp1, src);
4385 // dst = 0;
4386 // while(tmp1 != 0) {
4387 // dst++;
4388 // tmp1 &= (tmp1 - 1);
4389 // }
4390 mv(dst, zr);
4391 beqz(tmp1, done);
4392 {
4393 bind(loop);
4394 addi(dst, dst, 1);
4395 subi(tmp2, tmp1, 1);
4396 andr(tmp1, tmp1, tmp2);
4397 bnez(tmp1, loop);
4398 }
4399 bind(done);
4400 }
4401 }
4402
4403 // If Register r is invalid, take a new register from
4404 // available_regs and add it to regs_to_push.
4405 Register MacroAssembler::allocate_if_noreg(Register r,
4406 RegSetIterator<Register> &available_regs,
4407 RegSet ®s_to_push) {
4408 if (!r->is_valid()) {
4409 r = *available_regs++;
4410 regs_to_push += r;
4411 }
4412 return r;
4413 }
4414
4415 // check_klass_subtype_slow_path_table() looks for super_klass in the
4416 // hash table belonging to sub_klass, branching to L_success or
4417 // L_failure as appropriate. This is essentially a shim which
4418 // allocates registers as necessary then calls
4419 // lookup_secondary_supers_table() to do the work. Any of the tmp
4420 // regs may be noreg, in which case this logic will choose some
4421 // registers and push and pop them around the call.
4422 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
4423 Register super_klass,
4424 Register tmp1_reg,
4425 Register tmp2_reg,
4426 Label* L_success,
4427 Label* L_failure,
4428 bool set_cond_codes) {
4429 RegSet tmps = RegSet::of(tmp1_reg, tmp2_reg);
4430
4431 assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg);
4432
4433 Label L_fallthrough;
4434 int label_nulls = 0;
4435 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4436 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4437 assert(label_nulls <= 1, "at most one null in the batch");
4438
4439 BLOCK_COMMENT("check_klass_subtype_slow_path");
4440
4441 RegSet caller_save_regs = RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31);
4442 RegSetIterator<Register> available_regs = (caller_save_regs - tmps - sub_klass - super_klass).begin();
4443
4444 RegSet pushed_regs;
4445
4446 tmp1_reg = allocate_if_noreg(tmp1_reg, available_regs, pushed_regs);
4447 tmp2_reg = allocate_if_noreg(tmp2_reg, available_regs, pushed_regs);
4448
4449 Register tmp3_reg = noreg, tmp4_reg = noreg, result_reg = noreg;
4450
4451 tmp3_reg = allocate_if_noreg(tmp3_reg, available_regs, pushed_regs);
4452 tmp4_reg = allocate_if_noreg(tmp4_reg, available_regs, pushed_regs);
4453 result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
4454
4455 push_reg(pushed_regs, sp);
4456
4457 lookup_secondary_supers_table_var(sub_klass,
4458 super_klass,
4459 result_reg,
4460 tmp1_reg, tmp2_reg, tmp3_reg,
4461 tmp4_reg, nullptr);
4462
4463 // Move the result to t1 as we are about to unspill the tmp registers.
4464 mv(t1, result_reg);
4465
4466 // Unspill the tmp. registers:
4467 pop_reg(pushed_regs, sp);
4468
4469 // NB! Callers may assume that, when set_cond_codes is true, this
4470 // code sets tmp2_reg to a nonzero value.
4471 if (set_cond_codes) {
4472 mv(tmp2_reg, 1);
4473 }
4474
4475 bnez(t1, *L_failure);
4476
4477 if (L_success != &L_fallthrough) {
4478 j(*L_success);
4479 }
4480
4481 bind(L_fallthrough);
4482 }
4483
4484 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4485 Register super_klass,
4486 Register tmp1_reg,
4487 Register tmp2_reg,
4488 Label* L_success,
4489 Label* L_failure,
4490 bool set_cond_codes) {
4491 if (UseSecondarySupersTable) {
4492 check_klass_subtype_slow_path_table
4493 (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4494 } else {
4495 check_klass_subtype_slow_path_linear
4496 (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4497 }
4498 }
4499
4500 // Ensure that the inline code and the stub are using the same registers
4501 // as we need to call the stub from inline code when there is a collision
4502 // in the hashed lookup in the secondary supers array.
4503 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length, \
4504 r_array_index, r_sub_klass, result, r_bitmap) \
4505 do { \
4506 assert(r_super_klass == x10 && \
4507 r_array_base == x11 && \
4508 r_array_length == x12 && \
4509 (r_array_index == x13 || r_array_index == noreg) && \
4510 (r_sub_klass == x14 || r_sub_klass == noreg) && \
4511 (result == x15 || result == noreg) && \
4512 (r_bitmap == x16 || r_bitmap == noreg), "registers must match riscv.ad"); \
4513 } while(0)
4514
4515 bool MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
4516 Register r_super_klass,
4517 Register result,
4518 Register tmp1,
4519 Register tmp2,
4520 Register tmp3,
4521 Register tmp4,
4522 u1 super_klass_slot,
4523 bool stub_is_near) {
4524 assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
4525
4526 Label L_fallthrough;
4527
4528 BLOCK_COMMENT("lookup_secondary_supers_table {");
4529
4530 const Register
4531 r_array_base = tmp1, // x11
4532 r_array_length = tmp2, // x12
4533 r_array_index = tmp3, // x13
4534 r_bitmap = tmp4; // x16
4535
4536 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4537 r_array_index, r_sub_klass, result, r_bitmap);
4538
4539 u1 bit = super_klass_slot;
4540
4541 // Initialize result value to 1 which means mismatch.
4542 mv(result, 1);
4543
4544 ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4545
4546 // First check the bitmap to see if super_klass might be present. If
4547 // the bit is zero, we are certain that super_klass is not one of
4548 // the secondary supers.
4549 test_bit(t0, r_bitmap, bit);
4550 beqz(t0, L_fallthrough);
4551
4552 // Get the first array index that can contain super_klass into r_array_index.
4553 if (bit != 0) {
4554 slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - bit));
4555 population_count(r_array_index, r_array_index, tmp1, tmp2);
4556 } else {
4557 mv(r_array_index, (u1)1);
4558 }
4559
4560 // We will consult the secondary-super array.
4561 ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4562
4563 // The value i in r_array_index is >= 1, so even though r_array_base
4564 // points to the length, we don't need to adjust it to point to the data.
4565 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4566 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4567
4568 shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
4569 ld(result, Address(result));
4570 xorr(result, result, r_super_klass);
4571 beqz(result, L_fallthrough); // Found a match
4572
4573 // Is there another entry to check? Consult the bitmap.
4574 test_bit(t0, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
4575 beqz(t0, L_fallthrough);
4576
4577 // Linear probe.
4578 if (bit != 0) {
4579 ror(r_bitmap, r_bitmap, bit);
4580 }
4581
4582 // The slot we just inspected is at secondary_supers[r_array_index - 1].
4583 // The next slot to be inspected, by the stub we're about to call,
4584 // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
4585 // have been checked.
4586 rt_call(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
4587
4588 BLOCK_COMMENT("} lookup_secondary_supers_table");
4589
4590 bind(L_fallthrough);
4591
4592 if (VerifySecondarySupers) {
4593 verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
4594 result, tmp1, tmp2, tmp3); // x15, x11, x12, x13
4595 }
4596 return true;
4597 }
4598
4599 // At runtime, return 0 in result if r_super_klass is a superclass of
4600 // r_sub_klass, otherwise return nonzero. Use this version of
4601 // lookup_secondary_supers_table() if you don't know ahead of time
4602 // which superclass will be searched for. Used by interpreter and
4603 // runtime stubs. It is larger and has somewhat greater latency than
4604 // the version above, which takes a constant super_klass_slot.
4605 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
4606 Register r_super_klass,
4607 Register result,
4608 Register tmp1,
4609 Register tmp2,
4610 Register tmp3,
4611 Register tmp4,
4612 Label *L_success) {
4613 assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
4614
4615 Label L_fallthrough;
4616
4617 BLOCK_COMMENT("lookup_secondary_supers_table {");
4618
4619 const Register
4620 r_array_index = tmp3,
4621 r_bitmap = tmp4,
4622 slot = t1;
4623
4624 lbu(slot, Address(r_super_klass, Klass::hash_slot_offset()));
4625
4626 // Make sure that result is nonzero if the test below misses.
4627 mv(result, 1);
4628
4629 ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4630
4631 // First check the bitmap to see if super_klass might be present. If
4632 // the bit is zero, we are certain that super_klass is not one of
4633 // the secondary supers.
4634
4635 // This next instruction is equivalent to:
4636 // mv(tmp_reg, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
4637 // sub(r_array_index, slot, tmp_reg);
4638 xori(r_array_index, slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
4639 sll(r_array_index, r_bitmap, r_array_index);
4640 test_bit(t0, r_array_index, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1);
4641 beqz(t0, L_fallthrough);
4642
4643 // Get the first array index that can contain super_klass into r_array_index.
4644 population_count(r_array_index, r_array_index, tmp1, tmp2);
4645
4646 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
4647
4648 const Register
4649 r_array_base = tmp1,
4650 r_array_length = tmp2;
4651
4652 // The value i in r_array_index is >= 1, so even though r_array_base
4653 // points to the length, we don't need to adjust it to point to the data.
4654 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4655 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4656
4657 // We will consult the secondary-super array.
4658 ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4659
4660 shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
4661 ld(result, Address(result));
4662 xorr(result, result, r_super_klass);
4663 beqz(result, L_success ? *L_success : L_fallthrough); // Found a match
4664
4665 // Is there another entry to check? Consult the bitmap.
4666 ror(r_bitmap, r_bitmap, slot);
4667 test_bit(t0, r_bitmap, 1);
4668 beqz(t0, L_fallthrough);
4669
4670 // The slot we just inspected is at secondary_supers[r_array_index - 1].
4671 // The next slot to be inspected, by the logic we're about to call,
4672 // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
4673 // have been checked.
4674 lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
4675 r_bitmap, result, r_array_length, false /*is_stub*/);
4676
4677 BLOCK_COMMENT("} lookup_secondary_supers_table");
4678
4679 bind(L_fallthrough);
4680
4681 if (VerifySecondarySupers) {
4682 verify_secondary_supers_table(r_sub_klass, r_super_klass,
4683 result, tmp1, tmp2, tmp3);
4684 }
4685
4686 if (L_success) {
4687 beqz(result, *L_success);
4688 }
4689 }
4690
4691 // Called by code generated by check_klass_subtype_slow_path
4692 // above. This is called when there is a collision in the hashed
4693 // lookup in the secondary supers array.
4694 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
4695 Register r_array_base,
4696 Register r_array_index,
4697 Register r_bitmap,
4698 Register result,
4699 Register tmp,
4700 bool is_stub) {
4701 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp, result, t0);
4702
4703 const Register
4704 r_array_length = tmp,
4705 r_sub_klass = noreg; // unused
4706
4707 if (is_stub) {
4708 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4709 r_array_index, r_sub_klass, result, r_bitmap);
4710 }
4711
4712 Label L_matched, L_fallthrough, L_bitmap_full;
4713
4714 // Initialize result value to 1 which means mismatch.
4715 mv(result, 1);
4716
4717 // Load the array length.
4718 lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4719 // And adjust the array base to point to the data.
4720 // NB! Effectively increments current slot index by 1.
4721 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
4722 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
4723
4724 // Check if bitmap is SECONDARY_SUPERS_BITMAP_FULL
4725 assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
4726 subw(t0, r_array_length, Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
4727 bgtz(t0, L_bitmap_full);
4728
4729 // NB! Our caller has checked bits 0 and 1 in the bitmap. The
4730 // current slot (at secondary_supers[r_array_index]) has not yet
4731 // been inspected, and r_array_index may be out of bounds if we
4732 // wrapped around the end of the array.
4733
4734 { // This is conventional linear probing, but instead of terminating
4735 // when a null entry is found in the table, we maintain a bitmap
4736 // in which a 0 indicates missing entries.
4737 // As long as the bitmap is not completely full,
4738 // array_length == popcount(bitmap). The array_length check above
4739 // guarantees there are 0s in the bitmap, so the loop eventually
4740 // terminates.
4741 Label L_loop;
4742 bind(L_loop);
4743
4744 // Check for wraparound.
4745 Label skip;
4746 blt(r_array_index, r_array_length, skip);
4747 mv(r_array_index, zr);
4748 bind(skip);
4749
4750 shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
4751 ld(t0, Address(t0));
4752 beq(t0, r_super_klass, L_matched);
4753
4754 test_bit(t0, r_bitmap, 2); // look-ahead check (Bit 2); result is non-zero
4755 beqz(t0, L_fallthrough);
4756
4757 ror(r_bitmap, r_bitmap, 1);
4758 addi(r_array_index, r_array_index, 1);
4759 j(L_loop);
4760 }
4761
4762 { // Degenerate case: more than 64 secondary supers.
4763 // FIXME: We could do something smarter here, maybe a vectorized
4764 // comparison or a binary search, but is that worth any added
4765 // complexity?
4766 bind(L_bitmap_full);
4767 repne_scan(r_array_base, r_super_klass, r_array_length, t0);
4768 bne(r_super_klass, t0, L_fallthrough);
4769 }
4770
4771 bind(L_matched);
4772 mv(result, zr);
4773
4774 bind(L_fallthrough);
4775 }
4776
4777 // Make sure that the hashed lookup and a linear scan agree.
4778 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
4779 Register r_super_klass,
4780 Register result,
4781 Register tmp1,
4782 Register tmp2,
4783 Register tmp3) {
4784 assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0, t1);
4785
4786 const Register
4787 r_array_base = tmp1, // X11
4788 r_array_length = tmp2, // X12
4789 r_array_index = noreg, // unused
4790 r_bitmap = noreg; // unused
4791
4792 BLOCK_COMMENT("verify_secondary_supers_table {");
4793
4794 // We will consult the secondary-super array.
4795 ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4796
4797 // Load the array length.
4798 lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
4799 // And adjust the array base to point to the data.
4800 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
4801
4802 repne_scan(r_array_base, r_super_klass, r_array_length, t0);
4803 Label failed;
4804 mv(tmp3, 1);
4805 bne(r_super_klass, t0, failed);
4806 mv(tmp3, zr);
4807 bind(failed);
4808
4809 snez(result, result); // normalize result to 0/1 for comparison
4810
4811 Label passed;
4812 beq(tmp3, result, passed);
4813 {
4814 mv(x10, r_super_klass);
4815 mv(x11, r_sub_klass);
4816 mv(x12, tmp3);
4817 mv(x13, result);
4818 mv(x14, (address)("mismatch"));
4819 rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
4820 should_not_reach_here();
4821 }
4822 bind(passed);
4823
4824 BLOCK_COMMENT("} verify_secondary_supers_table");
4825 }
4826
4827 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
4828 void MacroAssembler::tlab_allocate(Register obj,
4829 Register var_size_in_bytes,
4830 int con_size_in_bytes,
4831 Register tmp1,
4832 Register tmp2,
4833 Label& slow_case,
4834 bool is_far) {
4835 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4836 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
4837 }
4838
4839 // get_thread() can be called anywhere inside generated code so we
4840 // need to save whatever non-callee save context might get clobbered
4841 // by the call to Thread::current() or, indeed, the call setup code.
4842 void MacroAssembler::get_thread(Register thread) {
4843 // save all call-clobbered regs except thread
4844 RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
4845 RegSet::range(x28, x31) + ra - thread;
4846 push_reg(saved_regs, sp);
4847
4848 mv(t1, CAST_FROM_FN_PTR(address, Thread::current));
4849 jalr(t1);
4850 if (thread != c_rarg0) {
4851 mv(thread, c_rarg0);
4852 }
4853
4854 // restore pushed registers
4855 pop_reg(saved_regs, sp);
4856 }
4857
4858 void MacroAssembler::load_byte_map_base(Register reg) {
4859 CardTable::CardValue* byte_map_base =
4860 ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4861 mv(reg, (uint64_t)byte_map_base);
4862 }
4863
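// Allocate a stack frame of 'framesize' bytes and save the caller's fp and ra
// in its two topmost slots. With PreserveFramePointer, fp is re-established
// to point at the incoming sp.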
4864 void MacroAssembler::build_frame(int framesize) {
4865 assert(framesize >= 2, "framesize must include space for FP/RA");
4866 assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4867 sub(sp, sp, framesize);
4868 sd(fp, Address(sp, framesize - 2 * wordSize));
4869 sd(ra, Address(sp, framesize - wordSize));
4870 if (PreserveFramePointer) { add(fp, sp, framesize); }
4871 }
4872
4873 void MacroAssembler::remove_frame(int framesize) {
4874 assert(framesize >= 2, "framesize must include space for FP/RA");
4875 assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
4876 ld(fp, Address(sp, framesize - 2 * wordSize));
4877 ld(ra, Address(sp, framesize - wordSize));
4878 add(sp, sp, framesize);
4879 }
4880
4881 void MacroAssembler::reserved_stack_check() {
4882 // testing if reserved zone needs to be enabled
4883 Label no_reserved_zone_enabling;
4884
4885 ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
4886 bltu(sp, t0, no_reserved_zone_enabling);
4887
4888 enter(); // RA and FP are live.
4889 mv(c_rarg0, xthread);
4890 rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
4891 leave();
4892
4893 // We have already removed our own frame.
4894 // throw_delayed_StackOverflowError will think that it's been
4895 // called by our caller.
4896 j(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
4897 should_not_reach_here();
4898
4899 bind(no_reserved_zone_enabling);
4900 }
4901
4902 // Move the address of the polling page into dest.
4903 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
4904 ld(dest, Address(xthread, JavaThread::polling_page_offset()));
4905 }
4906
4907 // Read the polling page. The address of the polling page must
4908 // already be in r.
4909 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
4910 relocate(rtype, [&] {
4911 lwu(zr, Address(r, offset));
4912 });
4913 }
4914
4915 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4916 #ifdef ASSERT
4917 {
4918 ThreadInVMfromUnknown tiv;
4919 assert (UseCompressedOops, "should only be used for compressed oops");
4920 assert (Universe::heap() != nullptr, "java heap should be initialized");
4921 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4922 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4923 }
4924 #endif
4925 int oop_index = oop_recorder()->find_index(obj);
4926 relocate(oop_Relocation::spec(oop_index), [&] {
4927 li32(dst, 0xDEADBEEF);
4928 });
4929 zext(dst, dst, 32);
4930 }
4931
4932 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4933 assert (UseCompressedClassPointers, "should only be used for compressed headers");
4934 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
4935 int index = oop_recorder()->find_index(k);
4936 assert(!Universe::heap()->is_in(k), "should not be an oop");
4937
4938 narrowKlass nk = CompressedKlassPointers::encode(k);
4939 relocate(metadata_Relocation::spec(index), [&] {
4940 li32(dst, nk);
4941 });
4942 zext(dst, dst, 32);
4943 }
4944
4945 address MacroAssembler::reloc_call(Address entry, Register tmp) {
4946 assert(entry.rspec().type() == relocInfo::runtime_call_type ||
4947 entry.rspec().type() == relocInfo::opt_virtual_call_type ||
4948 entry.rspec().type() == relocInfo::static_call_type ||
4949 entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
4950
4951 address target = entry.target();
4952
4953 if (!in_scratch_emit_size()) {
4954 address stub = emit_reloc_call_address_stub(offset(), target);
4955 if (stub == nullptr) {
4956 postcond(pc() == badAddress);
4957 return nullptr; // CodeCache is full
4958 }
4959 }
4960
4961 address call_pc = pc();
4962 #ifdef ASSERT
4963 if (entry.rspec().type() != relocInfo::runtime_call_type) {
4964 assert_alignment(call_pc);
4965 }
4966 #endif
4967
4968 // The relocation created while emitting the stub will ensure this
4969 // call instruction is subsequently patched to call the stub.
4970 relocate(entry.rspec(), [&] {
4971 auipc(tmp, 0);
4972 ld(tmp, Address(tmp, 0));
4973 jalr(tmp);
4974 });
4975
4976 postcond(pc() != badAddress);
4977 return call_pc;
4978 }
4979
4980 address MacroAssembler::ic_call(address entry, jint method_index) {
4981 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
4982 assert(!in_compressible_scope(), "Must be");
4983 movptr(t0, (address)Universe::non_oop_word(), t1);
4984 assert_cond(entry != nullptr);
4985 return reloc_call(Address(entry, rh));
4986 }
4987
4988 int MacroAssembler::ic_check_size() {
// No compressed instructions are used (ic_check runs under an
// IncompressibleScope), so the size computed here is exact.
4990 return (MacroAssembler::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
4991 far_branch_size() + (UseCompactObjectHeaders ? MacroAssembler::instruction_size * 1 : 0);
4992 }
4993
4994 int MacroAssembler::ic_check(int end_alignment) {
4995 IncompressibleScope scope(this);
4996 Register receiver = j_rarg0;
4997 Register data = t0;
4998
4999 Register tmp1 = t1; // scratch
5000 // t2 is saved on call, thus should have been saved before this check.
5001 // Hence we can clobber it.
5002 Register tmp2 = t2;
5003
5004 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
5005 // before the inline cache check, so we don't have to execute any nop instructions when dispatching
5006 // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
// before the inline cache check here, and not after it.
5008 align(end_alignment, ic_check_size());
5009 int uep_offset = offset();
5010
5011 if (UseCompactObjectHeaders) {
5012 load_narrow_klass_compact(tmp1, receiver);
5013 lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5014 } else if (UseCompressedClassPointers) {
5015 lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
5016 lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5017 } else {
5018 ld(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
5019 ld(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5020 }
5021
5022 Label ic_hit;
5023 beq(tmp1, tmp2, ic_hit);
// Note, far_jump is not fixed size.
// If this ever generates a movptr, alignment/size will be off.
5026 far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
5027 bind(ic_hit);
5028
5029 assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
5030 return uep_offset;
5031 }
5032
5033 // Emit an address stub for a call to a target which is too far away.
5034 // Note that we only put the target address of the call in the stub.
5035 //
5036 // code sequences:
5037 //
5038 // call-site:
5039 // load target address from stub
5040 // jump-and-link target address
5041 //
5042 // Related address stub for this call site in the stub section:
5043 // alignment nop
5044 // target address
5045
5046 address MacroAssembler::emit_reloc_call_address_stub(int insts_call_instruction_offset, address dest) {
5047 address stub = start_a_stub(max_reloc_call_address_stub_size());
5048 if (stub == nullptr) {
5049 return nullptr; // CodeBuffer::expand failed
5050 }
5051
5052 // We are always 4-byte aligned here.
5053 assert_alignment(pc());
5054
// Make sure the address of the destination is 8-byte aligned.
5056 align(wordSize, 0);
5057
5058 RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
5059 insts_call_instruction_offset);
5060 const int stub_start_offset = offset();
5061 relocate(rh, [&] {
5062 assert(offset() - stub_start_offset == 0,
5063 "%ld - %ld == %ld : should be", (long)offset(), (long)stub_start_offset, (long)0);
5064 assert(offset() % wordSize == 0, "bad alignment");
5065 emit_int64((int64_t)dest);
5066 });
5067
5068 const address stub_start_addr = addr_at(stub_start_offset);
5069 end_a_stub();
5070
5071 return stub_start_addr;
5072 }
5073
5074 int MacroAssembler::max_reloc_call_address_stub_size() {
5075 // Max stub size: alignment nop, target address.
5076 return 1 * MacroAssembler::instruction_size + wordSize;
5077 }
5078
5079 int MacroAssembler::static_call_stub_size() {
5080 // (lui, addi, slli, addi, slli, addi) + (lui + lui + slli + add) + jalr
5081 return 11 * MacroAssembler::instruction_size;
5082 }
5083
5084 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
5085 switch (dst.getMode()) {
5086 case Address::base_plus_offset:
5087 // This is the expected mode, although we allow all the other
5088 // forms below.
5089 return form_address(tmp, dst.base(), dst.offset());
5090 default:
5091 la(tmp, dst);
5092 return Address(tmp);
5093 }
5094 }
5095
5096 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5097 assert(((dst.getMode() == Address::base_plus_offset &&
5098 is_simm12(dst.offset())) || is_simm12(value)),
5099 "invalid value and address mode combination");
5100 Address adr = add_memory_helper(dst, tmp2);
5101 assert(!adr.uses(tmp1), "invalid dst for address increment");
5102 ld(tmp1, adr);
5103 add(tmp1, tmp1, value, tmp2);
5104 sd(tmp1, adr);
5105 }
5106
5107 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5108 assert(((dst.getMode() == Address::base_plus_offset &&
5109 is_simm12(dst.offset())) || is_simm12(value)),
5110 "invalid value and address mode combination");
5111 Address adr = add_memory_helper(dst, tmp2);
5112 assert(!adr.uses(tmp1), "invalid dst for address increment");
5113 lwu(tmp1, adr);
5114 addw(tmp1, tmp1, value, tmp2);
5115 sw(tmp1, adr);
5116 }
5117
5118 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5119 assert(((dst.getMode() == Address::base_plus_offset &&
5120 is_simm12(dst.offset())) || is_simm12(value)),
5121 "invalid value and address mode combination");
5122 Address adr = add_memory_helper(dst, tmp2);
5123 assert(!adr.uses(tmp1), "invalid dst for address decrement");
5124 ld(tmp1, adr);
5125 sub(tmp1, tmp1, value, tmp2);
5126 sd(tmp1, adr);
5127 }
5128
5129 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5130 assert(((dst.getMode() == Address::base_plus_offset &&
5131 is_simm12(dst.offset())) || is_simm12(value)),
5132 "invalid value and address mode combination");
5133 Address adr = add_memory_helper(dst, tmp2);
5134 assert(!adr.uses(tmp1), "invalid dst for address decrement");
5135 lwu(tmp1, adr);
5136 subw(tmp1, tmp1, value, tmp2);
5137 sw(tmp1, adr);
5138 }
5139
5140 void MacroAssembler::cmpptr(Register src1, const Address &src2, Label& equal, Register tmp) {
5141 assert_different_registers(src1, tmp);
5142 assert(src2.getMode() == Address::literal, "must be applied to a literal address");
5143 ld(tmp, src2);
5144 beq(src1, tmp, equal);
5145 }
5146
5147 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
5148 load_method_holder(result, method);
5149 ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
5150 }
5151
5152 void MacroAssembler::load_method_holder(Register holder, Register method) {
5153 ld(holder, Address(method, Method::const_offset())); // ConstMethod*
5154 ld(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool*
5155 ld(holder, Address(holder, ConstantPool::pool_holder_offset())); // InstanceKlass*
5156 }
5157
5158 // string indexof
5159 // compute index by trailing zeros
5160 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
5161 Register match_mask, Register result,
5162 Register ch2, Register tmp,
5163 bool haystack_isL) {
5164 int haystack_chr_shift = haystack_isL ? 0 : 1;
5165 srl(match_mask, match_mask, trailing_zeros);
5166 srli(match_mask, match_mask, 1);
5167 srli(tmp, trailing_zeros, LogBitsPerByte);
5168 if (!haystack_isL) andi(tmp, tmp, 0xE);
5169 add(haystack, haystack, tmp);
5170 ld(ch2, Address(haystack));
5171 if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
5172 add(result, result, tmp);
5173 }
5174
5175 // string indexof
// Find the pattern element in src and compute the match mask;
// only the first (lowest) occurrence of 0x80/0x8000 is the valid match index.
5178 // match mask patterns and corresponding indices would be like:
5179 // - 0x8080808080808080 (Latin1)
5180 // - 7 6 5 4 3 2 1 0 (match index)
5181 // - 0x8000800080008000 (UTF16)
5182 // - 3 2 1 0 (match index)
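//
// Illustrative Latin1 example (a sketch, not emitted code), assuming the caller
// passes mask1 = 0x0101010101010101 and mask2 = 0x7f7f7f7f7f7f7f7f:
//   after xorr:  src = 0x1112131415001718   (the matching byte became 0x00)
//   match_mask = (src - mask1) & ~(src | mask2) = 0x0000000000800000
// i.e. 0x80 marks the lowest matching byte, which ctz can then locate.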
5183 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
5184 Register mask1, Register mask2) {
5185 xorr(src, pattern, src);
5186 sub(match_mask, src, mask1);
5187 orr(src, src, mask2);
5188 notr(src, src);
5189 andr(match_mask, match_mask, src);
5190 }
5191
5192 #ifdef COMPILER2
5193 // Code for BigInteger::mulAdd intrinsic
5194 // out = x10
5195 // in = x11
5196 // offset = x12 (already out.length-offset)
5197 // len = x13
5198 // k = x14
5199 // tmp = x28
5200 //
5201 // pseudo code from java implementation:
5202 // long kLong = k & LONG_MASK;
5203 // carry = 0;
5204 // offset = out.length-offset - 1;
5205 // for (int j = len - 1; j >= 0; j--) {
5206 // product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
5207 // out[offset--] = (int)product;
5208 // carry = product >>> 32;
5209 // }
5210 // return (int)carry;
5211 void MacroAssembler::mul_add(Register out, Register in, Register offset,
5212 Register len, Register k, Register tmp) {
5213 Label L_tail_loop, L_unroll, L_end;
5214 mv(tmp, out);
5215 mv(out, zr);
5216 blez(len, L_end);
5217 zext(k, k, 32);
5218 slliw(t0, offset, LogBytesPerInt);
5219 add(offset, tmp, t0);
5220 slliw(t0, len, LogBytesPerInt);
5221 add(in, in, t0);
5222
5223 const int unroll = 8;
5224 mv(tmp, unroll);
5225 blt(len, tmp, L_tail_loop);
5226 bind(L_unroll);
5227 for (int i = 0; i < unroll; i++) {
5228 subi(in, in, BytesPerInt);
5229 lwu(t0, Address(in, 0));
5230 mul(t1, t0, k);
5231 add(t0, t1, out);
5232 subi(offset, offset, BytesPerInt);
5233 lwu(t1, Address(offset, 0));
5234 add(t0, t0, t1);
5235 sw(t0, Address(offset, 0));
5236 srli(out, t0, 32);
5237 }
5238 subw(len, len, tmp);
5239 bge(len, tmp, L_unroll);
5240
5241 bind(L_tail_loop);
5242 blez(len, L_end);
5243 subi(in, in, BytesPerInt);
5244 lwu(t0, Address(in, 0));
5245 mul(t1, t0, k);
5246 add(t0, t1, out);
5247 subi(offset, offset, BytesPerInt);
5248 lwu(t1, Address(offset, 0));
5249 add(t0, t0, t1);
5250 sw(t0, Address(offset, 0));
5251 srli(out, t0, 32);
5252 subiw(len, len, 1);
5253 j(L_tail_loop);
5254
5255 bind(L_end);
5256 }
5257
5258 // Multiply and multiply-accumulate unsigned 64-bit registers.
5259 void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
5260 assert_different_registers(prod_lo, prod_hi);
5261
5262 mul(prod_lo, n, m);
5263 mulhu(prod_hi, n, m);
5264 }
5265
5266 void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
5267 Register m, Register tmp1, Register tmp2) {
5268 assert_different_registers(sum_lo, sum_hi);
5269 assert_different_registers(sum_hi, tmp2);
5270
5271 wide_mul(tmp1, tmp2, n, m);
5272 cad(sum_lo, sum_lo, tmp1, tmp1); // Add tmp1 to sum_lo with carry output to tmp1
5273 adc(sum_hi, sum_hi, tmp2, tmp1); // Add tmp2 with carry to sum_hi
5274 }
5275
// Add two unsigned inputs and output the carry
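// (Illustrative: src1 = 0xffffffffffffffff, src2 = 1 gives dst = 0 and
//  carry = 1, since the unsigned sum wrapped and thus dst < src2.)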
5277 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
5278 {
5279 assert_different_registers(dst, carry);
5280 assert_different_registers(dst, src2);
5281 add(dst, src1, src2);
5282 sltu(carry, dst, src2);
5283 }
5284
// Add two inputs plus the incoming carry
5286 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
5287 assert_different_registers(dst, carry);
5288 add(dst, src1, src2);
5289 add(dst, dst, carry);
5290 }
5291
// Add two unsigned inputs plus the incoming carry, and output the new carry
5293 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
5294 assert_different_registers(dst, src2);
5295 adc(dst, src1, src2, carry);
5296 sltu(carry, dst, src2);
5297 }
5298
5299 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
5300 Register src1, Register src2, Register carry) {
5301 cad(dest_lo, dest_lo, src1, carry);
5302 add(dest_hi, dest_hi, carry);
5303 cad(dest_lo, dest_lo, src2, carry);
5304 add(final_dest_hi, dest_hi, carry);
5305 }
5306
5307 /**
5308 * Multiply 64 bit by 64 bit first loop.
5309 */
5310 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5311 Register y, Register y_idx, Register z,
5312 Register carry, Register product,
5313 Register idx, Register kdx) {
5314 //
5315 // jlong carry, x[], y[], z[];
5316 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5317 // huge_128 product = y[idx] * x[xstart] + carry;
5318 // z[kdx] = (jlong)product;
5319 // carry = (jlong)(product >>> 64);
5320 // }
5321 // z[xstart] = carry;
5322 //
5323
5324 Label L_first_loop, L_first_loop_exit;
5325 Label L_one_x, L_one_y, L_multiply;
5326
5327 subiw(xstart, xstart, 1);
5328 bltz(xstart, L_one_x);
5329
5330 shadd(t0, xstart, x, t0, LogBytesPerInt);
5331 ld(x_xstart, Address(t0, 0));
5332 ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
5333
5334 bind(L_first_loop);
5335 subiw(idx, idx, 1);
5336 bltz(idx, L_first_loop_exit);
5337 subiw(idx, idx, 1);
5338 bltz(idx, L_one_y);
5339
5340 shadd(t0, idx, y, t0, LogBytesPerInt);
5341 ld(y_idx, Address(t0, 0));
5342 ror(y_idx, y_idx, 32); // convert big-endian to little-endian
5343 bind(L_multiply);
5344
5345 mulhu(t0, x_xstart, y_idx);
5346 mul(product, x_xstart, y_idx);
5347 cad(product, product, carry, t1);
5348 adc(carry, t0, zr, t1);
5349
5350 subiw(kdx, kdx, 2);
5351 ror(product, product, 32); // back to big-endian
5352 shadd(t0, kdx, z, t0, LogBytesPerInt);
5353 sd(product, Address(t0, 0));
5354
5355 j(L_first_loop);
5356
5357 bind(L_one_y);
5358 lwu(y_idx, Address(y, 0));
5359 j(L_multiply);
5360
5361 bind(L_one_x);
5362 lwu(x_xstart, Address(x, 0));
5363 j(L_first_loop);
5364
5365 bind(L_first_loop_exit);
5366 }
5367
5368 /**
5369 * Multiply 128 bit by 128 bit. Unrolled inner loop.
5370 *
5371 */
5372 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
5373 Register carry, Register carry2,
5374 Register idx, Register jdx,
5375 Register yz_idx1, Register yz_idx2,
5376 Register tmp, Register tmp3, Register tmp4,
5377 Register tmp6, Register product_hi) {
5378 // jlong carry, x[], y[], z[];
5379 // int kdx = xstart+1;
5380 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5381 // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
5382 // jlong carry2 = (jlong)(tmp3 >>> 64);
5383 // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2;
5384 // carry = (jlong)(tmp4 >>> 64);
5385 // z[kdx+idx+1] = (jlong)tmp3;
5386 // z[kdx+idx] = (jlong)tmp4;
5387 // }
5388 // idx += 2;
5389 // if (idx > 0) {
5390 // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
5391 // z[kdx+idx] = (jlong)yz_idx1;
5392 // carry = (jlong)(yz_idx1 >>> 64);
5393 // }
5394 //
5395
5396 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5397
5398 srliw(jdx, idx, 2);
5399
5400 bind(L_third_loop);
5401
5402 subw(jdx, jdx, 1);
5403 bltz(jdx, L_third_loop_exit);
5404 subw(idx, idx, 4);
5405
5406 shadd(t0, idx, y, t0, LogBytesPerInt);
5407 ld(yz_idx2, Address(t0, 0));
5408 ld(yz_idx1, Address(t0, wordSize));
5409
5410 shadd(tmp6, idx, z, t0, LogBytesPerInt);
5411
5412 ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
5413 ror(yz_idx2, yz_idx2, 32);
5414
5415 ld(t1, Address(tmp6, 0));
5416 ld(t0, Address(tmp6, wordSize));
5417
5418 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
5419 mulhu(tmp4, product_hi, yz_idx1);
5420
5421 ror(t0, t0, 32, tmp); // convert big-endian to little-endian
5422 ror(t1, t1, 32, tmp);
5423
5424 mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp
5425 mulhu(carry2, product_hi, yz_idx2);
5426
5427 cad(tmp3, tmp3, carry, carry);
5428 adc(tmp4, tmp4, zr, carry);
5429 cad(tmp3, tmp3, t0, t0);
5430 cadc(tmp4, tmp4, tmp, t0);
5431 adc(carry, carry2, zr, t0);
5432 cad(tmp4, tmp4, t1, carry2);
5433 adc(carry, carry, zr, carry2);
5434
5435 ror(tmp3, tmp3, 32); // convert little-endian to big-endian
5436 ror(tmp4, tmp4, 32);
5437 sd(tmp4, Address(tmp6, 0));
5438 sd(tmp3, Address(tmp6, wordSize));
5439
5440 j(L_third_loop);
5441
5442 bind(L_third_loop_exit);
5443
5444 andi(idx, idx, 0x3);
5445 beqz(idx, L_post_third_loop_done);
5446
5447 Label L_check_1;
5448 subiw(idx, idx, 2);
5449 bltz(idx, L_check_1);
5450
5451 shadd(t0, idx, y, t0, LogBytesPerInt);
5452 ld(yz_idx1, Address(t0, 0));
5453 ror(yz_idx1, yz_idx1, 32);
5454
5455 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
5456 mulhu(tmp4, product_hi, yz_idx1);
5457
5458 shadd(t0, idx, z, t0, LogBytesPerInt);
5459 ld(yz_idx2, Address(t0, 0));
5460 ror(yz_idx2, yz_idx2, 32, tmp);
5461
5462 add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
5463
5464 ror(tmp3, tmp3, 32, tmp);
5465 sd(tmp3, Address(t0, 0));
5466
5467 bind(L_check_1);
5468
5469 andi(idx, idx, 0x1);
5470 subiw(idx, idx, 1);
5471 bltz(idx, L_post_third_loop_done);
5472 shadd(t0, idx, y, t0, LogBytesPerInt);
5473 lwu(tmp4, Address(t0, 0));
5474 mul(tmp3, tmp4, product_hi); // tmp4 * product_hi -> carry2:tmp3
5475 mulhu(carry2, tmp4, product_hi);
5476
5477 shadd(t0, idx, z, t0, LogBytesPerInt);
5478 lwu(tmp4, Address(t0, 0));
5479
5480 add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
5481
5482 shadd(t0, idx, z, t0, LogBytesPerInt);
5483 sw(tmp3, Address(t0, 0));
5484
5485 slli(t0, carry2, 32);
5486 srli(carry, tmp3, 32);
5487 orr(carry, carry, t0);
5488
5489 bind(L_post_third_loop_done);
5490 }
5491
5492 /**
5493 * Code for BigInteger::multiplyToLen() intrinsic.
5494 *
5495 * x10: x
5496 * x11: xlen
5497 * x12: y
5498 * x13: ylen
5499 * x14: z
5500 * x15: tmp0
5501 * x16: tmp1
5502 * x17: tmp2
5503 * x7: tmp3
5504 * x28: tmp4
5505 * x29: tmp5
5506 * x30: tmp6
5507 * x31: tmp7
5508 */
5509 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
5510 Register z, Register tmp0,
5511 Register tmp1, Register tmp2, Register tmp3, Register tmp4,
5512 Register tmp5, Register tmp6, Register product_hi) {
5513 assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5514
5515 const Register idx = tmp1;
5516 const Register kdx = tmp2;
5517 const Register xstart = tmp3;
5518
5519 const Register y_idx = tmp4;
5520 const Register carry = tmp5;
5521 const Register product = xlen;
5522 const Register x_xstart = tmp0;
5523 const Register jdx = tmp1;
5524
5525 mv(idx, ylen); // idx = ylen;
5526 addw(kdx, xlen, ylen); // kdx = xlen+ylen;
5527 mv(carry, zr); // carry = 0;
5528
5529 Label L_done;
5530 subiw(xstart, xlen, 1);
5531 bltz(xstart, L_done);
5532
5533 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
5534
5535 Label L_second_loop_aligned;
5536 beqz(kdx, L_second_loop_aligned);
5537
5538 Label L_carry;
5539 subiw(kdx, kdx, 1);
5540 beqz(kdx, L_carry);
5541
5542 shadd(t0, kdx, z, t0, LogBytesPerInt);
5543 sw(carry, Address(t0, 0));
5544 srli(carry, carry, 32);
5545 subiw(kdx, kdx, 1);
5546
5547 bind(L_carry);
5548 shadd(t0, kdx, z, t0, LogBytesPerInt);
5549 sw(carry, Address(t0, 0));
5550
5551 // Second and third (nested) loops.
5552 //
5553 // for (int i = xstart-1; i >= 0; i--) { // Second loop
5554 // carry = 0;
5555 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5556 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5557 // (z[k] & LONG_MASK) + carry;
5558 // z[k] = (int)product;
5559 // carry = product >>> 32;
5560 // }
5561 // z[i] = (int)carry;
5562 // }
5563 //
5564 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
5565
5566 bind(L_second_loop_aligned);
5567 mv(carry, zr); // carry = 0;
5568 mv(jdx, ylen); // j = ystart+1
5569
5570 subiw(xstart, xstart, 1); // i = xstart-1;
5571 bltz(xstart, L_done);
5572
5573 subi(sp, sp, 4 * wordSize);
5574 sd(z, Address(sp, 0));
5575
5576 Label L_last_x;
5577 shadd(t0, xstart, z, t0, LogBytesPerInt);
5578 addi(z, t0, 4);
5579 subiw(xstart, xstart, 1); // i = xstart-1;
5580 bltz(xstart, L_last_x);
5581
5582 shadd(t0, xstart, x, t0, LogBytesPerInt);
5583 ld(product_hi, Address(t0, 0));
5584 ror(product_hi, product_hi, 32); // convert big-endian to little-endian
5585
5586 Label L_third_loop_prologue;
5587 bind(L_third_loop_prologue);
5588
5589 sd(ylen, Address(sp, wordSize));
5590 sd(x, Address(sp, 2 * wordSize));
5591 sd(xstart, Address(sp, 3 * wordSize));
5592 multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
5593 tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
5594 ld(z, Address(sp, 0));
5595 ld(ylen, Address(sp, wordSize));
5596 ld(x, Address(sp, 2 * wordSize));
5597 ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
5598 addi(sp, sp, 4 * wordSize);
5599
5600 addiw(tmp3, xlen, 1);
5601 shadd(t0, tmp3, z, t0, LogBytesPerInt);
5602 sw(carry, Address(t0, 0));
5603
5604 subiw(tmp3, tmp3, 1);
5605 bltz(tmp3, L_done);
5606
5607 srli(carry, carry, 32);
5608 shadd(t0, tmp3, z, t0, LogBytesPerInt);
5609 sw(carry, Address(t0, 0));
5610 j(L_second_loop_aligned);
5611
5612 // Next infrequent code is moved outside loops.
5613 bind(L_last_x);
5614 lwu(product_hi, Address(x, 0));
5615 j(L_third_loop_prologue);
5616
5617 bind(L_done);
5618 }
5619 #endif
5620
// Count the bits of trailing zero chars from lsb to msb until the first
// non-zero char is seen. For the LL case, step by 8 bits since each char
// occupies one byte; for the other cases, step by 16 bits.
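// Illustrative example (a sketch, not emitted code): for the LL case with
// Rs = 0x0000000000410000 the two lowest bytes are zero, so Rd = 16; with Zbb
// this is just ctz(Rs) = 16 rounded down to a multiple of 8.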
5624 void MacroAssembler::ctzc_bits(Register Rd, Register Rs, bool isLL,
5625 Register tmp1, Register tmp2) {
5626 int step = isLL ? 8 : 16;
5627 if (UseZbb) {
5628 ctz(Rd, Rs);
5629 andi(Rd, Rd, -step);
5630 return;
5631 }
5632
5633 assert_different_registers(Rd, tmp1, tmp2);
5634 Label Loop;
5635 mv(tmp2, Rs);
5636 mv(Rd, -step);
5637
5638 bind(Loop);
5639 addi(Rd, Rd, step);
5640 zext(tmp1, tmp2, step);
5641 srli(tmp2, tmp2, step);
5642 beqz(tmp1, Loop);
5643 }
5644
// This method reads the 4 adjacent bytes from the lower half of the source
// register and inflates them into the destination register, for example:
5647 // Rs: A7A6A5A4A3A2A1A0
5648 // Rd: 00A300A200A100A0
5649 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5650 assert_different_registers(Rd, Rs, tmp1, tmp2);
5651
5652 mv(tmp1, 0xFF000000); // first byte mask at lower word
5653 andr(Rd, Rs, tmp1);
5654 for (int i = 0; i < 2; i++) {
5655 slli(Rd, Rd, wordSize);
5656 srli(tmp1, tmp1, wordSize);
5657 andr(tmp2, Rs, tmp1);
5658 orr(Rd, Rd, tmp2);
5659 }
5660 slli(Rd, Rd, wordSize);
5661 zext(tmp2, Rs, 8); // last byte mask at lower word
5662 orr(Rd, Rd, tmp2);
5663 }
5664
// This method reads the 4 adjacent bytes from the upper half of the source
// register and inflates them into the destination register, for example:
5667 // Rs: A7A6A5A4A3A2A1A0
5668 // Rd: 00A700A600A500A4
5669 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
5670 assert_different_registers(Rd, Rs, tmp1, tmp2);
5671 srli(Rs, Rs, 32); // only upper 32 bits are needed
5672 inflate_lo32(Rd, Rs, tmp1, tmp2);
5673 }
5674
5675 // The size of the blocks erased by the zero_blocks stub. We must
5676 // handle anything smaller than this ourselves in zero_words().
5677 const int MacroAssembler::zero_words_block_size = 8;
5678
5679 // zero_words() is used by C2 ClearArray patterns. It is as small as
5680 // possible, handling small word counts locally and delegating
5681 // anything larger to the zero_blocks stub. It is expanded many times
5682 // in compiled code, so it is important to keep it short.
5683
5684 // ptr: Address of a buffer to be zeroed.
5685 // cnt: Count in HeapWords.
5686 //
5687 // ptr, cnt, t1, and t0 are clobbered.
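//
// Illustrative tail handling (a sketch, not a trace of emitted code): if
// cnt = 5 on entry, the stub call is skipped (5 < zero_words_block_size); the
// bit-test ladder then stores 4 words (bit 2 of cnt is set), skips the 2-word
// case (bit 1 is clear) and stores one final word (bit 0 is set), zeroing all
// 5 words.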
5688 address MacroAssembler::zero_words(Register ptr, Register cnt) {
5689 assert(is_power_of_2(zero_words_block_size), "adjust this");
5690 assert(ptr == x28 && cnt == x29, "mismatch in register usage");
5691 assert_different_registers(cnt, t0, t1);
5692
5693 BLOCK_COMMENT("zero_words {");
5694
5695 mv(t0, zero_words_block_size);
5696 Label around, done, done16;
5697 bltu(cnt, t0, around);
5698 {
5699 RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
5700 assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
5701 if (StubRoutines::riscv::complete()) {
5702 address tpc = reloc_call(zero_blocks);
5703 if (tpc == nullptr) {
5704 DEBUG_ONLY(reset_labels(around));
5705 postcond(pc() == badAddress);
5706 return nullptr;
5707 }
5708 } else {
5709 // Clobbers t1
5710 rt_call(zero_blocks.target());
5711 }
5712 }
5713 bind(around);
5714 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5715 Label l;
5716 test_bit(t0, cnt, exact_log2(i));
5717 beqz(t0, l);
5718 for (int j = 0; j < i; j++) {
5719 sd(zr, Address(ptr, j * wordSize));
5720 }
5721 addi(ptr, ptr, i * wordSize);
5722 bind(l);
5723 }
5724 {
5725 Label l;
5726 test_bit(t0, cnt, 0);
5727 beqz(t0, l);
5728 sd(zr, Address(ptr, 0));
5729 bind(l);
5730 }
5731
5732 BLOCK_COMMENT("} zero_words");
5733 postcond(pc() != badAddress);
5734 return pc();
5735 }
5736
5737 #define SmallArraySize (18 * BytesPerLong)
5738
5739 // base: Address of a buffer to be zeroed, 8 bytes aligned.
5740 // cnt: Immediate count in HeapWords.
5741 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
5742 assert_different_registers(base, t0, t1);
5743
5744 BLOCK_COMMENT("zero_words {");
5745
5746 if (cnt <= SmallArraySize / BytesPerLong) {
5747 for (int i = 0; i < (int)cnt; i++) {
5748 sd(zr, Address(base, i * wordSize));
5749 }
5750 } else {
const int unroll = 8; // Number of sd(zr, adr) instructions we'll unroll
5752 int remainder = cnt % unroll;
5753 for (int i = 0; i < remainder; i++) {
5754 sd(zr, Address(base, i * wordSize));
5755 }
5756
5757 Label loop;
5758 Register cnt_reg = t0;
5759 Register loop_base = t1;
5760 cnt = cnt - remainder;
5761 mv(cnt_reg, cnt);
5762 addi(loop_base, base, remainder * wordSize);
5763 bind(loop);
5764 sub(cnt_reg, cnt_reg, unroll);
5765 for (int i = 0; i < unroll; i++) {
5766 sd(zr, Address(loop_base, i * wordSize));
5767 }
5768 addi(loop_base, loop_base, unroll * wordSize);
5769 bnez(cnt_reg, loop);
5770 }
5771
5772 BLOCK_COMMENT("} zero_words");
5773 }
5774
5775 // base: Address of a buffer to be filled, 8 bytes aligned.
// cnt: Count in 8-byte units.
5777 // value: Value to be filled with.
5778 // base will point to the end of the buffer after filling.
5779 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
5780 // Algorithm:
5781 //
5782 // t0 = cnt & 7
5783 // cnt -= t0
5784 // p += t0
5785 // switch (t0):
5786 // switch start:
5787 // do while cnt
5788 // cnt -= 8
5789 // p[-8] = value
5790 // case 7:
5791 // p[-7] = value
5792 // case 6:
5793 // p[-6] = value
5794 // // ...
5795 // case 1:
5796 // p[-1] = value
5797 // case 0:
5798 // p += 8
5799 // do-while end
5800 // switch end
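//
// Illustrative trace (a sketch): with cnt = 11, t0 = 3, so base is advanced
// by 3 words and the computed jump lands 3 sd instructions before `entry`,
// filling those 3 straggler words; the main loop then runs once for the
// remaining 8 words (11 = 3 + 8).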
5801
5802 assert_different_registers(base, cnt, value, t0, t1);
5803
5804 Label fini, skip, entry, loop;
5805 const int unroll = 8; // Number of sd instructions we'll unroll
5806
5807 beqz(cnt, fini);
5808
5809 andi(t0, cnt, unroll - 1);
5810 sub(cnt, cnt, t0);
5811 shadd(base, t0, base, t1, 3);
5812 la(t1, entry);
5813 slli(t0, t0, 2);
5814 sub(t1, t1, t0);
5815 jr(t1);
5816
5817 bind(loop);
5818 addi(base, base, unroll * wordSize);
5819 {
5820 IncompressibleScope scope(this); // Fixed length
5821 for (int i = -unroll; i < 0; i++) {
5822 sd(value, Address(base, i * 8));
5823 }
5824 }
5825 bind(entry);
5826 subi(cnt, cnt, unroll);
5827 bgez(cnt, loop);
5828
5829 bind(fini);
5830 }
5831
5832 // Zero blocks of memory by using CBO.ZERO.
5833 //
5834 // Aligns the base address first sufficiently for CBO.ZERO, then uses
5835 // CBO.ZERO repeatedly for every full block. cnt is the size to be
5836 // zeroed in HeapWords. Returns the count of words left to be zeroed
5837 // in cnt.
5838 //
5839 // NOTE: This is intended to be used in the zero_blocks() stub. If
5840 // you want to use it elsewhere, note that cnt must be >= zicboz_block_size.
5841 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
5842 int zicboz_block_size = VM_Version::zicboz_block_size.value();
5843 Label initial_table_end, loop;
5844
5845 // Align base with cache line size.
5846 neg(tmp1, base);
5847 andi(tmp1, tmp1, zicboz_block_size - 1);
5848
5849 // tmp1: the number of bytes to be filled to align the base with cache line size.
5850 add(base, base, tmp1);
5851 srai(tmp2, tmp1, 3);
5852 sub(cnt, cnt, tmp2);
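// Dispatch into the unrolled stores below: tmp1 bytes still to fill correspond
// to tmp1 / wordSize sd instructions, each 4 bytes here, i.e. tmp1 / 2 bytes
// of code to skip back from initial_table_end.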
5853 srli(tmp2, tmp1, 1);
5854 la(tmp1, initial_table_end);
5855 sub(tmp2, tmp1, tmp2);
5856 jr(tmp2);
5857 for (int i = -zicboz_block_size + wordSize; i < 0; i += wordSize) {
5858 sd(zr, Address(base, i));
5859 }
5860 bind(initial_table_end);
5861
5862 mv(tmp1, zicboz_block_size / wordSize);
5863 bind(loop);
5864 cbo_zero(base);
5865 sub(cnt, cnt, tmp1);
5866 addi(base, base, zicboz_block_size);
5867 bge(cnt, tmp1, loop);
5868 }
5869
5870 // java.lang.Math.round(float a)
5871 // Returns the closest int to the argument, with ties rounding to positive infinity.
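// Illustrative values (not emitted code): round(2.5f): 2.5f + 0.5f = 3.0f
// exactly, fcvt.w.s rdn gives 3; round(-2.5f): -2.5f + 0.5f = -2.0f exactly,
// fcvt.w.s rdn gives -2; both match Math.round's ties-toward-positive-infinity.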
5872 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
// This instruction sequence provides a performance improvement on all tested
// devices; don't change it without re-verification.
5875 Label done;
5876 mv(t0, jint_cast(0.5f));
5877 fmv_w_x(ftmp, t0);
5878
5879 // dst = 0 if NaN
5880 feq_s(t0, src, src); // replacing fclass with feq as performance optimization
5881 mv(dst, zr);
5882 beqz(t0, done);
5883
5884 // dst = (src + 0.5f) rounded down towards negative infinity
5885 // Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
5886 // RDN is required for fadd_s, RNE gives incorrect results:
5887 // --------------------------------------------------------------------
5888 // fadd.s rne (src + 0.5f): src = 8388609.000000 ftmp = 8388610.000000
5889 // fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
5890 // --------------------------------------------------------------------
5891 // fadd.s rdn (src + 0.5f): src = 8388609.000000 ftmp = 8388609.000000
5892 // fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
5893 // --------------------------------------------------------------------
5894 fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
5895 fcvt_w_s(dst, ftmp, RoundingMode::rdn);
5896
5897 bind(done);
5898 }
5899
5900 // java.lang.Math.round(double a)
5901 // Returns the closest long to the argument, with ties rounding to positive infinity.
5902 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
// This instruction sequence provides a performance improvement on all tested
// devices; don't change it without re-verification.
5905 Label done;
5906 mv(t0, julong_cast(0.5));
5907 fmv_d_x(ftmp, t0);
5908
5909 // dst = 0 if NaN
5910 feq_d(t0, src, src); // replacing fclass with feq as performance optimization
5911 mv(dst, zr);
5912 beqz(t0, done);
5913
5914 // dst = (src + 0.5) rounded down towards negative infinity
5915 fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
5916 fcvt_l_d(dst, ftmp, RoundingMode::rdn);
5917
5918 bind(done);
5919 }
5920
5921 // Helper routine processing the slow path of NaN when converting float to float16
5922 void MacroAssembler::float_to_float16_NaN(Register dst, FloatRegister src,
5923 Register tmp1, Register tmp2) {
5924 fmv_x_w(dst, src);
5925
5926 // Float (32 bits)
5927 // Bit: 31 30 to 23 22 to 0
5928 // +---+------------------+-----------------------------+
5929 // | S | Exponent | Mantissa (Fraction) |
5930 // +---+------------------+-----------------------------+
5931 // 1 bit 8 bits 23 bits
5932 //
5933 // Float (16 bits)
5934 // Bit: 15 14 to 10 9 to 0
5935 // +---+----------------+------------------+
5936 // | S | Exponent | Mantissa |
5937 // +---+----------------+------------------+
5938 // 1 bit 5 bits 10 bits
5939 const int fp_sign_bits = 1;
5940 const int fp32_bits = 32;
5941 const int fp32_exponent_bits = 8;
5942 const int fp32_mantissa_1st_part_bits = 10;
5943 const int fp32_mantissa_2nd_part_bits = 9;
5944 const int fp32_mantissa_3rd_part_bits = 4;
5945 const int fp16_exponent_bits = 5;
5946 const int fp16_mantissa_bits = 10;
5947
5948 // preserve the sign bit and exponent, clear mantissa.
5949 srai(tmp2, dst, fp32_bits - fp_sign_bits - fp16_exponent_bits);
5950 slli(tmp2, tmp2, fp16_mantissa_bits);
5951
5952 // Preserve high order bit of float NaN in the
5953 // binary16 result NaN (tenth bit); OR in remaining
5954 // bits into lower 9 bits of binary 16 significand.
5955 // | (doppel & 0x007f_e000) >> 13 // 10 bits
5956 // | (doppel & 0x0000_1ff0) >> 4 // 9 bits
5957 // | (doppel & 0x0000_000f)); // 4 bits
5958 //
5959 // Check j.l.Float.floatToFloat16 for more information.
5960 // 10 bits
5961 int left_shift = fp_sign_bits + fp32_exponent_bits + 32;
5962 int right_shift = left_shift + fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits;
5963 slli(tmp1, dst, left_shift);
5964 srli(tmp1, tmp1, right_shift);
5965 orr(tmp2, tmp2, tmp1);
5966 // 9 bits
5967 left_shift += fp32_mantissa_1st_part_bits;
5968 right_shift = left_shift + fp32_mantissa_3rd_part_bits;
5969 slli(tmp1, dst, left_shift);
5970 srli(tmp1, tmp1, right_shift);
5971 orr(tmp2, tmp2, tmp1);
5972 // 4 bits
5973 andi(tmp1, dst, 0xf);
5974 orr(dst, tmp2, tmp1);
5975 }
5976
5977 #define FCVT_SAFE(FLOATCVT, FLOATSIG) \
5978 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) { \
5979 Label done; \
5980 assert_different_registers(dst, tmp); \
5981 fclass_##FLOATSIG(tmp, src); \
5982 mv(dst, zr); \
5983 /* check if src is NaN */ \
5984 andi(tmp, tmp, FClassBits::nan); \
5985 bnez(tmp, done); \
5986 FLOATCVT(dst, src); \
5987 bind(done); \
5988 }
5989
5990 FCVT_SAFE(fcvt_w_s, s);
5991 FCVT_SAFE(fcvt_l_s, s);
5992 FCVT_SAFE(fcvt_w_d, d);
5993 FCVT_SAFE(fcvt_l_d, d);
5994
5995 #undef FCVT_SAFE
5996
5997 #define FCMP(FLOATTYPE, FLOATSIG) \
5998 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1, \
5999 FloatRegister Rs2, int unordered_result) { \
6000 Label Ldone; \
6001 if (unordered_result < 0) { \
6002 /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */ \
6003 /* installs 1 if gt else 0 */ \
6004 flt_##FLOATSIG(result, Rs2, Rs1); \
6005 /* Rs1 > Rs2, install 1 */ \
6006 bgtz(result, Ldone); \
6007 feq_##FLOATSIG(result, Rs1, Rs2); \
6008 subi(result, result, 1); \
6009 /* Rs1 = Rs2, install 0 */ \
6010 /* NaN or Rs1 < Rs2, install -1 */ \
6011 bind(Ldone); \
6012 } else { \
6013 /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */ \
6014 /* installs 1 if gt or unordered else 0 */ \
6015 flt_##FLOATSIG(result, Rs1, Rs2); \
6016 /* Rs1 < Rs2, install -1 */ \
6017 bgtz(result, Ldone); \
6018 feq_##FLOATSIG(result, Rs1, Rs2); \
6019 subi(result, result, 1); \
6020 /* Rs1 = Rs2, install 0 */ \
6021 /* NaN or Rs1 > Rs2, install 1 */ \
6022 bind(Ldone); \
6023 neg(result, result); \
6024 } \
6025 }
6026
6027 FCMP(float, s);
6028 FCMP(double, d);
6029
6030 #undef FCMP
6031
6032 // Zero words; len is in bytes
6033 // Destroys all registers except addr
6034 // len must be a nonzero multiple of wordSize
6035 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
6036 assert_different_registers(addr, len, tmp, t0, t1);
6037
6038 #ifdef ASSERT
6039 {
6040 Label L;
6041 andi(t0, len, BytesPerWord - 1);
6042 beqz(t0, L);
6043 stop("len is not a multiple of BytesPerWord");
6044 bind(L);
6045 }
6046 #endif // ASSERT
6047
6048 #ifndef PRODUCT
6049 block_comment("zero memory");
6050 #endif // PRODUCT
6051
6052 Label loop;
6053 Label entry;
6054
6055 // Algorithm:
6056 //
6057 // t0 = cnt & 7
6058 // cnt -= t0
6059 // p += t0
6060 // switch (t0) {
6061 // do {
6062 // cnt -= 8
6063 // p[-8] = 0
6064 // case 7:
6065 // p[-7] = 0
6066 // case 6:
6067 // p[-6] = 0
6068 // ...
6069 // case 1:
6070 // p[-1] = 0
6071 // case 0:
6072 // p += 8
6073 // } while (cnt)
6074 // }
6075
6076 const int unroll = 8; // Number of sd(zr) instructions we'll unroll
6077
6078 srli(len, len, LogBytesPerWord);
6079 andi(t0, len, unroll - 1); // t0 = cnt % unroll
6080 sub(len, len, t0); // cnt -= unroll
6081 // tmp always points to the end of the region we're about to zero
6082 shadd(tmp, t0, addr, t1, LogBytesPerWord);
6083 la(t1, entry);
6084 slli(t0, t0, 2);
6085 sub(t1, t1, t0);
6086 jr(t1);
6087
6088 bind(loop);
6089 sub(len, len, unroll);
6090 {
6091 IncompressibleScope scope(this); // Fixed length
6092 for (int i = -unroll; i < 0; i++) {
6093 sd(zr, Address(tmp, i * wordSize));
6094 }
6095 }
6096 bind(entry);
6097 add(tmp, tmp, unroll * wordSize);
6098 bnez(len, loop);
6099 }
6100
6101 // shift left by shamt and add
6102 // Rd = (Rs1 << shamt) + Rs2
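// e.g. (illustrative) shadd(Rd, idx, base, tmp, LogBytesPerInt) computes
// Rd = base + (idx << 2), using sh2add directly when Zba is available.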
6103 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
6104 if (UseZba) {
6105 if (shamt == 1) {
6106 sh1add(Rd, Rs1, Rs2);
6107 return;
6108 } else if (shamt == 2) {
6109 sh2add(Rd, Rs1, Rs2);
6110 return;
6111 } else if (shamt == 3) {
6112 sh3add(Rd, Rs1, Rs2);
6113 return;
6114 }
6115 }
6116
6117 if (shamt != 0) {
6118 assert_different_registers(Rs2, tmp);
6119 slli(tmp, Rs1, shamt);
6120 add(Rd, Rs2, tmp);
6121 } else {
6122 add(Rd, Rs1, Rs2);
6123 }
6124 }
6125
6126 void MacroAssembler::zext(Register dst, Register src, int bits) {
6127 switch (bits) {
6128 case 32:
6129 if (UseZba) {
6130 zext_w(dst, src);
6131 return;
6132 }
6133 break;
6134 case 16:
6135 if (UseZbb) {
6136 zext_h(dst, src);
6137 return;
6138 }
6139 break;
6140 case 8:
6141 zext_b(dst, src);
6142 return;
6143 default:
6144 break;
6145 }
6146
6147 slli(dst, src, XLEN - bits);
6148 srli(dst, dst, XLEN - bits);
6149 }
6150
6151 void MacroAssembler::sext(Register dst, Register src, int bits) {
6152 switch (bits) {
6153 case 32:
6154 sext_w(dst, src);
6155 return;
6156 case 16:
6157 if (UseZbb) {
6158 sext_h(dst, src);
6159 return;
6160 }
6161 break;
6162 case 8:
6163 if (UseZbb) {
6164 sext_b(dst, src);
6165 return;
6166 }
6167 break;
6168 default:
6169 break;
6170 }
6171
6172 slli(dst, src, XLEN - bits);
6173 srai(dst, dst, XLEN - bits);
6174 }
6175
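// Three-way compare: dst = 1 if src1 > src2, -1 if src1 < src2, 0 if equal,
// using signed or unsigned comparison according to is_signed.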
6176 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
6177 Register tmp, bool is_signed) {
6178 if (src1 == src2) {
6179 mv(dst, zr);
6180 return;
6181 }
6182 Label done;
6183 Register left = src1;
6184 Register right = src2;
6185 if (dst == src1) {
6186 assert_different_registers(dst, src2, tmp);
6187 mv(tmp, src1);
6188 left = tmp;
6189 } else if (dst == src2) {
6190 assert_different_registers(dst, src1, tmp);
6191 mv(tmp, src2);
6192 right = tmp;
6193 }
6194
6195 // installs 1 if gt else 0
6196 if (is_signed) {
6197 slt(dst, right, left);
6198 } else {
6199 sltu(dst, right, left);
6200 }
6201 bnez(dst, done);
6202 if (is_signed) {
6203 slt(dst, left, right);
6204 } else {
6205 sltu(dst, left, right);
6206 }
// dst = -1 if lt; dst = 0 if eq
6208 neg(dst, dst);
6209 bind(done);
6210 }
6211
6212 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
6213 {
6214 cmp_x2i(dst, src1, src2, tmp);
6215 }
6216
6217 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
6218 cmp_x2i(dst, src1, src2, tmp, false);
6219 }
6220
6221 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
6222 cmp_x2i(dst, src1, src2, tmp, false);
6223 }
6224
6225 // The java_calling_convention describes stack locations as ideal slots on
6226 // a frame with no abi restrictions. Since we must observe abi restrictions
6227 // (like the placement of the register window) the slots must be biased by
6228 // the following value.
6229 static int reg2offset_in(VMReg r) {
6230 // Account for saved fp and ra
6231 // This should really be in_preserve_stack_slots
6232 return r->reg2stack() * VMRegImpl::stack_slot_size;
6233 }
6234
6235 static int reg2offset_out(VMReg r) {
6236 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
6237 }
6238
6239 // The C ABI specifies:
6240 // "integer scalars narrower than XLEN bits are widened according to the sign
6241 // of their type up to 32 bits, then sign-extended to XLEN bits."
6242 // Applies for both passed in register and stack.
6243 //
// Java uses 32-bit stack slots; jint, jshort, jchar and jbyte use one slot each.
// Native uses 64-bit stack slots for all integer scalar types.
//
// lw loads the Java stack slot and sign-extends it, and
// sd stores this widened integer into a 64-bit native stack slot.
6249 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
6250 if (src.first()->is_stack()) {
6251 if (dst.first()->is_stack()) {
6252 // stack to stack
6253 lw(tmp, Address(fp, reg2offset_in(src.first())));
6254 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6255 } else {
6256 // stack to reg
6257 lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6258 }
6259 } else if (dst.first()->is_stack()) {
6260 // reg to stack
6261 sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6262 } else {
6263 if (dst.first() != src.first()) {
6264 sext(dst.first()->as_Register(), src.first()->as_Register(), 32);
6265 }
6266 }
6267 }
6268
// An oop arg. Must pass a handle, not the oop itself.
6270 void MacroAssembler::object_move(OopMap* map,
6271 int oop_handle_offset,
6272 int framesize_in_slots,
6273 VMRegPair src,
6274 VMRegPair dst,
6275 bool is_receiver,
6276 int* receiver_offset) {
6277 assert_cond(map != nullptr && receiver_offset != nullptr);
6278
6279 // must pass a handle. First figure out the location we use as a handle
6280 Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
6281
// See if the oop is null; if it is, we need no handle
6283
6284 if (src.first()->is_stack()) {
6285 // Oop is already on the stack as an argument
6286 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
6287 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
6288 if (is_receiver) {
6289 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
6290 }
6291
6292 ld(t0, Address(fp, reg2offset_in(src.first())));
6293 la(rHandle, Address(fp, reg2offset_in(src.first())));
6294 // conditionally move a null
6295 Label notZero1;
6296 bnez(t0, notZero1);
6297 mv(rHandle, zr);
6298 bind(notZero1);
6299 } else {
6300
// The oop is in a register; we must store it to the space we reserve
// on the stack for oop_handles and pass a handle if the oop is non-null
6303
6304 const Register rOop = src.first()->as_Register();
6305 int oop_slot = -1;
6306 if (rOop == j_rarg0) {
6307 oop_slot = 0;
6308 } else if (rOop == j_rarg1) {
6309 oop_slot = 1;
6310 } else if (rOop == j_rarg2) {
6311 oop_slot = 2;
6312 } else if (rOop == j_rarg3) {
6313 oop_slot = 3;
6314 } else if (rOop == j_rarg4) {
6315 oop_slot = 4;
6316 } else if (rOop == j_rarg5) {
6317 oop_slot = 5;
6318 } else if (rOop == j_rarg6) {
6319 oop_slot = 6;
6320 } else {
6321 assert(rOop == j_rarg7, "wrong register");
6322 oop_slot = 7;
6323 }
6324
6325 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
6326 int offset = oop_slot * VMRegImpl::stack_slot_size;
6327
6328 map->set_oop(VMRegImpl::stack2reg(oop_slot));
6329 // Store oop in handle area, may be null
6330 sd(rOop, Address(sp, offset));
6331 if (is_receiver) {
6332 *receiver_offset = offset;
6333 }
6334
// rOop may be the same as rHandle
6336 if (rOop == rHandle) {
6337 Label isZero;
6338 beqz(rOop, isZero);
6339 la(rHandle, Address(sp, offset));
6340 bind(isZero);
6341 } else {
6342 Label notZero2;
6343 la(rHandle, Address(sp, offset));
6344 bnez(rOop, notZero2);
6345 mv(rHandle, zr);
6346 bind(notZero2);
6347 }
6348 }
6349
// If the arg is on the stack then place it there; otherwise it is already in the correct reg.
6351 if (dst.first()->is_stack()) {
6352 sd(rHandle, Address(sp, reg2offset_out(dst.first())));
6353 }
6354 }
6355
// A float arg may have to be moved between a float register and an integer register or stack slot
6357 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
6358 assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6359 (src.first()->is_reg() && dst.first()->is_reg()) ||
6360 (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6361 if (src.first()->is_stack()) {
6362 if (dst.first()->is_stack()) {
6363 lwu(tmp, Address(fp, reg2offset_in(src.first())));
6364 sw(tmp, Address(sp, reg2offset_out(dst.first())));
6365 } else if (dst.first()->is_Register()) {
6366 lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6367 } else {
6368 ShouldNotReachHere();
6369 }
6370 } else if (src.first() != dst.first()) {
6371 if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6372 fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6373 } else {
6374 ShouldNotReachHere();
6375 }
6376 }
6377 }
6378
6379 // A long move
6380 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
6381 if (src.first()->is_stack()) {
6382 if (dst.first()->is_stack()) {
6383 // stack to stack
6384 ld(tmp, Address(fp, reg2offset_in(src.first())));
6385 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6386 } else {
6387 // stack to reg
6388 ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6389 }
6390 } else if (dst.first()->is_stack()) {
6391 // reg to stack
6392 sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6393 } else {
6394 if (dst.first() != src.first()) {
6395 mv(dst.first()->as_Register(), src.first()->as_Register());
6396 }
6397 }
6398 }
6399
6400 // A double move
6401 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
6402 assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6403 (src.first()->is_reg() && dst.first()->is_reg()) ||
6404 (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6405 if (src.first()->is_stack()) {
6406 if (dst.first()->is_stack()) {
6407 ld(tmp, Address(fp, reg2offset_in(src.first())));
6408 sd(tmp, Address(sp, reg2offset_out(dst.first())));
} else if (dst.first()->is_Register()) {
6410 ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6411 } else {
6412 ShouldNotReachHere();
6413 }
6414 } else if (src.first() != dst.first()) {
6415 if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6416 fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6417 } else {
6418 ShouldNotReachHere();
6419 }
6420 }
6421 }
6422
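// Test bit `bit_pos` of Rs: Rd becomes non-zero iff that bit is set (it is only
// guaranteed to be exactly 1 on the bexti and srli paths). Illustrative
// expansions (a sketch, assuming UseZbs is off): test_bit(Rd, Rs, 3) emits
// andi(Rd, Rs, 8), while test_bit(Rd, Rs, 40) falls back to srli + andi since
// 1 << 40 is not a simm12.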
6423 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
6424 assert(bit_pos < 64, "invalid bit range");
6425 if (UseZbs) {
6426 bexti(Rd, Rs, bit_pos);
6427 return;
6428 }
6429 int64_t imm = (int64_t)(1UL << bit_pos);
6430 if (is_simm12(imm)) {
6431 andi(Rd, Rs, imm);
6432 } else {
6433 srli(Rd, Rs, bit_pos);
6434 andi(Rd, Rd, 1);
6435 }
6436 }
6437
6438 // Implements lightweight-locking.
6439 //
// - basic_lock: points to the BasicObjectLock; used only to clear its cached
//               ObjectMonitor when UseObjectMonitorTable is enabled
// - obj: the object to be locked
// - tmp1, tmp2, tmp3: temporary registers, will be destroyed
// - slow: branched to if locking fails
6443 void MacroAssembler::lightweight_lock(Register basic_lock, Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6444 assert_different_registers(basic_lock, obj, tmp1, tmp2, tmp3, t0);
6445
6446 Label push;
6447 const Register top = tmp1;
6448 const Register mark = tmp2;
6449 const Register t = tmp3;
6450
6451 // Preload the markWord. It is important that this is the first
6452 // instruction emitted as it is part of C1's null check semantics.
6453 ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6454
6455 if (UseObjectMonitorTable) {
6456 // Clear cache in case fast locking succeeds or we need to take the slow-path.
6457 sd(zr, Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))));
6458 }
6459
6460 if (DiagnoseSyncOnValueBasedClasses != 0) {
6461 load_klass(tmp1, obj);
6462 lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
6463 test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
6464 bnez(tmp1, slow, /* is_far */ true);
6465 }
6466
6467 // Check if the lock-stack is full.
6468 lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6469 mv(t, (unsigned)LockStack::end_offset());
6470 bge(top, t, slow, /* is_far */ true);
6471
6472 // Check for recursion.
6473 add(t, xthread, top);
6474 ld(t, Address(t, -oopSize));
6475 beq(obj, t, push);
6476
6477 // Check header for monitor (0b10).
6478 test_bit(t, mark, exact_log2(markWord::monitor_value));
6479 bnez(t, slow, /* is_far */ true);
6480
6481 // Try to lock. Transition lock-bits 0b01 => 0b00
6482 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
6483 ori(mark, mark, markWord::unlocked_value);
6484 xori(t, mark, markWord::unlocked_value);
6485 cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6486 /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
6487 bne(mark, t, slow, /* is_far */ true);
6488
6489 bind(push);
6490 // After successful lock, push object on lock-stack.
6491 add(t, xthread, top);
6492 sd(obj, Address(t));
6493 addiw(top, top, oopSize);
6494 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6495 }
6496
// Implements lightweight-unlocking.
6498 //
6499 // - obj: the object to be unlocked
6500 // - tmp1, tmp2, tmp3: temporary registers
6501 // - slow: branched to if unlocking fails
6502 void MacroAssembler::lightweight_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6503 assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
6504
6505 #ifdef ASSERT
6506 {
6507 // Check for lock-stack underflow.
6508 Label stack_ok;
6509 lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
6510 mv(tmp2, (unsigned)LockStack::start_offset());
6511 bge(tmp1, tmp2, stack_ok);
6512 STOP("Lock-stack underflow");
6513 bind(stack_ok);
6514 }
6515 #endif
6516
6517 Label unlocked, push_and_slow;
6518 const Register top = tmp1;
6519 const Register mark = tmp2;
6520 const Register t = tmp3;
6521
6522 // Check if obj is top of lock-stack.
6523 lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6524 subiw(top, top, oopSize);
6525 add(t, xthread, top);
6526 ld(t, Address(t));
6527 bne(obj, t, slow, /* is_far */ true);
6528
6529 // Pop lock-stack.
6530 DEBUG_ONLY(add(t, xthread, top);)
6531 DEBUG_ONLY(sd(zr, Address(t));)
6532 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6533
6534 // Check if recursive.
6535 add(t, xthread, top);
6536 ld(t, Address(t, -oopSize));
6537 beq(obj, t, unlocked);
6538
6539 // Not recursive. Check header for monitor (0b10).
6540 ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6541 test_bit(t, mark, exact_log2(markWord::monitor_value));
6542 bnez(t, push_and_slow);
6543
6544 #ifdef ASSERT
6545 // Check header not unlocked (0b01).
6546 Label not_unlocked;
6547 test_bit(t, mark, exact_log2(markWord::unlocked_value));
6548 beqz(t, not_unlocked);
6549 stop("lightweight_unlock already unlocked");
6550 bind(not_unlocked);
6551 #endif
6552
6553 // Try to unlock. Transition lock bits 0b00 => 0b01
6554 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
6555 ori(t, mark, markWord::unlocked_value);
6556 cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6557 /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
6558 beq(mark, t, unlocked);
6559
6560 bind(push_and_slow);
6561 // Restore lock-stack and handle the unlock in runtime.
6562 DEBUG_ONLY(add(t, xthread, top);)
6563 DEBUG_ONLY(sd(obj, Address(t));)
6564 addiw(top, top, oopSize);
6565 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6566 j(slow);
6567
6568 bind(unlocked);
6569 }