1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
4 * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 *
7 * This code is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 only, as
9 * published by the Free Software Foundation.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 *
25 */
26
27 #include "asm/assembler.hpp"
28 #include "asm/assembler.inline.hpp"
29 #include "ci/ciInlineKlass.hpp"
30 #include "code/compiledIC.hpp"
31 #include "compiler/disassembler.hpp"
32 #include "gc/shared/barrierSet.hpp"
33 #include "gc/shared/barrierSetAssembler.hpp"
34 #include "gc/shared/cardTable.hpp"
35 #include "gc/shared/cardTableBarrierSet.hpp"
36 #include "gc/shared/collectedHeap.hpp"
37 #include "interpreter/bytecodeHistogram.hpp"
38 #include "interpreter/interpreter.hpp"
39 #include "interpreter/interpreterRuntime.hpp"
40 #include "memory/resourceArea.hpp"
41 #include "memory/universe.hpp"
42 #include "oops/accessDecorators.hpp"
43 #include "oops/compressedKlass.inline.hpp"
44 #include "oops/compressedOops.inline.hpp"
45 #include "oops/klass.inline.hpp"
46 #include "oops/oop.hpp"
47 #include "oops/resolvedFieldEntry.hpp"
48 #include "runtime/interfaceSupport.inline.hpp"
49 #include "runtime/javaThread.hpp"
50 #include "runtime/jniHandles.inline.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubRoutines.hpp"
53 #include "utilities/globalDefinitions.hpp"
54 #include "utilities/integerCast.hpp"
55 #include "utilities/powerOfTwo.hpp"
56 #ifdef COMPILER2
57 #include "opto/compile.hpp"
58 #include "opto/node.hpp"
59 #include "opto/output.hpp"
60 #endif
61
// BLOCK_COMMENT(str) forwards to block_comment(str) in non-PRODUCT builds and
// expands to nothing in PRODUCT builds.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
// STOP(str) expands to a stop(str) call. Note that the expansion already ends
// with a semicolon.
#define STOP(str) stop(str);
// BIND(label) binds 'label' and then emits a block comment holding the label's
// spelling followed by ':'. The expansion is two statements and relies on the
// '__' macro being defined at the point of use.
#define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
69
70
71
72 Register MacroAssembler::extract_rs1(address instr) {
73 assert_cond(instr != nullptr);
74 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
75 }
76
77 Register MacroAssembler::extract_rs2(address instr) {
78 assert_cond(instr != nullptr);
79 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
80 }
81
82 Register MacroAssembler::extract_rd(address instr) {
83 assert_cond(instr != nullptr);
84 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
85 }
86
87 uint32_t MacroAssembler::extract_opcode(address instr) {
88 assert_cond(instr != nullptr);
89 return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
90 }
91
92 uint32_t MacroAssembler::extract_funct3(address instr) {
93 assert_cond(instr != nullptr);
94 return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
95 }
96
97 bool MacroAssembler::is_pc_relative_at(address instr) {
98 // auipc + jalr
99 // auipc + addi
100 // auipc + load
101 // auipc + fload_load
102 return (is_auipc_at(instr)) &&
103 (is_addi_at(instr + MacroAssembler::instruction_size) ||
104 is_jalr_at(instr + MacroAssembler::instruction_size) ||
105 is_load_at(instr + MacroAssembler::instruction_size) ||
106 is_float_load_at(instr + MacroAssembler::instruction_size)) &&
107 check_pc_relative_data_dependency(instr);
108 }
109
110 // ie:ld(Rd, Label)
111 bool MacroAssembler::is_load_pc_relative_at(address instr) {
112 return is_auipc_at(instr) && // auipc
113 is_ld_at(instr + MacroAssembler::instruction_size) && // ld
114 check_load_pc_relative_data_dependency(instr);
115 }
116
117 bool MacroAssembler::is_movptr1_at(address instr) {
118 return is_lui_at(instr) && // Lui
119 is_addi_at(instr + MacroAssembler::instruction_size) && // Addi
120 is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 11) && // Slli Rd, Rs, 11
121 is_addi_at(instr + MacroAssembler::instruction_size * 3) && // Addi
122 is_slli_shift_at(instr + MacroAssembler::instruction_size * 4, 6) && // Slli Rd, Rs, 6
123 (is_addi_at(instr + MacroAssembler::instruction_size * 5) ||
124 is_jalr_at(instr + MacroAssembler::instruction_size * 5) ||
125 is_load_at(instr + MacroAssembler::instruction_size * 5)) && // Addi/Jalr/Load
126 check_movptr1_data_dependency(instr);
127 }
128
129 bool MacroAssembler::is_movptr2_at(address instr) {
130 return is_lui_at(instr) && // lui
131 is_lui_at(instr + MacroAssembler::instruction_size) && // lui
132 is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 18) && // slli Rd, Rs, 18
133 is_add_at(instr + MacroAssembler::instruction_size * 3) &&
134 (is_addi_at(instr + MacroAssembler::instruction_size * 4) ||
135 is_jalr_at(instr + MacroAssembler::instruction_size * 4) ||
136 is_load_at(instr + MacroAssembler::instruction_size * 4)) && // Addi/Jalr/Load
137 check_movptr2_data_dependency(instr);
138 }
139
140 bool MacroAssembler::is_li16u_at(address instr) {
141 return is_lui_at(instr) && // lui
142 is_srli_at(instr + MacroAssembler::instruction_size) && // srli
143 check_li16u_data_dependency(instr);
144 }
145
146 bool MacroAssembler::is_li32_at(address instr) {
147 return is_lui_at(instr) && // lui
148 is_addiw_at(instr + MacroAssembler::instruction_size) && // addiw
149 check_li32_data_dependency(instr);
150 }
151
152 bool MacroAssembler::is_lwu_to_zr(address instr) {
153 assert_cond(instr != nullptr);
154 return (extract_opcode(instr) == 0b0000011 &&
155 extract_funct3(instr) == 0b110 &&
156 extract_rd(instr) == zr); // zr
157 }
158
159 uint32_t MacroAssembler::get_membar_kind(address addr) {
160 assert_cond(addr != nullptr);
161 assert(is_membar(addr), "no membar found");
162
163 uint32_t insn = Bytes::get_native_u4(addr);
164
165 uint32_t predecessor = Assembler::extract(insn, 27, 24);
166 uint32_t successor = Assembler::extract(insn, 23, 20);
167
168 return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
169 }
170
171 void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
172 assert_cond(addr != nullptr);
173 assert(is_membar(addr), "no membar found");
174
175 uint32_t predecessor = 0;
176 uint32_t successor = 0;
177
178 MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);
179
180 uint32_t insn = Bytes::get_native_u4(addr);
181 address pInsn = (address) &insn;
182 Assembler::patch(pInsn, 27, 24, predecessor);
183 Assembler::patch(pInsn, 23, 20, successor);
184
185 address membar = addr;
186 Assembler::sd_instr(membar, insn);
187 }
188
189 static void pass_arg0(MacroAssembler* masm, Register arg) {
190 if (c_rarg0 != arg) {
191 masm->mv(c_rarg0, arg);
192 }
193 }
194
195 static void pass_arg1(MacroAssembler* masm, Register arg) {
196 if (c_rarg1 != arg) {
197 masm->mv(c_rarg1, arg);
198 }
199 }
200
201 static void pass_arg2(MacroAssembler* masm, Register arg) {
202 if (c_rarg2 != arg) {
203 masm->mv(c_rarg2, arg);
204 }
205 }
206
207 static void pass_arg3(MacroAssembler* masm, Register arg) {
208 if (c_rarg3 != arg) {
209 masm->mv(c_rarg3, arg);
210 }
211 }
212
213 void MacroAssembler::push_cont_fastpath(Register java_thread) {
214 if (!Continuations::enabled()) return;
215 Label done;
216 ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
217 bleu(sp, t0, done);
218 sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
219 bind(done);
220 }
221
222 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
223 if (!Continuations::enabled()) return;
224 Label done;
225 ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
226 bltu(sp, t0, done);
227 sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
228 bind(done);
229 }
230
231 int MacroAssembler::align(int modulus, int extra_offset) {
232 CompressibleScope scope(this);
233 intptr_t before = offset();
234 while ((offset() + extra_offset) % modulus != 0) { nop(); }
235 return (int)(offset() - before);
236 }
237
238 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
239 call_VM_base(oop_result, noreg, noreg, nullptr, entry_point, number_of_arguments, check_exceptions);
240 }
241
242 // Implementation of call_VM versions
243
244 void MacroAssembler::call_VM(Register oop_result,
245 address entry_point,
246 bool check_exceptions) {
247 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
248 }
249
250 void MacroAssembler::call_VM(Register oop_result,
251 address entry_point,
252 Register arg_1,
253 bool check_exceptions) {
254 pass_arg1(this, arg_1);
255 call_VM_helper(oop_result, entry_point, 1, check_exceptions);
256 }
257
258 void MacroAssembler::call_VM(Register oop_result,
259 address entry_point,
260 Register arg_1,
261 Register arg_2,
262 bool check_exceptions) {
263 assert_different_registers(arg_1, c_rarg2);
264 pass_arg2(this, arg_2);
265 pass_arg1(this, arg_1);
266 call_VM_helper(oop_result, entry_point, 2, check_exceptions);
267 }
268
269 void MacroAssembler::call_VM(Register oop_result,
270 address entry_point,
271 Register arg_1,
272 Register arg_2,
273 Register arg_3,
274 bool check_exceptions) {
275 assert_different_registers(arg_1, c_rarg2, c_rarg3);
276 assert_different_registers(arg_2, c_rarg3);
277 pass_arg3(this, arg_3);
278
279 pass_arg2(this, arg_2);
280
281 pass_arg1(this, arg_1);
282 call_VM_helper(oop_result, entry_point, 3, check_exceptions);
283 }
284
285 void MacroAssembler::call_VM(Register oop_result,
286 Register last_java_sp,
287 address entry_point,
288 int number_of_arguments,
289 bool check_exceptions) {
290 call_VM_base(oop_result, xthread, last_java_sp, nullptr, entry_point, number_of_arguments, check_exceptions);
291 }
292
293 void MacroAssembler::call_VM(Register oop_result,
294 Register last_java_sp,
295 address entry_point,
296 Register arg_1,
297 bool check_exceptions) {
298 pass_arg1(this, arg_1);
299 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
300 }
301
302 void MacroAssembler::call_VM(Register oop_result,
303 Register last_java_sp,
304 address entry_point,
305 Register arg_1,
306 Register arg_2,
307 bool check_exceptions) {
308
309 assert_different_registers(arg_1, c_rarg2);
310 pass_arg2(this, arg_2);
311 pass_arg1(this, arg_1);
312 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
313 }
314
315 void MacroAssembler::call_VM(Register oop_result,
316 Register last_java_sp,
317 address entry_point,
318 Register arg_1,
319 Register arg_2,
320 Register arg_3,
321 bool check_exceptions) {
322 assert_different_registers(arg_1, c_rarg2, c_rarg3);
323 assert_different_registers(arg_2, c_rarg3);
324 pass_arg3(this, arg_3);
325 pass_arg2(this, arg_2);
326 pass_arg1(this, arg_1);
327 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
328 }
329
330 void MacroAssembler::post_call_nop() {
331 assert(!in_compressible_scope(), "Must be");
332 assert_alignment(pc());
333 if (!Continuations::enabled()) {
334 return;
335 }
336 relocate(post_call_nop_Relocation::spec());
337 InlineSkippedInstructionsCounter skipCounter(this);
338 nop();
339 li32(zr, 0);
340 }
341
342 // these are no-ops overridden by InterpreterMacroAssembler
343 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
344 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
345
346 // Calls to C land
347 //
348 // When entering C land, the fp, & esp of the last Java frame have to be recorded
349 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
350 // has to be reset to 0. This is required to allow proper stack traversal.
351 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
352 Register last_java_fp,
353 Register last_java_pc) {
354
355 if (last_java_pc->is_valid()) {
356 sd(last_java_pc, Address(xthread,
357 JavaThread::frame_anchor_offset() +
358 JavaFrameAnchor::last_Java_pc_offset()));
359 }
360
361 // determine last_java_sp register
362 if (!last_java_sp->is_valid()) {
363 last_java_sp = esp;
364 }
365
366 // last_java_fp is optional
367 if (last_java_fp->is_valid()) {
368 sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
369 }
370
371 // We must set sp last.
372 sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
373
374 }
375
376 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
377 Register last_java_fp,
378 address last_java_pc,
379 Register tmp) {
380 assert(last_java_pc != nullptr, "must provide a valid PC");
381
382 la(tmp, last_java_pc);
383 sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
384
385 set_last_Java_frame(last_java_sp, last_java_fp, noreg);
386 }
387
388 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
389 Register last_java_fp,
390 Label &L,
391 Register tmp) {
392 if (L.is_bound()) {
393 set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
394 } else {
395 L.add_patch_at(code(), locator());
396 IncompressibleScope scope(this); // the label address will be patched back.
397 set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
398 }
399 }
400
401 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
402 // we must set sp to zero to clear frame
403 sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
404
405 // must clear fp, so that compiled frames are not confused; it is
406 // possible that we need it only for debugging
407 if (clear_fp) {
408 sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
409 }
410
411 // Always clear the pc because it could have been set by make_walkable()
412 sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
413 }
414
415 void MacroAssembler::call_VM_base(Register oop_result,
416 Register java_thread,
417 Register last_java_sp,
418 Label* return_pc,
419 address entry_point,
420 int number_of_arguments,
421 bool check_exceptions) {
422 // determine java_thread register
423 if (!java_thread->is_valid()) {
424 java_thread = xthread;
425 }
426
427 // determine last_java_sp register
428 if (!last_java_sp->is_valid()) {
429 last_java_sp = esp;
430 }
431
432 // debugging support
433 assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
434 assert(java_thread == xthread, "unexpected register");
435
436 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
437 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
438
439 // push java thread (becomes first argument of C function)
440 mv(c_rarg0, java_thread);
441
442 // set last Java frame before call
443 assert(last_java_sp != fp, "can't use fp");
444
445 Label l;
446 set_last_Java_frame(last_java_sp, fp, return_pc != nullptr ? *return_pc : l, t0);
447
448 // do the call, remove parameters
449 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
450
451 // reset last Java frame
452 // Only interpreter should have to clear fp
453 reset_last_Java_frame(true);
454
455 // C++ interp handles this in the interpreter
456 check_and_handle_popframe(java_thread);
457 check_and_handle_earlyret(java_thread);
458
459 if (check_exceptions) {
460 // check for pending exceptions (java_thread is set upon return)
461 ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
462 Label ok;
463 beqz(t0, ok);
464 j(RuntimeAddress(StubRoutines::forward_exception_entry()));
465 bind(ok);
466 }
467
468 // get oop result if there is one and reset the value in the thread
469 if (oop_result->is_valid()) {
470 get_vm_result_oop(oop_result, java_thread);
471 }
472 }
473
474 void MacroAssembler::get_vm_result_oop(Register oop_result, Register java_thread) {
475 ld(oop_result, Address(java_thread, JavaThread::vm_result_oop_offset()));
476 sd(zr, Address(java_thread, JavaThread::vm_result_oop_offset()));
477 verify_oop_msg(oop_result, "broken oop in call_VM_base");
478 }
479
480 void MacroAssembler::get_vm_result_metadata(Register metadata_result, Register java_thread) {
481 ld(metadata_result, Address(java_thread, JavaThread::vm_result_metadata_offset()));
482 sd(zr, Address(java_thread, JavaThread::vm_result_metadata_offset()));
483 }
484
485 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
486 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
487 assert_different_registers(klass, xthread, tmp);
488
489 Label L_fallthrough, L_tmp;
490 if (L_fast_path == nullptr) {
491 L_fast_path = &L_fallthrough;
492 } else if (L_slow_path == nullptr) {
493 L_slow_path = &L_fallthrough;
494 }
495
496 // Fast path check: class is fully initialized
497 lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
498 membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
499 sub(tmp, tmp, InstanceKlass::fully_initialized);
500 beqz(tmp, *L_fast_path);
501
502 // Fast path check: current thread is initializer thread
503 ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
504
505 if (L_slow_path == &L_fallthrough) {
506 beq(xthread, tmp, *L_fast_path);
507 bind(*L_slow_path);
508 } else if (L_fast_path == &L_fallthrough) {
509 bne(xthread, tmp, *L_slow_path);
510 bind(*L_fast_path);
511 } else {
512 Unimplemented();
513 }
514 }
515
516 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
517 if (!VerifyOops) { return; }
518
519 // Pass register number to verify_oop_subroutine
520 const char* b = nullptr;
521 {
522 ResourceMark rm;
523 stringStream ss;
524 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
525 b = code_string(ss.as_string());
526 }
527 BLOCK_COMMENT("verify_oop {");
528
529 push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
530
531 mv(c_rarg0, reg); // c_rarg0 : x10
532 {
533 // The length of the instruction sequence emitted should not depend
534 // on the address of the char buffer so that the size of mach nodes for
535 // scratch emit and normal emit matches.
536 IncompressibleScope scope(this); // Fixed length
537 movptr(t0, (address) b);
538 }
539
540 // Call indirectly to solve generation ordering problem
541 ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
542 jalr(t1);
543
544 pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
545
546 BLOCK_COMMENT("} verify_oop");
547 }
548
549 // Handle the receiver type profile update given the "recv" klass.
550 //
551 // Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
552 // If there are no matching or claimable receiver entries in RD, updates
553 // the polymorphic counter.
554 //
555 // This code expected to run by either the interpreter or JIT-ed code, without
556 // extra synchronization. For safety, receiver cells are claimed atomically, which
557 // avoids grossly misrepresenting the profiles under concurrent updates. For speed,
558 // counter updates are not atomic.
559 //
560 void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) {
561 assert_different_registers(recv, mdp, t0, t1);
562
563 int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0));
564 int end_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit()));
565 int poly_count_offset = in_bytes(CounterData::count_offset());
566 int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
567 int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;
568
569 // Adjust for MDP offsets. Slots are pointer-sized, so is the global offset.
570 base_receiver_offset += mdp_offset;
571 end_receiver_offset += mdp_offset;
572 poly_count_offset += mdp_offset;
573
574 #ifdef ASSERT
575 // We are about to walk the MDO slots without asking for offsets.
576 // Check that our math hits all the right spots.
577 for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
578 int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
579 int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
580 int offset = base_receiver_offset + receiver_step*c;
581 int count_offset = offset + receiver_to_count_step;
582 assert(offset == real_recv_offset, "receiver slot math");
583 assert(count_offset == real_count_offset, "receiver count math");
584 }
585 int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
586 assert(poly_count_offset == real_poly_count_offset, "poly counter math");
587 #endif
588
589 // Corner case: no profile table. Increment poly counter and exit.
590 if (ReceiverTypeData::row_limit() == 0) {
591 increment(Address(mdp, poly_count_offset), DataLayout::counter_increment);
592 return;
593 }
594
595 Register offset = t1;
596
597 Label L_loop_search_receiver, L_loop_search_empty;
598 Label L_restart, L_found_recv, L_found_empty, L_count_update;
599
600 // The code here recognizes three major cases:
601 // A. Fastest: receiver found in the table
602 // B. Fast: no receiver in the table, and the table is full
603 // C. Slow: no receiver in the table, free slots in the table
604 //
605 // The case A performance is most important, as perfectly-behaved code would end up
606 // there, especially with larger TypeProfileWidth. The case B performance is
607 // important as well, this is where bulk of code would land for normally megamorphic
608 // cases. The case C performance is not essential, its job is to deal with installation
609 // races, we optimize for code density instead. Case C needs to make sure that receiver
610 // rows are only claimed once. This makes sure we never overwrite a row for another
611 // receiver and never duplicate the receivers in the list, making profile type-accurate.
612 //
613 // It is very tempting to handle these cases in a single loop, and claim the first slot
614 // without checking the rest of the table. But, profiling code should tolerate free slots
615 // in the table, as class unloading can clear them. After such cleanup, the receiver
616 // we need might be _after_ the free slot. Therefore, we need to let at least full scan
617 // to complete, before trying to install new slots. Splitting the code in several tight
618 // loops also helpfully optimizes for cases A and B.
619 //
620 // This code is effectively:
621 //
622 // restart:
623 // // Fastest: receiver is already installed
624 // for (i = 0; i < receiver_count(); i++) {
625 // if (receiver(i) == recv) goto found_recv(i);
626 // }
627 //
628 // // Fast: no receiver, but profile is not full
629 // for (i = 0; i < receiver_count(); i++) {
630 // if (receiver(i) == null) goto found_null(i);
631 // }
632 //
633 // // Slow: profile is full, polymorphic case
634 // count++;
635 // return
636 //
637 // // Slow: try to install receiver
638 // found_null(i):
639 // CAS(&receiver(i), null, recv);
640 // goto restart
641 //
642 // found_recv(i):
643 // *receiver_count(i)++
644 //
645
646 bind(L_restart);
647
648 // Fastest: receiver is already installed
649 mv(offset, base_receiver_offset);
650 bind(L_loop_search_receiver);
651 add(t0, mdp, offset);
652 ld(t0, Address(t0));
653 beq(recv, t0, L_found_recv);
654 add(offset, offset, receiver_step);
655 sub(t0, offset, end_receiver_offset);
656 bnez(t0, L_loop_search_receiver);
657
658 // Fast: no receiver, but profile is not full
659 mv(offset, base_receiver_offset);
660 bind(L_loop_search_empty);
661 add(t0, mdp, offset);
662 ld(t0, Address(t0));
663 beqz(t0, L_found_empty);
664 add(offset, offset, receiver_step);
665 sub(t0, offset, end_receiver_offset);
666 bnez(t0, L_loop_search_empty);
667
668 // Slow: Receiver is not found and table is full.
669 // Increment polymorphic counter instead of receiver slot.
670 mv(offset, poly_count_offset);
671 j(L_count_update);
672
673 // Slowest: try to install receiver
674 bind(L_found_empty);
675
676 // Atomically swing receiver slot: null -> recv.
677 //
678 // The update uses CAS, which clobbers t0. Therefore, t1
679 // is used to hold the destination address. This is safe because the
680 // offset is no longer needed after the address is computed.
681 add(t1, mdp, offset);
682 weak_cmpxchg(/*addr*/ t1, /*expected*/ zr, /*new*/ recv, Assembler::int64,
683 /*acquire*/ Assembler::relaxed, /*release*/ Assembler::relaxed, /*result*/ t0);
684
685 // CAS success means the slot now has the receiver we want. CAS failure means
686 // something had claimed the slot concurrently: it can be the same receiver we want,
687 // or something else. Since this is a slow path, we can optimize for code density,
688 // and just restart the search from the beginning.
689 j(L_restart);
690
691 // Found a receiver, convert its slot offset to corresponding count offset.
692 bind(L_found_recv);
693 add(offset, offset, receiver_to_count_step);
694
695 // Finally, update the counter
696 bind(L_count_update);
697 add(t1, mdp, offset);
698 increment(Address(t1), DataLayout::counter_increment);
699 }
700
701 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
702 if (!VerifyOops) {
703 return;
704 }
705
706 const char* b = nullptr;
707 {
708 ResourceMark rm;
709 stringStream ss;
710 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
711 b = code_string(ss.as_string());
712 }
713 BLOCK_COMMENT("verify_oop_addr {");
714
715 push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
716
717 if (addr.uses(sp)) {
718 la(x10, addr);
719 ld(x10, Address(x10, 4 * wordSize));
720 } else {
721 ld(x10, addr);
722 }
723
724 {
725 // The length of the instruction sequence emitted should not depend
726 // on the address of the char buffer so that the size of mach nodes for
727 // scratch emit and normal emit matches.
728 IncompressibleScope scope(this); // Fixed length
729 movptr(t0, (address) b);
730 }
731
732 // Call indirectly to solve generation ordering problem
733 ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
734 jalr(t1);
735
736 pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
737
738 BLOCK_COMMENT("} verify_oop_addr");
739 }
740
741 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
742 int extra_slot_offset) {
743 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
744 int stackElementSize = Interpreter::stackElementSize;
745 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
746 #ifdef ASSERT
747 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
748 assert(offset1 - offset == stackElementSize, "correct arithmetic");
749 #endif
750 if (arg_slot.is_constant()) {
751 return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
752 } else {
753 assert_different_registers(t0, arg_slot.as_register());
754 shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
755 return Address(t0, offset);
756 }
757 }
758
759 #ifndef PRODUCT
760 extern "C" void findpc(intptr_t x);
761 #endif
762
763 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
764 {
765 // In order to get locks to work, we need to fake a in_VM state
766 if (ShowMessageBoxOnError) {
767 JavaThread* thread = JavaThread::current();
768 JavaThreadState saved_state = thread->thread_state();
769 thread->set_thread_state(_thread_in_vm);
770 #ifndef PRODUCT
771 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
772 ttyLocker ttyl;
773 BytecodeCounter::print();
774 }
775 #endif
776 if (os::message_box(msg, "Execution stopped, print registers?")) {
777 ttyLocker ttyl;
778 tty->print_cr(" pc = 0x%016lx", pc);
779 #ifndef PRODUCT
780 tty->cr();
781 findpc(pc);
782 tty->cr();
783 #endif
784 tty->print_cr(" x0 = 0x%016lx", regs[0]);
785 tty->print_cr(" x1 = 0x%016lx", regs[1]);
786 tty->print_cr(" x2 = 0x%016lx", regs[2]);
787 tty->print_cr(" x3 = 0x%016lx", regs[3]);
788 tty->print_cr(" x4 = 0x%016lx", regs[4]);
789 tty->print_cr(" x5 = 0x%016lx", regs[5]);
790 tty->print_cr(" x6 = 0x%016lx", regs[6]);
791 tty->print_cr(" x7 = 0x%016lx", regs[7]);
792 tty->print_cr(" x8 = 0x%016lx", regs[8]);
793 tty->print_cr(" x9 = 0x%016lx", regs[9]);
794 tty->print_cr("x10 = 0x%016lx", regs[10]);
795 tty->print_cr("x11 = 0x%016lx", regs[11]);
796 tty->print_cr("x12 = 0x%016lx", regs[12]);
797 tty->print_cr("x13 = 0x%016lx", regs[13]);
798 tty->print_cr("x14 = 0x%016lx", regs[14]);
799 tty->print_cr("x15 = 0x%016lx", regs[15]);
800 tty->print_cr("x16 = 0x%016lx", regs[16]);
801 tty->print_cr("x17 = 0x%016lx", regs[17]);
802 tty->print_cr("x18 = 0x%016lx", regs[18]);
803 tty->print_cr("x19 = 0x%016lx", regs[19]);
804 tty->print_cr("x20 = 0x%016lx", regs[20]);
805 tty->print_cr("x21 = 0x%016lx", regs[21]);
806 tty->print_cr("x22 = 0x%016lx", regs[22]);
807 tty->print_cr("x23 = 0x%016lx", regs[23]);
808 tty->print_cr("x24 = 0x%016lx", regs[24]);
809 tty->print_cr("x25 = 0x%016lx", regs[25]);
810 tty->print_cr("x26 = 0x%016lx", regs[26]);
811 tty->print_cr("x27 = 0x%016lx", regs[27]);
812 tty->print_cr("x28 = 0x%016lx", regs[28]);
813 tty->print_cr("x30 = 0x%016lx", regs[30]);
814 tty->print_cr("x31 = 0x%016lx", regs[31]);
815 BREAKPOINT;
816 }
817 }
818 fatal("DEBUG MESSAGE: %s", msg);
819 }
820
821 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
822 assert_different_registers(value, tmp1, tmp2);
823 Label done, tagged, weak_tagged;
824
825 beqz(value, done); // Use null as-is.
826 // Test for tag.
827 andi(tmp1, value, JNIHandles::tag_mask);
828 bnez(tmp1, tagged);
829
830 // Resolve local handle
831 access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
832 verify_oop(value);
833 j(done);
834
835 bind(tagged);
836 // Test for jweak tag.
837 STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
838 test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
839 bnez(tmp1, weak_tagged);
840
841 // Resolve global handle
842 access_load_at(T_OBJECT, IN_NATIVE, value,
843 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
844 verify_oop(value);
845 j(done);
846
847 bind(weak_tagged);
848 // Resolve jweak.
849 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
850 Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
851 verify_oop(value);
852
853 bind(done);
854 }
855
856 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
857 assert_different_registers(value, tmp1, tmp2);
858 Label done;
859
860 beqz(value, done); // Use null as-is.
861
862 #ifdef ASSERT
863 {
864 STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
865 Label valid_global_tag;
866 test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
867 bnez(tmp1, valid_global_tag);
868 stop("non global jobject using resolve_global_jobject");
869 bind(valid_global_tag);
870 }
871 #endif
872
873 // Resolve global handle
874 access_load_at(T_OBJECT, IN_NATIVE, value,
875 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
876 verify_oop(value);
877
878 bind(done);
879 }
880
881 void MacroAssembler::stop(const char* msg) {
882 BLOCK_COMMENT(msg);
883 illegal_instruction(Assembler::csr::time);
884 emit_int64((uintptr_t)msg);
885 }
886
887 void MacroAssembler::unimplemented(const char* what) {
888 const char* buf = nullptr;
889 {
890 ResourceMark rm;
891 stringStream ss;
892 ss.print("unimplemented: %s", what);
893 buf = code_string(ss.as_string());
894 }
895 stop(buf);
896 }
897
898 void MacroAssembler::emit_static_call_stub() {
899 IncompressibleScope scope(this); // Fixed length: see CompiledDirectCall::to_interp_stub_size().
900 // CompiledDirectCall::set_to_interpreted knows the
901 // exact layout of this stub.
902
903 mov_metadata(xmethod, (Metadata*)nullptr);
904
905 // Jump to the entry point of the c2i stub.
906 int32_t offset = 0;
907 movptr2(t1, 0, offset, t0); // lui + lui + slli + add
908 jr(t1, offset);
909 }
910
911 void MacroAssembler::call_VM_leaf_base(address entry_point,
912 int number_of_arguments,
913 Label *retaddr) {
914 int32_t offset = 0;
915 push_reg(RegSet::of(t1, xmethod), sp); // push << t1 & xmethod >> to sp
916 movptr(t1, entry_point, offset, t0);
917 jalr(t1, offset);
918 if (retaddr != nullptr) {
919 bind(*retaddr);
920 }
921 pop_reg(RegSet::of(t1, xmethod), sp); // pop << t1 & xmethod >> from sp
922 }
923
924 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
925 call_VM_leaf_base(entry_point, number_of_arguments);
926 }
927
928 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
929 pass_arg0(this, arg_0);
930 call_VM_leaf_base(entry_point, 1);
931 }
932
933 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
934 assert_different_registers(arg_1, c_rarg0);
935 pass_arg0(this, arg_0);
936 pass_arg1(this, arg_1);
937 call_VM_leaf_base(entry_point, 2);
938 }
939
940 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
941 Register arg_1, Register arg_2) {
942 assert_different_registers(arg_1, c_rarg0);
943 assert_different_registers(arg_2, c_rarg0, c_rarg1);
944 pass_arg0(this, arg_0);
945 pass_arg1(this, arg_1);
946 pass_arg2(this, arg_2);
947 call_VM_leaf_base(entry_point, 3);
948 }
949
950 void MacroAssembler::super_call_VM_leaf(address entry_point) {
951 MacroAssembler::call_VM_leaf_base(entry_point, 1);
952 }
953
954 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
955 pass_arg0(this, arg_0);
956 MacroAssembler::call_VM_leaf_base(entry_point, 1);
957 }
958
959 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
960
961 assert_different_registers(arg_0, c_rarg1);
962 pass_arg1(this, arg_1);
963 pass_arg0(this, arg_0);
964 MacroAssembler::call_VM_leaf_base(entry_point, 2);
965 }
966
967 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
968 assert_different_registers(arg_0, c_rarg1, c_rarg2);
969 assert_different_registers(arg_1, c_rarg2);
970 pass_arg2(this, arg_2);
971 pass_arg1(this, arg_1);
972 pass_arg0(this, arg_0);
973 MacroAssembler::call_VM_leaf_base(entry_point, 3);
974 }
975
976 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
977 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
978 assert_different_registers(arg_1, c_rarg2, c_rarg3);
979 assert_different_registers(arg_2, c_rarg3);
980
981 pass_arg3(this, arg_3);
982 pass_arg2(this, arg_2);
983 pass_arg1(this, arg_1);
984 pass_arg0(this, arg_0);
985 MacroAssembler::call_VM_leaf_base(entry_point, 4);
986 }
987
988 void MacroAssembler::la(Register Rd, const address addr) {
989 int32_t offset;
990 la(Rd, addr, offset);
991 addi(Rd, Rd, offset);
992 }
993
994 void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
995 int64_t distance = addr - pc();
996 assert(is_valid_32bit_offset(distance), "Must be");
997 auipc(Rd, (int32_t)distance + 0x800);
998 offset = ((int32_t)distance << 20) >> 20;
999 }
1000
1001 // Materialize with auipc + addi sequence if adr is a literal
1002 // address inside code cache. Emit a movptr sequence otherwise.
1003 void MacroAssembler::la(Register Rd, const Address &adr) {
1004 switch (adr.getMode()) {
1005 case Address::literal: {
1006 relocInfo::relocType rtype = adr.rspec().reloc()->type();
1007 if (rtype == relocInfo::none) {
1008 mv(Rd, (intptr_t)(adr.target()));
1009 } else {
1010 if (CodeCache::contains(adr.target())) {
1011 relocate(adr.rspec(), [&] {
1012 la(Rd, adr.target());
1013 });
1014 } else {
1015 relocate(adr.rspec(), [&] {
1016 movptr(Rd, adr.target());
1017 });
1018 }
1019 }
1020 break;
1021 }
1022 case Address::base_plus_offset: {
1023 Address new_adr = legitimize_address(Rd, adr);
1024 if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
1025 addi(Rd, new_adr.base(), new_adr.offset());
1026 }
1027 break;
1028 }
1029 default:
1030 ShouldNotReachHere();
1031 }
1032 }
1033
1034 void MacroAssembler::la(Register Rd, Label &label) {
1035 IncompressibleScope scope(this); // the label address may be patched back.
1036 wrap_label(Rd, label, &MacroAssembler::la);
1037 }
1038
1039 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
1040 lui(Rd, (uint32_t)imm << 12);
1041 srli(Rd, Rd, 12);
1042 }
1043
1044 void MacroAssembler::li32(Register Rd, int32_t imm) {
1045 // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
1046 int64_t upper = imm, lower = imm;
1047 lower = (imm << 20) >> 20;
1048 upper -= lower;
1049 upper = (int32_t)upper;
1050 // lui Rd, imm[31:12] + imm[11]
1051 lui(Rd, upper);
1052 addiw(Rd, Rd, lower);
1053 }
1054
1055 void MacroAssembler::li(Register Rd, int64_t imm) {
1056 // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
1057 // li -> c.li
1058 if (do_compress() && (is_simm6(imm) && Rd != x0)) {
1059 c_li(Rd, imm);
1060 return;
1061 }
1062
1063 int shift = 12;
1064 int64_t upper = imm, lower = imm;
1065 // Split imm to a lower 12-bit sign-extended part and the remainder,
1066 // because addi will sign-extend the lower imm.
1067 lower = ((int32_t)imm << 20) >> 20;
1068 upper -= lower;
1069
1070 // Test whether imm is a 32-bit integer.
1071 if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
1072 (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
1073 while (((upper >> shift) & 1) == 0) { shift++; }
1074 upper >>= shift;
1075 li(Rd, upper);
1076 slli(Rd, Rd, shift);
1077 if (lower != 0) {
1078 addi(Rd, Rd, lower);
1079 }
1080 } else {
1081 // 32-bit integer
1082 Register hi_Rd = zr;
1083 if (upper != 0) {
1084 lui(Rd, (int32_t)upper);
1085 hi_Rd = Rd;
1086 }
1087 if (lower != 0 || hi_Rd == zr) {
1088 addiw(Rd, hi_Rd, lower);
1089 }
1090 }
1091 }
1092
1093 void MacroAssembler::j(const address dest, Register temp) {
1094 assert(CodeCache::contains(dest), "Must be");
1095 assert_cond(dest != nullptr);
1096 int64_t distance = dest - pc();
1097
1098 // We can't patch C, i.e. if Label wasn't bound we need to patch this jump.
1099 IncompressibleScope scope(this);
1100 if (is_simm21(distance) && ((distance % 2) == 0)) {
1101 Assembler::jal(x0, distance);
1102 } else {
1103 assert(temp != noreg && temp != x0, "Expecting a register");
1104 assert(temp != x1 && temp != x5, "temp register must not be x1/x5.");
1105 int32_t offset = 0;
1106 la(temp, dest, offset);
1107 jr(temp, offset);
1108 }
1109 }
1110
1111 void MacroAssembler::j(const Address &dest, Register temp) {
1112 switch (dest.getMode()) {
1113 case Address::literal: {
1114 if (CodeCache::contains(dest.target())) {
1115 far_jump(dest, temp);
1116 } else {
1117 relocate(dest.rspec(), [&] {
1118 int32_t offset;
1119 movptr(temp, dest.target(), offset);
1120 jr(temp, offset);
1121 });
1122 }
1123 break;
1124 }
1125 case Address::base_plus_offset: {
1126 int32_t offset = ((int32_t)dest.offset() << 20) >> 20;
1127 la(temp, Address(dest.base(), dest.offset() - offset));
1128 jr(temp, offset);
1129 break;
1130 }
1131 default:
1132 ShouldNotReachHere();
1133 }
1134 }
1135
1136 void MacroAssembler::j(Label &lab, Register temp) {
1137 assert_different_registers(x0, temp);
1138 if (lab.is_bound()) {
1139 MacroAssembler::j(target(lab), temp);
1140 } else {
1141 lab.add_patch_at(code(), locator());
1142 MacroAssembler::j(pc(), temp);
1143 }
1144 }
1145
1146 void MacroAssembler::jr(Register Rd, int32_t offset) {
1147 assert(Rd != noreg, "expecting a register");
1148 assert(Rd != x1 && Rd != x5, "Rd register must not be x1/x5.");
1149 Assembler::jalr(x0, Rd, offset);
1150 }
1151
1152 void MacroAssembler::call(const address dest, Register temp) {
1153 assert_cond(dest != nullptr);
1154 assert(temp != noreg, "expecting a register");
1155 assert(temp != x5, "temp register must not be x5.");
1156 int32_t offset = 0;
1157 la(temp, dest, offset);
1158 jalr(temp, offset);
1159 }
1160
1161 void MacroAssembler::jalr(Register Rs, int32_t offset) {
1162 assert(Rs != noreg, "expecting a register");
1163 assert(Rs != x5, "Rs register must not be x5.");
1164 Assembler::jalr(x1, Rs, offset);
1165 }
1166
1167 void MacroAssembler::rt_call(address dest, Register tmp) {
1168 assert(tmp != x5, "tmp register must not be x5.");
1169 RuntimeAddress target(dest);
1170 if (CodeCache::contains(dest)) {
1171 far_call(target, tmp);
1172 } else {
1173 relocate(target.rspec(), [&] {
1174 int32_t offset;
1175 movptr(tmp, target.target(), offset);
1176 jalr(tmp, offset);
1177 });
1178 }
1179 }
1180
1181 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
1182 if (L.is_bound()) {
1183 (this->*insn)(Rt, target(L));
1184 } else {
1185 L.add_patch_at(code(), locator());
1186 (this->*insn)(Rt, pc());
1187 }
1188 }
1189
1190 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
1191 compare_and_branch_insn insn,
1192 compare_and_branch_label_insn neg_insn, bool is_far) {
1193 if (is_far) {
1194 Label done;
1195 (this->*neg_insn)(r1, r2, done, /* is_far */ false);
1196 j(L);
1197 bind(done);
1198 } else {
1199 if (L.is_bound()) {
1200 (this->*insn)(r1, r2, target(L));
1201 } else {
1202 L.add_patch_at(code(), locator());
1203 (this->*insn)(r1, r2, pc());
1204 }
1205 }
1206 }
1207
1208 #define INSN(NAME, NEG_INSN) \
1209 void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) { \
1210 wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far); \
1211 }
1212
1213 INSN(beq, bne);
1214 INSN(bne, beq);
1215 INSN(blt, bge);
1216 INSN(bge, blt);
1217 INSN(bltu, bgeu);
1218 INSN(bgeu, bltu);
1219
1220 #undef INSN
1221
1222 #define INSN(NAME) \
1223 void MacroAssembler::NAME##z(Register Rs, const address dest) { \
1224 NAME(Rs, zr, dest); \
1225 } \
1226 void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) { \
1227 NAME(Rs, zr, l, is_far); \
1228 } \
1229
1230 INSN(beq);
1231 INSN(bne);
1232 INSN(blt);
1233 INSN(ble);
1234 INSN(bge);
1235 INSN(bgt);
1236
1237 #undef INSN
1238
1239 #define INSN(NAME, NEG_INSN) \
1240 void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) { \
1241 NEG_INSN(Rt, Rs, dest); \
1242 } \
1243 void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) { \
1244 NEG_INSN(Rt, Rs, l, is_far); \
1245 }
1246
1247 INSN(bgt, blt);
1248 INSN(ble, bge);
1249 INSN(bgtu, bltu);
1250 INSN(bleu, bgeu);
1251
1252 #undef INSN
1253
1254 // cmov
1255 void MacroAssembler::cmov_eq(Register cmp1, Register cmp2, Register dst, Register src) {
1256 if (UseZicond) {
1257 xorr(t0, cmp1, cmp2);
1258 czero_eqz(dst, dst, t0);
1259 czero_nez(t0 , src, t0);
1260 orr(dst, dst, t0);
1261 return;
1262 }
1263 Label no_set;
1264 bne(cmp1, cmp2, no_set);
1265 mv(dst, src);
1266 bind(no_set);
1267 }
1268
1269 void MacroAssembler::cmov_ne(Register cmp1, Register cmp2, Register dst, Register src) {
1270 if (UseZicond) {
1271 xorr(t0, cmp1, cmp2);
1272 czero_nez(dst, dst, t0);
1273 czero_eqz(t0 , src, t0);
1274 orr(dst, dst, t0);
1275 return;
1276 }
1277 Label no_set;
1278 beq(cmp1, cmp2, no_set);
1279 mv(dst, src);
1280 bind(no_set);
1281 }
1282
1283 void MacroAssembler::cmov_le(Register cmp1, Register cmp2, Register dst, Register src) {
1284 if (UseZicond) {
1285 slt(t0, cmp2, cmp1);
1286 czero_eqz(dst, dst, t0);
1287 czero_nez(t0, src, t0);
1288 orr(dst, dst, t0);
1289 return;
1290 }
1291 Label no_set;
1292 bgt(cmp1, cmp2, no_set);
1293 mv(dst, src);
1294 bind(no_set);
1295 }
1296
1297 void MacroAssembler::cmov_leu(Register cmp1, Register cmp2, Register dst, Register src) {
1298 if (UseZicond) {
1299 sltu(t0, cmp2, cmp1);
1300 czero_eqz(dst, dst, t0);
1301 czero_nez(t0, src, t0);
1302 orr(dst, dst, t0);
1303 return;
1304 }
1305 Label no_set;
1306 bgtu(cmp1, cmp2, no_set);
1307 mv(dst, src);
1308 bind(no_set);
1309 }
1310
1311 void MacroAssembler::cmov_ge(Register cmp1, Register cmp2, Register dst, Register src) {
1312 if (UseZicond) {
1313 slt(t0, cmp1, cmp2);
1314 czero_eqz(dst, dst, t0);
1315 czero_nez(t0, src, t0);
1316 orr(dst, dst, t0);
1317 return;
1318 }
1319 Label no_set;
1320 blt(cmp1, cmp2, no_set);
1321 mv(dst, src);
1322 bind(no_set);
1323 }
1324
1325 void MacroAssembler::cmov_geu(Register cmp1, Register cmp2, Register dst, Register src) {
1326 if (UseZicond) {
1327 sltu(t0, cmp1, cmp2);
1328 czero_eqz(dst, dst, t0);
1329 czero_nez(t0, src, t0);
1330 orr(dst, dst, t0);
1331 return;
1332 }
1333 Label no_set;
1334 bltu(cmp1, cmp2, no_set);
1335 mv(dst, src);
1336 bind(no_set);
1337 }
1338
1339 void MacroAssembler::cmov_lt(Register cmp1, Register cmp2, Register dst, Register src) {
1340 if (UseZicond) {
1341 slt(t0, cmp1, cmp2);
1342 czero_nez(dst, dst, t0);
1343 czero_eqz(t0, src, t0);
1344 orr(dst, dst, t0);
1345 return;
1346 }
1347 Label no_set;
1348 bge(cmp1, cmp2, no_set);
1349 mv(dst, src);
1350 bind(no_set);
1351 }
1352
1353 void MacroAssembler::cmov_ltu(Register cmp1, Register cmp2, Register dst, Register src) {
1354 if (UseZicond) {
1355 sltu(t0, cmp1, cmp2);
1356 czero_nez(dst, dst, t0);
1357 czero_eqz(t0, src, t0);
1358 orr(dst, dst, t0);
1359 return;
1360 }
1361 Label no_set;
1362 bgeu(cmp1, cmp2, no_set);
1363 mv(dst, src);
1364 bind(no_set);
1365 }
1366
1367 void MacroAssembler::cmov_gt(Register cmp1, Register cmp2, Register dst, Register src) {
1368 if (UseZicond) {
1369 slt(t0, cmp2, cmp1);
1370 czero_nez(dst, dst, t0);
1371 czero_eqz(t0, src, t0);
1372 orr(dst, dst, t0);
1373 return;
1374 }
1375 Label no_set;
1376 ble(cmp1, cmp2, no_set);
1377 mv(dst, src);
1378 bind(no_set);
1379 }
1380
1381 void MacroAssembler::cmov_gtu(Register cmp1, Register cmp2, Register dst, Register src) {
1382 if (UseZicond) {
1383 sltu(t0, cmp2, cmp1);
1384 czero_nez(dst, dst, t0);
1385 czero_eqz(t0, src, t0);
1386 orr(dst, dst, t0);
1387 return;
1388 }
1389 Label no_set;
1390 bleu(cmp1, cmp2, no_set);
1391 mv(dst, src);
1392 bind(no_set);
1393 }
1394
1395 // ----------- cmove float/double -----------
1396
1397 void MacroAssembler::cmov_fp_eq(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1398 Label no_set;
1399 bne(cmp1, cmp2, no_set);
1400 if (is_single) {
1401 fmv_s(dst, src);
1402 } else {
1403 fmv_d(dst, src);
1404 }
1405 bind(no_set);
1406 }
1407
1408 void MacroAssembler::cmov_fp_ne(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1409 Label no_set;
1410 beq(cmp1, cmp2, no_set);
1411 if (is_single) {
1412 fmv_s(dst, src);
1413 } else {
1414 fmv_d(dst, src);
1415 }
1416 bind(no_set);
1417 }
1418
1419 void MacroAssembler::cmov_fp_le(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1420 Label no_set;
1421 bgt(cmp1, cmp2, no_set);
1422 if (is_single) {
1423 fmv_s(dst, src);
1424 } else {
1425 fmv_d(dst, src);
1426 }
1427 bind(no_set);
1428 }
1429
1430 void MacroAssembler::cmov_fp_leu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1431 Label no_set;
1432 bgtu(cmp1, cmp2, no_set);
1433 if (is_single) {
1434 fmv_s(dst, src);
1435 } else {
1436 fmv_d(dst, src);
1437 }
1438 bind(no_set);
1439 }
1440
1441 void MacroAssembler::cmov_fp_ge(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1442 Label no_set;
1443 blt(cmp1, cmp2, no_set);
1444 if (is_single) {
1445 fmv_s(dst, src);
1446 } else {
1447 fmv_d(dst, src);
1448 }
1449 bind(no_set);
1450 }
1451
1452 void MacroAssembler::cmov_fp_geu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1453 Label no_set;
1454 bltu(cmp1, cmp2, no_set);
1455 if (is_single) {
1456 fmv_s(dst, src);
1457 } else {
1458 fmv_d(dst, src);
1459 }
1460 bind(no_set);
1461 }
1462
1463 void MacroAssembler::cmov_fp_lt(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1464 Label no_set;
1465 bge(cmp1, cmp2, no_set);
1466 if (is_single) {
1467 fmv_s(dst, src);
1468 } else {
1469 fmv_d(dst, src);
1470 }
1471 bind(no_set);
1472 }
1473
1474 void MacroAssembler::cmov_fp_ltu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1475 Label no_set;
1476 bgeu(cmp1, cmp2, no_set);
1477 if (is_single) {
1478 fmv_s(dst, src);
1479 } else {
1480 fmv_d(dst, src);
1481 }
1482 bind(no_set);
1483 }
1484
1485 void MacroAssembler::cmov_fp_gt(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1486 Label no_set;
1487 ble(cmp1, cmp2, no_set);
1488 if (is_single) {
1489 fmv_s(dst, src);
1490 } else {
1491 fmv_d(dst, src);
1492 }
1493 bind(no_set);
1494 }
1495
1496 void MacroAssembler::cmov_fp_gtu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1497 Label no_set;
1498 bleu(cmp1, cmp2, no_set);
1499 if (is_single) {
1500 fmv_s(dst, src);
1501 } else {
1502 fmv_d(dst, src);
1503 }
1504 bind(no_set);
1505 }
1506
1507 // ----------- cmove, compare float/double -----------
1508 //
// For CmpF/D + CMoveI/L, the ordered cases are straightforward,
// so only the behaviour of the unordered cases is listed below.
1511 //
1512 // Set dst (CMoveI (Binary cop (CmpF/D op1 op2)) (Binary dst src))
1513 // (If one or both inputs to the compare are NaN, then)
1514 // 1. (op1 lt op2) => true => CMove: dst = src
1515 // 2. (op1 le op2) => true => CMove: dst = src
1516 // 3. (op1 gt op2) => false => CMove: dst = dst
1517 // 4. (op1 ge op2) => false => CMove: dst = dst
1518 // 5. (op1 eq op2) => false => CMove: dst = dst
1519 // 6. (op1 ne op2) => true => CMove: dst = src
1520
1521 void MacroAssembler::cmov_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1522 if (UseZicond) {
1523 if (is_single) {
1524 feq_s(t0, cmp1, cmp2);
1525 } else {
1526 feq_d(t0, cmp1, cmp2);
1527 }
1528 czero_nez(dst, dst, t0);
1529 czero_eqz(t0 , src, t0);
1530 orr(dst, dst, t0);
1531 return;
1532 }
1533 Label no_set;
1534 if (is_single) {
1535 // jump if cmp1 != cmp2, including the case of NaN
1536 // fallthrough (i.e. move src to dst) if cmp1 == cmp2
1537 float_bne(cmp1, cmp2, no_set);
1538 } else {
1539 double_bne(cmp1, cmp2, no_set);
1540 }
1541 mv(dst, src);
1542 bind(no_set);
1543 }
1544
1545 void MacroAssembler::cmov_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1546 if (UseZicond) {
1547 if (is_single) {
1548 feq_s(t0, cmp1, cmp2);
1549 } else {
1550 feq_d(t0, cmp1, cmp2);
1551 }
1552 czero_eqz(dst, dst, t0);
1553 czero_nez(t0 , src, t0);
1554 orr(dst, dst, t0);
1555 return;
1556 }
1557 Label no_set;
1558 if (is_single) {
1559 // jump if cmp1 == cmp2
1560 // fallthrough (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1561 float_beq(cmp1, cmp2, no_set);
1562 } else {
1563 double_beq(cmp1, cmp2, no_set);
1564 }
1565 mv(dst, src);
1566 bind(no_set);
1567 }
1568
1569 void MacroAssembler::cmov_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1570 if (UseZicond) {
1571 if (is_single) {
1572 flt_s(t0, cmp2, cmp1);
1573 } else {
1574 flt_d(t0, cmp2, cmp1);
1575 }
1576 czero_eqz(dst, dst, t0);
1577 czero_nez(t0 , src, t0);
1578 orr(dst, dst, t0);
1579 return;
1580 }
1581 Label no_set;
1582 if (is_single) {
1583 // jump if cmp1 > cmp2
1584 // fallthrough (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1585 float_bgt(cmp1, cmp2, no_set);
1586 } else {
1587 double_bgt(cmp1, cmp2, no_set);
1588 }
1589 mv(dst, src);
1590 bind(no_set);
1591 }
1592
1593 void MacroAssembler::cmov_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1594 if (UseZicond) {
1595 if (is_single) {
1596 fle_s(t0, cmp2, cmp1);
1597 } else {
1598 fle_d(t0, cmp2, cmp1);
1599 }
1600 czero_nez(dst, dst, t0);
1601 czero_eqz(t0 , src, t0);
1602 orr(dst, dst, t0);
1603 return;
1604 }
1605 Label no_set;
1606 if (is_single) {
1607 // jump if cmp1 < cmp2 or either is NaN
1608 // fallthrough (i.e. move src to dst) if cmp1 >= cmp2
1609 float_blt(cmp1, cmp2, no_set, false, true);
1610 } else {
1611 double_blt(cmp1, cmp2, no_set, false, true);
1612 }
1613 mv(dst, src);
1614 bind(no_set);
1615 }
1616
1617 void MacroAssembler::cmov_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1618 if (UseZicond) {
1619 if (is_single) {
1620 fle_s(t0, cmp2, cmp1);
1621 } else {
1622 fle_d(t0, cmp2, cmp1);
1623 }
1624 czero_eqz(dst, dst, t0);
1625 czero_nez(t0 , src, t0);
1626 orr(dst, dst, t0);
1627 return;
1628 }
1629 Label no_set;
1630 if (is_single) {
1631 // jump if cmp1 >= cmp2
1632 // fallthrough (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1633 float_bge(cmp1, cmp2, no_set);
1634 } else {
1635 double_bge(cmp1, cmp2, no_set);
1636 }
1637 mv(dst, src);
1638 bind(no_set);
1639 }
1640
1641 void MacroAssembler::cmov_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1642 if (UseZicond) {
1643 if (is_single) {
1644 flt_s(t0, cmp2, cmp1);
1645 } else {
1646 flt_d(t0, cmp2, cmp1);
1647 }
1648 czero_nez(dst, dst, t0);
1649 czero_eqz(t0 , src, t0);
1650 orr(dst, dst, t0);
1651 return;
1652 }
1653 Label no_set;
1654 if (is_single) {
1655 // jump if cmp1 <= cmp2 or either is NaN
1656 // fallthrough (i.e. move src to dst) if cmp1 > cmp2
1657 float_ble(cmp1, cmp2, no_set, false, true);
1658 } else {
1659 double_ble(cmp1, cmp2, no_set, false, true);
1660 }
1661 mv(dst, src);
1662 bind(no_set);
1663 }
1664
1665 // ----------- cmove float/double, compare float/double -----------
1666
1667 // Move src to dst only if cmp1 == cmp2,
1668 // otherwise leave dst unchanged, including the case where one of them is NaN.
1669 // Clarification:
1670 // java code : cmp1 != cmp2 ? dst : src
1671 // transformed to : CMove dst, (cmp1 eq cmp2), dst, src
1672 void MacroAssembler::cmov_fp_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2,
1673 FloatRegister dst, FloatRegister src,
1674 bool cmp_single, bool cmov_single) {
1675 Label no_set;
1676 if (cmp_single) {
1677 // jump if cmp1 != cmp2, including the case of NaN
1678 // not jump (i.e. move src to dst) if cmp1 == cmp2
1679 float_bne(cmp1, cmp2, no_set);
1680 } else {
1681 double_bne(cmp1, cmp2, no_set);
1682 }
1683 if (cmov_single) {
1684 fmv_s(dst, src);
1685 } else {
1686 fmv_d(dst, src);
1687 }
1688 bind(no_set);
1689 }
1690
1691 // Keep dst unchanged only if cmp1 == cmp2,
1692 // otherwise move src to dst, including the case where one of them is NaN.
1693 // Clarification:
1694 // java code : cmp1 == cmp2 ? dst : src
1695 // transformed to : CMove dst, (cmp1 ne cmp2), dst, src
1696 void MacroAssembler::cmov_fp_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2,
1697 FloatRegister dst, FloatRegister src,
1698 bool cmp_single, bool cmov_single) {
1699 Label no_set;
1700 if (cmp_single) {
1701 // jump if cmp1 == cmp2
1702 // not jump (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1703 float_beq(cmp1, cmp2, no_set);
1704 } else {
1705 double_beq(cmp1, cmp2, no_set);
1706 }
1707 if (cmov_single) {
1708 fmv_s(dst, src);
1709 } else {
1710 fmv_d(dst, src);
1711 }
1712 bind(no_set);
1713 }
1714
1715 // When cmp1 <= cmp2 or any of them is NaN then dst = src, otherwise, dst = dst
1716 // Clarification
1717 // scenario 1:
1718 // java code : cmp2 < cmp1 ? dst : src
1719 // transformed to : CMove dst, (cmp1 le cmp2), dst, src
1720 // scenario 2:
1721 // java code : cmp1 > cmp2 ? dst : src
1722 // transformed to : CMove dst, (cmp1 le cmp2), dst, src
1723 void MacroAssembler::cmov_fp_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2,
1724 FloatRegister dst, FloatRegister src,
1725 bool cmp_single, bool cmov_single) {
1726 Label no_set;
1727 if (cmp_single) {
1728 // jump if cmp1 > cmp2
1729 // not jump (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1730 float_bgt(cmp1, cmp2, no_set);
1731 } else {
1732 double_bgt(cmp1, cmp2, no_set);
1733 }
1734 if (cmov_single) {
1735 fmv_s(dst, src);
1736 } else {
1737 fmv_d(dst, src);
1738 }
1739 bind(no_set);
1740 }
1741
1742 void MacroAssembler::cmov_fp_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2,
1743 FloatRegister dst, FloatRegister src,
1744 bool cmp_single, bool cmov_single) {
1745 Label no_set;
1746 if (cmp_single) {
1747 // jump if cmp1 < cmp2 or either is NaN
1748 // not jump (i.e. move src to dst) if cmp1 >= cmp2
1749 float_blt(cmp1, cmp2, no_set, false, true);
1750 } else {
1751 double_blt(cmp1, cmp2, no_set, false, true);
1752 }
1753 if (cmov_single) {
1754 fmv_s(dst, src);
1755 } else {
1756 fmv_d(dst, src);
1757 }
1758 bind(no_set);
1759 }
1760
1761 // When cmp1 < cmp2 or any of them is NaN then dst = src, otherwise, dst = dst
1762 // Clarification
1763 // scenario 1:
1764 // java code : cmp2 <= cmp1 ? dst : src
1765 // transformed to : CMove dst, (cmp1 lt cmp2), dst, src
1766 // scenario 2:
1767 // java code : cmp1 >= cmp2 ? dst : src
1768 // transformed to : CMove dst, (cmp1 lt cmp2), dst, src
1769 void MacroAssembler::cmov_fp_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2,
1770 FloatRegister dst, FloatRegister src,
1771 bool cmp_single, bool cmov_single) {
1772 Label no_set;
1773 if (cmp_single) {
1774 // jump if cmp1 >= cmp2
1775 // not jump (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1776 float_bge(cmp1, cmp2, no_set);
1777 } else {
1778 double_bge(cmp1, cmp2, no_set);
1779 }
1780 if (cmov_single) {
1781 fmv_s(dst, src);
1782 } else {
1783 fmv_d(dst, src);
1784 }
1785 bind(no_set);
1786 }
1787
1788 void MacroAssembler::cmov_fp_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2,
1789 FloatRegister dst, FloatRegister src,
1790 bool cmp_single, bool cmov_single) {
1791 Label no_set;
1792 if (cmp_single) {
1793 // jump if cmp1 <= cmp2 or either is NaN
1794 // not jump (i.e. move src to dst) if cmp1 > cmp2
1795 float_ble(cmp1, cmp2, no_set, false, true);
1796 } else {
1797 double_ble(cmp1, cmp2, no_set, false, true);
1798 }
1799 if (cmov_single) {
1800 fmv_s(dst, src);
1801 } else {
1802 fmv_d(dst, src);
1803 }
1804 bind(no_set);
1805 }
1806
1807 // Float compare branch instructions
1808
1809 #define INSN(NAME, FLOATCMP, BRANCH) \
1810 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) { \
1811 FLOATCMP##_s(t0, Rs1, Rs2); \
1812 BRANCH(t0, l, is_far); \
1813 } \
1814 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) { \
1815 FLOATCMP##_d(t0, Rs1, Rs2); \
1816 BRANCH(t0, l, is_far); \
1817 }
1818
1819 INSN(beq, feq, bnez);
1820 INSN(bne, feq, beqz);
1821
1822 #undef INSN
1823
1824
1825 #define INSN(NAME, FLOATCMP1, FLOATCMP2) \
1826 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1827 bool is_far, bool is_unordered) { \
1828 if (is_unordered) { \
1829 /* jump if either source is NaN or condition is expected */ \
1830 FLOATCMP2##_s(t0, Rs2, Rs1); \
1831 beqz(t0, l, is_far); \
1832 } else { \
1833 /* jump if no NaN in source and condition is expected */ \
1834 FLOATCMP1##_s(t0, Rs1, Rs2); \
1835 bnez(t0, l, is_far); \
1836 } \
1837 } \
1838 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1839 bool is_far, bool is_unordered) { \
1840 if (is_unordered) { \
1841 /* jump if either source is NaN or condition is expected */ \
1842 FLOATCMP2##_d(t0, Rs2, Rs1); \
1843 beqz(t0, l, is_far); \
1844 } else { \
1845 /* jump if no NaN in source and condition is expected */ \
1846 FLOATCMP1##_d(t0, Rs1, Rs2); \
1847 bnez(t0, l, is_far); \
1848 } \
1849 }
1850
1851 INSN(ble, fle, flt);
1852 INSN(blt, flt, fle);
1853
1854 #undef INSN
1855
1856 #define INSN(NAME, CMP) \
1857 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1858 bool is_far, bool is_unordered) { \
1859 float_##CMP(Rs2, Rs1, l, is_far, is_unordered); \
1860 } \
1861 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1862 bool is_far, bool is_unordered) { \
1863 double_##CMP(Rs2, Rs1, l, is_far, is_unordered); \
1864 }
1865
1866 INSN(bgt, blt);
1867 INSN(bge, ble);
1868
1869 #undef INSN
1870
1871 void MacroAssembler::csrr(Register Rd, unsigned csr) {
1872 // These three are specified in zicntr and are unused.
1873 // Before adding use-cases add the appropriate hwprobe and flag.
1874 assert(csr != CSR_INSTRET && csr != CSR_CYCLE && csr != CSR_TIME,
1875 "Not intended for use without enabling zicntr.");
1876 csrrs(Rd, csr, x0);
1877 }
1878
1879 #define INSN(NAME, OPFUN) \
1880 void MacroAssembler::NAME(unsigned csr, Register Rs) { \
1881 OPFUN(x0, csr, Rs); \
1882 }
1883
1884 INSN(csrw, csrrw);
1885 INSN(csrs, csrrs);
1886 INSN(csrc, csrrc);
1887
1888 #undef INSN
1889
1890 #define INSN(NAME, OPFUN) \
1891 void MacroAssembler::NAME(unsigned csr, unsigned imm) { \
1892 OPFUN(x0, csr, imm); \
1893 }
1894
1895 INSN(csrwi, csrrwi);
1896 INSN(csrsi, csrrsi);
1897 INSN(csrci, csrrci);
1898
1899 #undef INSN
1900
1901 #define INSN(NAME, CSR) \
1902 void MacroAssembler::NAME(Register Rd, Register Rs) { \
1903 csrrw(Rd, CSR, Rs); \
1904 }
1905
1906 INSN(fscsr, CSR_FCSR);
1907 INSN(fsrm, CSR_FRM);
1908 INSN(fsflags, CSR_FFLAGS);
1909
1910 #undef INSN
1911
1912 #define INSN(NAME) \
1913 void MacroAssembler::NAME(Register Rs) { \
1914 NAME(x0, Rs); \
1915 }
1916
1917 INSN(fscsr);
1918 INSN(fsrm);
1919 INSN(fsflags);
1920
1921 #undef INSN
1922
1923 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
1924 guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1925 csrrwi(Rd, CSR_FRM, imm);
1926 }
1927
1928 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1929 csrrwi(Rd, CSR_FFLAGS, imm);
1930 }
1931
1932 #define INSN(NAME) \
1933 void MacroAssembler::NAME(unsigned imm) { \
1934 NAME(x0, imm); \
1935 }
1936
1937 INSN(fsrmi);
1938 INSN(fsflagsi);
1939
1940 #undef INSN
1941
1942 void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
1943 if (RestoreMXCSROnJNICalls) {
1944 Label skip_fsrmi;
1945 frrm(tmp);
1946 // Set FRM to the state we need. We do want Round to Nearest.
1947 // We don't want non-IEEE rounding modes.
1948 guarantee(RoundingMode::rne == 0, "must be");
1949 beqz(tmp, skip_fsrmi); // Only reset FRM if it's wrong
1950 fsrmi(RoundingMode::rne);
1951 bind(skip_fsrmi);
1952 }
1953 }
1954
1955 void MacroAssembler::push_reg(Register Rs) {
1956 subi(esp, esp, wordSize);
1957 sd(Rs, Address(esp, 0));
1958 }
1959
1960 void MacroAssembler::pop_reg(Register Rd) {
1961 ld(Rd, Address(esp, 0));
1962 addi(esp, esp, wordSize);
1963 }
1964
1965 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1966 int count = 0;
1967 // Scan bitset to accumulate register pairs
1968 for (int reg = 31; reg >= 0; reg--) {
1969 if ((1U << 31) & bitset) {
1970 regs[count++] = reg;
1971 }
1972 bitset <<= 1;
1973 }
1974 return count;
1975 }
1976
1977 // Push integer registers in the bitset supplied. Don't push sp.
1978 // Return the number of words pushed
1979 int MacroAssembler::push_reg(RegSet regset, Register stack) {
1980 if (regset.bits() == 0) {
1981 return 0;
1982 }
1983 auto bitset = integer_cast<unsigned int>(regset.bits());
1984 DEBUG_ONLY(int words_pushed = 0;)
1985 unsigned char regs[32];
1986 int count = bitset_to_regs(bitset, regs);
1987 // reserve one slot to align for odd count
1988 int offset = is_even(count) ? 0 : wordSize;
1989
1990 if (count) {
1991 sub(stack, stack, count * wordSize + offset);
1992 }
1993 for (int i = count - 1; i >= 0; i--) {
1994 sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1995 DEBUG_ONLY(words_pushed++;)
1996 }
1997
1998 assert(words_pushed == count, "oops, pushed != count");
1999
2000 return count;
2001 }
2002
2003 int MacroAssembler::pop_reg(RegSet regset, Register stack) {
2004 if (regset.bits() == 0) {
2005 return 0;
2006 }
2007 auto bitset = integer_cast<unsigned int>(regset.bits());
2008 DEBUG_ONLY(int words_popped = 0;)
2009 unsigned char regs[32];
2010 int count = bitset_to_regs(bitset, regs);
2011 // reserve one slot to align for odd count
2012 int offset = is_even(count) ? 0 : wordSize;
2013
2014 for (int i = count - 1; i >= 0; i--) {
2015 ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
2016 DEBUG_ONLY(words_popped++;)
2017 }
2018
2019 if (count) {
2020 add(stack, stack, count * wordSize + offset);
2021 }
2022 assert(words_popped == count, "oops, popped != count");
2023
2024 return count;
2025 }
2026
2027 // Push floating-point registers in the bitset supplied.
2028 // Return the number of words pushed
2029 int MacroAssembler::push_fp(FloatRegSet regset, Register stack) {
2030 if (regset.bits() == 0) {
2031 return 0;
2032 }
2033 auto bitset = integer_cast<unsigned int>(regset.bits());
2034 DEBUG_ONLY(int words_pushed = 0;)
2035 unsigned char regs[32];
2036 int count = bitset_to_regs(bitset, regs);
2037 int push_slots = count + (count & 1);
2038
2039 if (count) {
2040 subi(stack, stack, push_slots * wordSize);
2041 }
2042
2043 for (int i = count - 1; i >= 0; i--) {
2044 fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
2045 DEBUG_ONLY(words_pushed++;)
2046 }
2047
2048 assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2049
2050 return count;
2051 }
2052
2053 int MacroAssembler::pop_fp(FloatRegSet regset, Register stack) {
2054 if (regset.bits() == 0) {
2055 return 0;
2056 }
2057 auto bitset = integer_cast<unsigned int>(regset.bits());
2058 DEBUG_ONLY(int words_popped = 0;)
2059 unsigned char regs[32];
2060 int count = bitset_to_regs(bitset, regs);
2061 int pop_slots = count + (count & 1);
2062
2063 for (int i = count - 1; i >= 0; i--) {
2064 fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
2065 DEBUG_ONLY(words_popped++;)
2066 }
2067
2068 if (count) {
2069 addi(stack, stack, pop_slots * wordSize);
2070 }
2071
2072 assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
2073
2074 return count;
2075 }
2076
2077 /**
2078 * Emits code to update CRC-32 with a byte value according to constants in table
2079 *
2080 * @param [in,out]crc Register containing the crc.
2081 * @param [in]val Register containing the byte to fold into the CRC.
2082 * @param [in]table Register containing the table of crc constants.
2083 *
2084 * uint32_t crc;
2085 * val = crc_table[(val ^ crc) & 0xFF];
2086 * crc = val ^ (crc >> 8);
2087 *
2088 */
2089 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2090 assert_different_registers(crc, val, table);
2091
2092 xorr(val, val, crc);
2093 zext(val, val, 8);
2094 shadd(val, val, table, val, 2);
2095 lwu(val, Address(val));
2096 srli(crc, crc, 8);
2097 xorr(crc, val, crc);
2098 }
2099
2100 /**
2101 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2102 *
2103 * @param [in,out]crc Register containing the crc.
2104 * @param [in]v Register containing the 32-bit to fold into the CRC.
2105 * @param [in]table0 Register containing table 0 of crc constants.
2106 * @param [in]table1 Register containing table 1 of crc constants.
2107 * @param [in]table2 Register containing table 2 of crc constants.
2108 * @param [in]table3 Register containing table 3 of crc constants.
2109 *
2110 * uint32_t crc;
2111 * v = crc ^ v
2112 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2113 *
2114 */
2115 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp1, Register tmp2, Register tmp3,
2116 Register table0, Register table1, Register table2, Register table3, bool upper) {
2117 assert_different_registers(crc, v, tmp1, tmp2, tmp3, table0, table1, table2, table3);
2118
2119 if (upper)
2120 srli(v, v, 32);
2121 xorr(v, v, crc);
2122
2123 zext(tmp1, v, 8);
2124 shadd(tmp1, tmp1, table3, tmp2, 2);
2125 lwu(crc, Address(tmp1));
2126
2127 slli(tmp1, v, 16);
2128 slli(tmp3, v, 8);
2129
2130 srliw(tmp1, tmp1, 24);
2131 srliw(tmp3, tmp3, 24);
2132
2133 shadd(tmp1, tmp1, table2, tmp1, 2);
2134 lwu(tmp2, Address(tmp1));
2135
2136 shadd(tmp3, tmp3, table1, tmp3, 2);
2137 xorr(crc, crc, tmp2);
2138
2139 lwu(tmp2, Address(tmp3));
2140 // It is more optimal to use 'srli' instead of 'srliw' for case when it is not necessary to clean upper bits
2141 if (upper)
2142 srli(tmp1, v, 24);
2143 else
2144 srliw(tmp1, v, 24);
2145
2146 // no need to clear bits other than lowest two
2147 shadd(tmp1, tmp1, table0, tmp1, 2);
2148 xorr(crc, crc, tmp2);
2149 lwu(tmp2, Address(tmp1));
2150 xorr(crc, crc, tmp2);
2151 }
2152
2153
2154 #ifdef COMPILER2
2155 // This improvement (vectorization) is based on java.base/share/native/libzip/zlib/zcrc32.c.
2156 // To make it, following steps are taken:
2157 // 1. in zcrc32.c, modify N to 16 and related code,
2158 // 2. re-generate the tables needed, we use tables of (N == 16, W == 4)
2159 // 3. finally vectorize the code (original implementation in zcrc32.c is just scalar code).
2160 // New tables for vector version is after table3.
2161 void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register len,
2162 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
2163 Register table0, Register table3) {
2164 assert_different_registers(t1, crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp5, table0, table3);
2165 const int N = 16, W = 4;
2166 const int64_t single_table_size = 256;
2167 const Register blks = tmp2;
2168 const Register tmpTable = tmp3, tableN16 = tmp4;
2169 const VectorRegister vcrc = v4, vword = v8, vtmp = v12;
2170 Label VectorLoop;
2171 Label LastBlock;
2172
2173 add(tableN16, table3, 1 * single_table_size * sizeof(juint), tmp1);
2174 mv(tmp5, 0xff);
2175
2176 if (MaxVectorSize == 16) {
2177 vsetivli(zr, N, Assembler::e32, Assembler::m4, Assembler::ma, Assembler::ta);
2178 } else if (MaxVectorSize == 32) {
2179 vsetivli(zr, N, Assembler::e32, Assembler::m2, Assembler::ma, Assembler::ta);
2180 } else {
2181 assert(MaxVectorSize > 32, "sanity");
2182 vsetivli(zr, N, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta);
2183 }
2184
2185 vmv_v_x(vcrc, zr);
2186 vmv_s_x(vcrc, crc);
2187
2188 // multiple of 64
2189 srli(blks, len, 6);
2190 slli(t1, blks, 6);
2191 sub(len, len, t1);
2192 subi(blks, blks, 1);
2193 blez(blks, LastBlock);
2194
2195 bind(VectorLoop);
2196 {
2197 mv(tmpTable, tableN16);
2198
2199 vle32_v(vword, buf);
2200 vxor_vv(vword, vword, vcrc);
2201
2202 addi(buf, buf, N*4);
2203
2204 vand_vx(vtmp, vword, tmp5);
2205 vsll_vi(vtmp, vtmp, 2);
2206 vluxei32_v(vcrc, tmpTable, vtmp);
2207
2208 mv(tmp1, 1);
2209 for (int k = 1; k < W; k++) {
2210 addi(tmpTable, tmpTable, single_table_size*4);
2211
2212 slli(t1, tmp1, 3);
2213 vsrl_vx(vtmp, vword, t1);
2214
2215 vand_vx(vtmp, vtmp, tmp5);
2216 vsll_vi(vtmp, vtmp, 2);
2217 vluxei32_v(vtmp, tmpTable, vtmp);
2218
2219 vxor_vv(vcrc, vcrc, vtmp);
2220
2221 addi(tmp1, tmp1, 1);
2222 }
2223
2224 subi(blks, blks, 1);
2225 bgtz(blks, VectorLoop);
2226 }
2227
2228 bind(LastBlock);
2229 {
2230 vle32_v(vtmp, buf);
2231 vxor_vv(vcrc, vcrc, vtmp);
2232 mv(crc, zr);
2233 for (int i = 0; i < N; i++) {
2234 vmv_x_s(tmp2, vcrc);
2235 // in vmv_x_s, the value is sign-extended to SEW bits, but we need zero-extended here.
2236 zext(tmp2, tmp2, 32);
2237 vslidedown_vi(vcrc, vcrc, 1);
2238 xorr(crc, crc, tmp2);
2239 for (int j = 0; j < W; j++) {
2240 andr(t1, crc, tmp5);
2241 shadd(t1, t1, table0, tmp1, 2);
2242 lwu(t1, Address(t1, 0));
2243 srli(tmp2, crc, 8);
2244 xorr(crc, tmp2, t1);
2245 }
2246 }
2247 addi(buf, buf, N*4);
2248 }
2249 }
2250
2251 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16(VectorRegister vx, VectorRegister vt,
2252 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2253 Register buf, Register tmp, const int STEP) {
2254 assert_different_registers(vx, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2255 vclmul_vv(vtmp1, vx, vt);
2256 vclmulh_vv(vtmp2, vx, vt);
2257 vle64_v(vtmp4, buf); addi(buf, buf, STEP);
2258 // low parts
2259 vredxor_vs(vtmp3, vtmp1, vtmp4);
2260 // high parts
2261 vslidedown_vi(vx, vtmp4, 1);
2262 vredxor_vs(vtmp1, vtmp2, vx);
2263 // merge low and high back
2264 vslideup_vi(vx, vtmp1, 1);
2265 vmv_x_s(tmp, vtmp3);
2266 vmv_s_x(vx, tmp);
2267 }
2268
2269 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_2(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2270 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2271 Register tmp) {
2272 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2273 vclmul_vv(vtmp1, vx, vt);
2274 vclmulh_vv(vtmp2, vx, vt);
2275 // low parts
2276 vredxor_vs(vtmp3, vtmp1, vy);
2277 // high parts
2278 vslidedown_vi(vtmp4, vy, 1);
2279 vredxor_vs(vtmp1, vtmp2, vtmp4);
2280 // merge low and high back
2281 vslideup_vi(vx, vtmp1, 1);
2282 vmv_x_s(tmp, vtmp3);
2283 vmv_s_x(vx, tmp);
2284 }
2285
2286 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_3(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2287 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2288 Register tmp) {
2289 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2290 vclmul_vv(vtmp1, vx, vt);
2291 vclmulh_vv(vtmp2, vx, vt);
2292 // low parts
2293 vredxor_vs(vtmp3, vtmp1, vy);
2294 // high parts
2295 vslidedown_vi(vtmp4, vy, 1);
2296 vredxor_vs(vtmp1, vtmp2, vtmp4);
2297 // merge low and high back
2298 vslideup_vi(vy, vtmp1, 1);
2299 vmv_x_s(tmp, vtmp3);
2300 vmv_s_x(vy, tmp);
2301 }
2302
2303 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_16(Register crc, Register buf, Register len,
2304 Register vclmul_table, Register tmp1, Register tmp2) {
2305 assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
2306 assert(MaxVectorSize == 16, "sanity");
2307
2308 const int TABLE_STEP = 16;
2309 const int STEP = 16;
2310 const int LOOP_STEP = 128;
2311 const int N = 2;
2312
2313 Register loop_step = t1;
2314
2315 // ======== preparation ========
2316
2317 mv(loop_step, LOOP_STEP);
2318 sub(len, len, loop_step);
2319
2320 vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2321 vle64_v(v0, buf); addi(buf, buf, STEP);
2322 vle64_v(v1, buf); addi(buf, buf, STEP);
2323 vle64_v(v2, buf); addi(buf, buf, STEP);
2324 vle64_v(v3, buf); addi(buf, buf, STEP);
2325 vle64_v(v4, buf); addi(buf, buf, STEP);
2326 vle64_v(v5, buf); addi(buf, buf, STEP);
2327 vle64_v(v6, buf); addi(buf, buf, STEP);
2328 vle64_v(v7, buf); addi(buf, buf, STEP);
2329
2330 vmv_v_x(v31, zr);
2331 vsetivli(zr, 1, Assembler::e32, Assembler::m1, Assembler::mu, Assembler::tu);
2332 vmv_s_x(v31, crc);
2333 vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2334 vxor_vv(v0, v0, v31);
2335
2336 // load table
2337 vle64_v(v31, vclmul_table);
2338
2339 Label L_16_bytes_loop;
2340 j(L_16_bytes_loop);
2341
2342
2343 // ======== folding 128 bytes in data buffer per round ========
2344
2345 align(OptoLoopAlignment);
2346 bind(L_16_bytes_loop);
2347 {
2348 crc32_vclmul_fold_16_bytes_vectorsize_16(v0, v31, v8, v9, v10, v11, buf, tmp2, STEP);
2349 crc32_vclmul_fold_16_bytes_vectorsize_16(v1, v31, v12, v13, v14, v15, buf, tmp2, STEP);
2350 crc32_vclmul_fold_16_bytes_vectorsize_16(v2, v31, v16, v17, v18, v19, buf, tmp2, STEP);
2351 crc32_vclmul_fold_16_bytes_vectorsize_16(v3, v31, v20, v21, v22, v23, buf, tmp2, STEP);
2352 crc32_vclmul_fold_16_bytes_vectorsize_16(v4, v31, v24, v25, v26, v27, buf, tmp2, STEP);
2353 crc32_vclmul_fold_16_bytes_vectorsize_16(v5, v31, v8, v9, v10, v11, buf, tmp2, STEP);
2354 crc32_vclmul_fold_16_bytes_vectorsize_16(v6, v31, v12, v13, v14, v15, buf, tmp2, STEP);
2355 crc32_vclmul_fold_16_bytes_vectorsize_16(v7, v31, v16, v17, v18, v19, buf, tmp2, STEP);
2356 }
2357 sub(len, len, loop_step);
2358 bge(len, loop_step, L_16_bytes_loop);
2359
2360
2361 // ======== folding into 64 bytes from 128 bytes in register ========
2362
2363 // load table
2364 addi(vclmul_table, vclmul_table, TABLE_STEP);
2365 vle64_v(v31, vclmul_table);
2366
2367 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v0, v4, v31, v8, v9, v10, v11, tmp2);
2368 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v1, v5, v31, v12, v13, v14, v15, tmp2);
2369 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v2, v6, v31, v16, v17, v18, v19, tmp2);
2370 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v3, v7, v31, v20, v21, v22, v23, tmp2);
2371
2372
2373 // ======== folding into 16 bytes from 64 bytes in register ========
2374
2375 addi(vclmul_table, vclmul_table, TABLE_STEP);
2376 vle64_v(v31, vclmul_table);
2377 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v0, v3, v31, v8, v9, v10, v11, tmp2);
2378
2379 addi(vclmul_table, vclmul_table, TABLE_STEP);
2380 vle64_v(v31, vclmul_table);
2381 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v1, v3, v31, v12, v13, v14, v15, tmp2);
2382
2383 addi(vclmul_table, vclmul_table, TABLE_STEP);
2384 vle64_v(v31, vclmul_table);
2385 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v2, v3, v31, v16, v17, v18, v19, tmp2);
2386
2387 #undef FOLD_2_VCLMUL_3
2388
2389
2390 // ======== final: move result to scalar regsiters ========
2391
2392 vmv_x_s(tmp1, v3);
2393 vslidedown_vi(v1, v3, 1);
2394 vmv_x_s(tmp2, v1);
2395 }
2396
2397 void MacroAssembler::crc32_vclmul_fold_to_16_bytes_vectorsize_32(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2398 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4) {
2399 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2400 vclmul_vv(vtmp1, vx, vt);
2401 vclmulh_vv(vtmp2, vx, vt);
2402 // low parts
2403 vredxor_vs(vtmp3, vtmp1, vy);
2404 // high parts
2405 vslidedown_vi(vtmp4, vy, 1);
2406 vredxor_vs(vtmp1, vtmp2, vtmp4);
2407 // merge low and high back
2408 vslideup_vi(vy, vtmp1, 1);
2409 vmv_x_s(t1, vtmp3);
2410 vmv_s_x(vy, t1);
2411 }
2412
2413 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_32(Register crc, Register buf, Register len,
2414 Register vclmul_table, Register tmp1, Register tmp2) {
2415 assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
2416 assert(MaxVectorSize >= 32, "sanity");
2417
2418 // utility: load table
2419 #define CRC32_VCLMUL_LOAD_TABLE(vt, rt, vtmp, rtmp) \
2420 vid_v(vtmp); \
2421 mv(rtmp, 2); \
2422 vremu_vx(vtmp, vtmp, rtmp); \
2423 vsll_vi(vtmp, vtmp, 3); \
2424 vluxei64_v(vt, rt, vtmp);
2425
2426 const int TABLE_STEP = 16;
2427 const int STEP = 128; // 128 bytes per round
2428 const int N = 2 * 8; // 2: 128-bits/64-bits, 8: 8 pairs of double 64-bits
2429
2430 Register step = tmp2;
2431
2432
2433 // ======== preparation ========
2434
2435 mv(step, STEP);
2436 sub(len, len, step); // 2 rounds of folding with carry-less multiplication
2437
2438 vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2439 // load data
2440 vle64_v(v4, buf);
2441 add(buf, buf, step);
2442
2443 // load table
2444 CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2445 // load mask,
2446 // v28 should already contains: 0, 8, 0, 8, ...
2447 vmseq_vi(v2, v28, 0);
2448 // now, v2 should contains: 101010...
2449 vmnand_mm(v1, v2, v2);
2450 // now, v1 should contains: 010101...
2451
2452 // initial crc
2453 vmv_v_x(v24, zr);
2454 vsetivli(zr, 1, Assembler::e32, Assembler::m4, Assembler::mu, Assembler::tu);
2455 vmv_s_x(v24, crc);
2456 vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2457 vxor_vv(v4, v4, v24);
2458
2459 Label L_128_bytes_loop;
2460 j(L_128_bytes_loop);
2461
2462
2463 // ======== folding 128 bytes in data buffer per round ========
2464
2465 align(OptoLoopAlignment);
2466 bind(L_128_bytes_loop);
2467 {
2468 // v4: data
2469 // v4: buf, reused
2470 // v8: table
2471 // v12: lows
2472 // v16: highs
2473 // v20: low_slides
2474 // v24: high_slides
2475 vclmul_vv(v12, v4, v8);
2476 vclmulh_vv(v16, v4, v8);
2477 vle64_v(v4, buf);
2478 add(buf, buf, step);
2479 // lows
2480 vslidedown_vi(v20, v12, 1);
2481 vmand_mm(v0, v2, v2);
2482 vxor_vv(v12, v12, v20, v0_t);
2483 // with buf data
2484 vxor_vv(v4, v4, v12, v0_t);
2485
2486 // highs
2487 vslideup_vi(v24, v16, 1);
2488 vmand_mm(v0, v1, v1);
2489 vxor_vv(v16, v16, v24, v0_t);
2490 // with buf data
2491 vxor_vv(v4, v4, v16, v0_t);
2492 }
2493 sub(len, len, step);
2494 bge(len, step, L_128_bytes_loop);
2495
2496
2497 // ======== folding into 64 bytes from 128 bytes in register ========
2498
2499 // load table
2500 addi(vclmul_table, vclmul_table, TABLE_STEP);
2501 CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2502
2503 // v4: data, first (low) part, N/2 of 64-bits
2504 // v20: data, second (high) part, N/2 of 64-bits
2505 // v8: table
2506 // v10: lows
2507 // v12: highs
2508 // v14: low_slides
2509 // v16: high_slides
2510
2511 // high part
2512 vslidedown_vi(v20, v4, N/2);
2513
2514 vsetivli(zr, N/2, Assembler::e64, Assembler::m2, Assembler::mu, Assembler::tu);
2515
2516 vclmul_vv(v10, v4, v8);
2517 vclmulh_vv(v12, v4, v8);
2518
2519 // lows
2520 vslidedown_vi(v14, v10, 1);
2521 vmand_mm(v0, v2, v2);
2522 vxor_vv(v10, v10, v14, v0_t);
2523 // with data part 2
2524 vxor_vv(v4, v20, v10, v0_t);
2525
2526 // highs
2527 vslideup_vi(v16, v12, 1);
2528 vmand_mm(v0, v1, v1);
2529 vxor_vv(v12, v12, v16, v0_t);
2530 // with data part 2
2531 vxor_vv(v4, v20, v12, v0_t);
2532
2533
2534 // ======== folding into 16 bytes from 64 bytes in register ========
2535
2536 // v4: data, first part, 2 of 64-bits
2537 // v16: data, second part, 2 of 64-bits
2538 // v18: data, third part, 2 of 64-bits
2539 // v20: data, second part, 2 of 64-bits
2540 // v8: table
2541
2542 vslidedown_vi(v16, v4, 2);
2543 vslidedown_vi(v18, v4, 4);
2544 vslidedown_vi(v20, v4, 6);
2545
2546 vsetivli(zr, 2, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2547
2548 addi(vclmul_table, vclmul_table, TABLE_STEP);
2549 vle64_v(v8, vclmul_table);
2550 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v4, v20, v8, v28, v29, v30, v31);
2551
2552 addi(vclmul_table, vclmul_table, TABLE_STEP);
2553 vle64_v(v8, vclmul_table);
2554 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v16, v20, v8, v28, v29, v30, v31);
2555
2556 addi(vclmul_table, vclmul_table, TABLE_STEP);
2557 vle64_v(v8, vclmul_table);
2558 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v18, v20, v8, v28, v29, v30, v31);
2559
2560
2561 // ======== final: move result to scalar regsiters ========
2562
2563 vmv_x_s(tmp1, v20);
2564 vslidedown_vi(v4, v20, 1);
2565 vmv_x_s(tmp2, v4);
2566
2567 #undef CRC32_VCLMUL_LOAD_TABLE
2568 }
2569
2570 // For more details of the algorithm, please check the paper:
2571 // "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel"
2572 //
2573 // Please also refer to the corresponding code in aarch64 or x86 ones.
2574 //
2575 // As the riscv carry-less multiplication is a bit different from the other platforms,
2576 // so the implementation itself is also a bit different from others.
2577
2578 void MacroAssembler::kernel_crc32_vclmul_fold(Register crc, Register buf, Register len,
2579 Register table0, Register table1, Register table2, Register table3,
2580 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
2581 const int64_t single_table_size = 256;
2582 const int64_t table_num = 8; // 4 for scalar, 4 for plain vector
2583 const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2584 Register vclmul_table = tmp3;
2585
2586 la(vclmul_table, table_addr);
2587 add(vclmul_table, vclmul_table, table_num * single_table_size * sizeof(juint), tmp1);
2588 la(table0, table_addr);
2589
2590 if (MaxVectorSize == 16) {
2591 kernel_crc32_vclmul_fold_vectorsize_16(crc, buf, len, vclmul_table, tmp1, tmp2);
2592 } else {
2593 kernel_crc32_vclmul_fold_vectorsize_32(crc, buf, len, vclmul_table, tmp1, tmp2);
2594 }
2595
2596 mv(crc, zr);
2597 update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2598 update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2599 update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2600 update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2601 }
2602
2603 #endif // COMPILER2
2604
2605 /**
2606 * @param crc register containing existing CRC (32-bit)
2607 * @param buf register pointing to input byte buffer (byte*)
2608 * @param len register containing number of bytes
2609 * @param table register that will contain address of CRC table
2610 * @param tmp scratch registers
2611 */
2612 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2613 Register table0, Register table1, Register table2, Register table3,
2614 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) {
2615 assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2616 Label L_vector_entry,
2617 L_unroll_loop,
2618 L_by4_loop_entry, L_by4_loop,
2619 L_by1_loop, L_exit, L_skip1, L_skip2;
2620
2621 const int64_t single_table_size = 256;
2622 const int64_t unroll = 16;
2623 const int64_t unroll_words = unroll*wordSize;
2624
2625 // tmp5 = 0xffffffff
2626 notr(tmp5, zr);
2627 srli(tmp5, tmp5, 32);
2628
2629 andn(crc, tmp5, crc);
2630
2631 const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2632 la(table0, table_addr);
2633 add(table1, table0, 1 * single_table_size * sizeof(juint), tmp1);
2634 add(table2, table0, 2 * single_table_size * sizeof(juint), tmp1);
2635 add(table3, table2, 1 * single_table_size * sizeof(juint), tmp1);
2636
2637 // Ensure basic 4-byte alignment of input byte buffer
2638 mv(tmp1, 4);
2639 blt(len, tmp1, L_by1_loop);
2640 test_bit(tmp1, buf, 0);
2641 beqz(tmp1, L_skip1);
2642 subiw(len, len, 1);
2643 lbu(tmp1, Address(buf));
2644 addi(buf, buf, 1);
2645 update_byte_crc32(crc, tmp1, table0);
2646 bind(L_skip1);
2647 test_bit(tmp1, buf, 1);
2648 beqz(tmp1, L_skip2);
2649 subiw(len, len, 2);
2650 lhu(tmp1, Address(buf));
2651 addi(buf, buf, 2);
2652 zext(tmp2, tmp1, 8);
2653 update_byte_crc32(crc, tmp2, table0);
2654 srli(tmp2, tmp1, 8);
2655 update_byte_crc32(crc, tmp2, table0);
2656 bind(L_skip2);
2657
2658 #ifdef COMPILER2
2659 if (UseRVV) {
2660 const int64_t tmp_limit =
2661 UseZvbc ? 128 * 3 // 3 rounds of folding with carry-less multiplication
2662 : MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
2663 mv(tmp1, tmp_limit);
2664 bge(len, tmp1, L_vector_entry);
2665 }
2666 #endif // COMPILER2
2667
2668 mv(tmp1, unroll_words);
2669 blt(len, tmp1, L_by4_loop_entry);
2670
2671 const Register loop_buf_end = tmp3;
2672
2673 align(CodeEntryAlignment);
2674 // Entry for L_unroll_loop
2675 add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2676 andi(len, len, unroll_words - 1); // len = (len % unroll_words)
2677 sub(loop_buf_end, loop_buf_end, len);
2678 bind(L_unroll_loop);
2679 for (int i = 0; i < unroll; i++) {
2680 ld(tmp1, Address(buf, i*wordSize));
2681 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2682 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, true);
2683 }
2684
2685 addi(buf, buf, unroll_words);
2686 blt(buf, loop_buf_end, L_unroll_loop);
2687
2688 bind(L_by4_loop_entry);
2689 mv(tmp1, 4);
2690 blt(len, tmp1, L_by1_loop);
2691 add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2692 andi(len, len, 3);
2693 sub(loop_buf_end, loop_buf_end, len);
2694 bind(L_by4_loop);
2695 lwu(tmp1, Address(buf));
2696 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2697 addi(buf, buf, 4);
2698 blt(buf, loop_buf_end, L_by4_loop);
2699
2700 bind(L_by1_loop);
2701 beqz(len, L_exit);
2702
2703 subiw(len, len, 1);
2704 lbu(tmp1, Address(buf));
2705 update_byte_crc32(crc, tmp1, table0);
2706 beqz(len, L_exit);
2707
2708 subiw(len, len, 1);
2709 lbu(tmp1, Address(buf, 1));
2710 update_byte_crc32(crc, tmp1, table0);
2711 beqz(len, L_exit);
2712
2713 subiw(len, len, 1);
2714 lbu(tmp1, Address(buf, 2));
2715 update_byte_crc32(crc, tmp1, table0);
2716
2717 #ifdef COMPILER2
2718 // put vector code here, otherwise "offset is too large" error occurs.
2719 if (UseRVV) {
2720 // only need to jump exit when UseRVV == true, it's a jump from end of block `L_by1_loop`.
2721 j(L_exit);
2722
2723 bind(L_vector_entry);
2724 if (UseZvbc) { // carry-less multiplication
2725 kernel_crc32_vclmul_fold(crc, buf, len,
2726 table0, table1, table2, table3,
2727 tmp1, tmp2, tmp3, tmp4, tmp6);
2728 } else { // plain vector instructions
2729 vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
2730 }
2731
2732 bgtz(len, L_by4_loop_entry);
2733 }
2734 #endif // COMPILER2
2735
2736 bind(L_exit);
2737 andn(crc, tmp5, crc);
2738 }
2739
2740 #ifdef COMPILER2
2741 // Push vector registers in the bitset supplied.
2742 // Return the number of words pushed
2743 int MacroAssembler::push_v(VectorRegSet regset, Register stack) {
2744 if (regset.bits() == 0) {
2745 return 0;
2746 }
2747 auto bitset = integer_cast<unsigned int>(regset.bits());
2748 int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2749
2750 // Scan bitset to accumulate register pairs
2751 unsigned char regs[32];
2752 int count = bitset_to_regs(bitset, regs);
2753
2754 for (int i = 0; i < count; i++) {
2755 sub(stack, stack, vector_size_in_bytes);
2756 vs1r_v(as_VectorRegister(regs[i]), stack);
2757 }
2758
2759 return count * vector_size_in_bytes / wordSize;
2760 }
2761
2762 int MacroAssembler::pop_v(VectorRegSet regset, Register stack) {
2763 if (regset.bits() == 0) {
2764 return 0;
2765 }
2766 auto bitset = integer_cast<unsigned int>(regset.bits());
2767 int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2768
2769 // Scan bitset to accumulate register pairs
2770 unsigned char regs[32];
2771 int count = bitset_to_regs(bitset, regs);
2772
2773 for (int i = count - 1; i >= 0; i--) {
2774 vl1r_v(as_VectorRegister(regs[i]), stack);
2775 add(stack, stack, vector_size_in_bytes);
2776 }
2777
2778 return count * vector_size_in_bytes / wordSize;
2779 }
2780 #endif // COMPILER2
2781
2782 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
2783 // Push integer registers x7, x10-x17, x28-x31.
2784 push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2785
2786 // Push float registers f0-f7, f10-f17, f28-f31.
2787 subi(sp, sp, wordSize * 20);
2788 int offset = 0;
2789 for (int i = 0; i < 32; i++) {
2790 if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2791 fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2792 }
2793 }
2794 }
2795
2796 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
2797 int offset = 0;
2798 for (int i = 0; i < 32; i++) {
2799 if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2800 fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2801 }
2802 }
2803 addi(sp, sp, wordSize * 20);
2804
2805 pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2806 }
2807
2808 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
2809 // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2810 push_reg(RegSet::range(x5, x31), sp);
2811
2812 // float registers
2813 subi(sp, sp, 32 * wordSize);
2814 for (int i = 0; i < 32; i++) {
2815 fsd(as_FloatRegister(i), Address(sp, i * wordSize));
2816 }
2817
2818 // vector registers
2819 if (save_vectors) {
2820 sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
2821 vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2822 for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2823 add(t0, sp, vector_size_in_bytes * i);
2824 vse64_v(as_VectorRegister(i), t0);
2825 }
2826 }
2827 }
2828
2829 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
2830 // vector registers
2831 if (restore_vectors) {
2832 vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2833 for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2834 vle64_v(as_VectorRegister(i), sp);
2835 add(sp, sp, vector_size_in_bytes * 8);
2836 }
2837 }
2838
2839 // float registers
2840 for (int i = 0; i < 32; i++) {
2841 fld(as_FloatRegister(i), Address(sp, i * wordSize));
2842 }
2843 addi(sp, sp, 32 * wordSize);
2844
2845 // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2846 pop_reg(RegSet::range(x5, x31), sp);
2847 }
2848
2849 static int patch_offset_in_jal(address branch, int64_t offset) {
2850 assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
2851 "offset (%ld) is too large to be patched in one jal instruction!\n", offset);
2852 Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1); // offset[20] ==> branch[31]
2853 Assembler::patch(branch, 30, 21, (offset >> 1) & 0x3ff); // offset[10:1] ==> branch[30:21]
2854 Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1); // offset[11] ==> branch[20]
2855 Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff); // offset[19:12] ==> branch[19:12]
2856 return MacroAssembler::instruction_size; // only one instruction
2857 }
2858
2859 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
2860 assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
2861 "offset (%ld) is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n", offset);
2862 Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1); // offset[12] ==> branch[31]
2863 Assembler::patch(branch, 30, 25, (offset >> 5) & 0x3f); // offset[10:5] ==> branch[30:25]
2864 Assembler::patch(branch, 7, 7, (offset >> 11) & 0x1); // offset[11] ==> branch[7]
2865 Assembler::patch(branch, 11, 8, (offset >> 1) & 0xf); // offset[4:1] ==> branch[11:8]
2866 return MacroAssembler::instruction_size; // only one instruction
2867 }
2868
2869 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
2870 const int PC_RELATIVE_INSTRUCTION_NUM = 2; // auipc, addi/jalr/load
2871 Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff); // Auipc. offset[31:12] ==> branch[31:12]
2872 Assembler::patch(branch + 4, 31, 20, offset & 0xfff); // Addi/Jalr/Load. offset[11:0] ==> branch[31:20]
2873 return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
2874 }
2875
2876 static int patch_addr_in_movptr1(address branch, address target) {
2877 int32_t lower = ((intptr_t)target << 35) >> 35;
2878 int64_t upper = ((intptr_t)target - lower) >> 29;
2879 Assembler::patch(branch + 0, 31, 12, upper & 0xfffff); // Lui. target[48:29] + target[28] ==> branch[31:12]
2880 Assembler::patch(branch + 4, 31, 20, (lower >> 17) & 0xfff); // Addi. target[28:17] ==> branch[31:20]
2881 Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff); // Addi. target[16: 6] ==> branch[31:20]
2882 Assembler::patch(branch + 20, 31, 20, lower & 0x3f); // Addi/Jalr/Load. target[ 5: 0] ==> branch[31:20]
2883 return MacroAssembler::movptr1_instruction_size;
2884 }
2885
2886 static int patch_addr_in_movptr2(address instruction_address, address target) {
2887 uintptr_t addr = (uintptr_t)target;
2888
2889 assert(addr < (1ull << 48), "48-bit overflow in address constant");
2890 unsigned int upper18 = (addr >> 30ull);
2891 int lower30 = (addr & 0x3fffffffu);
2892 int low12 = (lower30 << 20) >> 20;
2893 int mid18 = ((lower30 - low12) >> 12);
2894
2895 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
2896 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18 & 0xfffff)); // Lui
2897 // Slli
2898 // Add
2899 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff); // Addi/Jalr/Load
2900
2901 assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");
2902
2903 return MacroAssembler::movptr2_instruction_size;
2904 }
2905
2906 static int patch_imm_in_li16u(address branch, uint16_t target) {
2907 Assembler::patch(branch, 31, 12, target); // patch lui only
2908 return MacroAssembler::instruction_size;
2909 }
2910
2911 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
2912 const int LI32_INSTRUCTIONS_NUM = 2; // lui + addiw
2913 int64_t upper = (intptr_t)target;
2914 int32_t lower = (((int32_t)target) << 20) >> 20;
2915 upper -= lower;
2916 upper = (int32_t)upper;
2917 Assembler::patch(branch + 0, 31, 12, (upper >> 12) & 0xfffff); // Lui.
2918 Assembler::patch(branch + 4, 31, 20, lower & 0xfff); // Addiw.
2919 return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
2920 }
2921
2922 static long get_offset_of_jal(address insn_addr) {
2923 assert_cond(insn_addr != nullptr);
2924 long offset = 0;
2925 unsigned insn = Assembler::ld_instr(insn_addr);
2926 long val = (long)Assembler::sextract(insn, 31, 12);
2927 offset |= ((val >> 19) & 0x1) << 20;
2928 offset |= (val & 0xff) << 12;
2929 offset |= ((val >> 8) & 0x1) << 11;
2930 offset |= ((val >> 9) & 0x3ff) << 1;
2931 offset = (offset << 43) >> 43;
2932 return offset;
2933 }
2934
2935 static long get_offset_of_conditional_branch(address insn_addr) {
2936 long offset = 0;
2937 assert_cond(insn_addr != nullptr);
2938 unsigned insn = Assembler::ld_instr(insn_addr);
2939 offset = (long)Assembler::sextract(insn, 31, 31);
2940 offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
2941 offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
2942 offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
2943 offset = (offset << 41) >> 41;
2944 return offset;
2945 }
2946
2947 static long get_offset_of_pc_relative(address insn_addr) {
2948 long offset = 0;
2949 assert_cond(insn_addr != nullptr);
2950 offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12; // Auipc.
2951 offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)); // Addi/Jalr/Load.
2952 offset = (offset << 32) >> 32;
2953 return offset;
2954 }
2955
2956 static address get_target_of_movptr1(address insn_addr) {
2957 assert_cond(insn_addr != nullptr);
2958 intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
2959 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17; // Addi.
2960 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6; // Addi.
2961 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20)); // Addi/Jalr/Load.
2962 return (address) target_address;
2963 }
2964
2965 static address get_target_of_movptr2(address insn_addr) {
2966 assert_cond(insn_addr != nullptr);
2967 int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 0), 31, 12)) & 0xfffff); // Lui
2968 int32_t mid18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 1), 31, 12)) & 0xfffff); // Lui
2969 // 2 // Slli
2970 // 3 // Add
2971 int32_t low12 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 4), 31, 20))); // Addi/Jalr/Load.
2972 address ret = (address)(((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12);
2973 return ret;
2974 }
2975
2976 address MacroAssembler::get_target_of_li32(address insn_addr) {
2977 assert_cond(insn_addr != nullptr);
2978 intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
2979 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)); // Addiw.
2980 return (address)target_address;
2981 }
2982
2983 // Patch any kind of instruction; there may be several instructions.
2984 // Return the total length (in bytes) of the instructions.
2985 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
2986 assert_cond(instruction_address != nullptr);
2987 int64_t offset = target - instruction_address;
2988 if (MacroAssembler::is_jal_at(instruction_address)) { // jal
2989 return patch_offset_in_jal(instruction_address, offset);
2990 } else if (MacroAssembler::is_branch_at(instruction_address)) { // beq/bge/bgeu/blt/bltu/bne
2991 return patch_offset_in_conditional_branch(instruction_address, offset);
2992 } else if (MacroAssembler::is_pc_relative_at(instruction_address)) { // auipc, addi/jalr/load
2993 return patch_offset_in_pc_relative(instruction_address, offset);
2994 } else if (MacroAssembler::is_movptr1_at(instruction_address)) { // movptr1
2995 return patch_addr_in_movptr1(instruction_address, target);
2996 } else if (MacroAssembler::is_movptr2_at(instruction_address)) { // movptr2
2997 return patch_addr_in_movptr2(instruction_address, target);
2998 } else if (MacroAssembler::is_li32_at(instruction_address)) { // li32
2999 int64_t imm = (intptr_t)target;
3000 return patch_imm_in_li32(instruction_address, (int32_t)imm);
3001 } else if (MacroAssembler::is_li16u_at(instruction_address)) {
3002 int64_t imm = (intptr_t)target;
3003 return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
3004 } else {
3005 #ifdef ASSERT
3006 tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
3007 Assembler::ld_instr(instruction_address), p2i(instruction_address));
3008 Disassembler::decode(instruction_address - 16, instruction_address + 16);
3009 #endif
3010 ShouldNotReachHere();
3011 return -1;
3012 }
3013 }
3014
3015 address MacroAssembler::target_addr_for_insn(address insn_addr) {
3016 long offset = 0;
3017 assert_cond(insn_addr != nullptr);
3018 if (MacroAssembler::is_jal_at(insn_addr)) { // jal
3019 offset = get_offset_of_jal(insn_addr);
3020 } else if (MacroAssembler::is_branch_at(insn_addr)) { // beq/bge/bgeu/blt/bltu/bne
3021 offset = get_offset_of_conditional_branch(insn_addr);
3022 } else if (MacroAssembler::is_pc_relative_at(insn_addr)) { // auipc, addi/jalr/load
3023 offset = get_offset_of_pc_relative(insn_addr);
3024 } else if (MacroAssembler::is_movptr1_at(insn_addr)) { // movptr1
3025 return get_target_of_movptr1(insn_addr);
3026 } else if (MacroAssembler::is_movptr2_at(insn_addr)) { // movptr2
3027 return get_target_of_movptr2(insn_addr);
3028 } else if (MacroAssembler::is_li32_at(insn_addr)) { // li32
3029 return get_target_of_li32(insn_addr);
3030 } else {
3031 ShouldNotReachHere();
3032 }
3033 return address(((uintptr_t)insn_addr + offset));
3034 }
3035
3036 int MacroAssembler::patch_oop(address insn_addr, address o) {
3037 // OOPs are either narrow (32 bits) or wide (48 bits). We encode
3038 // narrow OOPs by setting the upper 16 bits in the first
3039 // instruction.
3040 if (MacroAssembler::is_li32_at(insn_addr)) {
3041 // Move narrow OOP
3042 uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
3043 return patch_imm_in_li32(insn_addr, (int32_t)n);
3044 } else if (MacroAssembler::is_movptr1_at(insn_addr)) {
3045 // Move wide OOP
3046 return patch_addr_in_movptr1(insn_addr, o);
3047 } else if (MacroAssembler::is_movptr2_at(insn_addr)) {
3048 // Move wide OOP
3049 return patch_addr_in_movptr2(insn_addr, o);
3050 }
3051 ShouldNotReachHere();
3052 return -1;
3053 }
3054
3055 void MacroAssembler::reinit_heapbase() {
3056 if (UseCompressedOops) {
3057 if (Universe::is_fully_initialized()) {
3058 mv(xheapbase, CompressedOops::base());
3059 } else {
3060 ld(xheapbase, ExternalAddress(CompressedOops::base_addr()));
3061 }
3062 }
3063 }
3064
3065 void MacroAssembler::movptr(Register Rd, const Address &addr, Register temp) {
3066 assert(addr.getMode() == Address::literal, "must be applied to a literal address");
3067 relocate(addr.rspec(), [&] {
3068 movptr(Rd, addr.target(), temp);
3069 });
3070 }
3071
3072 void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
3073 int offset = 0;
3074 movptr(Rd, addr, offset, temp);
3075 addi(Rd, Rd, offset);
3076 }
3077
3078 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
3079 uint64_t uimm64 = (uint64_t)addr;
3080 #ifndef PRODUCT
3081 {
3082 char buffer[64];
3083 os::snprintf_checked(buffer, sizeof(buffer), "0x%" PRIx64, uimm64);
3084 block_comment(buffer);
3085 }
3086 #endif
3087 assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
3088
3089 if (temp == noreg) {
3090 movptr1(Rd, uimm64, offset);
3091 } else {
3092 movptr2(Rd, uimm64, offset, temp);
3093 }
3094 }
3095
3096 void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
3097 // Load upper 31 bits
3098 //
3099 // In case of 11th bit of `lower` is 0, it's straightforward to understand.
3100 // In case of 11th bit of `lower` is 1, it's a bit tricky, to help understand,
3101 // imagine divide both `upper` and `lower` into 2 parts respectively, i.e.
3102 // [upper_20, upper_12], [lower_20, lower_12], they are the same just before
3103 // `lower = (lower << 52) >> 52;`.
3104 // After `upper -= lower;`,
3105 // upper_20' = upper_20 - (-1) == upper_20 + 1
3106 // upper_12 = 0x000
3107 // After `lui(Rd, upper);`, `Rd` = upper_20' << 12
3108 // Also divide `Rd` into 2 parts [Rd_20, Rd_12],
3109 // Rd_20 == upper_20'
3110 // Rd_12 == 0x000
3111 // After `addi(Rd, Rd, lower);`,
3112 // Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
3113 // Rd_12 = lower_12
3114 // So, finally Rd == [upper_20, lower_12]
3115 int64_t imm = imm64 >> 17;
3116 int64_t upper = imm, lower = imm;
3117 lower = (lower << 52) >> 52;
3118 upper -= lower;
3119 upper = (int32_t)upper;
3120 lui(Rd, upper);
3121 addi(Rd, Rd, lower);
3122
3123 // Load the rest 17 bits.
3124 slli(Rd, Rd, 11);
3125 addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
3126 slli(Rd, Rd, 6);
3127
3128 // This offset will be used by following jalr/ld.
3129 offset = imm64 & 0x3f;
3130 }
3131
3132 void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
3133 assert_different_registers(Rd, tmp, noreg);
3134
3135 // addr: [upper18, lower30[mid18, lower12]]
3136
3137 int64_t upper18 = addr >> 18;
3138 lui(tmp, upper18);
3139
3140 int64_t lower30 = addr & 0x3fffffff;
3141 int64_t mid18 = lower30, lower12 = lower30;
3142 lower12 = (lower12 << 52) >> 52;
3143 // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
3144 // please refer to movptr1 above.
3145 mid18 -= (int32_t)lower12;
3146 lui(Rd, mid18);
3147
3148 slli(tmp, tmp, 18);
3149 add(Rd, Rd, tmp);
3150
3151 offset = lower12;
3152 }
3153
3154 // floating point imm move
3155 bool MacroAssembler::can_hf_imm_load(short imm) {
3156 jshort h_bits = (jshort)imm;
3157 if (h_bits == 0) {
3158 return true;
3159 }
3160 return can_zfa_zli_half_float(imm);
3161 }
3162
3163 bool MacroAssembler::can_fp_imm_load(float imm) {
3164 jint f_bits = jint_cast(imm);
3165 if (f_bits == 0) {
3166 return true;
3167 }
3168 return can_zfa_zli_float(imm);
3169 }
3170
3171 bool MacroAssembler::can_dp_imm_load(double imm) {
3172 julong d_bits = julong_cast(imm);
3173 if (d_bits == 0) {
3174 return true;
3175 }
3176 return can_zfa_zli_double(imm);
3177 }
3178
3179 void MacroAssembler::fli_h(FloatRegister Rd, short imm) {
3180 jshort h_bits = (jshort)imm;
3181 if (h_bits == 0) {
3182 fmv_h_x(Rd, zr);
3183 return;
3184 }
3185 int Rs = zfa_zli_lookup_half_float(h_bits);
3186 assert(Rs != -1, "Must be");
3187 _fli_h(Rd, Rs);
3188 }
3189
3190 void MacroAssembler::fli_s(FloatRegister Rd, float imm) {
3191 jint f_bits = jint_cast(imm);
3192 if (f_bits == 0) {
3193 fmv_w_x(Rd, zr);
3194 return;
3195 }
3196 int Rs = zfa_zli_lookup_float(f_bits);
3197 assert(Rs != -1, "Must be");
3198 _fli_s(Rd, Rs);
3199 }
3200
3201 void MacroAssembler::fli_d(FloatRegister Rd, double imm) {
3202 uint64_t d_bits = (uint64_t)julong_cast(imm);
3203 if (d_bits == 0) {
3204 fmv_d_x(Rd, zr);
3205 return;
3206 }
3207 int Rs = zfa_zli_lookup_double(d_bits);
3208 assert(Rs != -1, "Must be");
3209 _fli_d(Rd, Rs);
3210 }
3211
3212 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register tmp) {
3213 if (is_simm12(increment)) {
3214 addi(Rd, Rn, increment);
3215 } else {
3216 assert_different_registers(Rn, tmp);
3217 mv(tmp, increment);
3218 add(Rd, Rn, tmp);
3219 }
3220 }
3221
3222 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register tmp) {
3223 add(Rd, Rn, -decrement, tmp);
3224 }
3225
3226 void MacroAssembler::addw(Register Rd, Register Rn, int64_t increment, Register tmp) {
3227 if (is_simm12(increment)) {
3228 addiw(Rd, Rn, increment);
3229 } else {
3230 assert_different_registers(Rn, tmp);
3231 mv(tmp, increment);
3232 addw(Rd, Rn, tmp);
3233 }
3234 }
3235
3236 void MacroAssembler::subw(Register Rd, Register Rn, int64_t decrement, Register tmp) {
3237 addw(Rd, Rn, -decrement, tmp);
3238 }
3239
3240 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
3241 andr(Rd, Rs1, Rs2);
3242 sext(Rd, Rd, 32);
3243 }
3244
3245 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
3246 orr(Rd, Rs1, Rs2);
3247 sext(Rd, Rd, 32);
3248 }
3249
3250 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
3251 xorr(Rd, Rs1, Rs2);
3252 sext(Rd, Rd, 32);
3253 }
3254
3255 // Rd = Rs1 & (~Rd2)
3256 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
3257 if (UseZbb) {
3258 Assembler::andn(Rd, Rs1, Rs2);
3259 return;
3260 }
3261
3262 notr(Rd, Rs2);
3263 andr(Rd, Rs1, Rd);
3264 }
3265
3266 // Rd = Rs1 | (~Rd2)
3267 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
3268 if (UseZbb) {
3269 Assembler::orn(Rd, Rs1, Rs2);
3270 return;
3271 }
3272
3273 notr(Rd, Rs2);
3274 orr(Rd, Rs1, Rd);
3275 }
3276
3277 // Note: load_unsigned_short used to be called load_unsigned_word.
3278 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3279 int off = offset();
3280 lhu(dst, src);
3281 return off;
3282 }
3283
3284 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3285 int off = offset();
3286 lbu(dst, src);
3287 return off;
3288 }
3289
3290 int MacroAssembler::load_signed_short(Register dst, Address src) {
3291 int off = offset();
3292 lh(dst, src);
3293 return off;
3294 }
3295
3296 int MacroAssembler::load_signed_byte(Register dst, Address src) {
3297 int off = offset();
3298 lb(dst, src);
3299 return off;
3300 }
3301
3302 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
3303 switch (size_in_bytes) {
3304 case 8: ld(dst, src); break;
3305 case 4: is_signed ? lw(dst, src) : lwu(dst, src); break;
3306 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3307 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3308 default: ShouldNotReachHere();
3309 }
3310 }
3311
3312 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
3313 switch (size_in_bytes) {
3314 case 8: sd(src, dst); break;
3315 case 4: sw(src, dst); break;
3316 case 2: sh(src, dst); break;
3317 case 1: sb(src, dst); break;
3318 default: ShouldNotReachHere();
3319 }
3320 }
3321
3322 // granularity is 1 OR 2 bytes per load. dst and src.base() allowed to be the same register
3323 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
3324 if (granularity != 1 && granularity != 2) {
3325 ShouldNotReachHere();
3326 }
3327 if (AvoidUnalignedAccesses && (granularity != 2)) {
3328 assert_different_registers(dst, tmp);
3329 assert_different_registers(tmp, src.base());
3330 is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
3331 slli(tmp, tmp, 8);
3332 lbu(dst, src);
3333 add(dst, dst, tmp);
3334 } else {
3335 is_signed ? lh(dst, src) : lhu(dst, src);
3336 }
3337 }
3338
3339 // granularity is 1, 2 OR 4 bytes per load, if granularity 2 or 4 then dst and src.base() allowed to be the same register
3340 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
3341 if (AvoidUnalignedAccesses && (granularity != 4)) {
3342 switch(granularity) {
3343 case 1:
3344 assert_different_registers(dst, tmp, src.base());
3345 lbu(dst, src);
3346 lbu(tmp, Address(src.base(), src.offset() + 1));
3347 slli(tmp, tmp, 8);
3348 add(dst, dst, tmp);
3349 lbu(tmp, Address(src.base(), src.offset() + 2));
3350 slli(tmp, tmp, 16);
3351 add(dst, dst, tmp);
3352 is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
3353 slli(tmp, tmp, 24);
3354 add(dst, dst, tmp);
3355 break;
3356 case 2:
3357 assert_different_registers(dst, tmp);
3358 assert_different_registers(tmp, src.base());
3359 is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
3360 slli(tmp, tmp, 16);
3361 lhu(dst, src);
3362 add(dst, dst, tmp);
3363 break;
3364 default:
3365 ShouldNotReachHere();
3366 }
3367 } else {
3368 is_signed ? lw(dst, src) : lwu(dst, src);
3369 }
3370 }
3371
3372 // granularity is 1, 2, 4 or 8 bytes per load, if granularity 4 or 8 then dst and src.base() allowed to be same register
3373 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
3374 if (AvoidUnalignedAccesses && (granularity != 8)) {
3375 switch(granularity){
3376 case 1:
3377 assert_different_registers(dst, tmp, src.base());
3378 lbu(dst, src);
3379 lbu(tmp, Address(src.base(), src.offset() + 1));
3380 slli(tmp, tmp, 8);
3381 add(dst, dst, tmp);
3382 lbu(tmp, Address(src.base(), src.offset() + 2));
3383 slli(tmp, tmp, 16);
3384 add(dst, dst, tmp);
3385 lbu(tmp, Address(src.base(), src.offset() + 3));
3386 slli(tmp, tmp, 24);
3387 add(dst, dst, tmp);
3388 lbu(tmp, Address(src.base(), src.offset() + 4));
3389 slli(tmp, tmp, 32);
3390 add(dst, dst, tmp);
3391 lbu(tmp, Address(src.base(), src.offset() + 5));
3392 slli(tmp, tmp, 40);
3393 add(dst, dst, tmp);
3394 lbu(tmp, Address(src.base(), src.offset() + 6));
3395 slli(tmp, tmp, 48);
3396 add(dst, dst, tmp);
3397 lbu(tmp, Address(src.base(), src.offset() + 7));
3398 slli(tmp, tmp, 56);
3399 add(dst, dst, tmp);
3400 break;
3401 case 2:
3402 assert_different_registers(dst, tmp, src.base());
3403 lhu(dst, src);
3404 lhu(tmp, Address(src.base(), src.offset() + 2));
3405 slli(tmp, tmp, 16);
3406 add(dst, dst, tmp);
3407 lhu(tmp, Address(src.base(), src.offset() + 4));
3408 slli(tmp, tmp, 32);
3409 add(dst, dst, tmp);
3410 lhu(tmp, Address(src.base(), src.offset() + 6));
3411 slli(tmp, tmp, 48);
3412 add(dst, dst, tmp);
3413 break;
3414 case 4:
3415 assert_different_registers(dst, tmp);
3416 assert_different_registers(tmp, src.base());
3417 lwu(tmp, Address(src.base(), src.offset() + 4));
3418 slli(tmp, tmp, 32);
3419 lwu(dst, src);
3420 add(dst, dst, tmp);
3421 break;
3422 default:
3423 ShouldNotReachHere();
3424 }
3425 } else {
3426 ld(dst, src);
3427 }
3428 }
3429
3430 // reverse bytes in lower word, sign-extend
3431 // Rd[32:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
3432 void MacroAssembler::revbw(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3433 if (UseZbb) {
3434 rev8(Rd, Rs);
3435 srai(Rd, Rd, 32);
3436 return;
3437 }
3438 assert_different_registers(Rs, tmp1, tmp2);
3439 assert_different_registers(Rd, tmp1, tmp2);
3440 zext(tmp1, Rs, 8);
3441 slli(tmp1, tmp1, 8);
3442 for (int step = 8; step < 24; step += 8) {
3443 srli(tmp2, Rs, step);
3444 zext(tmp2, tmp2, 8);
3445 orr(tmp1, tmp1, tmp2);
3446 slli(tmp1, tmp1, 8);
3447 }
3448 srli(Rd, Rs, 24);
3449 zext(Rd, Rd, 8);
3450 orr(Rd, tmp1, Rd);
3451 sext(Rd, Rd, 32);
3452 }
3453
3454 // reverse bytes in doubleword
3455 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
3456 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3457 if (UseZbb) {
3458 rev8(Rd, Rs);
3459 return;
3460 }
3461 assert_different_registers(Rs, tmp1, tmp2);
3462 assert_different_registers(Rd, tmp1, tmp2);
3463 zext(tmp1, Rs, 8);
3464 slli(tmp1, tmp1, 8);
3465 for (int step = 8; step < 56; step += 8) {
3466 srli(tmp2, Rs, step);
3467 zext(tmp2, tmp2, 8);
3468 orr(tmp1, tmp1, tmp2);
3469 slli(tmp1, tmp1, 8);
3470 }
3471 srli(Rd, Rs, 56);
3472 orr(Rd, tmp1, Rd);
3473 }
3474
3475 // rotate right with shift bits
3476 void MacroAssembler::ror(Register dst, Register src, Register shift, Register tmp)
3477 {
3478 if (UseZbb) {
3479 rorr(dst, src, shift);
3480 return;
3481 }
3482
3483 assert_different_registers(dst, tmp);
3484 assert_different_registers(src, tmp);
3485
3486 mv(tmp, 64);
3487 sub(tmp, tmp, shift);
3488 sll(tmp, src, tmp);
3489 srl(dst, src, shift);
3490 orr(dst, dst, tmp);
3491 }
3492
3493 // rotate right with shift bits
3494 void MacroAssembler::ror(Register dst, Register src, uint32_t shift, Register tmp)
3495 {
3496 if (UseZbb) {
3497 rori(dst, src, shift);
3498 return;
3499 }
3500
3501 assert_different_registers(dst, tmp);
3502 assert_different_registers(src, tmp);
3503 assert(shift < 64, "shift amount must be < 64");
3504 slli(tmp, src, 64 - shift);
3505 srli(dst, src, shift);
3506 orr(dst, dst, tmp);
3507 }
3508
3509 // rotate left with shift bits, 32-bit version
3510 void MacroAssembler::rolw(Register dst, Register src, uint32_t shift, Register tmp) {
3511 if (UseZbb) {
3512 // no roliw available
3513 roriw(dst, src, 32 - shift);
3514 return;
3515 }
3516
3517 assert_different_registers(dst, tmp);
3518 assert_different_registers(src, tmp);
3519 assert(shift < 32, "shift amount must be < 32");
3520 srliw(tmp, src, 32 - shift);
3521 slliw(dst, src, shift);
3522 orr(dst, dst, tmp);
3523 }
3524
3525 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
3526 ld(tmp1, adr);
3527 if (src.is_register()) {
3528 orr(tmp1, tmp1, src.as_register());
3529 } else {
3530 if (is_simm12(src.as_constant())) {
3531 ori(tmp1, tmp1, src.as_constant());
3532 } else {
3533 assert_different_registers(tmp1, tmp2);
3534 mv(tmp2, src.as_constant());
3535 orr(tmp1, tmp1, tmp2);
3536 }
3537 }
3538 sd(tmp1, adr);
3539 }
3540
3541 void MacroAssembler::cmp_klass_beq(Register obj, Register klass,
3542 Register tmp1, Register tmp2,
3543 Label &L, bool is_far) {
3544 assert_different_registers(obj, klass, tmp1, tmp2);
3545 if (UseCompactObjectHeaders) {
3546 load_narrow_klass_compact(tmp1, obj);
3547 } else {
3548 lwu(tmp1, Address(obj, oopDesc::klass_offset_in_bytes()));
3549 }
3550 decode_klass_not_null(tmp1, tmp2);
3551 beq(klass, tmp1, L, is_far);
3552 }
3553
3554 void MacroAssembler::cmp_klass_bne(Register obj, Register klass,
3555 Register tmp1, Register tmp2,
3556 Label &L, bool is_far) {
3557 assert_different_registers(obj, klass, tmp1, tmp2);
3558 if (UseCompactObjectHeaders) {
3559 load_narrow_klass_compact(tmp1, obj);
3560 } else {
3561 lwu(tmp1, Address(obj, oopDesc::klass_offset_in_bytes()));
3562 }
3563 decode_klass_not_null(tmp1, tmp2);
3564 bne(klass, tmp1, L, is_far);
3565 }
3566
3567 // Move an oop into a register.
3568 void MacroAssembler::movoop(Register dst, jobject obj) {
3569 int oop_index;
3570 if (obj == nullptr) {
3571 oop_index = oop_recorder()->allocate_oop_index(obj);
3572 } else {
3573 #ifdef ASSERT
3574 {
3575 ThreadInVMfromUnknown tiv;
3576 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3577 }
3578 #endif
3579 oop_index = oop_recorder()->find_index(obj);
3580 }
3581 RelocationHolder rspec = oop_Relocation::spec(oop_index);
3582
3583 if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
3584 movptr(dst, Address((address)obj, rspec));
3585 } else {
3586 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3587 ld(dst, Address(dummy, rspec));
3588 }
3589 }
3590
3591 // Move a metadata address into a register.
3592 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3593 assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
3594 int oop_index;
3595 if (obj == nullptr) {
3596 oop_index = oop_recorder()->allocate_metadata_index(obj);
3597 } else {
3598 oop_index = oop_recorder()->find_index(obj);
3599 }
3600 RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3601 movptr(dst, Address((address)obj, rspec));
3602 }
3603
3604 void MacroAssembler::inline_layout_info(Register holder_klass, Register index, Register layout_info) {
3605 assert_different_registers(holder_klass, index, layout_info);
3606 InlineLayoutInfo array[2];
3607 int size = (char*)&array[1] - (char*)&array[0]; // computing size of array elements
3608 if (is_power_of_2(size)) {
3609 slli(index, index, log2i_exact(size)); // Scale index by power of 2
3610 } else {
3611 mv(layout_info, size);
3612 mul(index, index, layout_info); // Scale the index to be the entry index * array_element_size
3613 }
3614 ld(layout_info, Address(holder_klass, InstanceKlass::inline_layout_info_array_offset()));
3615 add(layout_info, layout_info, Array<InlineLayoutInfo>::base_offset_in_bytes());
3616 add(layout_info, layout_info, index);
3617 la(layout_info, Address(layout_info));
3618 }
3619
3620 void MacroAssembler::flat_field_copy(DecoratorSet decorators, Register src, Register dst,
3621 Register inline_layout_info) {
3622 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3623 bs->flat_field_copy(this, decorators, src, dst, inline_layout_info);
3624 }
3625
3626 void MacroAssembler::payload_offset(Register inline_klass, Register offset) {
3627 ld(offset, Address(inline_klass, InlineKlass::adr_members_offset()));
3628 lwu(offset, Address(offset, InlineKlass::payload_offset_offset()));
3629 }
3630
3631 void MacroAssembler::payload_address(Register oop, Register data, Register inline_klass) {
3632 assert_different_registers(data, t0);
3633 // ((address) (void*) o) + vk->payload_offset();
3634 Register offset = (data == oop) ? t0 : data;
3635 payload_offset(inline_klass, offset);
3636 if (data == oop) {
3637 add(data, data, offset);
3638 } else {
3639 add(data, oop, offset);
3640 la(data, Address(data));
3641 }
3642 }
3643
3644 // Writes to stack successive pages until offset reached to check for
3645 // stack overflow + shadow pages. This clobbers tmp.
3646 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
3647 assert_different_registers(tmp, size, t0);
3648 // Bang stack for total size given plus shadow page size.
3649 // Bang one page at a time because large size can bang beyond yellow and
3650 // red zones.
3651 mv(t0, (int)os::vm_page_size());
3652 Label loop;
3653 bind(loop);
3654 sub(tmp, sp, t0);
3655 subw(size, size, t0);
3656 sd(size, Address(tmp));
3657 bgtz(size, loop);
3658
3659 // Bang down shadow pages too.
3660 // At this point, (tmp-0) is the last address touched, so don't
3661 // touch it again. (It was touched as (tmp-pagesize) but then tmp
3662 // was post-decremented.) Skip this address by starting at i=1, and
3663 // touch a few more pages below. N.B. It is important to touch all
3664 // the way down to and including i=StackShadowPages.
3665 for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
3666 // this could be any sized move but this is can be a debugging crumb
3667 // so the bigger the better.
3668 sub(tmp, tmp, (int)os::vm_page_size());
3669 sd(size, Address(tmp, 0));
3670 }
3671 }
3672
3673 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
3674 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3675 ld(dst, Address(xmethod, Method::const_offset()));
3676 ld(dst, Address(dst, ConstMethod::constants_offset()));
3677 ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
3678 ld(dst, Address(dst, mirror_offset));
3679 resolve_oop_handle(dst, tmp1, tmp2);
3680 }
3681
3682 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
3683 // OopHandle::resolve is an indirection.
3684 assert_different_registers(result, tmp1, tmp2);
3685 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
3686 }
3687
3688 // ((WeakHandle)result).resolve()
3689 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
3690 assert_different_registers(result, tmp1, tmp2);
3691 Label resolved;
3692
3693 // A null weak handle resolves to null.
3694 beqz(result, resolved);
3695
3696 // Only 64 bit platforms support GCs that require a tmp register
3697 // Only IN_HEAP loads require a thread_tmp register
3698 // WeakHandle::resolve is an indirection like jweak.
3699 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3700 result, Address(result), tmp1, tmp2);
3701 bind(resolved);
3702 }
3703
3704 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3705 Register dst, Address src,
3706 Register tmp1, Register tmp2) {
3707 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3708 decorators = AccessInternal::decorator_fixup(decorators, type);
3709 bool as_raw = (decorators & AS_RAW) != 0;
3710 if (as_raw) {
3711 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
3712 } else {
3713 bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
3714 }
3715 }
3716
3717 void MacroAssembler::null_check(Register reg, int offset) {
3718 if (needs_explicit_null_check(offset)) {
3719 // provoke OS null exception if reg is null by
3720 // accessing M[reg] w/o changing any registers
3721 // NOTE: this is plenty to provoke a segv
3722 ld(zr, Address(reg, 0));
3723 } else {
3724 // nothing to do, (later) access of M[reg + offset]
3725 // will provoke OS null exception if reg is null
3726 }
3727 }
3728
3729 void MacroAssembler::test_field_is_null_free_inline_type(Register flags, Register temp_reg, Label& is_null_free_inline_type) {
3730 test_bit(temp_reg, flags, ResolvedFieldEntry::is_null_free_inline_type_shift);
3731 bnez(temp_reg, is_null_free_inline_type);
3732 }
3733
3734 void MacroAssembler::test_field_is_not_null_free_inline_type(Register flags, Register temp_reg, Label& not_null_free_inline_type) {
3735 test_bit(temp_reg, flags, ResolvedFieldEntry::is_null_free_inline_type_shift);
3736 beqz(temp_reg, not_null_free_inline_type);
3737 }
3738
3739 void MacroAssembler::test_field_is_flat(Register flags, Register temp_reg, Label& is_flat) {
3740 test_bit(temp_reg, flags, ResolvedFieldEntry::is_flat_shift);
3741 bnez(temp_reg, is_flat);
3742 }
3743
3744 void MacroAssembler::test_markword_is_inline_type(Register markword, Label& is_inline_type) {
3745 assert_different_registers(markword, t1);
3746 mv(t1, markWord::inline_type_pattern_mask);
3747 andr(markword, markword, t1);
3748 mv(t1, markWord::inline_type_pattern);
3749 beq(markword, t1, is_inline_type);
3750 }
3751
3752 void MacroAssembler::test_oop_is_not_inline_type(Register object, Register tmp, Label& not_inline_type, bool can_be_null) {
3753 assert_different_registers(tmp, t0);
3754 if (can_be_null) {
3755 beqz(object, not_inline_type);
3756 }
3757 const int is_inline_type_mask = markWord::inline_type_pattern;
3758 ld(tmp, Address(object, oopDesc::mark_offset_in_bytes()));
3759 mv(t0, is_inline_type_mask);
3760 andr(tmp, tmp, t0);
3761 bne(tmp, t0, not_inline_type);
3762 }
3763
3764 void MacroAssembler::test_oop_prototype_bit(Register oop, Register temp_reg, int32_t tst_bit, bool jmp_set, Label& jmp_label) {
3765 assert_different_registers(temp_reg, t0);
3766 Label test_mark_word;
3767 // load mark word
3768 ld(temp_reg, Address(oop, oopDesc::mark_offset_in_bytes()));
3769 // check displaced
3770 test_bit(t0, temp_reg, exact_log2(markWord::unlocked_value));
3771 bnez(t0, test_mark_word);
3772 // slow path use klass prototype
3773 load_prototype_header(temp_reg, oop);
3774
3775 bind(test_mark_word);
3776 andi(temp_reg, temp_reg, tst_bit);
3777 if (jmp_set) {
3778 bnez(temp_reg, jmp_label, /* is_far */ true);
3779 } else {
3780 beqz(temp_reg, jmp_label, /* is_far */ true);
3781 }
3782 }
3783
3784 void MacroAssembler::test_flat_array_oop(Register oop, Register temp_reg, Label& is_flat_array) {
3785 test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, true, is_flat_array);
3786 }
3787
3788 void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array) {
3789 test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, true, is_null_free_array);
3790 }
3791
3792 void MacroAssembler::test_non_flat_array_oop(Register oop, Register temp_reg, Label&is_non_flat_array) {
3793 test_oop_prototype_bit(oop, temp_reg, markWord::flat_array_bit_in_place, false, is_non_flat_array);
3794 }
3795
3796 void MacroAssembler::test_non_null_free_array_oop(Register oop, Register temp_reg, Label&is_non_null_free_array) {
3797 test_oop_prototype_bit(oop, temp_reg, markWord::null_free_array_bit_in_place, false, is_non_null_free_array);
3798 }
3799
3800 void MacroAssembler::test_flat_array_layout(Register lh, Label& is_flat_array) {
3801 test_bit(t0, lh, exact_log2(Klass::_lh_array_tag_flat_value_bit_inplace));
3802 bnez(t0, is_flat_array);
3803 }
3804
3805 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3806 Address dst, Register val,
3807 Register tmp1, Register tmp2, Register tmp3) {
3808 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3809 decorators = AccessInternal::decorator_fixup(decorators, type);
3810 bool as_raw = (decorators & AS_RAW) != 0;
3811 if (as_raw) {
3812 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3813 } else {
3814 bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3815 }
3816 }
3817
3818 // Algorithm must match CompressedOops::encode.
3819 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3820 verify_oop_msg(s, "broken oop in encode_heap_oop");
3821 if (CompressedOops::base() == nullptr) {
3822 if (CompressedOops::shift() != 0) {
3823 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3824 srli(d, s, LogMinObjAlignmentInBytes);
3825 } else {
3826 mv(d, s);
3827 }
3828 } else {
3829 Label notNull;
3830 sub(d, s, xheapbase);
3831 bgez(d, notNull);
3832 mv(d, zr);
3833 bind(notNull);
3834 if (CompressedOops::shift() != 0) {
3835 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3836 srli(d, d, CompressedOops::shift());
3837 }
3838 }
3839 }
3840
3841 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3842 #ifdef ASSERT
3843 if (CheckCompressedOops) {
3844 Label ok;
3845 bnez(r, ok);
3846 stop("null oop passed to encode_heap_oop_not_null");
3847 bind(ok);
3848 }
3849 #endif
3850 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
3851 if (CompressedOops::base() != nullptr) {
3852 sub(r, r, xheapbase);
3853 }
3854 if (CompressedOops::shift() != 0) {
3855 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3856 srli(r, r, LogMinObjAlignmentInBytes);
3857 }
3858 }
3859
3860 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3861 #ifdef ASSERT
3862 if (CheckCompressedOops) {
3863 Label ok;
3864 bnez(src, ok);
3865 stop("null oop passed to encode_heap_oop_not_null2");
3866 bind(ok);
3867 }
3868 #endif
3869 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
3870
3871 Register data = src;
3872 if (CompressedOops::base() != nullptr) {
3873 sub(dst, src, xheapbase);
3874 data = dst;
3875 }
3876 if (CompressedOops::shift() != 0) {
3877 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3878 srli(dst, data, LogMinObjAlignmentInBytes);
3879 data = dst;
3880 }
3881 if (data == src) {
3882 mv(dst, src);
3883 }
3884 }
3885
3886 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3887 assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
3888 ld(dst, Address(src, oopDesc::mark_offset_in_bytes()));
3889 srli(dst, dst, markWord::klass_shift);
3890 }
3891
3892 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
3893 assert_different_registers(dst, tmp);
3894 assert_different_registers(src, tmp);
3895 if (UseCompactObjectHeaders) {
3896 load_narrow_klass_compact(dst, src);
3897 decode_klass_not_null(dst, tmp);
3898 } else {
3899 lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3900 decode_klass_not_null(dst, tmp);
3901 }
3902 }
3903
3904 void MacroAssembler::load_prototype_header(Register dst, Register src, Register tmp) {
3905 load_klass(dst, src, tmp);
3906 ld(dst, Address(dst, Klass::prototype_header_offset()));
3907 }
3908
3909 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
3910 // FIXME: Should this be a store release? concurrent gcs assumes
3911 // klass length is valid if klass field is not null.
3912 assert(!UseCompactObjectHeaders, "not with compact headers");
3913 encode_klass_not_null(src, tmp);
3914 sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3915
3916 }
3917
3918 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3919 assert(!UseCompactObjectHeaders, "not with compact headers");
3920 // Store to klass gap in destination
3921 sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3922 }
3923
3924 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
3925 assert_different_registers(r, tmp);
3926 decode_klass_not_null(r, r, tmp);
3927 }
3928
3929 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
3930 assert_different_registers(dst, tmp);
3931 assert_different_registers(src, tmp);
3932
3933 if (CompressedKlassPointers::base() == nullptr) {
3934 if (CompressedKlassPointers::shift() != 0) {
3935 slli(dst, src, CompressedKlassPointers::shift());
3936 } else {
3937 mv(dst, src);
3938 }
3939 return;
3940 }
3941
3942 Register xbase = tmp;
3943
3944 mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3945
3946 if (CompressedKlassPointers::shift() != 0) {
3947 // dst = (src << shift) + xbase
3948 shadd(dst, src, xbase, dst /* temporary, dst != xbase */, CompressedKlassPointers::shift());
3949 } else {
3950 add(dst, xbase, src);
3951 }
3952 }
3953
3954 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
3955 assert_different_registers(r, tmp);
3956 encode_klass_not_null(r, r, tmp);
3957 }
3958
3959 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
3960 if (CompressedKlassPointers::base() == nullptr) {
3961 if (CompressedKlassPointers::shift() != 0) {
3962 srli(dst, src, CompressedKlassPointers::shift());
3963 } else {
3964 mv(dst, src);
3965 }
3966 return;
3967 }
3968
3969 if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
3970 CompressedKlassPointers::shift() == 0) {
3971 zext(dst, src, 32);
3972 return;
3973 }
3974
3975 Register xbase = dst;
3976 if (dst == src) {
3977 xbase = tmp;
3978 }
3979
3980 assert_different_registers(src, xbase);
3981 mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3982 sub(dst, src, xbase);
3983 if (CompressedKlassPointers::shift() != 0) {
3984 srli(dst, dst, CompressedKlassPointers::shift());
3985 }
3986 }
3987
3988 void MacroAssembler::decode_heap_oop_not_null(Register r) {
3989 decode_heap_oop_not_null(r, r);
3990 }
3991
3992 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3993 assert(UseCompressedOops, "should only be used for compressed headers");
3994 assert(Universe::heap() != nullptr, "java heap should be initialized");
3995 // Cannot assert, unverified entry point counts instructions (see .ad file)
3996 // vtableStubs also counts instructions in pd_code_size_limit.
3997 // Also do not verify_oop as this is called by verify_oop.
3998 if (CompressedOops::shift() != 0) {
3999 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
4000 slli(dst, src, LogMinObjAlignmentInBytes);
4001 if (CompressedOops::base() != nullptr) {
4002 add(dst, xheapbase, dst);
4003 }
4004 } else {
4005 assert(CompressedOops::base() == nullptr, "sanity");
4006 mv(dst, src);
4007 }
4008 }
4009
4010 void MacroAssembler::decode_heap_oop(Register d, Register s) {
4011 if (CompressedOops::base() == nullptr) {
4012 if (CompressedOops::shift() != 0 || d != s) {
4013 slli(d, s, CompressedOops::shift());
4014 }
4015 } else {
4016 Label done;
4017 mv(d, s);
4018 beqz(s, done);
4019 shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
4020 bind(done);
4021 }
4022 verify_oop_msg(d, "broken oop in decode_heap_oop");
4023 }
4024
4025 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
4026 Register tmp2, Register tmp3, DecoratorSet decorators) {
4027 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
4028 }
4029
4030 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4031 Register tmp2, DecoratorSet decorators) {
4032 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
4033 }
4034
4035 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4036 Register tmp2, DecoratorSet decorators) {
4037 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
4038 }
4039
4040 // Used for storing nulls.
4041 void MacroAssembler::store_heap_oop_null(Address dst) {
4042 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
4043 }
4044
4045 // Look up the method for a megamorphic invokeinterface call.
4046 // The target method is determined by <intf_klass, itable_index>.
4047 // The receiver klass is in recv_klass.
4048 // On success, the result will be in method_result, and execution falls through.
4049 // On failure, execution transfers to the given label.
4050 void MacroAssembler::lookup_interface_method(Register recv_klass,
4051 Register intf_klass,
4052 RegisterOrConstant itable_index,
4053 Register method_result,
4054 Register scan_tmp,
4055 Label& L_no_such_interface,
4056 bool return_method) {
4057 assert_different_registers(recv_klass, intf_klass, scan_tmp);
4058 assert_different_registers(method_result, intf_klass, scan_tmp);
4059 assert(recv_klass != method_result || !return_method,
4060 "recv_klass can be destroyed when method isn't needed");
4061 assert(itable_index.is_constant() || itable_index.as_register() == method_result,
4062 "caller must use same register for non-constant itable index as for method");
4063
4064 // Compute start of first itableOffsetEntry (which is at the end of the vtable).
4065 int vtable_base = in_bytes(Klass::vtable_start_offset());
4066 int itentry_off = in_bytes(itableMethodEntry::method_offset());
4067 int scan_step = itableOffsetEntry::size() * wordSize;
4068 int vte_size = vtableEntry::size_in_bytes();
4069 assert(vte_size == wordSize, "else adjust times_vte_scale");
4070
4071 lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
4072
4073 // Could store the aligned, prescaled offset in the klass.
4074 shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
4075 add(scan_tmp, scan_tmp, vtable_base);
4076
4077 if (return_method) {
4078 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
4079 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
4080 if (itable_index.is_register()) {
4081 slli(t0, itable_index.as_register(), 3);
4082 } else {
4083 mv(t0, itable_index.as_constant() << 3);
4084 }
4085 add(recv_klass, recv_klass, t0);
4086 if (itentry_off) {
4087 add(recv_klass, recv_klass, itentry_off);
4088 }
4089 }
4090
4091 Label search, found_method;
4092
4093 ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
4094 beq(intf_klass, method_result, found_method);
4095 bind(search);
4096 // Check that the previous entry is non-null. A null entry means that
4097 // the receiver class doesn't implement the interface, and wasn't the
4098 // same as when the caller was compiled.
4099 beqz(method_result, L_no_such_interface, /* is_far */ true);
4100 addi(scan_tmp, scan_tmp, scan_step);
4101 ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
4102 bne(intf_klass, method_result, search);
4103
4104 bind(found_method);
4105
4106 // Got a hit.
4107 if (return_method) {
4108 lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
4109 add(method_result, recv_klass, scan_tmp);
4110 ld(method_result, Address(method_result));
4111 }
4112 }
4113
// Look up the method for a megamorphic invokeinterface call in a single pass over itable:
// - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
// - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
// The target method is determined by <holder_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
// Clobbers temp_itbl_klass and scan_temp. recv_klass is modified as well: it is
// biased by layout offsets during the scan and is not restored to its entry value.
void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
                                                  Register holder_klass,
                                                  Register resolved_klass,
                                                  Register method_result,
                                                  Register temp_itbl_klass,
                                                  Register scan_temp,
                                                  int itable_index,
                                                  Label& L_no_such_interface) {
  // 'method_result' is only used as output register at the very end of this method.
  // Until then we can reuse it as 'holder_offset'.
  Register holder_offset = method_result;
  assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);

  int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
  int scan_step = itableOffsetEntry::size() * wordSize;
  int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
  int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
  int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
  const int vte_scale = exact_log2(vtableEntry::size_in_bytes());

  Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;

  lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
  // Bias recv_klass so that scan_temp below points directly at the _interface
  // field of itable[0]; the bias is taken back out just before the final load.
  add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
  // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
  //                            + sizeof(vtableEntry) * (recv_klass->_vtable_len);
  // scan_temp = &(itable[0]._interface)
  // temp_itbl_klass = itable[0]._interface;
  shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
  ld(temp_itbl_klass, Address(scan_temp));
  // holder_offset == 0 means "holder_klass entry not seen yet".
  mv(holder_offset, zr);

  // Initial checks:
  //   - if (holder_klass != resolved_klass), go to "scan for resolved"
  //   - if (itable[0] == holder_klass), shortcut to "holder found"
  //   - if (itable[0] == 0), no such interface
  bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
  beq(holder_klass, temp_itbl_klass, L_holder_found);
  beqz(temp_itbl_klass, L_no_such_interface);

  // Loop: Look for holder_klass record in itable
  //   do {
  //     temp_itbl_klass = *(scan_temp += scan_step);
  //     if (temp_itbl_klass == holder_klass) {
  //       goto L_holder_found; // Found!
  //     }
  //   } while (temp_itbl_klass != 0);
  //   goto L_no_such_interface // Not found.
  Label L_search_holder;
  bind(L_search_holder);
  add(scan_temp, scan_temp, scan_step);
  ld(temp_itbl_klass, Address(scan_temp));
  beq(holder_klass, temp_itbl_klass, L_holder_found);
  bnez(temp_itbl_klass, L_search_holder);

  j(L_no_such_interface);

  // Loop: Look for resolved_class record in itable
  //   while (true) {
  //     temp_itbl_klass = *(scan_temp += scan_step);
  //     if (temp_itbl_klass == 0) {
  //       goto L_no_such_interface;
  //     }
  //     if (temp_itbl_klass == resolved_klass) {
  //        goto L_resolved_found;  // Found!
  //     }
  //     if (temp_itbl_klass == holder_klass) {
  //        holder_offset = scan_temp;
  //     }
  //   }
  //
  Label L_loop_search_resolved;
  bind(L_loop_search_resolved);
  add(scan_temp, scan_temp, scan_step);
  ld(temp_itbl_klass, Address(scan_temp));
  // Entry point for the first iteration: itable[0] is already loaded.
  bind(L_loop_search_resolved_entry);
  beqz(temp_itbl_klass, L_no_such_interface);
  beq(resolved_klass, temp_itbl_klass, L_resolved_found);
  bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
  // Remember where the holder_klass entry was seen and keep scanning.
  mv(holder_offset, scan_temp);
  j(L_loop_search_resolved);

  // See if we already have a holder klass. If not, go and scan for it.
  bind(L_resolved_found);
  beqz(holder_offset, L_search_holder);
  mv(scan_temp, holder_offset);

  // Finally, scan_temp contains holder_klass vtable offset
  bind(L_holder_found);
  // scan_temp points at the entry's _interface field, hence the relative offset.
  lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
  add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
    - vtable_start_offset_bytes - ioffset_bytes); // subtract offsets to restore the original value of recv_klass
  add(method_result, recv_klass, method_result);
  ld(method_result, Address(method_result));
}
4216
4217 // virtual method calling
4218 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4219 RegisterOrConstant vtable_index,
4220 Register method_result) {
4221 const ByteSize base = Klass::vtable_start_offset();
4222 assert(vtableEntry::size() * wordSize == 8,
4223 "adjust the scaling in the code below");
4224 int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
4225
4226 if (vtable_index.is_register()) {
4227 shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
4228 ld(method_result, Address(method_result, vtable_offset_in_bytes));
4229 } else {
4230 vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
4231 ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
4232 }
4233 }
4234
4235 void MacroAssembler::membar(uint32_t order_constraint) {
4236 if (UseZtso && ((order_constraint & StoreLoad) != StoreLoad)) {
4237 // TSO allows for stores to be reordered after loads. When the compiler
4238 // generates a fence to disallow that, we are required to generate the
4239 // fence for correctness.
4240 BLOCK_COMMENT("elided tso membar");
4241 return;
4242 }
4243
4244 address prev = pc() - MacroAssembler::instruction_size;
4245 address last = code()->last_merge_candidate();
4246
4247 if (last != nullptr && is_membar(last) && prev == last) {
4248 // We are merging two memory barrier instructions. On RISCV we
4249 // can do this simply by ORing them together.
4250 set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
4251 BLOCK_COMMENT("merged membar");
4252 return;
4253 }
4254
4255 code()->set_last_merge_candidate(pc());
4256 uint32_t predecessor = 0;
4257 uint32_t successor = 0;
4258 membar_mask_to_pred_succ(order_constraint, predecessor, successor);
4259 fence(predecessor, successor);
4260 }
4261
4262 void MacroAssembler::cmodx_fence() {
4263 BLOCK_COMMENT("cmodx fence");
4264 if (VM_Version::supports_fencei_barrier()) {
4265 Assembler::fencei();
4266 }
4267 }
4268
4269 // Form an address from base + offset in Rd. Rd my or may not
4270 // actually be used: you must use the Address that is returned. It
4271 // is up to you to ensure that the shift provided matches the size
4272 // of your data.
4273 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
4274 if (is_simm12(byte_offset)) { // 12: imm in range 2^12
4275 return Address(base, byte_offset);
4276 }
4277
4278 assert_different_registers(Rd, base, noreg);
4279
4280 // Do it the hard way
4281 mv(Rd, byte_offset);
4282 add(Rd, base, Rd);
4283 return Address(Rd);
4284 }
4285
4286 void MacroAssembler::check_klass_subtype(Register sub_klass,
4287 Register super_klass,
4288 Register tmp_reg,
4289 Label& L_success) {
4290 Label L_failure;
4291 check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
4292 check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
4293 bind(L_failure);
4294 }
4295
4296 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod, Register tmp_reg) {
4297 ld(tmp_reg, Address(xthread, JavaThread::polling_word_offset()));
4298 if (at_return) {
4299 bgtu(in_nmethod ? sp : fp, tmp_reg, slow_path, /* is_far */ true);
4300 } else {
4301 test_bit(tmp_reg, tmp_reg, exact_log2(SafepointMechanism::poll_bit()));
4302 bnez(tmp_reg, slow_path, /* is_far */ true);
4303 }
4304 }
4305
4306 void MacroAssembler::load_reserved(Register dst,
4307 Register addr,
4308 Assembler::operand_size size,
4309 Assembler::Aqrl acquire) {
4310 switch (size) {
4311 case int64:
4312 lr_d(dst, addr, acquire);
4313 break;
4314 case int32:
4315 lr_w(dst, addr, acquire);
4316 break;
4317 case uint32:
4318 lr_w(dst, addr, acquire);
4319 zext(dst, dst, 32);
4320 break;
4321 default:
4322 ShouldNotReachHere();
4323 }
4324 }
4325
4326 void MacroAssembler::store_conditional(Register dst,
4327 Register new_val,
4328 Register addr,
4329 Assembler::operand_size size,
4330 Assembler::Aqrl release) {
4331 switch (size) {
4332 case int64:
4333 sc_d(dst, addr, new_val, release);
4334 break;
4335 case int32:
4336 case uint32:
4337 sc_w(dst, addr, new_val, release);
4338 break;
4339 default:
4340 ShouldNotReachHere();
4341 }
4342 }
4343
4344
4345 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val,
4346 Assembler::operand_size size,
4347 Register shift, Register mask, Register aligned_addr) {
4348 assert(size == int8 || size == int16, "unsupported operand size");
4349
4350 andi(shift, addr, 3);
4351 slli(shift, shift, 3);
4352
4353 andi(aligned_addr, addr, ~3);
4354
4355 if (size == int8) {
4356 mv(mask, 0xff);
4357 } else {
4358 // size == int16 case
4359 mv(mask, -1);
4360 zext(mask, mask, 16);
4361 }
4362 sll(mask, mask, shift);
4363
4364 sll(expected, expected, shift);
4365 andr(expected, expected, mask);
4366
4367 sll(new_val, new_val, shift);
4368 andr(new_val, new_val, mask);
4369 }
4370
// cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
// It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w or amocas.w,
// which are forced to work with 4-byte aligned address.
//
// With result_as_bool, 'result' is 1 on success and 0 on failure. Otherwise
// 'result' is the narrow value extracted from the last word read (by the
// lw/amocas or the lr.w), sign-extended from 8 or 16 bits.
void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
                                          Register new_val,
                                          Assembler::operand_size size,
                                          Assembler::Aqrl acquire, Assembler::Aqrl release,
                                          Register result, bool result_as_bool,
                                          Register tmp1, Register tmp2, Register tmp3) {
  assert(!(UseZacas && UseZabha), "Use amocas");
  assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);

  Register scratch0 = t0, aligned_addr = t1;
  Register shift = tmp1, mask = tmp2, scratch1 = tmp3;

  // After this call expected/new_val are shifted into position and masked.
  cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);

  Label retry, fail, done;

  if (UseZacas) {
    // Seed 'result' with the current word; later iterations get it from amocas.
    lw(result, aligned_addr);

    bind(retry); // amocas loads the current value into result
    notr(scratch1, mask);

    andr(scratch0, result, scratch1);  // scratch0 = word - cas bits
    orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
    bne(result, scratch1, fail);       // cas bits differ, cas failed

    // result is the same as expected, use as expected value.

    // scratch0 is still = word - cas bits
    // Or in the new value to create complete new value.
    orr(scratch0, scratch0, new_val);

    mv(scratch1, result); // save our expected value
    atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
    // A different word came back than the one we expected: go around again
    // with the freshly read word in 'result'.
    bne(scratch1, result, retry);
  } else {
    // ~mask is loop-invariant, so it is computed once before the retry loop.
    notr(scratch1, mask);
    bind(retry);

    load_reserved(result, aligned_addr, operand_size::int32, acquire);
    andr(scratch0, result, mask);
    bne(scratch0, expected, fail);

    andr(scratch0, result, scratch1); // scratch1 is ~mask
    orr(scratch0, scratch0, new_val);
    store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
    // Non-zero status from the store-conditional: retry.
    bnez(scratch0, retry);
  }

  if (result_as_bool) {
    mv(result, 1);
    j(done);

    bind(fail);
    mv(result, zr);

    bind(done);
  } else {
    // Success falls through to 'fail' as well: both paths extract the
    // narrow value from the word held in 'result'.
    bind(fail);

    andr(scratch0, result, mask);
    srl(result, scratch0, shift);

    if (size == int8) {
      sext(result, result, 8);
    } else {
      // size == int16 case
      sext(result, result, 16);
    }
  }
}
4445
// weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, to implement
// the weak CAS stuff. The major difference is that it just fails when the store
// conditional (or the amocas) fails, instead of retrying.
// 'result' is 1 on success and 0 on failure. Uses t0 and t1 as scratch and
// overwrites expected, new_val and the tmps.
void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
                                               Register new_val,
                                               Assembler::operand_size size,
                                               Assembler::Aqrl acquire, Assembler::Aqrl release,
                                               Register result,
                                               Register tmp1, Register tmp2, Register tmp3) {
  assert(!(UseZacas && UseZabha), "Use amocas");
  assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);

  Register scratch0 = t0, aligned_addr = t1;
  Register shift = tmp1, mask = tmp2, scratch1 = tmp3;

  // After this call expected/new_val are shifted into position and masked.
  cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);

  Label fail, done;

  if (UseZacas) {
    // Read the current word to build the full-word expected and new values.
    lw(result, aligned_addr);

    notr(scratch1, mask);

    andr(scratch0, result, scratch1);  // scratch0 = word - cas bits
    orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
    bne(result, scratch1, fail);       // cas bits differ, cas failed

    // result is the same as expected, use as expected value.

    // scratch0 is still = word - cas bits
    // Or in the new value to create complete new value.
    orr(scratch0, scratch0, new_val);

    mv(scratch1, result); // save our expected value
    atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
    bne(scratch1, result, fail); // This weak, so just bail-out.
  } else {
    notr(scratch1, mask);

    load_reserved(result, aligned_addr, operand_size::int32, acquire);
    andr(scratch0, result, mask);
    bne(scratch0, expected, fail);

    andr(scratch0, result, scratch1); // scratch1 is ~mask
    orr(scratch0, scratch0, new_val);
    store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
    // Non-zero status from the store-conditional: report failure, no retry.
    bnez(scratch0, fail);
  }

  // Success
  mv(result, 1);
  j(done);

  // Fail
  bind(fail);
  mv(result, zr);

  bind(done);
}
4506
4507 void MacroAssembler::cmpxchg(Register addr, Register expected,
4508 Register new_val,
4509 Assembler::operand_size size,
4510 Assembler::Aqrl acquire, Assembler::Aqrl release,
4511 Register result, bool result_as_bool) {
4512 assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4513 assert_different_registers(addr, t0);
4514 assert_different_registers(expected, t0);
4515 assert_different_registers(new_val, t0);
4516
4517 // NOTE:
4518 // Register _result_ may be the same register as _new_val_ or _expected_.
4519 // Hence do NOT use _result_ until after 'cas'.
4520 //
4521 // Register _expected_ may be the same register as _new_val_ and is assumed to be preserved.
4522 // Hence do NOT change _expected_ or _new_val_.
4523 //
4524 // Having _expected_ and _new_val_ being the same register is a very puzzling cas.
4525 //
4526 // TODO: Address these issues.
4527
4528 if (UseZacas) {
4529 if (result_as_bool) {
4530 mv(t0, expected);
4531 atomic_cas(t0, new_val, addr, size, acquire, release);
4532 xorr(t0, t0, expected);
4533 seqz(result, t0);
4534 } else {
4535 mv(t0, expected);
4536 atomic_cas(t0, new_val, addr, size, acquire, release);
4537 mv(result, t0);
4538 }
4539 return;
4540 }
4541
4542 Label retry_load, done, ne_done;
4543 bind(retry_load);
4544 load_reserved(t0, addr, size, acquire);
4545 bne(t0, expected, ne_done);
4546 store_conditional(t0, new_val, addr, size, release);
4547 bnez(t0, retry_load);
4548
4549 // equal, succeed
4550 if (result_as_bool) {
4551 mv(result, 1);
4552 } else {
4553 mv(result, expected);
4554 }
4555 j(done);
4556
4557 // not equal, failed
4558 bind(ne_done);
4559 if (result_as_bool) {
4560 mv(result, zr);
4561 } else {
4562 mv(result, t0);
4563 }
4564
4565 bind(done);
4566 }
4567
4568 void MacroAssembler::weak_cmpxchg(Register addr, Register expected,
4569 Register new_val,
4570 Assembler::operand_size size,
4571 Assembler::Aqrl acquire, Assembler::Aqrl release,
4572 Register result) {
4573 assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4574 assert_different_registers(addr, t0);
4575 assert_different_registers(expected, t0);
4576 assert_different_registers(new_val, t0);
4577
4578 if (UseZacas) {
4579 cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
4580 return;
4581 }
4582
4583 Label fail, done;
4584 load_reserved(t0, addr, size, acquire);
4585 bne(t0, expected, fail);
4586 store_conditional(t0, new_val, addr, size, release);
4587 bnez(t0, fail);
4588
4589 // Success
4590 mv(result, 1);
4591 j(done);
4592
4593 // Fail
4594 bind(fail);
4595 mv(result, zr);
4596
4597 bind(done);
4598 }
4599
// Defines MacroAssembler::atomic_<NAME>(prev, incr, addr), which emits the
// AMO instruction AOP on 'addr' with ordering (ACQUIRE | RELEASE).
//  - 'prev' is passed as the AMO destination register; zr is substituted
//    when 'prev' is not a valid register.
//  - 'incr' may be a register or a constant; a constant is first
//    materialized in t0 (so t0 is clobbered in that case).
#define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE)                                              \
void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
  prev = prev->is_valid() ? prev : zr;                                                      \
  if (incr.is_register()) {                                                                 \
    AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE));              \
  } else {                                                                                  \
    mv(t0, incr.as_constant());                                                             \
    AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE));                              \
  }                                                                                         \
  return;                                                                                   \
}

// Relaxed variants first, then the aq/rl ("al") variants; the 'w' suffix
// selects the amoadd_w forms.
ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)

#undef ATOMIC_OP
4618
// Defines MacroAssembler::atomic_<OP>(prev, newv, addr), which emits the
// AMO swap instruction AOP on 'addr' with source register 'newv' and
// ordering (ACQUIRE | RELEASE). 'prev' is passed as the AMO destination
// register; zr is substituted when 'prev' is not a valid register.
#define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE)                                       \
void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) {      \
  prev = prev->is_valid() ? prev : zr;                                               \
  AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE));                       \
  return;                                                                            \
}

// Relaxed variants first, then the aq/rl ("al") variants; the 'w' suffix
// selects the amoswap_w forms.
ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)

#undef ATOMIC_XCHG
4632
// Defines MacroAssembler::atomic_<OP1>(prev, newv, addr) as atomic_<OP2>
// followed by a zero-extension of the low 32 bits of 'prev'.
// NOTE(review): unlike atomic_<OP2>, 'prev' is used directly by the zext
// here, without the substitution of zr for an invalid register — callers
// presumably always pass a valid 'prev'; confirm.
#define ATOMIC_XCHGU(OP1, OP2)                                                       \
void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) {     \
  atomic_##OP2(prev, newv, addr);                                                    \
  zext(prev, prev, 32);                                                              \
  return;                                                                            \
}

ATOMIC_XCHGU(xchgwu, xchgw)
ATOMIC_XCHGU(xchgalwu, xchgalw)

#undef ATOMIC_XCHGU
4644
4645 void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr,
4646 Assembler::operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
4647 switch (size) {
4648 case int64:
4649 amocas_d(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4650 break;
4651 case int32:
4652 amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4653 break;
4654 case uint32:
4655 amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4656 zext(prev, prev, 32);
4657 break;
4658 case int16:
4659 amocas_h(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4660 break;
4661 case int8:
4662 amocas_b(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4663 break;
4664 default:
4665 ShouldNotReachHere();
4666 }
4667 }
4668
4669 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
4670 assert(CodeCache::contains(entry.target()),
4671 "destination of far jump not found in code cache");
4672 assert(entry.rspec().type() == relocInfo::external_word_type
4673 || entry.rspec().type() == relocInfo::runtime_call_type
4674 || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4675 // Fixed length: see MacroAssembler::far_branch_size()
4676 // We can use auipc + jr here because we know that the total size of
4677 // the code cache cannot exceed 2Gb.
4678 relocate(entry.rspec(), [&] {
4679 int64_t distance = entry.target() - pc();
4680 int32_t offset = ((int32_t)distance << 20) >> 20;
4681 assert(is_valid_32bit_offset(distance), "Far jump using wrong instructions.");
4682 auipc(tmp, (int32_t)distance + 0x800);
4683 jr(tmp, offset);
4684 });
4685 }
4686
4687 void MacroAssembler::far_call(const Address &entry, Register tmp) {
4688 assert(tmp != x5, "tmp register must not be x5.");
4689 assert(CodeCache::contains(entry.target()),
4690 "destination of far call not found in code cache");
4691 assert(entry.rspec().type() == relocInfo::external_word_type
4692 || entry.rspec().type() == relocInfo::runtime_call_type
4693 || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4694 // Fixed length: see MacroAssembler::far_branch_size()
4695 // We can use auipc + jalr here because we know that the total size of
4696 // the code cache cannot exceed 2Gb.
4697 relocate(entry.rspec(), [&] {
4698 int64_t distance = entry.target() - pc();
4699 int32_t offset = ((int32_t)distance << 20) >> 20;
4700 assert(is_valid_32bit_offset(distance), "Far call using wrong instructions.");
4701 auipc(tmp, (int32_t)distance + 0x800);
4702 jalr(tmp, offset);
4703 });
4704 }
4705
// Emits the fast path of a subtype check of sub_klass against super_klass.
//
// Control flow of the generated code:
//  - branches to *L_success when sub_klass == super_klass, or when the word
//    at (sub_klass + super_check_offset) equals super_klass;
//  - otherwise branches to *L_slow_path when super_check_offset equals the
//    secondary_super_cache offset, and to *L_failure when it does not.
// At most one of the three labels may be null; a null label means
// "fall through to the code following this sequence".
//
// If super_check_offset is not a valid register, the offset is loaded from
// super_klass into tmp_reg, which then must be supplied.
// Clobbers t0 and t1.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register tmp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   Register super_check_offset) {
  assert_different_registers(sub_klass, super_klass, tmp_reg, super_check_offset);
  bool must_load_sco = !super_check_offset->is_valid();
  if (must_load_sco) {
    assert(tmp_reg != noreg, "supply either a temp or a register offset");
  }

  // Redirect every null label to the local fall-through label.
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
  if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
  if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one null in batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
  // It emits nothing when the target is L_fallthrough itself.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            j(label)             /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  beq(sub_klass, super_klass, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    lwu(tmp_reg, super_check_offset_addr);
    super_check_offset = tmp_reg;
  }
  add(t0, sub_klass, super_check_offset);
  Address super_check_addr(t0);
  ld(t0, super_check_addr); // load displayed supertype
  beq(super_klass, t0, *L_success);

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  mv(t1, sc_offset);
  if (L_failure == &L_fallthrough) {
    // Failure falls through, so only the slow-path case needs a branch.
    beq(super_check_offset, t1, *L_slow_path);
  } else {
    bne(super_check_offset, t1, *L_failure, /* is_far */ true);
    final_jmp(*L_slow_path);
  }

  bind(L_fallthrough);

#undef final_jmp
}
4777
4778 // Scans count pointer sized words at [addr] for occurrence of value,
4779 // generic
4780 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
4781 Register tmp) {
4782 Label Lloop, Lexit;
4783 beqz(count, Lexit);
4784 bind(Lloop);
4785 ld(tmp, addr);
4786 beq(value, tmp, Lexit);
4787 addi(addr, addr, wordSize);
4788 subi(count, count, 1);
4789 bnez(count, Lloop);
4790 bind(Lexit);
4791 }
4792
4793 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
4794 Register super_klass,
4795 Register tmp1_reg,
4796 Register tmp2_reg,
4797 Label* L_success,
4798 Label* L_failure,
4799 bool set_cond_codes) {
4800 assert_different_registers(sub_klass, super_klass, tmp1_reg);
4801 if (tmp2_reg != noreg) {
4802 assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
4803 }
4804 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
4805
4806 Label L_fallthrough;
4807 int label_nulls = 0;
4808 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4809 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4810
4811 assert(label_nulls <= 1, "at most one null in the batch");
4812
4813 // A couple of useful fields in sub_klass:
4814 int ss_offset = in_bytes(Klass::secondary_supers_offset());
4815 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4816 Address secondary_supers_addr(sub_klass, ss_offset);
4817 Address super_cache_addr( sub_klass, sc_offset);
4818
4819 BLOCK_COMMENT("check_klass_subtype_slow_path");
4820
4821 // Do a linear scan of the secondary super-klass chain.
4822 // This code is rarely used, so simplicity is a virtue here.
4823 // The repne_scan instruction uses fixed registers, which we must spill.
4824 // Don't worry too much about pre-existing connections with the input regs.
4825
4826 assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
4827 assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
4828
4829 RegSet pushed_registers;
4830 if (!IS_A_TEMP(x12)) {
4831 pushed_registers += x12;
4832 }
4833 if (!IS_A_TEMP(x15)) {
4834 pushed_registers += x15;
4835 }
4836
4837 if (super_klass != x10) {
4838 if (!IS_A_TEMP(x10)) {
4839 pushed_registers += x10;
4840 }
4841 }
4842
4843 push_reg(pushed_registers, sp);
4844
4845 // Get super_klass value into x10 (even if it was in x15 or x12)
4846 mv(x10, super_klass);
4847
4848 #ifndef PRODUCT
4849 incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
4850 #endif // PRODUCT
4851
4852 // We will consult the secondary-super array.
4853 ld(x15, secondary_supers_addr);
4854 // Load the array length.
4855 lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
4856 // Skip to start of data.
4857 addi(x15, x15, Array<Klass*>::base_offset_in_bytes());
4858
4859 // Set t0 to an obvious invalid value, falling through by default
4860 mv(t0, -1);
4861 // Scan X12 words at [X15] for an occurrence of X10.
4862 repne_scan(x15, x10, x12, t0);
4863
4864 // pop will restore x10, so we should use a temp register to keep its value
4865 mv(t1, x10);
4866
4867 // Unspill the temp registers:
4868 pop_reg(pushed_registers, sp);
4869
4870 bne(t1, t0, *L_failure);
4871
4872 // Success. Cache the super we found an proceed in triumph.
4873 if (UseSecondarySupersCache) {
4874 sd(super_klass, super_cache_addr);
4875 }
4876
4877 if (L_success != &L_fallthrough) {
4878 j(*L_success);
4879 }
4880
4881 #undef IS_A_TEMP
4882
4883 bind(L_fallthrough);
4884 }
4885
4886 // population_count variant for running without the CPOP
4887 // instruction, which was introduced with Zbb extension.
4888 void MacroAssembler::population_count(Register dst, Register src,
4889 Register tmp1, Register tmp2) {
4890 if (UsePopCountInstruction) {
4891 cpop(dst, src);
4892 } else {
4893 assert_different_registers(src, tmp1, tmp2);
4894 assert_different_registers(dst, tmp1, tmp2);
4895 Label loop, done;
4896
4897 mv(tmp1, src);
4898 // dst = 0;
4899 // while(tmp1 != 0) {
4900 // dst++;
4901 // tmp1 &= (tmp1 - 1);
4902 // }
4903 mv(dst, zr);
4904 beqz(tmp1, done);
4905 {
4906 bind(loop);
4907 addi(dst, dst, 1);
4908 subi(tmp2, tmp1, 1);
4909 andr(tmp1, tmp1, tmp2);
4910 bnez(tmp1, loop);
4911 }
4912 bind(done);
4913 }
4914 }
4915
4916 // If Register r is invalid, remove a new register from
4917 // available_regs, and add new register to regs_to_push.
4918 Register MacroAssembler::allocate_if_noreg(Register r,
4919 RegSetIterator<Register> &available_regs,
4920 RegSet ®s_to_push) {
4921 if (!r->is_valid()) {
4922 r = *available_regs++;
4923 regs_to_push += r;
4924 }
4925 return r;
4926 }
4927
4928 // check_klass_subtype_slow_path_table() looks for super_klass in the
4929 // hash table belonging to super_klass, branching to L_success or
4930 // L_failure as appropriate. This is essentially a shim which
4931 // allocates registers as necessary then calls
4932 // lookup_secondary_supers_table() to do the work. Any of the tmp
4933 // regs may be noreg, in which case this logic will chooses some
4934 // registers push and pop them from the stack.
4935 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
4936 Register super_klass,
4937 Register tmp1_reg,
4938 Register tmp2_reg,
4939 Label* L_success,
4940 Label* L_failure,
4941 bool set_cond_codes) {
4942 RegSet tmps = RegSet::of(tmp1_reg, tmp2_reg);
4943
4944 assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg);
4945
4946 Label L_fallthrough;
4947 int label_nulls = 0;
4948 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4949 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4950 assert(label_nulls <= 1, "at most one null in the batch");
4951
4952 BLOCK_COMMENT("check_klass_subtype_slow_path");
4953
4954 RegSet caller_save_regs = RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31);
4955 RegSetIterator<Register> available_regs = (caller_save_regs - tmps - sub_klass - super_klass).begin();
4956
4957 RegSet pushed_regs;
4958
4959 tmp1_reg = allocate_if_noreg(tmp1_reg, available_regs, pushed_regs);
4960 tmp2_reg = allocate_if_noreg(tmp2_reg, available_regs, pushed_regs);
4961
4962 Register tmp3_reg = noreg, tmp4_reg = noreg, result_reg = noreg;
4963
4964 tmp3_reg = allocate_if_noreg(tmp3_reg, available_regs, pushed_regs);
4965 tmp4_reg = allocate_if_noreg(tmp4_reg, available_regs, pushed_regs);
4966 result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
4967
4968 push_reg(pushed_regs, sp);
4969
4970 lookup_secondary_supers_table_var(sub_klass,
4971 super_klass,
4972 result_reg,
4973 tmp1_reg, tmp2_reg, tmp3_reg,
4974 tmp4_reg, nullptr);
4975
4976 // Move the result to t1 as we are about to unspill the tmp registers.
4977 mv(t1, result_reg);
4978
4979 // Unspill the tmp. registers:
4980 pop_reg(pushed_regs, sp);
4981
4982 // NB! Callers may assume that, when set_cond_codes is true, this
4983 // code sets tmp2_reg to a nonzero value.
4984 if (set_cond_codes) {
4985 mv(tmp2_reg, 1);
4986 }
4987
4988 bnez(t1, *L_failure);
4989
4990 if (L_success != &L_fallthrough) {
4991 j(*L_success);
4992 }
4993
4994 bind(L_fallthrough);
4995 }
4996
4997 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4998 Register super_klass,
4999 Register tmp1_reg,
5000 Register tmp2_reg,
5001 Label* L_success,
5002 Label* L_failure,
5003 bool set_cond_codes) {
5004 if (UseSecondarySupersTable) {
5005 check_klass_subtype_slow_path_table
5006 (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
5007 } else {
5008 check_klass_subtype_slow_path_linear
5009 (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
5010 }
5011 }
5012
// Ensure that the inline code and the stub are using the same registers
// as we need to call the stub from inline code when there is a collision
// in the hashed lookup in the secondary supers array.
//
// The macro asserts the fixed register assignment:
//   r_super_klass == x10, r_array_base == x11, r_array_length == x12,
// while each of r_array_index (x13), r_sub_klass (x14), result (x15) and
// r_bitmap (x16) must either be the listed register or noreg.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,  \
                                                r_array_index, r_sub_klass, result, r_bitmap) \
do {                                                                                          \
  assert(r_super_klass == x10 &&                                                              \
         r_array_base == x11 &&                                                               \
         r_array_length == x12 &&                                                             \
         (r_array_index == x13 || r_array_index == noreg) &&                                  \
         (r_sub_klass == x14 || r_sub_klass == noreg) &&                                      \
         (result == x15 || result == noreg) &&                                                \
         (r_bitmap == x16 || r_bitmap == noreg), "registers must match riscv.ad");            \
} while(0)
5027
// Emits an inline lookup of r_super_klass in the hashed secondary supers
// table of r_sub_klass, for a super class whose hash slot (super_klass_slot)
// is known while the code is being generated.
//
// In the generated code, result ends up zero when r_super_klass was found and
// nonzero otherwise. The sequence checks the bitmap bit for the slot, probes
// the first candidate array element and, if that misses while the next bitmap
// bit is set, calls the lookup_secondary_supers_table_slow_path stub.
//
// The registers are fixed (see LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS) so
// that they agree with the stub. t0 is used as scratch.
// As written, this function always returns true and does not consult
// stub_is_near.
bool MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
                                                         Register r_super_klass,
                                                         Register result,
                                                         Register tmp1,
                                                         Register tmp2,
                                                         Register tmp3,
                                                         Register tmp4,
                                                         u1 super_klass_slot,
                                                         bool stub_is_near) {
  assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);

  Label L_fallthrough;

  BLOCK_COMMENT("lookup_secondary_supers_table {");

  const Register
    r_array_base   = tmp1, // x11
    r_array_length = tmp2, // x12
    r_array_index  = tmp3, // x13
    r_bitmap       = tmp4; // x16

  LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
                                          r_array_index, r_sub_klass, result, r_bitmap);

  u1 bit = super_klass_slot;

  // Initialize result value to 1 which means mismatch.
  mv(result, 1);

  ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));

  // First check the bitmap to see if super_klass might be present. If
  // the bit is zero, we are certain that super_klass is not one of
  // the secondary supers.
  test_bit(t0, r_bitmap, bit);
  beqz(t0, L_fallthrough);

  // Get the first array index that can contain super_klass into r_array_index.
  if (bit != 0) {
    // Shift out the bitmap bits above 'bit', then count what is left.
    // tmp1/tmp2 (r_array_base/r_array_length) are still free at this point
    // and serve as scratch for the software population count.
    slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - bit));
    population_count(r_array_index, r_array_index, tmp1, tmp2);
  } else {
    // Bit 0 is known to be set (checked above), so the count is exactly 1.
    mv(r_array_index, (u1)1);
  }

  // We will consult the secondary-super array.
  ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));

  // The value i in r_array_index is >= 1, so even though r_array_base
  // points to the length, we don't need to adjust it to point to the data.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
  assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");

  // result = secondary_supers[r_array_index - 1] ^ r_super_klass
  shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
  ld(result, Address(result));
  xorr(result, result, r_super_klass);
  beqz(result, L_fallthrough); // Found a match

  // Is there another entry to check? Consult the bitmap.
  test_bit(t0, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
  beqz(t0, L_fallthrough);

  // Linear probe.
  if (bit != 0) {
    ror(r_bitmap, r_bitmap, bit);
  }

  // The slot we just inspected is at secondary_supers[r_array_index - 1].
  // The next slot to be inspected, by the stub we're about to call,
  // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
  // have been checked.
  rt_call(StubRoutines::lookup_secondary_supers_table_slow_path_stub());

  BLOCK_COMMENT("} lookup_secondary_supers_table");

  bind(L_fallthrough);

  if (VerifySecondarySupers) {
    verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
                                  result, tmp1, tmp2, tmp3);  // x15, x11, x12, x13
  }
  return true;
}
5111
// At runtime, return 0 in result if r_super_klass is a superclass of
// r_sub_klass, otherwise return nonzero. Use this version of
// lookup_secondary_supers_table() if you don't know ahead of time
// which superclass will be searched for. Used by interpreter and
// runtime stubs. It is larger and has somewhat greater latency than
// the version above, which takes a constant super_klass_slot.
//
// If L_success is non-null the generated code branches to it on a match
// (either directly from the first probe, or after the final check of
// result); otherwise the outcome is only reported in result.
// Uses t0 as scratch and t1 for the hash slot; tmp1..tmp4 are clobbered.
void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
                                                       Register r_super_klass,
                                                       Register result,
                                                       Register tmp1,
                                                       Register tmp2,
                                                       Register tmp3,
                                                       Register tmp4,
                                                       Label *L_success) {
  assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);

  Label L_fallthrough;

  BLOCK_COMMENT("lookup_secondary_supers_table {");

  const Register
    r_array_index = tmp3,
    r_bitmap      = tmp4,
    slot          = t1;

  // The hash slot is read from the super klass at runtime.
  lbu(slot, Address(r_super_klass, Klass::hash_slot_offset()));

  // Make sure that result is nonzero if the test below misses.
  mv(result, 1);

  ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));

  // First check the bitmap to see if super_klass might be present. If
  // the bit is zero, we are certain that super_klass is not one of
  // the secondary supers.

  // This next instruction is equivalent to (for a slot value in
  // [0, SECONDARY_SUPERS_TABLE_SIZE - 1]):
  // mv(tmp_reg, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
  // sub(r_array_index, tmp_reg, slot);
  xori(r_array_index, slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
  // Shift the bitmap so that the bit for 'slot' becomes the top bit.
  sll(r_array_index, r_bitmap, r_array_index);
  test_bit(t0, r_array_index, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1);
  beqz(t0, L_fallthrough);

  // Get the first array index that can contain super_klass into r_array_index.
  // tmp1/tmp2 are still free here and serve as scratch.
  population_count(r_array_index, r_array_index, tmp1, tmp2);

  // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.

  const Register
    r_array_base   = tmp1,
    r_array_length = tmp2;

  // The value i in r_array_index is >= 1, so even though r_array_base
  // points to the length, we don't need to adjust it to point to the data.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
  assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");

  // We will consult the secondary-super array.
  ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));

  // result = secondary_supers[r_array_index - 1] ^ r_super_klass
  shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
  ld(result, Address(result));
  xorr(result, result, r_super_klass);
  beqz(result, L_success ? *L_success : L_fallthrough); // Found a match

  // Is there another entry to check? Consult the bitmap.
  ror(r_bitmap, r_bitmap, slot);
  test_bit(t0, r_bitmap, 1);
  beqz(t0, L_fallthrough);

  // The slot we just inspected is at secondary_supers[r_array_index - 1].
  // The next slot to be inspected, by the logic we're about to call,
  // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
  // have been checked.
  lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
                                          r_bitmap, result, r_array_length, false /*is_stub*/);

  BLOCK_COMMENT("} lookup_secondary_supers_table");

  bind(L_fallthrough);

  if (VerifySecondarySupers) {
    verify_secondary_supers_table(r_sub_klass, r_super_klass,
                                  result, tmp1, tmp2, tmp3);
  }

  if (L_success) {
    beqz(result, *L_success);
  }
}
5203
// Called by code generated by check_klass_subtype_slow_path
// above. This is called when there is a collision in the hashed
// lookup in the secondary supers array.
//
// On entry r_array_base points at the array header (its length field) and
// r_array_index names the next slot to inspect. In the generated code result
// becomes 0 when r_super_klass is found and stays 1 otherwise.
// r_array_base, r_array_index, r_bitmap, tmp (used as the array length) and
// t0 are clobbered. When is_stub is true the fixed stub register assignment
// is asserted.
void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
                                                             Register r_array_base,
                                                             Register r_array_index,
                                                             Register r_bitmap,
                                                             Register result,
                                                             Register tmp,
                                                             bool is_stub) {
  assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp, result, t0);

  const Register
    r_array_length = tmp,
    r_sub_klass    = noreg; // unused

  if (is_stub) {
    LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
                                            r_array_index, r_sub_klass, result, r_bitmap);
  }

  Label L_matched, L_fallthrough, L_bitmap_full;

  // Initialize result value to 1 which means mismatch.
  mv(result, 1);

  // Load the array length.
  lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
  // And adjust the array base to point to the data.
  // NB! Effectively increments current slot index by 1.
  assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
  addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());

  // Check if bitmap is SECONDARY_SUPERS_BITMAP_FULL
  // (taken when the array length exceeds SECONDARY_SUPERS_TABLE_SIZE - 2).
  assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
  subw(t0, r_array_length, Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
  bgtz(t0, L_bitmap_full);

  // NB! Our caller has checked bits 0 and 1 in the bitmap. The
  // current slot (at secondary_supers[r_array_index]) has not yet
  // been inspected, and r_array_index may be out of bounds if we
  // wrapped around the end of the array.

  { // This is conventional linear probing, but instead of terminating
    // when a null entry is found in the table, we maintain a bitmap
    // in which a 0 indicates missing entries.
    // As long as the bitmap is not completely full,
    // array_length == popcount(bitmap). The array_length check above
    // guarantees there are 0s in the bitmap, so the loop eventually
    // terminates.
    Label L_loop;
    bind(L_loop);

    // Check for wraparound.
    Label skip;
    blt(r_array_index, r_array_length, skip);
    mv(r_array_index, zr);
    bind(skip);

    // t0 = secondary_supers[r_array_index]
    shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
    ld(t0, Address(t0));
    beq(t0, r_super_klass, L_matched);

    test_bit(t0, r_bitmap, 2);  // look-ahead check (Bit 2); result is non-zero
    beqz(t0, L_fallthrough);

    // Advance to the next slot: rotate the bitmap and bump the index.
    ror(r_bitmap, r_bitmap, 1);
    addi(r_array_index, r_array_index, 1);
    j(L_loop);
  }

  { // Degenerate case: more than 64 secondary supers.
    // FIXME: We could do something smarter here, maybe a vectorized
    // comparison or a binary search, but is that worth any added
    // complexity?
    bind(L_bitmap_full);
    repne_scan(r_array_base, r_super_klass, r_array_length, t0);
    // After the scan t0 holds the last word loaded; unequal means not found.
    bne(r_super_klass, t0, L_fallthrough);
  }

  bind(L_matched);
  mv(result, zr);

  bind(L_fallthrough);
}
5289
5290 // Make sure that the hashed lookup and a linear scan agree.
5291 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
5292 Register r_super_klass,
5293 Register result,
5294 Register tmp1,
5295 Register tmp2,
5296 Register tmp3) {
5297 assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0, t1);
5298
5299 const Register
5300 r_array_base = tmp1, // X11
5301 r_array_length = tmp2, // X12
5302 r_array_index = noreg, // unused
5303 r_bitmap = noreg; // unused
5304
5305 BLOCK_COMMENT("verify_secondary_supers_table {");
5306
5307 // We will consult the secondary-super array.
5308 ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
5309
5310 // Load the array length.
5311 lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
5312 // And adjust the array base to point to the data.
5313 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
5314
5315 repne_scan(r_array_base, r_super_klass, r_array_length, t0);
5316 Label failed;
5317 mv(tmp3, 1);
5318 bne(r_super_klass, t0, failed);
5319 mv(tmp3, zr);
5320 bind(failed);
5321
5322 snez(result, result); // normalize result to 0/1 for comparison
5323
5324 Label passed;
5325 beq(tmp3, result, passed);
5326 {
5327 mv(x10, r_super_klass);
5328 mv(x11, r_sub_klass);
5329 mv(x12, tmp3);
5330 mv(x13, result);
5331 mv(x14, (address)("mismatch"));
5332 rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
5333 should_not_reach_here();
5334 }
5335 bind(passed);
5336
5337 BLOCK_COMMENT("} verify_secondary_supers_table");
5338 }
5339
// Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
// Thin wrapper: the actual code is emitted by the BarrierSetAssembler of the
// active barrier set, to which all arguments are forwarded unchanged.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register tmp1,
                                   Register tmp2,
                                   Label& slow_case,
                                   bool is_far) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
}
5351
5352 // get_thread() can be called anywhere inside generated code so we
5353 // need to save whatever non-callee save context might get clobbered
5354 // by the call to Thread::current() or, indeed, the call setup code.
5355 void MacroAssembler::get_thread(Register thread) {
5356 // save all call-clobbered regs except thread
5357 RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
5358 RegSet::range(x28, x31) + ra - thread;
5359 push_reg(saved_regs, sp);
5360
5361 mv(t1, CAST_FROM_FN_PTR(address, Thread::current));
5362 jalr(t1);
5363 if (thread != c_rarg0) {
5364 mv(thread, c_rarg0);
5365 }
5366
5367 // restore pushed registers
5368 pop_reg(saved_regs, sp);
5369 }
5370
5371 void MacroAssembler::load_byte_map_base(Register reg) {
5372 CardTableBarrierSet* ctbs = CardTableBarrierSet::barrier_set();
5373 // Strictly speaking the card table base isn't an address at all, and it might
5374 // even be negative. It is thus materialised as a constant.
5375 mv(reg, (uint64_t)ctbs->card_table_base_const());
5376 }
5377
5378 void MacroAssembler::build_frame(int framesize) {
5379 assert(framesize >= 2, "framesize must include space for FP/RA");
5380 assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5381 sub(sp, sp, framesize);
5382 sd(fp, Address(sp, framesize - 2 * wordSize));
5383 sd(ra, Address(sp, framesize - wordSize));
5384 if (PreserveFramePointer) { add(fp, sp, framesize); }
5385 }
5386
5387 void MacroAssembler::remove_frame(int framesize) {
5388 assert(framesize >= 2, "framesize must include space for FP/RA");
5389 assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5390 ld(fp, Address(sp, framesize - 2 * wordSize));
5391 ld(ra, Address(sp, framesize - wordSize));
5392 add(sp, sp, framesize);
5393 }
5394
// Variant of remove_frame() taking a stack-repair flag. Stack repair is not
// implemented here: needs_stack_repair must be false, and the call simply
// forwards to remove_frame(initial_framesize).
void MacroAssembler::remove_frame(int initial_framesize, bool needs_stack_repair) {
  assert(!needs_stack_repair, "unimplemented");
  remove_frame(initial_framesize);
}
5399
#ifdef COMPILER2
// C2 compiled method's prolog code.
// Moved here from riscv.ad to support the Valhalla code below.
// Emits, in order: the optional class-initialization barrier, the optional
// stack bang, and the frame setup. sp_inc is not consulted here and stack
// repair is not implemented.
void MacroAssembler::verified_entry(Compile* C, int sp_inc) {
  if (C->clinit_barrier_on_entry()) {
    assert(!C->method()->holder()->is_not_initialized(), "initialization should have been started");

    // Unless the barrier branches past it, control reaches the jump to the
    // handle-wrong-method stub.
    Label L_barrier_passed;

    mov_metadata(t1, C->method()->holder()->constant_encoding());
    clinit_barrier(t1, t0, &L_barrier_passed);
    far_jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub()));
    bind(L_barrier_passed);
  }

  const int bang_bytes = C->output()->bang_size_in_bytes();
  if (C->output()->need_stack_bang(bang_bytes)) {
    generate_stack_overflow_check(bang_bytes);
  }

  // n.b. the frame size includes space for return pc and fp
  const long frame_bytes = C->output()->frame_size_in_bytes();
  build_frame(frame_bytes);

  assert(!C->needs_stack_repair(), "unimplemented");
}
#endif // COMPILER2
5427
// Move a value between registers/stack slots and update the reg_state.
// Placeholder: this implementation only calls Unimplemented(); the trailing
// return merely satisfies the signature.
bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]) {
  Unimplemented();
  return false;
}
5433
// Read all fields from an inline type oop and store the values in registers/stack slots.
// Placeholder: this implementation only calls Unimplemented(); the trailing
// return merely satisfies the signature.
bool MacroAssembler::unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
                                          VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
                                          RegState reg_state[]) {

  Unimplemented();
  return false;
}
5442
// Pack fields back into an inline type oop.
// Placeholder: this implementation only calls Unimplemented(); the trailing
// return merely satisfies the signature.
bool MacroAssembler::pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
                                        VMRegPair* from, int from_count, int& from_index, VMReg to,
                                        RegState reg_state[], Register val_array) {
  Unimplemented();
  return false;
}
5450
5451 // Calculate the extra stack space required for packing or unpacking inline
5452 // args and adjust the stack pointer
5453 int MacroAssembler::extend_stack_for_inline_args(int args_on_stack) {
5454 Unimplemented();
5455 return false;
5456 }
5457
// Placeholder: this implementation only calls Unimplemented(); returning the
// argument merely satisfies the signature.
VMReg MacroAssembler::spill_reg_for(VMReg reg) {
  Unimplemented();
  return reg;
}
5462
5463 void MacroAssembler::reserved_stack_check() {
5464 // testing if reserved zone needs to be enabled
5465 Label no_reserved_zone_enabling;
5466
5467 ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
5468 bltu(sp, t0, no_reserved_zone_enabling);
5469
5470 enter(); // RA and FP are live.
5471 mv(c_rarg0, xthread);
5472 rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
5473 leave();
5474
5475 // We have already removed our own frame.
5476 // throw_delayed_StackOverflowError will think that it's been
5477 // called by our caller.
5478 j(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
5479 should_not_reach_here();
5480
5481 bind(no_reserved_zone_enabling);
5482 }
5483
// Move the address of the polling page into dest.
// The address is loaded from the current thread (xthread) at
// JavaThread::polling_page_offset(); rtype is not used by this implementation.
void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
  ld(dest, Address(xthread, JavaThread::polling_page_offset()));
}
5488
// Read the polling page. The address of the polling page must
// already be in r.
// Emits a 32-bit load from [r + offset] into zr (the loaded value is
// discarded) and tags that instruction with a relocation of type rtype.
void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
  relocate(rtype, [&] {
    lwu(zr, Address(r, offset));
  });
}
5496
5497 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5498 #ifdef ASSERT
5499 {
5500 ThreadInVMfromUnknown tiv;
5501 assert (UseCompressedOops, "should only be used for compressed oops");
5502 assert (Universe::heap() != nullptr, "java heap should be initialized");
5503 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5504 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
5505 }
5506 #endif
5507 int oop_index = oop_recorder()->find_index(obj);
5508 relocate(oop_Relocation::spec(oop_index), [&] {
5509 li32(dst, 0xDEADBEEF);
5510 });
5511 zext(dst, dst, 32);
5512 }
5513
5514 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5515 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5516 int index = oop_recorder()->find_index(k);
5517
5518 narrowKlass nk = CompressedKlassPointers::encode(k);
5519 relocate(metadata_Relocation::spec(index), [&] {
5520 li32(dst, nk);
5521 });
5522 zext(dst, dst, 32);
5523 }
5524
5525 address MacroAssembler::reloc_call(Address entry, Register tmp) {
5526 assert(entry.rspec().type() == relocInfo::runtime_call_type ||
5527 entry.rspec().type() == relocInfo::opt_virtual_call_type ||
5528 entry.rspec().type() == relocInfo::static_call_type ||
5529 entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
5530
5531 address target = entry.target();
5532
5533 if (!in_scratch_emit_size()) {
5534 address stub = emit_reloc_call_address_stub(offset(), target);
5535 if (stub == nullptr) {
5536 postcond(pc() == badAddress);
5537 return nullptr; // CodeCache is full
5538 }
5539 }
5540
5541 address call_pc = pc();
5542 #ifdef ASSERT
5543 if (entry.rspec().type() != relocInfo::runtime_call_type) {
5544 assert_alignment(call_pc);
5545 }
5546 #endif
5547
5548 // The relocation created while emitting the stub will ensure this
5549 // call instruction is subsequently patched to call the stub.
5550 relocate(entry.rspec(), [&] {
5551 auipc(tmp, 0);
5552 ld(tmp, Address(tmp, 0));
5553 jalr(tmp);
5554 });
5555
5556 postcond(pc() != badAddress);
5557 return call_pc;
5558 }
5559
5560 address MacroAssembler::ic_call(address entry, jint method_index) {
5561 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
5562 assert(!in_compressible_scope(), "Must be");
5563 movptr(t0, (address)Universe::non_oop_word(), t1);
5564 assert_cond(entry != nullptr);
5565 return reloc_call(Address(entry, rh));
5566 }
5567
5568 int MacroAssembler::ic_check_size() {
5569 // No compressed
5570 return (MacroAssembler::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
5571 far_branch_size() + (UseCompactObjectHeaders ? MacroAssembler::instruction_size * 1 : 0);
5572 }
5573
5574 int MacroAssembler::ic_check(int end_alignment) {
5575 IncompressibleScope scope(this);
5576 Register receiver = j_rarg0;
5577 Register data = t0;
5578
5579 Register tmp1 = t1; // scratch
5580 // t2 is saved on call, thus should have been saved before this check.
5581 // Hence we can clobber it.
5582 Register tmp2 = t2;
5583
5584 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
5585 // before the inline cache check, so we don't have to execute any nop instructions when dispatching
5586 // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
5587 // before the inline cache check here, and not after
5588 align(end_alignment, ic_check_size());
5589 int uep_offset = offset();
5590
5591 if (UseCompactObjectHeaders) {
5592 load_narrow_klass_compact(tmp1, receiver);
5593 lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5594 } else {
5595 lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
5596 lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5597 }
5598
5599 Label ic_hit;
5600 beq(tmp1, tmp2, ic_hit);
5601 // Note, far_jump is not fixed size.
5602 // Is this ever generates a movptr alignment/size will be off.
5603 far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
5604 bind(ic_hit);
5605
5606 assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
5607 return uep_offset;
5608 }
5609
5610 // Emit an address stub for a call to a target which is too far away.
5611 // Note that we only put the target address of the call in the stub.
5612 //
5613 // code sequences:
5614 //
5615 // call-site:
5616 // load target address from stub
5617 // jump-and-link target address
5618 //
5619 // Related address stub for this call site in the stub section:
5620 // alignment nop
5621 // target address
5622
5623 address MacroAssembler::emit_reloc_call_address_stub(int insts_call_instruction_offset, address dest) {
5624 address stub = start_a_stub(max_reloc_call_address_stub_size());
5625 if (stub == nullptr) {
5626 return nullptr; // CodeBuffer::expand failed
5627 }
5628
5629 // We are always 4-byte aligned here.
5630 assert_alignment(pc());
5631
5632 // Make sure the address of destination 8-byte aligned.
5633 align(wordSize, 0);
5634
5635 RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
5636 insts_call_instruction_offset);
5637 const int stub_start_offset = offset();
5638 relocate(rh, [&] {
5639 assert(offset() - stub_start_offset == 0,
5640 "%ld - %ld == %ld : should be", (long)offset(), (long)stub_start_offset, (long)0);
5641 assert(offset() % wordSize == 0, "bad alignment");
5642 emit_int64((int64_t)dest);
5643 });
5644
5645 const address stub_start_addr = addr_at(stub_start_offset);
5646 end_a_stub();
5647
5648 return stub_start_addr;
5649 }
5650
5651 int MacroAssembler::max_reloc_call_address_stub_size() {
5652 // Max stub size: alignment nop, target address.
5653 return 1 * MacroAssembler::instruction_size + wordSize;
5654 }
5655
5656 int MacroAssembler::static_call_stub_size() {
5657 // (lui, addi, slli, addi, slli, addi) + (lui + lui + slli + add) + jalr
5658 return 11 * MacroAssembler::instruction_size;
5659 }
5660
5661 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
5662 switch (dst.getMode()) {
5663 case Address::base_plus_offset:
5664 // This is the expected mode, although we allow all the other
5665 // forms below.
5666 return form_address(tmp, dst.base(), dst.offset());
5667 default:
5668 la(tmp, dst);
5669 return Address(tmp);
5670 }
5671 }
5672
// Add value to the 64-bit memory location dst (ld / add / sd).
//
// tmp1 holds the loaded and updated value. tmp2 is handed to
// add_memory_helper() for forming the address and is also passed to add() as
// its temp register; the first assert requires that either dst is a
// base_plus_offset address with a simm12 offset or value is simm12
// (presumably so that tmp2 is never needed for both purposes - confirm
// against form_address()/add()).
void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
  assert(((dst.getMode() == Address::base_plus_offset &&
           is_simm12(dst.offset())) || is_simm12(value)),
          "invalid value and address mode combination");
  Address adr = add_memory_helper(dst, tmp2);
  // tmp1 is about to be overwritten by the load, so the address must not depend on it.
  assert(!adr.uses(tmp1), "invalid dst for address increment");
  ld(tmp1, adr);
  add(tmp1, tmp1, value, tmp2);
  sd(tmp1, adr);
}
5683
// Add value to the 32-bit memory location dst (lwu / addw / sw).
//
// tmp1 holds the loaded and updated value. tmp2 is handed to
// add_memory_helper() for forming the address and is also passed to addw() as
// its temp register; the first assert requires that either dst is a
// base_plus_offset address with a simm12 offset or value is simm12.
void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
  assert(((dst.getMode() == Address::base_plus_offset &&
           is_simm12(dst.offset())) || is_simm12(value)),
          "invalid value and address mode combination");
  Address adr = add_memory_helper(dst, tmp2);
  // tmp1 is about to be overwritten by the load, so the address must not depend on it.
  assert(!adr.uses(tmp1), "invalid dst for address increment");
  lwu(tmp1, adr);
  addw(tmp1, tmp1, value, tmp2);
  sw(tmp1, adr);
}
5694
// Subtract value from the 64-bit memory location dst (ld / sub / sd).
//
// tmp1 holds the loaded and updated value. tmp2 is handed to
// add_memory_helper() for forming the address and is also passed to sub() as
// its temp register; the first assert requires that either dst is a
// base_plus_offset address with a simm12 offset or value is simm12.
void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
  assert(((dst.getMode() == Address::base_plus_offset &&
           is_simm12(dst.offset())) || is_simm12(value)),
          "invalid value and address mode combination");
  Address adr = add_memory_helper(dst, tmp2);
  // tmp1 is about to be overwritten by the load, so the address must not depend on it.
  assert(!adr.uses(tmp1), "invalid dst for address decrement");
  ld(tmp1, adr);
  sub(tmp1, tmp1, value, tmp2);
  sd(tmp1, adr);
}
5705
// Subtract value from the 32-bit memory location dst (lwu / subw / sw).
//
// tmp1 holds the loaded and updated value. tmp2 is handed to
// add_memory_helper() for forming the address and is also passed to subw() as
// its temp register; the first assert requires that either dst is a
// base_plus_offset address with a simm12 offset or value is simm12.
void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
  assert(((dst.getMode() == Address::base_plus_offset &&
           is_simm12(dst.offset())) || is_simm12(value)),
          "invalid value and address mode combination");
  Address adr = add_memory_helper(dst, tmp2);
  // tmp1 is about to be overwritten by the load, so the address must not depend on it.
  assert(!adr.uses(tmp1), "invalid dst for address decrement");
  lwu(tmp1, adr);
  subw(tmp1, tmp1, value, tmp2);
  sw(tmp1, adr);
}
5716
5717 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
5718 load_method_holder(result, method);
5719 ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
5720 }
5721
5722 void MacroAssembler::load_method_holder(Register holder, Register method) {
5723 ld(holder, Address(method, Method::const_offset())); // ConstMethod*
5724 ld(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool*
5725 ld(holder, Address(holder, ConstantPool::pool_holder_offset())); // InstanceKlass*
5726 }
5727
5728 void MacroAssembler::load_metadata(Register dst, Register src) {
5729 if (UseCompactObjectHeaders) {
5730 load_narrow_klass_compact(dst, src);
5731 } else {
5732 lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
5733 }
5734 }
5735
5736 // string indexof
5737 // compute index by trailing zeros
5738 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
5739 Register match_mask, Register result,
5740 Register ch2, Register tmp,
5741 bool haystack_isL) {
5742 int haystack_chr_shift = haystack_isL ? 0 : 1;
5743 srl(match_mask, match_mask, trailing_zeros);
5744 srli(match_mask, match_mask, 1);
5745 srli(tmp, trailing_zeros, LogBitsPerByte);
5746 if (!haystack_isL) andi(tmp, tmp, 0xE);
5747 add(haystack, haystack, tmp);
5748 ld(ch2, Address(haystack));
5749 if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
5750 add(result, result, tmp);
5751 }
5752
// string indexof
// Find pattern element in src, compute match mask,
// only the first occurrence of 0x80/0x8000 at low bits is the valid match index
// match mask patterns and corresponding indices would be like:
// - 0x8080808080808080 (Latin1)
// - 7 6 5 4 3 2 1 0    (match index)
// - 0x8000800080008000 (UTF16)
// - 3 2 1 0            (match index)
//
// Computes match_mask = ((src ^ pattern) - mask1) & ~((src ^ pattern) | mask2).
// src is clobbered. mask1 and mask2 are supplied by the caller; presumably
// they are the per-element 0x01.../0x7f... constants of the usual
// "has zero element" trick - confirm at the call sites.
void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
                                        Register mask1, Register mask2) {
  xorr(src, pattern, src);            // elements equal to the pattern become zero
  sub(match_mask, src, mask1);
  orr(src, src, mask2);
  notr(src, src);
  andr(match_mask, match_mask, src);
}
5769
5770 #ifdef COMPILER2
5771 // Code for BigInteger::mulAdd intrinsic
5772 // out = x10
5773 // in = x11
5774 // offset = x12 (already out.length-offset)
5775 // len = x13
5776 // k = x14
5777 // tmp = x28
5778 //
5779 // pseudo code from java implementation:
5780 // long kLong = k & LONG_MASK;
5781 // carry = 0;
5782 // offset = out.length-offset - 1;
5783 // for (int j = len - 1; j >= 0; j--) {
5784 // product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
5785 // out[offset--] = (int)product;
5786 // carry = product >>> 32;
5787 // }
5788 // return (int)carry;
5789 void MacroAssembler::mul_add(Register out, Register in, Register offset,
5790 Register len, Register k, Register tmp) {
5791 Label L_tail_loop, L_unroll, L_end;
5792 mv(tmp, out);
5793 mv(out, zr);
5794 blez(len, L_end);
5795 zext(k, k, 32);
5796 slliw(t0, offset, LogBytesPerInt);
5797 add(offset, tmp, t0);
5798 slliw(t0, len, LogBytesPerInt);
5799 add(in, in, t0);
5800
5801 const int unroll = 8;
5802 mv(tmp, unroll);
5803 blt(len, tmp, L_tail_loop);
5804 bind(L_unroll);
5805 for (int i = 0; i < unroll; i++) {
5806 subi(in, in, BytesPerInt);
5807 lwu(t0, Address(in, 0));
5808 mul(t1, t0, k);
5809 add(t0, t1, out);
5810 subi(offset, offset, BytesPerInt);
5811 lwu(t1, Address(offset, 0));
5812 add(t0, t0, t1);
5813 sw(t0, Address(offset, 0));
5814 srli(out, t0, 32);
5815 }
5816 subw(len, len, tmp);
5817 bge(len, tmp, L_unroll);
5818
5819 bind(L_tail_loop);
5820 blez(len, L_end);
5821 subi(in, in, BytesPerInt);
5822 lwu(t0, Address(in, 0));
5823 mul(t1, t0, k);
5824 add(t0, t1, out);
5825 subi(offset, offset, BytesPerInt);
5826 lwu(t1, Address(offset, 0));
5827 add(t0, t0, t1);
5828 sw(t0, Address(offset, 0));
5829 srli(out, t0, 32);
5830 subiw(len, len, 1);
5831 j(L_tail_loop);
5832
5833 bind(L_end);
5834 }
5835
// Multiply and multiply-accumulate unsigned 64-bit registers.
//
// wide_mul: prod_hi:prod_lo = n * m (unsigned 64 x 64 -> 128 bit).
// prod_lo is written first and n and m are read again by the mulhu, so
// prod_lo presumably must not alias n or m; only prod_lo != prod_hi is
// asserted here.
void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
  assert_different_registers(prod_lo, prod_hi);

  mul(prod_lo, n, m);
  mulhu(prod_hi, n, m);
}
5843
// Multiply-accumulate: sum_hi:sum_lo += n * m, with n * m computed as an
// unsigned 128-bit product.
// tmp1 and tmp2 receive the low and high half of the product; tmp1 is then
// reused to hold the carry out of the low-half addition.
void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
                               Register m, Register tmp1, Register tmp2) {
  assert_different_registers(sum_lo, sum_hi);
  assert_different_registers(sum_hi, tmp2);

  wide_mul(tmp1, tmp2, n, m);
  cad(sum_lo, sum_lo, tmp1, tmp1);  // Add tmp1 to sum_lo with carry output to tmp1
  adc(sum_hi, sum_hi, tmp2, tmp1);  // Add tmp2 with carry to sum_hi
}
5853
// add two unsigned input and output carry
//
// dst = src1 + src2; carry = (dst < src2) as an unsigned compare, i.e. 1 when
// the addition wrapped around and 0 otherwise.
// dst must differ from carry and from src2, because src2 is read again after
// dst has been written. carry may be the same register as src2.
void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
{
  assert_different_registers(dst, carry);
  assert_different_registers(dst, src2);
  add(dst, src1, src2);
  sltu(carry, dst, src2);
}
5862
// add two input with carry
//
// dst = src1 + src2 + carry. No carry-out is produced.
// dst must differ from carry, because carry is read after dst has been written.
void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
  assert_different_registers(dst, carry);
  add(dst, src1, src2);
  add(dst, dst, carry);
}
5869
// add two unsigned input with carry and output carry
//
// dst = src1 + src2 + carry (via adc), then carry = (dst < src2) as an
// unsigned compare.
// dst must differ from src2 (asserted here) and from carry (asserted in adc).
void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
  assert_different_registers(dst, src2);
  adc(dst, src1, src2, carry);
  sltu(carry, dst, src2);
}
5876
// Add src1 and src2 to the 128-bit value dest_hi:dest_lo.
// dest_lo receives the low 64 bits; the carry of each of the two additions is
// added to the high half, and the final high half is written to
// final_dest_hi (dest_hi itself is updated by the first carry only).
// carry is a scratch register for the two carry bits.
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2, Register carry) {
  cad(dest_lo, dest_lo, src1, carry);
  add(dest_hi, dest_hi, carry);
  cad(dest_lo, dest_lo, src2, carry);
  add(final_dest_hi, dest_hi, carry);
}
5884
/**
 * Multiply 64 bit by 64 bit first loop.
 *
 * Loads one chunk of x into x_xstart (64 bits, or a single 32-bit word when
 * only one int is left) and multiplies it with y, walking y from the end
 * with idx and storing the low 64 bits of every step into z at kdx. The high
 * 64 bits are carried over in carry.
 *
 * xstart, idx and kdx are updated in place; product and y_idx are scratch.
 * t0 and t1 are clobbered.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // xstart was already decremented once by the caller; a negative value after
  // this second decrement means only a single int of x is available.
  subiw(xstart, xstart, 1);
  bltz(xstart, L_one_x);

  // t0 = x + (xstart << LogBytesPerInt): address of the 64-bit chunk.
  shadd(t0, xstart, x, t0, LogBytesPerInt);
  ld(x_xstart, Address(t0, 0));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  // idx is decremented twice per iteration (two ints per 64-bit chunk):
  // negative after the first decrement -> y is exhausted,
  // negative after the second          -> a single int of y is left.
  subiw(idx, idx, 1);
  bltz(idx, L_first_loop_exit);
  subiw(idx, idx, 1);
  bltz(idx, L_one_y);

  shadd(t0, idx, y, t0, LogBytesPerInt);
  ld(y_idx, Address(t0, 0));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // carry:product = x_xstart * y_idx + carry
  mulhu(t0, x_xstart, y_idx);
  mul(product, x_xstart, y_idx);
  cad(product, product, carry, t1);
  adc(carry, t0, zr, t1);

  subiw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  shadd(t0, kdx, z, t0, LogBytesPerInt);
  sd(product, Address(t0, 0));

  j(L_first_loop);

  // Out-of-line: only the first int of y is left.
  bind(L_one_y);
  lwu(y_idx, Address(y, 0));
  j(L_multiply);

  // Out-of-line: only the first int of x is available.
  bind(L_one_x);
  lwu(x_xstart, Address(x, 0));
  j(L_first_loop);

  bind(L_first_loop_exit);
}
5945
/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
 *
 * Multiplies product_hi with the ints of y below idx and accumulates the
 * result into z, four ints (two 64-bit chunks) per iteration of the main
 * loop, followed by a remainder of at most one 64-bit chunk plus one int.
 * carry is the running carry on entry and on exit.
 *
 * idx and jdx are updated in place; carry2, yz_idx1, yz_idx2, tmp, tmp3,
 * tmp4 and tmp6 are scratch. t0 and t1 are clobbered.
 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {
  //   jlong carry, x[], y[], z[];
  //   int kdx = xstart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = number of full four-int groups.
  srliw(jdx, idx, 2);

  bind(L_third_loop);

  subw(jdx, jdx, 1);
  bltz(jdx, L_third_loop_exit);
  subw(idx, idx, 4);

  // Load two 64-bit chunks of y; yz_idx1 is the one at the higher address.
  shadd(t0, idx, y, t0, LogBytesPerInt);
  ld(yz_idx2, Address(t0, 0));
  ld(yz_idx1, Address(t0, wordSize));

  // tmp6 = address of the corresponding two chunks in z.
  shadd(tmp6, idx, z, t0, LogBytesPerInt);

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ld(t1, Address(tmp6, 0));
  ld(t0, Address(tmp6, wordSize));

  mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
  mulhu(tmp4, product_hi, yz_idx1);

  ror(t0, t0, 32, tmp); // convert big-endian to little-endian
  ror(t1, t1, 32, tmp);

  mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp
  mulhu(carry2, product_hi, yz_idx2);

  // Add up the incoming carry, both products and both z chunks; the low
  // result ends up in tmp3, the middle one in tmp4 and the new carry in carry.
  cad(tmp3, tmp3, carry, carry);
  adc(tmp4, tmp4, zr, carry);
  cad(tmp3, tmp3, t0, t0);
  cadc(tmp4, tmp4, tmp, t0);
  adc(carry, carry2, zr, t0);
  cad(tmp4, tmp4, t1, carry2);
  adc(carry, carry, zr, carry2);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  sd(tmp4, Address(tmp6, 0));
  sd(tmp3, Address(tmp6, wordSize));

  j(L_third_loop);

  bind(L_third_loop_exit);

  // Remainder: up to three ints are left.
  andi(idx, idx, 0x3);
  beqz(idx, L_post_third_loop_done);

  Label L_check_1;
  subiw(idx, idx, 2);
  bltz(idx, L_check_1);

  // At least two ints are left: handle one more 64-bit chunk.
  shadd(t0, idx, y, t0, LogBytesPerInt);
  ld(yz_idx1, Address(t0, 0));
  ror(yz_idx1, yz_idx1, 32);

  mul(tmp3, product_hi, yz_idx1); //  yz_idx1 * product_hi -> tmp4:tmp3
  mulhu(tmp4, product_hi, yz_idx1);

  shadd(t0, idx, z, t0, LogBytesPerInt);
  ld(yz_idx2, Address(t0, 0));
  ror(yz_idx2, yz_idx2, 32, tmp);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);

  ror(tmp3, tmp3, 32, tmp);
  sd(tmp3, Address(t0, 0));

  bind(L_check_1);

  // One int may still be left.
  andi(idx, idx, 0x1);
  subiw(idx, idx, 1);
  bltz(idx, L_post_third_loop_done);
  shadd(t0, idx, y, t0, LogBytesPerInt);
  lwu(tmp4, Address(t0, 0));
  mul(tmp3, tmp4, product_hi); // tmp4 * product_hi -> carry2:tmp3
  mulhu(carry2, tmp4, product_hi);

  shadd(t0, idx, z, t0, LogBytesPerInt);
  lwu(tmp4, Address(t0, 0));

  add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);

  // t0 was used as scratch by add2_with_carry, so the address is recomputed.
  shadd(t0, idx, z, t0, LogBytesPerInt);
  sw(tmp3, Address(t0, 0));

  // Only 32 bits were stored: the new carry is carry2:tmp3 shifted right by 32.
  slli(t0, carry2, 32);
  srli(carry, tmp3, 32);
  orr(carry, carry, t0);

  bind(L_post_third_loop_done);
}
6069
/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * x10: x
 * x11: xlen
 * x12: y
 * x13: ylen
 * x14: z
 * x15: tmp0
 * x16: tmp1
 * x17: tmp2
 * x7:  tmp3
 * x28: tmp4
 * x29: tmp5
 * x30: tmp6
 * x31: tmp7
 *
 * The last parameter (product_hi) holds the current chunk of x in the second
 * loop. Several arguments are reused under other names below, and xlen, x
 * and ylen are passed on as scratch registers to the helper loops (x, ylen
 * and z are saved on the stack around the inner loop). t0 and t1 are
 * clobbered.
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register tmp0,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {
  assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  // Register aliases; note that idx and jdx share tmp1 and product shares xlen.
  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = xlen;
  const Register x_xstart = tmp0;
  const Register jdx = tmp1;

  mv(idx, ylen);         // idx = ylen;
  addw(kdx, xlen, ylen); // kdx = xlen+ylen;
  mv(carry, zr);         // carry = 0;

  Label L_done;
  subiw(xstart, xlen, 1);
  bltz(xstart, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  // Store the carry left over from the first loop into the next lower
  // position(s) of z: one or two ints, depending on how many remain below kdx.
  Label L_second_loop_aligned;
  beqz(kdx, L_second_loop_aligned);

  Label L_carry;
  subiw(kdx, kdx, 1);
  beqz(kdx, L_carry);

  shadd(t0, kdx, z, t0, LogBytesPerInt);
  sw(carry, Address(t0, 0));
  srli(carry, carry, 32);
  subiw(kdx, kdx, 1);

  bind(L_carry);
  shadd(t0, kdx, z, t0, LogBytesPerInt);
  sw(carry, Address(t0, 0));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  bind(L_second_loop_aligned);
  mv(carry, zr); // carry = 0;
  mv(jdx, ylen); // j = ystart+1

  subiw(xstart, xstart, 1); // i = xstart-1;
  bltz(xstart, L_done);

  // Four stack slots: z, ylen, x and xstart are saved across the inner loop.
  subi(sp, sp, 4 * wordSize);
  sd(z, Address(sp, 0));

  Label L_last_x;
  // z is temporarily re-based to z + (xstart << LogBytesPerInt) + 4 for the inner loop.
  shadd(t0, xstart, z, t0, LogBytesPerInt);
  addi(z, t0, 4);
  subiw(xstart, xstart, 1); // i = xstart-1;
  bltz(xstart, L_last_x);

  shadd(t0, xstart, x, t0, LogBytesPerInt);
  ld(product_hi, Address(t0, 0));
  ror(product_hi, product_hi, 32); // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  sd(ylen, Address(sp, wordSize));
  sd(x, Address(sp, 2 * wordSize));
  sd(xstart, Address(sp, 3 * wordSize));
  // x and ylen serve as carry2 and jdx scratch registers inside the inner
  // loop, which is why they were saved above.
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ld(z, Address(sp, 0));
  ld(ylen, Address(sp, wordSize));
  ld(x, Address(sp, 2 * wordSize));
  ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
  addi(sp, sp, 4 * wordSize);

  // Store the low int of the carry at z[xstart + 1] ...
  addiw(tmp3, xlen, 1);
  shadd(t0, tmp3, z, t0, LogBytesPerInt);
  sw(carry, Address(t0, 0));

  subiw(tmp3, tmp3, 1);
  bltz(tmp3, L_done);

  // ... and, if there is room, the high int at z[xstart].
  srli(carry, carry, 32);
  shadd(t0, tmp3, z, t0, LogBytesPerInt);
  sw(carry, Address(t0, 0));
  j(L_second_loop_aligned);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  lwu(product_hi, Address(x, 0));
  j(L_third_loop_prologue);

  bind(L_done);
}
6197 #endif
6198
6199 // Count bits of trailing zero chars from lsb to msb until first non-zero
6200 // char seen. For the LL case, shift 8 bits once as there is only one byte
6201 // per each char. For other cases, shift 16 bits once.
6202 void MacroAssembler::ctzc_bits(Register Rd, Register Rs, bool isLL,
6203 Register tmp1, Register tmp2) {
6204 int step = isLL ? 8 : 16;
6205 if (UseZbb) {
6206 ctz(Rd, Rs);
6207 andi(Rd, Rd, -step);
6208 return;
6209 }
6210
6211 assert_different_registers(Rd, tmp1, tmp2);
6212 Label Loop;
6213 mv(tmp2, Rs);
6214 mv(Rd, -step);
6215
6216 bind(Loop);
6217 addi(Rd, Rd, step);
6218 zext(tmp1, tmp2, step);
6219 srli(tmp2, tmp2, step);
6220 beqz(tmp1, Loop);
6221 }
6222
6223 // This instruction reads adjacent 4 bytes from the lower half of source register,
6224 // inflate into a register, for example:
6225 // Rs: A7A6A5A4A3A2A1A0
6226 // Rd: 00A300A200A100A0
6227 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
6228 assert_different_registers(Rd, Rs, tmp1, tmp2);
6229
6230 mv(tmp1, 0xFF000000); // first byte mask at lower word
6231 andr(Rd, Rs, tmp1);
6232 for (int i = 0; i < 2; i++) {
6233 slli(Rd, Rd, wordSize);
6234 srli(tmp1, tmp1, wordSize);
6235 andr(tmp2, Rs, tmp1);
6236 orr(Rd, Rd, tmp2);
6237 }
6238 slli(Rd, Rd, wordSize);
6239 zext(tmp2, Rs, 8); // last byte mask at lower word
6240 orr(Rd, Rd, tmp2);
6241 }
6242
// This instruction reads adjacent 4 bytes from the upper half of source register,
// inflate into a register, for example:
// Rs: A7A6A5A4A3A2A1A0
// Rd: 00A700A600A500A4
//
// Note that Rs is clobbered: it is shifted right by 32 bits in place before
// being handed to inflate_lo32(). tmp1 and tmp2 are scratch.
void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
  assert_different_registers(Rd, Rs, tmp1, tmp2);
  srli(Rs, Rs, 32);   // only upper 32 bits are needed
  inflate_lo32(Rd, Rs, tmp1, tmp2);
}
6252
// The size of the blocks erased by the zero_blocks stub.  We must
// handle anything smaller than this ourselves in zero_words().
// The unit is words: zero_words() compares its HeapWord count against this
// value and requires it to be a power of two.
const int MacroAssembler::zero_words_block_size = 8;
6256
6257 // zero_words() is used by C2 ClearArray patterns. It is as small as
6258 // possible, handling small word counts locally and delegating
6259 // anything larger to the zero_blocks stub. It is expanded many times
6260 // in compiled code, so it is important to keep it short.
6261
6262 // ptr: Address of a buffer to be zeroed.
6263 // cnt: Count in HeapWords.
6264 //
6265 // ptr, cnt, t1, and t0 are clobbered.
6266 address MacroAssembler::zero_words(Register ptr, Register cnt) {
6267 assert(is_power_of_2(zero_words_block_size), "adjust this");
6268 assert(ptr == x28 && cnt == x29, "mismatch in register usage");
6269 assert_different_registers(cnt, t0, t1);
6270
6271 BLOCK_COMMENT("zero_words {");
6272
6273 mv(t0, zero_words_block_size);
6274 Label around, done, done16;
6275 bltu(cnt, t0, around);
6276 {
6277 RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
6278 assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
6279 if (StubRoutines::riscv::complete()) {
6280 address tpc = reloc_call(zero_blocks);
6281 if (tpc == nullptr) {
6282 DEBUG_ONLY(reset_labels(around));
6283 postcond(pc() == badAddress);
6284 return nullptr;
6285 }
6286 } else {
6287 // Clobbers t1
6288 rt_call(zero_blocks.target());
6289 }
6290 }
6291 bind(around);
6292 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
6293 Label l;
6294 test_bit(t0, cnt, exact_log2(i));
6295 beqz(t0, l);
6296 for (int j = 0; j < i; j++) {
6297 sd(zr, Address(ptr, j * wordSize));
6298 }
6299 addi(ptr, ptr, i * wordSize);
6300 bind(l);
6301 }
6302 {
6303 Label l;
6304 test_bit(t0, cnt, 0);
6305 beqz(t0, l);
6306 sd(zr, Address(ptr, 0));
6307 bind(l);
6308 }
6309
6310 BLOCK_COMMENT("} zero_words");
6311 postcond(pc() != badAddress);
6312 return pc();
6313 }
6314
6315 #define SmallArraySize (18 * BytesPerLong)
6316
6317 // base: Address of a buffer to be zeroed, 8 bytes aligned.
6318 // cnt: Immediate count in HeapWords.
6319 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
6320 assert_different_registers(base, t0, t1);
6321
6322 BLOCK_COMMENT("zero_words {");
6323
6324 if (cnt <= SmallArraySize / BytesPerLong) {
6325 for (int i = 0; i < (int)cnt; i++) {
6326 sd(zr, Address(base, i * wordSize));
6327 }
6328 } else {
6329 const int unroll = 8; // Number of sd(zr, adr), instructions we'll unroll
6330 int remainder = cnt % unroll;
6331 for (int i = 0; i < remainder; i++) {
6332 sd(zr, Address(base, i * wordSize));
6333 }
6334
6335 Label loop;
6336 Register cnt_reg = t0;
6337 Register loop_base = t1;
6338 cnt = cnt - remainder;
6339 mv(cnt_reg, cnt);
6340 addi(loop_base, base, remainder * wordSize);
6341 bind(loop);
6342 sub(cnt_reg, cnt_reg, unroll);
6343 for (int i = 0; i < unroll; i++) {
6344 sd(zr, Address(loop_base, i * wordSize));
6345 }
6346 addi(loop_base, loop_base, unroll * wordSize);
6347 bnez(cnt_reg, loop);
6348 }
6349
6350 BLOCK_COMMENT("} zero_words");
6351 }
6352
6353 // base: Address of a buffer to be filled, 8 bytes aligned.
6354 // cnt: Count in 8-byte unit.
6355 // value: Value to be filled with.
6356 // base will point to the end of the buffer after filling.
6357 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
6358 // Algorithm:
6359 //
6360 // t0 = cnt & 7
6361 // cnt -= t0
6362 // p += t0
6363 // switch (t0):
6364 // switch start:
6365 // do while cnt
6366 // cnt -= 8
6367 // p[-8] = value
6368 // case 7:
6369 // p[-7] = value
6370 // case 6:
6371 // p[-6] = value
6372 // // ...
6373 // case 1:
6374 // p[-1] = value
6375 // case 0:
6376 // p += 8
6377 // do-while end
6378 // switch end
6379
6380 assert_different_registers(base, cnt, value, t0, t1);
6381
6382 Label fini, skip, entry, loop;
6383 const int unroll = 8; // Number of sd instructions we'll unroll
6384
6385 beqz(cnt, fini);
6386
6387 andi(t0, cnt, unroll - 1);
6388 sub(cnt, cnt, t0);
6389 shadd(base, t0, base, t1, 3);
6390 la(t1, entry);
6391 slli(t0, t0, 2);
6392 sub(t1, t1, t0);
6393 jr(t1);
6394
6395 bind(loop);
6396 addi(base, base, unroll * wordSize);
6397 {
6398 IncompressibleScope scope(this); // Fixed length
6399 for (int i = -unroll; i < 0; i++) {
6400 sd(value, Address(base, i * 8));
6401 }
6402 }
6403 bind(entry);
6404 subi(cnt, cnt, unroll);
6405 bgez(cnt, loop);
6406
6407 bind(fini);
6408 }
6409
6410 // Zero blocks of memory by using CBO.ZERO.
6411 //
6412 // Aligns the base address first sufficiently for CBO.ZERO, then uses
6413 // CBO.ZERO repeatedly for every full block. cnt is the size to be
6414 // zeroed in HeapWords. Returns the count of words left to be zeroed
6415 // in cnt.
6416 //
6417 // NOTE: This is intended to be used in the zero_blocks() stub. If
6418 // you want to use it elsewhere, note that cnt must be >= zicboz_block_size.
6419 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
6420 int zicboz_block_size = VM_Version::zicboz_block_size.value();
6421 Label initial_table_end, loop;
6422
6423 // Align base with cache line size.
6424 neg(tmp1, base);
6425 andi(tmp1, tmp1, zicboz_block_size - 1);
6426
6427 // tmp1: the number of bytes to be filled to align the base with cache line size.
6428 add(base, base, tmp1);
6429 srai(tmp2, tmp1, 3);
6430 sub(cnt, cnt, tmp2);
6431 srli(tmp2, tmp1, 1);
6432 la(tmp1, initial_table_end);
6433 sub(tmp2, tmp1, tmp2);
6434 jr(tmp2);
6435 for (int i = -zicboz_block_size + wordSize; i < 0; i += wordSize) {
6436 sd(zr, Address(base, i));
6437 }
6438 bind(initial_table_end);
6439
6440 mv(tmp1, zicboz_block_size / wordSize);
6441 bind(loop);
6442 cbo_zero(base);
6443 sub(cnt, cnt, tmp1);
6444 addi(base, base, zicboz_block_size);
6445 bge(cnt, tmp1, loop);
6446 }
6447
// java.lang.Math.round(float a)
// Returns the closest int to the argument, with ties rounding to positive infinity.
//
// dst:  receives the result (0 for NaN input).
// src:  the float argument; it is not modified.
// ftmp: float scratch register.
// t0 is clobbered.
void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
  // This instruction sequence provides a performance improvement on all tested devices;
  // don't change it without re-verification.
  Label done;
  // ftmp = 0.5f, materialized through its bit pattern in t0.
  mv(t0, jint_cast(0.5f));
  fmv_w_x(ftmp, t0);

  // dst = 0 if NaN
  feq_s(t0, src, src); // replacing fclass with feq as performance optimization
  mv(dst, zr);
  beqz(t0, done);

  // dst = (src + 0.5f) rounded down towards negative infinity
  //   Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
  //   RDN is required for fadd_s, RNE gives incorrect results:
  //     --------------------------------------------------------------------
  //     fadd.s rne (src + 0.5f): src = 8388609.000000  ftmp = 8388610.000000
  //     fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
  //     --------------------------------------------------------------------
  //     fadd.s rdn (src + 0.5f): src = 8388609.000000  ftmp = 8388609.000000
  //     fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
  //     --------------------------------------------------------------------
  fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
  fcvt_w_s(dst, ftmp, RoundingMode::rdn);

  bind(done);
}
6477
6478 // java.lang.Math.round(double a)
6479 // Returns the closest long to the argument, with ties rounding to positive infinity.
6480 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
6481 // this instructions calling sequence provides performance improvement on all tested devices;
6482 // don't change it without re-verification
6483 Label done;
6484 mv(t0, julong_cast(0.5));
6485 fmv_d_x(ftmp, t0);
6486
6487 // dst = 0 if NaN
6488 feq_d(t0, src, src); // replacing fclass with feq as performance optimization
6489 mv(dst, zr);
6490 beqz(t0, done);
6491
6492 // dst = (src + 0.5) rounded down towards negative infinity
6493 fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
6494 fcvt_l_d(dst, ftmp, RoundingMode::rdn);
6495
6496 bind(done);
6497 }
6498
6499 // Helper routine processing the slow path of NaN when converting float to float16
6500 void MacroAssembler::float_to_float16_NaN(Register dst, FloatRegister src,
6501 Register tmp1, Register tmp2) {
6502 fmv_x_w(dst, src);
6503
6504 // Float (32 bits)
6505 // Bit: 31 30 to 23 22 to 0
6506 // +---+------------------+-----------------------------+
6507 // | S | Exponent | Mantissa (Fraction) |
6508 // +---+------------------+-----------------------------+
6509 // 1 bit 8 bits 23 bits
6510 //
6511 // Float (16 bits)
6512 // Bit: 15 14 to 10 9 to 0
6513 // +---+----------------+------------------+
6514 // | S | Exponent | Mantissa |
6515 // +---+----------------+------------------+
6516 // 1 bit 5 bits 10 bits
6517 const int fp_sign_bits = 1;
6518 const int fp32_bits = 32;
6519 const int fp32_exponent_bits = 8;
6520 const int fp32_mantissa_1st_part_bits = 10;
6521 const int fp32_mantissa_2nd_part_bits = 9;
6522 const int fp32_mantissa_3rd_part_bits = 4;
6523 const int fp16_exponent_bits = 5;
6524 const int fp16_mantissa_bits = 10;
6525
6526 // preserve the sign bit and exponent, clear mantissa.
6527 srai(tmp2, dst, fp32_bits - fp_sign_bits - fp16_exponent_bits);
6528 slli(tmp2, tmp2, fp16_mantissa_bits);
6529
6530 // Preserve high order bit of float NaN in the
6531 // binary16 result NaN (tenth bit); OR in remaining
6532 // bits into lower 9 bits of binary 16 significand.
6533 // | (doppel & 0x007f_e000) >> 13 // 10 bits
6534 // | (doppel & 0x0000_1ff0) >> 4 // 9 bits
6535 // | (doppel & 0x0000_000f)); // 4 bits
6536 //
6537 // Check j.l.Float.floatToFloat16 for more information.
6538 // 10 bits
6539 int left_shift = fp_sign_bits + fp32_exponent_bits + 32;
6540 int right_shift = left_shift + fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits;
6541 slli(tmp1, dst, left_shift);
6542 srli(tmp1, tmp1, right_shift);
6543 orr(tmp2, tmp2, tmp1);
6544 // 9 bits
6545 left_shift += fp32_mantissa_1st_part_bits;
6546 right_shift = left_shift + fp32_mantissa_3rd_part_bits;
6547 slli(tmp1, dst, left_shift);
6548 srli(tmp1, tmp1, right_shift);
6549 orr(tmp2, tmp2, tmp1);
6550 // 4 bits
6551 andi(tmp1, dst, 0xf);
6552 orr(dst, tmp2, tmp1);
6553 }
6554
6555 #define FCVT_SAFE(FLOATCVT, FLOATSIG) \
6556 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) { \
6557 Label done; \
6558 assert_different_registers(dst, tmp); \
6559 fclass_##FLOATSIG(tmp, src); \
6560 mv(dst, zr); \
6561 /* check if src is NaN */ \
6562 andi(tmp, tmp, FClassBits::nan); \
6563 bnez(tmp, done); \
6564 FLOATCVT(dst, src); \
6565 bind(done); \
6566 }
6567
6568 FCVT_SAFE(fcvt_w_s, s);
6569 FCVT_SAFE(fcvt_l_s, s);
6570 FCVT_SAFE(fcvt_w_d, d);
6571 FCVT_SAFE(fcvt_l_d, d);
6572
6573 #undef FCVT_SAFE
6574
6575 #define FCMP(FLOATTYPE, FLOATSIG) \
6576 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1, \
6577 FloatRegister Rs2, int unordered_result) { \
6578 Label Ldone; \
6579 if (unordered_result < 0) { \
6580 /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */ \
6581 /* installs 1 if gt else 0 */ \
6582 flt_##FLOATSIG(result, Rs2, Rs1); \
6583 /* Rs1 > Rs2, install 1 */ \
6584 bgtz(result, Ldone); \
6585 feq_##FLOATSIG(result, Rs1, Rs2); \
6586 subi(result, result, 1); \
6587 /* Rs1 = Rs2, install 0 */ \
6588 /* NaN or Rs1 < Rs2, install -1 */ \
6589 bind(Ldone); \
6590 } else { \
6591 /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */ \
6592 /* installs 1 if gt or unordered else 0 */ \
6593 flt_##FLOATSIG(result, Rs1, Rs2); \
6594 /* Rs1 < Rs2, install -1 */ \
6595 bgtz(result, Ldone); \
6596 feq_##FLOATSIG(result, Rs1, Rs2); \
6597 subi(result, result, 1); \
6598 /* Rs1 = Rs2, install 0 */ \
6599 /* NaN or Rs1 > Rs2, install 1 */ \
6600 bind(Ldone); \
6601 neg(result, result); \
6602 } \
6603 }
6604
6605 FCMP(float, s);
6606 FCMP(double, d);
6607
6608 #undef FCMP
6609
6610 // Zero words; len is in bytes
6611 // Destroys all registers except addr
6612 // len must be a nonzero multiple of wordSize
6613 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
6614 assert_different_registers(addr, len, tmp, t0, t1);
6615
6616 #ifdef ASSERT
6617 {
6618 Label L;
6619 andi(t0, len, BytesPerWord - 1);
6620 beqz(t0, L);
6621 stop("len is not a multiple of BytesPerWord");
6622 bind(L);
6623 }
6624 #endif // ASSERT
6625
6626 #ifndef PRODUCT
6627 block_comment("zero memory");
6628 #endif // PRODUCT
6629
6630 Label loop;
6631 Label entry;
6632
6633 // Algorithm:
6634 //
6635 // t0 = cnt & 7
6636 // cnt -= t0
6637 // p += t0
6638 // switch (t0) {
6639 // do {
6640 // cnt -= 8
6641 // p[-8] = 0
6642 // case 7:
6643 // p[-7] = 0
6644 // case 6:
6645 // p[-6] = 0
6646 // ...
6647 // case 1:
6648 // p[-1] = 0
6649 // case 0:
6650 // p += 8
6651 // } while (cnt)
6652 // }
6653
6654 const int unroll = 8; // Number of sd(zr) instructions we'll unroll
6655
6656 srli(len, len, LogBytesPerWord);
6657 andi(t0, len, unroll - 1); // t0 = cnt % unroll
6658 sub(len, len, t0); // cnt -= unroll
6659 // tmp always points to the end of the region we're about to zero
6660 shadd(tmp, t0, addr, t1, LogBytesPerWord);
6661 la(t1, entry);
6662 slli(t0, t0, 2);
6663 sub(t1, t1, t0);
6664 jr(t1);
6665
6666 bind(loop);
6667 sub(len, len, unroll);
6668 {
6669 IncompressibleScope scope(this); // Fixed length
6670 for (int i = -unroll; i < 0; i++) {
6671 sd(zr, Address(tmp, i * wordSize));
6672 }
6673 }
6674 bind(entry);
6675 add(tmp, tmp, unroll * wordSize);
6676 bnez(len, loop);
6677 }
6678
6679 // shift left by shamt and add
6680 // Rd = (Rs1 << shamt) + Rs2
6681 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
6682 if (UseZba) {
6683 if (shamt == 1) {
6684 sh1add(Rd, Rs1, Rs2);
6685 return;
6686 } else if (shamt == 2) {
6687 sh2add(Rd, Rs1, Rs2);
6688 return;
6689 } else if (shamt == 3) {
6690 sh3add(Rd, Rs1, Rs2);
6691 return;
6692 }
6693 }
6694
6695 if (shamt != 0) {
6696 assert_different_registers(Rs2, tmp);
6697 slli(tmp, Rs1, shamt);
6698 add(Rd, Rs2, tmp);
6699 } else {
6700 add(Rd, Rs1, Rs2);
6701 }
6702 }
6703
6704 void MacroAssembler::zext(Register dst, Register src, int bits) {
6705 switch (bits) {
6706 case 32:
6707 if (UseZba) {
6708 zext_w(dst, src);
6709 return;
6710 }
6711 break;
6712 case 16:
6713 if (UseZbb) {
6714 zext_h(dst, src);
6715 return;
6716 }
6717 break;
6718 case 8:
6719 zext_b(dst, src);
6720 return;
6721 default:
6722 break;
6723 }
6724
6725 slli(dst, src, XLEN - bits);
6726 srli(dst, dst, XLEN - bits);
6727 }
6728
6729 void MacroAssembler::sext(Register dst, Register src, int bits) {
6730 switch (bits) {
6731 case 32:
6732 sext_w(dst, src);
6733 return;
6734 case 16:
6735 if (UseZbb) {
6736 sext_h(dst, src);
6737 return;
6738 }
6739 break;
6740 case 8:
6741 if (UseZbb) {
6742 sext_b(dst, src);
6743 return;
6744 }
6745 break;
6746 default:
6747 break;
6748 }
6749
6750 slli(dst, src, XLEN - bits);
6751 srai(dst, dst, XLEN - bits);
6752 }
6753
6754 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
6755 Register tmp, bool is_signed) {
6756 if (src1 == src2) {
6757 mv(dst, zr);
6758 return;
6759 }
6760 Label done;
6761 Register left = src1;
6762 Register right = src2;
6763 if (dst == src1) {
6764 assert_different_registers(dst, src2, tmp);
6765 mv(tmp, src1);
6766 left = tmp;
6767 } else if (dst == src2) {
6768 assert_different_registers(dst, src1, tmp);
6769 mv(tmp, src2);
6770 right = tmp;
6771 }
6772
6773 // installs 1 if gt else 0
6774 if (is_signed) {
6775 slt(dst, right, left);
6776 } else {
6777 sltu(dst, right, left);
6778 }
6779 bnez(dst, done);
6780 if (is_signed) {
6781 slt(dst, left, right);
6782 } else {
6783 sltu(dst, left, right);
6784 }
6785 // dst = -1 if lt; else if eq , dst = 0
6786 neg(dst, dst);
6787 bind(done);
6788 }
6789
6790 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
6791 {
6792 cmp_x2i(dst, src1, src2, tmp);
6793 }
6794
6795 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
6796 cmp_x2i(dst, src1, src2, tmp, false);
6797 }
6798
6799 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
6800 cmp_x2i(dst, src1, src2, tmp, false);
6801 }
6802
6803 // The java_calling_convention describes stack locations as ideal slots on
6804 // a frame with no abi restrictions. Since we must observe abi restrictions
6805 // (like the placement of the register window) the slots must be biased by
6806 // the following value.
6807 static int reg2offset_in(VMReg r) {
6808 // Account for saved fp and ra
6809 // This should really be in_preserve_stack_slots
6810 return r->reg2stack() * VMRegImpl::stack_slot_size;
6811 }
6812
6813 static int reg2offset_out(VMReg r) {
6814 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
6815 }
6816
6817 // The C ABI specifies:
6818 // "integer scalars narrower than XLEN bits are widened according to the sign
6819 // of their type up to 32 bits, then sign-extended to XLEN bits."
6820 // Applies for both passed in register and stack.
6821 //
6822 // Java uses 32-bit stack slots; jint, jshort, jchar, jbyte uses one slot.
6823 // Native uses 64-bit stack slots for all integer scalar types.
6824 //
6825 // lw loads the Java stack slot, sign-extends and
6826 // sd store this widened integer into a 64 bit native stack slot.
6827 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
6828 if (src.first()->is_stack()) {
6829 if (dst.first()->is_stack()) {
6830 // stack to stack
6831 lw(tmp, Address(fp, reg2offset_in(src.first())));
6832 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6833 } else {
6834 // stack to reg
6835 lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6836 }
6837 } else if (dst.first()->is_stack()) {
6838 // reg to stack
6839 sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6840 } else {
6841 if (dst.first() != src.first()) {
6842 sext(dst.first()->as_Register(), src.first()->as_Register(), 32);
6843 }
6844 }
6845 }
6846
6847 // An oop arg. Must pass a handle not the oop itself
6848 void MacroAssembler::object_move(OopMap* map,
6849 int oop_handle_offset,
6850 int framesize_in_slots,
6851 VMRegPair src,
6852 VMRegPair dst,
6853 bool is_receiver,
6854 int* receiver_offset) {
6855 assert_cond(map != nullptr && receiver_offset != nullptr);
6856
6857 // must pass a handle. First figure out the location we use as a handle
6858 Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
6859
6860 // See if oop is null if it is we need no handle
6861
6862 if (src.first()->is_stack()) {
6863 // Oop is already on the stack as an argument
6864 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
6865 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
6866 if (is_receiver) {
6867 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
6868 }
6869
6870 ld(t0, Address(fp, reg2offset_in(src.first())));
6871 la(rHandle, Address(fp, reg2offset_in(src.first())));
6872 // conditionally move a null
6873 Label notZero1;
6874 bnez(t0, notZero1);
6875 mv(rHandle, zr);
6876 bind(notZero1);
6877 } else {
6878
6879 // Oop is in a register we must store it to the space we reserve
6880 // on the stack for oop_handles and pass a handle if oop is non-null
6881
6882 const Register rOop = src.first()->as_Register();
6883 int oop_slot = -1;
6884 if (rOop == j_rarg0) {
6885 oop_slot = 0;
6886 } else if (rOop == j_rarg1) {
6887 oop_slot = 1;
6888 } else if (rOop == j_rarg2) {
6889 oop_slot = 2;
6890 } else if (rOop == j_rarg3) {
6891 oop_slot = 3;
6892 } else if (rOop == j_rarg4) {
6893 oop_slot = 4;
6894 } else if (rOop == j_rarg5) {
6895 oop_slot = 5;
6896 } else if (rOop == j_rarg6) {
6897 oop_slot = 6;
6898 } else {
6899 assert(rOop == j_rarg7, "wrong register");
6900 oop_slot = 7;
6901 }
6902
6903 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
6904 int offset = oop_slot * VMRegImpl::stack_slot_size;
6905
6906 map->set_oop(VMRegImpl::stack2reg(oop_slot));
6907 // Store oop in handle area, may be null
6908 sd(rOop, Address(sp, offset));
6909 if (is_receiver) {
6910 *receiver_offset = offset;
6911 }
6912
6913 //rOop maybe the same as rHandle
6914 if (rOop == rHandle) {
6915 Label isZero;
6916 beqz(rOop, isZero);
6917 la(rHandle, Address(sp, offset));
6918 bind(isZero);
6919 } else {
6920 Label notZero2;
6921 la(rHandle, Address(sp, offset));
6922 bnez(rOop, notZero2);
6923 mv(rHandle, zr);
6924 bind(notZero2);
6925 }
6926 }
6927
6928 // If arg is on the stack then place it otherwise it is already in correct reg.
6929 if (dst.first()->is_stack()) {
6930 sd(rHandle, Address(sp, reg2offset_out(dst.first())));
6931 }
6932 }
6933
6934 // A float arg may have to do float reg int reg conversion
6935 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
6936 assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6937 (src.first()->is_reg() && dst.first()->is_reg()) ||
6938 (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6939 if (src.first()->is_stack()) {
6940 if (dst.first()->is_stack()) {
6941 lwu(tmp, Address(fp, reg2offset_in(src.first())));
6942 sw(tmp, Address(sp, reg2offset_out(dst.first())));
6943 } else if (dst.first()->is_Register()) {
6944 lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6945 } else {
6946 ShouldNotReachHere();
6947 }
6948 } else if (src.first() != dst.first()) {
6949 if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6950 fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6951 } else {
6952 ShouldNotReachHere();
6953 }
6954 }
6955 }
6956
6957 // A long move
6958 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
6959 if (src.first()->is_stack()) {
6960 if (dst.first()->is_stack()) {
6961 // stack to stack
6962 ld(tmp, Address(fp, reg2offset_in(src.first())));
6963 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6964 } else {
6965 // stack to reg
6966 ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6967 }
6968 } else if (dst.first()->is_stack()) {
6969 // reg to stack
6970 sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6971 } else {
6972 if (dst.first() != src.first()) {
6973 mv(dst.first()->as_Register(), src.first()->as_Register());
6974 }
6975 }
6976 }
6977
6978 // A double move
6979 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
6980 assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6981 (src.first()->is_reg() && dst.first()->is_reg()) ||
6982 (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6983 if (src.first()->is_stack()) {
6984 if (dst.first()->is_stack()) {
6985 ld(tmp, Address(fp, reg2offset_in(src.first())));
6986 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6987 } else if (dst.first()-> is_Register()) {
6988 ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6989 } else {
6990 ShouldNotReachHere();
6991 }
6992 } else if (src.first() != dst.first()) {
6993 if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6994 fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6995 } else {
6996 ShouldNotReachHere();
6997 }
6998 }
6999 }
7000
7001 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
7002 assert(bit_pos < 64, "invalid bit range");
7003 if (UseZbs) {
7004 bexti(Rd, Rs, bit_pos);
7005 return;
7006 }
7007 int64_t imm = (int64_t)(1UL << bit_pos);
7008 if (is_simm12(imm)) {
7009 andi(Rd, Rs, imm);
7010 } else {
7011 srli(Rd, Rs, bit_pos);
7012 andi(Rd, Rd, 1);
7013 }
7014 }
7015
7016 // Implements fast-locking.
7017 //
7018 // - obj: the object to be locked
7019 // - tmp1, tmp2, tmp3: temporary registers, will be destroyed
7020 // - slow: branched to if locking fails
7021 void MacroAssembler::fast_lock(Register basic_lock, Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
7022 assert_different_registers(basic_lock, obj, tmp1, tmp2, tmp3, t0);
7023
7024 Label push;
7025 const Register top = tmp1;
7026 const Register mark = tmp2;
7027 const Register t = tmp3;
7028
7029 // Preload the markWord. It is important that this is the first
7030 // instruction emitted as it is part of C1's null check semantics.
7031 ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
7032
7033 if (UseObjectMonitorTable) {
7034 // Clear cache in case fast locking succeeds or we need to take the slow-path.
7035 sd(zr, Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))));
7036 }
7037
7038 if (DiagnoseSyncOnValueBasedClasses != 0) {
7039 load_klass(tmp1, obj);
7040 lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
7041 test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
7042 bnez(tmp1, slow, /* is_far */ true);
7043 }
7044
7045 // Check if the lock-stack is full.
7046 lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
7047 mv(t, (unsigned)LockStack::end_offset());
7048 bge(top, t, slow, /* is_far */ true);
7049
7050 // Check for recursion.
7051 add(t, xthread, top);
7052 ld(t, Address(t, -oopSize));
7053 beq(obj, t, push);
7054
7055 // Check header for monitor (0b10).
7056 test_bit(t, mark, exact_log2(markWord::monitor_value));
7057 bnez(t, slow, /* is_far */ true);
7058
7059 // Try to lock. Transition lock-bits 0b01 => 0b00
7060 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
7061 ori(mark, mark, markWord::unlocked_value);
7062 // Mask inline_type bit such that we go to the slow path if object is an inline type
7063 andi(mark, mark, ~((int) markWord::inline_type_bit_in_place));
7064 xori(t, mark, markWord::unlocked_value);
7065 cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
7066 /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
7067 bne(mark, t, slow, /* is_far */ true);
7068
7069 bind(push);
7070 // After successful lock, push object on lock-stack.
7071 add(t, xthread, top);
7072 sd(obj, Address(t));
7073 addiw(top, top, oopSize);
7074 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
7075 }
7076
7077 // Implements ligthweight-unlocking.
7078 //
7079 // - obj: the object to be unlocked
7080 // - tmp1, tmp2, tmp3: temporary registers
7081 // - slow: branched to if unlocking fails
7082 void MacroAssembler::fast_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
7083 assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
7084
7085 #ifdef ASSERT
7086 {
7087 // Check for lock-stack underflow.
7088 Label stack_ok;
7089 lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
7090 mv(tmp2, (unsigned)LockStack::start_offset());
7091 bge(tmp1, tmp2, stack_ok);
7092 STOP("Lock-stack underflow");
7093 bind(stack_ok);
7094 }
7095 #endif
7096
7097 Label unlocked, push_and_slow;
7098 const Register top = tmp1;
7099 const Register mark = tmp2;
7100 const Register t = tmp3;
7101
7102 // Check if obj is top of lock-stack.
7103 lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
7104 subiw(top, top, oopSize);
7105 add(t, xthread, top);
7106 ld(t, Address(t));
7107 bne(obj, t, slow, /* is_far */ true);
7108
7109 // Pop lock-stack.
7110 DEBUG_ONLY(add(t, xthread, top);)
7111 DEBUG_ONLY(sd(zr, Address(t));)
7112 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
7113
7114 // Check if recursive.
7115 add(t, xthread, top);
7116 ld(t, Address(t, -oopSize));
7117 beq(obj, t, unlocked);
7118
7119 // Not recursive. Check header for monitor (0b10).
7120 ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
7121 test_bit(t, mark, exact_log2(markWord::monitor_value));
7122 bnez(t, push_and_slow);
7123
7124 #ifdef ASSERT
7125 // Check header not unlocked (0b01).
7126 Label not_unlocked;
7127 test_bit(t, mark, exact_log2(markWord::unlocked_value));
7128 beqz(t, not_unlocked);
7129 stop("fast_unlock already unlocked");
7130 bind(not_unlocked);
7131 #endif
7132
7133 // Try to unlock. Transition lock bits 0b00 => 0b01
7134 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
7135 ori(t, mark, markWord::unlocked_value);
7136 cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
7137 /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
7138 beq(mark, t, unlocked);
7139
7140 bind(push_and_slow);
7141 // Restore lock-stack and handle the unlock in runtime.
7142 DEBUG_ONLY(add(t, xthread, top);)
7143 DEBUG_ONLY(sd(obj, Address(t));)
7144 addiw(top, top, oopSize);
7145 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
7146 j(slow);
7147
7148 bind(unlocked);
7149 }