1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
4 * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 *
7 * This code is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 only, as
9 * published by the Free Software Foundation.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 *
25 */
26
27 #include "asm/assembler.hpp"
28 #include "asm/assembler.inline.hpp"
29 #include "code/compiledIC.hpp"
30 #include "compiler/disassembler.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/cardTable.hpp"
34 #include "gc/shared/cardTableBarrierSet.hpp"
35 #include "gc/shared/collectedHeap.hpp"
36 #include "interpreter/bytecodeHistogram.hpp"
37 #include "interpreter/interpreter.hpp"
38 #include "interpreter/interpreterRuntime.hpp"
39 #include "memory/resourceArea.hpp"
40 #include "memory/universe.hpp"
41 #include "oops/accessDecorators.hpp"
42 #include "oops/compressedKlass.inline.hpp"
43 #include "oops/compressedOops.inline.hpp"
44 #include "oops/klass.inline.hpp"
45 #include "oops/oop.hpp"
46 #include "runtime/interfaceSupport.inline.hpp"
47 #include "runtime/javaThread.hpp"
48 #include "runtime/jniHandles.inline.hpp"
49 #include "runtime/sharedRuntime.hpp"
50 #include "runtime/stubRoutines.hpp"
51 #include "utilities/globalDefinitions.hpp"
52 #include "utilities/integerCast.hpp"
53 #include "utilities/powerOfTwo.hpp"
54 #ifdef COMPILER2
55 #include "opto/compile.hpp"
56 #include "opto/node.hpp"
57 #include "opto/output.hpp"
58 #endif
59
60 #ifdef PRODUCT
61 #define BLOCK_COMMENT(str) /* nothing */
62 #else
63 #define BLOCK_COMMENT(str) block_comment(str)
64 #endif
65 #define STOP(str) stop(str);
66 #define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
67
68
69
70 Register MacroAssembler::extract_rs1(address instr) {
71 assert_cond(instr != nullptr);
72 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
73 }
74
75 Register MacroAssembler::extract_rs2(address instr) {
76 assert_cond(instr != nullptr);
77 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
78 }
79
80 Register MacroAssembler::extract_rd(address instr) {
81 assert_cond(instr != nullptr);
82 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
83 }
84
85 uint32_t MacroAssembler::extract_opcode(address instr) {
86 assert_cond(instr != nullptr);
87 return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
88 }
89
90 uint32_t MacroAssembler::extract_funct3(address instr) {
91 assert_cond(instr != nullptr);
92 return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
93 }
94
95 bool MacroAssembler::is_pc_relative_at(address instr) {
96 // auipc + jalr
97 // auipc + addi
98 // auipc + load
99 // auipc + fload_load
100 return (is_auipc_at(instr)) &&
101 (is_addi_at(instr + MacroAssembler::instruction_size) ||
102 is_jalr_at(instr + MacroAssembler::instruction_size) ||
103 is_load_at(instr + MacroAssembler::instruction_size) ||
104 is_float_load_at(instr + MacroAssembler::instruction_size)) &&
105 check_pc_relative_data_dependency(instr);
106 }
107
108 // ie:ld(Rd, Label)
109 bool MacroAssembler::is_load_pc_relative_at(address instr) {
110 return is_auipc_at(instr) && // auipc
111 is_ld_at(instr + MacroAssembler::instruction_size) && // ld
112 check_load_pc_relative_data_dependency(instr);
113 }
114
115 bool MacroAssembler::is_movptr1_at(address instr) {
116 return is_lui_at(instr) && // Lui
117 is_addi_at(instr + MacroAssembler::instruction_size) && // Addi
118 is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 11) && // Slli Rd, Rs, 11
119 is_addi_at(instr + MacroAssembler::instruction_size * 3) && // Addi
120 is_slli_shift_at(instr + MacroAssembler::instruction_size * 4, 6) && // Slli Rd, Rs, 6
121 (is_addi_at(instr + MacroAssembler::instruction_size * 5) ||
122 is_jalr_at(instr + MacroAssembler::instruction_size * 5) ||
123 is_load_at(instr + MacroAssembler::instruction_size * 5)) && // Addi/Jalr/Load
124 check_movptr1_data_dependency(instr);
125 }
126
127 bool MacroAssembler::is_movptr2_at(address instr) {
128 return is_lui_at(instr) && // lui
129 is_lui_at(instr + MacroAssembler::instruction_size) && // lui
130 is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 18) && // slli Rd, Rs, 18
131 is_add_at(instr + MacroAssembler::instruction_size * 3) &&
132 (is_addi_at(instr + MacroAssembler::instruction_size * 4) ||
133 is_jalr_at(instr + MacroAssembler::instruction_size * 4) ||
134 is_load_at(instr + MacroAssembler::instruction_size * 4)) && // Addi/Jalr/Load
135 check_movptr2_data_dependency(instr);
136 }
137
138 bool MacroAssembler::is_li16u_at(address instr) {
139 return is_lui_at(instr) && // lui
140 is_srli_at(instr + MacroAssembler::instruction_size) && // srli
141 check_li16u_data_dependency(instr);
142 }
143
144 bool MacroAssembler::is_li32_at(address instr) {
145 return is_lui_at(instr) && // lui
146 is_addiw_at(instr + MacroAssembler::instruction_size) && // addiw
147 check_li32_data_dependency(instr);
148 }
149
150 bool MacroAssembler::is_lwu_to_zr(address instr) {
151 assert_cond(instr != nullptr);
152 return (extract_opcode(instr) == 0b0000011 &&
153 extract_funct3(instr) == 0b110 &&
154 extract_rd(instr) == zr); // zr
155 }
156
157 uint32_t MacroAssembler::get_membar_kind(address addr) {
158 assert_cond(addr != nullptr);
159 assert(is_membar(addr), "no membar found");
160
161 uint32_t insn = Bytes::get_native_u4(addr);
162
163 uint32_t predecessor = Assembler::extract(insn, 27, 24);
164 uint32_t successor = Assembler::extract(insn, 23, 20);
165
166 return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
167 }
168
169 void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
170 assert_cond(addr != nullptr);
171 assert(is_membar(addr), "no membar found");
172
173 uint32_t predecessor = 0;
174 uint32_t successor = 0;
175
176 MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);
177
178 uint32_t insn = Bytes::get_native_u4(addr);
179 address pInsn = (address) &insn;
180 Assembler::patch(pInsn, 27, 24, predecessor);
181 Assembler::patch(pInsn, 23, 20, successor);
182
183 address membar = addr;
184 Assembler::sd_instr(membar, insn);
185 }
186
187 static void pass_arg0(MacroAssembler* masm, Register arg) {
188 if (c_rarg0 != arg) {
189 masm->mv(c_rarg0, arg);
190 }
191 }
192
193 static void pass_arg1(MacroAssembler* masm, Register arg) {
194 if (c_rarg1 != arg) {
195 masm->mv(c_rarg1, arg);
196 }
197 }
198
199 static void pass_arg2(MacroAssembler* masm, Register arg) {
200 if (c_rarg2 != arg) {
201 masm->mv(c_rarg2, arg);
202 }
203 }
204
205 static void pass_arg3(MacroAssembler* masm, Register arg) {
206 if (c_rarg3 != arg) {
207 masm->mv(c_rarg3, arg);
208 }
209 }
210
211 void MacroAssembler::push_cont_fastpath(Register java_thread) {
212 if (!Continuations::enabled()) return;
213 Label done;
214 ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
215 bleu(sp, t0, done);
216 sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
217 bind(done);
218 }
219
220 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
221 if (!Continuations::enabled()) return;
222 Label done;
223 ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
224 bltu(sp, t0, done);
225 sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
226 bind(done);
227 }
228
229 int MacroAssembler::align(int modulus, int extra_offset) {
230 CompressibleScope scope(this);
231 intptr_t before = offset();
232 while ((offset() + extra_offset) % modulus != 0) { nop(); }
233 return (int)(offset() - before);
234 }
235
236 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
237 call_VM_base(oop_result, noreg, noreg, nullptr, entry_point, number_of_arguments, check_exceptions);
238 }
239
240 // Implementation of call_VM versions
241
242 void MacroAssembler::call_VM(Register oop_result,
243 address entry_point,
244 bool check_exceptions) {
245 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
246 }
247
248 void MacroAssembler::call_VM(Register oop_result,
249 address entry_point,
250 Register arg_1,
251 bool check_exceptions) {
252 pass_arg1(this, arg_1);
253 call_VM_helper(oop_result, entry_point, 1, check_exceptions);
254 }
255
256 void MacroAssembler::call_VM(Register oop_result,
257 address entry_point,
258 Register arg_1,
259 Register arg_2,
260 bool check_exceptions) {
261 assert_different_registers(arg_1, c_rarg2);
262 pass_arg2(this, arg_2);
263 pass_arg1(this, arg_1);
264 call_VM_helper(oop_result, entry_point, 2, check_exceptions);
265 }
266
267 void MacroAssembler::call_VM(Register oop_result,
268 address entry_point,
269 Register arg_1,
270 Register arg_2,
271 Register arg_3,
272 bool check_exceptions) {
273 assert_different_registers(arg_1, c_rarg2, c_rarg3);
274 assert_different_registers(arg_2, c_rarg3);
275 pass_arg3(this, arg_3);
276
277 pass_arg2(this, arg_2);
278
279 pass_arg1(this, arg_1);
280 call_VM_helper(oop_result, entry_point, 3, check_exceptions);
281 }
282
283 void MacroAssembler::call_VM(Register oop_result,
284 Register last_java_sp,
285 address entry_point,
286 int number_of_arguments,
287 bool check_exceptions) {
288 call_VM_base(oop_result, xthread, last_java_sp, nullptr, entry_point, number_of_arguments, check_exceptions);
289 }
290
291 void MacroAssembler::call_VM(Register oop_result,
292 Register last_java_sp,
293 address entry_point,
294 Register arg_1,
295 bool check_exceptions) {
296 pass_arg1(this, arg_1);
297 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
298 }
299
300 void MacroAssembler::call_VM(Register oop_result,
301 Register last_java_sp,
302 address entry_point,
303 Register arg_1,
304 Register arg_2,
305 bool check_exceptions) {
306
307 assert_different_registers(arg_1, c_rarg2);
308 pass_arg2(this, arg_2);
309 pass_arg1(this, arg_1);
310 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
311 }
312
313 void MacroAssembler::call_VM(Register oop_result,
314 Register last_java_sp,
315 address entry_point,
316 Register arg_1,
317 Register arg_2,
318 Register arg_3,
319 bool check_exceptions) {
320 assert_different_registers(arg_1, c_rarg2, c_rarg3);
321 assert_different_registers(arg_2, c_rarg3);
322 pass_arg3(this, arg_3);
323 pass_arg2(this, arg_2);
324 pass_arg1(this, arg_1);
325 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
326 }
327
328 void MacroAssembler::post_call_nop() {
329 assert(!in_compressible_scope(), "Must be");
330 assert_alignment(pc());
331 if (!Continuations::enabled()) {
332 return;
333 }
334 relocate(post_call_nop_Relocation::spec());
335 InlineSkippedInstructionsCounter skipCounter(this);
336 nop();
337 li32(zr, 0);
338 }
339
340 // these are no-ops overridden by InterpreterMacroAssembler
341 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
342 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
343
344 // Calls to C land
345 //
346 // When entering C land, the fp, & esp of the last Java frame have to be recorded
347 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
348 // has to be reset to 0. This is required to allow proper stack traversal.
349 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
350 Register last_java_fp,
351 Register last_java_pc) {
352
353 if (last_java_pc->is_valid()) {
354 sd(last_java_pc, Address(xthread,
355 JavaThread::frame_anchor_offset() +
356 JavaFrameAnchor::last_Java_pc_offset()));
357 }
358
359 // determine last_java_sp register
360 if (!last_java_sp->is_valid()) {
361 last_java_sp = esp;
362 }
363
364 // last_java_fp is optional
365 if (last_java_fp->is_valid()) {
366 sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
367 }
368
369 // We must set sp last.
370 sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
371
372 }
373
374 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
375 Register last_java_fp,
376 address last_java_pc,
377 Register tmp) {
378 assert(last_java_pc != nullptr, "must provide a valid PC");
379
380 la(tmp, last_java_pc);
381 sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
382
383 set_last_Java_frame(last_java_sp, last_java_fp, noreg);
384 }
385
386 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
387 Register last_java_fp,
388 Label &L,
389 Register tmp) {
390 if (L.is_bound()) {
391 set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
392 } else {
393 L.add_patch_at(code(), locator());
394 IncompressibleScope scope(this); // the label address will be patched back.
395 set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
396 }
397 }
398
399 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
400 // we must set sp to zero to clear frame
401 sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
402
403 // must clear fp, so that compiled frames are not confused; it is
404 // possible that we need it only for debugging
405 if (clear_fp) {
406 sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
407 }
408
409 // Always clear the pc because it could have been set by make_walkable()
410 sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
411 }
412
413 void MacroAssembler::call_VM_base(Register oop_result,
414 Register java_thread,
415 Register last_java_sp,
416 Label* return_pc,
417 address entry_point,
418 int number_of_arguments,
419 bool check_exceptions) {
420 // determine java_thread register
421 if (!java_thread->is_valid()) {
422 java_thread = xthread;
423 }
424
425 // determine last_java_sp register
426 if (!last_java_sp->is_valid()) {
427 last_java_sp = esp;
428 }
429
430 // debugging support
431 assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
432 assert(java_thread == xthread, "unexpected register");
433
434 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
435 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
436
437 // push java thread (becomes first argument of C function)
438 mv(c_rarg0, java_thread);
439
440 // set last Java frame before call
441 assert(last_java_sp != fp, "can't use fp");
442
443 Label l;
444 set_last_Java_frame(last_java_sp, fp, return_pc != nullptr ? *return_pc : l, t0);
445
446 // do the call, remove parameters
447 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
448
449 // reset last Java frame
450 // Only interpreter should have to clear fp
451 reset_last_Java_frame(true);
452
453 // C++ interp handles this in the interpreter
454 check_and_handle_popframe(java_thread);
455 check_and_handle_earlyret(java_thread);
456
457 if (check_exceptions) {
458 // check for pending exceptions (java_thread is set upon return)
459 ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
460 Label ok;
461 beqz(t0, ok);
462 j(RuntimeAddress(StubRoutines::forward_exception_entry()));
463 bind(ok);
464 }
465
466 // get oop result if there is one and reset the value in the thread
467 if (oop_result->is_valid()) {
468 get_vm_result_oop(oop_result, java_thread);
469 }
470 }
471
472 void MacroAssembler::get_vm_result_oop(Register oop_result, Register java_thread) {
473 ld(oop_result, Address(java_thread, JavaThread::vm_result_oop_offset()));
474 sd(zr, Address(java_thread, JavaThread::vm_result_oop_offset()));
475 verify_oop_msg(oop_result, "broken oop in call_VM_base");
476 }
477
478 void MacroAssembler::get_vm_result_metadata(Register metadata_result, Register java_thread) {
479 ld(metadata_result, Address(java_thread, JavaThread::vm_result_metadata_offset()));
480 sd(zr, Address(java_thread, JavaThread::vm_result_metadata_offset()));
481 }
482
483 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
484 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
485 assert_different_registers(klass, xthread, tmp);
486
487 Label L_fallthrough, L_tmp;
488 if (L_fast_path == nullptr) {
489 L_fast_path = &L_fallthrough;
490 } else if (L_slow_path == nullptr) {
491 L_slow_path = &L_fallthrough;
492 }
493
494 // Fast path check: class is fully initialized
495 lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
496 membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
497 sub(tmp, tmp, InstanceKlass::fully_initialized);
498 beqz(tmp, *L_fast_path);
499
500 // Fast path check: current thread is initializer thread
501 ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
502
503 if (L_slow_path == &L_fallthrough) {
504 beq(xthread, tmp, *L_fast_path);
505 bind(*L_slow_path);
506 } else if (L_fast_path == &L_fallthrough) {
507 bne(xthread, tmp, *L_slow_path);
508 bind(*L_fast_path);
509 } else {
510 Unimplemented();
511 }
512 }
513
514 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
515 if (!VerifyOops) { return; }
516
517 // Pass register number to verify_oop_subroutine
518 const char* b = nullptr;
519 {
520 ResourceMark rm;
521 stringStream ss;
522 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
523 b = code_string(ss.as_string());
524 }
525 BLOCK_COMMENT("verify_oop {");
526
527 push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
528
529 mv(c_rarg0, reg); // c_rarg0 : x10
530 {
531 // The length of the instruction sequence emitted should not depend
532 // on the address of the char buffer so that the size of mach nodes for
533 // scratch emit and normal emit matches.
534 IncompressibleScope scope(this); // Fixed length
535 movptr(t0, (address) b);
536 }
537
538 // Call indirectly to solve generation ordering problem
539 ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
540 jalr(t1);
541
542 pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
543
544 BLOCK_COMMENT("} verify_oop");
545 }
546
547 // Handle the receiver type profile update given the "recv" klass.
548 //
549 // Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
550 // If there are no matching or claimable receiver entries in RD, updates
551 // the polymorphic counter.
552 //
553 // This code expected to run by either the interpreter or JIT-ed code, without
554 // extra synchronization. For safety, receiver cells are claimed atomically, which
555 // avoids grossly misrepresenting the profiles under concurrent updates. For speed,
556 // counter updates are not atomic.
557 //
558 void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) {
559 assert_different_registers(recv, mdp, t0, t1);
560
561 int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0));
562 int end_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit()));
563 int poly_count_offset = in_bytes(CounterData::count_offset());
564 int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
565 int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;
566
567 // Adjust for MDP offsets. Slots are pointer-sized, so is the global offset.
568 base_receiver_offset += mdp_offset;
569 end_receiver_offset += mdp_offset;
570 poly_count_offset += mdp_offset;
571
572 #ifdef ASSERT
573 // We are about to walk the MDO slots without asking for offsets.
574 // Check that our math hits all the right spots.
575 for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
576 int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
577 int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
578 int offset = base_receiver_offset + receiver_step*c;
579 int count_offset = offset + receiver_to_count_step;
580 assert(offset == real_recv_offset, "receiver slot math");
581 assert(count_offset == real_count_offset, "receiver count math");
582 }
583 int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
584 assert(poly_count_offset == real_poly_count_offset, "poly counter math");
585 #endif
586
587 // Corner case: no profile table. Increment poly counter and exit.
588 if (ReceiverTypeData::row_limit() == 0) {
589 increment(Address(mdp, poly_count_offset), DataLayout::counter_increment);
590 return;
591 }
592
593 Register offset = t1;
594
595 Label L_loop_search_receiver, L_loop_search_empty;
596 Label L_restart, L_found_recv, L_found_empty, L_polymorphic, L_count_update;
597
598 // The code here recognizes three major cases:
599 // A. Fastest: receiver found in the table
600 // B. Fast: no receiver in the table, and the table is full
601 // C. Slow: no receiver in the table, free slots in the table
602 //
603 // The case A performance is most important, as perfectly-behaved code would end up
604 // there, especially with larger TypeProfileWidth. The case B performance is
605 // important as well, this is where bulk of code would land for normally megamorphic
606 // cases. The case C performance is not essential, its job is to deal with installation
607 // races, we optimize for code density instead. Case C needs to make sure that receiver
608 // rows are only claimed once. This makes sure we never overwrite a row for another
609 // receiver and never duplicate the receivers in the list, making profile type-accurate.
610 //
611 // It is very tempting to handle these cases in a single loop, and claim the first slot
612 // without checking the rest of the table. But, profiling code should tolerate free slots
613 // in the table, as class unloading can clear them. After such cleanup, the receiver
614 // we need might be _after_ the free slot. Therefore, we need to let at least full scan
615 // to complete, before trying to install new slots. Splitting the code in several tight
616 // loops also helpfully optimizes for cases A and B.
617 //
618 // This code is effectively:
619 //
620 // restart:
621 // // Fastest: receiver is already installed
622 // for (i = 0; i < receiver_count(); i++) {
623 // if (receiver(i) == recv) goto found_recv(i);
624 // }
625 //
626 // // Fast: no receiver, but profile is full
627 // for (i = 0; i < receiver_count(); i++) {
628 // if (receiver(i) == null) goto found_null(i);
629 // }
630 // goto polymorphic
631 //
632 // // Slow: try to install receiver
633 // found_null(i):
634 // CAS(&receiver(i), null, recv);
635 // goto restart
636 //
637 // polymorphic:
638 // count++;
639 // return
640 //
641 // found_recv(i):
642 // *receiver_count(i)++
643 //
644
645 bind(L_restart);
646
647 // Fastest: receiver is already installed
648 mv(offset, base_receiver_offset);
649 bind(L_loop_search_receiver);
650 add(t0, mdp, offset);
651 ld(t0, Address(t0));
652 beq(recv, t0, L_found_recv);
653 add(offset, offset, receiver_step);
654 sub(t0, offset, end_receiver_offset);
655 bnez(t0, L_loop_search_receiver);
656
657 // Fast: no receiver, but profile is full
658 mv(offset, base_receiver_offset);
659 bind(L_loop_search_empty);
660 add(t0, mdp, offset);
661 ld(t0, Address(t0));
662 beqz(t0, L_found_empty);
663 add(offset, offset, receiver_step);
664 sub(t0, offset, end_receiver_offset);
665 bnez(t0, L_loop_search_empty);
666 j(L_polymorphic);
667
668 // Slow: try to install receiver
669 bind(L_found_empty);
670
671 // Atomically swing receiver slot: null -> recv.
672 //
673 // The update uses CAS, which clobbers t0. Therefore, t1
674 // is used to hold the destination address. This is safe because the
675 // offset is no longer needed after the address is computed.
676 add(t1, mdp, offset);
677 weak_cmpxchg(/*addr*/ t1, /*expected*/ zr, /*new*/ recv, Assembler::int64,
678 /*acquire*/ Assembler::relaxed, /*release*/ Assembler::relaxed, /*result*/ t0);
679
680 // CAS success means the slot now has the receiver we want. CAS failure means
681 // something had claimed the slot concurrently: it can be the same receiver we want,
682 // or something else. Since this is a slow path, we can optimize for code density,
683 // and just restart the search from the beginning.
684 j(L_restart);
685
686 // Counter updates:
687 // Increment polymorphic counter instead of receiver slot.
688 bind(L_polymorphic);
689 mv(offset, poly_count_offset);
690 j(L_count_update);
691
692 // Found a receiver, convert its slot offset to corresponding count offset.
693 bind(L_found_recv);
694 add(offset, offset, receiver_to_count_step);
695
696 bind(L_count_update);
697 add(t1, mdp, offset);
698 increment(Address(t1), DataLayout::counter_increment);
699 }
700
701 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
702 if (!VerifyOops) {
703 return;
704 }
705
706 const char* b = nullptr;
707 {
708 ResourceMark rm;
709 stringStream ss;
710 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
711 b = code_string(ss.as_string());
712 }
713 BLOCK_COMMENT("verify_oop_addr {");
714
715 push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
716
717 if (addr.uses(sp)) {
718 la(x10, addr);
719 ld(x10, Address(x10, 4 * wordSize));
720 } else {
721 ld(x10, addr);
722 }
723
724 {
725 // The length of the instruction sequence emitted should not depend
726 // on the address of the char buffer so that the size of mach nodes for
727 // scratch emit and normal emit matches.
728 IncompressibleScope scope(this); // Fixed length
729 movptr(t0, (address) b);
730 }
731
732 // Call indirectly to solve generation ordering problem
733 ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
734 jalr(t1);
735
736 pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
737
738 BLOCK_COMMENT("} verify_oop_addr");
739 }
740
741 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
742 int extra_slot_offset) {
743 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
744 int stackElementSize = Interpreter::stackElementSize;
745 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
746 #ifdef ASSERT
747 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
748 assert(offset1 - offset == stackElementSize, "correct arithmetic");
749 #endif
750 if (arg_slot.is_constant()) {
751 return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
752 } else {
753 assert_different_registers(t0, arg_slot.as_register());
754 shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
755 return Address(t0, offset);
756 }
757 }
758
759 #ifndef PRODUCT
760 extern "C" void findpc(intptr_t x);
761 #endif
762
763 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
764 {
765 // In order to get locks to work, we need to fake a in_VM state
766 if (ShowMessageBoxOnError) {
767 JavaThread* thread = JavaThread::current();
768 JavaThreadState saved_state = thread->thread_state();
769 thread->set_thread_state(_thread_in_vm);
770 #ifndef PRODUCT
771 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
772 ttyLocker ttyl;
773 BytecodeCounter::print();
774 }
775 #endif
776 if (os::message_box(msg, "Execution stopped, print registers?")) {
777 ttyLocker ttyl;
778 tty->print_cr(" pc = 0x%016lx", pc);
779 #ifndef PRODUCT
780 tty->cr();
781 findpc(pc);
782 tty->cr();
783 #endif
784 tty->print_cr(" x0 = 0x%016lx", regs[0]);
785 tty->print_cr(" x1 = 0x%016lx", regs[1]);
786 tty->print_cr(" x2 = 0x%016lx", regs[2]);
787 tty->print_cr(" x3 = 0x%016lx", regs[3]);
788 tty->print_cr(" x4 = 0x%016lx", regs[4]);
789 tty->print_cr(" x5 = 0x%016lx", regs[5]);
790 tty->print_cr(" x6 = 0x%016lx", regs[6]);
791 tty->print_cr(" x7 = 0x%016lx", regs[7]);
792 tty->print_cr(" x8 = 0x%016lx", regs[8]);
793 tty->print_cr(" x9 = 0x%016lx", regs[9]);
794 tty->print_cr("x10 = 0x%016lx", regs[10]);
795 tty->print_cr("x11 = 0x%016lx", regs[11]);
796 tty->print_cr("x12 = 0x%016lx", regs[12]);
797 tty->print_cr("x13 = 0x%016lx", regs[13]);
798 tty->print_cr("x14 = 0x%016lx", regs[14]);
799 tty->print_cr("x15 = 0x%016lx", regs[15]);
800 tty->print_cr("x16 = 0x%016lx", regs[16]);
801 tty->print_cr("x17 = 0x%016lx", regs[17]);
802 tty->print_cr("x18 = 0x%016lx", regs[18]);
803 tty->print_cr("x19 = 0x%016lx", regs[19]);
804 tty->print_cr("x20 = 0x%016lx", regs[20]);
805 tty->print_cr("x21 = 0x%016lx", regs[21]);
806 tty->print_cr("x22 = 0x%016lx", regs[22]);
807 tty->print_cr("x23 = 0x%016lx", regs[23]);
808 tty->print_cr("x24 = 0x%016lx", regs[24]);
809 tty->print_cr("x25 = 0x%016lx", regs[25]);
810 tty->print_cr("x26 = 0x%016lx", regs[26]);
811 tty->print_cr("x27 = 0x%016lx", regs[27]);
812 tty->print_cr("x28 = 0x%016lx", regs[28]);
813 tty->print_cr("x30 = 0x%016lx", regs[30]);
814 tty->print_cr("x31 = 0x%016lx", regs[31]);
815 BREAKPOINT;
816 }
817 }
818 fatal("DEBUG MESSAGE: %s", msg);
819 }
820
821 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
822 assert_different_registers(value, tmp1, tmp2);
823 Label done, tagged, weak_tagged;
824
825 beqz(value, done); // Use null as-is.
826 // Test for tag.
827 andi(tmp1, value, JNIHandles::tag_mask);
828 bnez(tmp1, tagged);
829
830 // Resolve local handle
831 access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
832 verify_oop(value);
833 j(done);
834
835 bind(tagged);
836 // Test for jweak tag.
837 STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
838 test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
839 bnez(tmp1, weak_tagged);
840
841 // Resolve global handle
842 access_load_at(T_OBJECT, IN_NATIVE, value,
843 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
844 verify_oop(value);
845 j(done);
846
847 bind(weak_tagged);
848 // Resolve jweak.
849 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
850 Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
851 verify_oop(value);
852
853 bind(done);
854 }
855
856 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
857 assert_different_registers(value, tmp1, tmp2);
858 Label done;
859
860 beqz(value, done); // Use null as-is.
861
862 #ifdef ASSERT
863 {
864 STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
865 Label valid_global_tag;
866 test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
867 bnez(tmp1, valid_global_tag);
868 stop("non global jobject using resolve_global_jobject");
869 bind(valid_global_tag);
870 }
871 #endif
872
873 // Resolve global handle
874 access_load_at(T_OBJECT, IN_NATIVE, value,
875 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
876 verify_oop(value);
877
878 bind(done);
879 }
880
881 void MacroAssembler::stop(const char* msg) {
882 BLOCK_COMMENT(msg);
883 illegal_instruction(Assembler::csr::time);
884 emit_int64((uintptr_t)msg);
885 }
886
887 void MacroAssembler::unimplemented(const char* what) {
888 const char* buf = nullptr;
889 {
890 ResourceMark rm;
891 stringStream ss;
892 ss.print("unimplemented: %s", what);
893 buf = code_string(ss.as_string());
894 }
895 stop(buf);
896 }
897
898 void MacroAssembler::emit_static_call_stub() {
899 IncompressibleScope scope(this); // Fixed length: see CompiledDirectCall::to_interp_stub_size().
900 // CompiledDirectCall::set_to_interpreted knows the
901 // exact layout of this stub.
902
903 mov_metadata(xmethod, (Metadata*)nullptr);
904
905 // Jump to the entry point of the c2i stub.
906 int32_t offset = 0;
907 movptr2(t1, 0, offset, t0); // lui + lui + slli + add
908 jr(t1, offset);
909 }
910
911 void MacroAssembler::call_VM_leaf_base(address entry_point,
912 int number_of_arguments,
913 Label *retaddr) {
914 int32_t offset = 0;
915 push_reg(RegSet::of(t1, xmethod), sp); // push << t1 & xmethod >> to sp
916 movptr(t1, entry_point, offset, t0);
917 jalr(t1, offset);
918 if (retaddr != nullptr) {
919 bind(*retaddr);
920 }
921 pop_reg(RegSet::of(t1, xmethod), sp); // pop << t1 & xmethod >> from sp
922 }
923
924 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
925 call_VM_leaf_base(entry_point, number_of_arguments);
926 }
927
928 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
929 pass_arg0(this, arg_0);
930 call_VM_leaf_base(entry_point, 1);
931 }
932
933 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
934 assert_different_registers(arg_1, c_rarg0);
935 pass_arg0(this, arg_0);
936 pass_arg1(this, arg_1);
937 call_VM_leaf_base(entry_point, 2);
938 }
939
940 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
941 Register arg_1, Register arg_2) {
942 assert_different_registers(arg_1, c_rarg0);
943 assert_different_registers(arg_2, c_rarg0, c_rarg1);
944 pass_arg0(this, arg_0);
945 pass_arg1(this, arg_1);
946 pass_arg2(this, arg_2);
947 call_VM_leaf_base(entry_point, 3);
948 }
949
950 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
951 pass_arg0(this, arg_0);
952 MacroAssembler::call_VM_leaf_base(entry_point, 1);
953 }
954
955 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
956
957 assert_different_registers(arg_0, c_rarg1);
958 pass_arg1(this, arg_1);
959 pass_arg0(this, arg_0);
960 MacroAssembler::call_VM_leaf_base(entry_point, 2);
961 }
962
963 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
964 assert_different_registers(arg_0, c_rarg1, c_rarg2);
965 assert_different_registers(arg_1, c_rarg2);
966 pass_arg2(this, arg_2);
967 pass_arg1(this, arg_1);
968 pass_arg0(this, arg_0);
969 MacroAssembler::call_VM_leaf_base(entry_point, 3);
970 }
971
972 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
973 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
974 assert_different_registers(arg_1, c_rarg2, c_rarg3);
975 assert_different_registers(arg_2, c_rarg3);
976
977 pass_arg3(this, arg_3);
978 pass_arg2(this, arg_2);
979 pass_arg1(this, arg_1);
980 pass_arg0(this, arg_0);
981 MacroAssembler::call_VM_leaf_base(entry_point, 4);
982 }
983
984 void MacroAssembler::la(Register Rd, const address addr) {
985 int32_t offset;
986 la(Rd, addr, offset);
987 addi(Rd, Rd, offset);
988 }
989
990 void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
991 int64_t distance = addr - pc();
992 assert(is_valid_32bit_offset(distance), "Must be");
993 auipc(Rd, (int32_t)distance + 0x800);
994 offset = ((int32_t)distance << 20) >> 20;
995 }
996
997 // Materialize with auipc + addi sequence if adr is a literal
998 // address inside code cache. Emit a movptr sequence otherwise.
999 void MacroAssembler::la(Register Rd, const Address &adr) {
1000 switch (adr.getMode()) {
1001 case Address::literal: {
1002 relocInfo::relocType rtype = adr.rspec().reloc()->type();
1003 if (rtype == relocInfo::none) {
1004 mv(Rd, (intptr_t)(adr.target()));
1005 } else {
1006 if (CodeCache::contains(adr.target())) {
1007 relocate(adr.rspec(), [&] {
1008 la(Rd, adr.target());
1009 });
1010 } else {
1011 relocate(adr.rspec(), [&] {
1012 movptr(Rd, adr.target());
1013 });
1014 }
1015 }
1016 break;
1017 }
1018 case Address::base_plus_offset: {
1019 Address new_adr = legitimize_address(Rd, adr);
1020 if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
1021 addi(Rd, new_adr.base(), new_adr.offset());
1022 }
1023 break;
1024 }
1025 default:
1026 ShouldNotReachHere();
1027 }
1028 }
1029
1030 void MacroAssembler::la(Register Rd, Label &label) {
1031 IncompressibleScope scope(this); // the label address may be patched back.
1032 wrap_label(Rd, label, &MacroAssembler::la);
1033 }
1034
1035 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
1036 lui(Rd, (uint32_t)imm << 12);
1037 srli(Rd, Rd, 12);
1038 }
1039
1040 void MacroAssembler::li32(Register Rd, int32_t imm) {
1041 // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
1042 int64_t upper = imm, lower = imm;
1043 lower = (imm << 20) >> 20;
1044 upper -= lower;
1045 upper = (int32_t)upper;
1046 // lui Rd, imm[31:12] + imm[11]
1047 lui(Rd, upper);
1048 addiw(Rd, Rd, lower);
1049 }
1050
1051 void MacroAssembler::li(Register Rd, int64_t imm) {
1052 // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
1053 // li -> c.li
1054 if (do_compress() && (is_simm6(imm) && Rd != x0)) {
1055 c_li(Rd, imm);
1056 return;
1057 }
1058
1059 int shift = 12;
1060 int64_t upper = imm, lower = imm;
1061 // Split imm to a lower 12-bit sign-extended part and the remainder,
1062 // because addi will sign-extend the lower imm.
1063 lower = ((int32_t)imm << 20) >> 20;
1064 upper -= lower;
1065
1066 // Test whether imm is a 32-bit integer.
1067 if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
1068 (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
1069 while (((upper >> shift) & 1) == 0) { shift++; }
1070 upper >>= shift;
1071 li(Rd, upper);
1072 slli(Rd, Rd, shift);
1073 if (lower != 0) {
1074 addi(Rd, Rd, lower);
1075 }
1076 } else {
1077 // 32-bit integer
1078 Register hi_Rd = zr;
1079 if (upper != 0) {
1080 lui(Rd, (int32_t)upper);
1081 hi_Rd = Rd;
1082 }
1083 if (lower != 0 || hi_Rd == zr) {
1084 addiw(Rd, hi_Rd, lower);
1085 }
1086 }
1087 }
1088
1089 void MacroAssembler::j(const address dest, Register temp) {
1090 assert(CodeCache::contains(dest), "Must be");
1091 assert_cond(dest != nullptr);
1092 int64_t distance = dest - pc();
1093
1094 // We can't patch C, i.e. if Label wasn't bound we need to patch this jump.
1095 IncompressibleScope scope(this);
1096 if (is_simm21(distance) && ((distance % 2) == 0)) {
1097 Assembler::jal(x0, distance);
1098 } else {
1099 assert(temp != noreg && temp != x0, "Expecting a register");
1100 assert(temp != x1 && temp != x5, "temp register must not be x1/x5.");
1101 int32_t offset = 0;
1102 la(temp, dest, offset);
1103 jr(temp, offset);
1104 }
1105 }
1106
1107 void MacroAssembler::j(const Address &dest, Register temp) {
1108 switch (dest.getMode()) {
1109 case Address::literal: {
1110 if (CodeCache::contains(dest.target())) {
1111 far_jump(dest, temp);
1112 } else {
1113 relocate(dest.rspec(), [&] {
1114 int32_t offset;
1115 movptr(temp, dest.target(), offset);
1116 jr(temp, offset);
1117 });
1118 }
1119 break;
1120 }
1121 case Address::base_plus_offset: {
1122 int32_t offset = ((int32_t)dest.offset() << 20) >> 20;
1123 la(temp, Address(dest.base(), dest.offset() - offset));
1124 jr(temp, offset);
1125 break;
1126 }
1127 default:
1128 ShouldNotReachHere();
1129 }
1130 }
1131
1132 void MacroAssembler::j(Label &lab, Register temp) {
1133 assert_different_registers(x0, temp);
1134 if (lab.is_bound()) {
1135 MacroAssembler::j(target(lab), temp);
1136 } else {
1137 lab.add_patch_at(code(), locator());
1138 MacroAssembler::j(pc(), temp);
1139 }
1140 }
1141
1142 void MacroAssembler::jr(Register Rd, int32_t offset) {
1143 assert(Rd != noreg, "expecting a register");
1144 assert(Rd != x1 && Rd != x5, "Rd register must not be x1/x5.");
1145 Assembler::jalr(x0, Rd, offset);
1146 }
1147
1148 void MacroAssembler::call(const address dest, Register temp) {
1149 assert_cond(dest != nullptr);
1150 assert(temp != noreg, "expecting a register");
1151 assert(temp != x5, "temp register must not be x5.");
1152 int32_t offset = 0;
1153 la(temp, dest, offset);
1154 jalr(temp, offset);
1155 }
1156
1157 void MacroAssembler::jalr(Register Rs, int32_t offset) {
1158 assert(Rs != noreg, "expecting a register");
1159 assert(Rs != x5, "Rs register must not be x5.");
1160 Assembler::jalr(x1, Rs, offset);
1161 }
1162
1163 void MacroAssembler::rt_call(address dest, Register tmp) {
1164 assert(tmp != x5, "tmp register must not be x5.");
1165 RuntimeAddress target(dest);
1166 if (CodeCache::contains(dest)) {
1167 far_call(target, tmp);
1168 } else {
1169 relocate(target.rspec(), [&] {
1170 int32_t offset;
1171 movptr(tmp, target.target(), offset);
1172 jalr(tmp, offset);
1173 });
1174 }
1175 }
1176
1177 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
1178 if (L.is_bound()) {
1179 (this->*insn)(Rt, target(L));
1180 } else {
1181 L.add_patch_at(code(), locator());
1182 (this->*insn)(Rt, pc());
1183 }
1184 }
1185
1186 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
1187 compare_and_branch_insn insn,
1188 compare_and_branch_label_insn neg_insn, bool is_far) {
1189 if (is_far) {
1190 Label done;
1191 (this->*neg_insn)(r1, r2, done, /* is_far */ false);
1192 j(L);
1193 bind(done);
1194 } else {
1195 if (L.is_bound()) {
1196 (this->*insn)(r1, r2, target(L));
1197 } else {
1198 L.add_patch_at(code(), locator());
1199 (this->*insn)(r1, r2, pc());
1200 }
1201 }
1202 }
1203
1204 #define INSN(NAME, NEG_INSN) \
1205 void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) { \
1206 wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far); \
1207 }
1208
1209 INSN(beq, bne);
1210 INSN(bne, beq);
1211 INSN(blt, bge);
1212 INSN(bge, blt);
1213 INSN(bltu, bgeu);
1214 INSN(bgeu, bltu);
1215
1216 #undef INSN
1217
1218 #define INSN(NAME) \
1219 void MacroAssembler::NAME##z(Register Rs, const address dest) { \
1220 NAME(Rs, zr, dest); \
1221 } \
1222 void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) { \
1223 NAME(Rs, zr, l, is_far); \
1224 } \
1225
1226 INSN(beq);
1227 INSN(bne);
1228 INSN(blt);
1229 INSN(ble);
1230 INSN(bge);
1231 INSN(bgt);
1232
1233 #undef INSN
1234
1235 #define INSN(NAME, NEG_INSN) \
1236 void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) { \
1237 NEG_INSN(Rt, Rs, dest); \
1238 } \
1239 void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) { \
1240 NEG_INSN(Rt, Rs, l, is_far); \
1241 }
1242
1243 INSN(bgt, blt);
1244 INSN(ble, bge);
1245 INSN(bgtu, bltu);
1246 INSN(bleu, bgeu);
1247
1248 #undef INSN
1249
1250 // cmov
1251 void MacroAssembler::cmov_eq(Register cmp1, Register cmp2, Register dst, Register src) {
1252 if (UseZicond) {
1253 xorr(t0, cmp1, cmp2);
1254 czero_eqz(dst, dst, t0);
1255 czero_nez(t0 , src, t0);
1256 orr(dst, dst, t0);
1257 return;
1258 }
1259 Label no_set;
1260 bne(cmp1, cmp2, no_set);
1261 mv(dst, src);
1262 bind(no_set);
1263 }
1264
1265 void MacroAssembler::cmov_ne(Register cmp1, Register cmp2, Register dst, Register src) {
1266 if (UseZicond) {
1267 xorr(t0, cmp1, cmp2);
1268 czero_nez(dst, dst, t0);
1269 czero_eqz(t0 , src, t0);
1270 orr(dst, dst, t0);
1271 return;
1272 }
1273 Label no_set;
1274 beq(cmp1, cmp2, no_set);
1275 mv(dst, src);
1276 bind(no_set);
1277 }
1278
1279 void MacroAssembler::cmov_le(Register cmp1, Register cmp2, Register dst, Register src) {
1280 if (UseZicond) {
1281 slt(t0, cmp2, cmp1);
1282 czero_eqz(dst, dst, t0);
1283 czero_nez(t0, src, t0);
1284 orr(dst, dst, t0);
1285 return;
1286 }
1287 Label no_set;
1288 bgt(cmp1, cmp2, no_set);
1289 mv(dst, src);
1290 bind(no_set);
1291 }
1292
1293 void MacroAssembler::cmov_leu(Register cmp1, Register cmp2, Register dst, Register src) {
1294 if (UseZicond) {
1295 sltu(t0, cmp2, cmp1);
1296 czero_eqz(dst, dst, t0);
1297 czero_nez(t0, src, t0);
1298 orr(dst, dst, t0);
1299 return;
1300 }
1301 Label no_set;
1302 bgtu(cmp1, cmp2, no_set);
1303 mv(dst, src);
1304 bind(no_set);
1305 }
1306
1307 void MacroAssembler::cmov_ge(Register cmp1, Register cmp2, Register dst, Register src) {
1308 if (UseZicond) {
1309 slt(t0, cmp1, cmp2);
1310 czero_eqz(dst, dst, t0);
1311 czero_nez(t0, src, t0);
1312 orr(dst, dst, t0);
1313 return;
1314 }
1315 Label no_set;
1316 blt(cmp1, cmp2, no_set);
1317 mv(dst, src);
1318 bind(no_set);
1319 }
1320
1321 void MacroAssembler::cmov_geu(Register cmp1, Register cmp2, Register dst, Register src) {
1322 if (UseZicond) {
1323 sltu(t0, cmp1, cmp2);
1324 czero_eqz(dst, dst, t0);
1325 czero_nez(t0, src, t0);
1326 orr(dst, dst, t0);
1327 return;
1328 }
1329 Label no_set;
1330 bltu(cmp1, cmp2, no_set);
1331 mv(dst, src);
1332 bind(no_set);
1333 }
1334
1335 void MacroAssembler::cmov_lt(Register cmp1, Register cmp2, Register dst, Register src) {
1336 if (UseZicond) {
1337 slt(t0, cmp1, cmp2);
1338 czero_nez(dst, dst, t0);
1339 czero_eqz(t0, src, t0);
1340 orr(dst, dst, t0);
1341 return;
1342 }
1343 Label no_set;
1344 bge(cmp1, cmp2, no_set);
1345 mv(dst, src);
1346 bind(no_set);
1347 }
1348
1349 void MacroAssembler::cmov_ltu(Register cmp1, Register cmp2, Register dst, Register src) {
1350 if (UseZicond) {
1351 sltu(t0, cmp1, cmp2);
1352 czero_nez(dst, dst, t0);
1353 czero_eqz(t0, src, t0);
1354 orr(dst, dst, t0);
1355 return;
1356 }
1357 Label no_set;
1358 bgeu(cmp1, cmp2, no_set);
1359 mv(dst, src);
1360 bind(no_set);
1361 }
1362
1363 void MacroAssembler::cmov_gt(Register cmp1, Register cmp2, Register dst, Register src) {
1364 if (UseZicond) {
1365 slt(t0, cmp2, cmp1);
1366 czero_nez(dst, dst, t0);
1367 czero_eqz(t0, src, t0);
1368 orr(dst, dst, t0);
1369 return;
1370 }
1371 Label no_set;
1372 ble(cmp1, cmp2, no_set);
1373 mv(dst, src);
1374 bind(no_set);
1375 }
1376
1377 void MacroAssembler::cmov_gtu(Register cmp1, Register cmp2, Register dst, Register src) {
1378 if (UseZicond) {
1379 sltu(t0, cmp2, cmp1);
1380 czero_nez(dst, dst, t0);
1381 czero_eqz(t0, src, t0);
1382 orr(dst, dst, t0);
1383 return;
1384 }
1385 Label no_set;
1386 bleu(cmp1, cmp2, no_set);
1387 mv(dst, src);
1388 bind(no_set);
1389 }
1390
1391 // ----------- cmove float/double -----------
1392
1393 void MacroAssembler::cmov_fp_eq(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1394 Label no_set;
1395 bne(cmp1, cmp2, no_set);
1396 if (is_single) {
1397 fmv_s(dst, src);
1398 } else {
1399 fmv_d(dst, src);
1400 }
1401 bind(no_set);
1402 }
1403
1404 void MacroAssembler::cmov_fp_ne(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1405 Label no_set;
1406 beq(cmp1, cmp2, no_set);
1407 if (is_single) {
1408 fmv_s(dst, src);
1409 } else {
1410 fmv_d(dst, src);
1411 }
1412 bind(no_set);
1413 }
1414
1415 void MacroAssembler::cmov_fp_le(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1416 Label no_set;
1417 bgt(cmp1, cmp2, no_set);
1418 if (is_single) {
1419 fmv_s(dst, src);
1420 } else {
1421 fmv_d(dst, src);
1422 }
1423 bind(no_set);
1424 }
1425
1426 void MacroAssembler::cmov_fp_leu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1427 Label no_set;
1428 bgtu(cmp1, cmp2, no_set);
1429 if (is_single) {
1430 fmv_s(dst, src);
1431 } else {
1432 fmv_d(dst, src);
1433 }
1434 bind(no_set);
1435 }
1436
1437 void MacroAssembler::cmov_fp_ge(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1438 Label no_set;
1439 blt(cmp1, cmp2, no_set);
1440 if (is_single) {
1441 fmv_s(dst, src);
1442 } else {
1443 fmv_d(dst, src);
1444 }
1445 bind(no_set);
1446 }
1447
1448 void MacroAssembler::cmov_fp_geu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1449 Label no_set;
1450 bltu(cmp1, cmp2, no_set);
1451 if (is_single) {
1452 fmv_s(dst, src);
1453 } else {
1454 fmv_d(dst, src);
1455 }
1456 bind(no_set);
1457 }
1458
1459 void MacroAssembler::cmov_fp_lt(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1460 Label no_set;
1461 bge(cmp1, cmp2, no_set);
1462 if (is_single) {
1463 fmv_s(dst, src);
1464 } else {
1465 fmv_d(dst, src);
1466 }
1467 bind(no_set);
1468 }
1469
1470 void MacroAssembler::cmov_fp_ltu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1471 Label no_set;
1472 bgeu(cmp1, cmp2, no_set);
1473 if (is_single) {
1474 fmv_s(dst, src);
1475 } else {
1476 fmv_d(dst, src);
1477 }
1478 bind(no_set);
1479 }
1480
1481 void MacroAssembler::cmov_fp_gt(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1482 Label no_set;
1483 ble(cmp1, cmp2, no_set);
1484 if (is_single) {
1485 fmv_s(dst, src);
1486 } else {
1487 fmv_d(dst, src);
1488 }
1489 bind(no_set);
1490 }
1491
1492 void MacroAssembler::cmov_fp_gtu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1493 Label no_set;
1494 bleu(cmp1, cmp2, no_set);
1495 if (is_single) {
1496 fmv_s(dst, src);
1497 } else {
1498 fmv_d(dst, src);
1499 }
1500 bind(no_set);
1501 }
1502
1503 // ----------- cmove, compare float/double -----------
1504 //
1505 // For CmpF/D + CMoveI/L, ordered ones are quite straight and simple,
1506 // so, just list behaviour of unordered ones as follow.
1507 //
1508 // Set dst (CMoveI (Binary cop (CmpF/D op1 op2)) (Binary dst src))
1509 // (If one or both inputs to the compare are NaN, then)
1510 // 1. (op1 lt op2) => true => CMove: dst = src
1511 // 2. (op1 le op2) => true => CMove: dst = src
1512 // 3. (op1 gt op2) => false => CMove: dst = dst
1513 // 4. (op1 ge op2) => false => CMove: dst = dst
1514 // 5. (op1 eq op2) => false => CMove: dst = dst
1515 // 6. (op1 ne op2) => true => CMove: dst = src
1516
1517 void MacroAssembler::cmov_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1518 if (UseZicond) {
1519 if (is_single) {
1520 feq_s(t0, cmp1, cmp2);
1521 } else {
1522 feq_d(t0, cmp1, cmp2);
1523 }
1524 czero_nez(dst, dst, t0);
1525 czero_eqz(t0 , src, t0);
1526 orr(dst, dst, t0);
1527 return;
1528 }
1529 Label no_set;
1530 if (is_single) {
1531 // jump if cmp1 != cmp2, including the case of NaN
1532 // fallthrough (i.e. move src to dst) if cmp1 == cmp2
1533 float_bne(cmp1, cmp2, no_set);
1534 } else {
1535 double_bne(cmp1, cmp2, no_set);
1536 }
1537 mv(dst, src);
1538 bind(no_set);
1539 }
1540
1541 void MacroAssembler::cmov_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1542 if (UseZicond) {
1543 if (is_single) {
1544 feq_s(t0, cmp1, cmp2);
1545 } else {
1546 feq_d(t0, cmp1, cmp2);
1547 }
1548 czero_eqz(dst, dst, t0);
1549 czero_nez(t0 , src, t0);
1550 orr(dst, dst, t0);
1551 return;
1552 }
1553 Label no_set;
1554 if (is_single) {
1555 // jump if cmp1 == cmp2
1556 // fallthrough (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1557 float_beq(cmp1, cmp2, no_set);
1558 } else {
1559 double_beq(cmp1, cmp2, no_set);
1560 }
1561 mv(dst, src);
1562 bind(no_set);
1563 }
1564
1565 void MacroAssembler::cmov_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1566 if (UseZicond) {
1567 if (is_single) {
1568 flt_s(t0, cmp2, cmp1);
1569 } else {
1570 flt_d(t0, cmp2, cmp1);
1571 }
1572 czero_eqz(dst, dst, t0);
1573 czero_nez(t0 , src, t0);
1574 orr(dst, dst, t0);
1575 return;
1576 }
1577 Label no_set;
1578 if (is_single) {
1579 // jump if cmp1 > cmp2
1580 // fallthrough (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1581 float_bgt(cmp1, cmp2, no_set);
1582 } else {
1583 double_bgt(cmp1, cmp2, no_set);
1584 }
1585 mv(dst, src);
1586 bind(no_set);
1587 }
1588
1589 void MacroAssembler::cmov_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1590 if (UseZicond) {
1591 if (is_single) {
1592 fle_s(t0, cmp2, cmp1);
1593 } else {
1594 fle_d(t0, cmp2, cmp1);
1595 }
1596 czero_nez(dst, dst, t0);
1597 czero_eqz(t0 , src, t0);
1598 orr(dst, dst, t0);
1599 return;
1600 }
1601 Label no_set;
1602 if (is_single) {
1603 // jump if cmp1 < cmp2 or either is NaN
1604 // fallthrough (i.e. move src to dst) if cmp1 >= cmp2
1605 float_blt(cmp1, cmp2, no_set, false, true);
1606 } else {
1607 double_blt(cmp1, cmp2, no_set, false, true);
1608 }
1609 mv(dst, src);
1610 bind(no_set);
1611 }
1612
1613 void MacroAssembler::cmov_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1614 if (UseZicond) {
1615 if (is_single) {
1616 fle_s(t0, cmp2, cmp1);
1617 } else {
1618 fle_d(t0, cmp2, cmp1);
1619 }
1620 czero_eqz(dst, dst, t0);
1621 czero_nez(t0 , src, t0);
1622 orr(dst, dst, t0);
1623 return;
1624 }
1625 Label no_set;
1626 if (is_single) {
1627 // jump if cmp1 >= cmp2
1628 // fallthrough (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1629 float_bge(cmp1, cmp2, no_set);
1630 } else {
1631 double_bge(cmp1, cmp2, no_set);
1632 }
1633 mv(dst, src);
1634 bind(no_set);
1635 }
1636
1637 void MacroAssembler::cmov_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1638 if (UseZicond) {
1639 if (is_single) {
1640 flt_s(t0, cmp2, cmp1);
1641 } else {
1642 flt_d(t0, cmp2, cmp1);
1643 }
1644 czero_nez(dst, dst, t0);
1645 czero_eqz(t0 , src, t0);
1646 orr(dst, dst, t0);
1647 return;
1648 }
1649 Label no_set;
1650 if (is_single) {
1651 // jump if cmp1 <= cmp2 or either is NaN
1652 // fallthrough (i.e. move src to dst) if cmp1 > cmp2
1653 float_ble(cmp1, cmp2, no_set, false, true);
1654 } else {
1655 double_ble(cmp1, cmp2, no_set, false, true);
1656 }
1657 mv(dst, src);
1658 bind(no_set);
1659 }
1660
1661 // ----------- cmove float/double, compare float/double -----------
1662
1663 // Move src to dst only if cmp1 == cmp2,
1664 // otherwise leave dst unchanged, including the case where one of them is NaN.
1665 // Clarification:
1666 // java code : cmp1 != cmp2 ? dst : src
1667 // transformed to : CMove dst, (cmp1 eq cmp2), dst, src
1668 void MacroAssembler::cmov_fp_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2,
1669 FloatRegister dst, FloatRegister src,
1670 bool cmp_single, bool cmov_single) {
1671 Label no_set;
1672 if (cmp_single) {
1673 // jump if cmp1 != cmp2, including the case of NaN
1674 // not jump (i.e. move src to dst) if cmp1 == cmp2
1675 float_bne(cmp1, cmp2, no_set);
1676 } else {
1677 double_bne(cmp1, cmp2, no_set);
1678 }
1679 if (cmov_single) {
1680 fmv_s(dst, src);
1681 } else {
1682 fmv_d(dst, src);
1683 }
1684 bind(no_set);
1685 }
1686
1687 // Keep dst unchanged only if cmp1 == cmp2,
1688 // otherwise move src to dst, including the case where one of them is NaN.
1689 // Clarification:
1690 // java code : cmp1 == cmp2 ? dst : src
1691 // transformed to : CMove dst, (cmp1 ne cmp2), dst, src
1692 void MacroAssembler::cmov_fp_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2,
1693 FloatRegister dst, FloatRegister src,
1694 bool cmp_single, bool cmov_single) {
1695 Label no_set;
1696 if (cmp_single) {
1697 // jump if cmp1 == cmp2
1698 // not jump (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1699 float_beq(cmp1, cmp2, no_set);
1700 } else {
1701 double_beq(cmp1, cmp2, no_set);
1702 }
1703 if (cmov_single) {
1704 fmv_s(dst, src);
1705 } else {
1706 fmv_d(dst, src);
1707 }
1708 bind(no_set);
1709 }
1710
1711 // When cmp1 <= cmp2 or any of them is NaN then dst = src, otherwise, dst = dst
1712 // Clarification
1713 // scenario 1:
1714 // java code : cmp2 < cmp1 ? dst : src
1715 // transformed to : CMove dst, (cmp1 le cmp2), dst, src
1716 // scenario 2:
1717 // java code : cmp1 > cmp2 ? dst : src
1718 // transformed to : CMove dst, (cmp1 le cmp2), dst, src
1719 void MacroAssembler::cmov_fp_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2,
1720 FloatRegister dst, FloatRegister src,
1721 bool cmp_single, bool cmov_single) {
1722 Label no_set;
1723 if (cmp_single) {
1724 // jump if cmp1 > cmp2
1725 // not jump (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1726 float_bgt(cmp1, cmp2, no_set);
1727 } else {
1728 double_bgt(cmp1, cmp2, no_set);
1729 }
1730 if (cmov_single) {
1731 fmv_s(dst, src);
1732 } else {
1733 fmv_d(dst, src);
1734 }
1735 bind(no_set);
1736 }
1737
1738 void MacroAssembler::cmov_fp_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2,
1739 FloatRegister dst, FloatRegister src,
1740 bool cmp_single, bool cmov_single) {
1741 Label no_set;
1742 if (cmp_single) {
1743 // jump if cmp1 < cmp2 or either is NaN
1744 // not jump (i.e. move src to dst) if cmp1 >= cmp2
1745 float_blt(cmp1, cmp2, no_set, false, true);
1746 } else {
1747 double_blt(cmp1, cmp2, no_set, false, true);
1748 }
1749 if (cmov_single) {
1750 fmv_s(dst, src);
1751 } else {
1752 fmv_d(dst, src);
1753 }
1754 bind(no_set);
1755 }
1756
1757 // When cmp1 < cmp2 or any of them is NaN then dst = src, otherwise, dst = dst
1758 // Clarification
1759 // scenario 1:
1760 // java code : cmp2 <= cmp1 ? dst : src
1761 // transformed to : CMove dst, (cmp1 lt cmp2), dst, src
1762 // scenario 2:
1763 // java code : cmp1 >= cmp2 ? dst : src
1764 // transformed to : CMove dst, (cmp1 lt cmp2), dst, src
1765 void MacroAssembler::cmov_fp_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2,
1766 FloatRegister dst, FloatRegister src,
1767 bool cmp_single, bool cmov_single) {
1768 Label no_set;
1769 if (cmp_single) {
1770 // jump if cmp1 >= cmp2
1771 // not jump (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1772 float_bge(cmp1, cmp2, no_set);
1773 } else {
1774 double_bge(cmp1, cmp2, no_set);
1775 }
1776 if (cmov_single) {
1777 fmv_s(dst, src);
1778 } else {
1779 fmv_d(dst, src);
1780 }
1781 bind(no_set);
1782 }
1783
1784 void MacroAssembler::cmov_fp_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2,
1785 FloatRegister dst, FloatRegister src,
1786 bool cmp_single, bool cmov_single) {
1787 Label no_set;
1788 if (cmp_single) {
1789 // jump if cmp1 <= cmp2 or either is NaN
1790 // not jump (i.e. move src to dst) if cmp1 > cmp2
1791 float_ble(cmp1, cmp2, no_set, false, true);
1792 } else {
1793 double_ble(cmp1, cmp2, no_set, false, true);
1794 }
1795 if (cmov_single) {
1796 fmv_s(dst, src);
1797 } else {
1798 fmv_d(dst, src);
1799 }
1800 bind(no_set);
1801 }
1802
1803 // Float compare branch instructions
1804
1805 #define INSN(NAME, FLOATCMP, BRANCH) \
1806 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) { \
1807 FLOATCMP##_s(t0, Rs1, Rs2); \
1808 BRANCH(t0, l, is_far); \
1809 } \
1810 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) { \
1811 FLOATCMP##_d(t0, Rs1, Rs2); \
1812 BRANCH(t0, l, is_far); \
1813 }
1814
1815 INSN(beq, feq, bnez);
1816 INSN(bne, feq, beqz);
1817
1818 #undef INSN
1819
1820
1821 #define INSN(NAME, FLOATCMP1, FLOATCMP2) \
1822 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1823 bool is_far, bool is_unordered) { \
1824 if (is_unordered) { \
1825 /* jump if either source is NaN or condition is expected */ \
1826 FLOATCMP2##_s(t0, Rs2, Rs1); \
1827 beqz(t0, l, is_far); \
1828 } else { \
1829 /* jump if no NaN in source and condition is expected */ \
1830 FLOATCMP1##_s(t0, Rs1, Rs2); \
1831 bnez(t0, l, is_far); \
1832 } \
1833 } \
1834 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1835 bool is_far, bool is_unordered) { \
1836 if (is_unordered) { \
1837 /* jump if either source is NaN or condition is expected */ \
1838 FLOATCMP2##_d(t0, Rs2, Rs1); \
1839 beqz(t0, l, is_far); \
1840 } else { \
1841 /* jump if no NaN in source and condition is expected */ \
1842 FLOATCMP1##_d(t0, Rs1, Rs2); \
1843 bnez(t0, l, is_far); \
1844 } \
1845 }
1846
1847 INSN(ble, fle, flt);
1848 INSN(blt, flt, fle);
1849
1850 #undef INSN
1851
1852 #define INSN(NAME, CMP) \
1853 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1854 bool is_far, bool is_unordered) { \
1855 float_##CMP(Rs2, Rs1, l, is_far, is_unordered); \
1856 } \
1857 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1858 bool is_far, bool is_unordered) { \
1859 double_##CMP(Rs2, Rs1, l, is_far, is_unordered); \
1860 }
1861
1862 INSN(bgt, blt);
1863 INSN(bge, ble);
1864
1865 #undef INSN
1866
1867 void MacroAssembler::csrr(Register Rd, unsigned csr) {
1868 // These three are specified in zicntr and are unused.
1869 // Before adding use-cases add the appropriate hwprobe and flag.
1870 assert(csr != CSR_INSTRET && csr != CSR_CYCLE && csr != CSR_TIME,
1871 "Not intended for use without enabling zicntr.");
1872 csrrs(Rd, csr, x0);
1873 }
1874
1875 #define INSN(NAME, OPFUN) \
1876 void MacroAssembler::NAME(unsigned csr, Register Rs) { \
1877 OPFUN(x0, csr, Rs); \
1878 }
1879
1880 INSN(csrw, csrrw);
1881 INSN(csrs, csrrs);
1882 INSN(csrc, csrrc);
1883
1884 #undef INSN
1885
1886 #define INSN(NAME, OPFUN) \
1887 void MacroAssembler::NAME(unsigned csr, unsigned imm) { \
1888 OPFUN(x0, csr, imm); \
1889 }
1890
1891 INSN(csrwi, csrrwi);
1892 INSN(csrsi, csrrsi);
1893 INSN(csrci, csrrci);
1894
1895 #undef INSN
1896
1897 #define INSN(NAME, CSR) \
1898 void MacroAssembler::NAME(Register Rd, Register Rs) { \
1899 csrrw(Rd, CSR, Rs); \
1900 }
1901
1902 INSN(fscsr, CSR_FCSR);
1903 INSN(fsrm, CSR_FRM);
1904 INSN(fsflags, CSR_FFLAGS);
1905
1906 #undef INSN
1907
1908 #define INSN(NAME) \
1909 void MacroAssembler::NAME(Register Rs) { \
1910 NAME(x0, Rs); \
1911 }
1912
1913 INSN(fscsr);
1914 INSN(fsrm);
1915 INSN(fsflags);
1916
1917 #undef INSN
1918
1919 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
1920 guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1921 csrrwi(Rd, CSR_FRM, imm);
1922 }
1923
1924 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1925 csrrwi(Rd, CSR_FFLAGS, imm);
1926 }
1927
1928 #define INSN(NAME) \
1929 void MacroAssembler::NAME(unsigned imm) { \
1930 NAME(x0, imm); \
1931 }
1932
1933 INSN(fsrmi);
1934 INSN(fsflagsi);
1935
1936 #undef INSN
1937
1938 void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
1939 if (RestoreMXCSROnJNICalls) {
1940 Label skip_fsrmi;
1941 frrm(tmp);
1942 // Set FRM to the state we need. We do want Round to Nearest.
1943 // We don't want non-IEEE rounding modes.
1944 guarantee(RoundingMode::rne == 0, "must be");
1945 beqz(tmp, skip_fsrmi); // Only reset FRM if it's wrong
1946 fsrmi(RoundingMode::rne);
1947 bind(skip_fsrmi);
1948 }
1949 }
1950
1951 void MacroAssembler::push_reg(Register Rs) {
1952 subi(esp, esp, wordSize);
1953 sd(Rs, Address(esp, 0));
1954 }
1955
1956 void MacroAssembler::pop_reg(Register Rd) {
1957 ld(Rd, Address(esp, 0));
1958 addi(esp, esp, wordSize);
1959 }
1960
1961 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1962 int count = 0;
1963 // Scan bitset to accumulate register pairs
1964 for (int reg = 31; reg >= 0; reg--) {
1965 if ((1U << 31) & bitset) {
1966 regs[count++] = reg;
1967 }
1968 bitset <<= 1;
1969 }
1970 return count;
1971 }
1972
1973 // Push integer registers in the bitset supplied. Don't push sp.
1974 // Return the number of words pushed
1975 int MacroAssembler::push_reg(RegSet regset, Register stack) {
1976 if (regset.bits() == 0) {
1977 return 0;
1978 }
1979 auto bitset = integer_cast<unsigned int>(regset.bits());
1980 DEBUG_ONLY(int words_pushed = 0;)
1981 unsigned char regs[32];
1982 int count = bitset_to_regs(bitset, regs);
1983 // reserve one slot to align for odd count
1984 int offset = is_even(count) ? 0 : wordSize;
1985
1986 if (count) {
1987 sub(stack, stack, count * wordSize + offset);
1988 }
1989 for (int i = count - 1; i >= 0; i--) {
1990 sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1991 DEBUG_ONLY(words_pushed++;)
1992 }
1993
1994 assert(words_pushed == count, "oops, pushed != count");
1995
1996 return count;
1997 }
1998
1999 int MacroAssembler::pop_reg(RegSet regset, Register stack) {
2000 if (regset.bits() == 0) {
2001 return 0;
2002 }
2003 auto bitset = integer_cast<unsigned int>(regset.bits());
2004 DEBUG_ONLY(int words_popped = 0;)
2005 unsigned char regs[32];
2006 int count = bitset_to_regs(bitset, regs);
2007 // reserve one slot to align for odd count
2008 int offset = is_even(count) ? 0 : wordSize;
2009
2010 for (int i = count - 1; i >= 0; i--) {
2011 ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
2012 DEBUG_ONLY(words_popped++;)
2013 }
2014
2015 if (count) {
2016 add(stack, stack, count * wordSize + offset);
2017 }
2018 assert(words_popped == count, "oops, popped != count");
2019
2020 return count;
2021 }
2022
2023 // Push floating-point registers in the bitset supplied.
2024 // Return the number of words pushed
2025 int MacroAssembler::push_fp(FloatRegSet regset, Register stack) {
2026 if (regset.bits() == 0) {
2027 return 0;
2028 }
2029 auto bitset = integer_cast<unsigned int>(regset.bits());
2030 DEBUG_ONLY(int words_pushed = 0;)
2031 unsigned char regs[32];
2032 int count = bitset_to_regs(bitset, regs);
2033 int push_slots = count + (count & 1);
2034
2035 if (count) {
2036 subi(stack, stack, push_slots * wordSize);
2037 }
2038
2039 for (int i = count - 1; i >= 0; i--) {
2040 fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
2041 DEBUG_ONLY(words_pushed++;)
2042 }
2043
2044 assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2045
2046 return count;
2047 }
2048
2049 int MacroAssembler::pop_fp(FloatRegSet regset, Register stack) {
2050 if (regset.bits() == 0) {
2051 return 0;
2052 }
2053 auto bitset = integer_cast<unsigned int>(regset.bits());
2054 DEBUG_ONLY(int words_popped = 0;)
2055 unsigned char regs[32];
2056 int count = bitset_to_regs(bitset, regs);
2057 int pop_slots = count + (count & 1);
2058
2059 for (int i = count - 1; i >= 0; i--) {
2060 fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
2061 DEBUG_ONLY(words_popped++;)
2062 }
2063
2064 if (count) {
2065 addi(stack, stack, pop_slots * wordSize);
2066 }
2067
2068 assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
2069
2070 return count;
2071 }
2072
2073 /**
2074 * Emits code to update CRC-32 with a byte value according to constants in table
2075 *
2076 * @param [in,out]crc Register containing the crc.
2077 * @param [in]val Register containing the byte to fold into the CRC.
2078 * @param [in]table Register containing the table of crc constants.
2079 *
2080 * uint32_t crc;
2081 * val = crc_table[(val ^ crc) & 0xFF];
2082 * crc = val ^ (crc >> 8);
2083 *
2084 */
2085 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2086 assert_different_registers(crc, val, table);
2087
2088 xorr(val, val, crc);
2089 zext(val, val, 8);
2090 shadd(val, val, table, val, 2);
2091 lwu(val, Address(val));
2092 srli(crc, crc, 8);
2093 xorr(crc, val, crc);
2094 }
2095
2096 /**
2097 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2098 *
2099 * @param [in,out]crc Register containing the crc.
2100 * @param [in]v Register containing the 32-bit to fold into the CRC.
2101 * @param [in]table0 Register containing table 0 of crc constants.
2102 * @param [in]table1 Register containing table 1 of crc constants.
2103 * @param [in]table2 Register containing table 2 of crc constants.
2104 * @param [in]table3 Register containing table 3 of crc constants.
2105 *
2106 * uint32_t crc;
2107 * v = crc ^ v
2108 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2109 *
2110 */
2111 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp1, Register tmp2, Register tmp3,
2112 Register table0, Register table1, Register table2, Register table3, bool upper) {
2113 assert_different_registers(crc, v, tmp1, tmp2, tmp3, table0, table1, table2, table3);
2114
2115 if (upper)
2116 srli(v, v, 32);
2117 xorr(v, v, crc);
2118
2119 zext(tmp1, v, 8);
2120 shadd(tmp1, tmp1, table3, tmp2, 2);
2121 lwu(crc, Address(tmp1));
2122
2123 slli(tmp1, v, 16);
2124 slli(tmp3, v, 8);
2125
2126 srliw(tmp1, tmp1, 24);
2127 srliw(tmp3, tmp3, 24);
2128
2129 shadd(tmp1, tmp1, table2, tmp1, 2);
2130 lwu(tmp2, Address(tmp1));
2131
2132 shadd(tmp3, tmp3, table1, tmp3, 2);
2133 xorr(crc, crc, tmp2);
2134
2135 lwu(tmp2, Address(tmp3));
2136 // It is more optimal to use 'srli' instead of 'srliw' for case when it is not necessary to clean upper bits
2137 if (upper)
2138 srli(tmp1, v, 24);
2139 else
2140 srliw(tmp1, v, 24);
2141
2142 // no need to clear bits other than lowest two
2143 shadd(tmp1, tmp1, table0, tmp1, 2);
2144 xorr(crc, crc, tmp2);
2145 lwu(tmp2, Address(tmp1));
2146 xorr(crc, crc, tmp2);
2147 }
2148
2149
2150 #ifdef COMPILER2
2151 // This improvement (vectorization) is based on java.base/share/native/libzip/zlib/zcrc32.c.
2152 // To make it, following steps are taken:
2153 // 1. in zcrc32.c, modify N to 16 and related code,
2154 // 2. re-generate the tables needed, we use tables of (N == 16, W == 4)
2155 // 3. finally vectorize the code (original implementation in zcrc32.c is just scalar code).
2156 // New tables for vector version is after table3.
2157 void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register len,
2158 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
2159 Register table0, Register table3) {
2160 assert_different_registers(t1, crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp5, table0, table3);
2161 const int N = 16, W = 4;
2162 const int64_t single_table_size = 256;
2163 const Register blks = tmp2;
2164 const Register tmpTable = tmp3, tableN16 = tmp4;
2165 const VectorRegister vcrc = v4, vword = v8, vtmp = v12;
2166 Label VectorLoop;
2167 Label LastBlock;
2168
2169 add(tableN16, table3, 1 * single_table_size * sizeof(juint), tmp1);
2170 mv(tmp5, 0xff);
2171
2172 if (MaxVectorSize == 16) {
2173 vsetivli(zr, N, Assembler::e32, Assembler::m4, Assembler::ma, Assembler::ta);
2174 } else if (MaxVectorSize == 32) {
2175 vsetivli(zr, N, Assembler::e32, Assembler::m2, Assembler::ma, Assembler::ta);
2176 } else {
2177 assert(MaxVectorSize > 32, "sanity");
2178 vsetivli(zr, N, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta);
2179 }
2180
2181 vmv_v_x(vcrc, zr);
2182 vmv_s_x(vcrc, crc);
2183
2184 // multiple of 64
2185 srli(blks, len, 6);
2186 slli(t1, blks, 6);
2187 sub(len, len, t1);
2188 subi(blks, blks, 1);
2189 blez(blks, LastBlock);
2190
2191 bind(VectorLoop);
2192 {
2193 mv(tmpTable, tableN16);
2194
2195 vle32_v(vword, buf);
2196 vxor_vv(vword, vword, vcrc);
2197
2198 addi(buf, buf, N*4);
2199
2200 vand_vx(vtmp, vword, tmp5);
2201 vsll_vi(vtmp, vtmp, 2);
2202 vluxei32_v(vcrc, tmpTable, vtmp);
2203
2204 mv(tmp1, 1);
2205 for (int k = 1; k < W; k++) {
2206 addi(tmpTable, tmpTable, single_table_size*4);
2207
2208 slli(t1, tmp1, 3);
2209 vsrl_vx(vtmp, vword, t1);
2210
2211 vand_vx(vtmp, vtmp, tmp5);
2212 vsll_vi(vtmp, vtmp, 2);
2213 vluxei32_v(vtmp, tmpTable, vtmp);
2214
2215 vxor_vv(vcrc, vcrc, vtmp);
2216
2217 addi(tmp1, tmp1, 1);
2218 }
2219
2220 subi(blks, blks, 1);
2221 bgtz(blks, VectorLoop);
2222 }
2223
2224 bind(LastBlock);
2225 {
2226 vle32_v(vtmp, buf);
2227 vxor_vv(vcrc, vcrc, vtmp);
2228 mv(crc, zr);
2229 for (int i = 0; i < N; i++) {
2230 vmv_x_s(tmp2, vcrc);
2231 // in vmv_x_s, the value is sign-extended to SEW bits, but we need zero-extended here.
2232 zext(tmp2, tmp2, 32);
2233 vslidedown_vi(vcrc, vcrc, 1);
2234 xorr(crc, crc, tmp2);
2235 for (int j = 0; j < W; j++) {
2236 andr(t1, crc, tmp5);
2237 shadd(t1, t1, table0, tmp1, 2);
2238 lwu(t1, Address(t1, 0));
2239 srli(tmp2, crc, 8);
2240 xorr(crc, tmp2, t1);
2241 }
2242 }
2243 addi(buf, buf, N*4);
2244 }
2245 }
2246
2247 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16(VectorRegister vx, VectorRegister vt,
2248 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2249 Register buf, Register tmp, const int STEP) {
2250 assert_different_registers(vx, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2251 vclmul_vv(vtmp1, vx, vt);
2252 vclmulh_vv(vtmp2, vx, vt);
2253 vle64_v(vtmp4, buf); addi(buf, buf, STEP);
2254 // low parts
2255 vredxor_vs(vtmp3, vtmp1, vtmp4);
2256 // high parts
2257 vslidedown_vi(vx, vtmp4, 1);
2258 vredxor_vs(vtmp1, vtmp2, vx);
2259 // merge low and high back
2260 vslideup_vi(vx, vtmp1, 1);
2261 vmv_x_s(tmp, vtmp3);
2262 vmv_s_x(vx, tmp);
2263 }
2264
2265 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_2(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2266 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2267 Register tmp) {
2268 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2269 vclmul_vv(vtmp1, vx, vt);
2270 vclmulh_vv(vtmp2, vx, vt);
2271 // low parts
2272 vredxor_vs(vtmp3, vtmp1, vy);
2273 // high parts
2274 vslidedown_vi(vtmp4, vy, 1);
2275 vredxor_vs(vtmp1, vtmp2, vtmp4);
2276 // merge low and high back
2277 vslideup_vi(vx, vtmp1, 1);
2278 vmv_x_s(tmp, vtmp3);
2279 vmv_s_x(vx, tmp);
2280 }
2281
2282 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_3(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2283 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2284 Register tmp) {
2285 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2286 vclmul_vv(vtmp1, vx, vt);
2287 vclmulh_vv(vtmp2, vx, vt);
2288 // low parts
2289 vredxor_vs(vtmp3, vtmp1, vy);
2290 // high parts
2291 vslidedown_vi(vtmp4, vy, 1);
2292 vredxor_vs(vtmp1, vtmp2, vtmp4);
2293 // merge low and high back
2294 vslideup_vi(vy, vtmp1, 1);
2295 vmv_x_s(tmp, vtmp3);
2296 vmv_s_x(vy, tmp);
2297 }
2298
2299 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_16(Register crc, Register buf, Register len,
2300 Register vclmul_table, Register tmp1, Register tmp2) {
2301 assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
2302 assert(MaxVectorSize == 16, "sanity");
2303
2304 const int TABLE_STEP = 16;
2305 const int STEP = 16;
2306 const int LOOP_STEP = 128;
2307 const int N = 2;
2308
2309 Register loop_step = t1;
2310
2311 // ======== preparation ========
2312
2313 mv(loop_step, LOOP_STEP);
2314 sub(len, len, loop_step);
2315
2316 vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2317 vle64_v(v0, buf); addi(buf, buf, STEP);
2318 vle64_v(v1, buf); addi(buf, buf, STEP);
2319 vle64_v(v2, buf); addi(buf, buf, STEP);
2320 vle64_v(v3, buf); addi(buf, buf, STEP);
2321 vle64_v(v4, buf); addi(buf, buf, STEP);
2322 vle64_v(v5, buf); addi(buf, buf, STEP);
2323 vle64_v(v6, buf); addi(buf, buf, STEP);
2324 vle64_v(v7, buf); addi(buf, buf, STEP);
2325
2326 vmv_v_x(v31, zr);
2327 vsetivli(zr, 1, Assembler::e32, Assembler::m1, Assembler::mu, Assembler::tu);
2328 vmv_s_x(v31, crc);
2329 vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2330 vxor_vv(v0, v0, v31);
2331
2332 // load table
2333 vle64_v(v31, vclmul_table);
2334
2335 Label L_16_bytes_loop;
2336 j(L_16_bytes_loop);
2337
2338
2339 // ======== folding 128 bytes in data buffer per round ========
2340
2341 align(OptoLoopAlignment);
2342 bind(L_16_bytes_loop);
2343 {
2344 crc32_vclmul_fold_16_bytes_vectorsize_16(v0, v31, v8, v9, v10, v11, buf, tmp2, STEP);
2345 crc32_vclmul_fold_16_bytes_vectorsize_16(v1, v31, v12, v13, v14, v15, buf, tmp2, STEP);
2346 crc32_vclmul_fold_16_bytes_vectorsize_16(v2, v31, v16, v17, v18, v19, buf, tmp2, STEP);
2347 crc32_vclmul_fold_16_bytes_vectorsize_16(v3, v31, v20, v21, v22, v23, buf, tmp2, STEP);
2348 crc32_vclmul_fold_16_bytes_vectorsize_16(v4, v31, v24, v25, v26, v27, buf, tmp2, STEP);
2349 crc32_vclmul_fold_16_bytes_vectorsize_16(v5, v31, v8, v9, v10, v11, buf, tmp2, STEP);
2350 crc32_vclmul_fold_16_bytes_vectorsize_16(v6, v31, v12, v13, v14, v15, buf, tmp2, STEP);
2351 crc32_vclmul_fold_16_bytes_vectorsize_16(v7, v31, v16, v17, v18, v19, buf, tmp2, STEP);
2352 }
2353 sub(len, len, loop_step);
2354 bge(len, loop_step, L_16_bytes_loop);
2355
2356
2357 // ======== folding into 64 bytes from 128 bytes in register ========
2358
2359 // load table
2360 addi(vclmul_table, vclmul_table, TABLE_STEP);
2361 vle64_v(v31, vclmul_table);
2362
2363 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v0, v4, v31, v8, v9, v10, v11, tmp2);
2364 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v1, v5, v31, v12, v13, v14, v15, tmp2);
2365 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v2, v6, v31, v16, v17, v18, v19, tmp2);
2366 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v3, v7, v31, v20, v21, v22, v23, tmp2);
2367
2368
2369 // ======== folding into 16 bytes from 64 bytes in register ========
2370
2371 addi(vclmul_table, vclmul_table, TABLE_STEP);
2372 vle64_v(v31, vclmul_table);
2373 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v0, v3, v31, v8, v9, v10, v11, tmp2);
2374
2375 addi(vclmul_table, vclmul_table, TABLE_STEP);
2376 vle64_v(v31, vclmul_table);
2377 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v1, v3, v31, v12, v13, v14, v15, tmp2);
2378
2379 addi(vclmul_table, vclmul_table, TABLE_STEP);
2380 vle64_v(v31, vclmul_table);
2381 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v2, v3, v31, v16, v17, v18, v19, tmp2);
2382
2383 #undef FOLD_2_VCLMUL_3
2384
2385
2386 // ======== final: move result to scalar regsiters ========
2387
2388 vmv_x_s(tmp1, v3);
2389 vslidedown_vi(v1, v3, 1);
2390 vmv_x_s(tmp2, v1);
2391 }
2392
2393 void MacroAssembler::crc32_vclmul_fold_to_16_bytes_vectorsize_32(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2394 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4) {
2395 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2396 vclmul_vv(vtmp1, vx, vt);
2397 vclmulh_vv(vtmp2, vx, vt);
2398 // low parts
2399 vredxor_vs(vtmp3, vtmp1, vy);
2400 // high parts
2401 vslidedown_vi(vtmp4, vy, 1);
2402 vredxor_vs(vtmp1, vtmp2, vtmp4);
2403 // merge low and high back
2404 vslideup_vi(vy, vtmp1, 1);
2405 vmv_x_s(t1, vtmp3);
2406 vmv_s_x(vy, t1);
2407 }
2408
2409 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_32(Register crc, Register buf, Register len,
2410 Register vclmul_table, Register tmp1, Register tmp2) {
2411 assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
2412 assert(MaxVectorSize >= 32, "sanity");
2413
2414 // utility: load table
2415 #define CRC32_VCLMUL_LOAD_TABLE(vt, rt, vtmp, rtmp) \
2416 vid_v(vtmp); \
2417 mv(rtmp, 2); \
2418 vremu_vx(vtmp, vtmp, rtmp); \
2419 vsll_vi(vtmp, vtmp, 3); \
2420 vluxei64_v(vt, rt, vtmp);
2421
2422 const int TABLE_STEP = 16;
2423 const int STEP = 128; // 128 bytes per round
2424 const int N = 2 * 8; // 2: 128-bits/64-bits, 8: 8 pairs of double 64-bits
2425
2426 Register step = tmp2;
2427
2428
2429 // ======== preparation ========
2430
2431 mv(step, STEP);
2432 sub(len, len, step); // 2 rounds of folding with carry-less multiplication
2433
2434 vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2435 // load data
2436 vle64_v(v4, buf);
2437 add(buf, buf, step);
2438
2439 // load table
2440 CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2441 // load mask,
2442 // v28 should already contains: 0, 8, 0, 8, ...
2443 vmseq_vi(v2, v28, 0);
2444 // now, v2 should contains: 101010...
2445 vmnand_mm(v1, v2, v2);
2446 // now, v1 should contains: 010101...
2447
2448 // initial crc
2449 vmv_v_x(v24, zr);
2450 vsetivli(zr, 1, Assembler::e32, Assembler::m4, Assembler::mu, Assembler::tu);
2451 vmv_s_x(v24, crc);
2452 vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2453 vxor_vv(v4, v4, v24);
2454
2455 Label L_128_bytes_loop;
2456 j(L_128_bytes_loop);
2457
2458
2459 // ======== folding 128 bytes in data buffer per round ========
2460
2461 align(OptoLoopAlignment);
2462 bind(L_128_bytes_loop);
2463 {
2464 // v4: data
2465 // v4: buf, reused
2466 // v8: table
2467 // v12: lows
2468 // v16: highs
2469 // v20: low_slides
2470 // v24: high_slides
2471 vclmul_vv(v12, v4, v8);
2472 vclmulh_vv(v16, v4, v8);
2473 vle64_v(v4, buf);
2474 add(buf, buf, step);
2475 // lows
2476 vslidedown_vi(v20, v12, 1);
2477 vmand_mm(v0, v2, v2);
2478 vxor_vv(v12, v12, v20, v0_t);
2479 // with buf data
2480 vxor_vv(v4, v4, v12, v0_t);
2481
2482 // highs
2483 vslideup_vi(v24, v16, 1);
2484 vmand_mm(v0, v1, v1);
2485 vxor_vv(v16, v16, v24, v0_t);
2486 // with buf data
2487 vxor_vv(v4, v4, v16, v0_t);
2488 }
2489 sub(len, len, step);
2490 bge(len, step, L_128_bytes_loop);
2491
2492
2493 // ======== folding into 64 bytes from 128 bytes in register ========
2494
2495 // load table
2496 addi(vclmul_table, vclmul_table, TABLE_STEP);
2497 CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2498
2499 // v4: data, first (low) part, N/2 of 64-bits
2500 // v20: data, second (high) part, N/2 of 64-bits
2501 // v8: table
2502 // v10: lows
2503 // v12: highs
2504 // v14: low_slides
2505 // v16: high_slides
2506
2507 // high part
2508 vslidedown_vi(v20, v4, N/2);
2509
2510 vsetivli(zr, N/2, Assembler::e64, Assembler::m2, Assembler::mu, Assembler::tu);
2511
2512 vclmul_vv(v10, v4, v8);
2513 vclmulh_vv(v12, v4, v8);
2514
2515 // lows
2516 vslidedown_vi(v14, v10, 1);
2517 vmand_mm(v0, v2, v2);
2518 vxor_vv(v10, v10, v14, v0_t);
2519 // with data part 2
2520 vxor_vv(v4, v20, v10, v0_t);
2521
2522 // highs
2523 vslideup_vi(v16, v12, 1);
2524 vmand_mm(v0, v1, v1);
2525 vxor_vv(v12, v12, v16, v0_t);
2526 // with data part 2
2527 vxor_vv(v4, v20, v12, v0_t);
2528
2529
2530 // ======== folding into 16 bytes from 64 bytes in register ========
2531
2532 // v4: data, first part, 2 of 64-bits
2533 // v16: data, second part, 2 of 64-bits
2534 // v18: data, third part, 2 of 64-bits
2535 // v20: data, second part, 2 of 64-bits
2536 // v8: table
2537
2538 vslidedown_vi(v16, v4, 2);
2539 vslidedown_vi(v18, v4, 4);
2540 vslidedown_vi(v20, v4, 6);
2541
2542 vsetivli(zr, 2, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2543
2544 addi(vclmul_table, vclmul_table, TABLE_STEP);
2545 vle64_v(v8, vclmul_table);
2546 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v4, v20, v8, v28, v29, v30, v31);
2547
2548 addi(vclmul_table, vclmul_table, TABLE_STEP);
2549 vle64_v(v8, vclmul_table);
2550 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v16, v20, v8, v28, v29, v30, v31);
2551
2552 addi(vclmul_table, vclmul_table, TABLE_STEP);
2553 vle64_v(v8, vclmul_table);
2554 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v18, v20, v8, v28, v29, v30, v31);
2555
2556
2557 // ======== final: move result to scalar regsiters ========
2558
2559 vmv_x_s(tmp1, v20);
2560 vslidedown_vi(v4, v20, 1);
2561 vmv_x_s(tmp2, v4);
2562
2563 #undef CRC32_VCLMUL_LOAD_TABLE
2564 }
2565
2566 // For more details of the algorithm, please check the paper:
2567 // "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel"
2568 //
2569 // Please also refer to the corresponding code in aarch64 or x86 ones.
2570 //
2571 // As the riscv carry-less multiplication is a bit different from the other platforms,
2572 // so the implementation itself is also a bit different from others.
2573
2574 void MacroAssembler::kernel_crc32_vclmul_fold(Register crc, Register buf, Register len,
2575 Register table0, Register table1, Register table2, Register table3,
2576 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
2577 const int64_t single_table_size = 256;
2578 const int64_t table_num = 8; // 4 for scalar, 4 for plain vector
2579 const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2580 Register vclmul_table = tmp3;
2581
2582 la(vclmul_table, table_addr);
2583 add(vclmul_table, vclmul_table, table_num * single_table_size * sizeof(juint), tmp1);
2584 la(table0, table_addr);
2585
2586 if (MaxVectorSize == 16) {
2587 kernel_crc32_vclmul_fold_vectorsize_16(crc, buf, len, vclmul_table, tmp1, tmp2);
2588 } else {
2589 kernel_crc32_vclmul_fold_vectorsize_32(crc, buf, len, vclmul_table, tmp1, tmp2);
2590 }
2591
2592 mv(crc, zr);
2593 update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2594 update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2595 update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2596 update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2597 }
2598
2599 #endif // COMPILER2
2600
2601 /**
2602 * @param crc register containing existing CRC (32-bit)
2603 * @param buf register pointing to input byte buffer (byte*)
2604 * @param len register containing number of bytes
2605 * @param table register that will contain address of CRC table
2606 * @param tmp scratch registers
2607 */
2608 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2609 Register table0, Register table1, Register table2, Register table3,
2610 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) {
2611 assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2612 Label L_vector_entry,
2613 L_unroll_loop,
2614 L_by4_loop_entry, L_by4_loop,
2615 L_by1_loop, L_exit, L_skip1, L_skip2;
2616
2617 const int64_t single_table_size = 256;
2618 const int64_t unroll = 16;
2619 const int64_t unroll_words = unroll*wordSize;
2620
2621 // tmp5 = 0xffffffff
2622 notr(tmp5, zr);
2623 srli(tmp5, tmp5, 32);
2624
2625 andn(crc, tmp5, crc);
2626
2627 const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2628 la(table0, table_addr);
2629 add(table1, table0, 1 * single_table_size * sizeof(juint), tmp1);
2630 add(table2, table0, 2 * single_table_size * sizeof(juint), tmp1);
2631 add(table3, table2, 1 * single_table_size * sizeof(juint), tmp1);
2632
2633 // Ensure basic 4-byte alignment of input byte buffer
2634 mv(tmp1, 4);
2635 blt(len, tmp1, L_by1_loop);
2636 test_bit(tmp1, buf, 0);
2637 beqz(tmp1, L_skip1);
2638 subiw(len, len, 1);
2639 lbu(tmp1, Address(buf));
2640 addi(buf, buf, 1);
2641 update_byte_crc32(crc, tmp1, table0);
2642 bind(L_skip1);
2643 test_bit(tmp1, buf, 1);
2644 beqz(tmp1, L_skip2);
2645 subiw(len, len, 2);
2646 lhu(tmp1, Address(buf));
2647 addi(buf, buf, 2);
2648 zext(tmp2, tmp1, 8);
2649 update_byte_crc32(crc, tmp2, table0);
2650 srli(tmp2, tmp1, 8);
2651 update_byte_crc32(crc, tmp2, table0);
2652 bind(L_skip2);
2653
2654 #ifdef COMPILER2
2655 if (UseRVV) {
2656 const int64_t tmp_limit =
2657 UseZvbc ? 128 * 3 // 3 rounds of folding with carry-less multiplication
2658 : MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
2659 mv(tmp1, tmp_limit);
2660 bge(len, tmp1, L_vector_entry);
2661 }
2662 #endif // COMPILER2
2663
2664 mv(tmp1, unroll_words);
2665 blt(len, tmp1, L_by4_loop_entry);
2666
2667 const Register loop_buf_end = tmp3;
2668
2669 align(CodeEntryAlignment);
2670 // Entry for L_unroll_loop
2671 add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2672 andi(len, len, unroll_words - 1); // len = (len % unroll_words)
2673 sub(loop_buf_end, loop_buf_end, len);
2674 bind(L_unroll_loop);
2675 for (int i = 0; i < unroll; i++) {
2676 ld(tmp1, Address(buf, i*wordSize));
2677 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2678 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, true);
2679 }
2680
2681 addi(buf, buf, unroll_words);
2682 blt(buf, loop_buf_end, L_unroll_loop);
2683
2684 bind(L_by4_loop_entry);
2685 mv(tmp1, 4);
2686 blt(len, tmp1, L_by1_loop);
2687 add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2688 andi(len, len, 3);
2689 sub(loop_buf_end, loop_buf_end, len);
2690 bind(L_by4_loop);
2691 lwu(tmp1, Address(buf));
2692 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2693 addi(buf, buf, 4);
2694 blt(buf, loop_buf_end, L_by4_loop);
2695
2696 bind(L_by1_loop);
2697 beqz(len, L_exit);
2698
2699 subiw(len, len, 1);
2700 lbu(tmp1, Address(buf));
2701 update_byte_crc32(crc, tmp1, table0);
2702 beqz(len, L_exit);
2703
2704 subiw(len, len, 1);
2705 lbu(tmp1, Address(buf, 1));
2706 update_byte_crc32(crc, tmp1, table0);
2707 beqz(len, L_exit);
2708
2709 subiw(len, len, 1);
2710 lbu(tmp1, Address(buf, 2));
2711 update_byte_crc32(crc, tmp1, table0);
2712
2713 #ifdef COMPILER2
2714 // put vector code here, otherwise "offset is too large" error occurs.
2715 if (UseRVV) {
2716 // only need to jump exit when UseRVV == true, it's a jump from end of block `L_by1_loop`.
2717 j(L_exit);
2718
2719 bind(L_vector_entry);
2720 if (UseZvbc) { // carry-less multiplication
2721 kernel_crc32_vclmul_fold(crc, buf, len,
2722 table0, table1, table2, table3,
2723 tmp1, tmp2, tmp3, tmp4, tmp6);
2724 } else { // plain vector instructions
2725 vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
2726 }
2727
2728 bgtz(len, L_by4_loop_entry);
2729 }
2730 #endif // COMPILER2
2731
2732 bind(L_exit);
2733 andn(crc, tmp5, crc);
2734 }
2735
2736 #ifdef COMPILER2
2737 // Push vector registers in the bitset supplied.
2738 // Return the number of words pushed
2739 int MacroAssembler::push_v(VectorRegSet regset, Register stack) {
2740 if (regset.bits() == 0) {
2741 return 0;
2742 }
2743 auto bitset = integer_cast<unsigned int>(regset.bits());
2744 int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2745
2746 // Scan bitset to accumulate register pairs
2747 unsigned char regs[32];
2748 int count = bitset_to_regs(bitset, regs);
2749
2750 for (int i = 0; i < count; i++) {
2751 sub(stack, stack, vector_size_in_bytes);
2752 vs1r_v(as_VectorRegister(regs[i]), stack);
2753 }
2754
2755 return count * vector_size_in_bytes / wordSize;
2756 }
2757
2758 int MacroAssembler::pop_v(VectorRegSet regset, Register stack) {
2759 if (regset.bits() == 0) {
2760 return 0;
2761 }
2762 auto bitset = integer_cast<unsigned int>(regset.bits());
2763 int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2764
2765 // Scan bitset to accumulate register pairs
2766 unsigned char regs[32];
2767 int count = bitset_to_regs(bitset, regs);
2768
2769 for (int i = count - 1; i >= 0; i--) {
2770 vl1r_v(as_VectorRegister(regs[i]), stack);
2771 add(stack, stack, vector_size_in_bytes);
2772 }
2773
2774 return count * vector_size_in_bytes / wordSize;
2775 }
2776 #endif // COMPILER2
2777
2778 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
2779 // Push integer registers x7, x10-x17, x28-x31.
2780 push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2781
2782 // Push float registers f0-f7, f10-f17, f28-f31.
2783 subi(sp, sp, wordSize * 20);
2784 int offset = 0;
2785 for (int i = 0; i < 32; i++) {
2786 if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2787 fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2788 }
2789 }
2790 }
2791
2792 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
2793 int offset = 0;
2794 for (int i = 0; i < 32; i++) {
2795 if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2796 fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2797 }
2798 }
2799 addi(sp, sp, wordSize * 20);
2800
2801 pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2802 }
2803
2804 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
2805 // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2806 push_reg(RegSet::range(x5, x31), sp);
2807
2808 // float registers
2809 subi(sp, sp, 32 * wordSize);
2810 for (int i = 0; i < 32; i++) {
2811 fsd(as_FloatRegister(i), Address(sp, i * wordSize));
2812 }
2813
2814 // vector registers
2815 if (save_vectors) {
2816 sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
2817 vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2818 for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2819 add(t0, sp, vector_size_in_bytes * i);
2820 vse64_v(as_VectorRegister(i), t0);
2821 }
2822 }
2823 }
2824
2825 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
2826 // vector registers
2827 if (restore_vectors) {
2828 vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2829 for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2830 vle64_v(as_VectorRegister(i), sp);
2831 add(sp, sp, vector_size_in_bytes * 8);
2832 }
2833 }
2834
2835 // float registers
2836 for (int i = 0; i < 32; i++) {
2837 fld(as_FloatRegister(i), Address(sp, i * wordSize));
2838 }
2839 addi(sp, sp, 32 * wordSize);
2840
2841 // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2842 pop_reg(RegSet::range(x5, x31), sp);
2843 }
2844
2845 static int patch_offset_in_jal(address branch, int64_t offset) {
2846 assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
2847 "offset (%ld) is too large to be patched in one jal instruction!\n", offset);
2848 Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1); // offset[20] ==> branch[31]
2849 Assembler::patch(branch, 30, 21, (offset >> 1) & 0x3ff); // offset[10:1] ==> branch[30:21]
2850 Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1); // offset[11] ==> branch[20]
2851 Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff); // offset[19:12] ==> branch[19:12]
2852 return MacroAssembler::instruction_size; // only one instruction
2853 }
2854
2855 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
2856 assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
2857 "offset (%ld) is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n", offset);
2858 Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1); // offset[12] ==> branch[31]
2859 Assembler::patch(branch, 30, 25, (offset >> 5) & 0x3f); // offset[10:5] ==> branch[30:25]
2860 Assembler::patch(branch, 7, 7, (offset >> 11) & 0x1); // offset[11] ==> branch[7]
2861 Assembler::patch(branch, 11, 8, (offset >> 1) & 0xf); // offset[4:1] ==> branch[11:8]
2862 return MacroAssembler::instruction_size; // only one instruction
2863 }
2864
2865 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
2866 const int PC_RELATIVE_INSTRUCTION_NUM = 2; // auipc, addi/jalr/load
2867 Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff); // Auipc. offset[31:12] ==> branch[31:12]
2868 Assembler::patch(branch + 4, 31, 20, offset & 0xfff); // Addi/Jalr/Load. offset[11:0] ==> branch[31:20]
2869 return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
2870 }
2871
2872 static int patch_addr_in_movptr1(address branch, address target) {
2873 int32_t lower = ((intptr_t)target << 35) >> 35;
2874 int64_t upper = ((intptr_t)target - lower) >> 29;
2875 Assembler::patch(branch + 0, 31, 12, upper & 0xfffff); // Lui. target[48:29] + target[28] ==> branch[31:12]
2876 Assembler::patch(branch + 4, 31, 20, (lower >> 17) & 0xfff); // Addi. target[28:17] ==> branch[31:20]
2877 Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff); // Addi. target[16: 6] ==> branch[31:20]
2878 Assembler::patch(branch + 20, 31, 20, lower & 0x3f); // Addi/Jalr/Load. target[ 5: 0] ==> branch[31:20]
2879 return MacroAssembler::movptr1_instruction_size;
2880 }
2881
2882 static int patch_addr_in_movptr2(address instruction_address, address target) {
2883 uintptr_t addr = (uintptr_t)target;
2884
2885 assert(addr < (1ull << 48), "48-bit overflow in address constant");
2886 unsigned int upper18 = (addr >> 30ull);
2887 int lower30 = (addr & 0x3fffffffu);
2888 int low12 = (lower30 << 20) >> 20;
2889 int mid18 = ((lower30 - low12) >> 12);
2890
2891 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
2892 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18 & 0xfffff)); // Lui
2893 // Slli
2894 // Add
2895 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff); // Addi/Jalr/Load
2896
2897 assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");
2898
2899 return MacroAssembler::movptr2_instruction_size;
2900 }
2901
2902 static int patch_imm_in_li16u(address branch, uint16_t target) {
2903 Assembler::patch(branch, 31, 12, target); // patch lui only
2904 return MacroAssembler::instruction_size;
2905 }
2906
2907 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
2908 const int LI32_INSTRUCTIONS_NUM = 2; // lui + addiw
2909 int64_t upper = (intptr_t)target;
2910 int32_t lower = (((int32_t)target) << 20) >> 20;
2911 upper -= lower;
2912 upper = (int32_t)upper;
2913 Assembler::patch(branch + 0, 31, 12, (upper >> 12) & 0xfffff); // Lui.
2914 Assembler::patch(branch + 4, 31, 20, lower & 0xfff); // Addiw.
2915 return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
2916 }
2917
2918 static long get_offset_of_jal(address insn_addr) {
2919 assert_cond(insn_addr != nullptr);
2920 long offset = 0;
2921 unsigned insn = Assembler::ld_instr(insn_addr);
2922 long val = (long)Assembler::sextract(insn, 31, 12);
2923 offset |= ((val >> 19) & 0x1) << 20;
2924 offset |= (val & 0xff) << 12;
2925 offset |= ((val >> 8) & 0x1) << 11;
2926 offset |= ((val >> 9) & 0x3ff) << 1;
2927 offset = (offset << 43) >> 43;
2928 return offset;
2929 }
2930
2931 static long get_offset_of_conditional_branch(address insn_addr) {
2932 long offset = 0;
2933 assert_cond(insn_addr != nullptr);
2934 unsigned insn = Assembler::ld_instr(insn_addr);
2935 offset = (long)Assembler::sextract(insn, 31, 31);
2936 offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
2937 offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
2938 offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
2939 offset = (offset << 41) >> 41;
2940 return offset;
2941 }
2942
2943 static long get_offset_of_pc_relative(address insn_addr) {
2944 long offset = 0;
2945 assert_cond(insn_addr != nullptr);
2946 offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12; // Auipc.
2947 offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)); // Addi/Jalr/Load.
2948 offset = (offset << 32) >> 32;
2949 return offset;
2950 }
2951
2952 static address get_target_of_movptr1(address insn_addr) {
2953 assert_cond(insn_addr != nullptr);
2954 intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
2955 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17; // Addi.
2956 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6; // Addi.
2957 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20)); // Addi/Jalr/Load.
2958 return (address) target_address;
2959 }
2960
2961 static address get_target_of_movptr2(address insn_addr) {
2962 assert_cond(insn_addr != nullptr);
2963 int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 0), 31, 12)) & 0xfffff); // Lui
2964 int32_t mid18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 1), 31, 12)) & 0xfffff); // Lui
2965 // 2 // Slli
2966 // 3 // Add
2967 int32_t low12 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 4), 31, 20))); // Addi/Jalr/Load.
2968 address ret = (address)(((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12);
2969 return ret;
2970 }
2971
2972 address MacroAssembler::get_target_of_li32(address insn_addr) {
2973 assert_cond(insn_addr != nullptr);
2974 intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
2975 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)); // Addiw.
2976 return (address)target_address;
2977 }
2978
2979 // Patch any kind of instruction; there may be several instructions.
2980 // Return the total length (in bytes) of the instructions.
2981 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
2982 assert_cond(instruction_address != nullptr);
2983 int64_t offset = target - instruction_address;
2984 if (MacroAssembler::is_jal_at(instruction_address)) { // jal
2985 return patch_offset_in_jal(instruction_address, offset);
2986 } else if (MacroAssembler::is_branch_at(instruction_address)) { // beq/bge/bgeu/blt/bltu/bne
2987 return patch_offset_in_conditional_branch(instruction_address, offset);
2988 } else if (MacroAssembler::is_pc_relative_at(instruction_address)) { // auipc, addi/jalr/load
2989 return patch_offset_in_pc_relative(instruction_address, offset);
2990 } else if (MacroAssembler::is_movptr1_at(instruction_address)) { // movptr1
2991 return patch_addr_in_movptr1(instruction_address, target);
2992 } else if (MacroAssembler::is_movptr2_at(instruction_address)) { // movptr2
2993 return patch_addr_in_movptr2(instruction_address, target);
2994 } else if (MacroAssembler::is_li32_at(instruction_address)) { // li32
2995 int64_t imm = (intptr_t)target;
2996 return patch_imm_in_li32(instruction_address, (int32_t)imm);
2997 } else if (MacroAssembler::is_li16u_at(instruction_address)) {
2998 int64_t imm = (intptr_t)target;
2999 return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
3000 } else {
3001 #ifdef ASSERT
3002 tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
3003 Assembler::ld_instr(instruction_address), p2i(instruction_address));
3004 Disassembler::decode(instruction_address - 16, instruction_address + 16);
3005 #endif
3006 ShouldNotReachHere();
3007 return -1;
3008 }
3009 }
3010
3011 address MacroAssembler::target_addr_for_insn(address insn_addr) {
3012 long offset = 0;
3013 assert_cond(insn_addr != nullptr);
3014 if (MacroAssembler::is_jal_at(insn_addr)) { // jal
3015 offset = get_offset_of_jal(insn_addr);
3016 } else if (MacroAssembler::is_branch_at(insn_addr)) { // beq/bge/bgeu/blt/bltu/bne
3017 offset = get_offset_of_conditional_branch(insn_addr);
3018 } else if (MacroAssembler::is_pc_relative_at(insn_addr)) { // auipc, addi/jalr/load
3019 offset = get_offset_of_pc_relative(insn_addr);
3020 } else if (MacroAssembler::is_movptr1_at(insn_addr)) { // movptr1
3021 return get_target_of_movptr1(insn_addr);
3022 } else if (MacroAssembler::is_movptr2_at(insn_addr)) { // movptr2
3023 return get_target_of_movptr2(insn_addr);
3024 } else if (MacroAssembler::is_li32_at(insn_addr)) { // li32
3025 return get_target_of_li32(insn_addr);
3026 } else {
3027 ShouldNotReachHere();
3028 }
3029 return address(((uintptr_t)insn_addr + offset));
3030 }
3031
3032 int MacroAssembler::patch_oop(address insn_addr, address o) {
3033 // OOPs are either narrow (32 bits) or wide (48 bits). We encode
3034 // narrow OOPs by setting the upper 16 bits in the first
3035 // instruction.
3036 if (MacroAssembler::is_li32_at(insn_addr)) {
3037 // Move narrow OOP
3038 uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
3039 return patch_imm_in_li32(insn_addr, (int32_t)n);
3040 } else if (MacroAssembler::is_movptr1_at(insn_addr)) {
3041 // Move wide OOP
3042 return patch_addr_in_movptr1(insn_addr, o);
3043 } else if (MacroAssembler::is_movptr2_at(insn_addr)) {
3044 // Move wide OOP
3045 return patch_addr_in_movptr2(insn_addr, o);
3046 }
3047 ShouldNotReachHere();
3048 return -1;
3049 }
3050
3051 void MacroAssembler::reinit_heapbase() {
3052 if (UseCompressedOops) {
3053 if (Universe::is_fully_initialized()) {
3054 mv(xheapbase, CompressedOops::base());
3055 } else {
3056 ld(xheapbase, ExternalAddress(CompressedOops::base_addr()));
3057 }
3058 }
3059 }
3060
3061 void MacroAssembler::movptr(Register Rd, const Address &addr, Register temp) {
3062 assert(addr.getMode() == Address::literal, "must be applied to a literal address");
3063 relocate(addr.rspec(), [&] {
3064 movptr(Rd, addr.target(), temp);
3065 });
3066 }
3067
3068 void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
3069 int offset = 0;
3070 movptr(Rd, addr, offset, temp);
3071 addi(Rd, Rd, offset);
3072 }
3073
3074 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
3075 uint64_t uimm64 = (uint64_t)addr;
3076 #ifndef PRODUCT
3077 {
3078 char buffer[64];
3079 os::snprintf_checked(buffer, sizeof(buffer), "0x%" PRIx64, uimm64);
3080 block_comment(buffer);
3081 }
3082 #endif
3083 assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
3084
3085 if (temp == noreg) {
3086 movptr1(Rd, uimm64, offset);
3087 } else {
3088 movptr2(Rd, uimm64, offset, temp);
3089 }
3090 }
3091
3092 void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
3093 // Load upper 31 bits
3094 //
3095 // In case of 11th bit of `lower` is 0, it's straightforward to understand.
3096 // In case of 11th bit of `lower` is 1, it's a bit tricky, to help understand,
3097 // imagine divide both `upper` and `lower` into 2 parts respectively, i.e.
3098 // [upper_20, upper_12], [lower_20, lower_12], they are the same just before
3099 // `lower = (lower << 52) >> 52;`.
3100 // After `upper -= lower;`,
3101 // upper_20' = upper_20 - (-1) == upper_20 + 1
3102 // upper_12 = 0x000
3103 // After `lui(Rd, upper);`, `Rd` = upper_20' << 12
3104 // Also divide `Rd` into 2 parts [Rd_20, Rd_12],
3105 // Rd_20 == upper_20'
3106 // Rd_12 == 0x000
3107 // After `addi(Rd, Rd, lower);`,
3108 // Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
3109 // Rd_12 = lower_12
3110 // So, finally Rd == [upper_20, lower_12]
3111 int64_t imm = imm64 >> 17;
3112 int64_t upper = imm, lower = imm;
3113 lower = (lower << 52) >> 52;
3114 upper -= lower;
3115 upper = (int32_t)upper;
3116 lui(Rd, upper);
3117 addi(Rd, Rd, lower);
3118
3119 // Load the rest 17 bits.
3120 slli(Rd, Rd, 11);
3121 addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
3122 slli(Rd, Rd, 6);
3123
3124 // This offset will be used by following jalr/ld.
3125 offset = imm64 & 0x3f;
3126 }
3127
3128 void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
3129 assert_different_registers(Rd, tmp, noreg);
3130
3131 // addr: [upper18, lower30[mid18, lower12]]
3132
3133 int64_t upper18 = addr >> 18;
3134 lui(tmp, upper18);
3135
3136 int64_t lower30 = addr & 0x3fffffff;
3137 int64_t mid18 = lower30, lower12 = lower30;
3138 lower12 = (lower12 << 52) >> 52;
3139 // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
3140 // please refer to movptr1 above.
3141 mid18 -= (int32_t)lower12;
3142 lui(Rd, mid18);
3143
3144 slli(tmp, tmp, 18);
3145 add(Rd, Rd, tmp);
3146
3147 offset = lower12;
3148 }
3149
3150 // floating point imm move
3151 bool MacroAssembler::can_hf_imm_load(short imm) {
3152 jshort h_bits = (jshort)imm;
3153 if (h_bits == 0) {
3154 return true;
3155 }
3156 return can_zfa_zli_half_float(imm);
3157 }
3158
3159 bool MacroAssembler::can_fp_imm_load(float imm) {
3160 jint f_bits = jint_cast(imm);
3161 if (f_bits == 0) {
3162 return true;
3163 }
3164 return can_zfa_zli_float(imm);
3165 }
3166
3167 bool MacroAssembler::can_dp_imm_load(double imm) {
3168 julong d_bits = julong_cast(imm);
3169 if (d_bits == 0) {
3170 return true;
3171 }
3172 return can_zfa_zli_double(imm);
3173 }
3174
3175 void MacroAssembler::fli_h(FloatRegister Rd, short imm) {
3176 jshort h_bits = (jshort)imm;
3177 if (h_bits == 0) {
3178 fmv_h_x(Rd, zr);
3179 return;
3180 }
3181 int Rs = zfa_zli_lookup_half_float(h_bits);
3182 assert(Rs != -1, "Must be");
3183 _fli_h(Rd, Rs);
3184 }
3185
3186 void MacroAssembler::fli_s(FloatRegister Rd, float imm) {
3187 jint f_bits = jint_cast(imm);
3188 if (f_bits == 0) {
3189 fmv_w_x(Rd, zr);
3190 return;
3191 }
3192 int Rs = zfa_zli_lookup_float(f_bits);
3193 assert(Rs != -1, "Must be");
3194 _fli_s(Rd, Rs);
3195 }
3196
3197 void MacroAssembler::fli_d(FloatRegister Rd, double imm) {
3198 uint64_t d_bits = (uint64_t)julong_cast(imm);
3199 if (d_bits == 0) {
3200 fmv_d_x(Rd, zr);
3201 return;
3202 }
3203 int Rs = zfa_zli_lookup_double(d_bits);
3204 assert(Rs != -1, "Must be");
3205 _fli_d(Rd, Rs);
3206 }
3207
3208 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register tmp) {
3209 if (is_simm12(increment)) {
3210 addi(Rd, Rn, increment);
3211 } else {
3212 assert_different_registers(Rn, tmp);
3213 mv(tmp, increment);
3214 add(Rd, Rn, tmp);
3215 }
3216 }
3217
3218 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register tmp) {
3219 add(Rd, Rn, -decrement, tmp);
3220 }
3221
3222 void MacroAssembler::addw(Register Rd, Register Rn, int64_t increment, Register tmp) {
3223 if (is_simm12(increment)) {
3224 addiw(Rd, Rn, increment);
3225 } else {
3226 assert_different_registers(Rn, tmp);
3227 mv(tmp, increment);
3228 addw(Rd, Rn, tmp);
3229 }
3230 }
3231
3232 void MacroAssembler::subw(Register Rd, Register Rn, int64_t decrement, Register tmp) {
3233 addw(Rd, Rn, -decrement, tmp);
3234 }
3235
3236 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
3237 andr(Rd, Rs1, Rs2);
3238 sext(Rd, Rd, 32);
3239 }
3240
3241 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
3242 orr(Rd, Rs1, Rs2);
3243 sext(Rd, Rd, 32);
3244 }
3245
3246 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
3247 xorr(Rd, Rs1, Rs2);
3248 sext(Rd, Rd, 32);
3249 }
3250
3251 // Rd = Rs1 & (~Rd2)
3252 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
3253 if (UseZbb) {
3254 Assembler::andn(Rd, Rs1, Rs2);
3255 return;
3256 }
3257
3258 notr(Rd, Rs2);
3259 andr(Rd, Rs1, Rd);
3260 }
3261
3262 // Rd = Rs1 | (~Rd2)
3263 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
3264 if (UseZbb) {
3265 Assembler::orn(Rd, Rs1, Rs2);
3266 return;
3267 }
3268
3269 notr(Rd, Rs2);
3270 orr(Rd, Rs1, Rd);
3271 }
3272
3273 // Note: load_unsigned_short used to be called load_unsigned_word.
3274 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3275 int off = offset();
3276 lhu(dst, src);
3277 return off;
3278 }
3279
3280 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3281 int off = offset();
3282 lbu(dst, src);
3283 return off;
3284 }
3285
3286 int MacroAssembler::load_signed_short(Register dst, Address src) {
3287 int off = offset();
3288 lh(dst, src);
3289 return off;
3290 }
3291
3292 int MacroAssembler::load_signed_byte(Register dst, Address src) {
3293 int off = offset();
3294 lb(dst, src);
3295 return off;
3296 }
3297
3298 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
3299 switch (size_in_bytes) {
3300 case 8: ld(dst, src); break;
3301 case 4: is_signed ? lw(dst, src) : lwu(dst, src); break;
3302 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3303 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3304 default: ShouldNotReachHere();
3305 }
3306 }
3307
3308 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
3309 switch (size_in_bytes) {
3310 case 8: sd(src, dst); break;
3311 case 4: sw(src, dst); break;
3312 case 2: sh(src, dst); break;
3313 case 1: sb(src, dst); break;
3314 default: ShouldNotReachHere();
3315 }
3316 }
3317
3318 // granularity is 1 OR 2 bytes per load. dst and src.base() allowed to be the same register
3319 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
3320 if (granularity != 1 && granularity != 2) {
3321 ShouldNotReachHere();
3322 }
3323 if (AvoidUnalignedAccesses && (granularity != 2)) {
3324 assert_different_registers(dst, tmp);
3325 assert_different_registers(tmp, src.base());
3326 is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
3327 slli(tmp, tmp, 8);
3328 lbu(dst, src);
3329 add(dst, dst, tmp);
3330 } else {
3331 is_signed ? lh(dst, src) : lhu(dst, src);
3332 }
3333 }
3334
3335 // granularity is 1, 2 OR 4 bytes per load, if granularity 2 or 4 then dst and src.base() allowed to be the same register
3336 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
3337 if (AvoidUnalignedAccesses && (granularity != 4)) {
3338 switch(granularity) {
3339 case 1:
3340 assert_different_registers(dst, tmp, src.base());
3341 lbu(dst, src);
3342 lbu(tmp, Address(src.base(), src.offset() + 1));
3343 slli(tmp, tmp, 8);
3344 add(dst, dst, tmp);
3345 lbu(tmp, Address(src.base(), src.offset() + 2));
3346 slli(tmp, tmp, 16);
3347 add(dst, dst, tmp);
3348 is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
3349 slli(tmp, tmp, 24);
3350 add(dst, dst, tmp);
3351 break;
3352 case 2:
3353 assert_different_registers(dst, tmp);
3354 assert_different_registers(tmp, src.base());
3355 is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
3356 slli(tmp, tmp, 16);
3357 lhu(dst, src);
3358 add(dst, dst, tmp);
3359 break;
3360 default:
3361 ShouldNotReachHere();
3362 }
3363 } else {
3364 is_signed ? lw(dst, src) : lwu(dst, src);
3365 }
3366 }
3367
3368 // granularity is 1, 2, 4 or 8 bytes per load, if granularity 4 or 8 then dst and src.base() allowed to be same register
3369 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
3370 if (AvoidUnalignedAccesses && (granularity != 8)) {
3371 switch(granularity){
3372 case 1:
3373 assert_different_registers(dst, tmp, src.base());
3374 lbu(dst, src);
3375 lbu(tmp, Address(src.base(), src.offset() + 1));
3376 slli(tmp, tmp, 8);
3377 add(dst, dst, tmp);
3378 lbu(tmp, Address(src.base(), src.offset() + 2));
3379 slli(tmp, tmp, 16);
3380 add(dst, dst, tmp);
3381 lbu(tmp, Address(src.base(), src.offset() + 3));
3382 slli(tmp, tmp, 24);
3383 add(dst, dst, tmp);
3384 lbu(tmp, Address(src.base(), src.offset() + 4));
3385 slli(tmp, tmp, 32);
3386 add(dst, dst, tmp);
3387 lbu(tmp, Address(src.base(), src.offset() + 5));
3388 slli(tmp, tmp, 40);
3389 add(dst, dst, tmp);
3390 lbu(tmp, Address(src.base(), src.offset() + 6));
3391 slli(tmp, tmp, 48);
3392 add(dst, dst, tmp);
3393 lbu(tmp, Address(src.base(), src.offset() + 7));
3394 slli(tmp, tmp, 56);
3395 add(dst, dst, tmp);
3396 break;
3397 case 2:
3398 assert_different_registers(dst, tmp, src.base());
3399 lhu(dst, src);
3400 lhu(tmp, Address(src.base(), src.offset() + 2));
3401 slli(tmp, tmp, 16);
3402 add(dst, dst, tmp);
3403 lhu(tmp, Address(src.base(), src.offset() + 4));
3404 slli(tmp, tmp, 32);
3405 add(dst, dst, tmp);
3406 lhu(tmp, Address(src.base(), src.offset() + 6));
3407 slli(tmp, tmp, 48);
3408 add(dst, dst, tmp);
3409 break;
3410 case 4:
3411 assert_different_registers(dst, tmp);
3412 assert_different_registers(tmp, src.base());
3413 lwu(tmp, Address(src.base(), src.offset() + 4));
3414 slli(tmp, tmp, 32);
3415 lwu(dst, src);
3416 add(dst, dst, tmp);
3417 break;
3418 default:
3419 ShouldNotReachHere();
3420 }
3421 } else {
3422 ld(dst, src);
3423 }
3424 }
3425
3426 // reverse bytes in lower word, sign-extend
3427 // Rd[32:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
3428 void MacroAssembler::revbw(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3429 if (UseZbb) {
3430 rev8(Rd, Rs);
3431 srai(Rd, Rd, 32);
3432 return;
3433 }
3434 assert_different_registers(Rs, tmp1, tmp2);
3435 assert_different_registers(Rd, tmp1, tmp2);
3436 zext(tmp1, Rs, 8);
3437 slli(tmp1, tmp1, 8);
3438 for (int step = 8; step < 24; step += 8) {
3439 srli(tmp2, Rs, step);
3440 zext(tmp2, tmp2, 8);
3441 orr(tmp1, tmp1, tmp2);
3442 slli(tmp1, tmp1, 8);
3443 }
3444 srli(Rd, Rs, 24);
3445 zext(Rd, Rd, 8);
3446 orr(Rd, tmp1, Rd);
3447 sext(Rd, Rd, 32);
3448 }
3449
3450 // reverse bytes in doubleword
3451 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
3452 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3453 if (UseZbb) {
3454 rev8(Rd, Rs);
3455 return;
3456 }
3457 assert_different_registers(Rs, tmp1, tmp2);
3458 assert_different_registers(Rd, tmp1, tmp2);
3459 zext(tmp1, Rs, 8);
3460 slli(tmp1, tmp1, 8);
3461 for (int step = 8; step < 56; step += 8) {
3462 srli(tmp2, Rs, step);
3463 zext(tmp2, tmp2, 8);
3464 orr(tmp1, tmp1, tmp2);
3465 slli(tmp1, tmp1, 8);
3466 }
3467 srli(Rd, Rs, 56);
3468 orr(Rd, tmp1, Rd);
3469 }
3470
3471 // rotate right with shift bits
3472 void MacroAssembler::ror(Register dst, Register src, Register shift, Register tmp)
3473 {
3474 if (UseZbb) {
3475 rorr(dst, src, shift);
3476 return;
3477 }
3478
3479 assert_different_registers(dst, tmp);
3480 assert_different_registers(src, tmp);
3481
3482 mv(tmp, 64);
3483 sub(tmp, tmp, shift);
3484 sll(tmp, src, tmp);
3485 srl(dst, src, shift);
3486 orr(dst, dst, tmp);
3487 }
3488
3489 // rotate right with shift bits
3490 void MacroAssembler::ror(Register dst, Register src, uint32_t shift, Register tmp)
3491 {
3492 if (UseZbb) {
3493 rori(dst, src, shift);
3494 return;
3495 }
3496
3497 assert_different_registers(dst, tmp);
3498 assert_different_registers(src, tmp);
3499 assert(shift < 64, "shift amount must be < 64");
3500 slli(tmp, src, 64 - shift);
3501 srli(dst, src, shift);
3502 orr(dst, dst, tmp);
3503 }
3504
3505 // rotate left with shift bits, 32-bit version
3506 void MacroAssembler::rolw(Register dst, Register src, uint32_t shift, Register tmp) {
3507 if (UseZbb) {
3508 // no roliw available
3509 roriw(dst, src, 32 - shift);
3510 return;
3511 }
3512
3513 assert_different_registers(dst, tmp);
3514 assert_different_registers(src, tmp);
3515 assert(shift < 32, "shift amount must be < 32");
3516 srliw(tmp, src, 32 - shift);
3517 slliw(dst, src, shift);
3518 orr(dst, dst, tmp);
3519 }
3520
3521 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
3522 ld(tmp1, adr);
3523 if (src.is_register()) {
3524 orr(tmp1, tmp1, src.as_register());
3525 } else {
3526 if (is_simm12(src.as_constant())) {
3527 ori(tmp1, tmp1, src.as_constant());
3528 } else {
3529 assert_different_registers(tmp1, tmp2);
3530 mv(tmp2, src.as_constant());
3531 orr(tmp1, tmp1, tmp2);
3532 }
3533 }
3534 sd(tmp1, adr);
3535 }
3536
3537 void MacroAssembler::cmp_klass_beq(Register obj, Register klass,
3538 Register tmp1, Register tmp2,
3539 Label &L, bool is_far) {
3540 assert_different_registers(obj, klass, tmp1, tmp2);
3541 if (UseCompactObjectHeaders) {
3542 load_narrow_klass_compact(tmp1, obj);
3543 } else {
3544 lwu(tmp1, Address(obj, oopDesc::klass_offset_in_bytes()));
3545 }
3546 decode_klass_not_null(tmp1, tmp2);
3547 beq(klass, tmp1, L, is_far);
3548 }
3549
3550 void MacroAssembler::cmp_klass_bne(Register obj, Register klass,
3551 Register tmp1, Register tmp2,
3552 Label &L, bool is_far) {
3553 assert_different_registers(obj, klass, tmp1, tmp2);
3554 if (UseCompactObjectHeaders) {
3555 load_narrow_klass_compact(tmp1, obj);
3556 } else {
3557 lwu(tmp1, Address(obj, oopDesc::klass_offset_in_bytes()));
3558 }
3559 decode_klass_not_null(tmp1, tmp2);
3560 bne(klass, tmp1, L, is_far);
3561 }
3562
3563 // Move an oop into a register.
3564 void MacroAssembler::movoop(Register dst, jobject obj) {
3565 int oop_index;
3566 if (obj == nullptr) {
3567 oop_index = oop_recorder()->allocate_oop_index(obj);
3568 } else {
3569 #ifdef ASSERT
3570 {
3571 ThreadInVMfromUnknown tiv;
3572 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3573 }
3574 #endif
3575 oop_index = oop_recorder()->find_index(obj);
3576 }
3577 RelocationHolder rspec = oop_Relocation::spec(oop_index);
3578
3579 if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
3580 movptr(dst, Address((address)obj, rspec));
3581 } else {
3582 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3583 ld(dst, Address(dummy, rspec));
3584 }
3585 }
3586
3587 // Move a metadata address into a register.
3588 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3589 assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
3590 int oop_index;
3591 if (obj == nullptr) {
3592 oop_index = oop_recorder()->allocate_metadata_index(obj);
3593 } else {
3594 oop_index = oop_recorder()->find_index(obj);
3595 }
3596 RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3597 movptr(dst, Address((address)obj, rspec));
3598 }
3599
3600 // Writes to stack successive pages until offset reached to check for
3601 // stack overflow + shadow pages. This clobbers tmp.
3602 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
3603 assert_different_registers(tmp, size, t0);
3604 // Bang stack for total size given plus shadow page size.
3605 // Bang one page at a time because large size can bang beyond yellow and
3606 // red zones.
3607 mv(t0, (int)os::vm_page_size());
3608 Label loop;
3609 bind(loop);
3610 sub(tmp, sp, t0);
3611 subw(size, size, t0);
3612 sd(size, Address(tmp));
3613 bgtz(size, loop);
3614
3615 // Bang down shadow pages too.
3616 // At this point, (tmp-0) is the last address touched, so don't
3617 // touch it again. (It was touched as (tmp-pagesize) but then tmp
3618 // was post-decremented.) Skip this address by starting at i=1, and
3619 // touch a few more pages below. N.B. It is important to touch all
3620 // the way down to and including i=StackShadowPages.
3621 for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
3622 // this could be any sized move but this is can be a debugging crumb
3623 // so the bigger the better.
3624 sub(tmp, tmp, (int)os::vm_page_size());
3625 sd(size, Address(tmp, 0));
3626 }
3627 }
3628
3629 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
3630 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3631 ld(dst, Address(xmethod, Method::const_offset()));
3632 ld(dst, Address(dst, ConstMethod::constants_offset()));
3633 ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
3634 ld(dst, Address(dst, mirror_offset));
3635 resolve_oop_handle(dst, tmp1, tmp2);
3636 }
3637
3638 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
3639 // OopHandle::resolve is an indirection.
3640 assert_different_registers(result, tmp1, tmp2);
3641 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
3642 }
3643
3644 // ((WeakHandle)result).resolve()
3645 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
3646 assert_different_registers(result, tmp1, tmp2);
3647 Label resolved;
3648
3649 // A null weak handle resolves to null.
3650 beqz(result, resolved);
3651
3652 // Only 64 bit platforms support GCs that require a tmp register
3653 // Only IN_HEAP loads require a thread_tmp register
3654 // WeakHandle::resolve is an indirection like jweak.
3655 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3656 result, Address(result), tmp1, tmp2);
3657 bind(resolved);
3658 }
3659
3660 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3661 Register dst, Address src,
3662 Register tmp1, Register tmp2) {
3663 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3664 decorators = AccessInternal::decorator_fixup(decorators, type);
3665 bool as_raw = (decorators & AS_RAW) != 0;
3666 if (as_raw) {
3667 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
3668 } else {
3669 bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
3670 }
3671 }
3672
3673 void MacroAssembler::null_check(Register reg, int offset) {
3674 if (needs_explicit_null_check(offset)) {
3675 // provoke OS null exception if reg is null by
3676 // accessing M[reg] w/o changing any registers
3677 // NOTE: this is plenty to provoke a segv
3678 ld(zr, Address(reg, 0));
3679 } else {
3680 // nothing to do, (later) access of M[reg + offset]
3681 // will provoke OS null exception if reg is null
3682 }
3683 }
3684
3685 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3686 Address dst, Register val,
3687 Register tmp1, Register tmp2, Register tmp3) {
3688 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3689 decorators = AccessInternal::decorator_fixup(decorators, type);
3690 bool as_raw = (decorators & AS_RAW) != 0;
3691 if (as_raw) {
3692 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3693 } else {
3694 bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3695 }
3696 }
3697
3698 // Algorithm must match CompressedOops::encode.
3699 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3700 verify_oop_msg(s, "broken oop in encode_heap_oop");
3701 if (CompressedOops::base() == nullptr) {
3702 if (CompressedOops::shift() != 0) {
3703 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3704 srli(d, s, LogMinObjAlignmentInBytes);
3705 } else {
3706 mv(d, s);
3707 }
3708 } else {
3709 Label notNull;
3710 sub(d, s, xheapbase);
3711 bgez(d, notNull);
3712 mv(d, zr);
3713 bind(notNull);
3714 if (CompressedOops::shift() != 0) {
3715 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3716 srli(d, d, CompressedOops::shift());
3717 }
3718 }
3719 }
3720
3721 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3722 #ifdef ASSERT
3723 if (CheckCompressedOops) {
3724 Label ok;
3725 bnez(r, ok);
3726 stop("null oop passed to encode_heap_oop_not_null");
3727 bind(ok);
3728 }
3729 #endif
3730 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
3731 if (CompressedOops::base() != nullptr) {
3732 sub(r, r, xheapbase);
3733 }
3734 if (CompressedOops::shift() != 0) {
3735 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3736 srli(r, r, LogMinObjAlignmentInBytes);
3737 }
3738 }
3739
3740 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3741 #ifdef ASSERT
3742 if (CheckCompressedOops) {
3743 Label ok;
3744 bnez(src, ok);
3745 stop("null oop passed to encode_heap_oop_not_null2");
3746 bind(ok);
3747 }
3748 #endif
3749 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
3750
3751 Register data = src;
3752 if (CompressedOops::base() != nullptr) {
3753 sub(dst, src, xheapbase);
3754 data = dst;
3755 }
3756 if (CompressedOops::shift() != 0) {
3757 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3758 srli(dst, data, LogMinObjAlignmentInBytes);
3759 data = dst;
3760 }
3761 if (data == src) {
3762 mv(dst, src);
3763 }
3764 }
3765
3766 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3767 assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
3768 ld(dst, Address(src, oopDesc::mark_offset_in_bytes()));
3769 srli(dst, dst, markWord::klass_shift);
3770 }
3771
3772 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
3773 assert_different_registers(dst, tmp);
3774 assert_different_registers(src, tmp);
3775 if (UseCompactObjectHeaders) {
3776 load_narrow_klass_compact(dst, src);
3777 decode_klass_not_null(dst, tmp);
3778 } else {
3779 lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3780 decode_klass_not_null(dst, tmp);
3781 }
3782 }
3783
3784 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
3785 // FIXME: Should this be a store release? concurrent gcs assumes
3786 // klass length is valid if klass field is not null.
3787 assert(!UseCompactObjectHeaders, "not with compact headers");
3788 encode_klass_not_null(src, tmp);
3789 sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3790
3791 }
3792
3793 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3794 assert(!UseCompactObjectHeaders, "not with compact headers");
3795 // Store to klass gap in destination
3796 sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3797 }
3798
3799 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
3800 assert_different_registers(r, tmp);
3801 decode_klass_not_null(r, r, tmp);
3802 }
3803
3804 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
3805 assert_different_registers(dst, tmp);
3806 assert_different_registers(src, tmp);
3807
3808 if (CompressedKlassPointers::base() == nullptr) {
3809 if (CompressedKlassPointers::shift() != 0) {
3810 slli(dst, src, CompressedKlassPointers::shift());
3811 } else {
3812 mv(dst, src);
3813 }
3814 return;
3815 }
3816
3817 Register xbase = tmp;
3818
3819 mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3820
3821 if (CompressedKlassPointers::shift() != 0) {
3822 // dst = (src << shift) + xbase
3823 shadd(dst, src, xbase, dst /* temporary, dst != xbase */, CompressedKlassPointers::shift());
3824 } else {
3825 add(dst, xbase, src);
3826 }
3827 }
3828
3829 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
3830 assert_different_registers(r, tmp);
3831 encode_klass_not_null(r, r, tmp);
3832 }
3833
3834 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
3835 if (CompressedKlassPointers::base() == nullptr) {
3836 if (CompressedKlassPointers::shift() != 0) {
3837 srli(dst, src, CompressedKlassPointers::shift());
3838 } else {
3839 mv(dst, src);
3840 }
3841 return;
3842 }
3843
3844 if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
3845 CompressedKlassPointers::shift() == 0) {
3846 zext(dst, src, 32);
3847 return;
3848 }
3849
3850 Register xbase = dst;
3851 if (dst == src) {
3852 xbase = tmp;
3853 }
3854
3855 assert_different_registers(src, xbase);
3856 mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3857 sub(dst, src, xbase);
3858 if (CompressedKlassPointers::shift() != 0) {
3859 srli(dst, dst, CompressedKlassPointers::shift());
3860 }
3861 }
3862
3863 void MacroAssembler::decode_heap_oop_not_null(Register r) {
3864 decode_heap_oop_not_null(r, r);
3865 }
3866
3867 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3868 assert(UseCompressedOops, "should only be used for compressed headers");
3869 assert(Universe::heap() != nullptr, "java heap should be initialized");
3870 // Cannot assert, unverified entry point counts instructions (see .ad file)
3871 // vtableStubs also counts instructions in pd_code_size_limit.
3872 // Also do not verify_oop as this is called by verify_oop.
3873 if (CompressedOops::shift() != 0) {
3874 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3875 slli(dst, src, LogMinObjAlignmentInBytes);
3876 if (CompressedOops::base() != nullptr) {
3877 add(dst, xheapbase, dst);
3878 }
3879 } else {
3880 assert(CompressedOops::base() == nullptr, "sanity");
3881 mv(dst, src);
3882 }
3883 }
3884
3885 void MacroAssembler::decode_heap_oop(Register d, Register s) {
3886 if (CompressedOops::base() == nullptr) {
3887 if (CompressedOops::shift() != 0 || d != s) {
3888 slli(d, s, CompressedOops::shift());
3889 }
3890 } else {
3891 Label done;
3892 mv(d, s);
3893 beqz(s, done);
3894 shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
3895 bind(done);
3896 }
3897 verify_oop_msg(d, "broken oop in decode_heap_oop");
3898 }
3899
3900 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
3901 Register tmp2, Register tmp3, DecoratorSet decorators) {
3902 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
3903 }
3904
3905 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
3906 Register tmp2, DecoratorSet decorators) {
3907 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
3908 }
3909
3910 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
3911 Register tmp2, DecoratorSet decorators) {
3912 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
3913 }
3914
3915 // Used for storing nulls.
3916 void MacroAssembler::store_heap_oop_null(Address dst) {
3917 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
3918 }
3919
3920 // Look up the method for a megamorphic invokeinterface call.
3921 // The target method is determined by <intf_klass, itable_index>.
3922 // The receiver klass is in recv_klass.
3923 // On success, the result will be in method_result, and execution falls through.
3924 // On failure, execution transfers to the given label.
3925 void MacroAssembler::lookup_interface_method(Register recv_klass,
3926 Register intf_klass,
3927 RegisterOrConstant itable_index,
3928 Register method_result,
3929 Register scan_tmp,
3930 Label& L_no_such_interface,
3931 bool return_method) {
3932 assert_different_registers(recv_klass, intf_klass, scan_tmp);
3933 assert_different_registers(method_result, intf_klass, scan_tmp);
3934 assert(recv_klass != method_result || !return_method,
3935 "recv_klass can be destroyed when method isn't needed");
3936 assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3937 "caller must use same register for non-constant itable index as for method");
3938
3939 // Compute start of first itableOffsetEntry (which is at the end of the vtable).
3940 int vtable_base = in_bytes(Klass::vtable_start_offset());
3941 int itentry_off = in_bytes(itableMethodEntry::method_offset());
3942 int scan_step = itableOffsetEntry::size() * wordSize;
3943 int vte_size = vtableEntry::size_in_bytes();
3944 assert(vte_size == wordSize, "else adjust times_vte_scale");
3945
3946 lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
3947
3948 // Could store the aligned, prescaled offset in the klass.
3949 shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
3950 add(scan_tmp, scan_tmp, vtable_base);
3951
3952 if (return_method) {
3953 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3954 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3955 if (itable_index.is_register()) {
3956 slli(t0, itable_index.as_register(), 3);
3957 } else {
3958 mv(t0, itable_index.as_constant() << 3);
3959 }
3960 add(recv_klass, recv_klass, t0);
3961 if (itentry_off) {
3962 add(recv_klass, recv_klass, itentry_off);
3963 }
3964 }
3965
3966 Label search, found_method;
3967
3968 ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3969 beq(intf_klass, method_result, found_method);
3970 bind(search);
3971 // Check that the previous entry is non-null. A null entry means that
3972 // the receiver class doesn't implement the interface, and wasn't the
3973 // same as when the caller was compiled.
3974 beqz(method_result, L_no_such_interface, /* is_far */ true);
3975 addi(scan_tmp, scan_tmp, scan_step);
3976 ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3977 bne(intf_klass, method_result, search);
3978
3979 bind(found_method);
3980
3981 // Got a hit.
3982 if (return_method) {
3983 lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
3984 add(method_result, recv_klass, scan_tmp);
3985 ld(method_result, Address(method_result));
3986 }
3987 }
3988
3989 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
3990 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
3991 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
3992 // The target method is determined by <holder_klass, itable_index>.
3993 // The receiver klass is in recv_klass.
3994 // On success, the result will be in method_result, and execution falls through.
3995 // On failure, execution transfers to the given label.
3996 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
3997 Register holder_klass,
3998 Register resolved_klass,
3999 Register method_result,
4000 Register temp_itbl_klass,
4001 Register scan_temp,
4002 int itable_index,
4003 Label& L_no_such_interface) {
4004 // 'method_result' is only used as output register at the very end of this method.
4005 // Until then we can reuse it as 'holder_offset'.
4006 Register holder_offset = method_result;
4007 assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
4008
4009 int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
4010 int scan_step = itableOffsetEntry::size() * wordSize;
4011 int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
4012 int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
4013 int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
4014 const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
4015
4016 Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
4017
4018 lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4019 add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
4020 // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
4021 // + sizeof(vtableEntry) * (recv_klass->_vtable_len);
4022 // scan_temp = &(itable[0]._interface)
4023 // temp_itbl_klass = itable[0]._interface;
4024 shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
4025 ld(temp_itbl_klass, Address(scan_temp));
4026 mv(holder_offset, zr);
4027
4028 // Initial checks:
4029 // - if (holder_klass != resolved_klass), go to "scan for resolved"
4030 // - if (itable[0] == holder_klass), shortcut to "holder found"
4031 // - if (itable[0] == 0), no such interface
4032 bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
4033 beq(holder_klass, temp_itbl_klass, L_holder_found);
4034 beqz(temp_itbl_klass, L_no_such_interface);
4035
4036 // Loop: Look for holder_klass record in itable
4037 // do {
4038 // temp_itbl_klass = *(scan_temp += scan_step);
4039 // if (temp_itbl_klass == holder_klass) {
4040 // goto L_holder_found; // Found!
4041 // }
4042 // } while (temp_itbl_klass != 0);
4043 // goto L_no_such_interface // Not found.
4044 Label L_search_holder;
4045 bind(L_search_holder);
4046 add(scan_temp, scan_temp, scan_step);
4047 ld(temp_itbl_klass, Address(scan_temp));
4048 beq(holder_klass, temp_itbl_klass, L_holder_found);
4049 bnez(temp_itbl_klass, L_search_holder);
4050
4051 j(L_no_such_interface);
4052
4053 // Loop: Look for resolved_class record in itable
4054 // while (true) {
4055 // temp_itbl_klass = *(scan_temp += scan_step);
4056 // if (temp_itbl_klass == 0) {
4057 // goto L_no_such_interface;
4058 // }
4059 // if (temp_itbl_klass == resolved_klass) {
4060 // goto L_resolved_found; // Found!
4061 // }
4062 // if (temp_itbl_klass == holder_klass) {
4063 // holder_offset = scan_temp;
4064 // }
4065 // }
4066 //
4067 Label L_loop_search_resolved;
4068 bind(L_loop_search_resolved);
4069 add(scan_temp, scan_temp, scan_step);
4070 ld(temp_itbl_klass, Address(scan_temp));
4071 bind(L_loop_search_resolved_entry);
4072 beqz(temp_itbl_klass, L_no_such_interface);
4073 beq(resolved_klass, temp_itbl_klass, L_resolved_found);
4074 bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
4075 mv(holder_offset, scan_temp);
4076 j(L_loop_search_resolved);
4077
4078 // See if we already have a holder klass. If not, go and scan for it.
4079 bind(L_resolved_found);
4080 beqz(holder_offset, L_search_holder);
4081 mv(scan_temp, holder_offset);
4082
4083 // Finally, scan_temp contains holder_klass vtable offset
4084 bind(L_holder_found);
4085 lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
4086 add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
4087 - vtable_start_offset_bytes - ioffset_bytes); // substract offsets to restore the original value of recv_klass
4088 add(method_result, recv_klass, method_result);
4089 ld(method_result, Address(method_result));
4090 }
4091
4092 // virtual method calling
4093 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4094 RegisterOrConstant vtable_index,
4095 Register method_result) {
4096 const ByteSize base = Klass::vtable_start_offset();
4097 assert(vtableEntry::size() * wordSize == 8,
4098 "adjust the scaling in the code below");
4099 int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
4100
4101 if (vtable_index.is_register()) {
4102 shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
4103 ld(method_result, Address(method_result, vtable_offset_in_bytes));
4104 } else {
4105 vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
4106 ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
4107 }
4108 }
4109
4110 void MacroAssembler::membar(uint32_t order_constraint) {
4111 if (UseZtso && ((order_constraint & StoreLoad) != StoreLoad)) {
4112 // TSO allows for stores to be reordered after loads. When the compiler
4113 // generates a fence to disallow that, we are required to generate the
4114 // fence for correctness.
4115 BLOCK_COMMENT("elided tso membar");
4116 return;
4117 }
4118
4119 address prev = pc() - MacroAssembler::instruction_size;
4120 address last = code()->last_insn();
4121
4122 if (last != nullptr && is_membar(last) && prev == last) {
4123 // We are merging two memory barrier instructions. On RISCV we
4124 // can do this simply by ORing them together.
4125 set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
4126 BLOCK_COMMENT("merged membar");
4127 return;
4128 }
4129
4130 code()->set_last_insn(pc());
4131 uint32_t predecessor = 0;
4132 uint32_t successor = 0;
4133 membar_mask_to_pred_succ(order_constraint, predecessor, successor);
4134 fence(predecessor, successor);
4135 }
4136
4137 void MacroAssembler::cmodx_fence() {
4138 BLOCK_COMMENT("cmodx fence");
4139 if (VM_Version::supports_fencei_barrier()) {
4140 Assembler::fencei();
4141 }
4142 }
4143
4144 // Form an address from base + offset in Rd. Rd my or may not
4145 // actually be used: you must use the Address that is returned. It
4146 // is up to you to ensure that the shift provided matches the size
4147 // of your data.
4148 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
4149 if (is_simm12(byte_offset)) { // 12: imm in range 2^12
4150 return Address(base, byte_offset);
4151 }
4152
4153 assert_different_registers(Rd, base, noreg);
4154
4155 // Do it the hard way
4156 mv(Rd, byte_offset);
4157 add(Rd, base, Rd);
4158 return Address(Rd);
4159 }
4160
4161 void MacroAssembler::check_klass_subtype(Register sub_klass,
4162 Register super_klass,
4163 Register tmp_reg,
4164 Label& L_success) {
4165 Label L_failure;
4166 check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
4167 check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
4168 bind(L_failure);
4169 }
4170
4171 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod, Register tmp_reg) {
4172 ld(tmp_reg, Address(xthread, JavaThread::polling_word_offset()));
4173 if (at_return) {
4174 bgtu(in_nmethod ? sp : fp, tmp_reg, slow_path, /* is_far */ true);
4175 } else {
4176 test_bit(tmp_reg, tmp_reg, exact_log2(SafepointMechanism::poll_bit()));
4177 bnez(tmp_reg, slow_path, /* is_far */ true);
4178 }
4179 }
4180
4181 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
4182 Label &succeed, Label *fail) {
4183 assert_different_registers(addr, tmp, t0);
4184 assert_different_registers(newv, tmp, t0);
4185 assert_different_registers(oldv, tmp, t0);
4186
4187 // oldv holds comparison value
4188 // newv holds value to write in exchange
4189 // addr identifies memory word to compare against/update
4190 if (UseZacas) {
4191 mv(tmp, oldv);
4192 atomic_cas(tmp, newv, addr, Assembler::int64, Assembler::aq, Assembler::rl);
4193 beq(tmp, oldv, succeed);
4194 } else {
4195 Label retry_load, nope;
4196 bind(retry_load);
4197 // Load reserved from the memory location
4198 load_reserved(tmp, addr, int64, Assembler::aqrl);
4199 // Fail and exit if it is not what we expect
4200 bne(tmp, oldv, nope);
4201 // If the store conditional succeeds, tmp will be zero
4202 store_conditional(tmp, newv, addr, int64, Assembler::rl);
4203 beqz(tmp, succeed);
4204 // Retry only when the store conditional failed
4205 j(retry_load);
4206
4207 bind(nope);
4208 }
4209
4210 // neither amocas nor lr/sc have an implied barrier in the failing case
4211 membar(AnyAny);
4212
4213 mv(oldv, tmp);
4214 if (fail != nullptr) {
4215 j(*fail);
4216 }
4217 }
4218
4219 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
4220 Label &succeed, Label *fail) {
4221 assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
4222 cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
4223 }
4224
4225 void MacroAssembler::load_reserved(Register dst,
4226 Register addr,
4227 Assembler::operand_size size,
4228 Assembler::Aqrl acquire) {
4229 switch (size) {
4230 case int64:
4231 lr_d(dst, addr, acquire);
4232 break;
4233 case int32:
4234 lr_w(dst, addr, acquire);
4235 break;
4236 case uint32:
4237 lr_w(dst, addr, acquire);
4238 zext(dst, dst, 32);
4239 break;
4240 default:
4241 ShouldNotReachHere();
4242 }
4243 }
4244
4245 void MacroAssembler::store_conditional(Register dst,
4246 Register new_val,
4247 Register addr,
4248 Assembler::operand_size size,
4249 Assembler::Aqrl release) {
4250 switch (size) {
4251 case int64:
4252 sc_d(dst, addr, new_val, release);
4253 break;
4254 case int32:
4255 case uint32:
4256 sc_w(dst, addr, new_val, release);
4257 break;
4258 default:
4259 ShouldNotReachHere();
4260 }
4261 }
4262
4263
4264 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val,
4265 Assembler::operand_size size,
4266 Register shift, Register mask, Register aligned_addr) {
4267 assert(size == int8 || size == int16, "unsupported operand size");
4268
4269 andi(shift, addr, 3);
4270 slli(shift, shift, 3);
4271
4272 andi(aligned_addr, addr, ~3);
4273
4274 if (size == int8) {
4275 mv(mask, 0xff);
4276 } else {
4277 // size == int16 case
4278 mv(mask, -1);
4279 zext(mask, mask, 16);
4280 }
4281 sll(mask, mask, shift);
4282
4283 sll(expected, expected, shift);
4284 andr(expected, expected, mask);
4285
4286 sll(new_val, new_val, shift);
4287 andr(new_val, new_val, mask);
4288 }
4289
4290 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
4291 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w or amocas.w,
4292 // which are forced to work with 4-byte aligned address.
4293 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
4294 Register new_val,
4295 Assembler::operand_size size,
4296 Assembler::Aqrl acquire, Assembler::Aqrl release,
4297 Register result, bool result_as_bool,
4298 Register tmp1, Register tmp2, Register tmp3) {
4299 assert(!(UseZacas && UseZabha), "Use amocas");
4300 assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
4301
4302 Register scratch0 = t0, aligned_addr = t1;
4303 Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
4304
4305 cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
4306
4307 Label retry, fail, done;
4308
4309 if (UseZacas) {
4310 lw(result, aligned_addr);
4311
4312 bind(retry); // amocas loads the current value into result
4313 notr(scratch1, mask);
4314
4315 andr(scratch0, result, scratch1); // scratch0 = word - cas bits
4316 orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
4317 bne(result, scratch1, fail); // cas bits differ, cas failed
4318
4319 // result is the same as expected, use as expected value.
4320
4321 // scratch0 is still = word - cas bits
4322 // Or in the new value to create complete new value.
4323 orr(scratch0, scratch0, new_val);
4324
4325 mv(scratch1, result); // save our expected value
4326 atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
4327 bne(scratch1, result, retry);
4328 } else {
4329 notr(scratch1, mask);
4330 bind(retry);
4331
4332 load_reserved(result, aligned_addr, operand_size::int32, acquire);
4333 andr(scratch0, result, mask);
4334 bne(scratch0, expected, fail);
4335
4336 andr(scratch0, result, scratch1); // scratch1 is ~mask
4337 orr(scratch0, scratch0, new_val);
4338 store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
4339 bnez(scratch0, retry);
4340 }
4341
4342 if (result_as_bool) {
4343 mv(result, 1);
4344 j(done);
4345
4346 bind(fail);
4347 mv(result, zr);
4348
4349 bind(done);
4350 } else {
4351 bind(fail);
4352
4353 andr(scratch0, result, mask);
4354 srl(result, scratch0, shift);
4355
4356 if (size == int8) {
4357 sext(result, result, 8);
4358 } else {
4359 // size == int16 case
4360 sext(result, result, 16);
4361 }
4362 }
4363 }
4364
4365 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, to implement
4366 // the weak CAS stuff. The major difference is that it just failed when store conditional
4367 // failed.
4368 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
4369 Register new_val,
4370 Assembler::operand_size size,
4371 Assembler::Aqrl acquire, Assembler::Aqrl release,
4372 Register result,
4373 Register tmp1, Register tmp2, Register tmp3) {
4374 assert(!(UseZacas && UseZabha), "Use amocas");
4375 assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
4376
4377 Register scratch0 = t0, aligned_addr = t1;
4378 Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
4379
4380 cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
4381
4382 Label fail, done;
4383
4384 if (UseZacas) {
4385 lw(result, aligned_addr);
4386
4387 notr(scratch1, mask);
4388
4389 andr(scratch0, result, scratch1); // scratch0 = word - cas bits
4390 orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
4391 bne(result, scratch1, fail); // cas bits differ, cas failed
4392
4393 // result is the same as expected, use as expected value.
4394
4395 // scratch0 is still = word - cas bits
4396 // Or in the new value to create complete new value.
4397 orr(scratch0, scratch0, new_val);
4398
4399 mv(scratch1, result); // save our expected value
4400 atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
4401 bne(scratch1, result, fail); // This weak, so just bail-out.
4402 } else {
4403 notr(scratch1, mask);
4404
4405 load_reserved(result, aligned_addr, operand_size::int32, acquire);
4406 andr(scratch0, result, mask);
4407 bne(scratch0, expected, fail);
4408
4409 andr(scratch0, result, scratch1); // scratch1 is ~mask
4410 orr(scratch0, scratch0, new_val);
4411 store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
4412 bnez(scratch0, fail);
4413 }
4414
4415 // Success
4416 mv(result, 1);
4417 j(done);
4418
4419 // Fail
4420 bind(fail);
4421 mv(result, zr);
4422
4423 bind(done);
4424 }
4425
4426 void MacroAssembler::cmpxchg(Register addr, Register expected,
4427 Register new_val,
4428 Assembler::operand_size size,
4429 Assembler::Aqrl acquire, Assembler::Aqrl release,
4430 Register result, bool result_as_bool) {
4431 assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4432 assert_different_registers(addr, t0);
4433 assert_different_registers(expected, t0);
4434 assert_different_registers(new_val, t0);
4435
4436 // NOTE:
4437 // Register _result_ may be the same register as _new_val_ or _expected_.
4438 // Hence do NOT use _result_ until after 'cas'.
4439 //
4440 // Register _expected_ may be the same register as _new_val_ and is assumed to be preserved.
4441 // Hence do NOT change _expected_ or _new_val_.
4442 //
4443 // Having _expected_ and _new_val_ being the same register is a very puzzling cas.
4444 //
4445 // TODO: Address these issues.
4446
4447 if (UseZacas) {
4448 if (result_as_bool) {
4449 mv(t0, expected);
4450 atomic_cas(t0, new_val, addr, size, acquire, release);
4451 xorr(t0, t0, expected);
4452 seqz(result, t0);
4453 } else {
4454 mv(t0, expected);
4455 atomic_cas(t0, new_val, addr, size, acquire, release);
4456 mv(result, t0);
4457 }
4458 return;
4459 }
4460
4461 Label retry_load, done, ne_done;
4462 bind(retry_load);
4463 load_reserved(t0, addr, size, acquire);
4464 bne(t0, expected, ne_done);
4465 store_conditional(t0, new_val, addr, size, release);
4466 bnez(t0, retry_load);
4467
4468 // equal, succeed
4469 if (result_as_bool) {
4470 mv(result, 1);
4471 } else {
4472 mv(result, expected);
4473 }
4474 j(done);
4475
4476 // not equal, failed
4477 bind(ne_done);
4478 if (result_as_bool) {
4479 mv(result, zr);
4480 } else {
4481 mv(result, t0);
4482 }
4483
4484 bind(done);
4485 }
4486
4487 void MacroAssembler::weak_cmpxchg(Register addr, Register expected,
4488 Register new_val,
4489 Assembler::operand_size size,
4490 Assembler::Aqrl acquire, Assembler::Aqrl release,
4491 Register result) {
4492 assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4493 assert_different_registers(addr, t0);
4494 assert_different_registers(expected, t0);
4495 assert_different_registers(new_val, t0);
4496
4497 if (UseZacas) {
4498 cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
4499 return;
4500 }
4501
4502 Label fail, done;
4503 load_reserved(t0, addr, size, acquire);
4504 bne(t0, expected, fail);
4505 store_conditional(t0, new_val, addr, size, release);
4506 bnez(t0, fail);
4507
4508 // Success
4509 mv(result, 1);
4510 j(done);
4511
4512 // Fail
4513 bind(fail);
4514 mv(result, zr);
4515
4516 bind(done);
4517 }
4518
4519 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE) \
4520 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
4521 prev = prev->is_valid() ? prev : zr; \
4522 if (incr.is_register()) { \
4523 AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE)); \
4524 } else { \
4525 mv(t0, incr.as_constant()); \
4526 AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE)); \
4527 } \
4528 return; \
4529 }
4530
4531 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
4532 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
4533 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
4534 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
4535
4536 #undef ATOMIC_OP
4537
4538 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE) \
4539 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
4540 prev = prev->is_valid() ? prev : zr; \
4541 AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE)); \
4542 return; \
4543 }
4544
4545 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
4546 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
4547 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
4548 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
4549
4550 #undef ATOMIC_XCHG
4551
4552 #define ATOMIC_XCHGU(OP1, OP2) \
4553 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) { \
4554 atomic_##OP2(prev, newv, addr); \
4555 zext(prev, prev, 32); \
4556 return; \
4557 }
4558
4559 ATOMIC_XCHGU(xchgwu, xchgw)
4560 ATOMIC_XCHGU(xchgalwu, xchgalw)
4561
4562 #undef ATOMIC_XCHGU
4563
4564 void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr,
4565 Assembler::operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
4566 switch (size) {
4567 case int64:
4568 amocas_d(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4569 break;
4570 case int32:
4571 amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4572 break;
4573 case uint32:
4574 amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4575 zext(prev, prev, 32);
4576 break;
4577 case int16:
4578 amocas_h(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4579 break;
4580 case int8:
4581 amocas_b(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4582 break;
4583 default:
4584 ShouldNotReachHere();
4585 }
4586 }
4587
4588 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
4589 assert(CodeCache::contains(entry.target()),
4590 "destination of far jump not found in code cache");
4591 assert(entry.rspec().type() == relocInfo::external_word_type
4592 || entry.rspec().type() == relocInfo::runtime_call_type
4593 || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4594 // Fixed length: see MacroAssembler::far_branch_size()
4595 // We can use auipc + jr here because we know that the total size of
4596 // the code cache cannot exceed 2Gb.
4597 relocate(entry.rspec(), [&] {
4598 int64_t distance = entry.target() - pc();
4599 int32_t offset = ((int32_t)distance << 20) >> 20;
4600 assert(is_valid_32bit_offset(distance), "Far jump using wrong instructions.");
4601 auipc(tmp, (int32_t)distance + 0x800);
4602 jr(tmp, offset);
4603 });
4604 }
4605
4606 void MacroAssembler::far_call(const Address &entry, Register tmp) {
4607 assert(tmp != x5, "tmp register must not be x5.");
4608 assert(CodeCache::contains(entry.target()),
4609 "destination of far call not found in code cache");
4610 assert(entry.rspec().type() == relocInfo::external_word_type
4611 || entry.rspec().type() == relocInfo::runtime_call_type
4612 || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4613 // Fixed length: see MacroAssembler::far_branch_size()
4614 // We can use auipc + jalr here because we know that the total size of
4615 // the code cache cannot exceed 2Gb.
4616 relocate(entry.rspec(), [&] {
4617 int64_t distance = entry.target() - pc();
4618 int32_t offset = ((int32_t)distance << 20) >> 20;
4619 assert(is_valid_32bit_offset(distance), "Far call using wrong instructions.");
4620 auipc(tmp, (int32_t)distance + 0x800);
4621 jalr(tmp, offset);
4622 });
4623 }
4624
4625 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4626 Register super_klass,
4627 Register tmp_reg,
4628 Label* L_success,
4629 Label* L_failure,
4630 Label* L_slow_path,
4631 Register super_check_offset) {
4632 assert_different_registers(sub_klass, super_klass, tmp_reg, super_check_offset);
4633 bool must_load_sco = !super_check_offset->is_valid();
4634 if (must_load_sco) {
4635 assert(tmp_reg != noreg, "supply either a temp or a register offset");
4636 }
4637
4638 Label L_fallthrough;
4639 int label_nulls = 0;
4640 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4641 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4642 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
4643 assert(label_nulls <= 1, "at most one null in batch");
4644
4645 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4646 int sco_offset = in_bytes(Klass::super_check_offset_offset());
4647 Address super_check_offset_addr(super_klass, sco_offset);
4648
4649 // Hacked jmp, which may only be used just before L_fallthrough.
4650 #define final_jmp(label) \
4651 if (&(label) == &L_fallthrough) { /*do nothing*/ } \
4652 else j(label) /*omit semi*/
4653
4654 // If the pointers are equal, we are done (e.g., String[] elements).
4655 // This self-check enables sharing of secondary supertype arrays among
4656 // non-primary types such as array-of-interface. Otherwise, each such
4657 // type would need its own customized SSA.
4658 // We move this check to the front of the fast path because many
4659 // type checks are in fact trivially successful in this manner,
4660 // so we get a nicely predicted branch right at the start of the check.
4661 beq(sub_klass, super_klass, *L_success);
4662
4663 // Check the supertype display:
4664 if (must_load_sco) {
4665 lwu(tmp_reg, super_check_offset_addr);
4666 super_check_offset = tmp_reg;
4667 }
4668 add(t0, sub_klass, super_check_offset);
4669 Address super_check_addr(t0);
4670 ld(t0, super_check_addr); // load displayed supertype
4671 beq(super_klass, t0, *L_success);
4672
4673 // This check has worked decisively for primary supers.
4674 // Secondary supers are sought in the super_cache ('super_cache_addr').
4675 // (Secondary supers are interfaces and very deeply nested subtypes.)
4676 // This works in the same check above because of a tricky aliasing
4677 // between the super_Cache and the primary super display elements.
4678 // (The 'super_check_addr' can address either, as the case requires.)
4679 // Note that the cache is updated below if it does not help us find
4680 // what we need immediately.
4681 // So if it was a primary super, we can just fail immediately.
4682 // Otherwise, it's the slow path for us (no success at this point).
4683
4684 mv(t1, sc_offset);
4685 if (L_failure == &L_fallthrough) {
4686 beq(super_check_offset, t1, *L_slow_path);
4687 } else {
4688 bne(super_check_offset, t1, *L_failure, /* is_far */ true);
4689 final_jmp(*L_slow_path);
4690 }
4691
4692 bind(L_fallthrough);
4693
4694 #undef final_jmp
4695 }
4696
4697 // Scans count pointer sized words at [addr] for occurrence of value,
4698 // generic
4699 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
4700 Register tmp) {
4701 Label Lloop, Lexit;
4702 beqz(count, Lexit);
4703 bind(Lloop);
4704 ld(tmp, addr);
4705 beq(value, tmp, Lexit);
4706 addi(addr, addr, wordSize);
4707 subi(count, count, 1);
4708 bnez(count, Lloop);
4709 bind(Lexit);
4710 }
4711
4712 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
4713 Register super_klass,
4714 Register tmp1_reg,
4715 Register tmp2_reg,
4716 Label* L_success,
4717 Label* L_failure,
4718 bool set_cond_codes) {
4719 assert_different_registers(sub_klass, super_klass, tmp1_reg);
4720 if (tmp2_reg != noreg) {
4721 assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
4722 }
4723 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
4724
4725 Label L_fallthrough;
4726 int label_nulls = 0;
4727 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4728 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4729
4730 assert(label_nulls <= 1, "at most one null in the batch");
4731
4732 // A couple of useful fields in sub_klass:
4733 int ss_offset = in_bytes(Klass::secondary_supers_offset());
4734 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4735 Address secondary_supers_addr(sub_klass, ss_offset);
4736 Address super_cache_addr( sub_klass, sc_offset);
4737
4738 BLOCK_COMMENT("check_klass_subtype_slow_path");
4739
4740 // Do a linear scan of the secondary super-klass chain.
4741 // This code is rarely used, so simplicity is a virtue here.
4742 // The repne_scan instruction uses fixed registers, which we must spill.
4743 // Don't worry too much about pre-existing connections with the input regs.
4744
4745 assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
4746 assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
4747
4748 RegSet pushed_registers;
4749 if (!IS_A_TEMP(x12)) {
4750 pushed_registers += x12;
4751 }
4752 if (!IS_A_TEMP(x15)) {
4753 pushed_registers += x15;
4754 }
4755
4756 if (super_klass != x10) {
4757 if (!IS_A_TEMP(x10)) {
4758 pushed_registers += x10;
4759 }
4760 }
4761
4762 push_reg(pushed_registers, sp);
4763
4764 // Get super_klass value into x10 (even if it was in x15 or x12)
4765 mv(x10, super_klass);
4766
4767 #ifndef PRODUCT
4768 incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
4769 #endif // PRODUCT
4770
4771 // We will consult the secondary-super array.
4772 ld(x15, secondary_supers_addr);
4773 // Load the array length.
4774 lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
4775 // Skip to start of data.
4776 addi(x15, x15, Array<Klass*>::base_offset_in_bytes());
4777
4778 // Set t0 to an obvious invalid value, falling through by default
4779 mv(t0, -1);
4780 // Scan X12 words at [X15] for an occurrence of X10.
4781 repne_scan(x15, x10, x12, t0);
4782
4783 // pop will restore x10, so we should use a temp register to keep its value
4784 mv(t1, x10);
4785
4786 // Unspill the temp registers:
4787 pop_reg(pushed_registers, sp);
4788
4789 bne(t1, t0, *L_failure);
4790
4791 // Success. Cache the super we found an proceed in triumph.
4792 if (UseSecondarySupersCache) {
4793 sd(super_klass, super_cache_addr);
4794 }
4795
4796 if (L_success != &L_fallthrough) {
4797 j(*L_success);
4798 }
4799
4800 #undef IS_A_TEMP
4801
4802 bind(L_fallthrough);
4803 }
4804
4805 // population_count variant for running without the CPOP
4806 // instruction, which was introduced with Zbb extension.
4807 void MacroAssembler::population_count(Register dst, Register src,
4808 Register tmp1, Register tmp2) {
4809 if (UsePopCountInstruction) {
4810 cpop(dst, src);
4811 } else {
4812 assert_different_registers(src, tmp1, tmp2);
4813 assert_different_registers(dst, tmp1, tmp2);
4814 Label loop, done;
4815
4816 mv(tmp1, src);
4817 // dst = 0;
4818 // while(tmp1 != 0) {
4819 // dst++;
4820 // tmp1 &= (tmp1 - 1);
4821 // }
4822 mv(dst, zr);
4823 beqz(tmp1, done);
4824 {
4825 bind(loop);
4826 addi(dst, dst, 1);
4827 subi(tmp2, tmp1, 1);
4828 andr(tmp1, tmp1, tmp2);
4829 bnez(tmp1, loop);
4830 }
4831 bind(done);
4832 }
4833 }
4834
4835 // If Register r is invalid, remove a new register from
4836 // available_regs, and add new register to regs_to_push.
4837 Register MacroAssembler::allocate_if_noreg(Register r,
4838 RegSetIterator<Register> &available_regs,
4839 RegSet ®s_to_push) {
4840 if (!r->is_valid()) {
4841 r = *available_regs++;
4842 regs_to_push += r;
4843 }
4844 return r;
4845 }
4846
4847 // check_klass_subtype_slow_path_table() looks for super_klass in the
4848 // hash table belonging to super_klass, branching to L_success or
4849 // L_failure as appropriate. This is essentially a shim which
4850 // allocates registers as necessary then calls
4851 // lookup_secondary_supers_table() to do the work. Any of the tmp
4852 // regs may be noreg, in which case this logic will chooses some
4853 // registers push and pop them from the stack.
4854 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
4855 Register super_klass,
4856 Register tmp1_reg,
4857 Register tmp2_reg,
4858 Label* L_success,
4859 Label* L_failure,
4860 bool set_cond_codes) {
4861 RegSet tmps = RegSet::of(tmp1_reg, tmp2_reg);
4862
4863 assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg);
4864
4865 Label L_fallthrough;
4866 int label_nulls = 0;
4867 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4868 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4869 assert(label_nulls <= 1, "at most one null in the batch");
4870
4871 BLOCK_COMMENT("check_klass_subtype_slow_path");
4872
4873 RegSet caller_save_regs = RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31);
4874 RegSetIterator<Register> available_regs = (caller_save_regs - tmps - sub_klass - super_klass).begin();
4875
4876 RegSet pushed_regs;
4877
4878 tmp1_reg = allocate_if_noreg(tmp1_reg, available_regs, pushed_regs);
4879 tmp2_reg = allocate_if_noreg(tmp2_reg, available_regs, pushed_regs);
4880
4881 Register tmp3_reg = noreg, tmp4_reg = noreg, result_reg = noreg;
4882
4883 tmp3_reg = allocate_if_noreg(tmp3_reg, available_regs, pushed_regs);
4884 tmp4_reg = allocate_if_noreg(tmp4_reg, available_regs, pushed_regs);
4885 result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
4886
4887 push_reg(pushed_regs, sp);
4888
4889 lookup_secondary_supers_table_var(sub_klass,
4890 super_klass,
4891 result_reg,
4892 tmp1_reg, tmp2_reg, tmp3_reg,
4893 tmp4_reg, nullptr);
4894
4895 // Move the result to t1 as we are about to unspill the tmp registers.
4896 mv(t1, result_reg);
4897
4898 // Unspill the tmp. registers:
4899 pop_reg(pushed_regs, sp);
4900
4901 // NB! Callers may assume that, when set_cond_codes is true, this
4902 // code sets tmp2_reg to a nonzero value.
4903 if (set_cond_codes) {
4904 mv(tmp2_reg, 1);
4905 }
4906
4907 bnez(t1, *L_failure);
4908
4909 if (L_success != &L_fallthrough) {
4910 j(*L_success);
4911 }
4912
4913 bind(L_fallthrough);
4914 }
4915
4916 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4917 Register super_klass,
4918 Register tmp1_reg,
4919 Register tmp2_reg,
4920 Label* L_success,
4921 Label* L_failure,
4922 bool set_cond_codes) {
4923 if (UseSecondarySupersTable) {
4924 check_klass_subtype_slow_path_table
4925 (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4926 } else {
4927 check_klass_subtype_slow_path_linear
4928 (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4929 }
4930 }
4931
4932 // Ensure that the inline code and the stub are using the same registers
4933 // as we need to call the stub from inline code when there is a collision
4934 // in the hashed lookup in the secondary supers array.
4935 #define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length, \
4936 r_array_index, r_sub_klass, result, r_bitmap) \
4937 do { \
4938 assert(r_super_klass == x10 && \
4939 r_array_base == x11 && \
4940 r_array_length == x12 && \
4941 (r_array_index == x13 || r_array_index == noreg) && \
4942 (r_sub_klass == x14 || r_sub_klass == noreg) && \
4943 (result == x15 || result == noreg) && \
4944 (r_bitmap == x16 || r_bitmap == noreg), "registers must match riscv.ad"); \
4945 } while(0)
4946
4947 bool MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
4948 Register r_super_klass,
4949 Register result,
4950 Register tmp1,
4951 Register tmp2,
4952 Register tmp3,
4953 Register tmp4,
4954 u1 super_klass_slot,
4955 bool stub_is_near) {
4956 assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
4957
4958 Label L_fallthrough;
4959
4960 BLOCK_COMMENT("lookup_secondary_supers_table {");
4961
4962 const Register
4963 r_array_base = tmp1, // x11
4964 r_array_length = tmp2, // x12
4965 r_array_index = tmp3, // x13
4966 r_bitmap = tmp4; // x16
4967
4968 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4969 r_array_index, r_sub_klass, result, r_bitmap);
4970
4971 u1 bit = super_klass_slot;
4972
4973 // Initialize result value to 1 which means mismatch.
4974 mv(result, 1);
4975
4976 ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4977
4978 // First check the bitmap to see if super_klass might be present. If
4979 // the bit is zero, we are certain that super_klass is not one of
4980 // the secondary supers.
4981 test_bit(t0, r_bitmap, bit);
4982 beqz(t0, L_fallthrough);
4983
4984 // Get the first array index that can contain super_klass into r_array_index.
4985 if (bit != 0) {
4986 slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - bit));
4987 population_count(r_array_index, r_array_index, tmp1, tmp2);
4988 } else {
4989 mv(r_array_index, (u1)1);
4990 }
4991
4992 // We will consult the secondary-super array.
4993 ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4994
4995 // The value i in r_array_index is >= 1, so even though r_array_base
4996 // points to the length, we don't need to adjust it to point to the data.
4997 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4998 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4999
5000 shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
5001 ld(result, Address(result));
5002 xorr(result, result, r_super_klass);
5003 beqz(result, L_fallthrough); // Found a match
5004
5005 // Is there another entry to check? Consult the bitmap.
5006 test_bit(t0, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
5007 beqz(t0, L_fallthrough);
5008
5009 // Linear probe.
5010 if (bit != 0) {
5011 ror(r_bitmap, r_bitmap, bit);
5012 }
5013
5014 // The slot we just inspected is at secondary_supers[r_array_index - 1].
5015 // The next slot to be inspected, by the stub we're about to call,
5016 // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
5017 // have been checked.
5018 rt_call(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
5019
5020 BLOCK_COMMENT("} lookup_secondary_supers_table");
5021
5022 bind(L_fallthrough);
5023
5024 if (VerifySecondarySupers) {
5025 verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
5026 result, tmp1, tmp2, tmp3); // x15, x11, x12, x13
5027 }
5028 return true;
5029 }
5030
5031 // At runtime, return 0 in result if r_super_klass is a superclass of
5032 // r_sub_klass, otherwise return nonzero. Use this version of
5033 // lookup_secondary_supers_table() if you don't know ahead of time
5034 // which superclass will be searched for. Used by interpreter and
5035 // runtime stubs. It is larger and has somewhat greater latency than
5036 // the version above, which takes a constant super_klass_slot.
5037 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
5038 Register r_super_klass,
5039 Register result,
5040 Register tmp1,
5041 Register tmp2,
5042 Register tmp3,
5043 Register tmp4,
5044 Label *L_success) {
5045 assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
5046
5047 Label L_fallthrough;
5048
5049 BLOCK_COMMENT("lookup_secondary_supers_table {");
5050
5051 const Register
5052 r_array_index = tmp3,
5053 r_bitmap = tmp4,
5054 slot = t1;
5055
5056 lbu(slot, Address(r_super_klass, Klass::hash_slot_offset()));
5057
5058 // Make sure that result is nonzero if the test below misses.
5059 mv(result, 1);
5060
5061 ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
5062
5063 // First check the bitmap to see if super_klass might be present. If
5064 // the bit is zero, we are certain that super_klass is not one of
5065 // the secondary supers.
5066
5067 // This next instruction is equivalent to:
5068 // mv(tmp_reg, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
5069 // sub(r_array_index, slot, tmp_reg);
5070 xori(r_array_index, slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
5071 sll(r_array_index, r_bitmap, r_array_index);
5072 test_bit(t0, r_array_index, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1);
5073 beqz(t0, L_fallthrough);
5074
5075 // Get the first array index that can contain super_klass into r_array_index.
5076 population_count(r_array_index, r_array_index, tmp1, tmp2);
5077
5078 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
5079
5080 const Register
5081 r_array_base = tmp1,
5082 r_array_length = tmp2;
5083
5084 // The value i in r_array_index is >= 1, so even though r_array_base
5085 // points to the length, we don't need to adjust it to point to the data.
5086 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
5087 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
5088
5089 // We will consult the secondary-super array.
5090 ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
5091
5092 shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
5093 ld(result, Address(result));
5094 xorr(result, result, r_super_klass);
5095 beqz(result, L_success ? *L_success : L_fallthrough); // Found a match
5096
5097 // Is there another entry to check? Consult the bitmap.
5098 ror(r_bitmap, r_bitmap, slot);
5099 test_bit(t0, r_bitmap, 1);
5100 beqz(t0, L_fallthrough);
5101
5102 // The slot we just inspected is at secondary_supers[r_array_index - 1].
5103 // The next slot to be inspected, by the logic we're about to call,
5104 // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
5105 // have been checked.
5106 lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
5107 r_bitmap, result, r_array_length, false /*is_stub*/);
5108
5109 BLOCK_COMMENT("} lookup_secondary_supers_table");
5110
5111 bind(L_fallthrough);
5112
5113 if (VerifySecondarySupers) {
5114 verify_secondary_supers_table(r_sub_klass, r_super_klass,
5115 result, tmp1, tmp2, tmp3);
5116 }
5117
5118 if (L_success) {
5119 beqz(result, *L_success);
5120 }
5121 }
5122
5123 // Called by code generated by check_klass_subtype_slow_path
5124 // above. This is called when there is a collision in the hashed
5125 // lookup in the secondary supers array.
5126 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
5127 Register r_array_base,
5128 Register r_array_index,
5129 Register r_bitmap,
5130 Register result,
5131 Register tmp,
5132 bool is_stub) {
5133 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp, result, t0);
5134
5135 const Register
5136 r_array_length = tmp,
5137 r_sub_klass = noreg; // unused
5138
5139 if (is_stub) {
5140 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
5141 r_array_index, r_sub_klass, result, r_bitmap);
5142 }
5143
5144 Label L_matched, L_fallthrough, L_bitmap_full;
5145
5146 // Initialize result value to 1 which means mismatch.
5147 mv(result, 1);
5148
5149 // Load the array length.
5150 lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
5151 // And adjust the array base to point to the data.
5152 // NB! Effectively increments current slot index by 1.
5153 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
5154 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
5155
5156 // Check if bitmap is SECONDARY_SUPERS_BITMAP_FULL
5157 assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
5158 subw(t0, r_array_length, Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
5159 bgtz(t0, L_bitmap_full);
5160
5161 // NB! Our caller has checked bits 0 and 1 in the bitmap. The
5162 // current slot (at secondary_supers[r_array_index]) has not yet
5163 // been inspected, and r_array_index may be out of bounds if we
5164 // wrapped around the end of the array.
5165
5166 { // This is conventional linear probing, but instead of terminating
5167 // when a null entry is found in the table, we maintain a bitmap
5168 // in which a 0 indicates missing entries.
5169 // As long as the bitmap is not completely full,
5170 // array_length == popcount(bitmap). The array_length check above
5171 // guarantees there are 0s in the bitmap, so the loop eventually
5172 // terminates.
5173 Label L_loop;
5174 bind(L_loop);
5175
5176 // Check for wraparound.
5177 Label skip;
5178 blt(r_array_index, r_array_length, skip);
5179 mv(r_array_index, zr);
5180 bind(skip);
5181
5182 shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
5183 ld(t0, Address(t0));
5184 beq(t0, r_super_klass, L_matched);
5185
5186 test_bit(t0, r_bitmap, 2); // look-ahead check (Bit 2); result is non-zero
5187 beqz(t0, L_fallthrough);
5188
5189 ror(r_bitmap, r_bitmap, 1);
5190 addi(r_array_index, r_array_index, 1);
5191 j(L_loop);
5192 }
5193
5194 { // Degenerate case: more than 64 secondary supers.
5195 // FIXME: We could do something smarter here, maybe a vectorized
5196 // comparison or a binary search, but is that worth any added
5197 // complexity?
5198 bind(L_bitmap_full);
5199 repne_scan(r_array_base, r_super_klass, r_array_length, t0);
5200 bne(r_super_klass, t0, L_fallthrough);
5201 }
5202
5203 bind(L_matched);
5204 mv(result, zr);
5205
5206 bind(L_fallthrough);
5207 }
5208
5209 // Make sure that the hashed lookup and a linear scan agree.
5210 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
5211 Register r_super_klass,
5212 Register result,
5213 Register tmp1,
5214 Register tmp2,
5215 Register tmp3) {
5216 assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0, t1);
5217
5218 const Register
5219 r_array_base = tmp1, // X11
5220 r_array_length = tmp2, // X12
5221 r_array_index = noreg, // unused
5222 r_bitmap = noreg; // unused
5223
5224 BLOCK_COMMENT("verify_secondary_supers_table {");
5225
5226 // We will consult the secondary-super array.
5227 ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
5228
5229 // Load the array length.
5230 lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
5231 // And adjust the array base to point to the data.
5232 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
5233
5234 repne_scan(r_array_base, r_super_klass, r_array_length, t0);
5235 Label failed;
5236 mv(tmp3, 1);
5237 bne(r_super_klass, t0, failed);
5238 mv(tmp3, zr);
5239 bind(failed);
5240
5241 snez(result, result); // normalize result to 0/1 for comparison
5242
5243 Label passed;
5244 beq(tmp3, result, passed);
5245 {
5246 mv(x10, r_super_klass);
5247 mv(x11, r_sub_klass);
5248 mv(x12, tmp3);
5249 mv(x13, result);
5250 mv(x14, (address)("mismatch"));
5251 rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
5252 should_not_reach_here();
5253 }
5254 bind(passed);
5255
5256 BLOCK_COMMENT("} verify_secondary_supers_table");
5257 }
5258
5259 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
5260 void MacroAssembler::tlab_allocate(Register obj,
5261 Register var_size_in_bytes,
5262 int con_size_in_bytes,
5263 Register tmp1,
5264 Register tmp2,
5265 Label& slow_case,
5266 bool is_far) {
5267 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
5268 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
5269 }
5270
5271 // get_thread() can be called anywhere inside generated code so we
5272 // need to save whatever non-callee save context might get clobbered
5273 // by the call to Thread::current() or, indeed, the call setup code.
5274 void MacroAssembler::get_thread(Register thread) {
5275 // save all call-clobbered regs except thread
5276 RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
5277 RegSet::range(x28, x31) + ra - thread;
5278 push_reg(saved_regs, sp);
5279
5280 mv(t1, CAST_FROM_FN_PTR(address, Thread::current));
5281 jalr(t1);
5282 if (thread != c_rarg0) {
5283 mv(thread, c_rarg0);
5284 }
5285
5286 // restore pushed registers
5287 pop_reg(saved_regs, sp);
5288 }
5289
5290 void MacroAssembler::load_byte_map_base(Register reg) {
5291 CardTableBarrierSet* ctbs = CardTableBarrierSet::barrier_set();
5292 mv(reg, (uint64_t)ctbs->card_table_base_const());
5293 }
5294
5295 void MacroAssembler::build_frame(int framesize) {
5296 assert(framesize >= 2, "framesize must include space for FP/RA");
5297 assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5298 sub(sp, sp, framesize);
5299 sd(fp, Address(sp, framesize - 2 * wordSize));
5300 sd(ra, Address(sp, framesize - wordSize));
5301 if (PreserveFramePointer) { add(fp, sp, framesize); }
5302 }
5303
5304 void MacroAssembler::remove_frame(int framesize) {
5305 assert(framesize >= 2, "framesize must include space for FP/RA");
5306 assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5307 ld(fp, Address(sp, framesize - 2 * wordSize));
5308 ld(ra, Address(sp, framesize - wordSize));
5309 add(sp, sp, framesize);
5310 }
5311
5312 void MacroAssembler::reserved_stack_check() {
5313 // testing if reserved zone needs to be enabled
5314 Label no_reserved_zone_enabling;
5315
5316 ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
5317 bltu(sp, t0, no_reserved_zone_enabling);
5318
5319 enter(); // RA and FP are live.
5320 mv(c_rarg0, xthread);
5321 rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
5322 leave();
5323
5324 // We have already removed our own frame.
5325 // throw_delayed_StackOverflowError will think that it's been
5326 // called by our caller.
5327 j(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
5328 should_not_reach_here();
5329
5330 bind(no_reserved_zone_enabling);
5331 }
5332
5333 // Move the address of the polling page into dest.
5334 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
5335 ld(dest, Address(xthread, JavaThread::polling_page_offset()));
5336 }
5337
5338 // Read the polling page. The address of the polling page must
5339 // already be in r.
5340 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
5341 relocate(rtype, [&] {
5342 lwu(zr, Address(r, offset));
5343 });
5344 }
5345
5346 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5347 #ifdef ASSERT
5348 {
5349 ThreadInVMfromUnknown tiv;
5350 assert (UseCompressedOops, "should only be used for compressed oops");
5351 assert (Universe::heap() != nullptr, "java heap should be initialized");
5352 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5353 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
5354 }
5355 #endif
5356 int oop_index = oop_recorder()->find_index(obj);
5357 relocate(oop_Relocation::spec(oop_index), [&] {
5358 li32(dst, 0xDEADBEEF);
5359 });
5360 zext(dst, dst, 32);
5361 }
5362
5363 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5364 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5365 int index = oop_recorder()->find_index(k);
5366
5367 narrowKlass nk = CompressedKlassPointers::encode(k);
5368 relocate(metadata_Relocation::spec(index), [&] {
5369 li32(dst, nk);
5370 });
5371 zext(dst, dst, 32);
5372 }
5373
5374 address MacroAssembler::reloc_call(Address entry, Register tmp) {
5375 assert(entry.rspec().type() == relocInfo::runtime_call_type ||
5376 entry.rspec().type() == relocInfo::opt_virtual_call_type ||
5377 entry.rspec().type() == relocInfo::static_call_type ||
5378 entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
5379
5380 address target = entry.target();
5381
5382 if (!in_scratch_emit_size()) {
5383 address stub = emit_reloc_call_address_stub(offset(), target);
5384 if (stub == nullptr) {
5385 postcond(pc() == badAddress);
5386 return nullptr; // CodeCache is full
5387 }
5388 }
5389
5390 address call_pc = pc();
5391 #ifdef ASSERT
5392 if (entry.rspec().type() != relocInfo::runtime_call_type) {
5393 assert_alignment(call_pc);
5394 }
5395 #endif
5396
5397 // The relocation created while emitting the stub will ensure this
5398 // call instruction is subsequently patched to call the stub.
5399 relocate(entry.rspec(), [&] {
5400 auipc(tmp, 0);
5401 ld(tmp, Address(tmp, 0));
5402 jalr(tmp);
5403 });
5404
5405 postcond(pc() != badAddress);
5406 return call_pc;
5407 }
5408
5409 address MacroAssembler::ic_call(address entry, jint method_index) {
5410 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
5411 assert(!in_compressible_scope(), "Must be");
5412 movptr(t0, (address)Universe::non_oop_word(), t1);
5413 assert_cond(entry != nullptr);
5414 return reloc_call(Address(entry, rh));
5415 }
5416
5417 int MacroAssembler::ic_check_size() {
5418 // No compressed
5419 return (MacroAssembler::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
5420 far_branch_size() + (UseCompactObjectHeaders ? MacroAssembler::instruction_size * 1 : 0);
5421 }
5422
5423 int MacroAssembler::ic_check(int end_alignment) {
5424 IncompressibleScope scope(this);
5425 Register receiver = j_rarg0;
5426 Register data = t0;
5427
5428 Register tmp1 = t1; // scratch
5429 // t2 is saved on call, thus should have been saved before this check.
5430 // Hence we can clobber it.
5431 Register tmp2 = t2;
5432
5433 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
5434 // before the inline cache check, so we don't have to execute any nop instructions when dispatching
5435 // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
5436 // before the inline cache check here, and not after
5437 align(end_alignment, ic_check_size());
5438 int uep_offset = offset();
5439
5440 if (UseCompactObjectHeaders) {
5441 load_narrow_klass_compact(tmp1, receiver);
5442 lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5443 } else {
5444 lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
5445 lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5446 }
5447
5448 Label ic_hit;
5449 beq(tmp1, tmp2, ic_hit);
5450 // Note, far_jump is not fixed size.
5451 // Is this ever generates a movptr alignment/size will be off.
5452 far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
5453 bind(ic_hit);
5454
5455 assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
5456 return uep_offset;
5457 }
5458
5459 // Emit an address stub for a call to a target which is too far away.
5460 // Note that we only put the target address of the call in the stub.
5461 //
5462 // code sequences:
5463 //
5464 // call-site:
5465 // load target address from stub
5466 // jump-and-link target address
5467 //
5468 // Related address stub for this call site in the stub section:
5469 // alignment nop
5470 // target address
5471
5472 address MacroAssembler::emit_reloc_call_address_stub(int insts_call_instruction_offset, address dest) {
5473 address stub = start_a_stub(max_reloc_call_address_stub_size());
5474 if (stub == nullptr) {
5475 return nullptr; // CodeBuffer::expand failed
5476 }
5477
5478 // We are always 4-byte aligned here.
5479 assert_alignment(pc());
5480
5481 // Make sure the address of destination 8-byte aligned.
5482 align(wordSize, 0);
5483
5484 RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
5485 insts_call_instruction_offset);
5486 const int stub_start_offset = offset();
5487 relocate(rh, [&] {
5488 assert(offset() - stub_start_offset == 0,
5489 "%ld - %ld == %ld : should be", (long)offset(), (long)stub_start_offset, (long)0);
5490 assert(offset() % wordSize == 0, "bad alignment");
5491 emit_int64((int64_t)dest);
5492 });
5493
5494 const address stub_start_addr = addr_at(stub_start_offset);
5495 end_a_stub();
5496
5497 return stub_start_addr;
5498 }
5499
5500 int MacroAssembler::max_reloc_call_address_stub_size() {
5501 // Max stub size: alignment nop, target address.
5502 return 1 * MacroAssembler::instruction_size + wordSize;
5503 }
5504
5505 int MacroAssembler::static_call_stub_size() {
5506 // (lui, addi, slli, addi, slli, addi) + (lui + lui + slli + add) + jalr
5507 return 11 * MacroAssembler::instruction_size;
5508 }
5509
5510 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
5511 switch (dst.getMode()) {
5512 case Address::base_plus_offset:
5513 // This is the expected mode, although we allow all the other
5514 // forms below.
5515 return form_address(tmp, dst.base(), dst.offset());
5516 default:
5517 la(tmp, dst);
5518 return Address(tmp);
5519 }
5520 }
5521
5522 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5523 assert(((dst.getMode() == Address::base_plus_offset &&
5524 is_simm12(dst.offset())) || is_simm12(value)),
5525 "invalid value and address mode combination");
5526 Address adr = add_memory_helper(dst, tmp2);
5527 assert(!adr.uses(tmp1), "invalid dst for address increment");
5528 ld(tmp1, adr);
5529 add(tmp1, tmp1, value, tmp2);
5530 sd(tmp1, adr);
5531 }
5532
5533 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5534 assert(((dst.getMode() == Address::base_plus_offset &&
5535 is_simm12(dst.offset())) || is_simm12(value)),
5536 "invalid value and address mode combination");
5537 Address adr = add_memory_helper(dst, tmp2);
5538 assert(!adr.uses(tmp1), "invalid dst for address increment");
5539 lwu(tmp1, adr);
5540 addw(tmp1, tmp1, value, tmp2);
5541 sw(tmp1, adr);
5542 }
5543
5544 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5545 assert(((dst.getMode() == Address::base_plus_offset &&
5546 is_simm12(dst.offset())) || is_simm12(value)),
5547 "invalid value and address mode combination");
5548 Address adr = add_memory_helper(dst, tmp2);
5549 assert(!adr.uses(tmp1), "invalid dst for address decrement");
5550 ld(tmp1, adr);
5551 sub(tmp1, tmp1, value, tmp2);
5552 sd(tmp1, adr);
5553 }
5554
5555 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5556 assert(((dst.getMode() == Address::base_plus_offset &&
5557 is_simm12(dst.offset())) || is_simm12(value)),
5558 "invalid value and address mode combination");
5559 Address adr = add_memory_helper(dst, tmp2);
5560 assert(!adr.uses(tmp1), "invalid dst for address decrement");
5561 lwu(tmp1, adr);
5562 subw(tmp1, tmp1, value, tmp2);
5563 sw(tmp1, adr);
5564 }
5565
5566 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
5567 load_method_holder(result, method);
5568 ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
5569 }
5570
5571 void MacroAssembler::load_method_holder(Register holder, Register method) {
5572 ld(holder, Address(method, Method::const_offset())); // ConstMethod*
5573 ld(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool*
5574 ld(holder, Address(holder, ConstantPool::pool_holder_offset())); // InstanceKlass*
5575 }
5576
5577 // string indexof
5578 // compute index by trailing zeros
5579 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
5580 Register match_mask, Register result,
5581 Register ch2, Register tmp,
5582 bool haystack_isL) {
5583 int haystack_chr_shift = haystack_isL ? 0 : 1;
5584 srl(match_mask, match_mask, trailing_zeros);
5585 srli(match_mask, match_mask, 1);
5586 srli(tmp, trailing_zeros, LogBitsPerByte);
5587 if (!haystack_isL) andi(tmp, tmp, 0xE);
5588 add(haystack, haystack, tmp);
5589 ld(ch2, Address(haystack));
5590 if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
5591 add(result, result, tmp);
5592 }
5593
5594 // string indexof
5595 // Find pattern element in src, compute match mask,
5596 // only the first occurrence of 0x80/0x8000 at low bits is the valid match index
5597 // match mask patterns and corresponding indices would be like:
5598 // - 0x8080808080808080 (Latin1)
5599 // - 7 6 5 4 3 2 1 0 (match index)
5600 // - 0x8000800080008000 (UTF16)
5601 // - 3 2 1 0 (match index)
5602 void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
5603 Register mask1, Register mask2) {
5604 xorr(src, pattern, src);
5605 sub(match_mask, src, mask1);
5606 orr(src, src, mask2);
5607 notr(src, src);
5608 andr(match_mask, match_mask, src);
5609 }
5610
5611 #ifdef COMPILER2
5612 // Code for BigInteger::mulAdd intrinsic
5613 // out = x10
5614 // in = x11
5615 // offset = x12 (already out.length-offset)
5616 // len = x13
5617 // k = x14
5618 // tmp = x28
5619 //
5620 // pseudo code from java implementation:
5621 // long kLong = k & LONG_MASK;
5622 // carry = 0;
5623 // offset = out.length-offset - 1;
5624 // for (int j = len - 1; j >= 0; j--) {
5625 // product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
5626 // out[offset--] = (int)product;
5627 // carry = product >>> 32;
5628 // }
5629 // return (int)carry;
5630 void MacroAssembler::mul_add(Register out, Register in, Register offset,
5631 Register len, Register k, Register tmp) {
5632 Label L_tail_loop, L_unroll, L_end;
5633 mv(tmp, out);
5634 mv(out, zr);
5635 blez(len, L_end);
5636 zext(k, k, 32);
5637 slliw(t0, offset, LogBytesPerInt);
5638 add(offset, tmp, t0);
5639 slliw(t0, len, LogBytesPerInt);
5640 add(in, in, t0);
5641
5642 const int unroll = 8;
5643 mv(tmp, unroll);
5644 blt(len, tmp, L_tail_loop);
5645 bind(L_unroll);
5646 for (int i = 0; i < unroll; i++) {
5647 subi(in, in, BytesPerInt);
5648 lwu(t0, Address(in, 0));
5649 mul(t1, t0, k);
5650 add(t0, t1, out);
5651 subi(offset, offset, BytesPerInt);
5652 lwu(t1, Address(offset, 0));
5653 add(t0, t0, t1);
5654 sw(t0, Address(offset, 0));
5655 srli(out, t0, 32);
5656 }
5657 subw(len, len, tmp);
5658 bge(len, tmp, L_unroll);
5659
5660 bind(L_tail_loop);
5661 blez(len, L_end);
5662 subi(in, in, BytesPerInt);
5663 lwu(t0, Address(in, 0));
5664 mul(t1, t0, k);
5665 add(t0, t1, out);
5666 subi(offset, offset, BytesPerInt);
5667 lwu(t1, Address(offset, 0));
5668 add(t0, t0, t1);
5669 sw(t0, Address(offset, 0));
5670 srli(out, t0, 32);
5671 subiw(len, len, 1);
5672 j(L_tail_loop);
5673
5674 bind(L_end);
5675 }
5676
5677 // Multiply and multiply-accumulate unsigned 64-bit registers.
5678 void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
5679 assert_different_registers(prod_lo, prod_hi);
5680
5681 mul(prod_lo, n, m);
5682 mulhu(prod_hi, n, m);
5683 }
5684
5685 void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
5686 Register m, Register tmp1, Register tmp2) {
5687 assert_different_registers(sum_lo, sum_hi);
5688 assert_different_registers(sum_hi, tmp2);
5689
5690 wide_mul(tmp1, tmp2, n, m);
5691 cad(sum_lo, sum_lo, tmp1, tmp1); // Add tmp1 to sum_lo with carry output to tmp1
5692 adc(sum_hi, sum_hi, tmp2, tmp1); // Add tmp2 with carry to sum_hi
5693 }
5694
5695 // add two unsigned input and output carry
5696 void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
5697 {
5698 assert_different_registers(dst, carry);
5699 assert_different_registers(dst, src2);
5700 add(dst, src1, src2);
5701 sltu(carry, dst, src2);
5702 }
5703
5704 // add two input with carry
5705 void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
5706 assert_different_registers(dst, carry);
5707 add(dst, src1, src2);
5708 add(dst, dst, carry);
5709 }
5710
5711 // add two unsigned input with carry and output carry
5712 void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
5713 assert_different_registers(dst, src2);
5714 adc(dst, src1, src2, carry);
5715 sltu(carry, dst, src2);
5716 }
5717
5718 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
5719 Register src1, Register src2, Register carry) {
5720 cad(dest_lo, dest_lo, src1, carry);
5721 add(dest_hi, dest_hi, carry);
5722 cad(dest_lo, dest_lo, src2, carry);
5723 add(final_dest_hi, dest_hi, carry);
5724 }
5725
5726 /**
5727 * Multiply 64 bit by 64 bit first loop.
5728 */
5729 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
5730 Register y, Register y_idx, Register z,
5731 Register carry, Register product,
5732 Register idx, Register kdx) {
5733 //
5734 // jlong carry, x[], y[], z[];
5735 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
5736 // huge_128 product = y[idx] * x[xstart] + carry;
5737 // z[kdx] = (jlong)product;
5738 // carry = (jlong)(product >>> 64);
5739 // }
5740 // z[xstart] = carry;
5741 //
5742
5743 Label L_first_loop, L_first_loop_exit;
5744 Label L_one_x, L_one_y, L_multiply;
5745
5746 subiw(xstart, xstart, 1);
5747 bltz(xstart, L_one_x);
5748
5749 shadd(t0, xstart, x, t0, LogBytesPerInt);
5750 ld(x_xstart, Address(t0, 0));
5751 ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
5752
5753 bind(L_first_loop);
5754 subiw(idx, idx, 1);
5755 bltz(idx, L_first_loop_exit);
5756 subiw(idx, idx, 1);
5757 bltz(idx, L_one_y);
5758
5759 shadd(t0, idx, y, t0, LogBytesPerInt);
5760 ld(y_idx, Address(t0, 0));
5761 ror(y_idx, y_idx, 32); // convert big-endian to little-endian
5762 bind(L_multiply);
5763
5764 mulhu(t0, x_xstart, y_idx);
5765 mul(product, x_xstart, y_idx);
5766 cad(product, product, carry, t1);
5767 adc(carry, t0, zr, t1);
5768
5769 subiw(kdx, kdx, 2);
5770 ror(product, product, 32); // back to big-endian
5771 shadd(t0, kdx, z, t0, LogBytesPerInt);
5772 sd(product, Address(t0, 0));
5773
5774 j(L_first_loop);
5775
5776 bind(L_one_y);
5777 lwu(y_idx, Address(y, 0));
5778 j(L_multiply);
5779
5780 bind(L_one_x);
5781 lwu(x_xstart, Address(x, 0));
5782 j(L_first_loop);
5783
5784 bind(L_first_loop_exit);
5785 }
5786
5787 /**
5788 * Multiply 128 bit by 128 bit. Unrolled inner loop.
5789 *
5790 */
5791 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
5792 Register carry, Register carry2,
5793 Register idx, Register jdx,
5794 Register yz_idx1, Register yz_idx2,
5795 Register tmp, Register tmp3, Register tmp4,
5796 Register tmp6, Register product_hi) {
5797 // jlong carry, x[], y[], z[];
5798 // int kdx = xstart+1;
5799 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
5800 // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
5801 // jlong carry2 = (jlong)(tmp3 >>> 64);
5802 // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2;
5803 // carry = (jlong)(tmp4 >>> 64);
5804 // z[kdx+idx+1] = (jlong)tmp3;
5805 // z[kdx+idx] = (jlong)tmp4;
5806 // }
5807 // idx += 2;
5808 // if (idx > 0) {
5809 // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
5810 // z[kdx+idx] = (jlong)yz_idx1;
5811 // carry = (jlong)(yz_idx1 >>> 64);
5812 // }
5813 //
5814
5815 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
5816
5817 srliw(jdx, idx, 2);
5818
5819 bind(L_third_loop);
5820
5821 subw(jdx, jdx, 1);
5822 bltz(jdx, L_third_loop_exit);
5823 subw(idx, idx, 4);
5824
5825 shadd(t0, idx, y, t0, LogBytesPerInt);
5826 ld(yz_idx2, Address(t0, 0));
5827 ld(yz_idx1, Address(t0, wordSize));
5828
5829 shadd(tmp6, idx, z, t0, LogBytesPerInt);
5830
5831 ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
5832 ror(yz_idx2, yz_idx2, 32);
5833
5834 ld(t1, Address(tmp6, 0));
5835 ld(t0, Address(tmp6, wordSize));
5836
5837 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
5838 mulhu(tmp4, product_hi, yz_idx1);
5839
5840 ror(t0, t0, 32, tmp); // convert big-endian to little-endian
5841 ror(t1, t1, 32, tmp);
5842
5843 mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp
5844 mulhu(carry2, product_hi, yz_idx2);
5845
5846 cad(tmp3, tmp3, carry, carry);
5847 adc(tmp4, tmp4, zr, carry);
5848 cad(tmp3, tmp3, t0, t0);
5849 cadc(tmp4, tmp4, tmp, t0);
5850 adc(carry, carry2, zr, t0);
5851 cad(tmp4, tmp4, t1, carry2);
5852 adc(carry, carry, zr, carry2);
5853
5854 ror(tmp3, tmp3, 32); // convert little-endian to big-endian
5855 ror(tmp4, tmp4, 32);
5856 sd(tmp4, Address(tmp6, 0));
5857 sd(tmp3, Address(tmp6, wordSize));
5858
5859 j(L_third_loop);
5860
5861 bind(L_third_loop_exit);
5862
5863 andi(idx, idx, 0x3);
5864 beqz(idx, L_post_third_loop_done);
5865
5866 Label L_check_1;
5867 subiw(idx, idx, 2);
5868 bltz(idx, L_check_1);
5869
5870 shadd(t0, idx, y, t0, LogBytesPerInt);
5871 ld(yz_idx1, Address(t0, 0));
5872 ror(yz_idx1, yz_idx1, 32);
5873
5874 mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
5875 mulhu(tmp4, product_hi, yz_idx1);
5876
5877 shadd(t0, idx, z, t0, LogBytesPerInt);
5878 ld(yz_idx2, Address(t0, 0));
5879 ror(yz_idx2, yz_idx2, 32, tmp);
5880
5881 add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);
5882
5883 ror(tmp3, tmp3, 32, tmp);
5884 sd(tmp3, Address(t0, 0));
5885
5886 bind(L_check_1);
5887
5888 andi(idx, idx, 0x1);
5889 subiw(idx, idx, 1);
5890 bltz(idx, L_post_third_loop_done);
5891 shadd(t0, idx, y, t0, LogBytesPerInt);
5892 lwu(tmp4, Address(t0, 0));
5893 mul(tmp3, tmp4, product_hi); // tmp4 * product_hi -> carry2:tmp3
5894 mulhu(carry2, tmp4, product_hi);
5895
5896 shadd(t0, idx, z, t0, LogBytesPerInt);
5897 lwu(tmp4, Address(t0, 0));
5898
5899 add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);
5900
5901 shadd(t0, idx, z, t0, LogBytesPerInt);
5902 sw(tmp3, Address(t0, 0));
5903
5904 slli(t0, carry2, 32);
5905 srli(carry, tmp3, 32);
5906 orr(carry, carry, t0);
5907
5908 bind(L_post_third_loop_done);
5909 }
5910
5911 /**
5912 * Code for BigInteger::multiplyToLen() intrinsic.
5913 *
5914 * x10: x
5915 * x11: xlen
5916 * x12: y
5917 * x13: ylen
5918 * x14: z
5919 * x15: tmp0
5920 * x16: tmp1
5921 * x17: tmp2
5922 * x7: tmp3
5923 * x28: tmp4
5924 * x29: tmp5
5925 * x30: tmp6
5926 * x31: tmp7
5927 */
5928 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
5929 Register z, Register tmp0,
5930 Register tmp1, Register tmp2, Register tmp3, Register tmp4,
5931 Register tmp5, Register tmp6, Register product_hi) {
5932 assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
5933
5934 const Register idx = tmp1;
5935 const Register kdx = tmp2;
5936 const Register xstart = tmp3;
5937
5938 const Register y_idx = tmp4;
5939 const Register carry = tmp5;
5940 const Register product = xlen;
5941 const Register x_xstart = tmp0;
5942 const Register jdx = tmp1;
5943
5944 mv(idx, ylen); // idx = ylen;
5945 addw(kdx, xlen, ylen); // kdx = xlen+ylen;
5946 mv(carry, zr); // carry = 0;
5947
5948 Label L_done;
5949 subiw(xstart, xlen, 1);
5950 bltz(xstart, L_done);
5951
5952 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
5953
5954 Label L_second_loop_aligned;
5955 beqz(kdx, L_second_loop_aligned);
5956
5957 Label L_carry;
5958 subiw(kdx, kdx, 1);
5959 beqz(kdx, L_carry);
5960
5961 shadd(t0, kdx, z, t0, LogBytesPerInt);
5962 sw(carry, Address(t0, 0));
5963 srli(carry, carry, 32);
5964 subiw(kdx, kdx, 1);
5965
5966 bind(L_carry);
5967 shadd(t0, kdx, z, t0, LogBytesPerInt);
5968 sw(carry, Address(t0, 0));
5969
5970 // Second and third (nested) loops.
5971 //
5972 // for (int i = xstart-1; i >= 0; i--) { // Second loop
5973 // carry = 0;
5974 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
5975 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
5976 // (z[k] & LONG_MASK) + carry;
5977 // z[k] = (int)product;
5978 // carry = product >>> 32;
5979 // }
5980 // z[i] = (int)carry;
5981 // }
5982 //
5983 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
5984
5985 bind(L_second_loop_aligned);
5986 mv(carry, zr); // carry = 0;
5987 mv(jdx, ylen); // j = ystart+1
5988
5989 subiw(xstart, xstart, 1); // i = xstart-1;
5990 bltz(xstart, L_done);
5991
5992 subi(sp, sp, 4 * wordSize);
5993 sd(z, Address(sp, 0));
5994
5995 Label L_last_x;
5996 shadd(t0, xstart, z, t0, LogBytesPerInt);
5997 addi(z, t0, 4);
5998 subiw(xstart, xstart, 1); // i = xstart-1;
5999 bltz(xstart, L_last_x);
6000
6001 shadd(t0, xstart, x, t0, LogBytesPerInt);
6002 ld(product_hi, Address(t0, 0));
6003 ror(product_hi, product_hi, 32); // convert big-endian to little-endian
6004
6005 Label L_third_loop_prologue;
6006 bind(L_third_loop_prologue);
6007
6008 sd(ylen, Address(sp, wordSize));
6009 sd(x, Address(sp, 2 * wordSize));
6010 sd(xstart, Address(sp, 3 * wordSize));
6011 multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
6012 tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
6013 ld(z, Address(sp, 0));
6014 ld(ylen, Address(sp, wordSize));
6015 ld(x, Address(sp, 2 * wordSize));
6016 ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
6017 addi(sp, sp, 4 * wordSize);
6018
6019 addiw(tmp3, xlen, 1);
6020 shadd(t0, tmp3, z, t0, LogBytesPerInt);
6021 sw(carry, Address(t0, 0));
6022
6023 subiw(tmp3, tmp3, 1);
6024 bltz(tmp3, L_done);
6025
6026 srli(carry, carry, 32);
6027 shadd(t0, tmp3, z, t0, LogBytesPerInt);
6028 sw(carry, Address(t0, 0));
6029 j(L_second_loop_aligned);
6030
6031 // Next infrequent code is moved outside loops.
6032 bind(L_last_x);
6033 lwu(product_hi, Address(x, 0));
6034 j(L_third_loop_prologue);
6035
6036 bind(L_done);
6037 }
6038 #endif
6039
6040 // Count bits of trailing zero chars from lsb to msb until first non-zero
6041 // char seen. For the LL case, shift 8 bits once as there is only one byte
6042 // per each char. For other cases, shift 16 bits once.
6043 void MacroAssembler::ctzc_bits(Register Rd, Register Rs, bool isLL,
6044 Register tmp1, Register tmp2) {
6045 int step = isLL ? 8 : 16;
6046 if (UseZbb) {
6047 ctz(Rd, Rs);
6048 andi(Rd, Rd, -step);
6049 return;
6050 }
6051
6052 assert_different_registers(Rd, tmp1, tmp2);
6053 Label Loop;
6054 mv(tmp2, Rs);
6055 mv(Rd, -step);
6056
6057 bind(Loop);
6058 addi(Rd, Rd, step);
6059 zext(tmp1, tmp2, step);
6060 srli(tmp2, tmp2, step);
6061 beqz(tmp1, Loop);
6062 }
6063
6064 // This instruction reads adjacent 4 bytes from the lower half of source register,
6065 // inflate into a register, for example:
6066 // Rs: A7A6A5A4A3A2A1A0
6067 // Rd: 00A300A200A100A0
6068 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
6069 assert_different_registers(Rd, Rs, tmp1, tmp2);
6070
6071 mv(tmp1, 0xFF000000); // first byte mask at lower word
6072 andr(Rd, Rs, tmp1);
6073 for (int i = 0; i < 2; i++) {
6074 slli(Rd, Rd, wordSize);
6075 srli(tmp1, tmp1, wordSize);
6076 andr(tmp2, Rs, tmp1);
6077 orr(Rd, Rd, tmp2);
6078 }
6079 slli(Rd, Rd, wordSize);
6080 zext(tmp2, Rs, 8); // last byte mask at lower word
6081 orr(Rd, Rd, tmp2);
6082 }
6083
6084 // This instruction reads adjacent 4 bytes from the upper half of source register,
6085 // inflate into a register, for example:
6086 // Rs: A7A6A5A4A3A2A1A0
6087 // Rd: 00A700A600A500A4
6088 void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
6089 assert_different_registers(Rd, Rs, tmp1, tmp2);
6090 srli(Rs, Rs, 32); // only upper 32 bits are needed
6091 inflate_lo32(Rd, Rs, tmp1, tmp2);
6092 }
6093
6094 // The size of the blocks erased by the zero_blocks stub. We must
6095 // handle anything smaller than this ourselves in zero_words().
6096 const int MacroAssembler::zero_words_block_size = 8;
6097
6098 // zero_words() is used by C2 ClearArray patterns. It is as small as
6099 // possible, handling small word counts locally and delegating
6100 // anything larger to the zero_blocks stub. It is expanded many times
6101 // in compiled code, so it is important to keep it short.
6102
6103 // ptr: Address of a buffer to be zeroed.
6104 // cnt: Count in HeapWords.
6105 //
6106 // ptr, cnt, t1, and t0 are clobbered.
6107 address MacroAssembler::zero_words(Register ptr, Register cnt) {
6108 assert(is_power_of_2(zero_words_block_size), "adjust this");
6109 assert(ptr == x28 && cnt == x29, "mismatch in register usage");
6110 assert_different_registers(cnt, t0, t1);
6111
6112 BLOCK_COMMENT("zero_words {");
6113
6114 mv(t0, zero_words_block_size);
6115 Label around, done, done16;
6116 bltu(cnt, t0, around);
6117 {
6118 RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
6119 assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
6120 if (StubRoutines::riscv::complete()) {
6121 address tpc = reloc_call(zero_blocks);
6122 if (tpc == nullptr) {
6123 DEBUG_ONLY(reset_labels(around));
6124 postcond(pc() == badAddress);
6125 return nullptr;
6126 }
6127 } else {
6128 // Clobbers t1
6129 rt_call(zero_blocks.target());
6130 }
6131 }
6132 bind(around);
6133 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
6134 Label l;
6135 test_bit(t0, cnt, exact_log2(i));
6136 beqz(t0, l);
6137 for (int j = 0; j < i; j++) {
6138 sd(zr, Address(ptr, j * wordSize));
6139 }
6140 addi(ptr, ptr, i * wordSize);
6141 bind(l);
6142 }
6143 {
6144 Label l;
6145 test_bit(t0, cnt, 0);
6146 beqz(t0, l);
6147 sd(zr, Address(ptr, 0));
6148 bind(l);
6149 }
6150
6151 BLOCK_COMMENT("} zero_words");
6152 postcond(pc() != badAddress);
6153 return pc();
6154 }
6155
6156 #define SmallArraySize (18 * BytesPerLong)
6157
6158 // base: Address of a buffer to be zeroed, 8 bytes aligned.
6159 // cnt: Immediate count in HeapWords.
6160 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
6161 assert_different_registers(base, t0, t1);
6162
6163 BLOCK_COMMENT("zero_words {");
6164
6165 if (cnt <= SmallArraySize / BytesPerLong) {
6166 for (int i = 0; i < (int)cnt; i++) {
6167 sd(zr, Address(base, i * wordSize));
6168 }
6169 } else {
6170 const int unroll = 8; // Number of sd(zr, adr), instructions we'll unroll
6171 int remainder = cnt % unroll;
6172 for (int i = 0; i < remainder; i++) {
6173 sd(zr, Address(base, i * wordSize));
6174 }
6175
6176 Label loop;
6177 Register cnt_reg = t0;
6178 Register loop_base = t1;
6179 cnt = cnt - remainder;
6180 mv(cnt_reg, cnt);
6181 addi(loop_base, base, remainder * wordSize);
6182 bind(loop);
6183 sub(cnt_reg, cnt_reg, unroll);
6184 for (int i = 0; i < unroll; i++) {
6185 sd(zr, Address(loop_base, i * wordSize));
6186 }
6187 addi(loop_base, loop_base, unroll * wordSize);
6188 bnez(cnt_reg, loop);
6189 }
6190
6191 BLOCK_COMMENT("} zero_words");
6192 }
6193
6194 // base: Address of a buffer to be filled, 8 bytes aligned.
6195 // cnt: Count in 8-byte unit.
6196 // value: Value to be filled with.
6197 // base will point to the end of the buffer after filling.
6198 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
6199 // Algorithm:
6200 //
6201 // t0 = cnt & 7
6202 // cnt -= t0
6203 // p += t0
6204 // switch (t0):
6205 // switch start:
6206 // do while cnt
6207 // cnt -= 8
6208 // p[-8] = value
6209 // case 7:
6210 // p[-7] = value
6211 // case 6:
6212 // p[-6] = value
6213 // // ...
6214 // case 1:
6215 // p[-1] = value
6216 // case 0:
6217 // p += 8
6218 // do-while end
6219 // switch end
6220
6221 assert_different_registers(base, cnt, value, t0, t1);
6222
6223 Label fini, skip, entry, loop;
6224 const int unroll = 8; // Number of sd instructions we'll unroll
6225
6226 beqz(cnt, fini);
6227
6228 andi(t0, cnt, unroll - 1);
6229 sub(cnt, cnt, t0);
6230 shadd(base, t0, base, t1, 3);
6231 la(t1, entry);
6232 slli(t0, t0, 2);
6233 sub(t1, t1, t0);
6234 jr(t1);
6235
6236 bind(loop);
6237 addi(base, base, unroll * wordSize);
6238 {
6239 IncompressibleScope scope(this); // Fixed length
6240 for (int i = -unroll; i < 0; i++) {
6241 sd(value, Address(base, i * 8));
6242 }
6243 }
6244 bind(entry);
6245 subi(cnt, cnt, unroll);
6246 bgez(cnt, loop);
6247
6248 bind(fini);
6249 }
6250
6251 // Zero blocks of memory by using CBO.ZERO.
6252 //
6253 // Aligns the base address first sufficiently for CBO.ZERO, then uses
6254 // CBO.ZERO repeatedly for every full block. cnt is the size to be
6255 // zeroed in HeapWords. Returns the count of words left to be zeroed
6256 // in cnt.
6257 //
6258 // NOTE: This is intended to be used in the zero_blocks() stub. If
6259 // you want to use it elsewhere, note that cnt must be >= zicboz_block_size.
6260 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
6261 int zicboz_block_size = VM_Version::zicboz_block_size.value();
6262 Label initial_table_end, loop;
6263
6264 // Align base with cache line size.
6265 neg(tmp1, base);
6266 andi(tmp1, tmp1, zicboz_block_size - 1);
6267
6268 // tmp1: the number of bytes to be filled to align the base with cache line size.
6269 add(base, base, tmp1);
6270 srai(tmp2, tmp1, 3);
6271 sub(cnt, cnt, tmp2);
6272 srli(tmp2, tmp1, 1);
6273 la(tmp1, initial_table_end);
6274 sub(tmp2, tmp1, tmp2);
6275 jr(tmp2);
6276 for (int i = -zicboz_block_size + wordSize; i < 0; i += wordSize) {
6277 sd(zr, Address(base, i));
6278 }
6279 bind(initial_table_end);
6280
6281 mv(tmp1, zicboz_block_size / wordSize);
6282 bind(loop);
6283 cbo_zero(base);
6284 sub(cnt, cnt, tmp1);
6285 addi(base, base, zicboz_block_size);
6286 bge(cnt, tmp1, loop);
6287 }
6288
6289 // java.lang.Math.round(float a)
6290 // Returns the closest int to the argument, with ties rounding to positive infinity.
6291 void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
6292 // this instructions calling sequence provides performance improvement on all tested devices;
6293 // don't change it without re-verification
6294 Label done;
6295 mv(t0, jint_cast(0.5f));
6296 fmv_w_x(ftmp, t0);
6297
6298 // dst = 0 if NaN
6299 feq_s(t0, src, src); // replacing fclass with feq as performance optimization
6300 mv(dst, zr);
6301 beqz(t0, done);
6302
6303 // dst = (src + 0.5f) rounded down towards negative infinity
6304 // Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
6305 // RDN is required for fadd_s, RNE gives incorrect results:
6306 // --------------------------------------------------------------------
6307 // fadd.s rne (src + 0.5f): src = 8388609.000000 ftmp = 8388610.000000
6308 // fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
6309 // --------------------------------------------------------------------
6310 // fadd.s rdn (src + 0.5f): src = 8388609.000000 ftmp = 8388609.000000
6311 // fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
6312 // --------------------------------------------------------------------
6313 fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
6314 fcvt_w_s(dst, ftmp, RoundingMode::rdn);
6315
6316 bind(done);
6317 }
6318
6319 // java.lang.Math.round(double a)
6320 // Returns the closest long to the argument, with ties rounding to positive infinity.
6321 void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
6322 // this instructions calling sequence provides performance improvement on all tested devices;
6323 // don't change it without re-verification
6324 Label done;
6325 mv(t0, julong_cast(0.5));
6326 fmv_d_x(ftmp, t0);
6327
6328 // dst = 0 if NaN
6329 feq_d(t0, src, src); // replacing fclass with feq as performance optimization
6330 mv(dst, zr);
6331 beqz(t0, done);
6332
6333 // dst = (src + 0.5) rounded down towards negative infinity
6334 fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
6335 fcvt_l_d(dst, ftmp, RoundingMode::rdn);
6336
6337 bind(done);
6338 }
6339
6340 // Helper routine processing the slow path of NaN when converting float to float16
6341 void MacroAssembler::float_to_float16_NaN(Register dst, FloatRegister src,
6342 Register tmp1, Register tmp2) {
6343 fmv_x_w(dst, src);
6344
6345 // Float (32 bits)
6346 // Bit: 31 30 to 23 22 to 0
6347 // +---+------------------+-----------------------------+
6348 // | S | Exponent | Mantissa (Fraction) |
6349 // +---+------------------+-----------------------------+
6350 // 1 bit 8 bits 23 bits
6351 //
6352 // Float (16 bits)
6353 // Bit: 15 14 to 10 9 to 0
6354 // +---+----------------+------------------+
6355 // | S | Exponent | Mantissa |
6356 // +---+----------------+------------------+
6357 // 1 bit 5 bits 10 bits
6358 const int fp_sign_bits = 1;
6359 const int fp32_bits = 32;
6360 const int fp32_exponent_bits = 8;
6361 const int fp32_mantissa_1st_part_bits = 10;
6362 const int fp32_mantissa_2nd_part_bits = 9;
6363 const int fp32_mantissa_3rd_part_bits = 4;
6364 const int fp16_exponent_bits = 5;
6365 const int fp16_mantissa_bits = 10;
6366
6367 // preserve the sign bit and exponent, clear mantissa.
6368 srai(tmp2, dst, fp32_bits - fp_sign_bits - fp16_exponent_bits);
6369 slli(tmp2, tmp2, fp16_mantissa_bits);
6370
6371 // Preserve high order bit of float NaN in the
6372 // binary16 result NaN (tenth bit); OR in remaining
6373 // bits into lower 9 bits of binary 16 significand.
6374 // | (doppel & 0x007f_e000) >> 13 // 10 bits
6375 // | (doppel & 0x0000_1ff0) >> 4 // 9 bits
6376 // | (doppel & 0x0000_000f)); // 4 bits
6377 //
6378 // Check j.l.Float.floatToFloat16 for more information.
6379 // 10 bits
6380 int left_shift = fp_sign_bits + fp32_exponent_bits + 32;
6381 int right_shift = left_shift + fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits;
6382 slli(tmp1, dst, left_shift);
6383 srli(tmp1, tmp1, right_shift);
6384 orr(tmp2, tmp2, tmp1);
6385 // 9 bits
6386 left_shift += fp32_mantissa_1st_part_bits;
6387 right_shift = left_shift + fp32_mantissa_3rd_part_bits;
6388 slli(tmp1, dst, left_shift);
6389 srli(tmp1, tmp1, right_shift);
6390 orr(tmp2, tmp2, tmp1);
6391 // 4 bits
6392 andi(tmp1, dst, 0xf);
6393 orr(dst, tmp2, tmp1);
6394 }
6395
6396 #define FCVT_SAFE(FLOATCVT, FLOATSIG) \
6397 void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) { \
6398 Label done; \
6399 assert_different_registers(dst, tmp); \
6400 fclass_##FLOATSIG(tmp, src); \
6401 mv(dst, zr); \
6402 /* check if src is NaN */ \
6403 andi(tmp, tmp, FClassBits::nan); \
6404 bnez(tmp, done); \
6405 FLOATCVT(dst, src); \
6406 bind(done); \
6407 }
6408
6409 FCVT_SAFE(fcvt_w_s, s);
6410 FCVT_SAFE(fcvt_l_s, s);
6411 FCVT_SAFE(fcvt_w_d, d);
6412 FCVT_SAFE(fcvt_l_d, d);
6413
6414 #undef FCVT_SAFE
6415
6416 #define FCMP(FLOATTYPE, FLOATSIG) \
6417 void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1, \
6418 FloatRegister Rs2, int unordered_result) { \
6419 Label Ldone; \
6420 if (unordered_result < 0) { \
6421 /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */ \
6422 /* installs 1 if gt else 0 */ \
6423 flt_##FLOATSIG(result, Rs2, Rs1); \
6424 /* Rs1 > Rs2, install 1 */ \
6425 bgtz(result, Ldone); \
6426 feq_##FLOATSIG(result, Rs1, Rs2); \
6427 subi(result, result, 1); \
6428 /* Rs1 = Rs2, install 0 */ \
6429 /* NaN or Rs1 < Rs2, install -1 */ \
6430 bind(Ldone); \
6431 } else { \
6432 /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */ \
6433 /* installs 1 if gt or unordered else 0 */ \
6434 flt_##FLOATSIG(result, Rs1, Rs2); \
6435 /* Rs1 < Rs2, install -1 */ \
6436 bgtz(result, Ldone); \
6437 feq_##FLOATSIG(result, Rs1, Rs2); \
6438 subi(result, result, 1); \
6439 /* Rs1 = Rs2, install 0 */ \
6440 /* NaN or Rs1 > Rs2, install 1 */ \
6441 bind(Ldone); \
6442 neg(result, result); \
6443 } \
6444 }
6445
6446 FCMP(float, s);
6447 FCMP(double, d);
6448
6449 #undef FCMP
6450
6451 // Zero words; len is in bytes
6452 // Destroys all registers except addr
6453 // len must be a nonzero multiple of wordSize
6454 void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
6455 assert_different_registers(addr, len, tmp, t0, t1);
6456
6457 #ifdef ASSERT
6458 {
6459 Label L;
6460 andi(t0, len, BytesPerWord - 1);
6461 beqz(t0, L);
6462 stop("len is not a multiple of BytesPerWord");
6463 bind(L);
6464 }
6465 #endif // ASSERT
6466
6467 #ifndef PRODUCT
6468 block_comment("zero memory");
6469 #endif // PRODUCT
6470
6471 Label loop;
6472 Label entry;
6473
6474 // Algorithm:
6475 //
6476 // t0 = cnt & 7
6477 // cnt -= t0
6478 // p += t0
6479 // switch (t0) {
6480 // do {
6481 // cnt -= 8
6482 // p[-8] = 0
6483 // case 7:
6484 // p[-7] = 0
6485 // case 6:
6486 // p[-6] = 0
6487 // ...
6488 // case 1:
6489 // p[-1] = 0
6490 // case 0:
6491 // p += 8
6492 // } while (cnt)
6493 // }
6494
6495 const int unroll = 8; // Number of sd(zr) instructions we'll unroll
6496
6497 srli(len, len, LogBytesPerWord);
6498 andi(t0, len, unroll - 1); // t0 = cnt % unroll
6499 sub(len, len, t0); // cnt -= unroll
6500 // tmp always points to the end of the region we're about to zero
6501 shadd(tmp, t0, addr, t1, LogBytesPerWord);
6502 la(t1, entry);
6503 slli(t0, t0, 2);
6504 sub(t1, t1, t0);
6505 jr(t1);
6506
6507 bind(loop);
6508 sub(len, len, unroll);
6509 {
6510 IncompressibleScope scope(this); // Fixed length
6511 for (int i = -unroll; i < 0; i++) {
6512 sd(zr, Address(tmp, i * wordSize));
6513 }
6514 }
6515 bind(entry);
6516 add(tmp, tmp, unroll * wordSize);
6517 bnez(len, loop);
6518 }
6519
6520 // shift left by shamt and add
6521 // Rd = (Rs1 << shamt) + Rs2
6522 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
6523 if (UseZba) {
6524 if (shamt == 1) {
6525 sh1add(Rd, Rs1, Rs2);
6526 return;
6527 } else if (shamt == 2) {
6528 sh2add(Rd, Rs1, Rs2);
6529 return;
6530 } else if (shamt == 3) {
6531 sh3add(Rd, Rs1, Rs2);
6532 return;
6533 }
6534 }
6535
6536 if (shamt != 0) {
6537 assert_different_registers(Rs2, tmp);
6538 slli(tmp, Rs1, shamt);
6539 add(Rd, Rs2, tmp);
6540 } else {
6541 add(Rd, Rs1, Rs2);
6542 }
6543 }
6544
6545 void MacroAssembler::zext(Register dst, Register src, int bits) {
6546 switch (bits) {
6547 case 32:
6548 if (UseZba) {
6549 zext_w(dst, src);
6550 return;
6551 }
6552 break;
6553 case 16:
6554 if (UseZbb) {
6555 zext_h(dst, src);
6556 return;
6557 }
6558 break;
6559 case 8:
6560 zext_b(dst, src);
6561 return;
6562 default:
6563 break;
6564 }
6565
6566 slli(dst, src, XLEN - bits);
6567 srli(dst, dst, XLEN - bits);
6568 }
6569
6570 void MacroAssembler::sext(Register dst, Register src, int bits) {
6571 switch (bits) {
6572 case 32:
6573 sext_w(dst, src);
6574 return;
6575 case 16:
6576 if (UseZbb) {
6577 sext_h(dst, src);
6578 return;
6579 }
6580 break;
6581 case 8:
6582 if (UseZbb) {
6583 sext_b(dst, src);
6584 return;
6585 }
6586 break;
6587 default:
6588 break;
6589 }
6590
6591 slli(dst, src, XLEN - bits);
6592 srai(dst, dst, XLEN - bits);
6593 }
6594
6595 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
6596 Register tmp, bool is_signed) {
6597 if (src1 == src2) {
6598 mv(dst, zr);
6599 return;
6600 }
6601 Label done;
6602 Register left = src1;
6603 Register right = src2;
6604 if (dst == src1) {
6605 assert_different_registers(dst, src2, tmp);
6606 mv(tmp, src1);
6607 left = tmp;
6608 } else if (dst == src2) {
6609 assert_different_registers(dst, src1, tmp);
6610 mv(tmp, src2);
6611 right = tmp;
6612 }
6613
6614 // installs 1 if gt else 0
6615 if (is_signed) {
6616 slt(dst, right, left);
6617 } else {
6618 sltu(dst, right, left);
6619 }
6620 bnez(dst, done);
6621 if (is_signed) {
6622 slt(dst, left, right);
6623 } else {
6624 sltu(dst, left, right);
6625 }
6626 // dst = -1 if lt; else if eq , dst = 0
6627 neg(dst, dst);
6628 bind(done);
6629 }
6630
6631 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
6632 {
6633 cmp_x2i(dst, src1, src2, tmp);
6634 }
6635
6636 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
6637 cmp_x2i(dst, src1, src2, tmp, false);
6638 }
6639
6640 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
6641 cmp_x2i(dst, src1, src2, tmp, false);
6642 }
6643
6644 // The java_calling_convention describes stack locations as ideal slots on
6645 // a frame with no abi restrictions. Since we must observe abi restrictions
6646 // (like the placement of the register window) the slots must be biased by
6647 // the following value.
6648 static int reg2offset_in(VMReg r) {
6649 // Account for saved fp and ra
6650 // This should really be in_preserve_stack_slots
6651 return r->reg2stack() * VMRegImpl::stack_slot_size;
6652 }
6653
6654 static int reg2offset_out(VMReg r) {
6655 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
6656 }
6657
6658 // The C ABI specifies:
6659 // "integer scalars narrower than XLEN bits are widened according to the sign
6660 // of their type up to 32 bits, then sign-extended to XLEN bits."
6661 // Applies for both passed in register and stack.
6662 //
6663 // Java uses 32-bit stack slots; jint, jshort, jchar, jbyte uses one slot.
6664 // Native uses 64-bit stack slots for all integer scalar types.
6665 //
6666 // lw loads the Java stack slot, sign-extends and
6667 // sd store this widened integer into a 64 bit native stack slot.
6668 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
6669 if (src.first()->is_stack()) {
6670 if (dst.first()->is_stack()) {
6671 // stack to stack
6672 lw(tmp, Address(fp, reg2offset_in(src.first())));
6673 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6674 } else {
6675 // stack to reg
6676 lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6677 }
6678 } else if (dst.first()->is_stack()) {
6679 // reg to stack
6680 sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6681 } else {
6682 if (dst.first() != src.first()) {
6683 sext(dst.first()->as_Register(), src.first()->as_Register(), 32);
6684 }
6685 }
6686 }
6687
6688 // An oop arg. Must pass a handle not the oop itself
6689 void MacroAssembler::object_move(OopMap* map,
6690 int oop_handle_offset,
6691 int framesize_in_slots,
6692 VMRegPair src,
6693 VMRegPair dst,
6694 bool is_receiver,
6695 int* receiver_offset) {
6696 assert_cond(map != nullptr && receiver_offset != nullptr);
6697
6698 // must pass a handle. First figure out the location we use as a handle
6699 Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
6700
6701 // See if oop is null if it is we need no handle
6702
6703 if (src.first()->is_stack()) {
6704 // Oop is already on the stack as an argument
6705 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
6706 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
6707 if (is_receiver) {
6708 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
6709 }
6710
6711 ld(t0, Address(fp, reg2offset_in(src.first())));
6712 la(rHandle, Address(fp, reg2offset_in(src.first())));
6713 // conditionally move a null
6714 Label notZero1;
6715 bnez(t0, notZero1);
6716 mv(rHandle, zr);
6717 bind(notZero1);
6718 } else {
6719
6720 // Oop is in a register we must store it to the space we reserve
6721 // on the stack for oop_handles and pass a handle if oop is non-null
6722
6723 const Register rOop = src.first()->as_Register();
6724 int oop_slot = -1;
6725 if (rOop == j_rarg0) {
6726 oop_slot = 0;
6727 } else if (rOop == j_rarg1) {
6728 oop_slot = 1;
6729 } else if (rOop == j_rarg2) {
6730 oop_slot = 2;
6731 } else if (rOop == j_rarg3) {
6732 oop_slot = 3;
6733 } else if (rOop == j_rarg4) {
6734 oop_slot = 4;
6735 } else if (rOop == j_rarg5) {
6736 oop_slot = 5;
6737 } else if (rOop == j_rarg6) {
6738 oop_slot = 6;
6739 } else {
6740 assert(rOop == j_rarg7, "wrong register");
6741 oop_slot = 7;
6742 }
6743
6744 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
6745 int offset = oop_slot * VMRegImpl::stack_slot_size;
6746
6747 map->set_oop(VMRegImpl::stack2reg(oop_slot));
6748 // Store oop in handle area, may be null
6749 sd(rOop, Address(sp, offset));
6750 if (is_receiver) {
6751 *receiver_offset = offset;
6752 }
6753
6754 //rOop maybe the same as rHandle
6755 if (rOop == rHandle) {
6756 Label isZero;
6757 beqz(rOop, isZero);
6758 la(rHandle, Address(sp, offset));
6759 bind(isZero);
6760 } else {
6761 Label notZero2;
6762 la(rHandle, Address(sp, offset));
6763 bnez(rOop, notZero2);
6764 mv(rHandle, zr);
6765 bind(notZero2);
6766 }
6767 }
6768
6769 // If arg is on the stack then place it otherwise it is already in correct reg.
6770 if (dst.first()->is_stack()) {
6771 sd(rHandle, Address(sp, reg2offset_out(dst.first())));
6772 }
6773 }
6774
6775 // A float arg may have to do float reg int reg conversion
6776 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
6777 assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6778 (src.first()->is_reg() && dst.first()->is_reg()) ||
6779 (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6780 if (src.first()->is_stack()) {
6781 if (dst.first()->is_stack()) {
6782 lwu(tmp, Address(fp, reg2offset_in(src.first())));
6783 sw(tmp, Address(sp, reg2offset_out(dst.first())));
6784 } else if (dst.first()->is_Register()) {
6785 lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6786 } else {
6787 ShouldNotReachHere();
6788 }
6789 } else if (src.first() != dst.first()) {
6790 if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6791 fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6792 } else {
6793 ShouldNotReachHere();
6794 }
6795 }
6796 }
6797
6798 // A long move
6799 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
6800 if (src.first()->is_stack()) {
6801 if (dst.first()->is_stack()) {
6802 // stack to stack
6803 ld(tmp, Address(fp, reg2offset_in(src.first())));
6804 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6805 } else {
6806 // stack to reg
6807 ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6808 }
6809 } else if (dst.first()->is_stack()) {
6810 // reg to stack
6811 sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6812 } else {
6813 if (dst.first() != src.first()) {
6814 mv(dst.first()->as_Register(), src.first()->as_Register());
6815 }
6816 }
6817 }
6818
6819 // A double move
6820 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
6821 assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6822 (src.first()->is_reg() && dst.first()->is_reg()) ||
6823 (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6824 if (src.first()->is_stack()) {
6825 if (dst.first()->is_stack()) {
6826 ld(tmp, Address(fp, reg2offset_in(src.first())));
6827 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6828 } else if (dst.first()-> is_Register()) {
6829 ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6830 } else {
6831 ShouldNotReachHere();
6832 }
6833 } else if (src.first() != dst.first()) {
6834 if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6835 fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6836 } else {
6837 ShouldNotReachHere();
6838 }
6839 }
6840 }
6841
6842 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
6843 assert(bit_pos < 64, "invalid bit range");
6844 if (UseZbs) {
6845 bexti(Rd, Rs, bit_pos);
6846 return;
6847 }
6848 int64_t imm = (int64_t)(1UL << bit_pos);
6849 if (is_simm12(imm)) {
6850 andi(Rd, Rs, imm);
6851 } else {
6852 srli(Rd, Rs, bit_pos);
6853 andi(Rd, Rd, 1);
6854 }
6855 }
6856
6857 // Implements fast-locking.
6858 //
6859 // - obj: the object to be locked
6860 // - tmp1, tmp2, tmp3: temporary registers, will be destroyed
6861 // - slow: branched to if locking fails
6862 void MacroAssembler::fast_lock(Register basic_lock, Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6863 assert_different_registers(basic_lock, obj, tmp1, tmp2, tmp3, t0);
6864
6865 Label push;
6866 const Register top = tmp1;
6867 const Register mark = tmp2;
6868 const Register t = tmp3;
6869
6870 // Preload the markWord. It is important that this is the first
6871 // instruction emitted as it is part of C1's null check semantics.
6872 ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6873
6874 if (UseObjectMonitorTable) {
6875 // Clear cache in case fast locking succeeds or we need to take the slow-path.
6876 sd(zr, Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))));
6877 }
6878
6879 if (DiagnoseSyncOnValueBasedClasses != 0) {
6880 load_klass(tmp1, obj);
6881 lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
6882 test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
6883 bnez(tmp1, slow, /* is_far */ true);
6884 }
6885
6886 // Check if the lock-stack is full.
6887 lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6888 mv(t, (unsigned)LockStack::end_offset());
6889 bge(top, t, slow, /* is_far */ true);
6890
6891 // Check for recursion.
6892 add(t, xthread, top);
6893 ld(t, Address(t, -oopSize));
6894 beq(obj, t, push);
6895
6896 // Check header for monitor (0b10).
6897 test_bit(t, mark, exact_log2(markWord::monitor_value));
6898 bnez(t, slow, /* is_far */ true);
6899
6900 // Try to lock. Transition lock-bits 0b01 => 0b00
6901 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
6902 ori(mark, mark, markWord::unlocked_value);
6903 xori(t, mark, markWord::unlocked_value);
6904 cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6905 /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
6906 bne(mark, t, slow, /* is_far */ true);
6907
6908 bind(push);
6909 // After successful lock, push object on lock-stack.
6910 add(t, xthread, top);
6911 sd(obj, Address(t));
6912 addiw(top, top, oopSize);
6913 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6914 }
6915
6916 // Implements ligthweight-unlocking.
6917 //
6918 // - obj: the object to be unlocked
6919 // - tmp1, tmp2, tmp3: temporary registers
6920 // - slow: branched to if unlocking fails
6921 void MacroAssembler::fast_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6922 assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
6923
6924 #ifdef ASSERT
6925 {
6926 // Check for lock-stack underflow.
6927 Label stack_ok;
6928 lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
6929 mv(tmp2, (unsigned)LockStack::start_offset());
6930 bge(tmp1, tmp2, stack_ok);
6931 STOP("Lock-stack underflow");
6932 bind(stack_ok);
6933 }
6934 #endif
6935
6936 Label unlocked, push_and_slow;
6937 const Register top = tmp1;
6938 const Register mark = tmp2;
6939 const Register t = tmp3;
6940
6941 // Check if obj is top of lock-stack.
6942 lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6943 subiw(top, top, oopSize);
6944 add(t, xthread, top);
6945 ld(t, Address(t));
6946 bne(obj, t, slow, /* is_far */ true);
6947
6948 // Pop lock-stack.
6949 DEBUG_ONLY(add(t, xthread, top);)
6950 DEBUG_ONLY(sd(zr, Address(t));)
6951 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6952
6953 // Check if recursive.
6954 add(t, xthread, top);
6955 ld(t, Address(t, -oopSize));
6956 beq(obj, t, unlocked);
6957
6958 // Not recursive. Check header for monitor (0b10).
6959 ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6960 test_bit(t, mark, exact_log2(markWord::monitor_value));
6961 bnez(t, push_and_slow);
6962
6963 #ifdef ASSERT
6964 // Check header not unlocked (0b01).
6965 Label not_unlocked;
6966 test_bit(t, mark, exact_log2(markWord::unlocked_value));
6967 beqz(t, not_unlocked);
6968 stop("fast_unlock already unlocked");
6969 bind(not_unlocked);
6970 #endif
6971
6972 // Try to unlock. Transition lock bits 0b00 => 0b01
6973 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
6974 ori(t, mark, markWord::unlocked_value);
6975 cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6976 /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
6977 beq(mark, t, unlocked);
6978
6979 bind(push_and_slow);
6980 // Restore lock-stack and handle the unlock in runtime.
6981 DEBUG_ONLY(add(t, xthread, top);)
6982 DEBUG_ONLY(sd(obj, Address(t));)
6983 addiw(top, top, oopSize);
6984 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6985 j(slow);
6986
6987 bind(unlocked);
6988 }
6989
6990 // Unimplemented methods for inline types.
6991 int MacroAssembler::store_inline_type_fields_to_buf(ciInlineKlass* vk, bool from_interpreter) {
6992 Unimplemented();
6993 }
6994
6995 bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[]) {
6996 Unimplemented();
6997 }
6998
6999 bool MacroAssembler::unpack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index,
7000 VMReg from, int& from_index, VMRegPair* to, int to_count, int& to_index,
7001 RegState reg_state[]) {
7002 Unimplemented();
7003 }
7004
7005 bool MacroAssembler::pack_inline_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
7006 VMRegPair* from, int from_count, int& from_index, VMReg to,
7007 RegState reg_state[], Register val_array) {
7008 Unimplemented();
7009 }
7010
7011 int MacroAssembler::extend_stack_for_inline_args(int args_on_stack) {
7012 Unimplemented();
7013 }
7014
7015 VMReg MacroAssembler::spill_reg_for(VMReg reg) {
7016 Unimplemented();
7017 }