1 /*
2 * Copyright (c) 1997, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
4 * Copyright (c) 2020, 2024, Huawei Technologies Co., Ltd. All rights reserved.
5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 *
7 * This code is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 only, as
9 * published by the Free Software Foundation.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 *
25 */
26
27 #include "asm/assembler.hpp"
28 #include "asm/assembler.inline.hpp"
29 #include "code/compiledIC.hpp"
30 #include "compiler/disassembler.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/cardTable.hpp"
34 #include "gc/shared/cardTableBarrierSet.hpp"
35 #include "gc/shared/collectedHeap.hpp"
36 #include "interpreter/bytecodeHistogram.hpp"
37 #include "interpreter/interpreter.hpp"
38 #include "interpreter/interpreterRuntime.hpp"
39 #include "memory/resourceArea.hpp"
40 #include "memory/universe.hpp"
41 #include "oops/accessDecorators.hpp"
42 #include "oops/compressedKlass.inline.hpp"
43 #include "oops/compressedOops.inline.hpp"
44 #include "oops/klass.inline.hpp"
45 #include "oops/oop.hpp"
46 #include "runtime/interfaceSupport.inline.hpp"
47 #include "runtime/javaThread.hpp"
48 #include "runtime/jniHandles.inline.hpp"
49 #include "runtime/sharedRuntime.hpp"
50 #include "runtime/stubRoutines.hpp"
51 #include "utilities/globalDefinitions.hpp"
52 #include "utilities/integerCast.hpp"
53 #include "utilities/powerOfTwo.hpp"
54 #ifdef COMPILER2
55 #include "opto/compile.hpp"
56 #include "opto/node.hpp"
57 #include "opto/output.hpp"
58 #endif
59
// BLOCK_COMMENT(str) expands to nothing when PRODUCT is defined and to
// block_comment(str) otherwise.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif
// STOP(str) expands to a stop(str) call and already includes the trailing
// semicolon.
#define STOP(str) stop(str);
// BIND(label) expands to TWO statements: bind(label), followed by a
// BLOCK_COMMENT carrying the label's name. Do not use it as the body of an
// unbraced if/else. The expansion references `__`, which is not defined in
// this part of the file and must be provided at the point of use.
#define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
67
68
69
70 Register MacroAssembler::extract_rs1(address instr) {
71 assert_cond(instr != nullptr);
72 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 19, 15));
73 }
74
75 Register MacroAssembler::extract_rs2(address instr) {
76 assert_cond(instr != nullptr);
77 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 24, 20));
78 }
79
80 Register MacroAssembler::extract_rd(address instr) {
81 assert_cond(instr != nullptr);
82 return as_Register(Assembler::extract(Assembler::ld_instr(instr), 11, 7));
83 }
84
85 uint32_t MacroAssembler::extract_opcode(address instr) {
86 assert_cond(instr != nullptr);
87 return Assembler::extract(Assembler::ld_instr(instr), 6, 0);
88 }
89
90 uint32_t MacroAssembler::extract_funct3(address instr) {
91 assert_cond(instr != nullptr);
92 return Assembler::extract(Assembler::ld_instr(instr), 14, 12);
93 }
94
95 bool MacroAssembler::is_pc_relative_at(address instr) {
96 // auipc + jalr
97 // auipc + addi
98 // auipc + load
99 // auipc + fload_load
100 return (is_auipc_at(instr)) &&
101 (is_addi_at(instr + MacroAssembler::instruction_size) ||
102 is_jalr_at(instr + MacroAssembler::instruction_size) ||
103 is_load_at(instr + MacroAssembler::instruction_size) ||
104 is_float_load_at(instr + MacroAssembler::instruction_size)) &&
105 check_pc_relative_data_dependency(instr);
106 }
107
108 // ie:ld(Rd, Label)
109 bool MacroAssembler::is_load_pc_relative_at(address instr) {
110 return is_auipc_at(instr) && // auipc
111 is_ld_at(instr + MacroAssembler::instruction_size) && // ld
112 check_load_pc_relative_data_dependency(instr);
113 }
114
115 bool MacroAssembler::is_movptr1_at(address instr) {
116 return is_lui_at(instr) && // Lui
117 is_addi_at(instr + MacroAssembler::instruction_size) && // Addi
118 is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 11) && // Slli Rd, Rs, 11
119 is_addi_at(instr + MacroAssembler::instruction_size * 3) && // Addi
120 is_slli_shift_at(instr + MacroAssembler::instruction_size * 4, 6) && // Slli Rd, Rs, 6
121 (is_addi_at(instr + MacroAssembler::instruction_size * 5) ||
122 is_jalr_at(instr + MacroAssembler::instruction_size * 5) ||
123 is_load_at(instr + MacroAssembler::instruction_size * 5)) && // Addi/Jalr/Load
124 check_movptr1_data_dependency(instr);
125 }
126
127 bool MacroAssembler::is_movptr2_at(address instr) {
128 return is_lui_at(instr) && // lui
129 is_lui_at(instr + MacroAssembler::instruction_size) && // lui
130 is_slli_shift_at(instr + MacroAssembler::instruction_size * 2, 18) && // slli Rd, Rs, 18
131 is_add_at(instr + MacroAssembler::instruction_size * 3) &&
132 (is_addi_at(instr + MacroAssembler::instruction_size * 4) ||
133 is_jalr_at(instr + MacroAssembler::instruction_size * 4) ||
134 is_load_at(instr + MacroAssembler::instruction_size * 4)) && // Addi/Jalr/Load
135 check_movptr2_data_dependency(instr);
136 }
137
138 bool MacroAssembler::is_li16u_at(address instr) {
139 return is_lui_at(instr) && // lui
140 is_srli_at(instr + MacroAssembler::instruction_size) && // srli
141 check_li16u_data_dependency(instr);
142 }
143
144 bool MacroAssembler::is_li32_at(address instr) {
145 return is_lui_at(instr) && // lui
146 is_addiw_at(instr + MacroAssembler::instruction_size) && // addiw
147 check_li32_data_dependency(instr);
148 }
149
150 bool MacroAssembler::is_lwu_to_zr(address instr) {
151 assert_cond(instr != nullptr);
152 return (extract_opcode(instr) == 0b0000011 &&
153 extract_funct3(instr) == 0b110 &&
154 extract_rd(instr) == zr); // zr
155 }
156
157 uint32_t MacroAssembler::get_membar_kind(address addr) {
158 assert_cond(addr != nullptr);
159 assert(is_membar(addr), "no membar found");
160
161 uint32_t insn = Bytes::get_native_u4(addr);
162
163 uint32_t predecessor = Assembler::extract(insn, 27, 24);
164 uint32_t successor = Assembler::extract(insn, 23, 20);
165
166 return MacroAssembler::pred_succ_to_membar_mask(predecessor, successor);
167 }
168
169 void MacroAssembler::set_membar_kind(address addr, uint32_t order_kind) {
170 assert_cond(addr != nullptr);
171 assert(is_membar(addr), "no membar found");
172
173 uint32_t predecessor = 0;
174 uint32_t successor = 0;
175
176 MacroAssembler::membar_mask_to_pred_succ(order_kind, predecessor, successor);
177
178 uint32_t insn = Bytes::get_native_u4(addr);
179 address pInsn = (address) &insn;
180 Assembler::patch(pInsn, 27, 24, predecessor);
181 Assembler::patch(pInsn, 23, 20, successor);
182
183 address membar = addr;
184 Assembler::sd_instr(membar, insn);
185 }
186
187 static void pass_arg0(MacroAssembler* masm, Register arg) {
188 if (c_rarg0 != arg) {
189 masm->mv(c_rarg0, arg);
190 }
191 }
192
193 static void pass_arg1(MacroAssembler* masm, Register arg) {
194 if (c_rarg1 != arg) {
195 masm->mv(c_rarg1, arg);
196 }
197 }
198
199 static void pass_arg2(MacroAssembler* masm, Register arg) {
200 if (c_rarg2 != arg) {
201 masm->mv(c_rarg2, arg);
202 }
203 }
204
205 static void pass_arg3(MacroAssembler* masm, Register arg) {
206 if (c_rarg3 != arg) {
207 masm->mv(c_rarg3, arg);
208 }
209 }
210
211 void MacroAssembler::push_cont_fastpath(Register java_thread) {
212 if (!Continuations::enabled()) return;
213 Label done;
214 ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
215 bleu(sp, t0, done);
216 sd(sp, Address(java_thread, JavaThread::cont_fastpath_offset()));
217 bind(done);
218 }
219
220 void MacroAssembler::pop_cont_fastpath(Register java_thread) {
221 if (!Continuations::enabled()) return;
222 Label done;
223 ld(t0, Address(java_thread, JavaThread::cont_fastpath_offset()));
224 bltu(sp, t0, done);
225 sd(zr, Address(java_thread, JavaThread::cont_fastpath_offset()));
226 bind(done);
227 }
228
229 int MacroAssembler::align(int modulus, int extra_offset) {
230 CompressibleScope scope(this);
231 intptr_t before = offset();
232 while ((offset() + extra_offset) % modulus != 0) { nop(); }
233 return (int)(offset() - before);
234 }
235
236 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
237 call_VM_base(oop_result, noreg, noreg, nullptr, entry_point, number_of_arguments, check_exceptions);
238 }
239
240 // Implementation of call_VM versions
241
242 void MacroAssembler::call_VM(Register oop_result,
243 address entry_point,
244 bool check_exceptions) {
245 call_VM_helper(oop_result, entry_point, 0, check_exceptions);
246 }
247
248 void MacroAssembler::call_VM(Register oop_result,
249 address entry_point,
250 Register arg_1,
251 bool check_exceptions) {
252 pass_arg1(this, arg_1);
253 call_VM_helper(oop_result, entry_point, 1, check_exceptions);
254 }
255
256 void MacroAssembler::call_VM(Register oop_result,
257 address entry_point,
258 Register arg_1,
259 Register arg_2,
260 bool check_exceptions) {
261 assert_different_registers(arg_1, c_rarg2);
262 pass_arg2(this, arg_2);
263 pass_arg1(this, arg_1);
264 call_VM_helper(oop_result, entry_point, 2, check_exceptions);
265 }
266
267 void MacroAssembler::call_VM(Register oop_result,
268 address entry_point,
269 Register arg_1,
270 Register arg_2,
271 Register arg_3,
272 bool check_exceptions) {
273 assert_different_registers(arg_1, c_rarg2, c_rarg3);
274 assert_different_registers(arg_2, c_rarg3);
275 pass_arg3(this, arg_3);
276
277 pass_arg2(this, arg_2);
278
279 pass_arg1(this, arg_1);
280 call_VM_helper(oop_result, entry_point, 3, check_exceptions);
281 }
282
283 void MacroAssembler::call_VM(Register oop_result,
284 Register last_java_sp,
285 address entry_point,
286 int number_of_arguments,
287 bool check_exceptions) {
288 call_VM_base(oop_result, xthread, last_java_sp, nullptr, entry_point, number_of_arguments, check_exceptions);
289 }
290
291 void MacroAssembler::call_VM(Register oop_result,
292 Register last_java_sp,
293 address entry_point,
294 Register arg_1,
295 bool check_exceptions) {
296 pass_arg1(this, arg_1);
297 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
298 }
299
300 void MacroAssembler::call_VM(Register oop_result,
301 Register last_java_sp,
302 address entry_point,
303 Register arg_1,
304 Register arg_2,
305 bool check_exceptions) {
306
307 assert_different_registers(arg_1, c_rarg2);
308 pass_arg2(this, arg_2);
309 pass_arg1(this, arg_1);
310 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
311 }
312
313 void MacroAssembler::call_VM(Register oop_result,
314 Register last_java_sp,
315 address entry_point,
316 Register arg_1,
317 Register arg_2,
318 Register arg_3,
319 bool check_exceptions) {
320 assert_different_registers(arg_1, c_rarg2, c_rarg3);
321 assert_different_registers(arg_2, c_rarg3);
322 pass_arg3(this, arg_3);
323 pass_arg2(this, arg_2);
324 pass_arg1(this, arg_1);
325 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
326 }
327
328 void MacroAssembler::post_call_nop() {
329 assert(!in_compressible_scope(), "Must be");
330 assert_alignment(pc());
331 if (!Continuations::enabled()) {
332 return;
333 }
334 relocate(post_call_nop_Relocation::spec());
335 InlineSkippedInstructionsCounter skipCounter(this);
336 nop();
337 li32(zr, 0);
338 }
339
340 // these are no-ops overridden by InterpreterMacroAssembler
341 void MacroAssembler::check_and_handle_earlyret(Register java_thread) {}
342 void MacroAssembler::check_and_handle_popframe(Register java_thread) {}
343
344 // Calls to C land
345 //
346 // When entering C land, the fp, & esp of the last Java frame have to be recorded
347 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
348 // has to be reset to 0. This is required to allow proper stack traversal.
349 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
350 Register last_java_fp,
351 Register last_java_pc) {
352
353 if (last_java_pc->is_valid()) {
354 sd(last_java_pc, Address(xthread,
355 JavaThread::frame_anchor_offset() +
356 JavaFrameAnchor::last_Java_pc_offset()));
357 }
358
359 // determine last_java_sp register
360 if (!last_java_sp->is_valid()) {
361 last_java_sp = esp;
362 }
363
364 // last_java_fp is optional
365 if (last_java_fp->is_valid()) {
366 sd(last_java_fp, Address(xthread, JavaThread::last_Java_fp_offset()));
367 }
368
369 // We must set sp last.
370 sd(last_java_sp, Address(xthread, JavaThread::last_Java_sp_offset()));
371
372 }
373
374 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
375 Register last_java_fp,
376 address last_java_pc,
377 Register tmp) {
378 assert(last_java_pc != nullptr, "must provide a valid PC");
379
380 la(tmp, last_java_pc);
381 sd(tmp, Address(xthread, JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()));
382
383 set_last_Java_frame(last_java_sp, last_java_fp, noreg);
384 }
385
386 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
387 Register last_java_fp,
388 Label &L,
389 Register tmp) {
390 if (L.is_bound()) {
391 set_last_Java_frame(last_java_sp, last_java_fp, target(L), tmp);
392 } else {
393 L.add_patch_at(code(), locator());
394 IncompressibleScope scope(this); // the label address will be patched back.
395 set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, tmp);
396 }
397 }
398
399 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
400 // we must set sp to zero to clear frame
401 sd(zr, Address(xthread, JavaThread::last_Java_sp_offset()));
402
403 // must clear fp, so that compiled frames are not confused; it is
404 // possible that we need it only for debugging
405 if (clear_fp) {
406 sd(zr, Address(xthread, JavaThread::last_Java_fp_offset()));
407 }
408
409 // Always clear the pc because it could have been set by make_walkable()
410 sd(zr, Address(xthread, JavaThread::last_Java_pc_offset()));
411 }
412
413 void MacroAssembler::call_VM_base(Register oop_result,
414 Register java_thread,
415 Register last_java_sp,
416 Label* return_pc,
417 address entry_point,
418 int number_of_arguments,
419 bool check_exceptions) {
420 // determine java_thread register
421 if (!java_thread->is_valid()) {
422 java_thread = xthread;
423 }
424
425 // determine last_java_sp register
426 if (!last_java_sp->is_valid()) {
427 last_java_sp = esp;
428 }
429
430 // debugging support
431 assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
432 assert(java_thread == xthread, "unexpected register");
433
434 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
435 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
436
437 // push java thread (becomes first argument of C function)
438 mv(c_rarg0, java_thread);
439
440 // set last Java frame before call
441 assert(last_java_sp != fp, "can't use fp");
442
443 Label l;
444 set_last_Java_frame(last_java_sp, fp, return_pc != nullptr ? *return_pc : l, t0);
445
446 // do the call, remove parameters
447 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
448
449 // reset last Java frame
450 // Only interpreter should have to clear fp
451 reset_last_Java_frame(true);
452
453 // C++ interp handles this in the interpreter
454 check_and_handle_popframe(java_thread);
455 check_and_handle_earlyret(java_thread);
456
457 if (check_exceptions) {
458 // check for pending exceptions (java_thread is set upon return)
459 ld(t0, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
460 Label ok;
461 beqz(t0, ok);
462 j(RuntimeAddress(StubRoutines::forward_exception_entry()));
463 bind(ok);
464 }
465
466 // get oop result if there is one and reset the value in the thread
467 if (oop_result->is_valid()) {
468 get_vm_result_oop(oop_result, java_thread);
469 }
470 }
471
472 void MacroAssembler::get_vm_result_oop(Register oop_result, Register java_thread) {
473 ld(oop_result, Address(java_thread, JavaThread::vm_result_oop_offset()));
474 sd(zr, Address(java_thread, JavaThread::vm_result_oop_offset()));
475 verify_oop_msg(oop_result, "broken oop in call_VM_base");
476 }
477
478 void MacroAssembler::get_vm_result_metadata(Register metadata_result, Register java_thread) {
479 ld(metadata_result, Address(java_thread, JavaThread::vm_result_metadata_offset()));
480 sd(zr, Address(java_thread, JavaThread::vm_result_metadata_offset()));
481 }
482
483 void MacroAssembler::clinit_barrier(Register klass, Register tmp, Label* L_fast_path, Label* L_slow_path) {
484 assert(L_fast_path != nullptr || L_slow_path != nullptr, "at least one is required");
485 assert_different_registers(klass, xthread, tmp);
486
487 Label L_fallthrough, L_tmp;
488 if (L_fast_path == nullptr) {
489 L_fast_path = &L_fallthrough;
490 } else if (L_slow_path == nullptr) {
491 L_slow_path = &L_fallthrough;
492 }
493
494 // Fast path check: class is fully initialized
495 lbu(tmp, Address(klass, InstanceKlass::init_state_offset()));
496 membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
497 sub(tmp, tmp, InstanceKlass::fully_initialized);
498 beqz(tmp, *L_fast_path);
499
500 // Fast path check: current thread is initializer thread
501 ld(tmp, Address(klass, InstanceKlass::init_thread_offset()));
502
503 if (L_slow_path == &L_fallthrough) {
504 beq(xthread, tmp, *L_fast_path);
505 bind(*L_slow_path);
506 } else if (L_fast_path == &L_fallthrough) {
507 bne(xthread, tmp, *L_slow_path);
508 bind(*L_fast_path);
509 } else {
510 Unimplemented();
511 }
512 }
513
514 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
515 if (!VerifyOops) { return; }
516
517 // Pass register number to verify_oop_subroutine
518 const char* b = nullptr;
519 {
520 ResourceMark rm;
521 stringStream ss;
522 ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
523 b = code_string(ss.as_string());
524 }
525 BLOCK_COMMENT("verify_oop {");
526
527 push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
528
529 mv(c_rarg0, reg); // c_rarg0 : x10
530 {
531 // The length of the instruction sequence emitted should not depend
532 // on the address of the char buffer so that the size of mach nodes for
533 // scratch emit and normal emit matches.
534 IncompressibleScope scope(this); // Fixed length
535 movptr(t0, (address) b);
536 }
537
538 // Call indirectly to solve generation ordering problem
539 ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
540 jalr(t1);
541
542 pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
543
544 BLOCK_COMMENT("} verify_oop");
545 }
546
547 // Handle the receiver type profile update given the "recv" klass.
548 //
549 // Normally updates the ReceiverData (RD) that starts at "mdp" + "mdp_offset".
550 // If there are no matching or claimable receiver entries in RD, updates
551 // the polymorphic counter.
552 //
553 // This code expected to run by either the interpreter or JIT-ed code, without
554 // extra synchronization. For safety, receiver cells are claimed atomically, which
555 // avoids grossly misrepresenting the profiles under concurrent updates. For speed,
556 // counter updates are not atomic.
557 //
558 void MacroAssembler::profile_receiver_type(Register recv, Register mdp, int mdp_offset) {
559 assert_different_registers(recv, mdp, t0, t1);
560
561 int base_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(0));
562 int end_receiver_offset = in_bytes(ReceiverTypeData::receiver_offset(ReceiverTypeData::row_limit()));
563 int poly_count_offset = in_bytes(CounterData::count_offset());
564 int receiver_step = in_bytes(ReceiverTypeData::receiver_offset(1)) - base_receiver_offset;
565 int receiver_to_count_step = in_bytes(ReceiverTypeData::receiver_count_offset(0)) - base_receiver_offset;
566
567 // Adjust for MDP offsets. Slots are pointer-sized, so is the global offset.
568 base_receiver_offset += mdp_offset;
569 end_receiver_offset += mdp_offset;
570 poly_count_offset += mdp_offset;
571
572 #ifdef ASSERT
573 // We are about to walk the MDO slots without asking for offsets.
574 // Check that our math hits all the right spots.
575 for (uint c = 0; c < ReceiverTypeData::row_limit(); c++) {
576 int real_recv_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_offset(c));
577 int real_count_offset = mdp_offset + in_bytes(ReceiverTypeData::receiver_count_offset(c));
578 int offset = base_receiver_offset + receiver_step*c;
579 int count_offset = offset + receiver_to_count_step;
580 assert(offset == real_recv_offset, "receiver slot math");
581 assert(count_offset == real_count_offset, "receiver count math");
582 }
583 int real_poly_count_offset = mdp_offset + in_bytes(CounterData::count_offset());
584 assert(poly_count_offset == real_poly_count_offset, "poly counter math");
585 #endif
586
587 // Corner case: no profile table. Increment poly counter and exit.
588 if (ReceiverTypeData::row_limit() == 0) {
589 increment(Address(mdp, poly_count_offset), DataLayout::counter_increment);
590 return;
591 }
592
593 Register offset = t1;
594
595 Label L_loop_search_receiver, L_loop_search_empty;
596 Label L_restart, L_found_recv, L_found_empty, L_count_update;
597
598 // The code here recognizes three major cases:
599 // A. Fastest: receiver found in the table
600 // B. Fast: no receiver in the table, and the table is full
601 // C. Slow: no receiver in the table, free slots in the table
602 //
603 // The case A performance is most important, as perfectly-behaved code would end up
604 // there, especially with larger TypeProfileWidth. The case B performance is
605 // important as well, this is where bulk of code would land for normally megamorphic
606 // cases. The case C performance is not essential, its job is to deal with installation
607 // races, we optimize for code density instead. Case C needs to make sure that receiver
608 // rows are only claimed once. This makes sure we never overwrite a row for another
609 // receiver and never duplicate the receivers in the list, making profile type-accurate.
610 //
611 // It is very tempting to handle these cases in a single loop, and claim the first slot
612 // without checking the rest of the table. But, profiling code should tolerate free slots
613 // in the table, as class unloading can clear them. After such cleanup, the receiver
614 // we need might be _after_ the free slot. Therefore, we need to let at least full scan
615 // to complete, before trying to install new slots. Splitting the code in several tight
616 // loops also helpfully optimizes for cases A and B.
617 //
618 // This code is effectively:
619 //
620 // restart:
621 // // Fastest: receiver is already installed
622 // for (i = 0; i < receiver_count(); i++) {
623 // if (receiver(i) == recv) goto found_recv(i);
624 // }
625 //
626 // // Fast: no receiver, but profile is not full
627 // for (i = 0; i < receiver_count(); i++) {
628 // if (receiver(i) == null) goto found_null(i);
629 // }
630 //
631 // // Slow: profile is full, polymorphic case
632 // count++;
633 // return
634 //
635 // // Slow: try to install receiver
636 // found_null(i):
637 // CAS(&receiver(i), null, recv);
638 // goto restart
639 //
640 // found_recv(i):
641 // *receiver_count(i)++
642 //
643
644 bind(L_restart);
645
646 // Fastest: receiver is already installed
647 mv(offset, base_receiver_offset);
648 bind(L_loop_search_receiver);
649 add(t0, mdp, offset);
650 ld(t0, Address(t0));
651 beq(recv, t0, L_found_recv);
652 add(offset, offset, receiver_step);
653 sub(t0, offset, end_receiver_offset);
654 bnez(t0, L_loop_search_receiver);
655
656 // Fast: no receiver, but profile is not full
657 mv(offset, base_receiver_offset);
658 bind(L_loop_search_empty);
659 add(t0, mdp, offset);
660 ld(t0, Address(t0));
661 beqz(t0, L_found_empty);
662 add(offset, offset, receiver_step);
663 sub(t0, offset, end_receiver_offset);
664 bnez(t0, L_loop_search_empty);
665
666 // Slow: Receiver is not found and table is full.
667 // Increment polymorphic counter instead of receiver slot.
668 mv(offset, poly_count_offset);
669 j(L_count_update);
670
671 // Slowest: try to install receiver
672 bind(L_found_empty);
673
674 // Atomically swing receiver slot: null -> recv.
675 //
676 // The update uses CAS, which clobbers t0. Therefore, t1
677 // is used to hold the destination address. This is safe because the
678 // offset is no longer needed after the address is computed.
679 add(t1, mdp, offset);
680 weak_cmpxchg(/*addr*/ t1, /*expected*/ zr, /*new*/ recv, Assembler::int64,
681 /*acquire*/ Assembler::relaxed, /*release*/ Assembler::relaxed, /*result*/ t0);
682
683 // CAS success means the slot now has the receiver we want. CAS failure means
684 // something had claimed the slot concurrently: it can be the same receiver we want,
685 // or something else. Since this is a slow path, we can optimize for code density,
686 // and just restart the search from the beginning.
687 j(L_restart);
688
689 // Found a receiver, convert its slot offset to corresponding count offset.
690 bind(L_found_recv);
691 add(offset, offset, receiver_to_count_step);
692
693 // Finally, update the counter
694 bind(L_count_update);
695 add(t1, mdp, offset);
696 increment(Address(t1), DataLayout::counter_increment);
697 }
698
699 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
700 if (!VerifyOops) {
701 return;
702 }
703
704 const char* b = nullptr;
705 {
706 ResourceMark rm;
707 stringStream ss;
708 ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
709 b = code_string(ss.as_string());
710 }
711 BLOCK_COMMENT("verify_oop_addr {");
712
713 push_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
714
715 if (addr.uses(sp)) {
716 la(x10, addr);
717 ld(x10, Address(x10, 4 * wordSize));
718 } else {
719 ld(x10, addr);
720 }
721
722 {
723 // The length of the instruction sequence emitted should not depend
724 // on the address of the char buffer so that the size of mach nodes for
725 // scratch emit and normal emit matches.
726 IncompressibleScope scope(this); // Fixed length
727 movptr(t0, (address) b);
728 }
729
730 // Call indirectly to solve generation ordering problem
731 ld(t1, RuntimeAddress(StubRoutines::verify_oop_subroutine_entry_address()));
732 jalr(t1);
733
734 pop_reg(RegSet::of(ra, t0, t1, c_rarg0), sp);
735
736 BLOCK_COMMENT("} verify_oop_addr");
737 }
738
739 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
740 int extra_slot_offset) {
741 // cf. TemplateTable::prepare_invoke(), if (load_receiver).
742 int stackElementSize = Interpreter::stackElementSize;
743 int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
744 #ifdef ASSERT
745 int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
746 assert(offset1 - offset == stackElementSize, "correct arithmetic");
747 #endif
748 if (arg_slot.is_constant()) {
749 return Address(esp, arg_slot.as_constant() * stackElementSize + offset);
750 } else {
751 assert_different_registers(t0, arg_slot.as_register());
752 shadd(t0, arg_slot.as_register(), esp, t0, exact_log2(stackElementSize));
753 return Address(t0, offset);
754 }
755 }
756
757 #ifndef PRODUCT
758 extern "C" void findpc(intptr_t x);
759 #endif
760
761 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
762 {
763 // In order to get locks to work, we need to fake a in_VM state
764 if (ShowMessageBoxOnError) {
765 JavaThread* thread = JavaThread::current();
766 JavaThreadState saved_state = thread->thread_state();
767 thread->set_thread_state(_thread_in_vm);
768 #ifndef PRODUCT
769 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
770 ttyLocker ttyl;
771 BytecodeCounter::print();
772 }
773 #endif
774 if (os::message_box(msg, "Execution stopped, print registers?")) {
775 ttyLocker ttyl;
776 tty->print_cr(" pc = 0x%016lx", pc);
777 #ifndef PRODUCT
778 tty->cr();
779 findpc(pc);
780 tty->cr();
781 #endif
782 tty->print_cr(" x0 = 0x%016lx", regs[0]);
783 tty->print_cr(" x1 = 0x%016lx", regs[1]);
784 tty->print_cr(" x2 = 0x%016lx", regs[2]);
785 tty->print_cr(" x3 = 0x%016lx", regs[3]);
786 tty->print_cr(" x4 = 0x%016lx", regs[4]);
787 tty->print_cr(" x5 = 0x%016lx", regs[5]);
788 tty->print_cr(" x6 = 0x%016lx", regs[6]);
789 tty->print_cr(" x7 = 0x%016lx", regs[7]);
790 tty->print_cr(" x8 = 0x%016lx", regs[8]);
791 tty->print_cr(" x9 = 0x%016lx", regs[9]);
792 tty->print_cr("x10 = 0x%016lx", regs[10]);
793 tty->print_cr("x11 = 0x%016lx", regs[11]);
794 tty->print_cr("x12 = 0x%016lx", regs[12]);
795 tty->print_cr("x13 = 0x%016lx", regs[13]);
796 tty->print_cr("x14 = 0x%016lx", regs[14]);
797 tty->print_cr("x15 = 0x%016lx", regs[15]);
798 tty->print_cr("x16 = 0x%016lx", regs[16]);
799 tty->print_cr("x17 = 0x%016lx", regs[17]);
800 tty->print_cr("x18 = 0x%016lx", regs[18]);
801 tty->print_cr("x19 = 0x%016lx", regs[19]);
802 tty->print_cr("x20 = 0x%016lx", regs[20]);
803 tty->print_cr("x21 = 0x%016lx", regs[21]);
804 tty->print_cr("x22 = 0x%016lx", regs[22]);
805 tty->print_cr("x23 = 0x%016lx", regs[23]);
806 tty->print_cr("x24 = 0x%016lx", regs[24]);
807 tty->print_cr("x25 = 0x%016lx", regs[25]);
808 tty->print_cr("x26 = 0x%016lx", regs[26]);
809 tty->print_cr("x27 = 0x%016lx", regs[27]);
810 tty->print_cr("x28 = 0x%016lx", regs[28]);
811 tty->print_cr("x30 = 0x%016lx", regs[30]);
812 tty->print_cr("x31 = 0x%016lx", regs[31]);
813 BREAKPOINT;
814 }
815 }
816 fatal("DEBUG MESSAGE: %s", msg);
817 }
818
819 void MacroAssembler::resolve_jobject(Register value, Register tmp1, Register tmp2) {
820 assert_different_registers(value, tmp1, tmp2);
821 Label done, tagged, weak_tagged;
822
823 beqz(value, done); // Use null as-is.
824 // Test for tag.
825 andi(tmp1, value, JNIHandles::tag_mask);
826 bnez(tmp1, tagged);
827
828 // Resolve local handle
829 access_load_at(T_OBJECT, IN_NATIVE | AS_RAW, value, Address(value, 0), tmp1, tmp2);
830 verify_oop(value);
831 j(done);
832
833 bind(tagged);
834 // Test for jweak tag.
835 STATIC_ASSERT(JNIHandles::TypeTag::weak_global == 0b1);
836 test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::weak_global));
837 bnez(tmp1, weak_tagged);
838
839 // Resolve global handle
840 access_load_at(T_OBJECT, IN_NATIVE, value,
841 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
842 verify_oop(value);
843 j(done);
844
845 bind(weak_tagged);
846 // Resolve jweak.
847 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
848 Address(value, -JNIHandles::TypeTag::weak_global), tmp1, tmp2);
849 verify_oop(value);
850
851 bind(done);
852 }
853
854 void MacroAssembler::resolve_global_jobject(Register value, Register tmp1, Register tmp2) {
855 assert_different_registers(value, tmp1, tmp2);
856 Label done;
857
858 beqz(value, done); // Use null as-is.
859
860 #ifdef ASSERT
861 {
862 STATIC_ASSERT(JNIHandles::TypeTag::global == 0b10);
863 Label valid_global_tag;
864 test_bit(tmp1, value, exact_log2(JNIHandles::TypeTag::global)); // Test for global tag.
865 bnez(tmp1, valid_global_tag);
866 stop("non global jobject using resolve_global_jobject");
867 bind(valid_global_tag);
868 }
869 #endif
870
871 // Resolve global handle
872 access_load_at(T_OBJECT, IN_NATIVE, value,
873 Address(value, -JNIHandles::TypeTag::global), tmp1, tmp2);
874 verify_oop(value);
875
876 bind(done);
877 }
878
879 void MacroAssembler::stop(const char* msg) {
880 BLOCK_COMMENT(msg);
881 illegal_instruction(Assembler::csr::time);
882 emit_int64((uintptr_t)msg);
883 }
884
885 void MacroAssembler::unimplemented(const char* what) {
886 const char* buf = nullptr;
887 {
888 ResourceMark rm;
889 stringStream ss;
890 ss.print("unimplemented: %s", what);
891 buf = code_string(ss.as_string());
892 }
893 stop(buf);
894 }
895
896 void MacroAssembler::emit_static_call_stub() {
897 IncompressibleScope scope(this); // Fixed length: see CompiledDirectCall::to_interp_stub_size().
898 // CompiledDirectCall::set_to_interpreted knows the
899 // exact layout of this stub.
900
901 mov_metadata(xmethod, (Metadata*)nullptr);
902
903 // Jump to the entry point of the c2i stub.
904 int32_t offset = 0;
905 movptr2(t1, 0, offset, t0); // lui + lui + slli + add
906 jr(t1, offset);
907 }
908
909 void MacroAssembler::call_VM_leaf_base(address entry_point,
910 int number_of_arguments,
911 Label *retaddr) {
912 int32_t offset = 0;
913 push_reg(RegSet::of(t1, xmethod), sp); // push << t1 & xmethod >> to sp
914 movptr(t1, entry_point, offset, t0);
915 jalr(t1, offset);
916 if (retaddr != nullptr) {
917 bind(*retaddr);
918 }
919 pop_reg(RegSet::of(t1, xmethod), sp); // pop << t1 & xmethod >> from sp
920 }
921
922 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
923 call_VM_leaf_base(entry_point, number_of_arguments);
924 }
925
926 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
927 pass_arg0(this, arg_0);
928 call_VM_leaf_base(entry_point, 1);
929 }
930
931 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
932 assert_different_registers(arg_1, c_rarg0);
933 pass_arg0(this, arg_0);
934 pass_arg1(this, arg_1);
935 call_VM_leaf_base(entry_point, 2);
936 }
937
938 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
939 Register arg_1, Register arg_2) {
940 assert_different_registers(arg_1, c_rarg0);
941 assert_different_registers(arg_2, c_rarg0, c_rarg1);
942 pass_arg0(this, arg_0);
943 pass_arg1(this, arg_1);
944 pass_arg2(this, arg_2);
945 call_VM_leaf_base(entry_point, 3);
946 }
947
948 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
949 pass_arg0(this, arg_0);
950 MacroAssembler::call_VM_leaf_base(entry_point, 1);
951 }
952
953 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
954
955 assert_different_registers(arg_0, c_rarg1);
956 pass_arg1(this, arg_1);
957 pass_arg0(this, arg_0);
958 MacroAssembler::call_VM_leaf_base(entry_point, 2);
959 }
960
961 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
962 assert_different_registers(arg_0, c_rarg1, c_rarg2);
963 assert_different_registers(arg_1, c_rarg2);
964 pass_arg2(this, arg_2);
965 pass_arg1(this, arg_1);
966 pass_arg0(this, arg_0);
967 MacroAssembler::call_VM_leaf_base(entry_point, 3);
968 }
969
970 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
971 assert_different_registers(arg_0, c_rarg1, c_rarg2, c_rarg3);
972 assert_different_registers(arg_1, c_rarg2, c_rarg3);
973 assert_different_registers(arg_2, c_rarg3);
974
975 pass_arg3(this, arg_3);
976 pass_arg2(this, arg_2);
977 pass_arg1(this, arg_1);
978 pass_arg0(this, arg_0);
979 MacroAssembler::call_VM_leaf_base(entry_point, 4);
980 }
981
982 void MacroAssembler::la(Register Rd, const address addr) {
983 int32_t offset;
984 la(Rd, addr, offset);
985 addi(Rd, Rd, offset);
986 }
987
988 void MacroAssembler::la(Register Rd, const address addr, int32_t &offset) {
989 int64_t distance = addr - pc();
990 assert(is_valid_32bit_offset(distance), "Must be");
991 auipc(Rd, (int32_t)distance + 0x800);
992 offset = ((int32_t)distance << 20) >> 20;
993 }
994
995 // Materialize with auipc + addi sequence if adr is a literal
996 // address inside code cache. Emit a movptr sequence otherwise.
997 void MacroAssembler::la(Register Rd, const Address &adr) {
998 switch (adr.getMode()) {
999 case Address::literal: {
1000 relocInfo::relocType rtype = adr.rspec().reloc()->type();
1001 if (rtype == relocInfo::none) {
1002 mv(Rd, (intptr_t)(adr.target()));
1003 } else {
1004 if (CodeCache::contains(adr.target())) {
1005 relocate(adr.rspec(), [&] {
1006 la(Rd, adr.target());
1007 });
1008 } else {
1009 relocate(adr.rspec(), [&] {
1010 movptr(Rd, adr.target());
1011 });
1012 }
1013 }
1014 break;
1015 }
1016 case Address::base_plus_offset: {
1017 Address new_adr = legitimize_address(Rd, adr);
1018 if (!(new_adr.base() == Rd && new_adr.offset() == 0)) {
1019 addi(Rd, new_adr.base(), new_adr.offset());
1020 }
1021 break;
1022 }
1023 default:
1024 ShouldNotReachHere();
1025 }
1026 }
1027
1028 void MacroAssembler::la(Register Rd, Label &label) {
1029 IncompressibleScope scope(this); // the label address may be patched back.
1030 wrap_label(Rd, label, &MacroAssembler::la);
1031 }
1032
1033 void MacroAssembler::li16u(Register Rd, uint16_t imm) {
1034 lui(Rd, (uint32_t)imm << 12);
1035 srli(Rd, Rd, 12);
1036 }
1037
1038 void MacroAssembler::li32(Register Rd, int32_t imm) {
1039 // int32_t is in range 0x8000 0000 ~ 0x7fff ffff, and imm[31] is the sign bit
1040 int64_t upper = imm, lower = imm;
1041 lower = (imm << 20) >> 20;
1042 upper -= lower;
1043 upper = (int32_t)upper;
1044 // lui Rd, imm[31:12] + imm[11]
1045 lui(Rd, upper);
1046 addiw(Rd, Rd, lower);
1047 }
1048
1049 void MacroAssembler::li(Register Rd, int64_t imm) {
1050 // int64_t is in range 0x8000 0000 0000 0000 ~ 0x7fff ffff ffff ffff
1051 // li -> c.li
1052 if (do_compress() && (is_simm6(imm) && Rd != x0)) {
1053 c_li(Rd, imm);
1054 return;
1055 }
1056
1057 int shift = 12;
1058 int64_t upper = imm, lower = imm;
1059 // Split imm to a lower 12-bit sign-extended part and the remainder,
1060 // because addi will sign-extend the lower imm.
1061 lower = ((int32_t)imm << 20) >> 20;
1062 upper -= lower;
1063
1064 // Test whether imm is a 32-bit integer.
1065 if (!(((imm) & ~(int64_t)0x7fffffff) == 0 ||
1066 (((imm) & ~(int64_t)0x7fffffff) == ~(int64_t)0x7fffffff))) {
1067 while (((upper >> shift) & 1) == 0) { shift++; }
1068 upper >>= shift;
1069 li(Rd, upper);
1070 slli(Rd, Rd, shift);
1071 if (lower != 0) {
1072 addi(Rd, Rd, lower);
1073 }
1074 } else {
1075 // 32-bit integer
1076 Register hi_Rd = zr;
1077 if (upper != 0) {
1078 lui(Rd, (int32_t)upper);
1079 hi_Rd = Rd;
1080 }
1081 if (lower != 0 || hi_Rd == zr) {
1082 addiw(Rd, hi_Rd, lower);
1083 }
1084 }
1085 }
1086
1087 void MacroAssembler::j(const address dest, Register temp) {
1088 assert(CodeCache::contains(dest), "Must be");
1089 assert_cond(dest != nullptr);
1090 int64_t distance = dest - pc();
1091
1092 // We can't patch C, i.e. if Label wasn't bound we need to patch this jump.
1093 IncompressibleScope scope(this);
1094 if (is_simm21(distance) && ((distance % 2) == 0)) {
1095 Assembler::jal(x0, distance);
1096 } else {
1097 assert(temp != noreg && temp != x0, "Expecting a register");
1098 assert(temp != x1 && temp != x5, "temp register must not be x1/x5.");
1099 int32_t offset = 0;
1100 la(temp, dest, offset);
1101 jr(temp, offset);
1102 }
1103 }
1104
1105 void MacroAssembler::j(const Address &dest, Register temp) {
1106 switch (dest.getMode()) {
1107 case Address::literal: {
1108 if (CodeCache::contains(dest.target())) {
1109 far_jump(dest, temp);
1110 } else {
1111 relocate(dest.rspec(), [&] {
1112 int32_t offset;
1113 movptr(temp, dest.target(), offset);
1114 jr(temp, offset);
1115 });
1116 }
1117 break;
1118 }
1119 case Address::base_plus_offset: {
1120 int32_t offset = ((int32_t)dest.offset() << 20) >> 20;
1121 la(temp, Address(dest.base(), dest.offset() - offset));
1122 jr(temp, offset);
1123 break;
1124 }
1125 default:
1126 ShouldNotReachHere();
1127 }
1128 }
1129
1130 void MacroAssembler::j(Label &lab, Register temp) {
1131 assert_different_registers(x0, temp);
1132 if (lab.is_bound()) {
1133 MacroAssembler::j(target(lab), temp);
1134 } else {
1135 lab.add_patch_at(code(), locator());
1136 MacroAssembler::j(pc(), temp);
1137 }
1138 }
1139
1140 void MacroAssembler::jr(Register Rd, int32_t offset) {
1141 assert(Rd != noreg, "expecting a register");
1142 assert(Rd != x1 && Rd != x5, "Rd register must not be x1/x5.");
1143 Assembler::jalr(x0, Rd, offset);
1144 }
1145
1146 void MacroAssembler::call(const address dest, Register temp) {
1147 assert_cond(dest != nullptr);
1148 assert(temp != noreg, "expecting a register");
1149 assert(temp != x5, "temp register must not be x5.");
1150 int32_t offset = 0;
1151 la(temp, dest, offset);
1152 jalr(temp, offset);
1153 }
1154
1155 void MacroAssembler::jalr(Register Rs, int32_t offset) {
1156 assert(Rs != noreg, "expecting a register");
1157 assert(Rs != x5, "Rs register must not be x5.");
1158 Assembler::jalr(x1, Rs, offset);
1159 }
1160
1161 void MacroAssembler::rt_call(address dest, Register tmp) {
1162 assert(tmp != x5, "tmp register must not be x5.");
1163 RuntimeAddress target(dest);
1164 if (CodeCache::contains(dest)) {
1165 far_call(target, tmp);
1166 } else {
1167 relocate(target.rspec(), [&] {
1168 int32_t offset;
1169 movptr(tmp, target.target(), offset);
1170 jalr(tmp, offset);
1171 });
1172 }
1173 }
1174
1175 void MacroAssembler::wrap_label(Register Rt, Label &L, jal_jalr_insn insn) {
1176 if (L.is_bound()) {
1177 (this->*insn)(Rt, target(L));
1178 } else {
1179 L.add_patch_at(code(), locator());
1180 (this->*insn)(Rt, pc());
1181 }
1182 }
1183
1184 void MacroAssembler::wrap_label(Register r1, Register r2, Label &L,
1185 compare_and_branch_insn insn,
1186 compare_and_branch_label_insn neg_insn, bool is_far) {
1187 if (is_far) {
1188 Label done;
1189 (this->*neg_insn)(r1, r2, done, /* is_far */ false);
1190 j(L);
1191 bind(done);
1192 } else {
1193 if (L.is_bound()) {
1194 (this->*insn)(r1, r2, target(L));
1195 } else {
1196 L.add_patch_at(code(), locator());
1197 (this->*insn)(r1, r2, pc());
1198 }
1199 }
1200 }
1201
1202 #define INSN(NAME, NEG_INSN) \
1203 void MacroAssembler::NAME(Register Rs1, Register Rs2, Label &L, bool is_far) { \
1204 wrap_label(Rs1, Rs2, L, &MacroAssembler::NAME, &MacroAssembler::NEG_INSN, is_far); \
1205 }
1206
1207 INSN(beq, bne);
1208 INSN(bne, beq);
1209 INSN(blt, bge);
1210 INSN(bge, blt);
1211 INSN(bltu, bgeu);
1212 INSN(bgeu, bltu);
1213
1214 #undef INSN
1215
1216 #define INSN(NAME) \
1217 void MacroAssembler::NAME##z(Register Rs, const address dest) { \
1218 NAME(Rs, zr, dest); \
1219 } \
1220 void MacroAssembler::NAME##z(Register Rs, Label &l, bool is_far) { \
1221 NAME(Rs, zr, l, is_far); \
1222 } \
1223
1224 INSN(beq);
1225 INSN(bne);
1226 INSN(blt);
1227 INSN(ble);
1228 INSN(bge);
1229 INSN(bgt);
1230
1231 #undef INSN
1232
1233 #define INSN(NAME, NEG_INSN) \
1234 void MacroAssembler::NAME(Register Rs, Register Rt, const address dest) { \
1235 NEG_INSN(Rt, Rs, dest); \
1236 } \
1237 void MacroAssembler::NAME(Register Rs, Register Rt, Label &l, bool is_far) { \
1238 NEG_INSN(Rt, Rs, l, is_far); \
1239 }
1240
1241 INSN(bgt, blt);
1242 INSN(ble, bge);
1243 INSN(bgtu, bltu);
1244 INSN(bleu, bgeu);
1245
1246 #undef INSN
1247
1248 // cmov
1249 void MacroAssembler::cmov_eq(Register cmp1, Register cmp2, Register dst, Register src) {
1250 if (UseZicond) {
1251 xorr(t0, cmp1, cmp2);
1252 czero_eqz(dst, dst, t0);
1253 czero_nez(t0 , src, t0);
1254 orr(dst, dst, t0);
1255 return;
1256 }
1257 Label no_set;
1258 bne(cmp1, cmp2, no_set);
1259 mv(dst, src);
1260 bind(no_set);
1261 }
1262
1263 void MacroAssembler::cmov_ne(Register cmp1, Register cmp2, Register dst, Register src) {
1264 if (UseZicond) {
1265 xorr(t0, cmp1, cmp2);
1266 czero_nez(dst, dst, t0);
1267 czero_eqz(t0 , src, t0);
1268 orr(dst, dst, t0);
1269 return;
1270 }
1271 Label no_set;
1272 beq(cmp1, cmp2, no_set);
1273 mv(dst, src);
1274 bind(no_set);
1275 }
1276
1277 void MacroAssembler::cmov_le(Register cmp1, Register cmp2, Register dst, Register src) {
1278 if (UseZicond) {
1279 slt(t0, cmp2, cmp1);
1280 czero_eqz(dst, dst, t0);
1281 czero_nez(t0, src, t0);
1282 orr(dst, dst, t0);
1283 return;
1284 }
1285 Label no_set;
1286 bgt(cmp1, cmp2, no_set);
1287 mv(dst, src);
1288 bind(no_set);
1289 }
1290
1291 void MacroAssembler::cmov_leu(Register cmp1, Register cmp2, Register dst, Register src) {
1292 if (UseZicond) {
1293 sltu(t0, cmp2, cmp1);
1294 czero_eqz(dst, dst, t0);
1295 czero_nez(t0, src, t0);
1296 orr(dst, dst, t0);
1297 return;
1298 }
1299 Label no_set;
1300 bgtu(cmp1, cmp2, no_set);
1301 mv(dst, src);
1302 bind(no_set);
1303 }
1304
1305 void MacroAssembler::cmov_ge(Register cmp1, Register cmp2, Register dst, Register src) {
1306 if (UseZicond) {
1307 slt(t0, cmp1, cmp2);
1308 czero_eqz(dst, dst, t0);
1309 czero_nez(t0, src, t0);
1310 orr(dst, dst, t0);
1311 return;
1312 }
1313 Label no_set;
1314 blt(cmp1, cmp2, no_set);
1315 mv(dst, src);
1316 bind(no_set);
1317 }
1318
1319 void MacroAssembler::cmov_geu(Register cmp1, Register cmp2, Register dst, Register src) {
1320 if (UseZicond) {
1321 sltu(t0, cmp1, cmp2);
1322 czero_eqz(dst, dst, t0);
1323 czero_nez(t0, src, t0);
1324 orr(dst, dst, t0);
1325 return;
1326 }
1327 Label no_set;
1328 bltu(cmp1, cmp2, no_set);
1329 mv(dst, src);
1330 bind(no_set);
1331 }
1332
1333 void MacroAssembler::cmov_lt(Register cmp1, Register cmp2, Register dst, Register src) {
1334 if (UseZicond) {
1335 slt(t0, cmp1, cmp2);
1336 czero_nez(dst, dst, t0);
1337 czero_eqz(t0, src, t0);
1338 orr(dst, dst, t0);
1339 return;
1340 }
1341 Label no_set;
1342 bge(cmp1, cmp2, no_set);
1343 mv(dst, src);
1344 bind(no_set);
1345 }
1346
1347 void MacroAssembler::cmov_ltu(Register cmp1, Register cmp2, Register dst, Register src) {
1348 if (UseZicond) {
1349 sltu(t0, cmp1, cmp2);
1350 czero_nez(dst, dst, t0);
1351 czero_eqz(t0, src, t0);
1352 orr(dst, dst, t0);
1353 return;
1354 }
1355 Label no_set;
1356 bgeu(cmp1, cmp2, no_set);
1357 mv(dst, src);
1358 bind(no_set);
1359 }
1360
1361 void MacroAssembler::cmov_gt(Register cmp1, Register cmp2, Register dst, Register src) {
1362 if (UseZicond) {
1363 slt(t0, cmp2, cmp1);
1364 czero_nez(dst, dst, t0);
1365 czero_eqz(t0, src, t0);
1366 orr(dst, dst, t0);
1367 return;
1368 }
1369 Label no_set;
1370 ble(cmp1, cmp2, no_set);
1371 mv(dst, src);
1372 bind(no_set);
1373 }
1374
1375 void MacroAssembler::cmov_gtu(Register cmp1, Register cmp2, Register dst, Register src) {
1376 if (UseZicond) {
1377 sltu(t0, cmp2, cmp1);
1378 czero_nez(dst, dst, t0);
1379 czero_eqz(t0, src, t0);
1380 orr(dst, dst, t0);
1381 return;
1382 }
1383 Label no_set;
1384 bleu(cmp1, cmp2, no_set);
1385 mv(dst, src);
1386 bind(no_set);
1387 }
1388
1389 // ----------- cmove float/double -----------
1390
1391 void MacroAssembler::cmov_fp_eq(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1392 Label no_set;
1393 bne(cmp1, cmp2, no_set);
1394 if (is_single) {
1395 fmv_s(dst, src);
1396 } else {
1397 fmv_d(dst, src);
1398 }
1399 bind(no_set);
1400 }
1401
1402 void MacroAssembler::cmov_fp_ne(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1403 Label no_set;
1404 beq(cmp1, cmp2, no_set);
1405 if (is_single) {
1406 fmv_s(dst, src);
1407 } else {
1408 fmv_d(dst, src);
1409 }
1410 bind(no_set);
1411 }
1412
1413 void MacroAssembler::cmov_fp_le(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1414 Label no_set;
1415 bgt(cmp1, cmp2, no_set);
1416 if (is_single) {
1417 fmv_s(dst, src);
1418 } else {
1419 fmv_d(dst, src);
1420 }
1421 bind(no_set);
1422 }
1423
1424 void MacroAssembler::cmov_fp_leu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1425 Label no_set;
1426 bgtu(cmp1, cmp2, no_set);
1427 if (is_single) {
1428 fmv_s(dst, src);
1429 } else {
1430 fmv_d(dst, src);
1431 }
1432 bind(no_set);
1433 }
1434
1435 void MacroAssembler::cmov_fp_ge(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1436 Label no_set;
1437 blt(cmp1, cmp2, no_set);
1438 if (is_single) {
1439 fmv_s(dst, src);
1440 } else {
1441 fmv_d(dst, src);
1442 }
1443 bind(no_set);
1444 }
1445
1446 void MacroAssembler::cmov_fp_geu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1447 Label no_set;
1448 bltu(cmp1, cmp2, no_set);
1449 if (is_single) {
1450 fmv_s(dst, src);
1451 } else {
1452 fmv_d(dst, src);
1453 }
1454 bind(no_set);
1455 }
1456
1457 void MacroAssembler::cmov_fp_lt(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1458 Label no_set;
1459 bge(cmp1, cmp2, no_set);
1460 if (is_single) {
1461 fmv_s(dst, src);
1462 } else {
1463 fmv_d(dst, src);
1464 }
1465 bind(no_set);
1466 }
1467
1468 void MacroAssembler::cmov_fp_ltu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1469 Label no_set;
1470 bgeu(cmp1, cmp2, no_set);
1471 if (is_single) {
1472 fmv_s(dst, src);
1473 } else {
1474 fmv_d(dst, src);
1475 }
1476 bind(no_set);
1477 }
1478
1479 void MacroAssembler::cmov_fp_gt(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1480 Label no_set;
1481 ble(cmp1, cmp2, no_set);
1482 if (is_single) {
1483 fmv_s(dst, src);
1484 } else {
1485 fmv_d(dst, src);
1486 }
1487 bind(no_set);
1488 }
1489
1490 void MacroAssembler::cmov_fp_gtu(Register cmp1, Register cmp2, FloatRegister dst, FloatRegister src, bool is_single) {
1491 Label no_set;
1492 bleu(cmp1, cmp2, no_set);
1493 if (is_single) {
1494 fmv_s(dst, src);
1495 } else {
1496 fmv_d(dst, src);
1497 }
1498 bind(no_set);
1499 }
1500
1501 // ----------- cmove, compare float/double -----------
1502 //
1503 // For CmpF/D + CMoveI/L, ordered ones are quite straight and simple,
1504 // so, just list behaviour of unordered ones as follow.
1505 //
1506 // Set dst (CMoveI (Binary cop (CmpF/D op1 op2)) (Binary dst src))
1507 // (If one or both inputs to the compare are NaN, then)
1508 // 1. (op1 lt op2) => true => CMove: dst = src
1509 // 2. (op1 le op2) => true => CMove: dst = src
1510 // 3. (op1 gt op2) => false => CMove: dst = dst
1511 // 4. (op1 ge op2) => false => CMove: dst = dst
1512 // 5. (op1 eq op2) => false => CMove: dst = dst
1513 // 6. (op1 ne op2) => true => CMove: dst = src
1514
1515 void MacroAssembler::cmov_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1516 if (UseZicond) {
1517 if (is_single) {
1518 feq_s(t0, cmp1, cmp2);
1519 } else {
1520 feq_d(t0, cmp1, cmp2);
1521 }
1522 czero_nez(dst, dst, t0);
1523 czero_eqz(t0 , src, t0);
1524 orr(dst, dst, t0);
1525 return;
1526 }
1527 Label no_set;
1528 if (is_single) {
1529 // jump if cmp1 != cmp2, including the case of NaN
1530 // fallthrough (i.e. move src to dst) if cmp1 == cmp2
1531 float_bne(cmp1, cmp2, no_set);
1532 } else {
1533 double_bne(cmp1, cmp2, no_set);
1534 }
1535 mv(dst, src);
1536 bind(no_set);
1537 }
1538
1539 void MacroAssembler::cmov_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1540 if (UseZicond) {
1541 if (is_single) {
1542 feq_s(t0, cmp1, cmp2);
1543 } else {
1544 feq_d(t0, cmp1, cmp2);
1545 }
1546 czero_eqz(dst, dst, t0);
1547 czero_nez(t0 , src, t0);
1548 orr(dst, dst, t0);
1549 return;
1550 }
1551 Label no_set;
1552 if (is_single) {
1553 // jump if cmp1 == cmp2
1554 // fallthrough (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1555 float_beq(cmp1, cmp2, no_set);
1556 } else {
1557 double_beq(cmp1, cmp2, no_set);
1558 }
1559 mv(dst, src);
1560 bind(no_set);
1561 }
1562
1563 void MacroAssembler::cmov_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1564 if (UseZicond) {
1565 if (is_single) {
1566 flt_s(t0, cmp2, cmp1);
1567 } else {
1568 flt_d(t0, cmp2, cmp1);
1569 }
1570 czero_eqz(dst, dst, t0);
1571 czero_nez(t0 , src, t0);
1572 orr(dst, dst, t0);
1573 return;
1574 }
1575 Label no_set;
1576 if (is_single) {
1577 // jump if cmp1 > cmp2
1578 // fallthrough (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1579 float_bgt(cmp1, cmp2, no_set);
1580 } else {
1581 double_bgt(cmp1, cmp2, no_set);
1582 }
1583 mv(dst, src);
1584 bind(no_set);
1585 }
1586
1587 void MacroAssembler::cmov_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1588 if (UseZicond) {
1589 if (is_single) {
1590 fle_s(t0, cmp2, cmp1);
1591 } else {
1592 fle_d(t0, cmp2, cmp1);
1593 }
1594 czero_nez(dst, dst, t0);
1595 czero_eqz(t0 , src, t0);
1596 orr(dst, dst, t0);
1597 return;
1598 }
1599 Label no_set;
1600 if (is_single) {
1601 // jump if cmp1 < cmp2 or either is NaN
1602 // fallthrough (i.e. move src to dst) if cmp1 >= cmp2
1603 float_blt(cmp1, cmp2, no_set, false, true);
1604 } else {
1605 double_blt(cmp1, cmp2, no_set, false, true);
1606 }
1607 mv(dst, src);
1608 bind(no_set);
1609 }
1610
1611 void MacroAssembler::cmov_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1612 if (UseZicond) {
1613 if (is_single) {
1614 fle_s(t0, cmp2, cmp1);
1615 } else {
1616 fle_d(t0, cmp2, cmp1);
1617 }
1618 czero_eqz(dst, dst, t0);
1619 czero_nez(t0 , src, t0);
1620 orr(dst, dst, t0);
1621 return;
1622 }
1623 Label no_set;
1624 if (is_single) {
1625 // jump if cmp1 >= cmp2
1626 // fallthrough (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1627 float_bge(cmp1, cmp2, no_set);
1628 } else {
1629 double_bge(cmp1, cmp2, no_set);
1630 }
1631 mv(dst, src);
1632 bind(no_set);
1633 }
1634
1635 void MacroAssembler::cmov_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2, Register dst, Register src, bool is_single) {
1636 if (UseZicond) {
1637 if (is_single) {
1638 flt_s(t0, cmp2, cmp1);
1639 } else {
1640 flt_d(t0, cmp2, cmp1);
1641 }
1642 czero_nez(dst, dst, t0);
1643 czero_eqz(t0 , src, t0);
1644 orr(dst, dst, t0);
1645 return;
1646 }
1647 Label no_set;
1648 if (is_single) {
1649 // jump if cmp1 <= cmp2 or either is NaN
1650 // fallthrough (i.e. move src to dst) if cmp1 > cmp2
1651 float_ble(cmp1, cmp2, no_set, false, true);
1652 } else {
1653 double_ble(cmp1, cmp2, no_set, false, true);
1654 }
1655 mv(dst, src);
1656 bind(no_set);
1657 }
1658
1659 // ----------- cmove float/double, compare float/double -----------
1660
1661 // Move src to dst only if cmp1 == cmp2,
1662 // otherwise leave dst unchanged, including the case where one of them is NaN.
1663 // Clarification:
1664 // java code : cmp1 != cmp2 ? dst : src
1665 // transformed to : CMove dst, (cmp1 eq cmp2), dst, src
1666 void MacroAssembler::cmov_fp_cmp_fp_eq(FloatRegister cmp1, FloatRegister cmp2,
1667 FloatRegister dst, FloatRegister src,
1668 bool cmp_single, bool cmov_single) {
1669 Label no_set;
1670 if (cmp_single) {
1671 // jump if cmp1 != cmp2, including the case of NaN
1672 // not jump (i.e. move src to dst) if cmp1 == cmp2
1673 float_bne(cmp1, cmp2, no_set);
1674 } else {
1675 double_bne(cmp1, cmp2, no_set);
1676 }
1677 if (cmov_single) {
1678 fmv_s(dst, src);
1679 } else {
1680 fmv_d(dst, src);
1681 }
1682 bind(no_set);
1683 }
1684
1685 // Keep dst unchanged only if cmp1 == cmp2,
1686 // otherwise move src to dst, including the case where one of them is NaN.
1687 // Clarification:
1688 // java code : cmp1 == cmp2 ? dst : src
1689 // transformed to : CMove dst, (cmp1 ne cmp2), dst, src
1690 void MacroAssembler::cmov_fp_cmp_fp_ne(FloatRegister cmp1, FloatRegister cmp2,
1691 FloatRegister dst, FloatRegister src,
1692 bool cmp_single, bool cmov_single) {
1693 Label no_set;
1694 if (cmp_single) {
1695 // jump if cmp1 == cmp2
1696 // not jump (i.e. move src to dst) if cmp1 != cmp2, including the case of NaN
1697 float_beq(cmp1, cmp2, no_set);
1698 } else {
1699 double_beq(cmp1, cmp2, no_set);
1700 }
1701 if (cmov_single) {
1702 fmv_s(dst, src);
1703 } else {
1704 fmv_d(dst, src);
1705 }
1706 bind(no_set);
1707 }
1708
1709 // When cmp1 <= cmp2 or any of them is NaN then dst = src, otherwise, dst = dst
1710 // Clarification
1711 // scenario 1:
1712 // java code : cmp2 < cmp1 ? dst : src
1713 // transformed to : CMove dst, (cmp1 le cmp2), dst, src
1714 // scenario 2:
1715 // java code : cmp1 > cmp2 ? dst : src
1716 // transformed to : CMove dst, (cmp1 le cmp2), dst, src
1717 void MacroAssembler::cmov_fp_cmp_fp_le(FloatRegister cmp1, FloatRegister cmp2,
1718 FloatRegister dst, FloatRegister src,
1719 bool cmp_single, bool cmov_single) {
1720 Label no_set;
1721 if (cmp_single) {
1722 // jump if cmp1 > cmp2
1723 // not jump (i.e. move src to dst) if cmp1 <= cmp2 or either is NaN
1724 float_bgt(cmp1, cmp2, no_set);
1725 } else {
1726 double_bgt(cmp1, cmp2, no_set);
1727 }
1728 if (cmov_single) {
1729 fmv_s(dst, src);
1730 } else {
1731 fmv_d(dst, src);
1732 }
1733 bind(no_set);
1734 }
1735
1736 void MacroAssembler::cmov_fp_cmp_fp_ge(FloatRegister cmp1, FloatRegister cmp2,
1737 FloatRegister dst, FloatRegister src,
1738 bool cmp_single, bool cmov_single) {
1739 Label no_set;
1740 if (cmp_single) {
1741 // jump if cmp1 < cmp2 or either is NaN
1742 // not jump (i.e. move src to dst) if cmp1 >= cmp2
1743 float_blt(cmp1, cmp2, no_set, false, true);
1744 } else {
1745 double_blt(cmp1, cmp2, no_set, false, true);
1746 }
1747 if (cmov_single) {
1748 fmv_s(dst, src);
1749 } else {
1750 fmv_d(dst, src);
1751 }
1752 bind(no_set);
1753 }
1754
1755 // When cmp1 < cmp2 or any of them is NaN then dst = src, otherwise, dst = dst
1756 // Clarification
1757 // scenario 1:
1758 // java code : cmp2 <= cmp1 ? dst : src
1759 // transformed to : CMove dst, (cmp1 lt cmp2), dst, src
1760 // scenario 2:
1761 // java code : cmp1 >= cmp2 ? dst : src
1762 // transformed to : CMove dst, (cmp1 lt cmp2), dst, src
1763 void MacroAssembler::cmov_fp_cmp_fp_lt(FloatRegister cmp1, FloatRegister cmp2,
1764 FloatRegister dst, FloatRegister src,
1765 bool cmp_single, bool cmov_single) {
1766 Label no_set;
1767 if (cmp_single) {
1768 // jump if cmp1 >= cmp2
1769 // not jump (i.e. move src to dst) if cmp1 < cmp2 or either is NaN
1770 float_bge(cmp1, cmp2, no_set);
1771 } else {
1772 double_bge(cmp1, cmp2, no_set);
1773 }
1774 if (cmov_single) {
1775 fmv_s(dst, src);
1776 } else {
1777 fmv_d(dst, src);
1778 }
1779 bind(no_set);
1780 }
1781
1782 void MacroAssembler::cmov_fp_cmp_fp_gt(FloatRegister cmp1, FloatRegister cmp2,
1783 FloatRegister dst, FloatRegister src,
1784 bool cmp_single, bool cmov_single) {
1785 Label no_set;
1786 if (cmp_single) {
1787 // jump if cmp1 <= cmp2 or either is NaN
1788 // not jump (i.e. move src to dst) if cmp1 > cmp2
1789 float_ble(cmp1, cmp2, no_set, false, true);
1790 } else {
1791 double_ble(cmp1, cmp2, no_set, false, true);
1792 }
1793 if (cmov_single) {
1794 fmv_s(dst, src);
1795 } else {
1796 fmv_d(dst, src);
1797 }
1798 bind(no_set);
1799 }
1800
1801 // Float compare branch instructions
1802
1803 #define INSN(NAME, FLOATCMP, BRANCH) \
1804 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) { \
1805 FLOATCMP##_s(t0, Rs1, Rs2); \
1806 BRANCH(t0, l, is_far); \
1807 } \
1808 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, bool is_far, bool is_unordered) { \
1809 FLOATCMP##_d(t0, Rs1, Rs2); \
1810 BRANCH(t0, l, is_far); \
1811 }
1812
1813 INSN(beq, feq, bnez);
1814 INSN(bne, feq, beqz);
1815
1816 #undef INSN
1817
1818
1819 #define INSN(NAME, FLOATCMP1, FLOATCMP2) \
1820 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1821 bool is_far, bool is_unordered) { \
1822 if (is_unordered) { \
1823 /* jump if either source is NaN or condition is expected */ \
1824 FLOATCMP2##_s(t0, Rs2, Rs1); \
1825 beqz(t0, l, is_far); \
1826 } else { \
1827 /* jump if no NaN in source and condition is expected */ \
1828 FLOATCMP1##_s(t0, Rs1, Rs2); \
1829 bnez(t0, l, is_far); \
1830 } \
1831 } \
1832 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1833 bool is_far, bool is_unordered) { \
1834 if (is_unordered) { \
1835 /* jump if either source is NaN or condition is expected */ \
1836 FLOATCMP2##_d(t0, Rs2, Rs1); \
1837 beqz(t0, l, is_far); \
1838 } else { \
1839 /* jump if no NaN in source and condition is expected */ \
1840 FLOATCMP1##_d(t0, Rs1, Rs2); \
1841 bnez(t0, l, is_far); \
1842 } \
1843 }
1844
1845 INSN(ble, fle, flt);
1846 INSN(blt, flt, fle);
1847
1848 #undef INSN
1849
1850 #define INSN(NAME, CMP) \
1851 void MacroAssembler::float_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1852 bool is_far, bool is_unordered) { \
1853 float_##CMP(Rs2, Rs1, l, is_far, is_unordered); \
1854 } \
1855 void MacroAssembler::double_##NAME(FloatRegister Rs1, FloatRegister Rs2, Label &l, \
1856 bool is_far, bool is_unordered) { \
1857 double_##CMP(Rs2, Rs1, l, is_far, is_unordered); \
1858 }
1859
1860 INSN(bgt, blt);
1861 INSN(bge, ble);
1862
1863 #undef INSN
1864
1865 void MacroAssembler::csrr(Register Rd, unsigned csr) {
1866 // These three are specified in zicntr and are unused.
1867 // Before adding use-cases add the appropriate hwprobe and flag.
1868 assert(csr != CSR_INSTRET && csr != CSR_CYCLE && csr != CSR_TIME,
1869 "Not intended for use without enabling zicntr.");
1870 csrrs(Rd, csr, x0);
1871 }
1872
1873 #define INSN(NAME, OPFUN) \
1874 void MacroAssembler::NAME(unsigned csr, Register Rs) { \
1875 OPFUN(x0, csr, Rs); \
1876 }
1877
1878 INSN(csrw, csrrw);
1879 INSN(csrs, csrrs);
1880 INSN(csrc, csrrc);
1881
1882 #undef INSN
1883
1884 #define INSN(NAME, OPFUN) \
1885 void MacroAssembler::NAME(unsigned csr, unsigned imm) { \
1886 OPFUN(x0, csr, imm); \
1887 }
1888
1889 INSN(csrwi, csrrwi);
1890 INSN(csrsi, csrrsi);
1891 INSN(csrci, csrrci);
1892
1893 #undef INSN
1894
1895 #define INSN(NAME, CSR) \
1896 void MacroAssembler::NAME(Register Rd, Register Rs) { \
1897 csrrw(Rd, CSR, Rs); \
1898 }
1899
1900 INSN(fscsr, CSR_FCSR);
1901 INSN(fsrm, CSR_FRM);
1902 INSN(fsflags, CSR_FFLAGS);
1903
1904 #undef INSN
1905
1906 #define INSN(NAME) \
1907 void MacroAssembler::NAME(Register Rs) { \
1908 NAME(x0, Rs); \
1909 }
1910
1911 INSN(fscsr);
1912 INSN(fsrm);
1913 INSN(fsflags);
1914
1915 #undef INSN
1916
1917 void MacroAssembler::fsrmi(Register Rd, unsigned imm) {
1918 guarantee(imm < 5, "Rounding Mode is invalid in Rounding Mode register");
1919 csrrwi(Rd, CSR_FRM, imm);
1920 }
1921
1922 void MacroAssembler::fsflagsi(Register Rd, unsigned imm) {
1923 csrrwi(Rd, CSR_FFLAGS, imm);
1924 }
1925
1926 #define INSN(NAME) \
1927 void MacroAssembler::NAME(unsigned imm) { \
1928 NAME(x0, imm); \
1929 }
1930
1931 INSN(fsrmi);
1932 INSN(fsflagsi);
1933
1934 #undef INSN
1935
1936 void MacroAssembler::restore_cpu_control_state_after_jni(Register tmp) {
1937 if (RestoreMXCSROnJNICalls) {
1938 Label skip_fsrmi;
1939 frrm(tmp);
1940 // Set FRM to the state we need. We do want Round to Nearest.
1941 // We don't want non-IEEE rounding modes.
1942 guarantee(RoundingMode::rne == 0, "must be");
1943 beqz(tmp, skip_fsrmi); // Only reset FRM if it's wrong
1944 fsrmi(RoundingMode::rne);
1945 bind(skip_fsrmi);
1946 }
1947 }
1948
1949 void MacroAssembler::push_reg(Register Rs) {
1950 subi(esp, esp, wordSize);
1951 sd(Rs, Address(esp, 0));
1952 }
1953
1954 void MacroAssembler::pop_reg(Register Rd) {
1955 ld(Rd, Address(esp, 0));
1956 addi(esp, esp, wordSize);
1957 }
1958
1959 int MacroAssembler::bitset_to_regs(unsigned int bitset, unsigned char* regs) {
1960 int count = 0;
1961 // Scan bitset to accumulate register pairs
1962 for (int reg = 31; reg >= 0; reg--) {
1963 if ((1U << 31) & bitset) {
1964 regs[count++] = reg;
1965 }
1966 bitset <<= 1;
1967 }
1968 return count;
1969 }
1970
1971 // Push integer registers in the bitset supplied. Don't push sp.
1972 // Return the number of words pushed
1973 int MacroAssembler::push_reg(RegSet regset, Register stack) {
1974 if (regset.bits() == 0) {
1975 return 0;
1976 }
1977 auto bitset = integer_cast<unsigned int>(regset.bits());
1978 DEBUG_ONLY(int words_pushed = 0;)
1979 unsigned char regs[32];
1980 int count = bitset_to_regs(bitset, regs);
1981 // reserve one slot to align for odd count
1982 int offset = is_even(count) ? 0 : wordSize;
1983
1984 if (count) {
1985 sub(stack, stack, count * wordSize + offset);
1986 }
1987 for (int i = count - 1; i >= 0; i--) {
1988 sd(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
1989 DEBUG_ONLY(words_pushed++;)
1990 }
1991
1992 assert(words_pushed == count, "oops, pushed != count");
1993
1994 return count;
1995 }
1996
1997 int MacroAssembler::pop_reg(RegSet regset, Register stack) {
1998 if (regset.bits() == 0) {
1999 return 0;
2000 }
2001 auto bitset = integer_cast<unsigned int>(regset.bits());
2002 DEBUG_ONLY(int words_popped = 0;)
2003 unsigned char regs[32];
2004 int count = bitset_to_regs(bitset, regs);
2005 // reserve one slot to align for odd count
2006 int offset = is_even(count) ? 0 : wordSize;
2007
2008 for (int i = count - 1; i >= 0; i--) {
2009 ld(as_Register(regs[i]), Address(stack, (count - 1 - i) * wordSize + offset));
2010 DEBUG_ONLY(words_popped++;)
2011 }
2012
2013 if (count) {
2014 add(stack, stack, count * wordSize + offset);
2015 }
2016 assert(words_popped == count, "oops, popped != count");
2017
2018 return count;
2019 }
2020
2021 // Push floating-point registers in the bitset supplied.
2022 // Return the number of words pushed
2023 int MacroAssembler::push_fp(FloatRegSet regset, Register stack) {
2024 if (regset.bits() == 0) {
2025 return 0;
2026 }
2027 auto bitset = integer_cast<unsigned int>(regset.bits());
2028 DEBUG_ONLY(int words_pushed = 0;)
2029 unsigned char regs[32];
2030 int count = bitset_to_regs(bitset, regs);
2031 int push_slots = count + (count & 1);
2032
2033 if (count) {
2034 subi(stack, stack, push_slots * wordSize);
2035 }
2036
2037 for (int i = count - 1; i >= 0; i--) {
2038 fsd(as_FloatRegister(regs[i]), Address(stack, (push_slots - 1 - i) * wordSize));
2039 DEBUG_ONLY(words_pushed++;)
2040 }
2041
2042 assert(words_pushed == count, "oops, pushed(%d) != count(%d)", words_pushed, count);
2043
2044 return count;
2045 }
2046
2047 int MacroAssembler::pop_fp(FloatRegSet regset, Register stack) {
2048 if (regset.bits() == 0) {
2049 return 0;
2050 }
2051 auto bitset = integer_cast<unsigned int>(regset.bits());
2052 DEBUG_ONLY(int words_popped = 0;)
2053 unsigned char regs[32];
2054 int count = bitset_to_regs(bitset, regs);
2055 int pop_slots = count + (count & 1);
2056
2057 for (int i = count - 1; i >= 0; i--) {
2058 fld(as_FloatRegister(regs[i]), Address(stack, (pop_slots - 1 - i) * wordSize));
2059 DEBUG_ONLY(words_popped++;)
2060 }
2061
2062 if (count) {
2063 addi(stack, stack, pop_slots * wordSize);
2064 }
2065
2066 assert(words_popped == count, "oops, popped(%d) != count(%d)", words_popped, count);
2067
2068 return count;
2069 }
2070
2071 /**
2072 * Emits code to update CRC-32 with a byte value according to constants in table
2073 *
2074 * @param [in,out]crc Register containing the crc.
2075 * @param [in]val Register containing the byte to fold into the CRC.
2076 * @param [in]table Register containing the table of crc constants.
2077 *
2078 * uint32_t crc;
2079 * val = crc_table[(val ^ crc) & 0xFF];
2080 * crc = val ^ (crc >> 8);
2081 *
2082 */
2083 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2084 assert_different_registers(crc, val, table);
2085
2086 xorr(val, val, crc);
2087 zext(val, val, 8);
2088 shadd(val, val, table, val, 2);
2089 lwu(val, Address(val));
2090 srli(crc, crc, 8);
2091 xorr(crc, val, crc);
2092 }
2093
2094 /**
2095 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2096 *
2097 * @param [in,out]crc Register containing the crc.
2098 * @param [in]v Register containing the 32-bit to fold into the CRC.
2099 * @param [in]table0 Register containing table 0 of crc constants.
2100 * @param [in]table1 Register containing table 1 of crc constants.
2101 * @param [in]table2 Register containing table 2 of crc constants.
2102 * @param [in]table3 Register containing table 3 of crc constants.
2103 *
2104 * uint32_t crc;
2105 * v = crc ^ v
2106 * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2107 *
2108 */
2109 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp1, Register tmp2, Register tmp3,
2110 Register table0, Register table1, Register table2, Register table3, bool upper) {
2111 assert_different_registers(crc, v, tmp1, tmp2, tmp3, table0, table1, table2, table3);
2112
2113 if (upper)
2114 srli(v, v, 32);
2115 xorr(v, v, crc);
2116
2117 zext(tmp1, v, 8);
2118 shadd(tmp1, tmp1, table3, tmp2, 2);
2119 lwu(crc, Address(tmp1));
2120
2121 slli(tmp1, v, 16);
2122 slli(tmp3, v, 8);
2123
2124 srliw(tmp1, tmp1, 24);
2125 srliw(tmp3, tmp3, 24);
2126
2127 shadd(tmp1, tmp1, table2, tmp1, 2);
2128 lwu(tmp2, Address(tmp1));
2129
2130 shadd(tmp3, tmp3, table1, tmp3, 2);
2131 xorr(crc, crc, tmp2);
2132
2133 lwu(tmp2, Address(tmp3));
2134 // It is more optimal to use 'srli' instead of 'srliw' for case when it is not necessary to clean upper bits
2135 if (upper)
2136 srli(tmp1, v, 24);
2137 else
2138 srliw(tmp1, v, 24);
2139
2140 // no need to clear bits other than lowest two
2141 shadd(tmp1, tmp1, table0, tmp1, 2);
2142 xorr(crc, crc, tmp2);
2143 lwu(tmp2, Address(tmp1));
2144 xorr(crc, crc, tmp2);
2145 }
2146
2147
2148 #ifdef COMPILER2
2149 // This improvement (vectorization) is based on java.base/share/native/libzip/zlib/zcrc32.c.
2150 // To make it, following steps are taken:
2151 // 1. in zcrc32.c, modify N to 16 and related code,
2152 // 2. re-generate the tables needed, we use tables of (N == 16, W == 4)
2153 // 3. finally vectorize the code (original implementation in zcrc32.c is just scalar code).
2154 // New tables for vector version is after table3.
2155 void MacroAssembler::vector_update_crc32(Register crc, Register buf, Register len,
2156 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5,
2157 Register table0, Register table3) {
2158 assert_different_registers(t1, crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp5, table0, table3);
2159 const int N = 16, W = 4;
2160 const int64_t single_table_size = 256;
2161 const Register blks = tmp2;
2162 const Register tmpTable = tmp3, tableN16 = tmp4;
2163 const VectorRegister vcrc = v4, vword = v8, vtmp = v12;
2164 Label VectorLoop;
2165 Label LastBlock;
2166
2167 add(tableN16, table3, 1 * single_table_size * sizeof(juint), tmp1);
2168 mv(tmp5, 0xff);
2169
2170 if (MaxVectorSize == 16) {
2171 vsetivli(zr, N, Assembler::e32, Assembler::m4, Assembler::ma, Assembler::ta);
2172 } else if (MaxVectorSize == 32) {
2173 vsetivli(zr, N, Assembler::e32, Assembler::m2, Assembler::ma, Assembler::ta);
2174 } else {
2175 assert(MaxVectorSize > 32, "sanity");
2176 vsetivli(zr, N, Assembler::e32, Assembler::m1, Assembler::ma, Assembler::ta);
2177 }
2178
2179 vmv_v_x(vcrc, zr);
2180 vmv_s_x(vcrc, crc);
2181
2182 // multiple of 64
2183 srli(blks, len, 6);
2184 slli(t1, blks, 6);
2185 sub(len, len, t1);
2186 subi(blks, blks, 1);
2187 blez(blks, LastBlock);
2188
2189 bind(VectorLoop);
2190 {
2191 mv(tmpTable, tableN16);
2192
2193 vle32_v(vword, buf);
2194 vxor_vv(vword, vword, vcrc);
2195
2196 addi(buf, buf, N*4);
2197
2198 vand_vx(vtmp, vword, tmp5);
2199 vsll_vi(vtmp, vtmp, 2);
2200 vluxei32_v(vcrc, tmpTable, vtmp);
2201
2202 mv(tmp1, 1);
2203 for (int k = 1; k < W; k++) {
2204 addi(tmpTable, tmpTable, single_table_size*4);
2205
2206 slli(t1, tmp1, 3);
2207 vsrl_vx(vtmp, vword, t1);
2208
2209 vand_vx(vtmp, vtmp, tmp5);
2210 vsll_vi(vtmp, vtmp, 2);
2211 vluxei32_v(vtmp, tmpTable, vtmp);
2212
2213 vxor_vv(vcrc, vcrc, vtmp);
2214
2215 addi(tmp1, tmp1, 1);
2216 }
2217
2218 subi(blks, blks, 1);
2219 bgtz(blks, VectorLoop);
2220 }
2221
2222 bind(LastBlock);
2223 {
2224 vle32_v(vtmp, buf);
2225 vxor_vv(vcrc, vcrc, vtmp);
2226 mv(crc, zr);
2227 for (int i = 0; i < N; i++) {
2228 vmv_x_s(tmp2, vcrc);
2229 // in vmv_x_s, the value is sign-extended to SEW bits, but we need zero-extended here.
2230 zext(tmp2, tmp2, 32);
2231 vslidedown_vi(vcrc, vcrc, 1);
2232 xorr(crc, crc, tmp2);
2233 for (int j = 0; j < W; j++) {
2234 andr(t1, crc, tmp5);
2235 shadd(t1, t1, table0, tmp1, 2);
2236 lwu(t1, Address(t1, 0));
2237 srli(tmp2, crc, 8);
2238 xorr(crc, tmp2, t1);
2239 }
2240 }
2241 addi(buf, buf, N*4);
2242 }
2243 }
2244
2245 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16(VectorRegister vx, VectorRegister vt,
2246 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2247 Register buf, Register tmp, const int STEP) {
2248 assert_different_registers(vx, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2249 vclmul_vv(vtmp1, vx, vt);
2250 vclmulh_vv(vtmp2, vx, vt);
2251 vle64_v(vtmp4, buf); addi(buf, buf, STEP);
2252 // low parts
2253 vredxor_vs(vtmp3, vtmp1, vtmp4);
2254 // high parts
2255 vslidedown_vi(vx, vtmp4, 1);
2256 vredxor_vs(vtmp1, vtmp2, vx);
2257 // merge low and high back
2258 vslideup_vi(vx, vtmp1, 1);
2259 vmv_x_s(tmp, vtmp3);
2260 vmv_s_x(vx, tmp);
2261 }
2262
2263 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_2(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2264 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2265 Register tmp) {
2266 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2267 vclmul_vv(vtmp1, vx, vt);
2268 vclmulh_vv(vtmp2, vx, vt);
2269 // low parts
2270 vredxor_vs(vtmp3, vtmp1, vy);
2271 // high parts
2272 vslidedown_vi(vtmp4, vy, 1);
2273 vredxor_vs(vtmp1, vtmp2, vtmp4);
2274 // merge low and high back
2275 vslideup_vi(vx, vtmp1, 1);
2276 vmv_x_s(tmp, vtmp3);
2277 vmv_s_x(vx, tmp);
2278 }
2279
2280 void MacroAssembler::crc32_vclmul_fold_16_bytes_vectorsize_16_3(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2281 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4,
2282 Register tmp) {
2283 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2284 vclmul_vv(vtmp1, vx, vt);
2285 vclmulh_vv(vtmp2, vx, vt);
2286 // low parts
2287 vredxor_vs(vtmp3, vtmp1, vy);
2288 // high parts
2289 vslidedown_vi(vtmp4, vy, 1);
2290 vredxor_vs(vtmp1, vtmp2, vtmp4);
2291 // merge low and high back
2292 vslideup_vi(vy, vtmp1, 1);
2293 vmv_x_s(tmp, vtmp3);
2294 vmv_s_x(vy, tmp);
2295 }
2296
2297 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_16(Register crc, Register buf, Register len,
2298 Register vclmul_table, Register tmp1, Register tmp2) {
2299 assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
2300 assert(MaxVectorSize == 16, "sanity");
2301
2302 const int TABLE_STEP = 16;
2303 const int STEP = 16;
2304 const int LOOP_STEP = 128;
2305 const int N = 2;
2306
2307 Register loop_step = t1;
2308
2309 // ======== preparation ========
2310
2311 mv(loop_step, LOOP_STEP);
2312 sub(len, len, loop_step);
2313
2314 vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2315 vle64_v(v0, buf); addi(buf, buf, STEP);
2316 vle64_v(v1, buf); addi(buf, buf, STEP);
2317 vle64_v(v2, buf); addi(buf, buf, STEP);
2318 vle64_v(v3, buf); addi(buf, buf, STEP);
2319 vle64_v(v4, buf); addi(buf, buf, STEP);
2320 vle64_v(v5, buf); addi(buf, buf, STEP);
2321 vle64_v(v6, buf); addi(buf, buf, STEP);
2322 vle64_v(v7, buf); addi(buf, buf, STEP);
2323
2324 vmv_v_x(v31, zr);
2325 vsetivli(zr, 1, Assembler::e32, Assembler::m1, Assembler::mu, Assembler::tu);
2326 vmv_s_x(v31, crc);
2327 vsetivli(zr, N, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2328 vxor_vv(v0, v0, v31);
2329
2330 // load table
2331 vle64_v(v31, vclmul_table);
2332
2333 Label L_16_bytes_loop;
2334 j(L_16_bytes_loop);
2335
2336
2337 // ======== folding 128 bytes in data buffer per round ========
2338
2339 align(OptoLoopAlignment);
2340 bind(L_16_bytes_loop);
2341 {
2342 crc32_vclmul_fold_16_bytes_vectorsize_16(v0, v31, v8, v9, v10, v11, buf, tmp2, STEP);
2343 crc32_vclmul_fold_16_bytes_vectorsize_16(v1, v31, v12, v13, v14, v15, buf, tmp2, STEP);
2344 crc32_vclmul_fold_16_bytes_vectorsize_16(v2, v31, v16, v17, v18, v19, buf, tmp2, STEP);
2345 crc32_vclmul_fold_16_bytes_vectorsize_16(v3, v31, v20, v21, v22, v23, buf, tmp2, STEP);
2346 crc32_vclmul_fold_16_bytes_vectorsize_16(v4, v31, v24, v25, v26, v27, buf, tmp2, STEP);
2347 crc32_vclmul_fold_16_bytes_vectorsize_16(v5, v31, v8, v9, v10, v11, buf, tmp2, STEP);
2348 crc32_vclmul_fold_16_bytes_vectorsize_16(v6, v31, v12, v13, v14, v15, buf, tmp2, STEP);
2349 crc32_vclmul_fold_16_bytes_vectorsize_16(v7, v31, v16, v17, v18, v19, buf, tmp2, STEP);
2350 }
2351 sub(len, len, loop_step);
2352 bge(len, loop_step, L_16_bytes_loop);
2353
2354
2355 // ======== folding into 64 bytes from 128 bytes in register ========
2356
2357 // load table
2358 addi(vclmul_table, vclmul_table, TABLE_STEP);
2359 vle64_v(v31, vclmul_table);
2360
2361 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v0, v4, v31, v8, v9, v10, v11, tmp2);
2362 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v1, v5, v31, v12, v13, v14, v15, tmp2);
2363 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v2, v6, v31, v16, v17, v18, v19, tmp2);
2364 crc32_vclmul_fold_16_bytes_vectorsize_16_2(v3, v7, v31, v20, v21, v22, v23, tmp2);
2365
2366
2367 // ======== folding into 16 bytes from 64 bytes in register ========
2368
2369 addi(vclmul_table, vclmul_table, TABLE_STEP);
2370 vle64_v(v31, vclmul_table);
2371 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v0, v3, v31, v8, v9, v10, v11, tmp2);
2372
2373 addi(vclmul_table, vclmul_table, TABLE_STEP);
2374 vle64_v(v31, vclmul_table);
2375 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v1, v3, v31, v12, v13, v14, v15, tmp2);
2376
2377 addi(vclmul_table, vclmul_table, TABLE_STEP);
2378 vle64_v(v31, vclmul_table);
2379 crc32_vclmul_fold_16_bytes_vectorsize_16_3(v2, v3, v31, v16, v17, v18, v19, tmp2);
2380
2381 #undef FOLD_2_VCLMUL_3
2382
2383
2384 // ======== final: move result to scalar regsiters ========
2385
2386 vmv_x_s(tmp1, v3);
2387 vslidedown_vi(v1, v3, 1);
2388 vmv_x_s(tmp2, v1);
2389 }
2390
2391 void MacroAssembler::crc32_vclmul_fold_to_16_bytes_vectorsize_32(VectorRegister vx, VectorRegister vy, VectorRegister vt,
2392 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3, VectorRegister vtmp4) {
2393 assert_different_registers(vx, vy, vt, vtmp1, vtmp2, vtmp3, vtmp4);
2394 vclmul_vv(vtmp1, vx, vt);
2395 vclmulh_vv(vtmp2, vx, vt);
2396 // low parts
2397 vredxor_vs(vtmp3, vtmp1, vy);
2398 // high parts
2399 vslidedown_vi(vtmp4, vy, 1);
2400 vredxor_vs(vtmp1, vtmp2, vtmp4);
2401 // merge low and high back
2402 vslideup_vi(vy, vtmp1, 1);
2403 vmv_x_s(t1, vtmp3);
2404 vmv_s_x(vy, t1);
2405 }
2406
2407 void MacroAssembler::kernel_crc32_vclmul_fold_vectorsize_32(Register crc, Register buf, Register len,
2408 Register vclmul_table, Register tmp1, Register tmp2) {
2409 assert_different_registers(crc, buf, len, vclmul_table, tmp1, tmp2, t1);
2410 assert(MaxVectorSize >= 32, "sanity");
2411
2412 // utility: load table
2413 #define CRC32_VCLMUL_LOAD_TABLE(vt, rt, vtmp, rtmp) \
2414 vid_v(vtmp); \
2415 mv(rtmp, 2); \
2416 vremu_vx(vtmp, vtmp, rtmp); \
2417 vsll_vi(vtmp, vtmp, 3); \
2418 vluxei64_v(vt, rt, vtmp);
2419
2420 const int TABLE_STEP = 16;
2421 const int STEP = 128; // 128 bytes per round
2422 const int N = 2 * 8; // 2: 128-bits/64-bits, 8: 8 pairs of double 64-bits
2423
2424 Register step = tmp2;
2425
2426
2427 // ======== preparation ========
2428
2429 mv(step, STEP);
2430 sub(len, len, step); // 2 rounds of folding with carry-less multiplication
2431
2432 vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2433 // load data
2434 vle64_v(v4, buf);
2435 add(buf, buf, step);
2436
2437 // load table
2438 CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2439 // load mask,
2440 // v28 should already contains: 0, 8, 0, 8, ...
2441 vmseq_vi(v2, v28, 0);
2442 // now, v2 should contains: 101010...
2443 vmnand_mm(v1, v2, v2);
2444 // now, v1 should contains: 010101...
2445
2446 // initial crc
2447 vmv_v_x(v24, zr);
2448 vsetivli(zr, 1, Assembler::e32, Assembler::m4, Assembler::mu, Assembler::tu);
2449 vmv_s_x(v24, crc);
2450 vsetivli(zr, N, Assembler::e64, Assembler::m4, Assembler::mu, Assembler::tu);
2451 vxor_vv(v4, v4, v24);
2452
2453 Label L_128_bytes_loop;
2454 j(L_128_bytes_loop);
2455
2456
2457 // ======== folding 128 bytes in data buffer per round ========
2458
2459 align(OptoLoopAlignment);
2460 bind(L_128_bytes_loop);
2461 {
2462 // v4: data
2463 // v4: buf, reused
2464 // v8: table
2465 // v12: lows
2466 // v16: highs
2467 // v20: low_slides
2468 // v24: high_slides
2469 vclmul_vv(v12, v4, v8);
2470 vclmulh_vv(v16, v4, v8);
2471 vle64_v(v4, buf);
2472 add(buf, buf, step);
2473 // lows
2474 vslidedown_vi(v20, v12, 1);
2475 vmand_mm(v0, v2, v2);
2476 vxor_vv(v12, v12, v20, v0_t);
2477 // with buf data
2478 vxor_vv(v4, v4, v12, v0_t);
2479
2480 // highs
2481 vslideup_vi(v24, v16, 1);
2482 vmand_mm(v0, v1, v1);
2483 vxor_vv(v16, v16, v24, v0_t);
2484 // with buf data
2485 vxor_vv(v4, v4, v16, v0_t);
2486 }
2487 sub(len, len, step);
2488 bge(len, step, L_128_bytes_loop);
2489
2490
2491 // ======== folding into 64 bytes from 128 bytes in register ========
2492
2493 // load table
2494 addi(vclmul_table, vclmul_table, TABLE_STEP);
2495 CRC32_VCLMUL_LOAD_TABLE(v8, vclmul_table, v28, t1);
2496
2497 // v4: data, first (low) part, N/2 of 64-bits
2498 // v20: data, second (high) part, N/2 of 64-bits
2499 // v8: table
2500 // v10: lows
2501 // v12: highs
2502 // v14: low_slides
2503 // v16: high_slides
2504
2505 // high part
2506 vslidedown_vi(v20, v4, N/2);
2507
2508 vsetivli(zr, N/2, Assembler::e64, Assembler::m2, Assembler::mu, Assembler::tu);
2509
2510 vclmul_vv(v10, v4, v8);
2511 vclmulh_vv(v12, v4, v8);
2512
2513 // lows
2514 vslidedown_vi(v14, v10, 1);
2515 vmand_mm(v0, v2, v2);
2516 vxor_vv(v10, v10, v14, v0_t);
2517 // with data part 2
2518 vxor_vv(v4, v20, v10, v0_t);
2519
2520 // highs
2521 vslideup_vi(v16, v12, 1);
2522 vmand_mm(v0, v1, v1);
2523 vxor_vv(v12, v12, v16, v0_t);
2524 // with data part 2
2525 vxor_vv(v4, v20, v12, v0_t);
2526
2527
2528 // ======== folding into 16 bytes from 64 bytes in register ========
2529
2530 // v4: data, first part, 2 of 64-bits
2531 // v16: data, second part, 2 of 64-bits
2532 // v18: data, third part, 2 of 64-bits
2533 // v20: data, second part, 2 of 64-bits
2534 // v8: table
2535
2536 vslidedown_vi(v16, v4, 2);
2537 vslidedown_vi(v18, v4, 4);
2538 vslidedown_vi(v20, v4, 6);
2539
2540 vsetivli(zr, 2, Assembler::e64, Assembler::m1, Assembler::mu, Assembler::tu);
2541
2542 addi(vclmul_table, vclmul_table, TABLE_STEP);
2543 vle64_v(v8, vclmul_table);
2544 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v4, v20, v8, v28, v29, v30, v31);
2545
2546 addi(vclmul_table, vclmul_table, TABLE_STEP);
2547 vle64_v(v8, vclmul_table);
2548 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v16, v20, v8, v28, v29, v30, v31);
2549
2550 addi(vclmul_table, vclmul_table, TABLE_STEP);
2551 vle64_v(v8, vclmul_table);
2552 crc32_vclmul_fold_to_16_bytes_vectorsize_32(v18, v20, v8, v28, v29, v30, v31);
2553
2554
2555 // ======== final: move result to scalar regsiters ========
2556
2557 vmv_x_s(tmp1, v20);
2558 vslidedown_vi(v4, v20, 1);
2559 vmv_x_s(tmp2, v4);
2560
2561 #undef CRC32_VCLMUL_LOAD_TABLE
2562 }
2563
2564 // For more details of the algorithm, please check the paper:
2565 // "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction - Intel"
2566 //
2567 // Please also refer to the corresponding code in aarch64 or x86 ones.
2568 //
2569 // As the riscv carry-less multiplication is a bit different from the other platforms,
2570 // so the implementation itself is also a bit different from others.
2571
2572 void MacroAssembler::kernel_crc32_vclmul_fold(Register crc, Register buf, Register len,
2573 Register table0, Register table1, Register table2, Register table3,
2574 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
2575 const int64_t single_table_size = 256;
2576 const int64_t table_num = 8; // 4 for scalar, 4 for plain vector
2577 const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2578 Register vclmul_table = tmp3;
2579
2580 la(vclmul_table, table_addr);
2581 add(vclmul_table, vclmul_table, table_num * single_table_size * sizeof(juint), tmp1);
2582 la(table0, table_addr);
2583
2584 if (MaxVectorSize == 16) {
2585 kernel_crc32_vclmul_fold_vectorsize_16(crc, buf, len, vclmul_table, tmp1, tmp2);
2586 } else {
2587 kernel_crc32_vclmul_fold_vectorsize_32(crc, buf, len, vclmul_table, tmp1, tmp2);
2588 }
2589
2590 mv(crc, zr);
2591 update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2592 update_word_crc32(crc, tmp1, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2593 update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, false);
2594 update_word_crc32(crc, tmp2, tmp3, tmp4, tmp5, table0, table1, table2, table3, true);
2595 }
2596
2597 #endif // COMPILER2
2598
2599 /**
2600 * @param crc register containing existing CRC (32-bit)
2601 * @param buf register pointing to input byte buffer (byte*)
2602 * @param len register containing number of bytes
2603 * @param table register that will contain address of CRC table
2604 * @param tmp scratch registers
2605 */
2606 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2607 Register table0, Register table1, Register table2, Register table3,
2608 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register tmp6) {
2609 assert_different_registers(crc, buf, len, table0, table1, table2, table3, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2610 Label L_vector_entry,
2611 L_unroll_loop,
2612 L_by4_loop_entry, L_by4_loop,
2613 L_by1_loop, L_exit, L_skip1, L_skip2;
2614
2615 const int64_t single_table_size = 256;
2616 const int64_t unroll = 16;
2617 const int64_t unroll_words = unroll*wordSize;
2618
2619 // tmp5 = 0xffffffff
2620 notr(tmp5, zr);
2621 srli(tmp5, tmp5, 32);
2622
2623 andn(crc, tmp5, crc);
2624
2625 const ExternalAddress table_addr = StubRoutines::crc_table_addr();
2626 la(table0, table_addr);
2627 add(table1, table0, 1 * single_table_size * sizeof(juint), tmp1);
2628 add(table2, table0, 2 * single_table_size * sizeof(juint), tmp1);
2629 add(table3, table2, 1 * single_table_size * sizeof(juint), tmp1);
2630
2631 // Ensure basic 4-byte alignment of input byte buffer
2632 mv(tmp1, 4);
2633 blt(len, tmp1, L_by1_loop);
2634 test_bit(tmp1, buf, 0);
2635 beqz(tmp1, L_skip1);
2636 subiw(len, len, 1);
2637 lbu(tmp1, Address(buf));
2638 addi(buf, buf, 1);
2639 update_byte_crc32(crc, tmp1, table0);
2640 bind(L_skip1);
2641 test_bit(tmp1, buf, 1);
2642 beqz(tmp1, L_skip2);
2643 subiw(len, len, 2);
2644 lhu(tmp1, Address(buf));
2645 addi(buf, buf, 2);
2646 zext(tmp2, tmp1, 8);
2647 update_byte_crc32(crc, tmp2, table0);
2648 srli(tmp2, tmp1, 8);
2649 update_byte_crc32(crc, tmp2, table0);
2650 bind(L_skip2);
2651
2652 #ifdef COMPILER2
2653 if (UseRVV) {
2654 const int64_t tmp_limit =
2655 UseZvbc ? 128 * 3 // 3 rounds of folding with carry-less multiplication
2656 : MaxVectorSize >= 32 ? unroll_words*3 : unroll_words*5;
2657 mv(tmp1, tmp_limit);
2658 bge(len, tmp1, L_vector_entry);
2659 }
2660 #endif // COMPILER2
2661
2662 mv(tmp1, unroll_words);
2663 blt(len, tmp1, L_by4_loop_entry);
2664
2665 const Register loop_buf_end = tmp3;
2666
2667 align(CodeEntryAlignment);
2668 // Entry for L_unroll_loop
2669 add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2670 andi(len, len, unroll_words - 1); // len = (len % unroll_words)
2671 sub(loop_buf_end, loop_buf_end, len);
2672 bind(L_unroll_loop);
2673 for (int i = 0; i < unroll; i++) {
2674 ld(tmp1, Address(buf, i*wordSize));
2675 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2676 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, true);
2677 }
2678
2679 addi(buf, buf, unroll_words);
2680 blt(buf, loop_buf_end, L_unroll_loop);
2681
2682 bind(L_by4_loop_entry);
2683 mv(tmp1, 4);
2684 blt(len, tmp1, L_by1_loop);
2685 add(loop_buf_end, buf, len); // loop_buf_end will be used as endpoint for loop below
2686 andi(len, len, 3);
2687 sub(loop_buf_end, loop_buf_end, len);
2688 bind(L_by4_loop);
2689 lwu(tmp1, Address(buf));
2690 update_word_crc32(crc, tmp1, tmp2, tmp4, tmp6, table0, table1, table2, table3, false);
2691 addi(buf, buf, 4);
2692 blt(buf, loop_buf_end, L_by4_loop);
2693
2694 bind(L_by1_loop);
2695 beqz(len, L_exit);
2696
2697 subiw(len, len, 1);
2698 lbu(tmp1, Address(buf));
2699 update_byte_crc32(crc, tmp1, table0);
2700 beqz(len, L_exit);
2701
2702 subiw(len, len, 1);
2703 lbu(tmp1, Address(buf, 1));
2704 update_byte_crc32(crc, tmp1, table0);
2705 beqz(len, L_exit);
2706
2707 subiw(len, len, 1);
2708 lbu(tmp1, Address(buf, 2));
2709 update_byte_crc32(crc, tmp1, table0);
2710
2711 #ifdef COMPILER2
2712 // put vector code here, otherwise "offset is too large" error occurs.
2713 if (UseRVV) {
2714 // only need to jump exit when UseRVV == true, it's a jump from end of block `L_by1_loop`.
2715 j(L_exit);
2716
2717 bind(L_vector_entry);
2718 if (UseZvbc) { // carry-less multiplication
2719 kernel_crc32_vclmul_fold(crc, buf, len,
2720 table0, table1, table2, table3,
2721 tmp1, tmp2, tmp3, tmp4, tmp6);
2722 } else { // plain vector instructions
2723 vector_update_crc32(crc, buf, len, tmp1, tmp2, tmp3, tmp4, tmp6, table0, table3);
2724 }
2725
2726 bgtz(len, L_by4_loop_entry);
2727 }
2728 #endif // COMPILER2
2729
2730 bind(L_exit);
2731 andn(crc, tmp5, crc);
2732 }
2733
2734 #ifdef COMPILER2
2735 // Push vector registers in the bitset supplied.
2736 // Return the number of words pushed
2737 int MacroAssembler::push_v(VectorRegSet regset, Register stack) {
2738 if (regset.bits() == 0) {
2739 return 0;
2740 }
2741 auto bitset = integer_cast<unsigned int>(regset.bits());
2742 int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2743
2744 // Scan bitset to accumulate register pairs
2745 unsigned char regs[32];
2746 int count = bitset_to_regs(bitset, regs);
2747
2748 for (int i = 0; i < count; i++) {
2749 sub(stack, stack, vector_size_in_bytes);
2750 vs1r_v(as_VectorRegister(regs[i]), stack);
2751 }
2752
2753 return count * vector_size_in_bytes / wordSize;
2754 }
2755
2756 int MacroAssembler::pop_v(VectorRegSet regset, Register stack) {
2757 if (regset.bits() == 0) {
2758 return 0;
2759 }
2760 auto bitset = integer_cast<unsigned int>(regset.bits());
2761 int vector_size_in_bytes = Matcher::scalable_vector_reg_size(T_BYTE);
2762
2763 // Scan bitset to accumulate register pairs
2764 unsigned char regs[32];
2765 int count = bitset_to_regs(bitset, regs);
2766
2767 for (int i = count - 1; i >= 0; i--) {
2768 vl1r_v(as_VectorRegister(regs[i]), stack);
2769 add(stack, stack, vector_size_in_bytes);
2770 }
2771
2772 return count * vector_size_in_bytes / wordSize;
2773 }
2774 #endif // COMPILER2
2775
2776 void MacroAssembler::push_call_clobbered_registers_except(RegSet exclude) {
2777 // Push integer registers x7, x10-x17, x28-x31.
2778 push_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2779
2780 // Push float registers f0-f7, f10-f17, f28-f31.
2781 subi(sp, sp, wordSize * 20);
2782 int offset = 0;
2783 for (int i = 0; i < 32; i++) {
2784 if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2785 fsd(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2786 }
2787 }
2788 }
2789
2790 void MacroAssembler::pop_call_clobbered_registers_except(RegSet exclude) {
2791 int offset = 0;
2792 for (int i = 0; i < 32; i++) {
2793 if (i <= f7->encoding() || i >= f28->encoding() || (i >= f10->encoding() && i <= f17->encoding())) {
2794 fld(as_FloatRegister(i), Address(sp, wordSize * (offset++)));
2795 }
2796 }
2797 addi(sp, sp, wordSize * 20);
2798
2799 pop_reg(RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31) - exclude, sp);
2800 }
2801
2802 void MacroAssembler::push_CPU_state(bool save_vectors, int vector_size_in_bytes) {
2803 // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2804 push_reg(RegSet::range(x5, x31), sp);
2805
2806 // float registers
2807 subi(sp, sp, 32 * wordSize);
2808 for (int i = 0; i < 32; i++) {
2809 fsd(as_FloatRegister(i), Address(sp, i * wordSize));
2810 }
2811
2812 // vector registers
2813 if (save_vectors) {
2814 sub(sp, sp, vector_size_in_bytes * VectorRegister::number_of_registers);
2815 vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2816 for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2817 add(t0, sp, vector_size_in_bytes * i);
2818 vse64_v(as_VectorRegister(i), t0);
2819 }
2820 }
2821 }
2822
2823 void MacroAssembler::pop_CPU_state(bool restore_vectors, int vector_size_in_bytes) {
2824 // vector registers
2825 if (restore_vectors) {
2826 vsetvli(t0, x0, Assembler::e64, Assembler::m8);
2827 for (int i = 0; i < VectorRegister::number_of_registers; i += 8) {
2828 vle64_v(as_VectorRegister(i), sp);
2829 add(sp, sp, vector_size_in_bytes * 8);
2830 }
2831 }
2832
2833 // float registers
2834 for (int i = 0; i < 32; i++) {
2835 fld(as_FloatRegister(i), Address(sp, i * wordSize));
2836 }
2837 addi(sp, sp, 32 * wordSize);
2838
2839 // integer registers, except zr(x0) & ra(x1) & sp(x2) & gp(x3) & tp(x4)
2840 pop_reg(RegSet::range(x5, x31), sp);
2841 }
2842
2843 static int patch_offset_in_jal(address branch, int64_t offset) {
2844 assert(Assembler::is_simm21(offset) && ((offset % 2) == 0),
2845 "offset (%ld) is too large to be patched in one jal instruction!\n", offset);
2846 Assembler::patch(branch, 31, 31, (offset >> 20) & 0x1); // offset[20] ==> branch[31]
2847 Assembler::patch(branch, 30, 21, (offset >> 1) & 0x3ff); // offset[10:1] ==> branch[30:21]
2848 Assembler::patch(branch, 20, 20, (offset >> 11) & 0x1); // offset[11] ==> branch[20]
2849 Assembler::patch(branch, 19, 12, (offset >> 12) & 0xff); // offset[19:12] ==> branch[19:12]
2850 return MacroAssembler::instruction_size; // only one instruction
2851 }
2852
2853 static int patch_offset_in_conditional_branch(address branch, int64_t offset) {
2854 assert(Assembler::is_simm13(offset) && ((offset % 2) == 0),
2855 "offset (%ld) is too large to be patched in one beq/bge/bgeu/blt/bltu/bne instruction!\n", offset);
2856 Assembler::patch(branch, 31, 31, (offset >> 12) & 0x1); // offset[12] ==> branch[31]
2857 Assembler::patch(branch, 30, 25, (offset >> 5) & 0x3f); // offset[10:5] ==> branch[30:25]
2858 Assembler::patch(branch, 7, 7, (offset >> 11) & 0x1); // offset[11] ==> branch[7]
2859 Assembler::patch(branch, 11, 8, (offset >> 1) & 0xf); // offset[4:1] ==> branch[11:8]
2860 return MacroAssembler::instruction_size; // only one instruction
2861 }
2862
2863 static int patch_offset_in_pc_relative(address branch, int64_t offset) {
2864 const int PC_RELATIVE_INSTRUCTION_NUM = 2; // auipc, addi/jalr/load
2865 Assembler::patch(branch, 31, 12, ((offset + 0x800) >> 12) & 0xfffff); // Auipc. offset[31:12] ==> branch[31:12]
2866 Assembler::patch(branch + 4, 31, 20, offset & 0xfff); // Addi/Jalr/Load. offset[11:0] ==> branch[31:20]
2867 return PC_RELATIVE_INSTRUCTION_NUM * MacroAssembler::instruction_size;
2868 }
2869
2870 static int patch_addr_in_movptr1(address branch, address target) {
2871 int32_t lower = ((intptr_t)target << 35) >> 35;
2872 int64_t upper = ((intptr_t)target - lower) >> 29;
2873 Assembler::patch(branch + 0, 31, 12, upper & 0xfffff); // Lui. target[48:29] + target[28] ==> branch[31:12]
2874 Assembler::patch(branch + 4, 31, 20, (lower >> 17) & 0xfff); // Addi. target[28:17] ==> branch[31:20]
2875 Assembler::patch(branch + 12, 31, 20, (lower >> 6) & 0x7ff); // Addi. target[16: 6] ==> branch[31:20]
2876 Assembler::patch(branch + 20, 31, 20, lower & 0x3f); // Addi/Jalr/Load. target[ 5: 0] ==> branch[31:20]
2877 return MacroAssembler::movptr1_instruction_size;
2878 }
2879
2880 static int patch_addr_in_movptr2(address instruction_address, address target) {
2881 uintptr_t addr = (uintptr_t)target;
2882
2883 assert(addr < (1ull << 48), "48-bit overflow in address constant");
2884 unsigned int upper18 = (addr >> 30ull);
2885 int lower30 = (addr & 0x3fffffffu);
2886 int low12 = (lower30 << 20) >> 20;
2887 int mid18 = ((lower30 - low12) >> 12);
2888
2889 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 0), 31, 12, (upper18 & 0xfffff)); // Lui
2890 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 1), 31, 12, (mid18 & 0xfffff)); // Lui
2891 // Slli
2892 // Add
2893 Assembler::patch(instruction_address + (MacroAssembler::instruction_size * 4), 31, 20, low12 & 0xfff); // Addi/Jalr/Load
2894
2895 assert(MacroAssembler::target_addr_for_insn(instruction_address) == target, "Must be");
2896
2897 return MacroAssembler::movptr2_instruction_size;
2898 }
2899
2900 static int patch_imm_in_li16u(address branch, uint16_t target) {
2901 Assembler::patch(branch, 31, 12, target); // patch lui only
2902 return MacroAssembler::instruction_size;
2903 }
2904
2905 int MacroAssembler::patch_imm_in_li32(address branch, int32_t target) {
2906 const int LI32_INSTRUCTIONS_NUM = 2; // lui + addiw
2907 int64_t upper = (intptr_t)target;
2908 int32_t lower = (((int32_t)target) << 20) >> 20;
2909 upper -= lower;
2910 upper = (int32_t)upper;
2911 Assembler::patch(branch + 0, 31, 12, (upper >> 12) & 0xfffff); // Lui.
2912 Assembler::patch(branch + 4, 31, 20, lower & 0xfff); // Addiw.
2913 return LI32_INSTRUCTIONS_NUM * MacroAssembler::instruction_size;
2914 }
2915
2916 static long get_offset_of_jal(address insn_addr) {
2917 assert_cond(insn_addr != nullptr);
2918 long offset = 0;
2919 unsigned insn = Assembler::ld_instr(insn_addr);
2920 long val = (long)Assembler::sextract(insn, 31, 12);
2921 offset |= ((val >> 19) & 0x1) << 20;
2922 offset |= (val & 0xff) << 12;
2923 offset |= ((val >> 8) & 0x1) << 11;
2924 offset |= ((val >> 9) & 0x3ff) << 1;
2925 offset = (offset << 43) >> 43;
2926 return offset;
2927 }
2928
2929 static long get_offset_of_conditional_branch(address insn_addr) {
2930 long offset = 0;
2931 assert_cond(insn_addr != nullptr);
2932 unsigned insn = Assembler::ld_instr(insn_addr);
2933 offset = (long)Assembler::sextract(insn, 31, 31);
2934 offset = (offset << 12) | (((long)(Assembler::sextract(insn, 7, 7) & 0x1)) << 11);
2935 offset = offset | (((long)(Assembler::sextract(insn, 30, 25) & 0x3f)) << 5);
2936 offset = offset | (((long)(Assembler::sextract(insn, 11, 8) & 0xf)) << 1);
2937 offset = (offset << 41) >> 41;
2938 return offset;
2939 }
2940
2941 static long get_offset_of_pc_relative(address insn_addr) {
2942 long offset = 0;
2943 assert_cond(insn_addr != nullptr);
2944 offset = ((long)(Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12))) << 12; // Auipc.
2945 offset += ((long)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)); // Addi/Jalr/Load.
2946 offset = (offset << 32) >> 32;
2947 return offset;
2948 }
2949
2950 static address get_target_of_movptr1(address insn_addr) {
2951 assert_cond(insn_addr != nullptr);
2952 intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 29; // Lui.
2953 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)) << 17; // Addi.
2954 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 12), 31, 20)) << 6; // Addi.
2955 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 20), 31, 20)); // Addi/Jalr/Load.
2956 return (address) target_address;
2957 }
2958
2959 static address get_target_of_movptr2(address insn_addr) {
2960 assert_cond(insn_addr != nullptr);
2961 int32_t upper18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 0), 31, 12)) & 0xfffff); // Lui
2962 int32_t mid18 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 1), 31, 12)) & 0xfffff); // Lui
2963 // 2 // Slli
2964 // 3 // Add
2965 int32_t low12 = ((Assembler::sextract(Assembler::ld_instr(insn_addr + MacroAssembler::instruction_size * 4), 31, 20))); // Addi/Jalr/Load.
2966 address ret = (address)(((intptr_t)upper18<<30ll) + ((intptr_t)mid18<<12ll) + low12);
2967 return ret;
2968 }
2969
2970 address MacroAssembler::get_target_of_li32(address insn_addr) {
2971 assert_cond(insn_addr != nullptr);
2972 intptr_t target_address = (((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr), 31, 12)) & 0xfffff) << 12; // Lui.
2973 target_address += ((int64_t)Assembler::sextract(Assembler::ld_instr(insn_addr + 4), 31, 20)); // Addiw.
2974 return (address)target_address;
2975 }
2976
2977 // Patch any kind of instruction; there may be several instructions.
2978 // Return the total length (in bytes) of the instructions.
2979 int MacroAssembler::pd_patch_instruction_size(address instruction_address, address target) {
2980 assert_cond(instruction_address != nullptr);
2981 int64_t offset = target - instruction_address;
2982 if (MacroAssembler::is_jal_at(instruction_address)) { // jal
2983 return patch_offset_in_jal(instruction_address, offset);
2984 } else if (MacroAssembler::is_branch_at(instruction_address)) { // beq/bge/bgeu/blt/bltu/bne
2985 return patch_offset_in_conditional_branch(instruction_address, offset);
2986 } else if (MacroAssembler::is_pc_relative_at(instruction_address)) { // auipc, addi/jalr/load
2987 return patch_offset_in_pc_relative(instruction_address, offset);
2988 } else if (MacroAssembler::is_movptr1_at(instruction_address)) { // movptr1
2989 return patch_addr_in_movptr1(instruction_address, target);
2990 } else if (MacroAssembler::is_movptr2_at(instruction_address)) { // movptr2
2991 return patch_addr_in_movptr2(instruction_address, target);
2992 } else if (MacroAssembler::is_li32_at(instruction_address)) { // li32
2993 int64_t imm = (intptr_t)target;
2994 return patch_imm_in_li32(instruction_address, (int32_t)imm);
2995 } else if (MacroAssembler::is_li16u_at(instruction_address)) {
2996 int64_t imm = (intptr_t)target;
2997 return patch_imm_in_li16u(instruction_address, (uint16_t)imm);
2998 } else {
2999 #ifdef ASSERT
3000 tty->print_cr("pd_patch_instruction_size: instruction 0x%x at " INTPTR_FORMAT " could not be patched!\n",
3001 Assembler::ld_instr(instruction_address), p2i(instruction_address));
3002 Disassembler::decode(instruction_address - 16, instruction_address + 16);
3003 #endif
3004 ShouldNotReachHere();
3005 return -1;
3006 }
3007 }
3008
3009 address MacroAssembler::target_addr_for_insn(address insn_addr) {
3010 long offset = 0;
3011 assert_cond(insn_addr != nullptr);
3012 if (MacroAssembler::is_jal_at(insn_addr)) { // jal
3013 offset = get_offset_of_jal(insn_addr);
3014 } else if (MacroAssembler::is_branch_at(insn_addr)) { // beq/bge/bgeu/blt/bltu/bne
3015 offset = get_offset_of_conditional_branch(insn_addr);
3016 } else if (MacroAssembler::is_pc_relative_at(insn_addr)) { // auipc, addi/jalr/load
3017 offset = get_offset_of_pc_relative(insn_addr);
3018 } else if (MacroAssembler::is_movptr1_at(insn_addr)) { // movptr1
3019 return get_target_of_movptr1(insn_addr);
3020 } else if (MacroAssembler::is_movptr2_at(insn_addr)) { // movptr2
3021 return get_target_of_movptr2(insn_addr);
3022 } else if (MacroAssembler::is_li32_at(insn_addr)) { // li32
3023 return get_target_of_li32(insn_addr);
3024 } else {
3025 ShouldNotReachHere();
3026 }
3027 return address(((uintptr_t)insn_addr + offset));
3028 }
3029
3030 int MacroAssembler::patch_oop(address insn_addr, address o) {
3031 // OOPs are either narrow (32 bits) or wide (48 bits). We encode
3032 // narrow OOPs by setting the upper 16 bits in the first
3033 // instruction.
3034 if (MacroAssembler::is_li32_at(insn_addr)) {
3035 // Move narrow OOP
3036 uint32_t n = CompressedOops::narrow_oop_value(cast_to_oop(o));
3037 return patch_imm_in_li32(insn_addr, (int32_t)n);
3038 } else if (MacroAssembler::is_movptr1_at(insn_addr)) {
3039 // Move wide OOP
3040 return patch_addr_in_movptr1(insn_addr, o);
3041 } else if (MacroAssembler::is_movptr2_at(insn_addr)) {
3042 // Move wide OOP
3043 return patch_addr_in_movptr2(insn_addr, o);
3044 }
3045 ShouldNotReachHere();
3046 return -1;
3047 }
3048
3049 void MacroAssembler::reinit_heapbase() {
3050 if (UseCompressedOops) {
3051 if (Universe::is_fully_initialized()) {
3052 mv(xheapbase, CompressedOops::base());
3053 } else {
3054 ld(xheapbase, ExternalAddress(CompressedOops::base_addr()));
3055 }
3056 }
3057 }
3058
3059 void MacroAssembler::movptr(Register Rd, const Address &addr, Register temp) {
3060 assert(addr.getMode() == Address::literal, "must be applied to a literal address");
3061 relocate(addr.rspec(), [&] {
3062 movptr(Rd, addr.target(), temp);
3063 });
3064 }
3065
3066 void MacroAssembler::movptr(Register Rd, address addr, Register temp) {
3067 int offset = 0;
3068 movptr(Rd, addr, offset, temp);
3069 addi(Rd, Rd, offset);
3070 }
3071
3072 void MacroAssembler::movptr(Register Rd, address addr, int32_t &offset, Register temp) {
3073 uint64_t uimm64 = (uint64_t)addr;
3074 #ifndef PRODUCT
3075 {
3076 char buffer[64];
3077 os::snprintf_checked(buffer, sizeof(buffer), "0x%" PRIx64, uimm64);
3078 block_comment(buffer);
3079 }
3080 #endif
3081 assert(uimm64 < (1ull << 48), "48-bit overflow in address constant");
3082
3083 if (temp == noreg) {
3084 movptr1(Rd, uimm64, offset);
3085 } else {
3086 movptr2(Rd, uimm64, offset, temp);
3087 }
3088 }
3089
3090 void MacroAssembler::movptr1(Register Rd, uint64_t imm64, int32_t &offset) {
3091 // Load upper 31 bits
3092 //
3093 // In case of 11th bit of `lower` is 0, it's straightforward to understand.
3094 // In case of 11th bit of `lower` is 1, it's a bit tricky, to help understand,
3095 // imagine divide both `upper` and `lower` into 2 parts respectively, i.e.
3096 // [upper_20, upper_12], [lower_20, lower_12], they are the same just before
3097 // `lower = (lower << 52) >> 52;`.
3098 // After `upper -= lower;`,
3099 // upper_20' = upper_20 - (-1) == upper_20 + 1
3100 // upper_12 = 0x000
3101 // After `lui(Rd, upper);`, `Rd` = upper_20' << 12
3102 // Also divide `Rd` into 2 parts [Rd_20, Rd_12],
3103 // Rd_20 == upper_20'
3104 // Rd_12 == 0x000
3105 // After `addi(Rd, Rd, lower);`,
3106 // Rd_20 = upper_20' + (-1) == upper_20 + 1 - 1 = upper_20
3107 // Rd_12 = lower_12
3108 // So, finally Rd == [upper_20, lower_12]
3109 int64_t imm = imm64 >> 17;
3110 int64_t upper = imm, lower = imm;
3111 lower = (lower << 52) >> 52;
3112 upper -= lower;
3113 upper = (int32_t)upper;
3114 lui(Rd, upper);
3115 addi(Rd, Rd, lower);
3116
3117 // Load the rest 17 bits.
3118 slli(Rd, Rd, 11);
3119 addi(Rd, Rd, (imm64 >> 6) & 0x7ff);
3120 slli(Rd, Rd, 6);
3121
3122 // This offset will be used by following jalr/ld.
3123 offset = imm64 & 0x3f;
3124 }
3125
3126 void MacroAssembler::movptr2(Register Rd, uint64_t addr, int32_t &offset, Register tmp) {
3127 assert_different_registers(Rd, tmp, noreg);
3128
3129 // addr: [upper18, lower30[mid18, lower12]]
3130
3131 int64_t upper18 = addr >> 18;
3132 lui(tmp, upper18);
3133
3134 int64_t lower30 = addr & 0x3fffffff;
3135 int64_t mid18 = lower30, lower12 = lower30;
3136 lower12 = (lower12 << 52) >> 52;
3137 // For this tricky part (`mid18 -= lower12;` + `offset = lower12;`),
3138 // please refer to movptr1 above.
3139 mid18 -= (int32_t)lower12;
3140 lui(Rd, mid18);
3141
3142 slli(tmp, tmp, 18);
3143 add(Rd, Rd, tmp);
3144
3145 offset = lower12;
3146 }
3147
3148 // floating point imm move
3149 bool MacroAssembler::can_hf_imm_load(short imm) {
3150 jshort h_bits = (jshort)imm;
3151 if (h_bits == 0) {
3152 return true;
3153 }
3154 return can_zfa_zli_half_float(imm);
3155 }
3156
3157 bool MacroAssembler::can_fp_imm_load(float imm) {
3158 jint f_bits = jint_cast(imm);
3159 if (f_bits == 0) {
3160 return true;
3161 }
3162 return can_zfa_zli_float(imm);
3163 }
3164
3165 bool MacroAssembler::can_dp_imm_load(double imm) {
3166 julong d_bits = julong_cast(imm);
3167 if (d_bits == 0) {
3168 return true;
3169 }
3170 return can_zfa_zli_double(imm);
3171 }
3172
3173 void MacroAssembler::fli_h(FloatRegister Rd, short imm) {
3174 jshort h_bits = (jshort)imm;
3175 if (h_bits == 0) {
3176 fmv_h_x(Rd, zr);
3177 return;
3178 }
3179 int Rs = zfa_zli_lookup_half_float(h_bits);
3180 assert(Rs != -1, "Must be");
3181 _fli_h(Rd, Rs);
3182 }
3183
3184 void MacroAssembler::fli_s(FloatRegister Rd, float imm) {
3185 jint f_bits = jint_cast(imm);
3186 if (f_bits == 0) {
3187 fmv_w_x(Rd, zr);
3188 return;
3189 }
3190 int Rs = zfa_zli_lookup_float(f_bits);
3191 assert(Rs != -1, "Must be");
3192 _fli_s(Rd, Rs);
3193 }
3194
3195 void MacroAssembler::fli_d(FloatRegister Rd, double imm) {
3196 uint64_t d_bits = (uint64_t)julong_cast(imm);
3197 if (d_bits == 0) {
3198 fmv_d_x(Rd, zr);
3199 return;
3200 }
3201 int Rs = zfa_zli_lookup_double(d_bits);
3202 assert(Rs != -1, "Must be");
3203 _fli_d(Rd, Rs);
3204 }
3205
3206 void MacroAssembler::add(Register Rd, Register Rn, int64_t increment, Register tmp) {
3207 if (is_simm12(increment)) {
3208 addi(Rd, Rn, increment);
3209 } else {
3210 assert_different_registers(Rn, tmp);
3211 mv(tmp, increment);
3212 add(Rd, Rn, tmp);
3213 }
3214 }
3215
3216 void MacroAssembler::sub(Register Rd, Register Rn, int64_t decrement, Register tmp) {
3217 add(Rd, Rn, -decrement, tmp);
3218 }
3219
3220 void MacroAssembler::addw(Register Rd, Register Rn, int64_t increment, Register tmp) {
3221 if (is_simm12(increment)) {
3222 addiw(Rd, Rn, increment);
3223 } else {
3224 assert_different_registers(Rn, tmp);
3225 mv(tmp, increment);
3226 addw(Rd, Rn, tmp);
3227 }
3228 }
3229
3230 void MacroAssembler::subw(Register Rd, Register Rn, int64_t decrement, Register tmp) {
3231 addw(Rd, Rn, -decrement, tmp);
3232 }
3233
3234 void MacroAssembler::andrw(Register Rd, Register Rs1, Register Rs2) {
3235 andr(Rd, Rs1, Rs2);
3236 sext(Rd, Rd, 32);
3237 }
3238
3239 void MacroAssembler::orrw(Register Rd, Register Rs1, Register Rs2) {
3240 orr(Rd, Rs1, Rs2);
3241 sext(Rd, Rd, 32);
3242 }
3243
3244 void MacroAssembler::xorrw(Register Rd, Register Rs1, Register Rs2) {
3245 xorr(Rd, Rs1, Rs2);
3246 sext(Rd, Rd, 32);
3247 }
3248
3249 // Rd = Rs1 & (~Rd2)
3250 void MacroAssembler::andn(Register Rd, Register Rs1, Register Rs2) {
3251 if (UseZbb) {
3252 Assembler::andn(Rd, Rs1, Rs2);
3253 return;
3254 }
3255
3256 notr(Rd, Rs2);
3257 andr(Rd, Rs1, Rd);
3258 }
3259
3260 // Rd = Rs1 | (~Rd2)
3261 void MacroAssembler::orn(Register Rd, Register Rs1, Register Rs2) {
3262 if (UseZbb) {
3263 Assembler::orn(Rd, Rs1, Rs2);
3264 return;
3265 }
3266
3267 notr(Rd, Rs2);
3268 orr(Rd, Rs1, Rd);
3269 }
3270
3271 // Note: load_unsigned_short used to be called load_unsigned_word.
3272 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
3273 int off = offset();
3274 lhu(dst, src);
3275 return off;
3276 }
3277
3278 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
3279 int off = offset();
3280 lbu(dst, src);
3281 return off;
3282 }
3283
3284 int MacroAssembler::load_signed_short(Register dst, Address src) {
3285 int off = offset();
3286 lh(dst, src);
3287 return off;
3288 }
3289
3290 int MacroAssembler::load_signed_byte(Register dst, Address src) {
3291 int off = offset();
3292 lb(dst, src);
3293 return off;
3294 }
3295
3296 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
3297 switch (size_in_bytes) {
3298 case 8: ld(dst, src); break;
3299 case 4: is_signed ? lw(dst, src) : lwu(dst, src); break;
3300 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
3301 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
3302 default: ShouldNotReachHere();
3303 }
3304 }
3305
3306 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes) {
3307 switch (size_in_bytes) {
3308 case 8: sd(src, dst); break;
3309 case 4: sw(src, dst); break;
3310 case 2: sh(src, dst); break;
3311 case 1: sb(src, dst); break;
3312 default: ShouldNotReachHere();
3313 }
3314 }
3315
3316 // granularity is 1 OR 2 bytes per load. dst and src.base() allowed to be the same register
3317 void MacroAssembler::load_short_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
3318 if (granularity != 1 && granularity != 2) {
3319 ShouldNotReachHere();
3320 }
3321 if (AvoidUnalignedAccesses && (granularity != 2)) {
3322 assert_different_registers(dst, tmp);
3323 assert_different_registers(tmp, src.base());
3324 is_signed ? lb(tmp, Address(src.base(), src.offset() + 1)) : lbu(tmp, Address(src.base(), src.offset() + 1));
3325 slli(tmp, tmp, 8);
3326 lbu(dst, src);
3327 add(dst, dst, tmp);
3328 } else {
3329 is_signed ? lh(dst, src) : lhu(dst, src);
3330 }
3331 }
3332
3333 // granularity is 1, 2 OR 4 bytes per load, if granularity 2 or 4 then dst and src.base() allowed to be the same register
3334 void MacroAssembler::load_int_misaligned(Register dst, Address src, Register tmp, bool is_signed, int granularity) {
3335 if (AvoidUnalignedAccesses && (granularity != 4)) {
3336 switch(granularity) {
3337 case 1:
3338 assert_different_registers(dst, tmp, src.base());
3339 lbu(dst, src);
3340 lbu(tmp, Address(src.base(), src.offset() + 1));
3341 slli(tmp, tmp, 8);
3342 add(dst, dst, tmp);
3343 lbu(tmp, Address(src.base(), src.offset() + 2));
3344 slli(tmp, tmp, 16);
3345 add(dst, dst, tmp);
3346 is_signed ? lb(tmp, Address(src.base(), src.offset() + 3)) : lbu(tmp, Address(src.base(), src.offset() + 3));
3347 slli(tmp, tmp, 24);
3348 add(dst, dst, tmp);
3349 break;
3350 case 2:
3351 assert_different_registers(dst, tmp);
3352 assert_different_registers(tmp, src.base());
3353 is_signed ? lh(tmp, Address(src.base(), src.offset() + 2)) : lhu(tmp, Address(src.base(), src.offset() + 2));
3354 slli(tmp, tmp, 16);
3355 lhu(dst, src);
3356 add(dst, dst, tmp);
3357 break;
3358 default:
3359 ShouldNotReachHere();
3360 }
3361 } else {
3362 is_signed ? lw(dst, src) : lwu(dst, src);
3363 }
3364 }
3365
3366 // granularity is 1, 2, 4 or 8 bytes per load, if granularity 4 or 8 then dst and src.base() allowed to be same register
3367 void MacroAssembler::load_long_misaligned(Register dst, Address src, Register tmp, int granularity) {
3368 if (AvoidUnalignedAccesses && (granularity != 8)) {
3369 switch(granularity){
3370 case 1:
3371 assert_different_registers(dst, tmp, src.base());
3372 lbu(dst, src);
3373 lbu(tmp, Address(src.base(), src.offset() + 1));
3374 slli(tmp, tmp, 8);
3375 add(dst, dst, tmp);
3376 lbu(tmp, Address(src.base(), src.offset() + 2));
3377 slli(tmp, tmp, 16);
3378 add(dst, dst, tmp);
3379 lbu(tmp, Address(src.base(), src.offset() + 3));
3380 slli(tmp, tmp, 24);
3381 add(dst, dst, tmp);
3382 lbu(tmp, Address(src.base(), src.offset() + 4));
3383 slli(tmp, tmp, 32);
3384 add(dst, dst, tmp);
3385 lbu(tmp, Address(src.base(), src.offset() + 5));
3386 slli(tmp, tmp, 40);
3387 add(dst, dst, tmp);
3388 lbu(tmp, Address(src.base(), src.offset() + 6));
3389 slli(tmp, tmp, 48);
3390 add(dst, dst, tmp);
3391 lbu(tmp, Address(src.base(), src.offset() + 7));
3392 slli(tmp, tmp, 56);
3393 add(dst, dst, tmp);
3394 break;
3395 case 2:
3396 assert_different_registers(dst, tmp, src.base());
3397 lhu(dst, src);
3398 lhu(tmp, Address(src.base(), src.offset() + 2));
3399 slli(tmp, tmp, 16);
3400 add(dst, dst, tmp);
3401 lhu(tmp, Address(src.base(), src.offset() + 4));
3402 slli(tmp, tmp, 32);
3403 add(dst, dst, tmp);
3404 lhu(tmp, Address(src.base(), src.offset() + 6));
3405 slli(tmp, tmp, 48);
3406 add(dst, dst, tmp);
3407 break;
3408 case 4:
3409 assert_different_registers(dst, tmp);
3410 assert_different_registers(tmp, src.base());
3411 lwu(tmp, Address(src.base(), src.offset() + 4));
3412 slli(tmp, tmp, 32);
3413 lwu(dst, src);
3414 add(dst, dst, tmp);
3415 break;
3416 default:
3417 ShouldNotReachHere();
3418 }
3419 } else {
3420 ld(dst, src);
3421 }
3422 }
3423
3424 // reverse bytes in lower word, sign-extend
3425 // Rd[32:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24]
3426 void MacroAssembler::revbw(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3427 if (UseZbb) {
3428 rev8(Rd, Rs);
3429 srai(Rd, Rd, 32);
3430 return;
3431 }
3432 assert_different_registers(Rs, tmp1, tmp2);
3433 assert_different_registers(Rd, tmp1, tmp2);
3434 zext(tmp1, Rs, 8);
3435 slli(tmp1, tmp1, 8);
3436 for (int step = 8; step < 24; step += 8) {
3437 srli(tmp2, Rs, step);
3438 zext(tmp2, tmp2, 8);
3439 orr(tmp1, tmp1, tmp2);
3440 slli(tmp1, tmp1, 8);
3441 }
3442 srli(Rd, Rs, 24);
3443 zext(Rd, Rd, 8);
3444 orr(Rd, tmp1, Rd);
3445 sext(Rd, Rd, 32);
3446 }
3447
3448 // reverse bytes in doubleword
3449 // Rd[63:0] = Rs[7:0] Rs[15:8] Rs[23:16] Rs[31:24] Rs[39:32] Rs[47,40] Rs[55,48] Rs[63:56]
3450 void MacroAssembler::revb(Register Rd, Register Rs, Register tmp1, Register tmp2) {
3451 if (UseZbb) {
3452 rev8(Rd, Rs);
3453 return;
3454 }
3455 assert_different_registers(Rs, tmp1, tmp2);
3456 assert_different_registers(Rd, tmp1, tmp2);
3457 zext(tmp1, Rs, 8);
3458 slli(tmp1, tmp1, 8);
3459 for (int step = 8; step < 56; step += 8) {
3460 srli(tmp2, Rs, step);
3461 zext(tmp2, tmp2, 8);
3462 orr(tmp1, tmp1, tmp2);
3463 slli(tmp1, tmp1, 8);
3464 }
3465 srli(Rd, Rs, 56);
3466 orr(Rd, tmp1, Rd);
3467 }
3468
3469 // rotate right with shift bits
3470 void MacroAssembler::ror(Register dst, Register src, Register shift, Register tmp)
3471 {
3472 if (UseZbb) {
3473 rorr(dst, src, shift);
3474 return;
3475 }
3476
3477 assert_different_registers(dst, tmp);
3478 assert_different_registers(src, tmp);
3479
3480 mv(tmp, 64);
3481 sub(tmp, tmp, shift);
3482 sll(tmp, src, tmp);
3483 srl(dst, src, shift);
3484 orr(dst, dst, tmp);
3485 }
3486
3487 // rotate right with shift bits
3488 void MacroAssembler::ror(Register dst, Register src, uint32_t shift, Register tmp)
3489 {
3490 if (UseZbb) {
3491 rori(dst, src, shift);
3492 return;
3493 }
3494
3495 assert_different_registers(dst, tmp);
3496 assert_different_registers(src, tmp);
3497 assert(shift < 64, "shift amount must be < 64");
3498 slli(tmp, src, 64 - shift);
3499 srli(dst, src, shift);
3500 orr(dst, dst, tmp);
3501 }
3502
3503 // rotate left with shift bits, 32-bit version
3504 void MacroAssembler::rolw(Register dst, Register src, uint32_t shift, Register tmp) {
3505 if (UseZbb) {
3506 // no roliw available
3507 roriw(dst, src, 32 - shift);
3508 return;
3509 }
3510
3511 assert_different_registers(dst, tmp);
3512 assert_different_registers(src, tmp);
3513 assert(shift < 32, "shift amount must be < 32");
3514 srliw(tmp, src, 32 - shift);
3515 slliw(dst, src, shift);
3516 orr(dst, dst, tmp);
3517 }
3518
3519 void MacroAssembler::orptr(Address adr, RegisterOrConstant src, Register tmp1, Register tmp2) {
3520 ld(tmp1, adr);
3521 if (src.is_register()) {
3522 orr(tmp1, tmp1, src.as_register());
3523 } else {
3524 if (is_simm12(src.as_constant())) {
3525 ori(tmp1, tmp1, src.as_constant());
3526 } else {
3527 assert_different_registers(tmp1, tmp2);
3528 mv(tmp2, src.as_constant());
3529 orr(tmp1, tmp1, tmp2);
3530 }
3531 }
3532 sd(tmp1, adr);
3533 }
3534
3535 void MacroAssembler::cmp_klass_beq(Register obj, Register klass,
3536 Register tmp1, Register tmp2,
3537 Label &L, bool is_far) {
3538 assert_different_registers(obj, klass, tmp1, tmp2);
3539 if (UseCompactObjectHeaders) {
3540 load_narrow_klass_compact(tmp1, obj);
3541 } else {
3542 lwu(tmp1, Address(obj, oopDesc::klass_offset_in_bytes()));
3543 }
3544 decode_klass_not_null(tmp1, tmp2);
3545 beq(klass, tmp1, L, is_far);
3546 }
3547
3548 void MacroAssembler::cmp_klass_bne(Register obj, Register klass,
3549 Register tmp1, Register tmp2,
3550 Label &L, bool is_far) {
3551 assert_different_registers(obj, klass, tmp1, tmp2);
3552 if (UseCompactObjectHeaders) {
3553 load_narrow_klass_compact(tmp1, obj);
3554 } else {
3555 lwu(tmp1, Address(obj, oopDesc::klass_offset_in_bytes()));
3556 }
3557 decode_klass_not_null(tmp1, tmp2);
3558 bne(klass, tmp1, L, is_far);
3559 }
3560
3561 // Move an oop into a register.
3562 void MacroAssembler::movoop(Register dst, jobject obj) {
3563 int oop_index;
3564 if (obj == nullptr) {
3565 oop_index = oop_recorder()->allocate_oop_index(obj);
3566 } else {
3567 #ifdef ASSERT
3568 {
3569 ThreadInVMfromUnknown tiv;
3570 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
3571 }
3572 #endif
3573 oop_index = oop_recorder()->find_index(obj);
3574 }
3575 RelocationHolder rspec = oop_Relocation::spec(oop_index);
3576
3577 if (BarrierSet::barrier_set()->barrier_set_assembler()->supports_instruction_patching()) {
3578 movptr(dst, Address((address)obj, rspec));
3579 } else {
3580 address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3581 ld(dst, Address(dummy, rspec));
3582 }
3583 }
3584
3585 // Move a metadata address into a register.
3586 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3587 assert((uintptr_t)obj < (1ull << 48), "48-bit overflow in metadata");
3588 int oop_index;
3589 if (obj == nullptr) {
3590 oop_index = oop_recorder()->allocate_metadata_index(obj);
3591 } else {
3592 oop_index = oop_recorder()->find_index(obj);
3593 }
3594 RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3595 movptr(dst, Address((address)obj, rspec));
3596 }
3597
3598 // Writes to stack successive pages until offset reached to check for
3599 // stack overflow + shadow pages. This clobbers tmp.
3600 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
3601 assert_different_registers(tmp, size, t0);
3602 // Bang stack for total size given plus shadow page size.
3603 // Bang one page at a time because large size can bang beyond yellow and
3604 // red zones.
3605 mv(t0, (int)os::vm_page_size());
3606 Label loop;
3607 bind(loop);
3608 sub(tmp, sp, t0);
3609 subw(size, size, t0);
3610 sd(size, Address(tmp));
3611 bgtz(size, loop);
3612
3613 // Bang down shadow pages too.
3614 // At this point, (tmp-0) is the last address touched, so don't
3615 // touch it again. (It was touched as (tmp-pagesize) but then tmp
3616 // was post-decremented.) Skip this address by starting at i=1, and
3617 // touch a few more pages below. N.B. It is important to touch all
3618 // the way down to and including i=StackShadowPages.
3619 for (int i = 0; i < (int)(StackOverflow::stack_shadow_zone_size() / (int)os::vm_page_size()) - 1; i++) {
3620 // this could be any sized move but this is can be a debugging crumb
3621 // so the bigger the better.
3622 sub(tmp, tmp, (int)os::vm_page_size());
3623 sd(size, Address(tmp, 0));
3624 }
3625 }
3626
3627 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp1, Register tmp2) {
3628 const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3629 ld(dst, Address(xmethod, Method::const_offset()));
3630 ld(dst, Address(dst, ConstMethod::constants_offset()));
3631 ld(dst, Address(dst, ConstantPool::pool_holder_offset()));
3632 ld(dst, Address(dst, mirror_offset));
3633 resolve_oop_handle(dst, tmp1, tmp2);
3634 }
3635
3636 void MacroAssembler::resolve_oop_handle(Register result, Register tmp1, Register tmp2) {
3637 // OopHandle::resolve is an indirection.
3638 assert_different_registers(result, tmp1, tmp2);
3639 access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp1, tmp2);
3640 }
3641
3642 // ((WeakHandle)result).resolve()
3643 void MacroAssembler::resolve_weak_handle(Register result, Register tmp1, Register tmp2) {
3644 assert_different_registers(result, tmp1, tmp2);
3645 Label resolved;
3646
3647 // A null weak handle resolves to null.
3648 beqz(result, resolved);
3649
3650 // Only 64 bit platforms support GCs that require a tmp register
3651 // Only IN_HEAP loads require a thread_tmp register
3652 // WeakHandle::resolve is an indirection like jweak.
3653 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3654 result, Address(result), tmp1, tmp2);
3655 bind(resolved);
3656 }
3657
3658 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3659 Register dst, Address src,
3660 Register tmp1, Register tmp2) {
3661 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3662 decorators = AccessInternal::decorator_fixup(decorators, type);
3663 bool as_raw = (decorators & AS_RAW) != 0;
3664 if (as_raw) {
3665 bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, tmp2);
3666 } else {
3667 bs->load_at(this, decorators, type, dst, src, tmp1, tmp2);
3668 }
3669 }
3670
3671 void MacroAssembler::null_check(Register reg, int offset) {
3672 if (needs_explicit_null_check(offset)) {
3673 // provoke OS null exception if reg is null by
3674 // accessing M[reg] w/o changing any registers
3675 // NOTE: this is plenty to provoke a segv
3676 ld(zr, Address(reg, 0));
3677 } else {
3678 // nothing to do, (later) access of M[reg + offset]
3679 // will provoke OS null exception if reg is null
3680 }
3681 }
3682
3683 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3684 Address dst, Register val,
3685 Register tmp1, Register tmp2, Register tmp3) {
3686 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3687 decorators = AccessInternal::decorator_fixup(decorators, type);
3688 bool as_raw = (decorators & AS_RAW) != 0;
3689 if (as_raw) {
3690 bs->BarrierSetAssembler::store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3691 } else {
3692 bs->store_at(this, decorators, type, dst, val, tmp1, tmp2, tmp3);
3693 }
3694 }
3695
3696 // Algorithm must match CompressedOops::encode.
3697 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3698 verify_oop_msg(s, "broken oop in encode_heap_oop");
3699 if (CompressedOops::base() == nullptr) {
3700 if (CompressedOops::shift() != 0) {
3701 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3702 srli(d, s, LogMinObjAlignmentInBytes);
3703 } else {
3704 mv(d, s);
3705 }
3706 } else {
3707 Label notNull;
3708 sub(d, s, xheapbase);
3709 bgez(d, notNull);
3710 mv(d, zr);
3711 bind(notNull);
3712 if (CompressedOops::shift() != 0) {
3713 assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3714 srli(d, d, CompressedOops::shift());
3715 }
3716 }
3717 }
3718
3719 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3720 #ifdef ASSERT
3721 if (CheckCompressedOops) {
3722 Label ok;
3723 bnez(r, ok);
3724 stop("null oop passed to encode_heap_oop_not_null");
3725 bind(ok);
3726 }
3727 #endif
3728 verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
3729 if (CompressedOops::base() != nullptr) {
3730 sub(r, r, xheapbase);
3731 }
3732 if (CompressedOops::shift() != 0) {
3733 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3734 srli(r, r, LogMinObjAlignmentInBytes);
3735 }
3736 }
3737
3738 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3739 #ifdef ASSERT
3740 if (CheckCompressedOops) {
3741 Label ok;
3742 bnez(src, ok);
3743 stop("null oop passed to encode_heap_oop_not_null2");
3744 bind(ok);
3745 }
3746 #endif
3747 verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
3748
3749 Register data = src;
3750 if (CompressedOops::base() != nullptr) {
3751 sub(dst, src, xheapbase);
3752 data = dst;
3753 }
3754 if (CompressedOops::shift() != 0) {
3755 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3756 srli(dst, data, LogMinObjAlignmentInBytes);
3757 data = dst;
3758 }
3759 if (data == src) {
3760 mv(dst, src);
3761 }
3762 }
3763
3764 void MacroAssembler::load_narrow_klass_compact(Register dst, Register src) {
3765 assert(UseCompactObjectHeaders, "expects UseCompactObjectHeaders");
3766 ld(dst, Address(src, oopDesc::mark_offset_in_bytes()));
3767 srli(dst, dst, markWord::klass_shift);
3768 }
3769
3770 void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
3771 assert_different_registers(dst, tmp);
3772 assert_different_registers(src, tmp);
3773 if (UseCompactObjectHeaders) {
3774 load_narrow_klass_compact(dst, src);
3775 decode_klass_not_null(dst, tmp);
3776 } else {
3777 lwu(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3778 decode_klass_not_null(dst, tmp);
3779 }
3780 }
3781
3782 void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
3783 // FIXME: Should this be a store release? concurrent gcs assumes
3784 // klass length is valid if klass field is not null.
3785 assert(!UseCompactObjectHeaders, "not with compact headers");
3786 encode_klass_not_null(src, tmp);
3787 sw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3788
3789 }
3790
3791 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3792 assert(!UseCompactObjectHeaders, "not with compact headers");
3793 // Store to klass gap in destination
3794 sw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3795 }
3796
3797 void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
3798 assert_different_registers(r, tmp);
3799 decode_klass_not_null(r, r, tmp);
3800 }
3801
3802 void MacroAssembler::decode_klass_not_null(Register dst, Register src, Register tmp) {
3803 assert_different_registers(dst, tmp);
3804 assert_different_registers(src, tmp);
3805
3806 if (CompressedKlassPointers::base() == nullptr) {
3807 if (CompressedKlassPointers::shift() != 0) {
3808 slli(dst, src, CompressedKlassPointers::shift());
3809 } else {
3810 mv(dst, src);
3811 }
3812 return;
3813 }
3814
3815 Register xbase = tmp;
3816
3817 mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3818
3819 if (CompressedKlassPointers::shift() != 0) {
3820 // dst = (src << shift) + xbase
3821 shadd(dst, src, xbase, dst /* temporary, dst != xbase */, CompressedKlassPointers::shift());
3822 } else {
3823 add(dst, xbase, src);
3824 }
3825 }
3826
3827 void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
3828 assert_different_registers(r, tmp);
3829 encode_klass_not_null(r, r, tmp);
3830 }
3831
3832 void MacroAssembler::encode_klass_not_null(Register dst, Register src, Register tmp) {
3833 if (CompressedKlassPointers::base() == nullptr) {
3834 if (CompressedKlassPointers::shift() != 0) {
3835 srli(dst, src, CompressedKlassPointers::shift());
3836 } else {
3837 mv(dst, src);
3838 }
3839 return;
3840 }
3841
3842 if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0 &&
3843 CompressedKlassPointers::shift() == 0) {
3844 zext(dst, src, 32);
3845 return;
3846 }
3847
3848 Register xbase = dst;
3849 if (dst == src) {
3850 xbase = tmp;
3851 }
3852
3853 assert_different_registers(src, xbase);
3854 mv(xbase, (uintptr_t)CompressedKlassPointers::base());
3855 sub(dst, src, xbase);
3856 if (CompressedKlassPointers::shift() != 0) {
3857 srli(dst, dst, CompressedKlassPointers::shift());
3858 }
3859 }
3860
3861 void MacroAssembler::decode_heap_oop_not_null(Register r) {
3862 decode_heap_oop_not_null(r, r);
3863 }
3864
3865 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3866 assert(UseCompressedOops, "should only be used for compressed headers");
3867 assert(Universe::heap() != nullptr, "java heap should be initialized");
3868 // Cannot assert, unverified entry point counts instructions (see .ad file)
3869 // vtableStubs also counts instructions in pd_code_size_limit.
3870 // Also do not verify_oop as this is called by verify_oop.
3871 if (CompressedOops::shift() != 0) {
3872 assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3873 slli(dst, src, LogMinObjAlignmentInBytes);
3874 if (CompressedOops::base() != nullptr) {
3875 add(dst, xheapbase, dst);
3876 }
3877 } else {
3878 assert(CompressedOops::base() == nullptr, "sanity");
3879 mv(dst, src);
3880 }
3881 }
3882
3883 void MacroAssembler::decode_heap_oop(Register d, Register s) {
3884 if (CompressedOops::base() == nullptr) {
3885 if (CompressedOops::shift() != 0 || d != s) {
3886 slli(d, s, CompressedOops::shift());
3887 }
3888 } else {
3889 Label done;
3890 mv(d, s);
3891 beqz(s, done);
3892 shadd(d, s, xheapbase, d, LogMinObjAlignmentInBytes);
3893 bind(done);
3894 }
3895 verify_oop_msg(d, "broken oop in decode_heap_oop");
3896 }
3897
3898 void MacroAssembler::store_heap_oop(Address dst, Register val, Register tmp1,
3899 Register tmp2, Register tmp3, DecoratorSet decorators) {
3900 access_store_at(T_OBJECT, IN_HEAP | decorators, dst, val, tmp1, tmp2, tmp3);
3901 }
3902
3903 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
3904 Register tmp2, DecoratorSet decorators) {
3905 access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
3906 }
3907
3908 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
3909 Register tmp2, DecoratorSet decorators) {
3910 access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL, dst, src, tmp1, tmp2);
3911 }
3912
3913 // Used for storing nulls.
3914 void MacroAssembler::store_heap_oop_null(Address dst) {
3915 access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
3916 }
3917
3918 // Look up the method for a megamorphic invokeinterface call.
3919 // The target method is determined by <intf_klass, itable_index>.
3920 // The receiver klass is in recv_klass.
3921 // On success, the result will be in method_result, and execution falls through.
3922 // On failure, execution transfers to the given label.
3923 void MacroAssembler::lookup_interface_method(Register recv_klass,
3924 Register intf_klass,
3925 RegisterOrConstant itable_index,
3926 Register method_result,
3927 Register scan_tmp,
3928 Label& L_no_such_interface,
3929 bool return_method) {
3930 assert_different_registers(recv_klass, intf_klass, scan_tmp);
3931 assert_different_registers(method_result, intf_klass, scan_tmp);
3932 assert(recv_klass != method_result || !return_method,
3933 "recv_klass can be destroyed when method isn't needed");
3934 assert(itable_index.is_constant() || itable_index.as_register() == method_result,
3935 "caller must use same register for non-constant itable index as for method");
3936
3937 // Compute start of first itableOffsetEntry (which is at the end of the vtable).
3938 int vtable_base = in_bytes(Klass::vtable_start_offset());
3939 int itentry_off = in_bytes(itableMethodEntry::method_offset());
3940 int scan_step = itableOffsetEntry::size() * wordSize;
3941 int vte_size = vtableEntry::size_in_bytes();
3942 assert(vte_size == wordSize, "else adjust times_vte_scale");
3943
3944 lwu(scan_tmp, Address(recv_klass, Klass::vtable_length_offset()));
3945
3946 // Could store the aligned, prescaled offset in the klass.
3947 shadd(scan_tmp, scan_tmp, recv_klass, scan_tmp, 3);
3948 add(scan_tmp, scan_tmp, vtable_base);
3949
3950 if (return_method) {
3951 // Adjust recv_klass by scaled itable_index, so we can free itable_index.
3952 assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
3953 if (itable_index.is_register()) {
3954 slli(t0, itable_index.as_register(), 3);
3955 } else {
3956 mv(t0, itable_index.as_constant() << 3);
3957 }
3958 add(recv_klass, recv_klass, t0);
3959 if (itentry_off) {
3960 add(recv_klass, recv_klass, itentry_off);
3961 }
3962 }
3963
3964 Label search, found_method;
3965
3966 ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3967 beq(intf_klass, method_result, found_method);
3968 bind(search);
3969 // Check that the previous entry is non-null. A null entry means that
3970 // the receiver class doesn't implement the interface, and wasn't the
3971 // same as when the caller was compiled.
3972 beqz(method_result, L_no_such_interface, /* is_far */ true);
3973 addi(scan_tmp, scan_tmp, scan_step);
3974 ld(method_result, Address(scan_tmp, itableOffsetEntry::interface_offset()));
3975 bne(intf_klass, method_result, search);
3976
3977 bind(found_method);
3978
3979 // Got a hit.
3980 if (return_method) {
3981 lwu(scan_tmp, Address(scan_tmp, itableOffsetEntry::offset_offset()));
3982 add(method_result, recv_klass, scan_tmp);
3983 ld(method_result, Address(method_result));
3984 }
3985 }
3986
3987 // Look up the method for a megamorphic invokeinterface call in a single pass over itable:
3988 // - check recv_klass (actual object class) is a subtype of resolved_klass from CompiledICData
3989 // - find a holder_klass (class that implements the method) vtable offset and get the method from vtable by index
3990 // The target method is determined by <holder_klass, itable_index>.
3991 // The receiver klass is in recv_klass.
3992 // On success, the result will be in method_result, and execution falls through.
3993 // On failure, execution transfers to the given label.
3994 void MacroAssembler::lookup_interface_method_stub(Register recv_klass,
3995 Register holder_klass,
3996 Register resolved_klass,
3997 Register method_result,
3998 Register temp_itbl_klass,
3999 Register scan_temp,
4000 int itable_index,
4001 Label& L_no_such_interface) {
4002 // 'method_result' is only used as output register at the very end of this method.
4003 // Until then we can reuse it as 'holder_offset'.
4004 Register holder_offset = method_result;
4005 assert_different_registers(resolved_klass, recv_klass, holder_klass, temp_itbl_klass, scan_temp, holder_offset);
4006
4007 int vtable_start_offset_bytes = in_bytes(Klass::vtable_start_offset());
4008 int scan_step = itableOffsetEntry::size() * wordSize;
4009 int ioffset_bytes = in_bytes(itableOffsetEntry::interface_offset());
4010 int ooffset_bytes = in_bytes(itableOffsetEntry::offset_offset());
4011 int itmentry_off_bytes = in_bytes(itableMethodEntry::method_offset());
4012 const int vte_scale = exact_log2(vtableEntry::size_in_bytes());
4013
4014 Label L_loop_search_resolved_entry, L_resolved_found, L_holder_found;
4015
4016 lwu(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
4017 add(recv_klass, recv_klass, vtable_start_offset_bytes + ioffset_bytes);
4018 // itableOffsetEntry[] itable = recv_klass + Klass::vtable_start_offset()
4019 // + sizeof(vtableEntry) * (recv_klass->_vtable_len);
4020 // scan_temp = &(itable[0]._interface)
4021 // temp_itbl_klass = itable[0]._interface;
4022 shadd(scan_temp, scan_temp, recv_klass, scan_temp, vte_scale);
4023 ld(temp_itbl_klass, Address(scan_temp));
4024 mv(holder_offset, zr);
4025
4026 // Initial checks:
4027 // - if (holder_klass != resolved_klass), go to "scan for resolved"
4028 // - if (itable[0] == holder_klass), shortcut to "holder found"
4029 // - if (itable[0] == 0), no such interface
4030 bne(resolved_klass, holder_klass, L_loop_search_resolved_entry);
4031 beq(holder_klass, temp_itbl_klass, L_holder_found);
4032 beqz(temp_itbl_klass, L_no_such_interface);
4033
4034 // Loop: Look for holder_klass record in itable
4035 // do {
4036 // temp_itbl_klass = *(scan_temp += scan_step);
4037 // if (temp_itbl_klass == holder_klass) {
4038 // goto L_holder_found; // Found!
4039 // }
4040 // } while (temp_itbl_klass != 0);
4041 // goto L_no_such_interface // Not found.
4042 Label L_search_holder;
4043 bind(L_search_holder);
4044 add(scan_temp, scan_temp, scan_step);
4045 ld(temp_itbl_klass, Address(scan_temp));
4046 beq(holder_klass, temp_itbl_klass, L_holder_found);
4047 bnez(temp_itbl_klass, L_search_holder);
4048
4049 j(L_no_such_interface);
4050
4051 // Loop: Look for resolved_class record in itable
4052 // while (true) {
4053 // temp_itbl_klass = *(scan_temp += scan_step);
4054 // if (temp_itbl_klass == 0) {
4055 // goto L_no_such_interface;
4056 // }
4057 // if (temp_itbl_klass == resolved_klass) {
4058 // goto L_resolved_found; // Found!
4059 // }
4060 // if (temp_itbl_klass == holder_klass) {
4061 // holder_offset = scan_temp;
4062 // }
4063 // }
4064 //
4065 Label L_loop_search_resolved;
4066 bind(L_loop_search_resolved);
4067 add(scan_temp, scan_temp, scan_step);
4068 ld(temp_itbl_klass, Address(scan_temp));
4069 bind(L_loop_search_resolved_entry);
4070 beqz(temp_itbl_klass, L_no_such_interface);
4071 beq(resolved_klass, temp_itbl_klass, L_resolved_found);
4072 bne(holder_klass, temp_itbl_klass, L_loop_search_resolved);
4073 mv(holder_offset, scan_temp);
4074 j(L_loop_search_resolved);
4075
4076 // See if we already have a holder klass. If not, go and scan for it.
4077 bind(L_resolved_found);
4078 beqz(holder_offset, L_search_holder);
4079 mv(scan_temp, holder_offset);
4080
4081 // Finally, scan_temp contains holder_klass vtable offset
4082 bind(L_holder_found);
4083 lwu(method_result, Address(scan_temp, ooffset_bytes - ioffset_bytes));
4084 add(recv_klass, recv_klass, itable_index * wordSize + itmentry_off_bytes
4085 - vtable_start_offset_bytes - ioffset_bytes); // substract offsets to restore the original value of recv_klass
4086 add(method_result, recv_klass, method_result);
4087 ld(method_result, Address(method_result));
4088 }
4089
4090 // virtual method calling
4091 void MacroAssembler::lookup_virtual_method(Register recv_klass,
4092 RegisterOrConstant vtable_index,
4093 Register method_result) {
4094 const ByteSize base = Klass::vtable_start_offset();
4095 assert(vtableEntry::size() * wordSize == 8,
4096 "adjust the scaling in the code below");
4097 int vtable_offset_in_bytes = in_bytes(base + vtableEntry::method_offset());
4098
4099 if (vtable_index.is_register()) {
4100 shadd(method_result, vtable_index.as_register(), recv_klass, method_result, LogBytesPerWord);
4101 ld(method_result, Address(method_result, vtable_offset_in_bytes));
4102 } else {
4103 vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
4104 ld(method_result, form_address(method_result, recv_klass, vtable_offset_in_bytes));
4105 }
4106 }
4107
4108 void MacroAssembler::membar(uint32_t order_constraint) {
4109 if (UseZtso && ((order_constraint & StoreLoad) != StoreLoad)) {
4110 // TSO allows for stores to be reordered after loads. When the compiler
4111 // generates a fence to disallow that, we are required to generate the
4112 // fence for correctness.
4113 BLOCK_COMMENT("elided tso membar");
4114 return;
4115 }
4116
4117 address prev = pc() - MacroAssembler::instruction_size;
4118 address last = code()->last_merge_candidate();
4119
4120 if (last != nullptr && is_membar(last) && prev == last) {
4121 // We are merging two memory barrier instructions. On RISCV we
4122 // can do this simply by ORing them together.
4123 set_membar_kind(prev, get_membar_kind(prev) | order_constraint);
4124 BLOCK_COMMENT("merged membar");
4125 return;
4126 }
4127
4128 code()->set_last_merge_candidate(pc());
4129 uint32_t predecessor = 0;
4130 uint32_t successor = 0;
4131 membar_mask_to_pred_succ(order_constraint, predecessor, successor);
4132 fence(predecessor, successor);
4133 }
4134
4135 void MacroAssembler::cmodx_fence() {
4136 BLOCK_COMMENT("cmodx fence");
4137 if (VM_Version::supports_fencei_barrier()) {
4138 Assembler::fencei();
4139 }
4140 }
4141
4142 // Form an address from base + offset in Rd. Rd my or may not
4143 // actually be used: you must use the Address that is returned. It
4144 // is up to you to ensure that the shift provided matches the size
4145 // of your data.
4146 Address MacroAssembler::form_address(Register Rd, Register base, int64_t byte_offset) {
4147 if (is_simm12(byte_offset)) { // 12: imm in range 2^12
4148 return Address(base, byte_offset);
4149 }
4150
4151 assert_different_registers(Rd, base, noreg);
4152
4153 // Do it the hard way
4154 mv(Rd, byte_offset);
4155 add(Rd, base, Rd);
4156 return Address(Rd);
4157 }
4158
4159 void MacroAssembler::check_klass_subtype(Register sub_klass,
4160 Register super_klass,
4161 Register tmp_reg,
4162 Label& L_success) {
4163 Label L_failure;
4164 check_klass_subtype_fast_path(sub_klass, super_klass, tmp_reg, &L_success, &L_failure, nullptr);
4165 check_klass_subtype_slow_path(sub_klass, super_klass, tmp_reg, noreg, &L_success, nullptr);
4166 bind(L_failure);
4167 }
4168
4169 void MacroAssembler::safepoint_poll(Label& slow_path, bool at_return, bool in_nmethod, Register tmp_reg) {
4170 ld(tmp_reg, Address(xthread, JavaThread::polling_word_offset()));
4171 if (at_return) {
4172 bgtu(in_nmethod ? sp : fp, tmp_reg, slow_path, /* is_far */ true);
4173 } else {
4174 test_bit(tmp_reg, tmp_reg, exact_log2(SafepointMechanism::poll_bit()));
4175 bnez(tmp_reg, slow_path, /* is_far */ true);
4176 }
4177 }
4178
4179 void MacroAssembler::load_reserved(Register dst,
4180 Register addr,
4181 Assembler::operand_size size,
4182 Assembler::Aqrl acquire) {
4183 switch (size) {
4184 case int64:
4185 lr_d(dst, addr, acquire);
4186 break;
4187 case int32:
4188 lr_w(dst, addr, acquire);
4189 break;
4190 case uint32:
4191 lr_w(dst, addr, acquire);
4192 zext(dst, dst, 32);
4193 break;
4194 default:
4195 ShouldNotReachHere();
4196 }
4197 }
4198
4199 void MacroAssembler::store_conditional(Register dst,
4200 Register new_val,
4201 Register addr,
4202 Assembler::operand_size size,
4203 Assembler::Aqrl release) {
4204 switch (size) {
4205 case int64:
4206 sc_d(dst, addr, new_val, release);
4207 break;
4208 case int32:
4209 case uint32:
4210 sc_w(dst, addr, new_val, release);
4211 break;
4212 default:
4213 ShouldNotReachHere();
4214 }
4215 }
4216
4217
4218 void MacroAssembler::cmpxchg_narrow_value_helper(Register addr, Register expected, Register new_val,
4219 Assembler::operand_size size,
4220 Register shift, Register mask, Register aligned_addr) {
4221 assert(size == int8 || size == int16, "unsupported operand size");
4222
4223 andi(shift, addr, 3);
4224 slli(shift, shift, 3);
4225
4226 andi(aligned_addr, addr, ~3);
4227
4228 if (size == int8) {
4229 mv(mask, 0xff);
4230 } else {
4231 // size == int16 case
4232 mv(mask, -1);
4233 zext(mask, mask, 16);
4234 }
4235 sll(mask, mask, shift);
4236
4237 sll(expected, expected, shift);
4238 andr(expected, expected, mask);
4239
4240 sll(new_val, new_val, shift);
4241 andr(new_val, new_val, mask);
4242 }
4243
4244 // cmpxchg_narrow_value will kill t0, t1, expected, new_val and tmps.
4245 // It's designed to implement compare and swap byte/boolean/char/short by lr.w/sc.w or amocas.w,
4246 // which are forced to work with 4-byte aligned address.
4247 void MacroAssembler::cmpxchg_narrow_value(Register addr, Register expected,
4248 Register new_val,
4249 Assembler::operand_size size,
4250 Assembler::Aqrl acquire, Assembler::Aqrl release,
4251 Register result, bool result_as_bool,
4252 Register tmp1, Register tmp2, Register tmp3) {
4253 assert(!(UseZacas && UseZabha), "Use amocas");
4254 assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
4255
4256 Register scratch0 = t0, aligned_addr = t1;
4257 Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
4258
4259 cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
4260
4261 Label retry, fail, done;
4262
4263 if (UseZacas) {
4264 lw(result, aligned_addr);
4265
4266 bind(retry); // amocas loads the current value into result
4267 notr(scratch1, mask);
4268
4269 andr(scratch0, result, scratch1); // scratch0 = word - cas bits
4270 orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
4271 bne(result, scratch1, fail); // cas bits differ, cas failed
4272
4273 // result is the same as expected, use as expected value.
4274
4275 // scratch0 is still = word - cas bits
4276 // Or in the new value to create complete new value.
4277 orr(scratch0, scratch0, new_val);
4278
4279 mv(scratch1, result); // save our expected value
4280 atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
4281 bne(scratch1, result, retry);
4282 } else {
4283 notr(scratch1, mask);
4284 bind(retry);
4285
4286 load_reserved(result, aligned_addr, operand_size::int32, acquire);
4287 andr(scratch0, result, mask);
4288 bne(scratch0, expected, fail);
4289
4290 andr(scratch0, result, scratch1); // scratch1 is ~mask
4291 orr(scratch0, scratch0, new_val);
4292 store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
4293 bnez(scratch0, retry);
4294 }
4295
4296 if (result_as_bool) {
4297 mv(result, 1);
4298 j(done);
4299
4300 bind(fail);
4301 mv(result, zr);
4302
4303 bind(done);
4304 } else {
4305 bind(fail);
4306
4307 andr(scratch0, result, mask);
4308 srl(result, scratch0, shift);
4309
4310 if (size == int8) {
4311 sext(result, result, 8);
4312 } else {
4313 // size == int16 case
4314 sext(result, result, 16);
4315 }
4316 }
4317 }
4318
4319 // weak_cmpxchg_narrow_value is a weak version of cmpxchg_narrow_value, to implement
4320 // the weak CAS stuff. The major difference is that it just failed when store conditional
4321 // failed.
4322 void MacroAssembler::weak_cmpxchg_narrow_value(Register addr, Register expected,
4323 Register new_val,
4324 Assembler::operand_size size,
4325 Assembler::Aqrl acquire, Assembler::Aqrl release,
4326 Register result,
4327 Register tmp1, Register tmp2, Register tmp3) {
4328 assert(!(UseZacas && UseZabha), "Use amocas");
4329 assert_different_registers(addr, expected, new_val, result, tmp1, tmp2, tmp3, t0, t1);
4330
4331 Register scratch0 = t0, aligned_addr = t1;
4332 Register shift = tmp1, mask = tmp2, scratch1 = tmp3;
4333
4334 cmpxchg_narrow_value_helper(addr, expected, new_val, size, shift, mask, aligned_addr);
4335
4336 Label fail, done;
4337
4338 if (UseZacas) {
4339 lw(result, aligned_addr);
4340
4341 notr(scratch1, mask);
4342
4343 andr(scratch0, result, scratch1); // scratch0 = word - cas bits
4344 orr(scratch1, expected, scratch0); // scratch1 = non-cas bits + cas bits
4345 bne(result, scratch1, fail); // cas bits differ, cas failed
4346
4347 // result is the same as expected, use as expected value.
4348
4349 // scratch0 is still = word - cas bits
4350 // Or in the new value to create complete new value.
4351 orr(scratch0, scratch0, new_val);
4352
4353 mv(scratch1, result); // save our expected value
4354 atomic_cas(result, scratch0, aligned_addr, operand_size::int32, acquire, release);
4355 bne(scratch1, result, fail); // This weak, so just bail-out.
4356 } else {
4357 notr(scratch1, mask);
4358
4359 load_reserved(result, aligned_addr, operand_size::int32, acquire);
4360 andr(scratch0, result, mask);
4361 bne(scratch0, expected, fail);
4362
4363 andr(scratch0, result, scratch1); // scratch1 is ~mask
4364 orr(scratch0, scratch0, new_val);
4365 store_conditional(scratch0, scratch0, aligned_addr, operand_size::int32, release);
4366 bnez(scratch0, fail);
4367 }
4368
4369 // Success
4370 mv(result, 1);
4371 j(done);
4372
4373 // Fail
4374 bind(fail);
4375 mv(result, zr);
4376
4377 bind(done);
4378 }
4379
4380 void MacroAssembler::cmpxchg(Register addr, Register expected,
4381 Register new_val,
4382 Assembler::operand_size size,
4383 Assembler::Aqrl acquire, Assembler::Aqrl release,
4384 Register result, bool result_as_bool) {
4385 assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4386 assert_different_registers(addr, t0);
4387 assert_different_registers(expected, t0);
4388 assert_different_registers(new_val, t0);
4389
4390 // NOTE:
4391 // Register _result_ may be the same register as _new_val_ or _expected_.
4392 // Hence do NOT use _result_ until after 'cas'.
4393 //
4394 // Register _expected_ may be the same register as _new_val_ and is assumed to be preserved.
4395 // Hence do NOT change _expected_ or _new_val_.
4396 //
4397 // Having _expected_ and _new_val_ being the same register is a very puzzling cas.
4398 //
4399 // TODO: Address these issues.
4400
4401 if (UseZacas) {
4402 if (result_as_bool) {
4403 mv(t0, expected);
4404 atomic_cas(t0, new_val, addr, size, acquire, release);
4405 xorr(t0, t0, expected);
4406 seqz(result, t0);
4407 } else {
4408 mv(t0, expected);
4409 atomic_cas(t0, new_val, addr, size, acquire, release);
4410 mv(result, t0);
4411 }
4412 return;
4413 }
4414
4415 Label retry_load, done, ne_done;
4416 bind(retry_load);
4417 load_reserved(t0, addr, size, acquire);
4418 bne(t0, expected, ne_done);
4419 store_conditional(t0, new_val, addr, size, release);
4420 bnez(t0, retry_load);
4421
4422 // equal, succeed
4423 if (result_as_bool) {
4424 mv(result, 1);
4425 } else {
4426 mv(result, expected);
4427 }
4428 j(done);
4429
4430 // not equal, failed
4431 bind(ne_done);
4432 if (result_as_bool) {
4433 mv(result, zr);
4434 } else {
4435 mv(result, t0);
4436 }
4437
4438 bind(done);
4439 }
4440
4441 void MacroAssembler::weak_cmpxchg(Register addr, Register expected,
4442 Register new_val,
4443 Assembler::operand_size size,
4444 Assembler::Aqrl acquire, Assembler::Aqrl release,
4445 Register result) {
4446 assert((UseZacas && UseZabha) || (size != int8 && size != int16), "unsupported operand size");
4447 assert_different_registers(addr, t0);
4448 assert_different_registers(expected, t0);
4449 assert_different_registers(new_val, t0);
4450
4451 if (UseZacas) {
4452 cmpxchg(addr, expected, new_val, size, acquire, release, result, true);
4453 return;
4454 }
4455
4456 Label fail, done;
4457 load_reserved(t0, addr, size, acquire);
4458 bne(t0, expected, fail);
4459 store_conditional(t0, new_val, addr, size, release);
4460 bnez(t0, fail);
4461
4462 // Success
4463 mv(result, 1);
4464 j(done);
4465
4466 // Fail
4467 bind(fail);
4468 mv(result, zr);
4469
4470 bind(done);
4471 }
4472
4473 #define ATOMIC_OP(NAME, AOP, ACQUIRE, RELEASE) \
4474 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
4475 prev = prev->is_valid() ? prev : zr; \
4476 if (incr.is_register()) { \
4477 AOP(prev, addr, incr.as_register(), (Assembler::Aqrl)(ACQUIRE | RELEASE)); \
4478 } else { \
4479 mv(t0, incr.as_constant()); \
4480 AOP(prev, addr, t0, (Assembler::Aqrl)(ACQUIRE | RELEASE)); \
4481 } \
4482 return; \
4483 }
4484
4485 ATOMIC_OP(add, amoadd_d, Assembler::relaxed, Assembler::relaxed)
4486 ATOMIC_OP(addw, amoadd_w, Assembler::relaxed, Assembler::relaxed)
4487 ATOMIC_OP(addal, amoadd_d, Assembler::aq, Assembler::rl)
4488 ATOMIC_OP(addalw, amoadd_w, Assembler::aq, Assembler::rl)
4489
4490 #undef ATOMIC_OP
4491
4492 #define ATOMIC_XCHG(OP, AOP, ACQUIRE, RELEASE) \
4493 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
4494 prev = prev->is_valid() ? prev : zr; \
4495 AOP(prev, addr, newv, (Assembler::Aqrl)(ACQUIRE | RELEASE)); \
4496 return; \
4497 }
4498
4499 ATOMIC_XCHG(xchg, amoswap_d, Assembler::relaxed, Assembler::relaxed)
4500 ATOMIC_XCHG(xchgw, amoswap_w, Assembler::relaxed, Assembler::relaxed)
4501 ATOMIC_XCHG(xchgal, amoswap_d, Assembler::aq, Assembler::rl)
4502 ATOMIC_XCHG(xchgalw, amoswap_w, Assembler::aq, Assembler::rl)
4503
4504 #undef ATOMIC_XCHG
4505
4506 #define ATOMIC_XCHGU(OP1, OP2) \
4507 void MacroAssembler::atomic_##OP1(Register prev, Register newv, Register addr) { \
4508 atomic_##OP2(prev, newv, addr); \
4509 zext(prev, prev, 32); \
4510 return; \
4511 }
4512
4513 ATOMIC_XCHGU(xchgwu, xchgw)
4514 ATOMIC_XCHGU(xchgalwu, xchgalw)
4515
4516 #undef ATOMIC_XCHGU
4517
4518 void MacroAssembler::atomic_cas(Register prev, Register newv, Register addr,
4519 Assembler::operand_size size, Assembler::Aqrl acquire, Assembler::Aqrl release) {
4520 switch (size) {
4521 case int64:
4522 amocas_d(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4523 break;
4524 case int32:
4525 amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4526 break;
4527 case uint32:
4528 amocas_w(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4529 zext(prev, prev, 32);
4530 break;
4531 case int16:
4532 amocas_h(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4533 break;
4534 case int8:
4535 amocas_b(prev, addr, newv, (Assembler::Aqrl)(acquire | release));
4536 break;
4537 default:
4538 ShouldNotReachHere();
4539 }
4540 }
4541
4542 void MacroAssembler::far_jump(const Address &entry, Register tmp) {
4543 assert(CodeCache::contains(entry.target()),
4544 "destination of far jump not found in code cache");
4545 assert(entry.rspec().type() == relocInfo::external_word_type
4546 || entry.rspec().type() == relocInfo::runtime_call_type
4547 || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4548 // Fixed length: see MacroAssembler::far_branch_size()
4549 // We can use auipc + jr here because we know that the total size of
4550 // the code cache cannot exceed 2Gb.
4551 relocate(entry.rspec(), [&] {
4552 int64_t distance = entry.target() - pc();
4553 int32_t offset = ((int32_t)distance << 20) >> 20;
4554 assert(is_valid_32bit_offset(distance), "Far jump using wrong instructions.");
4555 auipc(tmp, (int32_t)distance + 0x800);
4556 jr(tmp, offset);
4557 });
4558 }
4559
4560 void MacroAssembler::far_call(const Address &entry, Register tmp) {
4561 assert(tmp != x5, "tmp register must not be x5.");
4562 assert(CodeCache::contains(entry.target()),
4563 "destination of far call not found in code cache");
4564 assert(entry.rspec().type() == relocInfo::external_word_type
4565 || entry.rspec().type() == relocInfo::runtime_call_type
4566 || entry.rspec().type() == relocInfo::none, "wrong entry relocInfo type");
4567 // Fixed length: see MacroAssembler::far_branch_size()
4568 // We can use auipc + jalr here because we know that the total size of
4569 // the code cache cannot exceed 2Gb.
4570 relocate(entry.rspec(), [&] {
4571 int64_t distance = entry.target() - pc();
4572 int32_t offset = ((int32_t)distance << 20) >> 20;
4573 assert(is_valid_32bit_offset(distance), "Far call using wrong instructions.");
4574 auipc(tmp, (int32_t)distance + 0x800);
4575 jalr(tmp, offset);
4576 });
4577 }
4578
4579 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
4580 Register super_klass,
4581 Register tmp_reg,
4582 Label* L_success,
4583 Label* L_failure,
4584 Label* L_slow_path,
4585 Register super_check_offset) {
4586 assert_different_registers(sub_klass, super_klass, tmp_reg, super_check_offset);
4587 bool must_load_sco = !super_check_offset->is_valid();
4588 if (must_load_sco) {
4589 assert(tmp_reg != noreg, "supply either a temp or a register offset");
4590 }
4591
4592 Label L_fallthrough;
4593 int label_nulls = 0;
4594 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4595 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4596 if (L_slow_path == nullptr) { L_slow_path = &L_fallthrough; label_nulls++; }
4597 assert(label_nulls <= 1, "at most one null in batch");
4598
4599 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4600 int sco_offset = in_bytes(Klass::super_check_offset_offset());
4601 Address super_check_offset_addr(super_klass, sco_offset);
4602
4603 // Hacked jmp, which may only be used just before L_fallthrough.
4604 #define final_jmp(label) \
4605 if (&(label) == &L_fallthrough) { /*do nothing*/ } \
4606 else j(label) /*omit semi*/
4607
4608 // If the pointers are equal, we are done (e.g., String[] elements).
4609 // This self-check enables sharing of secondary supertype arrays among
4610 // non-primary types such as array-of-interface. Otherwise, each such
4611 // type would need its own customized SSA.
4612 // We move this check to the front of the fast path because many
4613 // type checks are in fact trivially successful in this manner,
4614 // so we get a nicely predicted branch right at the start of the check.
4615 beq(sub_klass, super_klass, *L_success);
4616
4617 // Check the supertype display:
4618 if (must_load_sco) {
4619 lwu(tmp_reg, super_check_offset_addr);
4620 super_check_offset = tmp_reg;
4621 }
4622 add(t0, sub_klass, super_check_offset);
4623 Address super_check_addr(t0);
4624 ld(t0, super_check_addr); // load displayed supertype
4625 beq(super_klass, t0, *L_success);
4626
4627 // This check has worked decisively for primary supers.
4628 // Secondary supers are sought in the super_cache ('super_cache_addr').
4629 // (Secondary supers are interfaces and very deeply nested subtypes.)
4630 // This works in the same check above because of a tricky aliasing
4631 // between the super_Cache and the primary super display elements.
4632 // (The 'super_check_addr' can address either, as the case requires.)
4633 // Note that the cache is updated below if it does not help us find
4634 // what we need immediately.
4635 // So if it was a primary super, we can just fail immediately.
4636 // Otherwise, it's the slow path for us (no success at this point).
4637
4638 mv(t1, sc_offset);
4639 if (L_failure == &L_fallthrough) {
4640 beq(super_check_offset, t1, *L_slow_path);
4641 } else {
4642 bne(super_check_offset, t1, *L_failure, /* is_far */ true);
4643 final_jmp(*L_slow_path);
4644 }
4645
4646 bind(L_fallthrough);
4647
4648 #undef final_jmp
4649 }
4650
4651 // Scans count pointer sized words at [addr] for occurrence of value,
4652 // generic
4653 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
4654 Register tmp) {
4655 Label Lloop, Lexit;
4656 beqz(count, Lexit);
4657 bind(Lloop);
4658 ld(tmp, addr);
4659 beq(value, tmp, Lexit);
4660 addi(addr, addr, wordSize);
4661 subi(count, count, 1);
4662 bnez(count, Lloop);
4663 bind(Lexit);
4664 }
4665
4666 void MacroAssembler::check_klass_subtype_slow_path_linear(Register sub_klass,
4667 Register super_klass,
4668 Register tmp1_reg,
4669 Register tmp2_reg,
4670 Label* L_success,
4671 Label* L_failure,
4672 bool set_cond_codes) {
4673 assert_different_registers(sub_klass, super_klass, tmp1_reg);
4674 if (tmp2_reg != noreg) {
4675 assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg, t0);
4676 }
4677 #define IS_A_TEMP(reg) ((reg) == tmp1_reg || (reg) == tmp2_reg)
4678
4679 Label L_fallthrough;
4680 int label_nulls = 0;
4681 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4682 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4683
4684 assert(label_nulls <= 1, "at most one null in the batch");
4685
4686 // A couple of useful fields in sub_klass:
4687 int ss_offset = in_bytes(Klass::secondary_supers_offset());
4688 int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
4689 Address secondary_supers_addr(sub_klass, ss_offset);
4690 Address super_cache_addr( sub_klass, sc_offset);
4691
4692 BLOCK_COMMENT("check_klass_subtype_slow_path");
4693
4694 // Do a linear scan of the secondary super-klass chain.
4695 // This code is rarely used, so simplicity is a virtue here.
4696 // The repne_scan instruction uses fixed registers, which we must spill.
4697 // Don't worry too much about pre-existing connections with the input regs.
4698
4699 assert(sub_klass != x10, "killed reg"); // killed by mv(x10, super)
4700 assert(sub_klass != x12, "killed reg"); // killed by la(x12, &pst_counter)
4701
4702 RegSet pushed_registers;
4703 if (!IS_A_TEMP(x12)) {
4704 pushed_registers += x12;
4705 }
4706 if (!IS_A_TEMP(x15)) {
4707 pushed_registers += x15;
4708 }
4709
4710 if (super_klass != x10) {
4711 if (!IS_A_TEMP(x10)) {
4712 pushed_registers += x10;
4713 }
4714 }
4715
4716 push_reg(pushed_registers, sp);
4717
4718 // Get super_klass value into x10 (even if it was in x15 or x12)
4719 mv(x10, super_klass);
4720
4721 #ifndef PRODUCT
4722 incrementw(ExternalAddress((address)&SharedRuntime::_partial_subtype_ctr));
4723 #endif // PRODUCT
4724
4725 // We will consult the secondary-super array.
4726 ld(x15, secondary_supers_addr);
4727 // Load the array length.
4728 lwu(x12, Address(x15, Array<Klass*>::length_offset_in_bytes()));
4729 // Skip to start of data.
4730 addi(x15, x15, Array<Klass*>::base_offset_in_bytes());
4731
4732 // Set t0 to an obvious invalid value, falling through by default
4733 mv(t0, -1);
4734 // Scan X12 words at [X15] for an occurrence of X10.
4735 repne_scan(x15, x10, x12, t0);
4736
4737 // pop will restore x10, so we should use a temp register to keep its value
4738 mv(t1, x10);
4739
4740 // Unspill the temp registers:
4741 pop_reg(pushed_registers, sp);
4742
4743 bne(t1, t0, *L_failure);
4744
4745 // Success. Cache the super we found an proceed in triumph.
4746 if (UseSecondarySupersCache) {
4747 sd(super_klass, super_cache_addr);
4748 }
4749
4750 if (L_success != &L_fallthrough) {
4751 j(*L_success);
4752 }
4753
4754 #undef IS_A_TEMP
4755
4756 bind(L_fallthrough);
4757 }
4758
4759 // population_count variant for running without the CPOP
4760 // instruction, which was introduced with Zbb extension.
4761 void MacroAssembler::population_count(Register dst, Register src,
4762 Register tmp1, Register tmp2) {
4763 if (UsePopCountInstruction) {
4764 cpop(dst, src);
4765 } else {
4766 assert_different_registers(src, tmp1, tmp2);
4767 assert_different_registers(dst, tmp1, tmp2);
4768 Label loop, done;
4769
4770 mv(tmp1, src);
4771 // dst = 0;
4772 // while(tmp1 != 0) {
4773 // dst++;
4774 // tmp1 &= (tmp1 - 1);
4775 // }
4776 mv(dst, zr);
4777 beqz(tmp1, done);
4778 {
4779 bind(loop);
4780 addi(dst, dst, 1);
4781 subi(tmp2, tmp1, 1);
4782 andr(tmp1, tmp1, tmp2);
4783 bnez(tmp1, loop);
4784 }
4785 bind(done);
4786 }
4787 }
4788
4789 // If Register r is invalid, remove a new register from
4790 // available_regs, and add new register to regs_to_push.
4791 Register MacroAssembler::allocate_if_noreg(Register r,
4792 RegSetIterator<Register> &available_regs,
4793 RegSet ®s_to_push) {
4794 if (!r->is_valid()) {
4795 r = *available_regs++;
4796 regs_to_push += r;
4797 }
4798 return r;
4799 }
4800
4801 // check_klass_subtype_slow_path_table() looks for super_klass in the
4802 // hash table belonging to super_klass, branching to L_success or
4803 // L_failure as appropriate. This is essentially a shim which
4804 // allocates registers as necessary then calls
4805 // lookup_secondary_supers_table() to do the work. Any of the tmp
4806 // regs may be noreg, in which case this logic will chooses some
4807 // registers push and pop them from the stack.
4808 void MacroAssembler::check_klass_subtype_slow_path_table(Register sub_klass,
4809 Register super_klass,
4810 Register tmp1_reg,
4811 Register tmp2_reg,
4812 Label* L_success,
4813 Label* L_failure,
4814 bool set_cond_codes) {
4815 RegSet tmps = RegSet::of(tmp1_reg, tmp2_reg);
4816
4817 assert_different_registers(sub_klass, super_klass, tmp1_reg, tmp2_reg);
4818
4819 Label L_fallthrough;
4820 int label_nulls = 0;
4821 if (L_success == nullptr) { L_success = &L_fallthrough; label_nulls++; }
4822 if (L_failure == nullptr) { L_failure = &L_fallthrough; label_nulls++; }
4823 assert(label_nulls <= 1, "at most one null in the batch");
4824
4825 BLOCK_COMMENT("check_klass_subtype_slow_path");
4826
4827 RegSet caller_save_regs = RegSet::of(x7) + RegSet::range(x10, x17) + RegSet::range(x28, x31);
4828 RegSetIterator<Register> available_regs = (caller_save_regs - tmps - sub_klass - super_klass).begin();
4829
4830 RegSet pushed_regs;
4831
4832 tmp1_reg = allocate_if_noreg(tmp1_reg, available_regs, pushed_regs);
4833 tmp2_reg = allocate_if_noreg(tmp2_reg, available_regs, pushed_regs);
4834
4835 Register tmp3_reg = noreg, tmp4_reg = noreg, result_reg = noreg;
4836
4837 tmp3_reg = allocate_if_noreg(tmp3_reg, available_regs, pushed_regs);
4838 tmp4_reg = allocate_if_noreg(tmp4_reg, available_regs, pushed_regs);
4839 result_reg = allocate_if_noreg(result_reg, available_regs, pushed_regs);
4840
4841 push_reg(pushed_regs, sp);
4842
4843 lookup_secondary_supers_table_var(sub_klass,
4844 super_klass,
4845 result_reg,
4846 tmp1_reg, tmp2_reg, tmp3_reg,
4847 tmp4_reg, nullptr);
4848
4849 // Move the result to t1 as we are about to unspill the tmp registers.
4850 mv(t1, result_reg);
4851
4852 // Unspill the tmp. registers:
4853 pop_reg(pushed_regs, sp);
4854
4855 // NB! Callers may assume that, when set_cond_codes is true, this
4856 // code sets tmp2_reg to a nonzero value.
4857 if (set_cond_codes) {
4858 mv(tmp2_reg, 1);
4859 }
4860
4861 bnez(t1, *L_failure);
4862
4863 if (L_success != &L_fallthrough) {
4864 j(*L_success);
4865 }
4866
4867 bind(L_fallthrough);
4868 }
4869
4870 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
4871 Register super_klass,
4872 Register tmp1_reg,
4873 Register tmp2_reg,
4874 Label* L_success,
4875 Label* L_failure,
4876 bool set_cond_codes) {
4877 if (UseSecondarySupersTable) {
4878 check_klass_subtype_slow_path_table
4879 (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4880 } else {
4881 check_klass_subtype_slow_path_linear
4882 (sub_klass, super_klass, tmp1_reg, tmp2_reg, L_success, L_failure, set_cond_codes);
4883 }
4884 }
4885
// The inline hashed lookup calls the slow path stub when there is a collision
// in the secondary supers array, so both sides must agree on which register
// holds what. This macro asserts the fixed assignment:
//   super klass -> x10, array base -> x11, array length -> x12,
//   array index -> x13, sub klass  -> x14, result       -> x15,
//   bitmap      -> x16
// The last four may alternatively be noreg where a caller does not use them.
#define LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(super_r, base_r, length_r,                \
                                                index_r, sub_r, result_r, bitmap_r)      \
do {                                                                                     \
  assert(super_r  == x10                          &&                                     \
         base_r   == x11                          &&                                     \
         length_r == x12                          &&                                     \
         (index_r  == x13 || index_r  == noreg)   &&                                     \
         (sub_r    == x14 || sub_r    == noreg)   &&                                     \
         (result_r == x15 || result_r == noreg)   &&                                     \
         (bitmap_r == x16 || bitmap_r == noreg), "registers must match riscv.ad");       \
} while(0)
4900
4901 bool MacroAssembler::lookup_secondary_supers_table_const(Register r_sub_klass,
4902 Register r_super_klass,
4903 Register result,
4904 Register tmp1,
4905 Register tmp2,
4906 Register tmp3,
4907 Register tmp4,
4908 u1 super_klass_slot,
4909 bool stub_is_near) {
4910 assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
4911
4912 Label L_fallthrough;
4913
4914 BLOCK_COMMENT("lookup_secondary_supers_table {");
4915
4916 const Register
4917 r_array_base = tmp1, // x11
4918 r_array_length = tmp2, // x12
4919 r_array_index = tmp3, // x13
4920 r_bitmap = tmp4; // x16
4921
4922 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
4923 r_array_index, r_sub_klass, result, r_bitmap);
4924
4925 u1 bit = super_klass_slot;
4926
4927 // Initialize result value to 1 which means mismatch.
4928 mv(result, 1);
4929
4930 ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
4931
4932 // First check the bitmap to see if super_klass might be present. If
4933 // the bit is zero, we are certain that super_klass is not one of
4934 // the secondary supers.
4935 test_bit(t0, r_bitmap, bit);
4936 beqz(t0, L_fallthrough);
4937
4938 // Get the first array index that can contain super_klass into r_array_index.
4939 if (bit != 0) {
4940 slli(r_array_index, r_bitmap, (Klass::SECONDARY_SUPERS_TABLE_MASK - bit));
4941 population_count(r_array_index, r_array_index, tmp1, tmp2);
4942 } else {
4943 mv(r_array_index, (u1)1);
4944 }
4945
4946 // We will consult the secondary-super array.
4947 ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
4948
4949 // The value i in r_array_index is >= 1, so even though r_array_base
4950 // points to the length, we don't need to adjust it to point to the data.
4951 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
4952 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
4953
4954 shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
4955 ld(result, Address(result));
4956 xorr(result, result, r_super_klass);
4957 beqz(result, L_fallthrough); // Found a match
4958
4959 // Is there another entry to check? Consult the bitmap.
4960 test_bit(t0, r_bitmap, (bit + 1) & Klass::SECONDARY_SUPERS_TABLE_MASK);
4961 beqz(t0, L_fallthrough);
4962
4963 // Linear probe.
4964 if (bit != 0) {
4965 ror(r_bitmap, r_bitmap, bit);
4966 }
4967
4968 // The slot we just inspected is at secondary_supers[r_array_index - 1].
4969 // The next slot to be inspected, by the stub we're about to call,
4970 // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
4971 // have been checked.
4972 rt_call(StubRoutines::lookup_secondary_supers_table_slow_path_stub());
4973
4974 BLOCK_COMMENT("} lookup_secondary_supers_table");
4975
4976 bind(L_fallthrough);
4977
4978 if (VerifySecondarySupers) {
4979 verify_secondary_supers_table(r_sub_klass, r_super_klass, // x14, x10
4980 result, tmp1, tmp2, tmp3); // x15, x11, x12, x13
4981 }
4982 return true;
4983 }
4984
4985 // At runtime, return 0 in result if r_super_klass is a superclass of
4986 // r_sub_klass, otherwise return nonzero. Use this version of
4987 // lookup_secondary_supers_table() if you don't know ahead of time
4988 // which superclass will be searched for. Used by interpreter and
4989 // runtime stubs. It is larger and has somewhat greater latency than
4990 // the version above, which takes a constant super_klass_slot.
4991 void MacroAssembler::lookup_secondary_supers_table_var(Register r_sub_klass,
4992 Register r_super_klass,
4993 Register result,
4994 Register tmp1,
4995 Register tmp2,
4996 Register tmp3,
4997 Register tmp4,
4998 Label *L_success) {
4999 assert_different_registers(r_sub_klass, r_super_klass, result, tmp1, tmp2, tmp3, tmp4, t0, t1);
5000
5001 Label L_fallthrough;
5002
5003 BLOCK_COMMENT("lookup_secondary_supers_table {");
5004
5005 const Register
5006 r_array_index = tmp3,
5007 r_bitmap = tmp4,
5008 slot = t1;
5009
5010 lbu(slot, Address(r_super_klass, Klass::hash_slot_offset()));
5011
5012 // Make sure that result is nonzero if the test below misses.
5013 mv(result, 1);
5014
5015 ld(r_bitmap, Address(r_sub_klass, Klass::secondary_supers_bitmap_offset()));
5016
5017 // First check the bitmap to see if super_klass might be present. If
5018 // the bit is zero, we are certain that super_klass is not one of
5019 // the secondary supers.
5020
5021 // This next instruction is equivalent to:
5022 // mv(tmp_reg, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
5023 // sub(r_array_index, slot, tmp_reg);
5024 xori(r_array_index, slot, (u1)(Klass::SECONDARY_SUPERS_TABLE_SIZE - 1));
5025 sll(r_array_index, r_bitmap, r_array_index);
5026 test_bit(t0, r_array_index, Klass::SECONDARY_SUPERS_TABLE_SIZE - 1);
5027 beqz(t0, L_fallthrough);
5028
5029 // Get the first array index that can contain super_klass into r_array_index.
5030 population_count(r_array_index, r_array_index, tmp1, tmp2);
5031
5032 // NB! r_array_index is off by 1. It is compensated by keeping r_array_base off by 1 word.
5033
5034 const Register
5035 r_array_base = tmp1,
5036 r_array_length = tmp2;
5037
5038 // The value i in r_array_index is >= 1, so even though r_array_base
5039 // points to the length, we don't need to adjust it to point to the data.
5040 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "Adjust this code");
5041 assert(Array<Klass*>::length_offset_in_bytes() == 0, "Adjust this code");
5042
5043 // We will consult the secondary-super array.
5044 ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
5045
5046 shadd(result, r_array_index, r_array_base, result, LogBytesPerWord);
5047 ld(result, Address(result));
5048 xorr(result, result, r_super_klass);
5049 beqz(result, L_success ? *L_success : L_fallthrough); // Found a match
5050
5051 // Is there another entry to check? Consult the bitmap.
5052 ror(r_bitmap, r_bitmap, slot);
5053 test_bit(t0, r_bitmap, 1);
5054 beqz(t0, L_fallthrough);
5055
5056 // The slot we just inspected is at secondary_supers[r_array_index - 1].
5057 // The next slot to be inspected, by the logic we're about to call,
5058 // is secondary_supers[r_array_index]. Bits 0 and 1 in the bitmap
5059 // have been checked.
5060 lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index,
5061 r_bitmap, result, r_array_length, false /*is_stub*/);
5062
5063 BLOCK_COMMENT("} lookup_secondary_supers_table");
5064
5065 bind(L_fallthrough);
5066
5067 if (VerifySecondarySupers) {
5068 verify_secondary_supers_table(r_sub_klass, r_super_klass,
5069 result, tmp1, tmp2, tmp3);
5070 }
5071
5072 if (L_success) {
5073 beqz(result, *L_success);
5074 }
5075 }
5076
5077 // Called by code generated by check_klass_subtype_slow_path
5078 // above. This is called when there is a collision in the hashed
5079 // lookup in the secondary supers array.
5080 void MacroAssembler::lookup_secondary_supers_table_slow_path(Register r_super_klass,
5081 Register r_array_base,
5082 Register r_array_index,
5083 Register r_bitmap,
5084 Register result,
5085 Register tmp,
5086 bool is_stub) {
5087 assert_different_registers(r_super_klass, r_array_base, r_array_index, r_bitmap, tmp, result, t0);
5088
5089 const Register
5090 r_array_length = tmp,
5091 r_sub_klass = noreg; // unused
5092
5093 if (is_stub) {
5094 LOOKUP_SECONDARY_SUPERS_TABLE_REGISTERS(r_super_klass, r_array_base, r_array_length,
5095 r_array_index, r_sub_klass, result, r_bitmap);
5096 }
5097
5098 Label L_matched, L_fallthrough, L_bitmap_full;
5099
5100 // Initialize result value to 1 which means mismatch.
5101 mv(result, 1);
5102
5103 // Load the array length.
5104 lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
5105 // And adjust the array base to point to the data.
5106 // NB! Effectively increments current slot index by 1.
5107 assert(Array<Klass*>::base_offset_in_bytes() == wordSize, "");
5108 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
5109
5110 // Check if bitmap is SECONDARY_SUPERS_BITMAP_FULL
5111 assert(Klass::SECONDARY_SUPERS_BITMAP_FULL == ~uintx(0), "Adjust this code");
5112 subw(t0, r_array_length, Klass::SECONDARY_SUPERS_TABLE_SIZE - 2);
5113 bgtz(t0, L_bitmap_full);
5114
5115 // NB! Our caller has checked bits 0 and 1 in the bitmap. The
5116 // current slot (at secondary_supers[r_array_index]) has not yet
5117 // been inspected, and r_array_index may be out of bounds if we
5118 // wrapped around the end of the array.
5119
5120 { // This is conventional linear probing, but instead of terminating
5121 // when a null entry is found in the table, we maintain a bitmap
5122 // in which a 0 indicates missing entries.
5123 // As long as the bitmap is not completely full,
5124 // array_length == popcount(bitmap). The array_length check above
5125 // guarantees there are 0s in the bitmap, so the loop eventually
5126 // terminates.
5127 Label L_loop;
5128 bind(L_loop);
5129
5130 // Check for wraparound.
5131 Label skip;
5132 blt(r_array_index, r_array_length, skip);
5133 mv(r_array_index, zr);
5134 bind(skip);
5135
5136 shadd(t0, r_array_index, r_array_base, t0, LogBytesPerWord);
5137 ld(t0, Address(t0));
5138 beq(t0, r_super_klass, L_matched);
5139
5140 test_bit(t0, r_bitmap, 2); // look-ahead check (Bit 2); result is non-zero
5141 beqz(t0, L_fallthrough);
5142
5143 ror(r_bitmap, r_bitmap, 1);
5144 addi(r_array_index, r_array_index, 1);
5145 j(L_loop);
5146 }
5147
5148 { // Degenerate case: more than 64 secondary supers.
5149 // FIXME: We could do something smarter here, maybe a vectorized
5150 // comparison or a binary search, but is that worth any added
5151 // complexity?
5152 bind(L_bitmap_full);
5153 repne_scan(r_array_base, r_super_klass, r_array_length, t0);
5154 bne(r_super_klass, t0, L_fallthrough);
5155 }
5156
5157 bind(L_matched);
5158 mv(result, zr);
5159
5160 bind(L_fallthrough);
5161 }
5162
5163 // Make sure that the hashed lookup and a linear scan agree.
5164 void MacroAssembler::verify_secondary_supers_table(Register r_sub_klass,
5165 Register r_super_klass,
5166 Register result,
5167 Register tmp1,
5168 Register tmp2,
5169 Register tmp3) {
5170 assert_different_registers(r_sub_klass, r_super_klass, tmp1, tmp2, tmp3, result, t0, t1);
5171
5172 const Register
5173 r_array_base = tmp1, // X11
5174 r_array_length = tmp2, // X12
5175 r_array_index = noreg, // unused
5176 r_bitmap = noreg; // unused
5177
5178 BLOCK_COMMENT("verify_secondary_supers_table {");
5179
5180 // We will consult the secondary-super array.
5181 ld(r_array_base, Address(r_sub_klass, in_bytes(Klass::secondary_supers_offset())));
5182
5183 // Load the array length.
5184 lwu(r_array_length, Address(r_array_base, Array<Klass*>::length_offset_in_bytes()));
5185 // And adjust the array base to point to the data.
5186 addi(r_array_base, r_array_base, Array<Klass*>::base_offset_in_bytes());
5187
5188 repne_scan(r_array_base, r_super_klass, r_array_length, t0);
5189 Label failed;
5190 mv(tmp3, 1);
5191 bne(r_super_klass, t0, failed);
5192 mv(tmp3, zr);
5193 bind(failed);
5194
5195 snez(result, result); // normalize result to 0/1 for comparison
5196
5197 Label passed;
5198 beq(tmp3, result, passed);
5199 {
5200 mv(x10, r_super_klass);
5201 mv(x11, r_sub_klass);
5202 mv(x12, tmp3);
5203 mv(x13, result);
5204 mv(x14, (address)("mismatch"));
5205 rt_call(CAST_FROM_FN_PTR(address, Klass::on_secondary_supers_verification_failure));
5206 should_not_reach_here();
5207 }
5208 bind(passed);
5209
5210 BLOCK_COMMENT("} verify_secondary_supers_table");
5211 }
5212
5213 // Defines obj, preserves var_size_in_bytes, okay for tmp2 == var_size_in_bytes.
5214 void MacroAssembler::tlab_allocate(Register obj,
5215 Register var_size_in_bytes,
5216 int con_size_in_bytes,
5217 Register tmp1,
5218 Register tmp2,
5219 Label& slow_case,
5220 bool is_far) {
5221 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
5222 bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, tmp1, tmp2, slow_case, is_far);
5223 }
5224
5225 // get_thread() can be called anywhere inside generated code so we
5226 // need to save whatever non-callee save context might get clobbered
5227 // by the call to Thread::current() or, indeed, the call setup code.
5228 void MacroAssembler::get_thread(Register thread) {
5229 // save all call-clobbered regs except thread
5230 RegSet saved_regs = RegSet::range(x5, x7) + RegSet::range(x10, x17) +
5231 RegSet::range(x28, x31) + ra - thread;
5232 push_reg(saved_regs, sp);
5233
5234 mv(t1, CAST_FROM_FN_PTR(address, Thread::current));
5235 jalr(t1);
5236 if (thread != c_rarg0) {
5237 mv(thread, c_rarg0);
5238 }
5239
5240 // restore pushed registers
5241 pop_reg(saved_regs, sp);
5242 }
5243
5244 void MacroAssembler::load_byte_map_base(Register reg) {
5245 CardTableBarrierSet* ctbs = CardTableBarrierSet::barrier_set();
5246 mv(reg, (uint64_t)ctbs->card_table_base_const());
5247 }
5248
5249 void MacroAssembler::build_frame(int framesize) {
5250 assert(framesize >= 2, "framesize must include space for FP/RA");
5251 assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5252 sub(sp, sp, framesize);
5253 sd(fp, Address(sp, framesize - 2 * wordSize));
5254 sd(ra, Address(sp, framesize - wordSize));
5255 if (PreserveFramePointer) { add(fp, sp, framesize); }
5256 }
5257
5258 void MacroAssembler::remove_frame(int framesize) {
5259 assert(framesize >= 2, "framesize must include space for FP/RA");
5260 assert(framesize % (2*wordSize) == 0, "must preserve 2*wordSize alignment");
5261 ld(fp, Address(sp, framesize - 2 * wordSize));
5262 ld(ra, Address(sp, framesize - wordSize));
5263 add(sp, sp, framesize);
5264 }
5265
5266 void MacroAssembler::reserved_stack_check() {
5267 // testing if reserved zone needs to be enabled
5268 Label no_reserved_zone_enabling;
5269
5270 ld(t0, Address(xthread, JavaThread::reserved_stack_activation_offset()));
5271 bltu(sp, t0, no_reserved_zone_enabling);
5272
5273 enter(); // RA and FP are live.
5274 mv(c_rarg0, xthread);
5275 rt_call(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
5276 leave();
5277
5278 // We have already removed our own frame.
5279 // throw_delayed_StackOverflowError will think that it's been
5280 // called by our caller.
5281 j(RuntimeAddress(SharedRuntime::throw_delayed_StackOverflowError_entry()));
5282 should_not_reach_here();
5283
5284 bind(no_reserved_zone_enabling);
5285 }
5286
5287 // Move the address of the polling page into dest.
5288 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
5289 ld(dest, Address(xthread, JavaThread::polling_page_offset()));
5290 }
5291
5292 // Read the polling page. The address of the polling page must
5293 // already be in r.
5294 void MacroAssembler::read_polling_page(Register r, int32_t offset, relocInfo::relocType rtype) {
5295 relocate(rtype, [&] {
5296 lwu(zr, Address(r, offset));
5297 });
5298 }
5299
5300 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
5301 #ifdef ASSERT
5302 {
5303 ThreadInVMfromUnknown tiv;
5304 assert (UseCompressedOops, "should only be used for compressed oops");
5305 assert (Universe::heap() != nullptr, "java heap should be initialized");
5306 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5307 assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
5308 }
5309 #endif
5310 int oop_index = oop_recorder()->find_index(obj);
5311 relocate(oop_Relocation::spec(oop_index), [&] {
5312 li32(dst, 0xDEADBEEF);
5313 });
5314 zext(dst, dst, 32);
5315 }
5316
5317 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
5318 assert (oop_recorder() != nullptr, "this assembler needs an OopRecorder");
5319 int index = oop_recorder()->find_index(k);
5320
5321 narrowKlass nk = CompressedKlassPointers::encode(k);
5322 relocate(metadata_Relocation::spec(index), [&] {
5323 li32(dst, nk);
5324 });
5325 zext(dst, dst, 32);
5326 }
5327
5328 address MacroAssembler::reloc_call(Address entry, Register tmp) {
5329 assert(entry.rspec().type() == relocInfo::runtime_call_type ||
5330 entry.rspec().type() == relocInfo::opt_virtual_call_type ||
5331 entry.rspec().type() == relocInfo::static_call_type ||
5332 entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
5333
5334 address target = entry.target();
5335
5336 if (!in_scratch_emit_size()) {
5337 address stub = emit_reloc_call_address_stub(offset(), target);
5338 if (stub == nullptr) {
5339 postcond(pc() == badAddress);
5340 return nullptr; // CodeCache is full
5341 }
5342 }
5343
5344 address call_pc = pc();
5345 #ifdef ASSERT
5346 if (entry.rspec().type() != relocInfo::runtime_call_type) {
5347 assert_alignment(call_pc);
5348 }
5349 #endif
5350
5351 // The relocation created while emitting the stub will ensure this
5352 // call instruction is subsequently patched to call the stub.
5353 relocate(entry.rspec(), [&] {
5354 auipc(tmp, 0);
5355 ld(tmp, Address(tmp, 0));
5356 jalr(tmp);
5357 });
5358
5359 postcond(pc() != badAddress);
5360 return call_pc;
5361 }
5362
5363 address MacroAssembler::ic_call(address entry, jint method_index) {
5364 RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
5365 assert(!in_compressible_scope(), "Must be");
5366 movptr(t0, (address)Universe::non_oop_word(), t1);
5367 assert_cond(entry != nullptr);
5368 return reloc_call(Address(entry, rh));
5369 }
5370
5371 int MacroAssembler::ic_check_size() {
5372 // No compressed
5373 return (MacroAssembler::instruction_size * (2 /* 2 loads */ + 1 /* branch */)) +
5374 far_branch_size() + (UseCompactObjectHeaders ? MacroAssembler::instruction_size * 1 : 0);
5375 }
5376
5377 int MacroAssembler::ic_check(int end_alignment) {
5378 IncompressibleScope scope(this);
5379 Register receiver = j_rarg0;
5380 Register data = t0;
5381
5382 Register tmp1 = t1; // scratch
5383 // t2 is saved on call, thus should have been saved before this check.
5384 // Hence we can clobber it.
5385 Register tmp2 = t2;
5386
5387 // The UEP of a code blob ensures that the VEP is padded. However, the padding of the UEP is placed
5388 // before the inline cache check, so we don't have to execute any nop instructions when dispatching
5389 // through the UEP, yet we can ensure that the VEP is aligned appropriately. That's why we align
5390 // before the inline cache check here, and not after
5391 align(end_alignment, ic_check_size());
5392 int uep_offset = offset();
5393
5394 if (UseCompactObjectHeaders) {
5395 load_narrow_klass_compact(tmp1, receiver);
5396 lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5397 } else {
5398 lwu(tmp1, Address(receiver, oopDesc::klass_offset_in_bytes()));
5399 lwu(tmp2, Address(data, CompiledICData::speculated_klass_offset()));
5400 }
5401
5402 Label ic_hit;
5403 beq(tmp1, tmp2, ic_hit);
5404 // Note, far_jump is not fixed size.
5405 // Is this ever generates a movptr alignment/size will be off.
5406 far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
5407 bind(ic_hit);
5408
5409 assert((offset() % end_alignment) == 0, "Misaligned verified entry point.");
5410 return uep_offset;
5411 }
5412
5413 // Emit an address stub for a call to a target which is too far away.
5414 // Note that we only put the target address of the call in the stub.
5415 //
5416 // code sequences:
5417 //
5418 // call-site:
5419 // load target address from stub
5420 // jump-and-link target address
5421 //
5422 // Related address stub for this call site in the stub section:
5423 // alignment nop
5424 // target address
5425
5426 address MacroAssembler::emit_reloc_call_address_stub(int insts_call_instruction_offset, address dest) {
5427 address stub = start_a_stub(max_reloc_call_address_stub_size());
5428 if (stub == nullptr) {
5429 return nullptr; // CodeBuffer::expand failed
5430 }
5431
5432 // We are always 4-byte aligned here.
5433 assert_alignment(pc());
5434
5435 // Make sure the address of destination 8-byte aligned.
5436 align(wordSize, 0);
5437
5438 RelocationHolder rh = trampoline_stub_Relocation::spec(code()->insts()->start() +
5439 insts_call_instruction_offset);
5440 const int stub_start_offset = offset();
5441 relocate(rh, [&] {
5442 assert(offset() - stub_start_offset == 0,
5443 "%ld - %ld == %ld : should be", (long)offset(), (long)stub_start_offset, (long)0);
5444 assert(offset() % wordSize == 0, "bad alignment");
5445 emit_int64((int64_t)dest);
5446 });
5447
5448 const address stub_start_addr = addr_at(stub_start_offset);
5449 end_a_stub();
5450
5451 return stub_start_addr;
5452 }
5453
5454 int MacroAssembler::max_reloc_call_address_stub_size() {
5455 // Max stub size: alignment nop, target address.
5456 return 1 * MacroAssembler::instruction_size + wordSize;
5457 }
5458
5459 int MacroAssembler::static_call_stub_size() {
5460 // (lui, addi, slli, addi, slli, addi) + (lui + lui + slli + add) + jalr
5461 return 11 * MacroAssembler::instruction_size;
5462 }
5463
5464 Address MacroAssembler::add_memory_helper(const Address dst, Register tmp) {
5465 switch (dst.getMode()) {
5466 case Address::base_plus_offset:
5467 // This is the expected mode, although we allow all the other
5468 // forms below.
5469 return form_address(tmp, dst.base(), dst.offset());
5470 default:
5471 la(tmp, dst);
5472 return Address(tmp);
5473 }
5474 }
5475
5476 void MacroAssembler::increment(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5477 assert(((dst.getMode() == Address::base_plus_offset &&
5478 is_simm12(dst.offset())) || is_simm12(value)),
5479 "invalid value and address mode combination");
5480 Address adr = add_memory_helper(dst, tmp2);
5481 assert(!adr.uses(tmp1), "invalid dst for address increment");
5482 ld(tmp1, adr);
5483 add(tmp1, tmp1, value, tmp2);
5484 sd(tmp1, adr);
5485 }
5486
5487 void MacroAssembler::incrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5488 assert(((dst.getMode() == Address::base_plus_offset &&
5489 is_simm12(dst.offset())) || is_simm12(value)),
5490 "invalid value and address mode combination");
5491 Address adr = add_memory_helper(dst, tmp2);
5492 assert(!adr.uses(tmp1), "invalid dst for address increment");
5493 lwu(tmp1, adr);
5494 addw(tmp1, tmp1, value, tmp2);
5495 sw(tmp1, adr);
5496 }
5497
5498 void MacroAssembler::decrement(const Address dst, int64_t value, Register tmp1, Register tmp2) {
5499 assert(((dst.getMode() == Address::base_plus_offset &&
5500 is_simm12(dst.offset())) || is_simm12(value)),
5501 "invalid value and address mode combination");
5502 Address adr = add_memory_helper(dst, tmp2);
5503 assert(!adr.uses(tmp1), "invalid dst for address decrement");
5504 ld(tmp1, adr);
5505 sub(tmp1, tmp1, value, tmp2);
5506 sd(tmp1, adr);
5507 }
5508
5509 void MacroAssembler::decrementw(const Address dst, int32_t value, Register tmp1, Register tmp2) {
5510 assert(((dst.getMode() == Address::base_plus_offset &&
5511 is_simm12(dst.offset())) || is_simm12(value)),
5512 "invalid value and address mode combination");
5513 Address adr = add_memory_helper(dst, tmp2);
5514 assert(!adr.uses(tmp1), "invalid dst for address decrement");
5515 lwu(tmp1, adr);
5516 subw(tmp1, tmp1, value, tmp2);
5517 sw(tmp1, adr);
5518 }
5519
5520 void MacroAssembler::load_method_holder_cld(Register result, Register method) {
5521 load_method_holder(result, method);
5522 ld(result, Address(result, InstanceKlass::class_loader_data_offset()));
5523 }
5524
// Loads the holder class of 'method' into 'holder' by chasing three
// pointers: Method -> ConstMethod -> ConstantPool -> pool holder.
// 'holder' is used as the intermediate register for every step.
void MacroAssembler::load_method_holder(Register holder, Register method) {
  ld(holder, Address(method, Method::const_offset())); // ConstMethod*
  ld(holder, Address(holder, ConstMethod::constants_offset())); // ConstantPool*
  ld(holder, Address(holder, ConstantPool::pool_holder_offset())); // InstanceKlass*
}
5530
5531 // string indexof
5532 // compute index by trailing zeros
5533 void MacroAssembler::compute_index(Register haystack, Register trailing_zeros,
5534 Register match_mask, Register result,
5535 Register ch2, Register tmp,
5536 bool haystack_isL) {
5537 int haystack_chr_shift = haystack_isL ? 0 : 1;
5538 srl(match_mask, match_mask, trailing_zeros);
5539 srli(match_mask, match_mask, 1);
5540 srli(tmp, trailing_zeros, LogBitsPerByte);
5541 if (!haystack_isL) andi(tmp, tmp, 0xE);
5542 add(haystack, haystack, tmp);
5543 ld(ch2, Address(haystack));
5544 if (!haystack_isL) srli(tmp, tmp, haystack_chr_shift);
5545 add(result, result, tmp);
5546 }
5547
5548 // string indexof
5549 // Find pattern element in src, compute match mask,
5550 // only the first occurrence of 0x80/0x8000 at low bits is the valid match index
5551 // match mask patterns and corresponding indices would be like:
5552 // - 0x8080808080808080 (Latin1)
5553 // - 7 6 5 4 3 2 1 0 (match index)
5554 // - 0x8000800080008000 (UTF16)
5555 // - 3 2 1 0 (match index)
// Emitted computation (src is clobbered):
//   src        = pattern ^ src
//   match_mask = (src - mask1) & ~(src | mask2)
// NOTE(review): mask1/mask2 are presumably the per-element 0x01.. and 0x7f..
// constants of the usual "find a zero element" trick - confirm at the callers.
void MacroAssembler::compute_match_mask(Register src, Register pattern, Register match_mask,
                                        Register mask1, Register mask2) {
  xorr(src, pattern, src);            // elements equal to the pattern become zero
  sub(match_mask, src, mask1);
  orr(src, src, mask2);
  notr(src, src);                     // src = ~(src | mask2)
  andr(match_mask, match_mask, src);
}
5564
5565 #ifdef COMPILER2
5566 // Code for BigInteger::mulAdd intrinsic
5567 // out = x10
5568 // in = x11
5569 // offset = x12 (already out.length-offset)
5570 // len = x13
5571 // k = x14
5572 // tmp = x28
5573 //
5574 // pseudo code from java implementation:
5575 // long kLong = k & LONG_MASK;
5576 // carry = 0;
5577 // offset = out.length-offset - 1;
5578 // for (int j = len - 1; j >= 0; j--) {
5579 // product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
5580 // out[offset--] = (int)product;
5581 // carry = product >>> 32;
5582 // }
5583 // return (int)carry;
// Register roles inside the emitted code:
//   tmp    - first holds the out[] base, later the unroll factor
//   out    - reused as the running carry; it holds the result at L_end
//   offset - turned into a byte address inside out[], walked downwards
//   in     - turned into the address just past in[len - 1], walked downwards
//   t0, t1 - scratch
void MacroAssembler::mul_add(Register out, Register in, Register offset,
                             Register len, Register k, Register tmp) {
  Label L_tail_loop, L_unroll, L_end;
  mv(tmp, out);                         // save the out[] base
  mv(out, zr);                          // carry = 0
  blez(len, L_end);                     // len <= 0: return carry == 0
  zext(k, k, 32);                       // kLong = k & LONG_MASK
  slliw(t0, offset, LogBytesPerInt);
  add(offset, tmp, t0);                 // offset = out base + (offset << LogBytesPerInt)
  slliw(t0, len, LogBytesPerInt);
  add(in, in, t0);                      // in = in base + (len << LogBytesPerInt)

  const int unroll = 8;
  mv(tmp, unroll);
  blt(len, tmp, L_tail_loop);           // fewer than 'unroll' elements: tail loop only
  bind(L_unroll);
  for (int i = 0; i < unroll; i++) {
    // One step of: product = in[j] * kLong + out[offset] + carry
    subi(in, in, BytesPerInt);
    lwu(t0, Address(in, 0));
    mul(t1, t0, k);
    add(t0, t1, out);
    subi(offset, offset, BytesPerInt);
    lwu(t1, Address(offset, 0));
    add(t0, t0, t1);
    sw(t0, Address(offset, 0));         // out[offset] = (int)product
    srli(out, t0, 32);                  // carry = product >>> 32
  }
  subw(len, len, tmp);
  bge(len, tmp, L_unroll);              // another full group of 'unroll' elements left

  bind(L_tail_loop);
  blez(len, L_end);
  // Same step as above, one element per iteration.
  subi(in, in, BytesPerInt);
  lwu(t0, Address(in, 0));
  mul(t1, t0, k);
  add(t0, t1, out);
  subi(offset, offset, BytesPerInt);
  lwu(t1, Address(offset, 0));
  add(t0, t0, t1);
  sw(t0, Address(offset, 0));
  srli(out, t0, 32);
  subiw(len, len, 1);
  j(L_tail_loop);

  bind(L_end);
}
5630
5631 // Multiply and multiply-accumulate unsigned 64-bit registers.
// prod_hi:prod_lo = n * m (unsigned). prod_lo is written first, so it is the
// caller's job to make sure that prod_lo does not alias n or m; only
// prod_lo != prod_hi is asserted here.
void MacroAssembler::wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
  assert_different_registers(prod_lo, prod_hi);

  mul(prod_lo, n, m);      // low half of the product
  mulhu(prod_hi, n, m);    // high half of the unsigned product
}
5638
// sum_hi:sum_lo += n * m. tmp1 receives the low half of the product and is
// then reused as the carry out of the low addition; tmp2 receives the high
// half of the product.
void MacroAssembler::wide_madd(Register sum_lo, Register sum_hi, Register n,
                               Register m, Register tmp1, Register tmp2) {
  assert_different_registers(sum_lo, sum_hi);
  assert_different_registers(sum_hi, tmp2);

  wide_mul(tmp1, tmp2, n, m);
  cad(sum_lo, sum_lo, tmp1, tmp1); // Add tmp1 to sum_lo with carry output to tmp1
  adc(sum_hi, sum_hi, tmp2, tmp1); // Add tmp2 with carry to sum_hi
}
5648
5649 // add two unsigned input and output carry
// dst = src1 + src2; carry = (dst < src2) as an unsigned compare, i.e. 1 when
// the addition wrapped around. src2 is read again after dst has been written,
// hence the dst != src2 requirement. carry may alias src2 (it is written last).
void MacroAssembler::cad(Register dst, Register src1, Register src2, Register carry)
{
  assert_different_registers(dst, carry);
  assert_different_registers(dst, src2);
  add(dst, src1, src2);
  sltu(carry, dst, src2);
}
5657
5658 // add two input with carry
// dst = src1 + src2 + carry. No carry out is produced. carry is read after
// dst has been written, hence the dst != carry requirement.
void MacroAssembler::adc(Register dst, Register src1, Register src2, Register carry) {
  assert_different_registers(dst, carry);
  add(dst, src1, src2);
  add(dst, dst, carry);
}
5664
5665 // add two unsigned input with carry and output carry
// dst = src1 + src2 + carry (via adc), then carry = (dst < src2) as an
// unsigned compare. src2 is read again after dst has been written, hence the
// dst != src2 requirement (adc additionally requires dst != carry).
void MacroAssembler::cadc(Register dst, Register src1, Register src2, Register carry) {
  assert_different_registers(dst, src2);
  adc(dst, src1, src2, carry);
  sltu(carry, dst, src2);
}
5671
// Adds src1 and then src2 to the pair dest_hi:dest_lo, propagating the carry
// of each low addition into the high half:
//   dest_lo += src1; dest_hi += carry;
//   dest_lo += src2; final_dest_hi = dest_hi + carry;
// 'carry' is a scratch register that receives each carry out.
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2, Register carry) {
  cad(dest_lo, dest_lo, src1, carry);
  add(dest_hi, dest_hi, carry);
  cad(dest_lo, dest_lo, src2, carry);
  add(final_dest_hi, dest_hi, carry);
}
5679
5680 /**
5681 * Multiply 64 bit by 64 bit first loop.
5682 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  // jlong carry, x[], y[], z[];
  // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  // huge_128 product = y[idx] * x[xstart] + carry;
  // z[kdx] = (jlong)product;
  // carry = (jlong)(product >>> 64);
  // }
  // z[xstart] = carry;
  //
  // t0 and t1 are used as scratch registers throughout.

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  subiw(xstart, xstart, 1);
  bltz(xstart, L_one_x);                 // only a single 32-bit element of x is left

  // Load two adjacent 32-bit elements of x as one 64-bit value.
  shadd(t0, xstart, x, t0, LogBytesPerInt);
  ld(x_xstart, Address(t0, 0));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subiw(idx, idx, 1);
  bltz(idx, L_first_loop_exit);          // y is exhausted
  subiw(idx, idx, 1);
  bltz(idx, L_one_y);                    // only a single 32-bit element of y is left

  // Load two adjacent 32-bit elements of y as one 64-bit value.
  shadd(t0, idx, y, t0, LogBytesPerInt);
  ld(y_idx, Address(t0, 0));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // t0:product = x_xstart * y_idx, then add the incoming carry; the new
  // carry is the high half plus the carry out of the low addition.
  mulhu(t0, x_xstart, y_idx);
  mul(product, x_xstart, y_idx);
  cad(product, product, carry, t1);
  adc(carry, t0, zr, t1);

  // Store the low 64 bits two int slots further down in z.
  subiw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  shadd(t0, kdx, z, t0, LogBytesPerInt);
  sd(product, Address(t0, 0));

  j(L_first_loop);

  bind(L_one_y);
  lwu(y_idx, Address(y, 0));             // the last element of y, taken as 32 bits
  j(L_multiply);

  bind(L_one_x);
  lwu(x_xstart, Address(x, 0));          // the last element of x, taken as 32 bits
  j(L_first_loop);

  bind(L_first_loop_exit);
}
5740
5741 /**
5742 * Multiply 128 bit by 128 bit. Unrolled inner loop.
5743 *
5744 */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {
  // jlong carry, x[], y[], z[];
  // int kdx = xstart+1;
  // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  // huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  // jlong carry2 = (jlong)(tmp3 >>> 64);
  // huge_128 tmp4 = (y[idx] * product_hi) + z[kdx+idx] + carry2;
  // carry = (jlong)(tmp4 >>> 64);
  // z[kdx+idx+1] = (jlong)tmp3;
  // z[kdx+idx] = (jlong)tmp4;
  // }
  // idx += 2;
  // if (idx > 0) {
  // yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  // z[kdx+idx] = (jlong)yz_idx1;
  // carry = (jlong)(yz_idx1 >>> 64);
  // }
  //
  // t0 and t1 are used as scratch registers throughout.

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = number of full iterations; each one consumes four 32-bit elements.
  srliw(jdx, idx, 2);

  bind(L_third_loop);

  subw(jdx, jdx, 1);
  bltz(jdx, L_third_loop_exit);
  subw(idx, idx, 4);

  // Two adjacent 64-bit values of y.
  shadd(t0, idx, y, t0, LogBytesPerInt);
  ld(yz_idx2, Address(t0, 0));
  ld(yz_idx1, Address(t0, wordSize));

  // tmp6 = address of the matching pair in z.
  shadd(tmp6, idx, z, t0, LogBytesPerInt);

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ld(t1, Address(tmp6, 0));
  ld(t0, Address(tmp6, wordSize));

  mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
  mulhu(tmp4, product_hi, yz_idx1);

  ror(t0, t0, 32, tmp); // convert big-endian to little-endian
  ror(t1, t1, 32, tmp);

  mul(tmp, product_hi, yz_idx2); // yz_idx2 * product_hi -> carry2:tmp
  mulhu(carry2, product_hi, yz_idx2);

  // Accumulate: tmp4:tmp3 += carry, += t0 (low z word); then fold in the
  // second product (carry2:tmp) and t1 (high z word), leaving the outgoing
  // carry in 'carry'.
  cad(tmp3, tmp3, carry, carry);
  adc(tmp4, tmp4, zr, carry);
  cad(tmp3, tmp3, t0, t0);
  cadc(tmp4, tmp4, tmp, t0);
  adc(carry, carry2, zr, t0);
  cad(tmp4, tmp4, t1, carry2);
  adc(carry, carry, zr, carry2);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  sd(tmp4, Address(tmp6, 0));
  sd(tmp3, Address(tmp6, wordSize));

  j(L_third_loop);

  bind(L_third_loop_exit);

  // Handle the 0..3 elements that did not fill a group of four.
  andi(idx, idx, 0x3);
  beqz(idx, L_post_third_loop_done);

  Label L_check_1;
  subiw(idx, idx, 2);
  bltz(idx, L_check_1);                  // fewer than two elements left

  // One more 64-bit value of y and z.
  shadd(t0, idx, y, t0, LogBytesPerInt);
  ld(yz_idx1, Address(t0, 0));
  ror(yz_idx1, yz_idx1, 32);

  mul(tmp3, product_hi, yz_idx1); // yz_idx1 * product_hi -> tmp4:tmp3
  mulhu(tmp4, product_hi, yz_idx1);

  shadd(t0, idx, z, t0, LogBytesPerInt);
  ld(yz_idx2, Address(t0, 0));
  ror(yz_idx2, yz_idx2, 32, tmp);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2, tmp);

  ror(tmp3, tmp3, 32, tmp);
  sd(tmp3, Address(t0, 0));

  bind(L_check_1);

  // A final single 32-bit element, if the low bit of idx is set.
  andi(idx, idx, 0x1);
  subiw(idx, idx, 1);
  bltz(idx, L_post_third_loop_done);
  shadd(t0, idx, y, t0, LogBytesPerInt);
  lwu(tmp4, Address(t0, 0));
  mul(tmp3, tmp4, product_hi); // tmp4 * product_hi -> carry2:tmp3
  mulhu(carry2, tmp4, product_hi);

  shadd(t0, idx, z, t0, LogBytesPerInt);
  lwu(tmp4, Address(t0, 0));

  add2_with_carry(carry2, carry2, tmp3, tmp4, carry, t0);

  // t0 was used as the carry scratch above, so the z address is recomputed.
  shadd(t0, idx, z, t0, LogBytesPerInt);
  sw(tmp3, Address(t0, 0));

  // carry = (carry2 << 32) | (tmp3 >>> 32)
  slli(t0, carry2, 32);
  srli(carry, tmp3, 32);
  orr(carry, carry, t0);

  bind(L_post_third_loop_done);
}
5864
5865 /**
5866 * Code for BigInteger::multiplyToLen() intrinsic.
5867 *
5868 * x10: x
5869 * x11: xlen
5870 * x12: y
5871 * x13: ylen
5872 * x14: z
5873 * x15: tmp0
5874 * x16: tmp1
5875 * x17: tmp2
5876 * x7: tmp3
5877 * x28: tmp4
5878 * x29: tmp5
5879 * x30: tmp6
5880 * x31: tmp7
5881 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register tmp0,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {
  // NOTE(review): product_hi is not part of this distinctness check.
  assert_different_registers(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  // Register aliases. Note that idx and jdx share tmp1, and that 'product'
  // reuses xlen.
  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product = xlen;
  const Register x_xstart = tmp0;
  const Register jdx = tmp1;

  mv(idx, ylen); // idx = ylen;
  addw(kdx, xlen, ylen); // kdx = xlen+ylen;
  mv(carry, zr); // carry = 0;

  Label L_done;
  subiw(xstart, xlen, 1);
  bltz(xstart, L_done);                  // xlen <= 0: nothing to do

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  // Store the carry left by the first loop into the remaining one or two
  // int slots below kdx.
  Label L_second_loop_aligned;
  beqz(kdx, L_second_loop_aligned);

  Label L_carry;
  subiw(kdx, kdx, 1);
  beqz(kdx, L_carry);

  shadd(t0, kdx, z, t0, LogBytesPerInt);
  sw(carry, Address(t0, 0));
  srli(carry, carry, 32);
  subiw(kdx, kdx, 1);

  bind(L_carry);
  shadd(t0, kdx, z, t0, LogBytesPerInt);
  sw(carry, Address(t0, 0));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  // carry = 0;
  // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  // (z[k] & LONG_MASK) + carry;
  // z[k] = (int)product;
  // carry = product >>> 32;
  // }
  // z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  bind(L_second_loop_aligned);
  mv(carry, zr); // carry = 0;
  mv(jdx, ylen); // j = ystart+1

  subiw(xstart, xstart, 1); // i = xstart-1;
  bltz(xstart, L_done);

  // Four stack words: [0] = z, [1] = ylen, [2] = x, [3] = xstart.
  subi(sp, sp, 4 * wordSize);
  sd(z, Address(sp, 0));

  Label L_last_x;
  // z is temporarily advanced to the int slot after index xstart.
  shadd(t0, xstart, z, t0, LogBytesPerInt);
  addi(z, t0, 4);
  subiw(xstart, xstart, 1); // i = xstart-1;
  bltz(xstart, L_last_x);

  shadd(t0, xstart, x, t0, LogBytesPerInt);
  ld(product_hi, Address(t0, 0));
  ror(product_hi, product_hi, 32); // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  // x and ylen are passed to the inner loop as scratch registers (carry2 and
  // jdx), so they are spilled here and reloaded afterwards.
  sd(ylen, Address(sp, wordSize));
  sd(x, Address(sp, 2 * wordSize));
  sd(xstart, Address(sp, 3 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ld(z, Address(sp, 0));
  ld(ylen, Address(sp, wordSize));
  ld(x, Address(sp, 2 * wordSize));
  ld(xlen, Address(sp, 3 * wordSize)); // copy old xstart -> xlen
  addi(sp, sp, 4 * wordSize);

  // Store the low 32 bits of the carry at index xlen + 1 ...
  addiw(tmp3, xlen, 1);
  shadd(t0, tmp3, z, t0, LogBytesPerInt);
  sw(carry, Address(t0, 0));

  subiw(tmp3, tmp3, 1);
  bltz(tmp3, L_done);

  // ... and the high 32 bits one slot below it.
  srli(carry, carry, 32);
  shadd(t0, tmp3, z, t0, LogBytesPerInt);
  sw(carry, Address(t0, 0));
  j(L_second_loop_aligned);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  lwu(product_hi, Address(x, 0));
  j(L_third_loop_prologue);

  bind(L_done);
}
5992 #endif
5993
5994 // Count bits of trailing zero chars from lsb to msb until first non-zero
5995 // char seen. For the LL case, shift 8 bits once as there is only one byte
5996 // per each char. For other cases, shift 16 bits once.
5997 void MacroAssembler::ctzc_bits(Register Rd, Register Rs, bool isLL,
5998 Register tmp1, Register tmp2) {
5999 int step = isLL ? 8 : 16;
6000 if (UseZbb) {
6001 ctz(Rd, Rs);
6002 andi(Rd, Rd, -step);
6003 return;
6004 }
6005
6006 assert_different_registers(Rd, tmp1, tmp2);
6007 Label Loop;
6008 mv(tmp2, Rs);
6009 mv(Rd, -step);
6010
6011 bind(Loop);
6012 addi(Rd, Rd, step);
6013 zext(tmp1, tmp2, step);
6014 srli(tmp2, tmp2, step);
6015 beqz(tmp1, Loop);
6016 }
6017
6018 // This instruction reads adjacent 4 bytes from the lower half of source register,
6019 // inflate into a register, for example:
6020 // Rs: A7A6A5A4A3A2A1A0
6021 // Rd: 00A300A200A100A0
6022 void MacroAssembler::inflate_lo32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
6023 assert_different_registers(Rd, Rs, tmp1, tmp2);
6024
6025 mv(tmp1, 0xFF000000); // first byte mask at lower word
6026 andr(Rd, Rs, tmp1);
6027 for (int i = 0; i < 2; i++) {
6028 slli(Rd, Rd, wordSize);
6029 srli(tmp1, tmp1, wordSize);
6030 andr(tmp2, Rs, tmp1);
6031 orr(Rd, Rd, tmp2);
6032 }
6033 slli(Rd, Rd, wordSize);
6034 zext(tmp2, Rs, 8); // last byte mask at lower word
6035 orr(Rd, Rd, tmp2);
6036 }
6037
6038 // This instruction reads adjacent 4 bytes from the upper half of source register,
6039 // inflate into a register, for example:
6040 // Rs: A7A6A5A4A3A2A1A0
6041 // Rd: 00A700A600A500A4
// Note: Rs is modified (shifted right by 32) before being handed to
// inflate_lo32(), so its original value is not preserved.
void MacroAssembler::inflate_hi32(Register Rd, Register Rs, Register tmp1, Register tmp2) {
  assert_different_registers(Rd, Rs, tmp1, tmp2);
  srli(Rs, Rs, 32); // only upper 32 bits are needed
  inflate_lo32(Rd, Rs, tmp1, tmp2);
}
6047
6048 // The size of the blocks erased by the zero_blocks stub. We must
6049 // handle anything smaller than this ourselves in zero_words().
// Counted in words. zero_words(Register, Register) asserts that this is a
// power of two and derives its unrolled tail from it.
const int MacroAssembler::zero_words_block_size = 8;
6051
6052 // zero_words() is used by C2 ClearArray patterns. It is as small as
6053 // possible, handling small word counts locally and delegating
6054 // anything larger to the zero_blocks stub. It is expanded many times
6055 // in compiled code, so it is important to keep it short.
6056
6057 // ptr: Address of a buffer to be zeroed.
6058 // cnt: Count in HeapWords.
6059 //
6060 // ptr, cnt, t1, and t0 are clobbered.
6061 address MacroAssembler::zero_words(Register ptr, Register cnt) {
6062 assert(is_power_of_2(zero_words_block_size), "adjust this");
6063 assert(ptr == x28 && cnt == x29, "mismatch in register usage");
6064 assert_different_registers(cnt, t0, t1);
6065
6066 BLOCK_COMMENT("zero_words {");
6067
6068 mv(t0, zero_words_block_size);
6069 Label around, done, done16;
6070 bltu(cnt, t0, around);
6071 {
6072 RuntimeAddress zero_blocks(StubRoutines::riscv::zero_blocks());
6073 assert(zero_blocks.target() != nullptr, "zero_blocks stub has not been generated");
6074 if (StubRoutines::riscv::complete()) {
6075 address tpc = reloc_call(zero_blocks);
6076 if (tpc == nullptr) {
6077 DEBUG_ONLY(reset_labels(around));
6078 postcond(pc() == badAddress);
6079 return nullptr;
6080 }
6081 } else {
6082 // Clobbers t1
6083 rt_call(zero_blocks.target());
6084 }
6085 }
6086 bind(around);
6087 for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
6088 Label l;
6089 test_bit(t0, cnt, exact_log2(i));
6090 beqz(t0, l);
6091 for (int j = 0; j < i; j++) {
6092 sd(zr, Address(ptr, j * wordSize));
6093 }
6094 addi(ptr, ptr, i * wordSize);
6095 bind(l);
6096 }
6097 {
6098 Label l;
6099 test_bit(t0, cnt, 0);
6100 beqz(t0, l);
6101 sd(zr, Address(ptr, 0));
6102 bind(l);
6103 }
6104
6105 BLOCK_COMMENT("} zero_words");
6106 postcond(pc() != badAddress);
6107 return pc();
6108 }
6109
// Size in bytes (18 longs) up to which zero_words(Register, uint64_t) emits
// one store per word instead of a loop.
#define SmallArraySize (18 * BytesPerLong)
6111
6112 // base: Address of a buffer to be zeroed, 8 bytes aligned.
6113 // cnt: Immediate count in HeapWords.
6114 void MacroAssembler::zero_words(Register base, uint64_t cnt) {
6115 assert_different_registers(base, t0, t1);
6116
6117 BLOCK_COMMENT("zero_words {");
6118
6119 if (cnt <= SmallArraySize / BytesPerLong) {
6120 for (int i = 0; i < (int)cnt; i++) {
6121 sd(zr, Address(base, i * wordSize));
6122 }
6123 } else {
6124 const int unroll = 8; // Number of sd(zr, adr), instructions we'll unroll
6125 int remainder = cnt % unroll;
6126 for (int i = 0; i < remainder; i++) {
6127 sd(zr, Address(base, i * wordSize));
6128 }
6129
6130 Label loop;
6131 Register cnt_reg = t0;
6132 Register loop_base = t1;
6133 cnt = cnt - remainder;
6134 mv(cnt_reg, cnt);
6135 addi(loop_base, base, remainder * wordSize);
6136 bind(loop);
6137 sub(cnt_reg, cnt_reg, unroll);
6138 for (int i = 0; i < unroll; i++) {
6139 sd(zr, Address(loop_base, i * wordSize));
6140 }
6141 addi(loop_base, loop_base, unroll * wordSize);
6142 bnez(cnt_reg, loop);
6143 }
6144
6145 BLOCK_COMMENT("} zero_words");
6146 }
6147
6148 // base: Address of a buffer to be filled, 8 bytes aligned.
6149 // cnt: Count in 8-byte unit.
6150 // value: Value to be filled with.
6151 // base will point to the end of the buffer after filling.
6152 void MacroAssembler::fill_words(Register base, Register cnt, Register value) {
6153 // Algorithm:
6154 //
6155 // t0 = cnt & 7
6156 // cnt -= t0
6157 // p += t0
6158 // switch (t0):
6159 // switch start:
6160 // do while cnt
6161 // cnt -= 8
6162 // p[-8] = value
6163 // case 7:
6164 // p[-7] = value
6165 // case 6:
6166 // p[-6] = value
6167 // // ...
6168 // case 1:
6169 // p[-1] = value
6170 // case 0:
6171 // p += 8
6172 // do-while end
6173 // switch end
6174
6175 assert_different_registers(base, cnt, value, t0, t1);
6176
6177 Label fini, skip, entry, loop;
6178 const int unroll = 8; // Number of sd instructions we'll unroll
6179
6180 beqz(cnt, fini);
6181
6182 andi(t0, cnt, unroll - 1);
6183 sub(cnt, cnt, t0);
6184 shadd(base, t0, base, t1, 3);
6185 la(t1, entry);
6186 slli(t0, t0, 2);
6187 sub(t1, t1, t0);
6188 jr(t1);
6189
6190 bind(loop);
6191 addi(base, base, unroll * wordSize);
6192 {
6193 IncompressibleScope scope(this); // Fixed length
6194 for (int i = -unroll; i < 0; i++) {
6195 sd(value, Address(base, i * 8));
6196 }
6197 }
6198 bind(entry);
6199 subi(cnt, cnt, unroll);
6200 bgez(cnt, loop);
6201
6202 bind(fini);
6203 }
6204
6205 // Zero blocks of memory by using CBO.ZERO.
6206 //
6207 // Aligns the base address first sufficiently for CBO.ZERO, then uses
6208 // CBO.ZERO repeatedly for every full block. cnt is the size to be
6209 // zeroed in HeapWords. Returns the count of words left to be zeroed
6210 // in cnt.
6211 //
6212 // NOTE: This is intended to be used in the zero_blocks() stub. If
6213 // you want to use it elsewhere, note that cnt must be >= zicboz_block_size.
// tmp1 and tmp2 are scratch registers; base is advanced past everything that
// has been zeroed.
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt, Register tmp1, Register tmp2) {
  int zicboz_block_size = VM_Version::zicboz_block_size.value();
  Label initial_table_end, loop;

  // Align base with cache line size.
  neg(tmp1, base);
  andi(tmp1, tmp1, zicboz_block_size - 1);

  // tmp1: the number of bytes to be filled to align the base with cache line size.
  add(base, base, tmp1);                 // base is now block aligned
  srai(tmp2, tmp1, 3);                   // tmp2 = alignment bytes / 8
  sub(cnt, cnt, tmp2);                   // those words are zeroed by the table below
  // Computed jump into the store table: tmp1 / 2 bytes before its end.
  // NOTE(review): this assumes 4 bytes of code per sd per 8 bytes zeroed,
  // i.e. that the table entries are never emitted compressed - confirm.
  srli(tmp2, tmp1, 1);
  la(tmp1, initial_table_end);
  sub(tmp2, tmp1, tmp2);
  jr(tmp2);
  // Stores at negative offsets from the aligned base cover the gap between
  // the original base and the aligned one.
  for (int i = -zicboz_block_size + wordSize; i < 0; i += wordSize) {
    sd(zr, Address(base, i));
  }
  bind(initial_table_end);

  // Zero one whole block per iteration while at least one block of words is left.
  mv(tmp1, zicboz_block_size / wordSize);
  bind(loop);
  cbo_zero(base);
  sub(cnt, cnt, tmp1);
  addi(base, base, zicboz_block_size);
  bge(cnt, tmp1, loop);
}
6242
6243 // java.lang.Math.round(float a)
6244 // Returns the closest int to the argument, with ties rounding to positive infinity.
// dst receives the result; src is not written. ftmp and t0 are clobbered.
void MacroAssembler::java_round_float(Register dst, FloatRegister src, FloatRegister ftmp) {
  // this instructions calling sequence provides performance improvement on all tested devices;
  // don't change it without re-verification
  Label done;
  // ftmp = 0.5f, materialized through its integer bit pattern
  mv(t0, jint_cast(0.5f));
  fmv_w_x(ftmp, t0);

  // dst = 0 if NaN
  feq_s(t0, src, src); // replacing fclass with feq as performance optimization
  mv(dst, zr);
  beqz(t0, done);

  // dst = (src + 0.5f) rounded down towards negative infinity
  // Adding 0.5f to some floats exceeds the precision limits for a float and rounding takes place.
  // RDN is required for fadd_s, RNE gives incorrect results:
  // --------------------------------------------------------------------
  // fadd.s rne (src + 0.5f): src = 8388609.000000 ftmp = 8388610.000000
  // fcvt.w.s rdn: ftmp = 8388610.000000 dst = 8388610
  // --------------------------------------------------------------------
  // fadd.s rdn (src + 0.5f): src = 8388609.000000 ftmp = 8388609.000000
  // fcvt.w.s rdn: ftmp = 8388609.000000 dst = 8388609
  // --------------------------------------------------------------------
  fadd_s(ftmp, src, ftmp, RoundingMode::rdn);
  fcvt_w_s(dst, ftmp, RoundingMode::rdn);

  bind(done);
}
6272
6273 // java.lang.Math.round(double a)
6274 // Returns the closest long to the argument, with ties rounding to positive infinity.
// dst receives the result; src is not written. ftmp and t0 are clobbered.
void MacroAssembler::java_round_double(Register dst, FloatRegister src, FloatRegister ftmp) {
  // this instructions calling sequence provides performance improvement on all tested devices;
  // don't change it without re-verification
  Label done;
  // ftmp = 0.5, materialized through its integer bit pattern
  mv(t0, julong_cast(0.5));
  fmv_d_x(ftmp, t0);

  // dst = 0 if NaN
  feq_d(t0, src, src); // replacing fclass with feq as performance optimization
  mv(dst, zr);
  beqz(t0, done);

  // dst = (src + 0.5) rounded down towards negative infinity
  fadd_d(ftmp, src, ftmp, RoundingMode::rdn); // RDN is required here otherwise some inputs produce incorrect results
  fcvt_l_d(dst, ftmp, RoundingMode::rdn);

  bind(done);
}
6293
6294 // Helper routine processing the slow path of NaN when converting float to float16
// dst first receives the raw bits of src and finally the assembled result;
// tmp1 and tmp2 are scratch registers. All shifts operate on the full
// register, which is why the left shifts below add 32 to skip its upper half.
void MacroAssembler::float_to_float16_NaN(Register dst, FloatRegister src,
                                          Register tmp1, Register tmp2) {
  fmv_x_w(dst, src);

  // Float (32 bits)
  // Bit: 31 30 to 23 22 to 0
  // +---+------------------+-----------------------------+
  // | S | Exponent | Mantissa (Fraction) |
  // +---+------------------+-----------------------------+
  // 1 bit 8 bits 23 bits
  //
  // Float (16 bits)
  // Bit: 15 14 to 10 9 to 0
  // +---+----------------+------------------+
  // | S | Exponent | Mantissa |
  // +---+----------------+------------------+
  // 1 bit 5 bits 10 bits
  const int fp_sign_bits = 1;
  const int fp32_bits = 32;
  const int fp32_exponent_bits = 8;
  const int fp32_mantissa_1st_part_bits = 10;
  const int fp32_mantissa_2nd_part_bits = 9;
  const int fp32_mantissa_3rd_part_bits = 4;
  const int fp16_exponent_bits = 5;
  const int fp16_mantissa_bits = 10;

  // preserve the sign bit and exponent, clear mantissa.
  // (arithmetic shift right by 26 keeps the sign and the top 5 exponent bits,
  // the shift left by 10 then makes room for the binary16 mantissa)
  srai(tmp2, dst, fp32_bits - fp_sign_bits - fp16_exponent_bits);
  slli(tmp2, tmp2, fp16_mantissa_bits);

  // Preserve high order bit of float NaN in the
  // binary16 result NaN (tenth bit); OR in remaining
  // bits into lower 9 bits of binary 16 significand.
  // | (doppel & 0x007f_e000) >> 13 // 10 bits
  // | (doppel & 0x0000_1ff0) >> 4 // 9 bits
  // | (doppel & 0x0000_000f)); // 4 bits
  //
  // Check j.l.Float.floatToFloat16 for more information.
  // 10 bits
  // Shift left to discard everything above the mantissa, then right so that
  // only the top 10 mantissa bits remain in the low bits.
  int left_shift = fp_sign_bits + fp32_exponent_bits + 32;
  int right_shift = left_shift + fp32_mantissa_2nd_part_bits + fp32_mantissa_3rd_part_bits;
  slli(tmp1, dst, left_shift);
  srli(tmp1, tmp1, right_shift);
  orr(tmp2, tmp2, tmp1);
  // 9 bits
  // Same extraction for the next 9 mantissa bits.
  left_shift += fp32_mantissa_1st_part_bits;
  right_shift = left_shift + fp32_mantissa_3rd_part_bits;
  slli(tmp1, dst, left_shift);
  srli(tmp1, tmp1, right_shift);
  orr(tmp2, tmp2, tmp1);
  // 4 bits
  andi(tmp1, dst, 0xf);
  orr(dst, tmp2, tmp1);
}
6349
// Generates MacroAssembler::<FLOATCVT>_safe(dst, src, tmp): dst is set to
// zero when fclass reports src as a NaN (FClassBits::nan), otherwise to the
// result of the plain FLOATCVT conversion. tmp is clobbered and must differ
// from dst. FLOATSIG selects the single ('s') or double ('d') fclass variant.
#define FCVT_SAFE(FLOATCVT, FLOATSIG)                                                 \
void MacroAssembler::FLOATCVT##_safe(Register dst, FloatRegister src, Register tmp) { \
  Label done;                                                                         \
  assert_different_registers(dst, tmp);                                               \
  fclass_##FLOATSIG(tmp, src);                                                        \
  mv(dst, zr);                                                                        \
  /* check if src is NaN */                                                           \
  andi(tmp, tmp, FClassBits::nan);                                                    \
  bnez(tmp, done);                                                                    \
  FLOATCVT(dst, src);                                                                 \
  bind(done);                                                                         \
}

FCVT_SAFE(fcvt_w_s, s);
FCVT_SAFE(fcvt_l_s, s);
FCVT_SAFE(fcvt_w_d, d);
FCVT_SAFE(fcvt_l_d, d);

#undef FCVT_SAFE
6369
// Generates MacroAssembler::<FLOATTYPE>_compare(result, Rs1, Rs2, unordered_result):
// a three-way compare that leaves -1, 0 or 1 in result. When either input is
// a NaN the result is -1 if unordered_result < 0 and 1 otherwise.
// FLOATSIG selects the single ('s') or double ('d') flt/feq variants.
#define FCMP(FLOATTYPE, FLOATSIG)                                                    \
void MacroAssembler::FLOATTYPE##_compare(Register result, FloatRegister Rs1,         \
                                         FloatRegister Rs2, int unordered_result) {  \
  Label Ldone;                                                                       \
  if (unordered_result < 0) {                                                        \
    /* we want -1 for unordered or less than, 0 for equal and 1 for greater than. */ \
    /* installs 1 if gt else 0 */                                                    \
    flt_##FLOATSIG(result, Rs2, Rs1);                                                \
    /* Rs1 > Rs2, install 1 */                                                       \
    bgtz(result, Ldone);                                                             \
    feq_##FLOATSIG(result, Rs1, Rs2);                                                \
    subi(result, result, 1);                                                         \
    /* Rs1 = Rs2, install 0 */                                                       \
    /* NaN or Rs1 < Rs2, install -1 */                                               \
    bind(Ldone);                                                                     \
  } else {                                                                           \
    /* we want -1 for less than, 0 for equal and 1 for unordered or greater than. */ \
    /* installs 1 if lt else 0; the sign is flipped by the neg after Ldone */        \
    flt_##FLOATSIG(result, Rs1, Rs2);                                                \
    /* Rs1 < Rs2, install -1 (1 here, negated below) */                              \
    bgtz(result, Ldone);                                                             \
    feq_##FLOATSIG(result, Rs1, Rs2);                                                \
    subi(result, result, 1);                                                         \
    /* Rs1 = Rs2, install 0 */                                                       \
    /* NaN or Rs1 > Rs2, install 1 (-1 here, negated below) */                       \
    bind(Ldone);                                                                     \
    neg(result, result);                                                             \
  }                                                                                  \
}

FCMP(float, s);
FCMP(double, d);

#undef FCMP
6404
6405 // Zero words; len is in bytes
6406 // Destroys all registers except addr
6407 // len must be a nonzero multiple of wordSize
// addr is only read; len, tmp, t0 and t1 are written.
void MacroAssembler::zero_memory(Register addr, Register len, Register tmp) {
  assert_different_registers(addr, len, tmp, t0, t1);

#ifdef ASSERT
  {
    // Debug builds emit a run-time check that stops when len is not a
    // multiple of BytesPerWord.
    Label L;
    andi(t0, len, BytesPerWord - 1);
    beqz(t0, L);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif // ASSERT

#ifndef PRODUCT
  block_comment("zero memory");
#endif // PRODUCT

  Label loop;
  Label entry;

  // Algorithm:
  //
  // t0 = cnt & 7
  // cnt -= t0
  // p += t0
  // switch (t0) {
  // do {
  // cnt -= 8
  // p[-8] = 0
  // case 7:
  // p[-7] = 0
  // case 6:
  // p[-6] = 0
  // ...
  // case 1:
  // p[-1] = 0
  // case 0:
  // p += 8
  // } while (cnt)
  // }

  const int unroll = 8; // Number of sd(zr) instructions we'll unroll

  srli(len, len, LogBytesPerWord); // len is now a word count (cnt)
  andi(t0, len, unroll - 1); // t0 = cnt % unroll
  sub(len, len, t0); // cnt -= t0
  // tmp always points to the end of the region we're about to zero
  shadd(tmp, t0, addr, t1, LogBytesPerWord);
  // Jump t0 stores before 'entry'; the stores below are fixed-length, and
  // the shift by 2 scales t0 by 4 bytes per store.
  la(t1, entry);
  slli(t0, t0, 2);
  sub(t1, t1, t0);
  jr(t1);

  bind(loop);
  sub(len, len, unroll);
  {
    IncompressibleScope scope(this); // Fixed length
    for (int i = -unroll; i < 0; i++) {
      sd(zr, Address(tmp, i * wordSize));
    }
  }
  bind(entry);
  add(tmp, tmp, unroll * wordSize);
  bnez(len, loop);
}
6473
6474 // shift left by shamt and add
6475 // Rd = (Rs1 << shamt) + Rs2
6476 void MacroAssembler::shadd(Register Rd, Register Rs1, Register Rs2, Register tmp, int shamt) {
6477 if (UseZba) {
6478 if (shamt == 1) {
6479 sh1add(Rd, Rs1, Rs2);
6480 return;
6481 } else if (shamt == 2) {
6482 sh2add(Rd, Rs1, Rs2);
6483 return;
6484 } else if (shamt == 3) {
6485 sh3add(Rd, Rs1, Rs2);
6486 return;
6487 }
6488 }
6489
6490 if (shamt != 0) {
6491 assert_different_registers(Rs2, tmp);
6492 slli(tmp, Rs1, shamt);
6493 add(Rd, Rs2, tmp);
6494 } else {
6495 add(Rd, Rs1, Rs2);
6496 }
6497 }
6498
6499 void MacroAssembler::zext(Register dst, Register src, int bits) {
6500 switch (bits) {
6501 case 32:
6502 if (UseZba) {
6503 zext_w(dst, src);
6504 return;
6505 }
6506 break;
6507 case 16:
6508 if (UseZbb) {
6509 zext_h(dst, src);
6510 return;
6511 }
6512 break;
6513 case 8:
6514 zext_b(dst, src);
6515 return;
6516 default:
6517 break;
6518 }
6519
6520 slli(dst, src, XLEN - bits);
6521 srli(dst, dst, XLEN - bits);
6522 }
6523
6524 void MacroAssembler::sext(Register dst, Register src, int bits) {
6525 switch (bits) {
6526 case 32:
6527 sext_w(dst, src);
6528 return;
6529 case 16:
6530 if (UseZbb) {
6531 sext_h(dst, src);
6532 return;
6533 }
6534 break;
6535 case 8:
6536 if (UseZbb) {
6537 sext_b(dst, src);
6538 return;
6539 }
6540 break;
6541 default:
6542 break;
6543 }
6544
6545 slli(dst, src, XLEN - bits);
6546 srai(dst, dst, XLEN - bits);
6547 }
6548
6549 void MacroAssembler::cmp_x2i(Register dst, Register src1, Register src2,
6550 Register tmp, bool is_signed) {
6551 if (src1 == src2) {
6552 mv(dst, zr);
6553 return;
6554 }
6555 Label done;
6556 Register left = src1;
6557 Register right = src2;
6558 if (dst == src1) {
6559 assert_different_registers(dst, src2, tmp);
6560 mv(tmp, src1);
6561 left = tmp;
6562 } else if (dst == src2) {
6563 assert_different_registers(dst, src1, tmp);
6564 mv(tmp, src2);
6565 right = tmp;
6566 }
6567
6568 // installs 1 if gt else 0
6569 if (is_signed) {
6570 slt(dst, right, left);
6571 } else {
6572 sltu(dst, right, left);
6573 }
6574 bnez(dst, done);
6575 if (is_signed) {
6576 slt(dst, left, right);
6577 } else {
6578 sltu(dst, left, right);
6579 }
6580 // dst = -1 if lt; else if eq , dst = 0
6581 neg(dst, dst);
6582 bind(done);
6583 }
6584
6585 void MacroAssembler::cmp_l2i(Register dst, Register src1, Register src2, Register tmp)
6586 {
6587 cmp_x2i(dst, src1, src2, tmp);
6588 }
6589
6590 void MacroAssembler::cmp_ul2i(Register dst, Register src1, Register src2, Register tmp) {
6591 cmp_x2i(dst, src1, src2, tmp, false);
6592 }
6593
6594 void MacroAssembler::cmp_uw2i(Register dst, Register src1, Register src2, Register tmp) {
6595 cmp_x2i(dst, src1, src2, tmp, false);
6596 }
6597
6598 // The java_calling_convention describes stack locations as ideal slots on
6599 // a frame with no abi restrictions. Since we must observe abi restrictions
6600 // (like the placement of the register window) the slots must be biased by
6601 // the following value.
6602 static int reg2offset_in(VMReg r) {
6603 // Account for saved fp and ra
6604 // This should really be in_preserve_stack_slots
6605 return r->reg2stack() * VMRegImpl::stack_slot_size;
6606 }
6607
6608 static int reg2offset_out(VMReg r) {
6609 return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
6610 }
6611
6612 // The C ABI specifies:
6613 // "integer scalars narrower than XLEN bits are widened according to the sign
6614 // of their type up to 32 bits, then sign-extended to XLEN bits."
6615 // Applies for both passed in register and stack.
6616 //
6617 // Java uses 32-bit stack slots; jint, jshort, jchar, jbyte uses one slot.
6618 // Native uses 64-bit stack slots for all integer scalar types.
6619 //
6620 // lw loads the Java stack slot, sign-extends and
6621 // sd store this widened integer into a 64 bit native stack slot.
6622 void MacroAssembler::move32_64(VMRegPair src, VMRegPair dst, Register tmp) {
6623 if (src.first()->is_stack()) {
6624 if (dst.first()->is_stack()) {
6625 // stack to stack
6626 lw(tmp, Address(fp, reg2offset_in(src.first())));
6627 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6628 } else {
6629 // stack to reg
6630 lw(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6631 }
6632 } else if (dst.first()->is_stack()) {
6633 // reg to stack
6634 sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6635 } else {
6636 if (dst.first() != src.first()) {
6637 sext(dst.first()->as_Register(), src.first()->as_Register(), 32);
6638 }
6639 }
6640 }
6641
6642 // An oop arg. Must pass a handle not the oop itself
6643 void MacroAssembler::object_move(OopMap* map,
6644 int oop_handle_offset,
6645 int framesize_in_slots,
6646 VMRegPair src,
6647 VMRegPair dst,
6648 bool is_receiver,
6649 int* receiver_offset) {
6650 assert_cond(map != nullptr && receiver_offset != nullptr);
6651
6652 // must pass a handle. First figure out the location we use as a handle
6653 Register rHandle = dst.first()->is_stack() ? t1 : dst.first()->as_Register();
6654
6655 // See if oop is null if it is we need no handle
6656
6657 if (src.first()->is_stack()) {
6658 // Oop is already on the stack as an argument
6659 int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
6660 map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
6661 if (is_receiver) {
6662 *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
6663 }
6664
6665 ld(t0, Address(fp, reg2offset_in(src.first())));
6666 la(rHandle, Address(fp, reg2offset_in(src.first())));
6667 // conditionally move a null
6668 Label notZero1;
6669 bnez(t0, notZero1);
6670 mv(rHandle, zr);
6671 bind(notZero1);
6672 } else {
6673
6674 // Oop is in a register we must store it to the space we reserve
6675 // on the stack for oop_handles and pass a handle if oop is non-null
6676
6677 const Register rOop = src.first()->as_Register();
6678 int oop_slot = -1;
6679 if (rOop == j_rarg0) {
6680 oop_slot = 0;
6681 } else if (rOop == j_rarg1) {
6682 oop_slot = 1;
6683 } else if (rOop == j_rarg2) {
6684 oop_slot = 2;
6685 } else if (rOop == j_rarg3) {
6686 oop_slot = 3;
6687 } else if (rOop == j_rarg4) {
6688 oop_slot = 4;
6689 } else if (rOop == j_rarg5) {
6690 oop_slot = 5;
6691 } else if (rOop == j_rarg6) {
6692 oop_slot = 6;
6693 } else {
6694 assert(rOop == j_rarg7, "wrong register");
6695 oop_slot = 7;
6696 }
6697
6698 oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
6699 int offset = oop_slot * VMRegImpl::stack_slot_size;
6700
6701 map->set_oop(VMRegImpl::stack2reg(oop_slot));
6702 // Store oop in handle area, may be null
6703 sd(rOop, Address(sp, offset));
6704 if (is_receiver) {
6705 *receiver_offset = offset;
6706 }
6707
6708 //rOop maybe the same as rHandle
6709 if (rOop == rHandle) {
6710 Label isZero;
6711 beqz(rOop, isZero);
6712 la(rHandle, Address(sp, offset));
6713 bind(isZero);
6714 } else {
6715 Label notZero2;
6716 la(rHandle, Address(sp, offset));
6717 bnez(rOop, notZero2);
6718 mv(rHandle, zr);
6719 bind(notZero2);
6720 }
6721 }
6722
6723 // If arg is on the stack then place it otherwise it is already in correct reg.
6724 if (dst.first()->is_stack()) {
6725 sd(rHandle, Address(sp, reg2offset_out(dst.first())));
6726 }
6727 }
6728
6729 // A float arg may have to do float reg int reg conversion
6730 void MacroAssembler::float_move(VMRegPair src, VMRegPair dst, Register tmp) {
6731 assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6732 (src.first()->is_reg() && dst.first()->is_reg()) ||
6733 (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6734 if (src.first()->is_stack()) {
6735 if (dst.first()->is_stack()) {
6736 lwu(tmp, Address(fp, reg2offset_in(src.first())));
6737 sw(tmp, Address(sp, reg2offset_out(dst.first())));
6738 } else if (dst.first()->is_Register()) {
6739 lwu(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6740 } else {
6741 ShouldNotReachHere();
6742 }
6743 } else if (src.first() != dst.first()) {
6744 if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6745 fmv_s(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6746 } else {
6747 ShouldNotReachHere();
6748 }
6749 }
6750 }
6751
6752 // A long move
6753 void MacroAssembler::long_move(VMRegPair src, VMRegPair dst, Register tmp) {
6754 if (src.first()->is_stack()) {
6755 if (dst.first()->is_stack()) {
6756 // stack to stack
6757 ld(tmp, Address(fp, reg2offset_in(src.first())));
6758 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6759 } else {
6760 // stack to reg
6761 ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6762 }
6763 } else if (dst.first()->is_stack()) {
6764 // reg to stack
6765 sd(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
6766 } else {
6767 if (dst.first() != src.first()) {
6768 mv(dst.first()->as_Register(), src.first()->as_Register());
6769 }
6770 }
6771 }
6772
6773 // A double move
6774 void MacroAssembler::double_move(VMRegPair src, VMRegPair dst, Register tmp) {
6775 assert((src.first()->is_stack() && dst.first()->is_stack()) ||
6776 (src.first()->is_reg() && dst.first()->is_reg()) ||
6777 (src.first()->is_stack() && dst.first()->is_reg()), "Unexpected error");
6778 if (src.first()->is_stack()) {
6779 if (dst.first()->is_stack()) {
6780 ld(tmp, Address(fp, reg2offset_in(src.first())));
6781 sd(tmp, Address(sp, reg2offset_out(dst.first())));
6782 } else if (dst.first()-> is_Register()) {
6783 ld(dst.first()->as_Register(), Address(fp, reg2offset_in(src.first())));
6784 } else {
6785 ShouldNotReachHere();
6786 }
6787 } else if (src.first() != dst.first()) {
6788 if (src.is_single_phys_reg() && dst.is_single_phys_reg()) {
6789 fmv_d(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
6790 } else {
6791 ShouldNotReachHere();
6792 }
6793 }
6794 }
6795
6796 void MacroAssembler::test_bit(Register Rd, Register Rs, uint32_t bit_pos) {
6797 assert(bit_pos < 64, "invalid bit range");
6798 if (UseZbs) {
6799 bexti(Rd, Rs, bit_pos);
6800 return;
6801 }
6802 int64_t imm = (int64_t)(1UL << bit_pos);
6803 if (is_simm12(imm)) {
6804 andi(Rd, Rs, imm);
6805 } else {
6806 srli(Rd, Rs, bit_pos);
6807 andi(Rd, Rd, 1);
6808 }
6809 }
6810
6811 // Implements fast-locking.
6812 //
6813 // - obj: the object to be locked
6814 // - tmp1, tmp2, tmp3: temporary registers, will be destroyed
6815 // - slow: branched to if locking fails
6816 void MacroAssembler::fast_lock(Register basic_lock, Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6817 assert_different_registers(basic_lock, obj, tmp1, tmp2, tmp3, t0);
6818
6819 Label push;
6820 const Register top = tmp1;
6821 const Register mark = tmp2;
6822 const Register t = tmp3;
6823
6824 // Preload the markWord. It is important that this is the first
6825 // instruction emitted as it is part of C1's null check semantics.
6826 ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6827
6828 if (UseObjectMonitorTable) {
6829 // Clear cache in case fast locking succeeds or we need to take the slow-path.
6830 sd(zr, Address(basic_lock, BasicObjectLock::lock_offset() + in_ByteSize((BasicLock::object_monitor_cache_offset_in_bytes()))));
6831 }
6832
6833 if (DiagnoseSyncOnValueBasedClasses != 0) {
6834 load_klass(tmp1, obj);
6835 lbu(tmp1, Address(tmp1, Klass::misc_flags_offset()));
6836 test_bit(tmp1, tmp1, exact_log2(KlassFlags::_misc_is_value_based_class));
6837 bnez(tmp1, slow, /* is_far */ true);
6838 }
6839
6840 // Check if the lock-stack is full.
6841 lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6842 mv(t, (unsigned)LockStack::end_offset());
6843 bge(top, t, slow, /* is_far */ true);
6844
6845 // Check for recursion.
6846 add(t, xthread, top);
6847 ld(t, Address(t, -oopSize));
6848 beq(obj, t, push);
6849
6850 // Check header for monitor (0b10).
6851 test_bit(t, mark, exact_log2(markWord::monitor_value));
6852 bnez(t, slow, /* is_far */ true);
6853
6854 // Try to lock. Transition lock-bits 0b01 => 0b00
6855 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid a la");
6856 ori(mark, mark, markWord::unlocked_value);
6857 xori(t, mark, markWord::unlocked_value);
6858 cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6859 /*acquire*/ Assembler::aq, /*release*/ Assembler::relaxed, /*result*/ t);
6860 bne(mark, t, slow, /* is_far */ true);
6861
6862 bind(push);
6863 // After successful lock, push object on lock-stack.
6864 add(t, xthread, top);
6865 sd(obj, Address(t));
6866 addiw(top, top, oopSize);
6867 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6868 }
6869
6870 // Implements ligthweight-unlocking.
6871 //
6872 // - obj: the object to be unlocked
6873 // - tmp1, tmp2, tmp3: temporary registers
6874 // - slow: branched to if unlocking fails
6875 void MacroAssembler::fast_unlock(Register obj, Register tmp1, Register tmp2, Register tmp3, Label& slow) {
6876 assert_different_registers(obj, tmp1, tmp2, tmp3, t0);
6877
6878 #ifdef ASSERT
6879 {
6880 // Check for lock-stack underflow.
6881 Label stack_ok;
6882 lwu(tmp1, Address(xthread, JavaThread::lock_stack_top_offset()));
6883 mv(tmp2, (unsigned)LockStack::start_offset());
6884 bge(tmp1, tmp2, stack_ok);
6885 STOP("Lock-stack underflow");
6886 bind(stack_ok);
6887 }
6888 #endif
6889
6890 Label unlocked, push_and_slow;
6891 const Register top = tmp1;
6892 const Register mark = tmp2;
6893 const Register t = tmp3;
6894
6895 // Check if obj is top of lock-stack.
6896 lwu(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6897 subiw(top, top, oopSize);
6898 add(t, xthread, top);
6899 ld(t, Address(t));
6900 bne(obj, t, slow, /* is_far */ true);
6901
6902 // Pop lock-stack.
6903 DEBUG_ONLY(add(t, xthread, top);)
6904 DEBUG_ONLY(sd(zr, Address(t));)
6905 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6906
6907 // Check if recursive.
6908 add(t, xthread, top);
6909 ld(t, Address(t, -oopSize));
6910 beq(obj, t, unlocked);
6911
6912 // Not recursive. Check header for monitor (0b10).
6913 ld(mark, Address(obj, oopDesc::mark_offset_in_bytes()));
6914 test_bit(t, mark, exact_log2(markWord::monitor_value));
6915 bnez(t, push_and_slow);
6916
6917 #ifdef ASSERT
6918 // Check header not unlocked (0b01).
6919 Label not_unlocked;
6920 test_bit(t, mark, exact_log2(markWord::unlocked_value));
6921 beqz(t, not_unlocked);
6922 stop("fast_unlock already unlocked");
6923 bind(not_unlocked);
6924 #endif
6925
6926 // Try to unlock. Transition lock bits 0b00 => 0b01
6927 assert(oopDesc::mark_offset_in_bytes() == 0, "required to avoid lea");
6928 ori(t, mark, markWord::unlocked_value);
6929 cmpxchg(/*addr*/ obj, /*expected*/ mark, /*new*/ t, Assembler::int64,
6930 /*acquire*/ Assembler::relaxed, /*release*/ Assembler::rl, /*result*/ t);
6931 beq(mark, t, unlocked);
6932
6933 bind(push_and_slow);
6934 // Restore lock-stack and handle the unlock in runtime.
6935 DEBUG_ONLY(add(t, xthread, top);)
6936 DEBUG_ONLY(sd(obj, Address(t));)
6937 addiw(top, top, oopSize);
6938 sw(top, Address(xthread, JavaThread::lock_stack_top_offset()));
6939 j(slow);
6940
6941 bind(unlocked);
6942 }