1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * Copyright (c) 2020, 2025, Huawei Technologies Co., Ltd. All rights reserved.
5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 *
7 * This code is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 only, as
9 * published by the Free Software Foundation.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 *
25 */
26
27 #include "asm/macroAssembler.hpp"
28 #include "asm/macroAssembler.inline.hpp"
29 #include "compiler/oopMap.hpp"
30 #include "gc/shared/barrierSet.hpp"
31 #include "gc/shared/barrierSetAssembler.hpp"
32 #include "interpreter/interpreter.hpp"
33 #include "memory/universe.hpp"
34 #include "nativeInst_riscv.hpp"
35 #include "oops/instanceOop.hpp"
36 #include "oops/method.hpp"
37 #include "oops/objArrayKlass.hpp"
38 #include "oops/oop.inline.hpp"
39 #include "prims/methodHandles.hpp"
40 #include "prims/upcallLinker.hpp"
41 #include "runtime/continuation.hpp"
42 #include "runtime/continuationEntry.inline.hpp"
43 #include "runtime/frame.inline.hpp"
44 #include "runtime/handles.inline.hpp"
45 #include "runtime/javaThread.hpp"
46 #include "runtime/sharedRuntime.hpp"
47 #include "runtime/stubCodeGenerator.hpp"
48 #include "runtime/stubRoutines.hpp"
49 #include "utilities/align.hpp"
50 #include "utilities/powerOfTwo.hpp"
51 #ifdef COMPILER2
52 #include "opto/runtime.hpp"
53 #endif
54
55 // Declaration and definition of StubGenerator (no .hpp file).
56 // For a more detailed description of the stub routine structure
57 // see the comment in stubRoutines.hpp
58
59 #undef __
60 #define __ _masm->
61
62 #ifdef PRODUCT
63 #define BLOCK_COMMENT(str) /* nothing */
64 #else
65 #define BLOCK_COMMENT(str) __ block_comment(str)
66 #endif
67
68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
69
70 // Stub Code definitions
71
72 class StubGenerator: public StubCodeGenerator {
73 private:
74
75 #ifdef PRODUCT
76 #define inc_counter_np(counter) ((void)0)
77 #else
78 void inc_counter_np_(uint& counter) {
79 __ incrementw(ExternalAddress((address)&counter));
80 }
81 #define inc_counter_np(counter) \
82 BLOCK_COMMENT("inc_counter " #counter); \
83 inc_counter_np_(counter);
84 #endif
85
86 // Call stubs are used to call Java from C
87 //
88 // Arguments:
89 // c_rarg0: call wrapper address address
90 // c_rarg1: result address
91 // c_rarg2: result type BasicType
92 // c_rarg3: method Method*
93 // c_rarg4: (interpreter) entry point address
94 // c_rarg5: parameters intptr_t*
95 // c_rarg6: parameter size (in words) int
96 // c_rarg7: thread Thread*
97 //
98 // There is no return from the stub itself as any Java result
99 // is written to result
100 //
101 // we save x1 (ra) as the return PC at the base of the frame and
102 // link x8 (fp) below it as the frame pointer installing sp (x2)
103 // into fp.
104 //
105 // we save x10-x17, which accounts for all the c arguments.
106 //
107 // TODO: strictly do we need to save them all? they are treated as
108 // volatile by C so could we omit saving the ones we are going to
109 // place in global registers (thread? method?) or those we only use
110 // during setup of the Java call?
111 //
112 // we don't need to save x5 which C uses as an indirect result location
113 // return register.
114 //
115 // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
116 // volatile
117 //
118 // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
119 // registers and C expects to be callee-save
120 //
121 // so the stub frame looks like this when we enter Java code
122 //
123 // [ return_from_Java ] <--- sp
124 // [ argument word n ]
125 // ...
126 // -35 [ argument word 1 ]
127 // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
128 // -33 [ saved f27 ]
129 // -32 [ saved f26 ]
130 // -31 [ saved f25 ]
131 // -30 [ saved f24 ]
132 // -29 [ saved f23 ]
133 // -28 [ saved f22 ]
134 // -27 [ saved f21 ]
135 // -26 [ saved f20 ]
136 // -25 [ saved f19 ]
137 // -24 [ saved f18 ]
138 // -23 [ saved f9 ]
139 // -22 [ saved f8 ]
140 // -21 [ saved x27 ]
141 // -20 [ saved x26 ]
142 // -19 [ saved x25 ]
143 // -18 [ saved x24 ]
144 // -17 [ saved x23 ]
145 // -16 [ saved x22 ]
146 // -15 [ saved x21 ]
147 // -14 [ saved x20 ]
148 // -13 [ saved x19 ]
149 // -12 [ saved x18 ]
150 // -11 [ saved x9 ]
151 // -10 [ call wrapper (x10) ]
152 // -9 [ result (x11) ]
153 // -8 [ result type (x12) ]
154 // -7 [ method (x13) ]
155 // -6 [ entry point (x14) ]
156 // -5 [ parameters (x15) ]
157 // -4 [ parameter size (x16) ]
158 // -3 [ thread (x17) ]
159 // -2 [ saved fp (x8) ]
160 // -1 [ saved ra (x1) ]
161 // 0 [ ] <--- fp == saved sp (x2)
162
163 // Call stub stack layout word offsets from fp
enum call_stub_layout {
  // Frame linkage, directly below fp.
  retaddr_off        = -1,
  fp_f               = -2,

  // Incoming C arguments x17 down to x10.
  thread_off         = -3,
  parameter_size_off = -4,
  parameters_off     = -5,
  entry_point_off    = -6,
  method_off         = -7,
  result_type_off    = -8,
  result_off         = -9,
  call_wrapper_off   = -10,

  // Integer registers preserved across the Java call.
  x9_off             = -11,
  x18_off            = -12,
  x19_off            = -13,
  x20_off            = -14,
  x21_off            = -15,
  x22_off            = -16,
  x23_off            = -17,
  x24_off            = -18,
  x25_off            = -19,
  x26_off            = -20,
  x27_off            = -21,

  // Floating-point registers preserved across the Java call.
  f8_off             = -22,
  f9_off             = -23,
  f18_off            = -24,
  f19_off            = -25,
  f20_off            = -26,
  f21_off            = -27,
  f22_off            = -28,
  f23_off            = -29,
  f24_off            = -30,
  f25_off            = -31,
  f26_off            = -32,
  f27_off            = -33,

  // Lowest slot of the save area; it holds the saved rounding mode (frm).
  sp_after_call_off  = -34,
  frm_off            = sp_after_call_off
};
204
205 address generate_call_stub(address& return_address) {
206 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
207 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
208 "adjust this code");
209
210 StubId stub_id = StubId::stubgen_call_stub_id;
211 StubCodeMark mark(this, stub_id);
212 address start = __ pc();
213
214 const Address sp_after_call (fp, sp_after_call_off * wordSize);
215
216 const Address frm_save (fp, frm_off * wordSize);
217 const Address call_wrapper (fp, call_wrapper_off * wordSize);
218 const Address result (fp, result_off * wordSize);
219 const Address result_type (fp, result_type_off * wordSize);
220 const Address method (fp, method_off * wordSize);
221 const Address entry_point (fp, entry_point_off * wordSize);
222 const Address parameters (fp, parameters_off * wordSize);
223 const Address parameter_size(fp, parameter_size_off * wordSize);
224
225 const Address thread (fp, thread_off * wordSize);
226
227 const Address f27_save (fp, f27_off * wordSize);
228 const Address f26_save (fp, f26_off * wordSize);
229 const Address f25_save (fp, f25_off * wordSize);
230 const Address f24_save (fp, f24_off * wordSize);
231 const Address f23_save (fp, f23_off * wordSize);
232 const Address f22_save (fp, f22_off * wordSize);
233 const Address f21_save (fp, f21_off * wordSize);
234 const Address f20_save (fp, f20_off * wordSize);
235 const Address f19_save (fp, f19_off * wordSize);
236 const Address f18_save (fp, f18_off * wordSize);
237 const Address f9_save (fp, f9_off * wordSize);
238 const Address f8_save (fp, f8_off * wordSize);
239
240 const Address x27_save (fp, x27_off * wordSize);
241 const Address x26_save (fp, x26_off * wordSize);
242 const Address x25_save (fp, x25_off * wordSize);
243 const Address x24_save (fp, x24_off * wordSize);
244 const Address x23_save (fp, x23_off * wordSize);
245 const Address x22_save (fp, x22_off * wordSize);
246 const Address x21_save (fp, x21_off * wordSize);
247 const Address x20_save (fp, x20_off * wordSize);
248 const Address x19_save (fp, x19_off * wordSize);
249 const Address x18_save (fp, x18_off * wordSize);
250
251 const Address x9_save (fp, x9_off * wordSize);
252
253 // stub code
254
255 address riscv_entry = __ pc();
256
257 // set up frame and move sp to end of save area
258 __ enter();
259 __ addi(sp, fp, sp_after_call_off * wordSize);
260
261 // save register parameters and Java temporary/global registers
262 // n.b. we save thread even though it gets installed in
263 // xthread because we want to sanity check tp later
264 __ sd(c_rarg7, thread);
265 __ sw(c_rarg6, parameter_size);
266 __ sd(c_rarg5, parameters);
267 __ sd(c_rarg4, entry_point);
268 __ sd(c_rarg3, method);
269 __ sd(c_rarg2, result_type);
270 __ sd(c_rarg1, result);
271 __ sd(c_rarg0, call_wrapper);
272
273 __ sd(x9, x9_save);
274
275 __ sd(x18, x18_save);
276 __ sd(x19, x19_save);
277 __ sd(x20, x20_save);
278 __ sd(x21, x21_save);
279 __ sd(x22, x22_save);
280 __ sd(x23, x23_save);
281 __ sd(x24, x24_save);
282 __ sd(x25, x25_save);
283 __ sd(x26, x26_save);
284 __ sd(x27, x27_save);
285
286 __ fsd(f8, f8_save);
287 __ fsd(f9, f9_save);
288 __ fsd(f18, f18_save);
289 __ fsd(f19, f19_save);
290 __ fsd(f20, f20_save);
291 __ fsd(f21, f21_save);
292 __ fsd(f22, f22_save);
293 __ fsd(f23, f23_save);
294 __ fsd(f24, f24_save);
295 __ fsd(f25, f25_save);
296 __ fsd(f26, f26_save);
297 __ fsd(f27, f27_save);
298
299 __ frrm(t0);
300 __ sd(t0, frm_save);
301 // Set frm to the state we need. We do want Round to Nearest. We
302 // don't want non-IEEE rounding modes.
303 Label skip_fsrmi;
304 guarantee(__ RoundingMode::rne == 0, "must be");
305 __ beqz(t0, skip_fsrmi);
306 __ fsrmi(__ RoundingMode::rne);
307 __ bind(skip_fsrmi);
308
309 // install Java thread in global register now we have saved
310 // whatever value it held
311 __ mv(xthread, c_rarg7);
312
313 // And method
314 __ mv(xmethod, c_rarg3);
315
316 // set up the heapbase register
317 __ reinit_heapbase();
318
319 #ifdef ASSERT
320 // make sure we have no pending exceptions
321 {
322 Label L;
323 __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
324 __ beqz(t0, L);
325 __ stop("StubRoutines::call_stub: entered with pending exception");
326 __ BIND(L);
327 }
328 #endif
329 // pass parameters if any
330 __ mv(esp, sp);
331 __ slli(t0, c_rarg6, LogBytesPerWord);
332 __ sub(t0, sp, t0); // Move SP out of the way
333 __ andi(sp, t0, -2 * wordSize);
334
335 BLOCK_COMMENT("pass parameters if any");
336 Label parameters_done;
337 // parameter count is still in c_rarg6
338 // and parameter pointer identifying param 1 is in c_rarg5
339 __ beqz(c_rarg6, parameters_done);
340
341 address loop = __ pc();
342 __ ld(t0, Address(c_rarg5, 0));
343 __ addi(c_rarg5, c_rarg5, wordSize);
344 __ subi(c_rarg6, c_rarg6, 1);
345 __ push_reg(t0);
346 __ bgtz(c_rarg6, loop);
347
348 __ BIND(parameters_done);
349
350 // call Java entry -- passing methdoOop, and current sp
351 // xmethod: Method*
352 // x19_sender_sp: sender sp
353 BLOCK_COMMENT("call Java function");
354 __ mv(x19_sender_sp, sp);
355 __ jalr(c_rarg4);
356
357 // save current address for use by exception handling code
358
359 return_address = __ pc();
360
361 // store result depending on type (everything that is not
362 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
363 // n.b. this assumes Java returns an integral result in x10
364 // and a floating result in j_farg0
365 __ ld(j_rarg2, result);
366 Label is_long, is_float, is_double, exit;
367 __ ld(j_rarg1, result_type);
368 __ mv(t0, (u1)T_OBJECT);
369 __ beq(j_rarg1, t0, is_long);
370 __ mv(t0, (u1)T_LONG);
371 __ beq(j_rarg1, t0, is_long);
372 __ mv(t0, (u1)T_FLOAT);
373 __ beq(j_rarg1, t0, is_float);
374 __ mv(t0, (u1)T_DOUBLE);
375 __ beq(j_rarg1, t0, is_double);
376
377 // handle T_INT case
378 __ sw(x10, Address(j_rarg2));
379
380 __ BIND(exit);
381
382 // pop parameters
383 __ addi(esp, fp, sp_after_call_off * wordSize);
384
385 #ifdef ASSERT
386 // verify that threads correspond
387 {
388 Label L, S;
389 __ ld(t0, thread);
390 __ bne(xthread, t0, S);
391 __ get_thread(t0);
392 __ beq(xthread, t0, L);
393 __ BIND(S);
394 __ stop("StubRoutines::call_stub: threads must correspond");
395 __ BIND(L);
396 }
397 #endif
398
399 __ pop_cont_fastpath(xthread);
400
401 // restore callee-save registers
402 __ fld(f27, f27_save);
403 __ fld(f26, f26_save);
404 __ fld(f25, f25_save);
405 __ fld(f24, f24_save);
406 __ fld(f23, f23_save);
407 __ fld(f22, f22_save);
408 __ fld(f21, f21_save);
409 __ fld(f20, f20_save);
410 __ fld(f19, f19_save);
411 __ fld(f18, f18_save);
412 __ fld(f9, f9_save);
413 __ fld(f8, f8_save);
414
415 __ ld(x27, x27_save);
416 __ ld(x26, x26_save);
417 __ ld(x25, x25_save);
418 __ ld(x24, x24_save);
419 __ ld(x23, x23_save);
420 __ ld(x22, x22_save);
421 __ ld(x21, x21_save);
422 __ ld(x20, x20_save);
423 __ ld(x19, x19_save);
424 __ ld(x18, x18_save);
425
426 __ ld(x9, x9_save);
427
428 // restore frm
429 Label skip_fsrm;
430 __ ld(t0, frm_save);
431 __ frrm(t1);
432 __ beq(t0, t1, skip_fsrm);
433 __ fsrm(t0);
434 __ bind(skip_fsrm);
435
436 __ ld(c_rarg0, call_wrapper);
437 __ ld(c_rarg1, result);
438 __ ld(c_rarg2, result_type);
439 __ ld(c_rarg3, method);
440 __ ld(c_rarg4, entry_point);
441 __ ld(c_rarg5, parameters);
442 __ ld(c_rarg6, parameter_size);
443 __ ld(c_rarg7, thread);
444
445 // leave frame and return to caller
446 __ leave();
447 __ ret();
448
449 // handle return types different from T_INT
450
451 __ BIND(is_long);
452 __ sd(x10, Address(j_rarg2, 0));
453 __ j(exit);
454
455 __ BIND(is_float);
456 __ fsw(j_farg0, Address(j_rarg2, 0), t0);
457 __ j(exit);
458
459 __ BIND(is_double);
460 __ fsd(j_farg0, Address(j_rarg2, 0), t0);
461 __ j(exit);
462
463 return start;
464 }
465
466 // Return point for a Java call if there's an exception thrown in
467 // Java code. The exception is caught and transformed into a
468 // pending exception stored in JavaThread that can be tested from
469 // within the VM.
470 //
471 // Note: Usually the parameters are removed by the callee. In case
472 // of an exception crossing an activation frame boundary, that is
473 // not the case if the callee is compiled code => need to setup the
474 // sp.
475 //
476 // x10: exception oop
477
478 address generate_catch_exception() {
479 StubId stub_id = StubId::stubgen_catch_exception_id;
480 StubCodeMark mark(this, stub_id);
481 address start = __ pc();
482
483 // same as in generate_call_stub():
484 const Address thread(fp, thread_off * wordSize);
485
486 #ifdef ASSERT
487 // verify that threads correspond
488 {
489 Label L, S;
490 __ ld(t0, thread);
491 __ bne(xthread, t0, S);
492 __ get_thread(t0);
493 __ beq(xthread, t0, L);
494 __ bind(S);
495 __ stop("StubRoutines::catch_exception: threads must correspond");
496 __ bind(L);
497 }
498 #endif
499
500 // set pending exception
501 __ verify_oop(x10);
502
503 __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
504 __ mv(t0, (address)__FILE__);
505 __ sd(t0, Address(xthread, Thread::exception_file_offset()));
506 __ mv(t0, (int)__LINE__);
507 __ sw(t0, Address(xthread, Thread::exception_line_offset()));
508
509 // complete return to VM
510 assert(StubRoutines::_call_stub_return_address != nullptr,
511 "_call_stub_return_address must have been generated before");
512 __ j(RuntimeAddress(StubRoutines::_call_stub_return_address));
513
514 return start;
515 }
516
517 // Continuation point for runtime calls returning with a pending
518 // exception. The pending exception check happened in the runtime
519 // or native call stub. The pending exception in Thread is
520 // converted into a Java-level exception.
521 //
522 // Contract with Java-level exception handlers:
523 // x10: exception
524 // x13: throwing pc
525 //
526 // NOTE: At entry of this stub, exception-pc must be in RA !!
527
528 // NOTE: this is always used as a jump target within generated code
529 // so it just needs to be generated code with no x86 prolog
530
531 address generate_forward_exception() {
532 StubId stub_id = StubId::stubgen_forward_exception_id;
533 StubCodeMark mark(this, stub_id);
534 address start = __ pc();
535
536 // Upon entry, RA points to the return address returning into
537 // Java (interpreted or compiled) code; i.e., the return address
538 // becomes the throwing pc.
539 //
540 // Arguments pushed before the runtime call are still on the stack
541 // but the exception handler will reset the stack pointer ->
542 // ignore them. A potential result in registers can be ignored as
543 // well.
544
545 #ifdef ASSERT
546 // make sure this code is only executed if there is a pending exception
547 {
548 Label L;
549 __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
550 __ bnez(t0, L);
551 __ stop("StubRoutines::forward exception: no pending exception (1)");
552 __ bind(L);
553 }
554 #endif
555
556 // compute exception handler into x9
557
558 // call the VM to find the handler address associated with the
559 // caller address. pass thread in x10 and caller pc (ret address)
560 // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
561 // the stack.
562 __ mv(c_rarg1, ra);
563 // ra will be trashed by the VM call so we move it to x9
564 // (callee-saved) because we also need to pass it to the handler
565 // returned by this call.
566 __ mv(x9, ra);
567 BLOCK_COMMENT("call exception_handler_for_return_address");
568 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
569 SharedRuntime::exception_handler_for_return_address),
570 xthread, c_rarg1);
571 // we should not really care that ra is no longer the callee
572 // address. we saved the value the handler needs in x9 so we can
573 // just copy it to x13. however, the C2 handler will push its own
574 // frame and then calls into the VM and the VM code asserts that
575 // the PC for the frame above the handler belongs to a compiled
576 // Java method. So, we restore ra here to satisfy that assert.
577 __ mv(ra, x9);
578 // setup x10 & x13 & clear pending exception
579 __ mv(x13, x9);
580 __ mv(x9, x10);
581 __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
582 __ sd(zr, Address(xthread, Thread::pending_exception_offset()));
583
584 #ifdef ASSERT
585 // make sure exception is set
586 {
587 Label L;
588 __ bnez(x10, L);
589 __ stop("StubRoutines::forward exception: no pending exception (2)");
590 __ bind(L);
591 }
592 #endif
593
594 // continue at exception handler
595 // x10: exception
596 // x13: throwing pc
597 // x9: exception handler
598 __ verify_oop(x10);
599 __ jr(x9);
600
601 return start;
602 }
603
604 // Non-destructive plausibility checks for oops
605 //
606 // Arguments:
607 // x10: oop to verify
608 // t0: error message
609 //
610 // Stack after saving c_rarg3:
611 // [tos + 0]: saved c_rarg3
612 // [tos + 1]: saved c_rarg2
613 // [tos + 2]: saved ra
614 // [tos + 3]: saved t1
615 // [tos + 4]: saved x10
616 // [tos + 5]: saved t0
617 address generate_verify_oop() {
618
619 StubId stub_id = StubId::stubgen_verify_oop_id;
620 StubCodeMark mark(this, stub_id);
621 address start = __ pc();
622
623 Label exit, error;
624
625 __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3
626
627 __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
628 __ ld(c_rarg3, Address(c_rarg2));
629 __ addi(c_rarg3, c_rarg3, 1);
630 __ sd(c_rarg3, Address(c_rarg2));
631
632 // object is in x10
633 // make sure object is 'reasonable'
634 __ beqz(x10, exit); // if obj is null it is OK
635
636 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
637 bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);
638
639 // return if everything seems ok
640 __ bind(exit);
641
642 __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
643 __ ret();
644
645 // handle errors
646 __ bind(error);
647 __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
648
649 __ push_reg(RegSet::range(x0, x31), sp);
650 // debug(char* msg, int64_t pc, int64_t regs[])
651 __ mv(c_rarg0, t0); // pass address of error message
652 __ mv(c_rarg1, ra); // pass return address
653 __ mv(c_rarg2, sp); // pass address of regs on stack
654 #ifndef PRODUCT
655 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
656 #endif
657 BLOCK_COMMENT("call MacroAssembler::debug");
658 __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
659 __ ebreak();
660
661 return start;
662 }
663
664 // The inner part of zero_words().
665 //
666 // Inputs:
667 // x28: the HeapWord-aligned base address of an array to zero.
668 // x29: the count in HeapWords, x29 > 0.
669 //
670 // Returns x28 and x29, adjusted for the caller to clear.
671 // x28: the base address of the tail of words left to clear.
672 // x29: the number of words in the tail.
673 // x29 < MacroAssembler::zero_words_block_size.
674
675 address generate_zero_blocks() {
676 Label done;
677
678 const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;
679
680 __ align(CodeEntryAlignment);
681 StubId stub_id = StubId::stubgen_zero_blocks_id;
682 StubCodeMark mark(this, stub_id);
683 address start = __ pc();
684
685 if (UseBlockZeroing) {
686 int zicboz_block_size = VM_Version::zicboz_block_size.value();
687 // Ensure count >= 2 * zicboz_block_size so that it still deserves
688 // a cbo.zero after alignment.
689 Label small;
690 int low_limit = MAX2(2 * zicboz_block_size, (int)BlockZeroingLowLimit) / wordSize;
691 __ mv(tmp1, low_limit);
692 __ blt(cnt, tmp1, small);
693 __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
694 __ bind(small);
695 }
696
697 {
698 // Clear the remaining blocks.
699 Label loop;
700 __ mv(tmp1, MacroAssembler::zero_words_block_size);
701 __ blt(cnt, tmp1, done);
702 __ bind(loop);
703 for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
704 __ sd(zr, Address(base, i * wordSize));
705 }
706 __ addi(base, base, MacroAssembler::zero_words_block_size * wordSize);
707 __ subi(cnt, cnt, MacroAssembler::zero_words_block_size);
708 __ bge(cnt, tmp1, loop);
709 __ bind(done);
710 }
711
712 __ ret();
713
714 return start;
715 }
716
// Direction of a bulk copy; the value doubles as the sign applied to
// per-word offsets.
enum copy_direction {
  copy_forwards  = 1,
  copy_backwards = -1
};
721
722 // Bulk copy of blocks of 8 words.
723 //
724 // count is a count of words.
725 //
726 // Precondition: count >= 8
727 //
728 // Postconditions:
729 //
730 // The least significant bit of count contains the remaining count
731 // of words to copy. The rest of count is trash.
732 //
733 // s and d are adjusted to point to the remaining words to copy
734 //
735 address generate_copy_longs(StubId stub_id, Register s, Register d, Register count) {
736 BasicType type;
737 copy_direction direction;
738 switch (stub_id) {
739 case StubId::stubgen_copy_byte_f_id:
740 direction = copy_forwards;
741 type = T_BYTE;
742 break;
743 case StubId::stubgen_copy_byte_b_id:
744 direction = copy_backwards;
745 type = T_BYTE;
746 break;
747 default:
748 ShouldNotReachHere();
749 }
750 int unit = wordSize * direction;
751 int bias = wordSize;
752
753 const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
754 tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
755
756 const Register stride = x30;
757
758 assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
759 tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
760 assert_different_registers(s, d, count, t0);
761
762 Label again, drain;
763 StubCodeMark mark(this, stub_id);
764 __ align(CodeEntryAlignment);
765 address start = __ pc();
766
767 if (direction == copy_forwards) {
768 __ sub(s, s, bias);
769 __ sub(d, d, bias);
770 }
771
772 #ifdef ASSERT
773 // Make sure we are never given < 8 words
774 {
775 Label L;
776
777 __ mv(t0, 8);
778 __ bge(count, t0, L);
779 __ stop("genrate_copy_longs called with < 8 words");
780 __ bind(L);
781 }
782 #endif
783
784 __ ld(tmp_reg0, Address(s, 1 * unit));
785 __ ld(tmp_reg1, Address(s, 2 * unit));
786 __ ld(tmp_reg2, Address(s, 3 * unit));
787 __ ld(tmp_reg3, Address(s, 4 * unit));
788 __ ld(tmp_reg4, Address(s, 5 * unit));
789 __ ld(tmp_reg5, Address(s, 6 * unit));
790 __ ld(tmp_reg6, Address(s, 7 * unit));
791 __ ld(tmp_reg7, Address(s, 8 * unit));
792 __ addi(s, s, 8 * unit);
793
794 __ subi(count, count, 16);
795 __ bltz(count, drain);
796
797 __ bind(again);
798
799 __ sd(tmp_reg0, Address(d, 1 * unit));
800 __ sd(tmp_reg1, Address(d, 2 * unit));
801 __ sd(tmp_reg2, Address(d, 3 * unit));
802 __ sd(tmp_reg3, Address(d, 4 * unit));
803 __ sd(tmp_reg4, Address(d, 5 * unit));
804 __ sd(tmp_reg5, Address(d, 6 * unit));
805 __ sd(tmp_reg6, Address(d, 7 * unit));
806 __ sd(tmp_reg7, Address(d, 8 * unit));
807
808 __ ld(tmp_reg0, Address(s, 1 * unit));
809 __ ld(tmp_reg1, Address(s, 2 * unit));
810 __ ld(tmp_reg2, Address(s, 3 * unit));
811 __ ld(tmp_reg3, Address(s, 4 * unit));
812 __ ld(tmp_reg4, Address(s, 5 * unit));
813 __ ld(tmp_reg5, Address(s, 6 * unit));
814 __ ld(tmp_reg6, Address(s, 7 * unit));
815 __ ld(tmp_reg7, Address(s, 8 * unit));
816
817 __ addi(s, s, 8 * unit);
818 __ addi(d, d, 8 * unit);
819
820 __ subi(count, count, 8);
821 __ bgez(count, again);
822
823 // Drain
824 __ bind(drain);
825
826 __ sd(tmp_reg0, Address(d, 1 * unit));
827 __ sd(tmp_reg1, Address(d, 2 * unit));
828 __ sd(tmp_reg2, Address(d, 3 * unit));
829 __ sd(tmp_reg3, Address(d, 4 * unit));
830 __ sd(tmp_reg4, Address(d, 5 * unit));
831 __ sd(tmp_reg5, Address(d, 6 * unit));
832 __ sd(tmp_reg6, Address(d, 7 * unit));
833 __ sd(tmp_reg7, Address(d, 8 * unit));
834 __ addi(d, d, 8 * unit);
835
836 {
837 Label L1, L2;
838 __ test_bit(t0, count, 2);
839 __ beqz(t0, L1);
840
841 __ ld(tmp_reg0, Address(s, 1 * unit));
842 __ ld(tmp_reg1, Address(s, 2 * unit));
843 __ ld(tmp_reg2, Address(s, 3 * unit));
844 __ ld(tmp_reg3, Address(s, 4 * unit));
845 __ addi(s, s, 4 * unit);
846
847 __ sd(tmp_reg0, Address(d, 1 * unit));
848 __ sd(tmp_reg1, Address(d, 2 * unit));
849 __ sd(tmp_reg2, Address(d, 3 * unit));
850 __ sd(tmp_reg3, Address(d, 4 * unit));
851 __ addi(d, d, 4 * unit);
852
853 __ bind(L1);
854
855 if (direction == copy_forwards) {
856 __ addi(s, s, bias);
857 __ addi(d, d, bias);
858 }
859
860 __ test_bit(t0, count, 1);
861 __ beqz(t0, L2);
862 if (direction == copy_backwards) {
863 __ addi(s, s, 2 * unit);
864 __ ld(tmp_reg0, Address(s));
865 __ ld(tmp_reg1, Address(s, wordSize));
866 __ addi(d, d, 2 * unit);
867 __ sd(tmp_reg0, Address(d));
868 __ sd(tmp_reg1, Address(d, wordSize));
869 } else {
870 __ ld(tmp_reg0, Address(s));
871 __ ld(tmp_reg1, Address(s, wordSize));
872 __ addi(s, s, 2 * unit);
873 __ sd(tmp_reg0, Address(d));
874 __ sd(tmp_reg1, Address(d, wordSize));
875 __ addi(d, d, 2 * unit);
876 }
877 __ bind(L2);
878 }
879
880 __ ret();
881
882 return start;
883 }
884
885 typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
886
887 void copy_memory_v(Register s, Register d, Register count, int step) {
888 bool is_backward = step < 0;
889 int granularity = g_uabs(step);
890
891 const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
892 assert_different_registers(s, d, cnt, vl, tmp1, tmp2);
893 Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
894 Label loop_forward, loop_backward, done;
895
896 __ mv(dst, d);
897 __ mv(src, s);
898 __ mv(cnt, count);
899
900 __ bind(loop_forward);
901 __ vsetvli(vl, cnt, sew, Assembler::m8);
902 if (is_backward) {
903 __ bne(vl, cnt, loop_backward);
904 }
905
906 __ vlex_v(v0, src, sew);
907 __ sub(cnt, cnt, vl);
908 if (sew != Assembler::e8) {
909 // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
910 __ slli(vl, vl, sew);
911 }
912 __ add(src, src, vl);
913
914 __ vsex_v(v0, dst, sew);
915 __ add(dst, dst, vl);
916 __ bnez(cnt, loop_forward);
917
918 if (is_backward) {
919 __ j(done);
920
921 __ bind(loop_backward);
922 __ sub(t0, cnt, vl);
923 if (sew != Assembler::e8) {
924 // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
925 __ slli(t0, t0, sew);
926 }
927 __ add(tmp1, s, t0);
928 __ vlex_v(v0, tmp1, sew);
929 __ add(tmp2, d, t0);
930 __ vsex_v(v0, tmp2, sew);
931 __ sub(cnt, cnt, vl);
932 __ bnez(cnt, loop_forward);
933 __ bind(done);
934 }
935 }
936
937 // All-singing all-dancing memory copy.
938 //
939 // Copy count units of memory from s to d. The size of a unit is
940 // step, which can be positive or negative depending on the direction
941 // of copy.
942 //
943 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
944 Register s, Register d, Register count, int step) {
945 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
946 if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
947 return copy_memory_v(s, d, count, step);
948 }
949
950 bool is_backwards = step < 0;
951 int granularity = g_uabs(step);
952
953 const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
954 const Register gct1 = x28, gct2 = x29, gct3 = t2;
955
956 Label same_aligned;
957 Label copy_big, copy32_loop, copy8_loop, copy_small, done;
958
959 // The size of copy32_loop body increases significantly with ZGC GC barriers.
960 // Need conditional far branches to reach a point beyond the loop in this case.
961 bool is_far = UseZGC;
962
963 __ beqz(count, done, is_far);
964 __ slli(cnt, count, exact_log2(granularity));
965 if (is_backwards) {
966 __ add(src, s, cnt);
967 __ add(dst, d, cnt);
968 } else {
969 __ mv(src, s);
970 __ mv(dst, d);
971 }
972
973 if (is_aligned) {
974 __ subi(t0, cnt, 32);
975 __ bgez(t0, copy32_loop);
976 __ subi(t0, cnt, 8);
977 __ bgez(t0, copy8_loop, is_far);
978 __ j(copy_small);
979 } else {
980 __ mv(t0, 16);
981 __ blt(cnt, t0, copy_small, is_far);
982
983 __ xorr(t0, src, dst);
984 __ andi(t0, t0, 0b111);
985 __ bnez(t0, copy_small, is_far);
986
987 __ bind(same_aligned);
988 __ andi(t0, src, 0b111);
989 __ beqz(t0, copy_big);
990 if (is_backwards) {
991 __ addi(src, src, step);
992 __ addi(dst, dst, step);
993 }
994 bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
995 bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
996 if (!is_backwards) {
997 __ addi(src, src, step);
998 __ addi(dst, dst, step);
999 }
1000 __ subi(cnt, cnt, granularity);
1001 __ beqz(cnt, done, is_far);
1002 __ j(same_aligned);
1003
1004 __ bind(copy_big);
1005 __ mv(t0, 32);
1006 __ blt(cnt, t0, copy8_loop, is_far);
1007 }
1008
1009 __ bind(copy32_loop);
1010 if (is_backwards) {
1011 __ subi(src, src, wordSize * 4);
1012 __ subi(dst, dst, wordSize * 4);
1013 }
1014 // we first load 32 bytes, then write it, so the direction here doesn't matter
1015 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
1016 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8), gct1);
1017 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
1018 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);
1019
1020 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
1021 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8), tmp4, gct1, gct2, gct3);
1022 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
1023 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);
1024
1025 if (!is_backwards) {
1026 __ addi(src, src, wordSize * 4);
1027 __ addi(dst, dst, wordSize * 4);
1028 }
1029 __ subi(t0, cnt, 32 + wordSize * 4);
1030 __ subi(cnt, cnt, wordSize * 4);
1031 __ bgez(t0, copy32_loop); // cnt >= 32, do next loop
1032
1033 __ beqz(cnt, done); // if that's all - done
1034
1035 __ subi(t0, cnt, 8); // if not - copy the reminder
1036 __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop
1037
1038 __ bind(copy8_loop);
1039 if (is_backwards) {
1040 __ subi(src, src, wordSize);
1041 __ subi(dst, dst, wordSize);
1042 }
1043 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
1044 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
1045
1046 if (!is_backwards) {
1047 __ addi(src, src, wordSize);
1048 __ addi(dst, dst, wordSize);
1049 }
1050 __ subi(t0, cnt, 8 + wordSize);
1051 __ subi(cnt, cnt, wordSize);
1052 __ bgez(t0, copy8_loop); // cnt >= 8, do next loop
1053
1054 __ beqz(cnt, done); // if that's all - done
1055
1056 __ bind(copy_small);
1057 if (is_backwards) {
1058 __ addi(src, src, step);
1059 __ addi(dst, dst, step);
1060 }
1061
1062 bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
1063 bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
1064
1065 if (!is_backwards) {
1066 __ addi(src, src, step);
1067 __ addi(dst, dst, step);
1068 }
1069 __ subi(cnt, cnt, granularity);
1070 __ bgtz(cnt, copy_small);
1071
1072 __ bind(done);
1073 }
1074
1075 // Scan over array at a for count oops, verifying each one.
1076 // Preserves a and count, clobbers t0 and t1.
1077 void verify_oop_array(size_t size, Register a, Register count, Register temp) {
1078 Label loop, end;
1079 __ mv(t1, zr);
1080 __ slli(t0, count, exact_log2(size));
1081 __ bind(loop);
1082 __ bgeu(t1, t0, end);
1083
1084 __ add(temp, a, t1);
1085 if (size == (size_t)wordSize) {
1086 __ ld(temp, Address(temp, 0));
1087 __ verify_oop(temp);
1088 } else {
1089 __ lwu(temp, Address(temp, 0));
1090 __ decode_heap_oop(temp); // calls verify_oop
1091 }
1092 __ add(t1, t1, size);
1093 __ j(loop);
1094 __ bind(end);
1095 }
1096
1097 // Arguments:
1098 // stub_id - is used to name the stub and identify all details of
1099 // how to perform the copy.
1100 //
1101 // nopush_entry - is assigned to the stub's post push entry point
1102 // unless it is null
1103 //
1104 // Inputs:
1105 // c_rarg0 - source array address
1106 // c_rarg1 - destination array address
1107 // c_rarg2 - element count, treated as ssize_t, can be zero
1108 //
1109 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1110 // the hardware handle it. The two dwords within qwords that span
1111 // cache line boundaries will still be loaded and stored atomically.
1112 //
1113 // Side Effects: nopush_entry is set to the (post push) entry point
1114 // so it can be used by the corresponding conjoint
1115 // copy method
1116 //
1117 address generate_disjoint_copy(StubId stub_id, address* nopush_entry) {
1118 size_t size;
1119 bool aligned;
1120 bool is_oop;
1121 bool dest_uninitialized;
1122 switch (stub_id) {
1123 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1124 size = sizeof(jbyte);
1125 aligned = false;
1126 is_oop = false;
1127 dest_uninitialized = false;
1128 break;
1129 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1130 size = sizeof(jbyte);
1131 aligned = true;
1132 is_oop = false;
1133 dest_uninitialized = false;
1134 break;
1135 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1136 size = sizeof(jshort);
1137 aligned = false;
1138 is_oop = false;
1139 dest_uninitialized = false;
1140 break;
1141 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1142 size = sizeof(jshort);
1143 aligned = true;
1144 is_oop = false;
1145 dest_uninitialized = false;
1146 break;
1147 case StubId::stubgen_jint_disjoint_arraycopy_id:
1148 size = sizeof(jint);
1149 aligned = false;
1150 is_oop = false;
1151 dest_uninitialized = false;
1152 break;
1153 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1154 size = sizeof(jint);
1155 aligned = true;
1156 is_oop = false;
1157 dest_uninitialized = false;
1158 break;
1159 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1160 // since this is always aligned we can (should!) use the same
1161 // stub as for case arrayof_jlong_disjoint_arraycopy
1162 ShouldNotReachHere();
1163 break;
1164 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1165 size = sizeof(jlong);
1166 aligned = true;
1167 is_oop = false;
1168 dest_uninitialized = false;
1169 break;
1170 case StubId::stubgen_oop_disjoint_arraycopy_id:
1171 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1172 aligned = !UseCompressedOops;
1173 is_oop = true;
1174 dest_uninitialized = false;
1175 break;
1176 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1177 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1178 aligned = !UseCompressedOops;
1179 is_oop = true;
1180 dest_uninitialized = false;
1181 break;
1182 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1183 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1184 aligned = !UseCompressedOops;
1185 is_oop = true;
1186 dest_uninitialized = true;
1187 break;
1188 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1189 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1190 aligned = !UseCompressedOops;
1191 is_oop = true;
1192 dest_uninitialized = true;
1193 break;
1194 default:
1195 ShouldNotReachHere();
1196 break;
1197 }
1198
1199 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1200 RegSet saved_reg = RegSet::of(s, d, count);
1201 __ align(CodeEntryAlignment);
1202 StubCodeMark mark(this, stub_id);
1203 address start = __ pc();
1204 __ enter();
1205
1206 if (nopush_entry != nullptr) {
1207 *nopush_entry = __ pc();
1208 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1209 BLOCK_COMMENT("Entry:");
1210 }
1211
1212 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1213 if (dest_uninitialized) {
1214 decorators |= IS_DEST_UNINITIALIZED;
1215 }
1216 if (aligned) {
1217 decorators |= ARRAYCOPY_ALIGNED;
1218 }
1219
1220 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1221 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1222
1223 if (is_oop) {
1224 // save regs before copy_memory
1225 __ push_reg(RegSet::of(d, count), sp);
1226 }
1227
1228 {
1229 // UnsafeMemoryAccess page error: continue after unsafe access
1230 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1231 UnsafeMemoryAccessMark umam(this, add_entry, true);
1232 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1233 }
1234
1235 if (is_oop) {
1236 __ pop_reg(RegSet::of(d, count), sp);
1237 if (VerifyOops) {
1238 verify_oop_array(size, d, count, t2);
1239 }
1240 }
1241
1242 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0);
1243
1244 __ leave();
1245 __ mv(x10, zr); // return 0
1246 __ ret();
1247 return start;
1248 }
1249
1250 // Arguments:
1251 // stub_id - is used to name the stub and identify all details of
1252 // how to perform the copy.
1253 //
1254 // nooverlap_target - identifes the (post push) entry for the
1255 // corresponding disjoint copy routine which can be
1256 // jumped to if the ranges do not actually overlap
1257 //
1258 // nopush_entry - is assigned to the stub's post push entry point
1259 // unless it is null
1260 //
1261 // Inputs:
1262 // c_rarg0 - source array address
1263 // c_rarg1 - destination array address
1264 // c_rarg2 - element count, treated as ssize_t, can be zero
1265 //
1266 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1267 // the hardware handle it. The two dwords within qwords that span
1268 // cache line boundaries will still be loaded and stored atomically.
1269 //
1270 // Side Effects:
1271 // nopush_entry is set to the no-overlap entry point so it can be
1272 // used by some other conjoint copy method
1273 //
1274 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1275 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1276 RegSet saved_regs = RegSet::of(s, d, count);
1277 int size;
1278 bool aligned;
1279 bool is_oop;
1280 bool dest_uninitialized;
1281 switch (stub_id) {
1282 case StubId::stubgen_jbyte_arraycopy_id:
1283 size = sizeof(jbyte);
1284 aligned = false;
1285 is_oop = false;
1286 dest_uninitialized = false;
1287 break;
1288 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1289 size = sizeof(jbyte);
1290 aligned = true;
1291 is_oop = false;
1292 dest_uninitialized = false;
1293 break;
1294 case StubId::stubgen_jshort_arraycopy_id:
1295 size = sizeof(jshort);
1296 aligned = false;
1297 is_oop = false;
1298 dest_uninitialized = false;
1299 break;
1300 case StubId::stubgen_arrayof_jshort_arraycopy_id:
1301 size = sizeof(jshort);
1302 aligned = true;
1303 is_oop = false;
1304 dest_uninitialized = false;
1305 break;
1306 case StubId::stubgen_jint_arraycopy_id:
1307 size = sizeof(jint);
1308 aligned = false;
1309 is_oop = false;
1310 dest_uninitialized = false;
1311 break;
1312 case StubId::stubgen_arrayof_jint_arraycopy_id:
1313 size = sizeof(jint);
1314 aligned = true;
1315 is_oop = false;
1316 dest_uninitialized = false;
1317 break;
1318 case StubId::stubgen_jlong_arraycopy_id:
1319 // since this is always aligned we can (should!) use the same
1320 // stub as for case arrayof_jlong_disjoint_arraycopy
1321 ShouldNotReachHere();
1322 break;
1323 case StubId::stubgen_arrayof_jlong_arraycopy_id:
1324 size = sizeof(jlong);
1325 aligned = true;
1326 is_oop = false;
1327 dest_uninitialized = false;
1328 break;
1329 case StubId::stubgen_oop_arraycopy_id:
1330 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1331 aligned = !UseCompressedOops;
1332 is_oop = true;
1333 dest_uninitialized = false;
1334 break;
1335 case StubId::stubgen_arrayof_oop_arraycopy_id:
1336 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1337 aligned = !UseCompressedOops;
1338 is_oop = true;
1339 dest_uninitialized = false;
1340 break;
1341 case StubId::stubgen_oop_arraycopy_uninit_id:
1342 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1343 aligned = !UseCompressedOops;
1344 is_oop = true;
1345 dest_uninitialized = true;
1346 break;
1347 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1348 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1349 aligned = !UseCompressedOops;
1350 is_oop = true;
1351 dest_uninitialized = true;
1352 break;
1353 default:
1354 ShouldNotReachHere();
1355 }
1356
1357 StubCodeMark mark(this, stub_id);
1358 address start = __ pc();
1359 __ enter();
1360
1361 if (nopush_entry != nullptr) {
1362 *nopush_entry = __ pc();
1363 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1364 BLOCK_COMMENT("Entry:");
1365 }
1366
1367 // use fwd copy when (d-s) above_equal (count*size)
1368 __ sub(t0, d, s);
1369 __ slli(t1, count, exact_log2(size));
1370 Label L_continue;
1371 __ bltu(t0, t1, L_continue);
1372 __ j(RuntimeAddress(nooverlap_target));
1373 __ bind(L_continue);
1374
1375 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1376 if (dest_uninitialized) {
1377 decorators |= IS_DEST_UNINITIALIZED;
1378 }
1379 if (aligned) {
1380 decorators |= ARRAYCOPY_ALIGNED;
1381 }
1382
1383 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1384 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1385
1386 if (is_oop) {
1387 // save regs before copy_memory
1388 __ push_reg(RegSet::of(d, count), sp);
1389 }
1390
1391 {
1392 // UnsafeMemoryAccess page error: continue after unsafe access
1393 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1394 UnsafeMemoryAccessMark umam(this, add_entry, true);
1395 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1396 }
1397
1398 if (is_oop) {
1399 __ pop_reg(RegSet::of(d, count), sp);
1400 if (VerifyOops) {
1401 verify_oop_array(size, d, count, t2);
1402 }
1403 }
1404 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0);
1405 __ leave();
1406 __ mv(x10, zr); // return 0
1407 __ ret();
1408 return start;
1409 }
1410
1411 // Helper for generating a dynamic type check.
1412 // Smashes t0, t1.
1413 void generate_type_check(Register sub_klass,
1414 Register super_check_offset,
1415 Register super_klass,
1416 Register result,
1417 Register tmp1,
1418 Register tmp2,
1419 Label& L_success) {
1420 assert_different_registers(sub_klass, super_check_offset, super_klass);
1421
1422 BLOCK_COMMENT("type_check:");
1423
1424 Label L_miss;
1425
1426 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
1427 __ check_klass_subtype_slow_path(sub_klass, super_klass, tmp1, tmp2, &L_success, nullptr);
1428
1429 // Fall through on failure!
1430 __ BIND(L_miss);
1431 }
1432
1433 //
1434 // Generate checkcasting array copy stub
1435 //
1436 // Input:
1437 // c_rarg0 - source array address
1438 // c_rarg1 - destination array address
1439 // c_rarg2 - element count, treated as ssize_t, can be zero
1440 // c_rarg3 - size_t ckoff (super_check_offset)
1441 // c_rarg4 - oop ckval (super_klass)
1442 //
1443 // Output:
1444 // x10 == 0 - success
1445 // x10 == -1^K - failure, where K is partial transfer count
1446 //
1447 address generate_checkcast_copy(StubId stub_id, address* nopush_entry) {
1448 bool dest_uninitialized;
1449 switch (stub_id) {
1450 case StubId::stubgen_checkcast_arraycopy_id:
1451 dest_uninitialized = false;
1452 break;
1453 case StubId::stubgen_checkcast_arraycopy_uninit_id:
1454 dest_uninitialized = true;
1455 break;
1456 default:
1457 ShouldNotReachHere();
1458 }
1459
1460 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1461
1462 // Input registers (after setup_arg_regs)
1463 const Register from = c_rarg0; // source array address
1464 const Register to = c_rarg1; // destination array address
1465 const Register count = c_rarg2; // elementscount
1466 const Register ckoff = c_rarg3; // super_check_offset
1467 const Register ckval = c_rarg4; // super_klass
1468
1469 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1470
1471 // Registers used as temps (x7, x9, x18 are save-on-entry)
1472 const Register count_save = x19; // orig elementscount
1473 const Register start_to = x18; // destination array start address
1474 const Register copied_oop = x7; // actual oop copied
1475 const Register r9_klass = x9; // oop._klass
1476
1477 // Registers used as gc temps (x15, x16, x17 are save-on-call)
1478 const Register gct1 = x15, gct2 = x16, gct3 = x17;
1479
1480 //---------------------------------------------------------------
1481 // Assembler stub will be used for this call to arraycopy
1482 // if the two arrays are subtypes of Object[] but the
1483 // destination array type is not equal to or a supertype
1484 // of the source type. Each element must be separately
1485 // checked.
1486
1487 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1488 copied_oop, r9_klass, count_save);
1489
1490 __ align(CodeEntryAlignment);
1491 StubCodeMark mark(this, stub_id);
1492 address start = __ pc();
1493
1494 __ enter(); // required for proper stackwalking of RuntimeStub frame
1495
1496 // Caller of this entry point must set up the argument registers.
1497 if (nopush_entry != nullptr) {
1498 *nopush_entry = __ pc();
1499 BLOCK_COMMENT("Entry:");
1500 }
1501
1502 // Empty array: Nothing to do
1503 __ beqz(count, L_done);
1504
1505 __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1506
1507 #ifdef ASSERT
1508 BLOCK_COMMENT("assert consistent ckoff/ckval");
1509 // The ckoff and ckval must be mutually consistent,
1510 // even though caller generates both.
1511 { Label L;
1512 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1513 __ lwu(start_to, Address(ckval, sco_offset));
1514 __ beq(ckoff, start_to, L);
1515 __ stop("super_check_offset inconsistent");
1516 __ bind(L);
1517 }
1518 #endif //ASSERT
1519
1520 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1521 if (dest_uninitialized) {
1522 decorators |= IS_DEST_UNINITIALIZED;
1523 }
1524
1525 bool is_oop = true;
1526 int element_size = UseCompressedOops ? 4 : 8;
1527
1528 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1529 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1530
1531 // save the original count
1532 __ mv(count_save, count);
1533
1534 // Copy from low to high addresses
1535 __ mv(start_to, to); // Save destination array start address
1536 __ j(L_load_element);
1537
1538 // ======== begin loop ========
1539 // (Loop is rotated; its entry is L_load_element.)
1540 // Loop control:
1541 // for count to 0 do
1542 // copied_oop = load_heap_oop(from++)
1543 // ... generate_type_check ...
1544 // store_heap_oop(to++, copied_oop)
1545 // end
1546
1547 __ align(OptoLoopAlignment);
1548
1549 __ BIND(L_store_element);
1550 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1551 Address(to, 0), copied_oop,
1552 gct1, gct2, gct3);
1553 __ addi(to, to, UseCompressedOops ? 4 : 8);
1554 __ subi(count, count, 1);
1555 __ beqz(count, L_do_card_marks);
1556
1557 // ======== loop entry is here ========
1558 __ BIND(L_load_element);
1559 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1560 copied_oop, Address(from, 0),
1561 gct1);
1562 __ addi(from, from, UseCompressedOops ? 4 : 8);
1563 __ beqz(copied_oop, L_store_element);
1564
1565 __ load_klass(r9_klass, copied_oop);// query the object klass
1566
1567 BLOCK_COMMENT("type_check:");
1568 generate_type_check(r9_klass, /*sub_klass*/
1569 ckoff, /*super_check_offset*/
1570 ckval, /*super_klass*/
1571 x10, /*result*/
1572 gct1, /*tmp1*/
1573 gct2, /*tmp2*/
1574 L_store_element);
1575
1576 // Fall through on failure!
1577
1578 // ======== end loop ========
1579
1580 // It was a real error; we must depend on the caller to finish the job.
1581 // Register count = remaining oops, count_orig = total oops.
1582 // Emit GC store barriers for the oops we have copied and report
1583 // their number to the caller.
1584
1585 __ sub(count, count_save, count); // K = partially copied oop count
1586 __ xori(count, count, -1); // report (-1^K) to caller
1587 __ beqz(count, L_done_pop);
1588
1589 __ BIND(L_do_card_marks);
1590 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0);
1591
1592 __ bind(L_done_pop);
1593 __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1594 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1595
1596 __ bind(L_done);
1597 __ mv(x10, count);
1598 __ leave();
1599 __ ret();
1600
1601 return start;
1602 }
1603
1604 // Perform range checks on the proposed arraycopy.
1605 // Kills temp, but nothing else.
1606 // Also, clean the sign bits of src_pos and dst_pos.
1607 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
1608 Register src_pos, // source position (c_rarg1)
1609 Register dst, // destination array oo (c_rarg2)
1610 Register dst_pos, // destination position (c_rarg3)
1611 Register length,
1612 Register temp,
1613 Label& L_failed) {
1614 BLOCK_COMMENT("arraycopy_range_checks:");
1615
1616 assert_different_registers(t0, temp);
1617
1618 // if [src_pos + length > arrayOop(src)->length()] then FAIL
1619 __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1620 __ addw(temp, length, src_pos);
1621 __ bgtu(temp, t0, L_failed);
1622
1623 // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1624 __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1625 __ addw(temp, length, dst_pos);
1626 __ bgtu(temp, t0, L_failed);
1627
1628 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1629 __ zext(src_pos, src_pos, 32);
1630 __ zext(dst_pos, dst_pos, 32);
1631
1632 BLOCK_COMMENT("arraycopy_range_checks done");
1633 }
1634
1635 address generate_unsafecopy_common_error_exit() {
1636 address start = __ pc();
1637 __ mv(x10, 0);
1638 __ leave();
1639 __ ret();
1640 return start;
1641 }
1642
1643 //
1644 // Generate 'unsafe' set memory stub
1645 // Though just as safe as the other stubs, it takes an unscaled
1646 // size_t (# bytes) argument instead of an element count.
1647 //
1648 // Input:
1649 // c_rarg0 - destination array address
1650 // c_rarg1 - byte count (size_t)
1651 // c_rarg2 - byte value
1652 //
1653 address generate_unsafe_setmemory() {
1654 __ align(CodeEntryAlignment);
1655 StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
1656 StubCodeMark mark(this, stub_id);
1657 address start = __ pc();
1658
1659 // bump this on entry, not on exit:
1660 // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
1661
1662 Label L_fill_elements;
1663
1664 const Register dest = c_rarg0;
1665 const Register count = c_rarg1;
1666 const Register value = c_rarg2;
1667 const Register cnt_words = x28; // temp register
1668 const Register tmp_reg = x29; // temp register
1669
1670 // Mark remaining code as such which performs Unsafe accesses.
1671 UnsafeMemoryAccessMark umam(this, true, false);
1672
1673 __ enter(); // required for proper stackwalking of RuntimeStub frame
1674
1675 // if count < 8, jump to L_fill_elements
1676 __ mv(tmp_reg, 8); // 8 bytes fill by element
1677 __ bltu(count, tmp_reg, L_fill_elements);
1678
1679 // Propagate byte to 64-bit width
1680 // 8 bit -> 16 bit
1681 __ zext(value, value, 8);
1682 __ slli(tmp_reg, value, 8);
1683 __ orr(value, value, tmp_reg);
1684 // 16 bit -> 32 bit
1685 __ slli(tmp_reg, value, 16);
1686 __ orr(value, value, tmp_reg);
1687 // 32 bit -> 64 bit
1688 __ slli(tmp_reg, value, 32);
1689 __ orr(value, value, tmp_reg);
1690
1691 // Align source address at 8 bytes address boundary.
1692 Label L_skip_align1, L_skip_align2, L_skip_align4;
1693 // One byte misalignment happens.
1694 __ test_bit(tmp_reg, dest, 0);
1695 __ beqz(tmp_reg, L_skip_align1);
1696 __ sb(value, Address(dest, 0));
1697 __ addi(dest, dest, 1);
1698 __ subi(count, count, 1);
1699
1700 __ bind(L_skip_align1);
1701 // Two bytes misalignment happens.
1702 __ test_bit(tmp_reg, dest, 1);
1703 __ beqz(tmp_reg, L_skip_align2);
1704 __ sh(value, Address(dest, 0));
1705 __ addi(dest, dest, 2);
1706 __ subi(count, count, 2);
1707
1708 __ bind(L_skip_align2);
1709 // Four bytes misalignment happens.
1710 __ test_bit(tmp_reg, dest, 2);
1711 __ beqz(tmp_reg, L_skip_align4);
1712 __ sw(value, Address(dest, 0));
1713 __ addi(dest, dest, 4);
1714 __ subi(count, count, 4);
1715 __ bind(L_skip_align4);
1716
1717 // Fill large chunks
1718 __ srli(cnt_words, count, 3); // number of words
1719 __ slli(tmp_reg, cnt_words, 3);
1720 __ sub(count, count, tmp_reg);
1721 {
1722 __ fill_words(dest, cnt_words, value);
1723 }
1724
1725 // Handle copies less than 8 bytes
1726 __ bind(L_fill_elements);
1727 Label L_fill_2, L_fill_1, L_exit;
1728 __ test_bit(tmp_reg, count, 2);
1729 __ beqz(tmp_reg, L_fill_2);
1730 __ sb(value, Address(dest, 0));
1731 __ sb(value, Address(dest, 1));
1732 __ sb(value, Address(dest, 2));
1733 __ sb(value, Address(dest, 3));
1734 __ addi(dest, dest, 4);
1735
1736 __ bind(L_fill_2);
1737 __ test_bit(tmp_reg, count, 1);
1738 __ beqz(tmp_reg, L_fill_1);
1739 __ sb(value, Address(dest, 0));
1740 __ sb(value, Address(dest, 1));
1741 __ addi(dest, dest, 2);
1742
1743 __ bind(L_fill_1);
1744 __ test_bit(tmp_reg, count, 0);
1745 __ beqz(tmp_reg, L_exit);
1746 __ sb(value, Address(dest, 0));
1747
1748 __ bind(L_exit);
1749 __ leave();
1750 __ ret();
1751
1752 return start;
1753 }
1754
1755 //
1756 // Generate 'unsafe' array copy stub
1757 // Though just as safe as the other stubs, it takes an unscaled
1758 // size_t argument instead of an element count.
1759 //
1760 // Input:
1761 // c_rarg0 - source array address
1762 // c_rarg1 - destination array address
1763 // c_rarg2 - byte count, treated as ssize_t, can be zero
1764 //
1765 // Examines the alignment of the operands and dispatches
1766 // to a long, int, short, or byte copy loop.
1767 //
1768 address generate_unsafe_copy(address byte_copy_entry,
1769 address short_copy_entry,
1770 address int_copy_entry,
1771 address long_copy_entry) {
1772 assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1773 int_copy_entry != nullptr && long_copy_entry != nullptr);
1774 Label L_long_aligned, L_int_aligned, L_short_aligned;
1775 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1776
1777 __ align(CodeEntryAlignment);
1778 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
1779 StubCodeMark mark(this, stub_id);
1780 address start = __ pc();
1781 __ enter(); // required for proper stackwalking of RuntimeStub frame
1782
1783 // bump this on entry, not on exit:
1784 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1785
1786 __ orr(t0, s, d);
1787 __ orr(t0, t0, count);
1788
1789 __ andi(t0, t0, BytesPerLong - 1);
1790 __ beqz(t0, L_long_aligned);
1791 __ andi(t0, t0, BytesPerInt - 1);
1792 __ beqz(t0, L_int_aligned);
1793 __ test_bit(t0, t0, 0);
1794 __ beqz(t0, L_short_aligned);
1795 __ j(RuntimeAddress(byte_copy_entry));
1796
1797 __ BIND(L_short_aligned);
1798 __ srli(count, count, LogBytesPerShort); // size => short_count
1799 __ j(RuntimeAddress(short_copy_entry));
1800 __ BIND(L_int_aligned);
1801 __ srli(count, count, LogBytesPerInt); // size => int_count
1802 __ j(RuntimeAddress(int_copy_entry));
1803 __ BIND(L_long_aligned);
1804 __ srli(count, count, LogBytesPerLong); // size => long_count
1805 __ j(RuntimeAddress(long_copy_entry));
1806
1807 return start;
1808 }
1809
1810 //
1811 // Generate generic array copy stubs
1812 //
1813 // Input:
1814 // c_rarg0 - src oop
1815 // c_rarg1 - src_pos (32-bits)
1816 // c_rarg2 - dst oop
1817 // c_rarg3 - dst_pos (32-bits)
1818 // c_rarg4 - element count (32-bits)
1819 //
1820 // Output:
1821 // x10 == 0 - success
1822 // x10 == -1^K - failure, where K is partial transfer count
1823 //
1824 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
1825 address int_copy_entry, address oop_copy_entry,
1826 address long_copy_entry, address checkcast_copy_entry) {
1827 assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1828 int_copy_entry != nullptr && oop_copy_entry != nullptr &&
1829 long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
1830 Label L_failed, L_failed_0, L_objArray;
1831 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1832
1833 // Input registers
1834 const Register src = c_rarg0; // source array oop
1835 const Register src_pos = c_rarg1; // source position
1836 const Register dst = c_rarg2; // destination array oop
1837 const Register dst_pos = c_rarg3; // destination position
1838 const Register length = c_rarg4;
1839
1840 // Registers used as temps
1841 const Register dst_klass = c_rarg5;
1842
1843 __ align(CodeEntryAlignment);
1844
1845 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
1846 StubCodeMark mark(this, stub_id);
1847
1848 address start = __ pc();
1849
1850 __ enter(); // required for proper stackwalking of RuntimeStub frame
1851
1852 // bump this on entry, not on exit:
1853 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1854
1855 //-----------------------------------------------------------------------
1856 // Assembler stub will be used for this call to arraycopy
1857 // if the following conditions are met:
1858 //
1859 // (1) src and dst must not be null.
1860 // (2) src_pos must not be negative.
1861 // (3) dst_pos must not be negative.
1862 // (4) length must not be negative.
1863 // (5) src klass and dst klass should be the same and not null.
1864 // (6) src and dst should be arrays.
1865 // (7) src_pos + length must not exceed length of src.
1866 // (8) dst_pos + length must not exceed length of dst.
1867 //
1868
1869 // if src is null then return -1
1870 __ beqz(src, L_failed);
1871
1872 // if [src_pos < 0] then return -1
1873 __ sext(t0, src_pos, 32);
1874 __ bltz(t0, L_failed);
1875
1876 // if dst is null then return -1
1877 __ beqz(dst, L_failed);
1878
1879 // if [dst_pos < 0] then return -1
1880 __ sext(t0, dst_pos, 32);
1881 __ bltz(t0, L_failed);
1882
1883 // registers used as temp
1884 const Register scratch_length = x28; // elements count to copy
1885 const Register scratch_src_klass = x29; // array klass
1886 const Register lh = x30; // layout helper
1887
1888 // if [length < 0] then return -1
1889 __ sext(scratch_length, length, 32); // length (elements count, 32-bits value)
1890 __ bltz(scratch_length, L_failed);
1891
1892 __ load_klass(scratch_src_klass, src);
1893 #ifdef ASSERT
1894 {
1895 BLOCK_COMMENT("assert klasses not null {");
1896 Label L1, L2;
1897 __ bnez(scratch_src_klass, L2); // it is broken if klass is null
1898 __ bind(L1);
1899 __ stop("broken null klass");
1900 __ bind(L2);
1901 __ load_klass(t0, dst, t1);
1902 __ beqz(t0, L1); // this would be broken also
1903 BLOCK_COMMENT("} assert klasses not null done");
1904 }
1905 #endif
1906
1907 // Load layout helper (32-bits)
1908 //
1909 // |array_tag| | header_size | element_type | |log2_element_size|
1910 // 32 30 24 16 8 2 0
1911 //
1912 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1913 //
1914
1915 const int lh_offset = in_bytes(Klass::layout_helper_offset());
1916
1917 // Handle objArrays completely differently...
1918 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1919 __ lw(lh, Address(scratch_src_klass, lh_offset));
1920 __ mv(t0, objArray_lh);
1921 __ beq(lh, t0, L_objArray);
1922
1923 // if [src->klass() != dst->klass()] then return -1
1924 __ load_klass(t1, dst);
1925 __ bne(t1, scratch_src_klass, L_failed);
1926
1927 // if src->is_Array() isn't null then return -1
1928 // i.e. (lh >= 0)
1929 __ bgez(lh, L_failed);
1930
1931 // At this point, it is known to be a typeArray (array_tag 0x3).
1932 #ifdef ASSERT
1933 {
1934 BLOCK_COMMENT("assert primitive array {");
1935 Label L;
1936 __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1937 __ bge(lh, t1, L);
1938 __ stop("must be a primitive array");
1939 __ bind(L);
1940 BLOCK_COMMENT("} assert primitive array done");
1941 }
1942 #endif
1943
1944 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1945 t1, L_failed);
1946
1947 // TypeArrayKlass
1948 //
1949 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1950 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1951 //
1952
1953 const Register t0_offset = t0; // array offset
1954 const Register x30_elsize = lh; // element size
1955
1956 // Get array_header_in_bytes()
1957 int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1958 int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1959 __ slli(t0_offset, lh, XLEN - lh_header_size_msb); // left shift to remove 24 ~ 32;
1960 __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
1961
1962 __ add(src, src, t0_offset); // src array offset
1963 __ add(dst, dst, t0_offset); // dst array offset
1964 BLOCK_COMMENT("choose copy loop based on element size");
1965
1966 // next registers should be set before the jump to corresponding stub
1967 const Register from = c_rarg0; // source array address
1968 const Register to = c_rarg1; // destination array address
1969 const Register count = c_rarg2; // elements count
1970
1971 // 'from', 'to', 'count' registers should be set in such order
1972 // since they are the same as 'src', 'src_pos', 'dst'.
1973
1974 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1975
1976 // The possible values of elsize are 0-3, i.e. exact_log2(element
1977 // size in bytes). We do a simple bitwise binary search.
1978 __ BIND(L_copy_bytes);
1979 __ test_bit(t0, x30_elsize, 1);
1980 __ bnez(t0, L_copy_ints);
1981 __ test_bit(t0, x30_elsize, 0);
1982 __ bnez(t0, L_copy_shorts);
1983 __ add(from, src, src_pos); // src_addr
1984 __ add(to, dst, dst_pos); // dst_addr
1985 __ sext(count, scratch_length, 32); // length
1986 __ j(RuntimeAddress(byte_copy_entry));
1987
1988 __ BIND(L_copy_shorts);
1989 __ shadd(from, src_pos, src, t0, 1); // src_addr
1990 __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
1991 __ sext(count, scratch_length, 32); // length
1992 __ j(RuntimeAddress(short_copy_entry));
1993
1994 __ BIND(L_copy_ints);
1995 __ test_bit(t0, x30_elsize, 0);
1996 __ bnez(t0, L_copy_longs);
1997 __ shadd(from, src_pos, src, t0, 2); // src_addr
1998 __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
1999 __ sext(count, scratch_length, 32); // length
2000 __ j(RuntimeAddress(int_copy_entry));
2001
2002 __ BIND(L_copy_longs);
2003 #ifdef ASSERT
2004 {
2005 BLOCK_COMMENT("assert long copy {");
2006 Label L;
2007 __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
2008 __ sext(lh, lh, 32);
2009 __ mv(t0, LogBytesPerLong);
2010 __ beq(x30_elsize, t0, L);
2011 __ stop("must be long copy, but elsize is wrong");
2012 __ bind(L);
2013 BLOCK_COMMENT("} assert long copy done");
2014 }
2015 #endif
2016 __ shadd(from, src_pos, src, t0, 3); // src_addr
2017 __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
2018 __ sext(count, scratch_length, 32); // length
2019 __ j(RuntimeAddress(long_copy_entry));
2020
2021 // ObjArrayKlass
2022 __ BIND(L_objArray);
2023 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2024
2025 Label L_plain_copy, L_checkcast_copy;
2026 // test array classes for subtyping
2027 __ load_klass(t2, dst);
2028 __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
2029
2030 // Identically typed arrays can be copied without element-wise checks.
2031 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2032 t1, L_failed);
2033
2034 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2035 __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2036 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2037 __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2038 __ sext(count, scratch_length, 32); // length
2039 __ BIND(L_plain_copy);
2040 __ j(RuntimeAddress(oop_copy_entry));
2041
2042 __ BIND(L_checkcast_copy);
2043 // live at this point: scratch_src_klass, scratch_length, t2 (dst_klass)
2044 {
2045 // Before looking at dst.length, make sure dst is also an objArray.
2046 __ lwu(t0, Address(t2, lh_offset));
2047 __ mv(t1, objArray_lh);
2048 __ bne(t0, t1, L_failed);
2049
2050 // It is safe to examine both src.length and dst.length.
2051 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2052 t2, L_failed);
2053
2054 __ load_klass(dst_klass, dst); // reload
2055
2056 // Marshal the base address arguments now, freeing registers.
2057 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2058 __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2059 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2060 __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2061 __ sext(count, length, 32); // length (reloaded)
2062 const Register sco_temp = c_rarg3; // this register is free now
2063 assert_different_registers(from, to, count, sco_temp,
2064 dst_klass, scratch_src_klass);
2065
2066 // Generate the type check.
2067 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2068 __ lwu(sco_temp, Address(dst_klass, sco_offset));
2069
2070 // Smashes t0, t1
2071 generate_type_check(scratch_src_klass, sco_temp, dst_klass, noreg, noreg, noreg, L_plain_copy);
2072
2073 // Fetch destination element klass from the ObjArrayKlass header.
2074 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2075 __ ld(dst_klass, Address(dst_klass, ek_offset));
2076 __ lwu(sco_temp, Address(dst_klass, sco_offset));
2077
2078 // the checkcast_copy loop needs two extra arguments:
2079 assert(c_rarg3 == sco_temp, "#3 already in place");
2080 // Set up arguments for checkcast_copy_entry.
2081 __ mv(c_rarg4, dst_klass); // dst.klass.element_klass
2082 __ j(RuntimeAddress(checkcast_copy_entry));
2083 }
2084
2085 __ BIND(L_failed);
2086 __ mv(x10, -1);
2087 __ leave(); // required for proper stackwalking of RuntimeStub frame
2088 __ ret();
2089
2090 return start;
2091 }
2092
2093 //
2094 // Generate stub for array fill. If "aligned" is true, the
2095 // "to" address is assumed to be heapword aligned.
2096 //
2097 // Arguments for generated stub:
2098 // to: c_rarg0
2099 // value: c_rarg1
2100 // count: c_rarg2 treated as signed
2101 //
2102 address generate_fill(StubId stub_id) {
2103 BasicType t;
2104 bool aligned;
2105
2106 switch (stub_id) {
2107 case StubId::stubgen_jbyte_fill_id:
2108 t = T_BYTE;
2109 aligned = false;
2110 break;
2111 case StubId::stubgen_jshort_fill_id:
2112 t = T_SHORT;
2113 aligned = false;
2114 break;
2115 case StubId::stubgen_jint_fill_id:
2116 t = T_INT;
2117 aligned = false;
2118 break;
2119 case StubId::stubgen_arrayof_jbyte_fill_id:
2120 t = T_BYTE;
2121 aligned = true;
2122 break;
2123 case StubId::stubgen_arrayof_jshort_fill_id:
2124 t = T_SHORT;
2125 aligned = true;
2126 break;
2127 case StubId::stubgen_arrayof_jint_fill_id:
2128 t = T_INT;
2129 aligned = true;
2130 break;
2131 default:
2132 ShouldNotReachHere();
2133 };
2134
2135 __ align(CodeEntryAlignment);
2136 StubCodeMark mark(this, stub_id);
2137 address start = __ pc();
2138
2139 BLOCK_COMMENT("Entry:");
2140
2141 const Register to = c_rarg0; // source array address
2142 const Register value = c_rarg1; // value
2143 const Register count = c_rarg2; // elements count
2144
2145 const Register bz_base = x28; // base for block_zero routine
2146 const Register cnt_words = x29; // temp register
2147 const Register tmp_reg = t1;
2148
2149 __ enter();
2150
2151 Label L_fill_elements;
2152
2153 int shift = -1;
2154 switch (t) {
2155 case T_BYTE:
2156 shift = 0;
2157 // Short arrays (< 8 bytes) fill by element
2158 __ mv(tmp_reg, 8 >> shift);
2159 __ bltu(count, tmp_reg, L_fill_elements);
2160
2161 // Zero extend value
2162 // 8 bit -> 16 bit
2163 __ zext(value, value, 8);
2164 __ slli(tmp_reg, value, 8);
2165 __ orr(value, value, tmp_reg);
2166
2167 // 16 bit -> 32 bit
2168 __ slli(tmp_reg, value, 16);
2169 __ orr(value, value, tmp_reg);
2170 break;
2171 case T_SHORT:
2172 shift = 1;
2173 // Short arrays (< 8 bytes) fill by element
2174 __ mv(tmp_reg, 8 >> shift);
2175 __ bltu(count, tmp_reg, L_fill_elements);
2176
2177 // Zero extend value
2178 // 16 bit -> 32 bit
2179 __ zext(value, value, 16);
2180 __ slli(tmp_reg, value, 16);
2181 __ orr(value, value, tmp_reg);
2182 break;
2183 case T_INT:
2184 shift = 2;
2185 // Short arrays (< 8 bytes) fill by element
2186 __ mv(tmp_reg, 8 >> shift);
2187 __ bltu(count, tmp_reg, L_fill_elements);
2188 break;
2189 default: ShouldNotReachHere();
2190 }
2191
2192 // Align source address at 8 bytes address boundary.
2193 Label L_skip_align1, L_skip_align2, L_skip_align4;
2194 if (!aligned) {
2195 switch (t) {
2196 case T_BYTE:
2197 // One byte misalignment happens only for byte arrays.
2198 __ test_bit(tmp_reg, to, 0);
2199 __ beqz(tmp_reg, L_skip_align1);
2200 __ sb(value, Address(to, 0));
2201 __ addi(to, to, 1);
2202 __ subiw(count, count, 1);
2203 __ bind(L_skip_align1);
2204 // Fallthrough
2205 case T_SHORT:
2206 // Two bytes misalignment happens only for byte and short (char) arrays.
2207 __ test_bit(tmp_reg, to, 1);
2208 __ beqz(tmp_reg, L_skip_align2);
2209 __ sh(value, Address(to, 0));
2210 __ addi(to, to, 2);
2211 __ subiw(count, count, 2 >> shift);
2212 __ bind(L_skip_align2);
2213 // Fallthrough
2214 case T_INT:
2215 // Align to 8 bytes, we know we are 4 byte aligned to start.
2216 __ test_bit(tmp_reg, to, 2);
2217 __ beqz(tmp_reg, L_skip_align4);
2218 __ sw(value, Address(to, 0));
2219 __ addi(to, to, 4);
2220 __ subiw(count, count, 4 >> shift);
2221 __ bind(L_skip_align4);
2222 break;
2223 default: ShouldNotReachHere();
2224 }
2225 }
2226
2227 //
2228 // Fill large chunks
2229 //
2230 __ srliw(cnt_words, count, 3 - shift); // number of words
2231
2232 // 32 bit -> 64 bit
2233 __ zext(value, value, 32);
2234 __ slli(tmp_reg, value, 32);
2235 __ orr(value, value, tmp_reg);
2236
2237 __ slli(tmp_reg, cnt_words, 3 - shift);
2238 __ subw(count, count, tmp_reg);
2239 {
2240 __ fill_words(to, cnt_words, value);
2241 }
2242
2243 // Handle copies less than 8 bytes.
2244 // Address may not be heapword aligned.
2245 Label L_fill_1, L_fill_2, L_exit;
2246 __ bind(L_fill_elements);
2247 switch (t) {
2248 case T_BYTE:
2249 __ test_bit(tmp_reg, count, 2);
2250 __ beqz(tmp_reg, L_fill_2);
2251 __ sb(value, Address(to, 0));
2252 __ sb(value, Address(to, 1));
2253 __ sb(value, Address(to, 2));
2254 __ sb(value, Address(to, 3));
2255 __ addi(to, to, 4);
2256
2257 __ bind(L_fill_2);
2258 __ test_bit(tmp_reg, count, 1);
2259 __ beqz(tmp_reg, L_fill_1);
2260 __ sb(value, Address(to, 0));
2261 __ sb(value, Address(to, 1));
2262 __ addi(to, to, 2);
2263
2264 __ bind(L_fill_1);
2265 __ test_bit(tmp_reg, count, 0);
2266 __ beqz(tmp_reg, L_exit);
2267 __ sb(value, Address(to, 0));
2268 break;
2269 case T_SHORT:
2270 __ test_bit(tmp_reg, count, 1);
2271 __ beqz(tmp_reg, L_fill_2);
2272 __ sh(value, Address(to, 0));
2273 __ sh(value, Address(to, 2));
2274 __ addi(to, to, 4);
2275
2276 __ bind(L_fill_2);
2277 __ test_bit(tmp_reg, count, 0);
2278 __ beqz(tmp_reg, L_exit);
2279 __ sh(value, Address(to, 0));
2280 break;
2281 case T_INT:
2282 __ beqz(count, L_exit);
2283 __ sw(value, Address(to, 0));
2284 break;
2285 default: ShouldNotReachHere();
2286 }
2287 __ bind(L_exit);
2288 __ leave();
2289 __ ret();
2290
2291 return start;
2292 }
2293
2294 void generate_arraycopy_stubs() {
2295 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
2296 // entry immediately following their stack push. This can be used
2297 // as a post-push branch target for compatible stubs when they
2298 // identify a special case that can be handled by the fallback
2299 // stub e.g a disjoint copy stub may be use as a special case
2300 // fallback for its compatible conjoint copy stub.
2301 //
2302 // A no push entry is always returned in the following local and
2303 // then published by assigning to the appropriate entry field in
2304 // class StubRoutines. The entry value is then passed to the
2305 // generator for the compatible stub. That means the entry must be
2306 // listed when saving to/restoring from the AOT cache, ensuring
2307 // that the inter-stub jumps are noted at AOT-cache save and
2308 // relocated at AOT cache load.
2309 address nopush_entry = nullptr;
2310
2311 // generate the common exit first so later stubs can rely on it if
2312 // they want an UnsafeMemoryAccess exit non-local to the stub
2313 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2314 // register the stub as the default exit with class UnsafeMemoryAccess
2315 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2316
2317 // generate and publish riscv-specific bulk copy routines first
2318 // so we can call them from other copy stubs
2319 StubRoutines::riscv::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, c_rarg0, c_rarg1, t1);
2320 StubRoutines::riscv::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, c_rarg0, c_rarg1, t1);
2321
2322 StubRoutines::riscv::_zero_blocks = generate_zero_blocks();
2323
2324 //*** jbyte
2325 // Always need aligned and unaligned versions
2326 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2327 // disjoint nopush entry is needed by conjoint copy
2328 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2329 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2330 // conjoint nopush entry is needed by generic/unsafe copy
2331 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2332 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2333 // disjoint arrayof nopush entry is needed by conjoint copy
2334 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2335 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2336
2337 //*** jshort
2338 // Always need aligned and unaligned versions
2339 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2340 // disjoint nopush entry is needed by conjoint copy
2341 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
2342 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
2343 // conjoint nopush entry is used by generic/unsafe copy
2344 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
2345 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
2346 // disjoint arrayof nopush entry is needed by conjoint copy
2347 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
2348 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
2349
2350 //*** jint
2351 // Aligned versions
2352 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
2353 // disjoint arrayof nopush entry is needed by conjoint copy
2354 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
2355 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
2356 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2357 // entry_jint_arraycopy always points to the unaligned version
2358 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
2359 // disjoint nopush entry is needed by conjoint copy
2360 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
2361 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
2362 // conjoint nopush entry is needed by generic/unsafe copy
2363 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
2364
2365 //*** jlong
2366 // It is always aligned
2367 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
2368 // disjoint arrayof nopush entry is needed by conjoint copy
2369 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
2370 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
2371 // conjoint nopush entry is needed by generic/unsafe copy
2372 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
2373 // disjoint normal/nopush and conjoint normal entries are not
2374 // generated since the arrayof versions are the same
2375 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2376 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
2377 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
2378
2379 //*** oops
2380 StubRoutines::_arrayof_oop_disjoint_arraycopy
2381 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
2382 // disjoint arrayof nopush entry is needed by conjoint copy
2383 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
2384 StubRoutines::_arrayof_oop_arraycopy
2385 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
2386 // conjoint arrayof nopush entry is needed by generic/unsafe copy
2387 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
2388 // Aligned versions without pre-barriers
2389 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2390 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
2391 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
2392 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
2393
2394 // note that we don't need a returned nopush entry because the
2395 // generic/unsafe copy does not cater for uninit arrays.
2396 StubRoutines::_arrayof_oop_arraycopy_uninit
2397 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
2398
2399 // for oop copies reuse arrayof entries for non-arrayof cases
2400 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2401 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
2402 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
2403 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2404 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
2405 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
2406
2407 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
2408 // checkcast nopush entry is needed by generic copy
2409 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
2410 // note that we don't need a returned nopush entry because the
2411 // generic copy does not cater for uninit arrays.
2412 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
2413
2414
2415 // unsafe arraycopy may fallback on conjoint stubs
2416 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
2417 StubRoutines::_jshort_arraycopy_nopush,
2418 StubRoutines::_jint_arraycopy_nopush,
2419 StubRoutines::_jlong_arraycopy_nopush);
2420
2421 // generic arraycopy may fallback on conjoint stubs
2422 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
2423 StubRoutines::_jshort_arraycopy_nopush,
2424 StubRoutines::_jint_arraycopy_nopush,
2425 StubRoutines::_oop_arraycopy_nopush,
2426 StubRoutines::_jlong_arraycopy_nopush,
2427 StubRoutines::_checkcast_arraycopy_nopush);
2428
2429 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
2430 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
2431 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
2432 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
2433 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
2434 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
2435
2436 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
2437 }
2438
2439 void aes_load_keys(const Register &key, VectorRegister *working_vregs, int rounds) {
2440 const int step = 16;
2441 for (int i = 0; i < rounds; i++) {
2442 __ vle32_v(working_vregs[i], key);
2443 // The keys are stored in little-endian array, while we need
2444 // to operate in big-endian.
2445 // So performing an endian-swap here with vrev8.v instruction
2446 __ vrev8_v(working_vregs[i], working_vregs[i]);
2447 __ addi(key, key, step);
2448 }
2449 }
2450
2451 void aes_encrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2452 assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2453
2454 __ vxor_vv(res, res, working_vregs[0]);
2455 for (int i = 1; i < rounds - 1; i++) {
2456 __ vaesem_vv(res, working_vregs[i]);
2457 }
2458 __ vaesef_vv(res, working_vregs[rounds - 1]);
2459 }
2460
2461 // Arguments:
2462 //
2463 // Inputs:
2464 // c_rarg0 - source byte array address
2465 // c_rarg1 - destination byte array address
2466 // c_rarg2 - sessionKe (key) in little endian int array
2467 //
2468 address generate_aescrypt_encryptBlock() {
2469 assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2470
2471 __ align(CodeEntryAlignment);
2472 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2473 StubCodeMark mark(this, stub_id);
2474
2475 Label L_aes128, L_aes192;
2476
2477 const Register from = c_rarg0; // source array address
2478 const Register to = c_rarg1; // destination array address
2479 const Register key = c_rarg2; // key array address
2480 const Register keylen = c_rarg3;
2481
2482 VectorRegister working_vregs[] = {
2483 v4, v5, v6, v7, v8, v9, v10, v11,
2484 v12, v13, v14, v15, v16, v17, v18
2485 };
2486 const VectorRegister res = v19;
2487
2488 address start = __ pc();
2489 __ enter();
2490
2491 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2492
2493 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2494 __ vle32_v(res, from);
2495
2496 __ mv(t2, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2497 __ bltu(keylen, t2, L_aes128);
2498 __ beq(keylen, t2, L_aes192);
2499 // Else we fallthrough to the biggest case (256-bit key size)
2500
2501 // Note: the following function performs key += 15*16
2502 aes_load_keys(key, working_vregs, 15);
2503 aes_encrypt(res, working_vregs, 15);
2504 __ vse32_v(res, to);
2505 __ mv(c_rarg0, 0);
2506 __ leave();
2507 __ ret();
2508
2509 __ bind(L_aes192);
2510 // Note: the following function performs key += 13*16
2511 aes_load_keys(key, working_vregs, 13);
2512 aes_encrypt(res, working_vregs, 13);
2513 __ vse32_v(res, to);
2514 __ mv(c_rarg0, 0);
2515 __ leave();
2516 __ ret();
2517
2518 __ bind(L_aes128);
2519 // Note: the following function performs key += 11*16
2520 aes_load_keys(key, working_vregs, 11);
2521 aes_encrypt(res, working_vregs, 11);
2522 __ vse32_v(res, to);
2523 __ mv(c_rarg0, 0);
2524 __ leave();
2525 __ ret();
2526
2527 return start;
2528 }
2529
2530 void aes_decrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2531 assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2532
2533 __ vxor_vv(res, res, working_vregs[rounds - 1]);
2534 for (int i = rounds - 2; i > 0; i--) {
2535 __ vaesdm_vv(res, working_vregs[i]);
2536 }
2537 __ vaesdf_vv(res, working_vregs[0]);
2538 }
2539
2540 // Arguments:
2541 //
2542 // Inputs:
2543 // c_rarg0 - source byte array address
2544 // c_rarg1 - destination byte array address
2545 // c_rarg2 - sessionKe (key) in little endian int array
2546 //
2547 address generate_aescrypt_decryptBlock() {
2548 assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2549
2550 __ align(CodeEntryAlignment);
2551 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2552 StubCodeMark mark(this, stub_id);
2553
2554 Label L_aes128, L_aes192;
2555
2556 const Register from = c_rarg0; // source array address
2557 const Register to = c_rarg1; // destination array address
2558 const Register key = c_rarg2; // key array address
2559 const Register keylen = c_rarg3;
2560
2561 VectorRegister working_vregs[] = {
2562 v4, v5, v6, v7, v8, v9, v10, v11,
2563 v12, v13, v14, v15, v16, v17, v18
2564 };
2565 const VectorRegister res = v19;
2566
2567 address start = __ pc();
2568 __ enter(); // required for proper stackwalking of RuntimeStub frame
2569
2570 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2571
2572 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2573 __ vle32_v(res, from);
2574
2575 __ mv(t2, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2576 __ bltu(keylen, t2, L_aes128);
2577 __ beq(keylen, t2, L_aes192);
2578 // Else we fallthrough to the biggest case (256-bit key size)
2579
2580 // Note: the following function performs key += 15*16
2581 aes_load_keys(key, working_vregs, 15);
2582 aes_decrypt(res, working_vregs, 15);
2583 __ vse32_v(res, to);
2584 __ mv(c_rarg0, 0);
2585 __ leave();
2586 __ ret();
2587
2588 __ bind(L_aes192);
2589 // Note: the following function performs key += 13*16
2590 aes_load_keys(key, working_vregs, 13);
2591 aes_decrypt(res, working_vregs, 13);
2592 __ vse32_v(res, to);
2593 __ mv(c_rarg0, 0);
2594 __ leave();
2595 __ ret();
2596
2597 __ bind(L_aes128);
2598 // Note: the following function performs key += 11*16
2599 aes_load_keys(key, working_vregs, 11);
2600 aes_decrypt(res, working_vregs, 11);
2601 __ vse32_v(res, to);
2602 __ mv(c_rarg0, 0);
2603 __ leave();
2604 __ ret();
2605
2606 return start;
2607 }
2608
2609 void cipherBlockChaining_encryptAESCrypt(int round, Register from, Register to, Register key,
2610 Register rvec, Register input_len) {
2611 const Register len = x29;
2612
2613 VectorRegister working_vregs[] = {
2614 v1, v2, v3, v4, v5, v6, v7, v8,
2615 v9, v10, v11, v12, v13, v14, v15
2616 };
2617
2618 const unsigned int BLOCK_SIZE = 16;
2619
2620 __ mv(len, input_len);
2621 // load init rvec
2622 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2623 __ vle32_v(v16, rvec);
2624
2625 aes_load_keys(key, working_vregs, round);
2626 Label L_enc_loop;
2627 __ bind(L_enc_loop);
2628 // Encrypt from source by block size
2629 __ vle32_v(v17, from);
2630 __ addi(from, from, BLOCK_SIZE);
2631 __ vxor_vv(v16, v16, v17);
2632 aes_encrypt(v16, working_vregs, round);
2633 __ vse32_v(v16, to);
2634 __ addi(to, to, BLOCK_SIZE);
2635 __ subi(len, len, BLOCK_SIZE);
2636 __ bnez(len, L_enc_loop);
2637
2638 // save current rvec and return
2639 __ vse32_v(v16, rvec);
2640 __ mv(x10, input_len);
2641 __ leave();
2642 __ ret();
2643 }
2644
2645 // Arguments:
2646 //
2647 // Inputs:
2648 // c_rarg0 - source byte array address
2649 // c_rarg1 - destination byte array address
2650 // c_rarg2 - K (key) in little endian int array
2651 // c_rarg3 - r vector byte array address
2652 // c_rarg4 - input length
2653 //
2654 // Output:
2655 // x10 - input length
2656 //
2657 address generate_cipherBlockChaining_encryptAESCrypt() {
2658 assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2659 __ align(CodeEntryAlignment);
2660 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
2661 StubCodeMark mark(this, stub_id);
2662
2663 const Register from = c_rarg0;
2664 const Register to = c_rarg1;
2665 const Register key = c_rarg2;
2666 const Register rvec = c_rarg3;
2667 const Register input_len = c_rarg4;
2668
2669 const Register keylen = x28;
2670
2671 address start = __ pc();
2672 __ enter();
2673
2674 Label L_aes128, L_aes192;
2675 // Compute #rounds for AES based on the length of the key array
2676 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2677 __ mv(t0, 52);
2678 __ bltu(keylen, t0, L_aes128);
2679 __ beq(keylen, t0, L_aes192);
2680 // Else we fallthrough to the biggest case (256-bit key size)
2681
2682 // Note: the following function performs key += 15*16
2683 cipherBlockChaining_encryptAESCrypt(15, from, to, key, rvec, input_len);
2684
2685 // Note: the following function performs key += 11*16
2686 __ bind(L_aes128);
2687 cipherBlockChaining_encryptAESCrypt(11, from, to, key, rvec, input_len);
2688
2689 // Note: the following function performs key += 13*16
2690 __ bind(L_aes192);
2691 cipherBlockChaining_encryptAESCrypt(13, from, to, key, rvec, input_len);
2692
2693 return start;
2694 }
2695
2696 void cipherBlockChaining_decryptAESCrypt(int round, Register from, Register to, Register key,
2697 Register rvec, Register input_len) {
2698 const Register len = x29;
2699
2700 VectorRegister working_vregs[] = {
2701 v1, v2, v3, v4, v5, v6, v7, v8,
2702 v9, v10, v11, v12, v13, v14, v15
2703 };
2704
2705 const unsigned int BLOCK_SIZE = 16;
2706
2707 __ mv(len, input_len);
2708 // load init rvec
2709 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2710 __ vle32_v(v16, rvec);
2711
2712 aes_load_keys(key, working_vregs, round);
2713 Label L_dec_loop;
2714 // Decrypt from source by block size
2715 __ bind(L_dec_loop);
2716 __ vle32_v(v17, from);
2717 __ addi(from, from, BLOCK_SIZE);
2718 __ vmv_v_v(v18, v17);
2719 aes_decrypt(v17, working_vregs, round);
2720 __ vxor_vv(v17, v17, v16);
2721 __ vse32_v(v17, to);
2722 __ vmv_v_v(v16, v18);
2723 __ addi(to, to, BLOCK_SIZE);
2724 __ subi(len, len, BLOCK_SIZE);
2725 __ bnez(len, L_dec_loop);
2726
2727 // save current rvec and return
2728 __ vse32_v(v16, rvec);
2729 __ mv(x10, input_len);
2730 __ leave();
2731 __ ret();
2732 }
2733
2734 // Arguments:
2735 //
2736 // Inputs:
2737 // c_rarg0 - source byte array address
2738 // c_rarg1 - destination byte array address
2739 // c_rarg2 - K (key) in little endian int array
2740 // c_rarg3 - r vector byte array address
2741 // c_rarg4 - input length
2742 //
2743 // Output:
2744 // x10 - input length
2745 //
2746 address generate_cipherBlockChaining_decryptAESCrypt() {
2747 assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2748 __ align(CodeEntryAlignment);
2749 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
2750 StubCodeMark mark(this, stub_id);
2751
2752 const Register from = c_rarg0;
2753 const Register to = c_rarg1;
2754 const Register key = c_rarg2;
2755 const Register rvec = c_rarg3;
2756 const Register input_len = c_rarg4;
2757
2758 const Register keylen = x28;
2759
2760 address start = __ pc();
2761 __ enter();
2762
2763 Label L_aes128, L_aes192, L_aes128_loop, L_aes192_loop, L_aes256_loop;
2764 // Compute #rounds for AES based on the length of the key array
2765 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2766 __ mv(t0, 52);
2767 __ bltu(keylen, t0, L_aes128);
2768 __ beq(keylen, t0, L_aes192);
2769 // Else we fallthrough to the biggest case (256-bit key size)
2770
2771 // Note: the following function performs key += 15*16
2772 cipherBlockChaining_decryptAESCrypt(15, from, to, key, rvec, input_len);
2773
2774 // Note: the following function performs key += 11*16
2775 __ bind(L_aes128);
2776 cipherBlockChaining_decryptAESCrypt(11, from, to, key, rvec, input_len);
2777
2778 // Note: the following function performs key += 13*16
2779 __ bind(L_aes192);
2780 cipherBlockChaining_decryptAESCrypt(13, from, to, key, rvec, input_len);
2781
2782 return start;
2783 }
2784
2785 // Load big-endian 128-bit from memory.
2786 void be_load_counter_128(Register counter_hi, Register counter_lo, Register counter) {
2787 __ ld(counter_lo, Address(counter, 8)); // Load 128-bits from counter
2788 __ ld(counter_hi, Address(counter));
2789 __ rev8(counter_lo, counter_lo); // Convert big-endian to little-endian
2790 __ rev8(counter_hi, counter_hi);
2791 }
2792
2793 // Little-endian 128-bit + 64-bit -> 128-bit addition.
2794 void add_counter_128(Register counter_hi, Register counter_lo) {
2795 assert_different_registers(counter_hi, counter_lo, t0);
2796 __ addi(counter_lo, counter_lo, 1);
2797 __ seqz(t0, counter_lo); // Check for result overflow
2798 __ add(counter_hi, counter_hi, t0); // Add 1 if overflow otherwise 0
2799 }
2800
2801 // Store big-endian 128-bit to memory.
2802 void be_store_counter_128(Register counter_hi, Register counter_lo, Register counter) {
2803 assert_different_registers(counter_hi, counter_lo, t0, t1);
2804 __ rev8(t0, counter_lo); // Convert little-endian to big-endian
2805 __ rev8(t1, counter_hi);
2806 __ sd(t0, Address(counter, 8)); // Store 128-bits to counter
2807 __ sd(t1, Address(counter));
2808 }
2809
2810 void counterMode_AESCrypt(int round, Register in, Register out, Register key, Register counter,
2811 Register input_len, Register saved_encrypted_ctr, Register used_ptr) {
2812 // Algorithm:
2813 //
2814 // aes_load_keys();
2815 // load_counter_128(counter_hi, counter_lo, counter);
2816 //
2817 // L_next:
2818 // if (used >= BLOCK_SIZE) goto L_main_loop;
2819 //
2820 // L_encrypt_next:
2821 // *out = *in ^ saved_encrypted_ctr[used]);
2822 // out++; in++; used++; len--;
2823 // if (len == 0) goto L_exit;
2824 // goto L_next;
2825 //
2826 // L_main_loop:
2827 // if (len == 0) goto L_exit;
2828 // saved_encrypted_ctr = aes_encrypt(counter);
2829 //
2830 // add_counter_128(counter_hi, counter_lo);
2831 // be_store_counter_128(counter_hi, counter_lo, counter);
2832 // used = 0;
2833 //
2834 // if(len < BLOCK_SIZE) goto L_encrypt_next;
2835 //
2836 // v_in = load_16Byte(in);
2837 // v_out = load_16Byte(out);
2838 // v_saved_encrypted_ctr = load_16Byte(saved_encrypted_ctr);
2839 // v_out = v_in ^ v_saved_encrypted_ctr;
2840 // out += BLOCK_SIZE;
2841 // in += BLOCK_SIZE;
2842 // len -= BLOCK_SIZE;
2843 // used = BLOCK_SIZE;
2844 // goto L_main_loop;
2845 //
2846 //
2847 // L_exit:
2848 // store(used);
2849 // result = input_len
2850 // return result;
2851
2852 const Register used = x28;
2853 const Register len = x29;
2854 const Register counter_hi = x30;
2855 const Register counter_lo = x31;
2856 const Register block_size = t2;
2857
2858 const unsigned int BLOCK_SIZE = 16;
2859
2860 VectorRegister working_vregs[] = {
2861 v1, v2, v3, v4, v5, v6, v7, v8,
2862 v9, v10, v11, v12, v13, v14, v15
2863 };
2864
2865 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2866
2867 __ lwu(used, Address(used_ptr));
2868 __ mv(len, input_len);
2869 __ mv(block_size, BLOCK_SIZE);
2870
2871 // load keys to working_vregs according to round
2872 aes_load_keys(key, working_vregs, round);
2873
2874 // 128-bit big-endian load
2875 be_load_counter_128(counter_hi, counter_lo, counter);
2876
2877 Label L_next, L_encrypt_next, L_main_loop, L_exit;
2878 // Check the last saved_encrypted_ctr used value, we fall through
2879 // to L_encrypt_next when the used value lower than block_size
2880 __ bind(L_next);
2881 __ bgeu(used, block_size, L_main_loop);
2882
2883 // There is still data left fewer than block_size after L_main_loop
2884 // or last used, we encrypt them one by one.
2885 __ bind(L_encrypt_next);
2886 __ add(t0, saved_encrypted_ctr, used);
2887 __ lbu(t1, Address(t0));
2888 __ lbu(t0, Address(in));
2889 __ xorr(t1, t1, t0);
2890 __ sb(t1, Address(out));
2891 __ addi(in, in, 1);
2892 __ addi(out, out, 1);
2893 __ addi(used, used, 1);
2894 __ subi(len, len, 1);
2895 __ beqz(len, L_exit);
2896 __ j(L_next);
2897
2898 // We will calculate the next saved_encrypted_ctr and encrypt the blocks of data
2899 // one by one until there is less than a full block remaining if len not zero
2900 __ bind(L_main_loop);
2901 __ beqz(len, L_exit);
2902 __ vle32_v(v16, counter);
2903
2904 // encrypt counter according to round
2905 aes_encrypt(v16, working_vregs, round);
2906
2907 __ vse32_v(v16, saved_encrypted_ctr);
2908
2909 // 128-bit little-endian increment
2910 add_counter_128(counter_hi, counter_lo);
2911 // 128-bit big-endian store
2912 be_store_counter_128(counter_hi, counter_lo, counter);
2913
2914 __ mv(used, 0);
2915 // Check if we have a full block_size
2916 __ bltu(len, block_size, L_encrypt_next);
2917
2918 // We have one full block to encrypt at least
2919 __ vle32_v(v17, in);
2920 __ vxor_vv(v16, v16, v17);
2921 __ vse32_v(v16, out);
2922 __ add(out, out, block_size);
2923 __ add(in, in, block_size);
2924 __ sub(len, len, block_size);
2925 __ mv(used, block_size);
2926 __ j(L_main_loop);
2927
2928 __ bind(L_exit);
2929 __ sw(used, Address(used_ptr));
2930 __ mv(x10, input_len);
2931 __ leave();
2932 __ ret();
2933 };
2934
2935 // CTR AES crypt.
2936 // Arguments:
2937 //
2938 // Inputs:
2939 // c_rarg0 - source byte array address
2940 // c_rarg1 - destination byte array address
2941 // c_rarg2 - K (key) in little endian int array
2942 // c_rarg3 - counter vector byte array address
2943 // c_rarg4 - input length
2944 // c_rarg5 - saved encryptedCounter start
2945 // c_rarg6 - saved used length
2946 //
2947 // Output:
2948 // x10 - input length
2949 //
2950 address generate_counterMode_AESCrypt() {
2951 assert(UseAESCTRIntrinsics, "need AES instructions (Zvkned extension) and Zbb extension support");
2952
2953 __ align(CodeEntryAlignment);
2954 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
2955 StubCodeMark mark(this, stub_id);
2956
2957 const Register in = c_rarg0;
2958 const Register out = c_rarg1;
2959 const Register key = c_rarg2;
2960 const Register counter = c_rarg3;
2961 const Register input_len = c_rarg4;
2962 const Register saved_encrypted_ctr = c_rarg5;
2963 const Register used_len_ptr = c_rarg6;
2964
2965 const Register keylen = c_rarg7; // temporary register
2966
2967 const address start = __ pc();
2968 __ enter();
2969
2970 Label L_exit;
2971 __ beqz(input_len, L_exit);
2972
2973 Label L_aes128, L_aes192;
2974 // Compute #rounds for AES based on the length of the key array
2975 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2976 __ mv(t0, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2977 __ bltu(keylen, t0, L_aes128);
2978 __ beq(keylen, t0, L_aes192);
2979 // Else we fallthrough to the biggest case (256-bit key size)
2980
2981 // Note: the following function performs crypt with key += 15*16
2982 counterMode_AESCrypt(15, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2983
2984 // Note: the following function performs crypt with key += 13*16
2985 __ bind(L_aes192);
2986 counterMode_AESCrypt(13, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2987
2988 // Note: the following function performs crypt with key += 11*16
2989 __ bind(L_aes128);
2990 counterMode_AESCrypt(11, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2991
2992 __ bind(L_exit);
2993 __ mv(x10, input_len);
2994 __ leave();
2995 __ ret();
2996
2997 return start;
2998 }
2999
3000 void ghash_loop(Register state, Register subkeyH, Register data, Register blocks,
3001 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3) {
3002 VectorRegister partial_hash = vtmp1;
3003 VectorRegister hash_subkey = vtmp2;
3004 VectorRegister cipher_text = vtmp3;
3005
3006 const unsigned int BLOCK_SIZE = 16;
3007
3008 __ vsetivli(x0, 2, Assembler::e64, Assembler::m1);
3009 __ vle64_v(hash_subkey, subkeyH);
3010 __ vrev8_v(hash_subkey, hash_subkey);
3011 __ vle64_v(partial_hash, state);
3012 __ vrev8_v(partial_hash, partial_hash);
3013
3014 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
3015 Label L_ghash_loop;
3016 __ bind(L_ghash_loop);
3017 __ vle32_v(cipher_text, data);
3018 __ addi(data, data, BLOCK_SIZE);
3019 __ vghsh_vv(partial_hash, hash_subkey, cipher_text);
3020 __ subi(blocks, blocks, 1);
3021 __ bnez(blocks, L_ghash_loop);
3022
3023 __ vsetivli(x0, 2, Assembler::e64, Assembler::m1);
3024 __ vrev8_v(partial_hash, partial_hash);
3025 __ vse64_v(partial_hash, state);
3026 }
3027
3028 /**
3029 * Arguments:
3030 *
3031 * Input:
3032 * c_rarg0 - current state address
3033 * c_rarg1 - H key address
3034 * c_rarg2 - data address
3035 * c_rarg3 - number of blocks
3036 *
3037 * Output:
3038 * Updated state at c_rarg0
3039 */
3040 address generate_ghash_processBlocks() {
3041 assert(UseGHASHIntrinsics, "need GHASH instructions (Zvkg extension) and Zvbb support");
3042
3043 __ align(CodeEntryAlignment);
3044 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
3045 StubCodeMark mark(this, stub_id);
3046
3047 address start = __ pc();
3048 __ enter();
3049
3050 Register state = c_rarg0;
3051 Register subkeyH = c_rarg1;
3052 Register data = c_rarg2;
3053 Register blocks = c_rarg3;
3054
3055 VectorRegister vtmp1 = v1;
3056 VectorRegister vtmp2 = v2;
3057 VectorRegister vtmp3 = v3;
3058
3059 ghash_loop(state, subkeyH, data, blocks, vtmp1, vtmp2, vtmp3);
3060
3061 __ leave();
3062 __ ret();
3063
3064 return start;
3065 }
3066
3067 // code for comparing 8 characters of strings with Latin1 and Utf16 encoding
3068 void compare_string_8_x_LU(Register tmpL, Register tmpU,
3069 Register strL, Register strU, Label& DIFF) {
3070 const Register tmp = x30, tmpLval = x12;
3071
3072 int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
3073 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
3074
3075 #ifdef ASSERT
3076 if (AvoidUnalignedAccesses) {
3077 Label align_ok;
3078 __ andi(t0, strL, 0x7);
3079 __ beqz(t0, align_ok);
3080 __ stop("bad alignment");
3081 __ bind(align_ok);
3082 }
3083 #endif
3084 __ ld(tmpLval, Address(strL));
3085 __ addi(strL, strL, wordSize);
3086
3087 // compare first 4 characters
3088 __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
3089 __ addi(strU, strU, wordSize);
3090 __ inflate_lo32(tmpL, tmpLval);
3091 __ xorr(tmp, tmpU, tmpL);
3092 __ bnez(tmp, DIFF);
3093
3094 // compare second 4 characters
3095 __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
3096 __ addi(strU, strU, wordSize);
3097 __ inflate_hi32(tmpL, tmpLval);
3098 __ xorr(tmp, tmpU, tmpL);
3099 __ bnez(tmp, DIFF);
3100 }
3101
3102 // x10 = result
3103 // x11 = str1
3104 // x12 = cnt1
3105 // x13 = str2
3106 // x14 = cnt2
3107 // x28 = tmp1
3108 // x29 = tmp2
3109 // x30 = tmp3
3110 address generate_compare_long_string_different_encoding(StubId stub_id) {
3111 bool isLU;
3112 switch (stub_id) {
3113 case StubId::stubgen_compare_long_string_LU_id:
3114 isLU = true;
3115 break;
3116 case StubId::stubgen_compare_long_string_UL_id:
3117 isLU = false;
3118 break;
3119 default:
3120 ShouldNotReachHere();
3121 };
3122 __ align(CodeEntryAlignment);
3123 StubCodeMark mark(this, stub_id);
3124 address entry = __ pc();
3125 Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
3126 const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
3127 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;
3128
3129 int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
3130 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
3131
3132 Register strU = isLU ? str2 : str1,
3133 strL = isLU ? str1 : str2,
3134 tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
3135 tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison
3136
3137 if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
3138 // Load 4 bytes from strL to make sure main loop is 8-byte aligned
3139 // cnt2 is >= 68 here, no need to check it for >= 0
3140 __ lwu(tmpL, Address(strL));
3141 __ addi(strL, strL, wordSize / 2);
3142 __ load_long_misaligned(tmpU, Address(strU), tmp4, (base_offset % 8) != 0 ? 4 : 8);
3143 __ addi(strU, strU, wordSize);
3144 __ inflate_lo32(tmp3, tmpL);
3145 __ mv(tmpL, tmp3);
3146 __ xorr(tmp3, tmpU, tmpL);
3147 __ bnez(tmp3, CALCULATE_DIFFERENCE);
3148 __ subi(cnt2, cnt2, wordSize / 2);
3149 }
3150
3151 // we are now 8-bytes aligned on strL when AvoidUnalignedAccesses is true
3152 __ subi(cnt2, cnt2, wordSize * 2);
3153 __ bltz(cnt2, TAIL);
3154 __ bind(SMALL_LOOP); // smaller loop
3155 __ subi(cnt2, cnt2, wordSize * 2);
3156 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3157 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3158 __ bgez(cnt2, SMALL_LOOP);
3159 __ addi(t0, cnt2, wordSize * 2);
3160 __ beqz(t0, DONE);
3161 __ bind(TAIL); // 1..15 characters left
3162 // Aligned access. Load bytes in portions - 4, 2, 1.
3163
3164 __ addi(t0, cnt2, wordSize);
3165 __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
3166 __ bltz(t0, LOAD_LAST);
3167 // remaining characters are greater than or equals to 8, we can do one compare_string_8_x_LU
3168 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3169 __ subi(cnt2, cnt2, wordSize);
3170 __ beqz(cnt2, DONE); // no character left
3171 __ bind(LOAD_LAST); // cnt2 = 1..7 characters left
3172
3173 __ subi(cnt2, cnt2, wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
3174 __ slli(t0, cnt2, 1); // t0 is now an offset in strU which points to last 16 bytes
3175 __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
3176 __ add(strU, strU, t0); // Address of last 16 bytes in UTF-16 string
3177 __ load_int_misaligned(tmpL, Address(strL), t0, false);
3178 __ load_long_misaligned(tmpU, Address(strU), t0, 2);
3179 __ inflate_lo32(tmp3, tmpL);
3180 __ mv(tmpL, tmp3);
3181 __ xorr(tmp3, tmpU, tmpL);
3182 __ bnez(tmp3, CALCULATE_DIFFERENCE);
3183
3184 __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
3185 __ addi(strU, strU, wordSize); // Address of last 8 bytes in UTF-16 string
3186 __ load_int_misaligned(tmpL, Address(strL), t0, false);
3187 __ load_long_misaligned(tmpU, Address(strU), t0, 2);
3188 __ inflate_lo32(tmp3, tmpL);
3189 __ mv(tmpL, tmp3);
3190 __ xorr(tmp3, tmpU, tmpL);
3191 __ bnez(tmp3, CALCULATE_DIFFERENCE);
3192 __ j(DONE); // no character left
3193
3194 // Find the first different characters in the longwords and
3195 // compute their difference.
3196 __ bind(CALCULATE_DIFFERENCE);
3197 // count bits of trailing zero chars
3198 __ ctzc_bits(tmp4, tmp3);
3199 __ srl(tmp1, tmp1, tmp4);
3200 __ srl(tmp2, tmp2, tmp4);
3201 __ zext(tmp1, tmp1, 16);
3202 __ zext(tmp2, tmp2, 16);
3203 __ sub(result, tmp1, tmp2);
3204 __ bind(DONE);
3205 __ ret();
3206 return entry;
3207 }
3208
3209 address generate_method_entry_barrier() {
3210 __ align(CodeEntryAlignment);
3211 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
3212 StubCodeMark mark(this, stub_id);
3213
3214 Label deoptimize_label;
3215
3216 address start = __ pc();
3217
3218 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
3219
3220 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
3221 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
3222 Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
3223 __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
3224 __ lwu(t1, t1);
3225 __ sw(t1, thread_epoch_addr);
3226 // There are two ways this can work:
3227 // - The writer did system icache shootdown after the instruction stream update.
3228 // Hence do nothing.
3229 // - The writer trust us to make sure our icache is in sync before entering.
3230 // Hence use cmodx fence (fence.i, may change).
3231 if (UseCtxFencei) {
3232 __ cmodx_fence();
3233 }
3234 __ membar(__ LoadLoad);
3235 }
3236
3237 __ set_last_Java_frame(sp, fp, ra);
3238
3239 __ enter();
3240 __ addi(t1, sp, wordSize);
3241
3242 __ subi(sp, sp, 4 * wordSize);
3243
3244 __ push_call_clobbered_registers();
3245
3246 __ mv(c_rarg0, t1);
3247 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
3248
3249 __ reset_last_Java_frame(true);
3250
3251 __ mv(t0, x10);
3252
3253 __ pop_call_clobbered_registers();
3254
3255 __ bnez(t0, deoptimize_label);
3256
3257 __ leave();
3258 __ ret();
3259
3260 __ BIND(deoptimize_label);
3261
3262 __ ld(t0, Address(sp, 0));
3263 __ ld(fp, Address(sp, wordSize));
3264 __ ld(ra, Address(sp, wordSize * 2));
3265 __ ld(t1, Address(sp, wordSize * 3));
3266
3267 __ mv(sp, t0);
3268 __ jr(t1);
3269
3270 return start;
3271 }
3272
3273 // x10 = result
3274 // x11 = str1
3275 // x12 = cnt1
3276 // x13 = str2
3277 // x14 = cnt2
3278 // x28 = tmp1
3279 // x29 = tmp2
3280 // x30 = tmp3
3281 // x31 = tmp4
3282 address generate_compare_long_string_same_encoding(StubId stub_id) {
3283 bool isLL;
3284 switch (stub_id) {
3285 case StubId::stubgen_compare_long_string_LL_id:
3286 isLL = true;
3287 break;
3288 case StubId::stubgen_compare_long_string_UU_id:
3289 isLL = false;
3290 break;
3291 default:
3292 ShouldNotReachHere();
3293 };
3294 __ align(CodeEntryAlignment);
3295 StubCodeMark mark(this, stub_id);
3296 address entry = __ pc();
3297 Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
3298 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
3299 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
3300 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
3301 RegSet spilled_regs = RegSet::of(tmp4, tmp5);
3302
3303 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
3304 // update cnt2 counter with already loaded 8 bytes
3305 __ subi(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
3306 // update pointers, because of previous read
3307 __ addi(str1, str1, wordSize);
3308 __ addi(str2, str2, wordSize);
3309 // less than 16 bytes left?
3310 __ subi(cnt2, cnt2, isLL ? 16 : 8);
3311 __ push_reg(spilled_regs, sp);
3312 __ bltz(cnt2, TAIL);
3313 __ bind(SMALL_LOOP);
3314 // compare 16 bytes of strings with same encoding
3315 __ ld(tmp5, Address(str1));
3316 __ addi(str1, str1, 8);
3317 __ xorr(tmp4, tmp1, tmp2);
3318 __ ld(cnt1, Address(str2));
3319 __ addi(str2, str2, 8);
3320 __ bnez(tmp4, DIFF);
3321 __ ld(tmp1, Address(str1));
3322 __ addi(str1, str1, 8);
3323 __ xorr(tmp4, tmp5, cnt1);
3324 __ ld(tmp2, Address(str2));
3325 __ addi(str2, str2, 8);
3326 __ bnez(tmp4, DIFF2);
3327
3328 __ subi(cnt2, cnt2, isLL ? 16 : 8);
3329 __ bgez(cnt2, SMALL_LOOP);
3330 __ bind(TAIL);
3331 __ addi(cnt2, cnt2, isLL ? 16 : 8);
3332 __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
3333 __ subi(cnt2, cnt2, isLL ? 8 : 4);
3334 __ blez(cnt2, CHECK_LAST);
3335 __ xorr(tmp4, tmp1, tmp2);
3336 __ bnez(tmp4, DIFF);
3337 __ ld(tmp1, Address(str1));
3338 __ addi(str1, str1, 8);
3339 __ ld(tmp2, Address(str2));
3340 __ addi(str2, str2, 8);
3341 __ subi(cnt2, cnt2, isLL ? 8 : 4);
3342 __ bind(CHECK_LAST);
3343 if (!isLL) {
3344 __ add(cnt2, cnt2, cnt2); // now in bytes
3345 }
3346 __ xorr(tmp4, tmp1, tmp2);
3347 __ bnez(tmp4, DIFF);
3348 __ add(str1, str1, cnt2);
3349 __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
3350 __ add(str2, str2, cnt2);
3351 __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
3352 __ xorr(tmp4, tmp5, cnt1);
3353 __ beqz(tmp4, LENGTH_DIFF);
3354 // Find the first different characters in the longwords and
3355 // compute their difference.
3356 __ bind(DIFF2);
3357 // count bits of trailing zero chars
3358 __ ctzc_bits(tmp3, tmp4, isLL);
3359 __ srl(tmp5, tmp5, tmp3);
3360 __ srl(cnt1, cnt1, tmp3);
3361 if (isLL) {
3362 __ zext(tmp5, tmp5, 8);
3363 __ zext(cnt1, cnt1, 8);
3364 } else {
3365 __ zext(tmp5, tmp5, 16);
3366 __ zext(cnt1, cnt1, 16);
3367 }
3368 __ sub(result, tmp5, cnt1);
3369 __ j(LENGTH_DIFF);
3370 __ bind(DIFF);
3371 // count bits of trailing zero chars
3372 __ ctzc_bits(tmp3, tmp4, isLL);
3373 __ srl(tmp1, tmp1, tmp3);
3374 __ srl(tmp2, tmp2, tmp3);
3375 if (isLL) {
3376 __ zext(tmp1, tmp1, 8);
3377 __ zext(tmp2, tmp2, 8);
3378 } else {
3379 __ zext(tmp1, tmp1, 16);
3380 __ zext(tmp2, tmp2, 16);
3381 }
3382 __ sub(result, tmp1, tmp2);
3383 __ j(LENGTH_DIFF);
3384 __ bind(LAST_CHECK_AND_LENGTH_DIFF);
3385 __ xorr(tmp4, tmp1, tmp2);
3386 __ bnez(tmp4, DIFF);
3387 __ bind(LENGTH_DIFF);
3388 __ pop_reg(spilled_regs, sp);
3389 __ ret();
3390 return entry;
3391 }
3392
3393 void generate_compare_long_strings() {
3394 StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(StubId::stubgen_compare_long_string_LL_id);
3395 StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(StubId::stubgen_compare_long_string_UU_id);
3396 StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(StubId::stubgen_compare_long_string_LU_id);
3397 StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(StubId::stubgen_compare_long_string_UL_id);
3398 }
3399
3400 // x10 result
3401 // x11 src
3402 // x12 src count
3403 // x13 pattern
3404 // x14 pattern count
3405 address generate_string_indexof_linear(StubId stub_id)
3406 {
3407 bool needle_isL;
3408 bool haystack_isL;
3409 switch (stub_id) {
3410 case StubId::stubgen_string_indexof_linear_ll_id:
3411 needle_isL = true;
3412 haystack_isL = true;
3413 break;
3414 case StubId::stubgen_string_indexof_linear_ul_id:
3415 needle_isL = true;
3416 haystack_isL = false;
3417 break;
3418 case StubId::stubgen_string_indexof_linear_uu_id:
3419 needle_isL = false;
3420 haystack_isL = false;
3421 break;
3422 default:
3423 ShouldNotReachHere();
3424 };
3425
3426 __ align(CodeEntryAlignment);
3427 StubCodeMark mark(this, stub_id);
3428 address entry = __ pc();
3429
3430 int needle_chr_size = needle_isL ? 1 : 2;
3431 int haystack_chr_size = haystack_isL ? 1 : 2;
3432 int needle_chr_shift = needle_isL ? 0 : 1;
3433 int haystack_chr_shift = haystack_isL ? 0 : 1;
3434 bool isL = needle_isL && haystack_isL;
3435 // parameters
3436 Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
3437 // temporary registers
3438 Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
3439 // redefinitions
3440 Register ch1 = x28, ch2 = x29;
3441 RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
3442
3443 __ push_reg(spilled_regs, sp);
3444
3445 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
3446 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
3447 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
3448 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
3449 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
3450 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
3451
3452 __ ld(ch1, Address(needle));
3453 __ ld(ch2, Address(haystack));
3454 // src.length - pattern.length
3455 __ sub(haystack_len, haystack_len, needle_len);
3456
3457 // first is needle[0]
3458 __ zext(first, ch1, needle_isL ? 8 : 16);
3459
3460 uint64_t mask0101 = UCONST64(0x0101010101010101);
3461 uint64_t mask0001 = UCONST64(0x0001000100010001);
3462 __ mv(mask1, haystack_isL ? mask0101 : mask0001);
3463 __ mul(first, first, mask1);
3464 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
3465 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
3466 __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
3467 if (needle_isL != haystack_isL) {
3468 __ mv(tmp, ch1);
3469 }
3470 __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
3471 __ blez(haystack_len, L_SMALL);
3472
3473 if (needle_isL != haystack_isL) {
3474 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
3475 }
3476 // xorr, sub, orr, notr, andr
3477 // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
3478 // eg:
3479 // first: aa aa aa aa aa aa aa aa
3480 // ch2: aa aa li nx jd ka aa aa
3481 // match_mask: 80 80 00 00 00 00 80 80
3482 __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
3483
3484 // search first char of needle, if success, goto L_HAS_ZERO;
3485 __ bnez(match_mask, L_HAS_ZERO);
3486 __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
3487 __ addi(result, result, wordSize / haystack_chr_size);
3488 __ addi(haystack, haystack, wordSize);
3489 __ bltz(haystack_len, L_POST_LOOP);
3490
3491 __ bind(L_LOOP);
3492 __ ld(ch2, Address(haystack));
3493 __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
3494 __ bnez(match_mask, L_HAS_ZERO);
3495
3496 __ bind(L_LOOP_PROCEED);
3497 __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
3498 __ addi(haystack, haystack, wordSize);
3499 __ addi(result, result, wordSize / haystack_chr_size);
3500 __ bgez(haystack_len, L_LOOP);
3501
3502 __ bind(L_POST_LOOP);
3503 __ mv(ch2, -wordSize / haystack_chr_size);
3504 __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
3505 __ ld(ch2, Address(haystack));
3506 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
3507 __ neg(haystack_len, haystack_len);
3508 __ xorr(ch2, first, ch2);
3509 __ sub(match_mask, ch2, mask1);
3510 __ orr(ch2, ch2, mask2);
3511 __ mv(trailing_zeros, -1); // all bits set
3512 __ j(L_SMALL_PROCEED);
3513
3514 __ align(OptoLoopAlignment);
3515 __ bind(L_SMALL);
3516 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
3517 __ neg(haystack_len, haystack_len);
3518 if (needle_isL != haystack_isL) {
3519 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
3520 }
3521 __ xorr(ch2, first, ch2);
3522 __ sub(match_mask, ch2, mask1);
3523 __ orr(ch2, ch2, mask2);
3524 __ mv(trailing_zeros, -1); // all bits set
3525
3526 __ bind(L_SMALL_PROCEED);
3527 __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
3528 __ notr(ch2, ch2);
3529 __ andr(match_mask, match_mask, ch2);
3530 __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
3531 __ beqz(match_mask, NOMATCH);
3532
3533 __ bind(L_SMALL_HAS_ZERO_LOOP);
3534 // count bits of trailing zero chars
3535 __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, ch2, tmp);
3536 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3537 __ mv(ch2, wordSize / haystack_chr_size);
3538 __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
3539 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3540 __ mv(trailing_zeros, wordSize / haystack_chr_size);
3541 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3542
3543 __ bind(L_SMALL_CMP_LOOP);
3544 __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
3545 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
3546 needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
3547 haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
3548 __ addi(trailing_zeros, trailing_zeros, 1);
3549 __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
3550 __ beq(first, ch2, L_SMALL_CMP_LOOP);
3551
3552 __ bind(L_SMALL_CMP_LOOP_NOMATCH);
3553 __ beqz(match_mask, NOMATCH);
3554 // count bits of trailing zero chars
3555 __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
3556 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3557 __ addi(result, result, 1);
3558 __ addi(haystack, haystack, haystack_chr_size);
3559 __ j(L_SMALL_HAS_ZERO_LOOP);
3560
3561 __ align(OptoLoopAlignment);
3562 __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
3563 __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3564 __ j(DONE);
3565
3566 __ align(OptoLoopAlignment);
3567 __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
3568 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3569 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3570 __ j(DONE);
3571
3572 __ align(OptoLoopAlignment);
3573 __ bind(L_HAS_ZERO);
3574 // count bits of trailing zero chars
3575 __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
3576 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3577 __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
3578 __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits)
3579 __ subi(result, result, 1); // array index from 0, so result -= 1
3580
3581 __ bind(L_HAS_ZERO_LOOP);
3582 __ mv(needle_len, wordSize / haystack_chr_size);
3583 __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
3584 __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
3585 // load next 8 bytes from haystack, and increase result index
3586 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3587 __ addi(result, result, 1);
3588 __ mv(trailing_zeros, wordSize / haystack_chr_size);
3589 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
3590
3591 // compare one char
3592 __ bind(L_CMP_LOOP);
3593 __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
3594 needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
3595 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
3596 haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
3597 __ addi(trailing_zeros, trailing_zeros, 1); // next char index
3598 __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
3599 __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
3600 __ beq(needle_len, ch2, L_CMP_LOOP);
3601
3602 __ bind(L_CMP_LOOP_NOMATCH);
3603 __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
3604 // count bits of trailing zero chars
3605 __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, needle_len, ch2);
3606 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3607 __ addi(haystack, haystack, haystack_chr_size);
3608 __ j(L_HAS_ZERO_LOOP);
3609
3610 __ align(OptoLoopAlignment);
3611 __ bind(L_CMP_LOOP_LAST_CMP);
3612 __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
3613 __ j(DONE);
3614
3615 __ align(OptoLoopAlignment);
3616 __ bind(L_CMP_LOOP_LAST_CMP2);
3617 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3618 __ addi(result, result, 1);
3619 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
3620 __ j(DONE);
3621
3622 __ align(OptoLoopAlignment);
3623 __ bind(L_HAS_ZERO_LOOP_NOMATCH);
3624 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
3625 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
3626 // so, result was increased at max by wordSize/str2_chr_size - 1, so,
3627 // respective high bit wasn't changed. L_LOOP_PROCEED will increase
3628 // result by analyzed characters value, so, we can just reset lower bits
3629 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
3630 // 2) restore needle_len and haystack_len values from "compressed" haystack_len
3631 // 3) advance haystack value to represent next haystack octet. result & 7/3 is
3632 // index of last analyzed substring inside current octet. So, haystack in at
3633 // respective start address. We need to advance it to next octet
3634 __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
3635 __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
3636 __ andi(result, result, haystack_isL ? -8 : -4);
3637 __ slli(tmp, match_mask, haystack_chr_shift);
3638 __ sub(haystack, haystack, tmp);
3639 __ sext(haystack_len, haystack_len, 32);
3640 __ j(L_LOOP_PROCEED);
3641
3642 __ align(OptoLoopAlignment);
3643 __ bind(NOMATCH);
3644 __ mv(result, -1);
3645
3646 __ bind(DONE);
3647 __ pop_reg(spilled_regs, sp);
3648 __ ret();
3649 return entry;
3650 }
3651
3652 void generate_string_indexof_stubs()
3653 {
3654 StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_ll_id);
3655 StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_uu_id);
3656 StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_ul_id);
3657 }
3658
3659 #ifdef COMPILER2
3660 void generate_lookup_secondary_supers_table_stub() {
3661 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
3662 StubCodeMark mark(this, stub_id);
3663
3664 const Register
3665 r_super_klass = x10,
3666 r_array_base = x11,
3667 r_array_length = x12,
3668 r_array_index = x13,
3669 r_sub_klass = x14,
3670 result = x15,
3671 r_bitmap = x16;
3672
3673 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
3674 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
3675 Label L_success;
3676 __ enter();
3677 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, result,
3678 r_array_base, r_array_length, r_array_index,
3679 r_bitmap, slot, /*stub_is_near*/true);
3680 __ leave();
3681 __ ret();
3682 }
3683 }
3684
3685 // Slow path implementation for UseSecondarySupersTable.
3686 address generate_lookup_secondary_supers_table_slow_path_stub() {
3687 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
3688 StubCodeMark mark(this, stub_id);
3689
3690 address start = __ pc();
3691 const Register
3692 r_super_klass = x10, // argument
3693 r_array_base = x11, // argument
3694 temp1 = x12, // tmp
3695 r_array_index = x13, // argument
3696 result = x15, // argument
3697 r_bitmap = x16; // argument
3698
3699
3700 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
3701 __ ret();
3702
3703 return start;
3704 }
3705
3706 address generate_mulAdd()
3707 {
3708 __ align(CodeEntryAlignment);
3709 StubId stub_id = StubId::stubgen_mulAdd_id;
3710 StubCodeMark mark(this, stub_id);
3711
3712 address entry = __ pc();
3713
3714 const Register out = x10;
3715 const Register in = x11;
3716 const Register offset = x12;
3717 const Register len = x13;
3718 const Register k = x14;
3719 const Register tmp = x28;
3720
3721 BLOCK_COMMENT("Entry:");
3722 __ enter();
3723 __ mul_add(out, in, offset, len, k, tmp);
3724 __ leave();
3725 __ ret();
3726
3727 return entry;
3728 }
3729
3730 /**
3731 * Arguments:
3732 *
3733 * Input:
3734 * c_rarg0 - x address
3735 * c_rarg1 - x length
3736 * c_rarg2 - y address
3737 * c_rarg3 - y length
3738 * c_rarg4 - z address
3739 */
3740 address generate_multiplyToLen()
3741 {
3742 __ align(CodeEntryAlignment);
3743 StubId stub_id = StubId::stubgen_multiplyToLen_id;
3744 StubCodeMark mark(this, stub_id);
3745 address entry = __ pc();
3746
3747 const Register x = x10;
3748 const Register xlen = x11;
3749 const Register y = x12;
3750 const Register ylen = x13;
3751 const Register z = x14;
3752
3753 const Register tmp0 = x15;
3754 const Register tmp1 = x16;
3755 const Register tmp2 = x17;
3756 const Register tmp3 = x7;
3757 const Register tmp4 = x28;
3758 const Register tmp5 = x29;
3759 const Register tmp6 = x30;
3760 const Register tmp7 = x31;
3761
3762 BLOCK_COMMENT("Entry:");
3763 __ enter(); // required for proper stackwalking of RuntimeStub frame
3764 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3765 __ leave(); // required for proper stackwalking of RuntimeStub frame
3766 __ ret();
3767
3768 return entry;
3769 }
3770
3771 address generate_squareToLen()
3772 {
3773 __ align(CodeEntryAlignment);
3774 StubId stub_id = StubId::stubgen_squareToLen_id;
3775 StubCodeMark mark(this, stub_id);
3776 address entry = __ pc();
3777
3778 const Register x = x10;
3779 const Register xlen = x11;
3780 const Register z = x12;
3781 const Register y = x14; // == x
3782 const Register ylen = x15; // == xlen
3783
3784 const Register tmp0 = x13; // zlen, unused
3785 const Register tmp1 = x16;
3786 const Register tmp2 = x17;
3787 const Register tmp3 = x7;
3788 const Register tmp4 = x28;
3789 const Register tmp5 = x29;
3790 const Register tmp6 = x30;
3791 const Register tmp7 = x31;
3792
3793 BLOCK_COMMENT("Entry:");
3794 __ enter();
3795 __ mv(y, x);
3796 __ mv(ylen, xlen);
3797 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3798 __ leave();
3799 __ ret();
3800
3801 return entry;
3802 }
3803
3804 // Arguments:
3805 //
3806 // Input:
3807 // c_rarg0 - newArr address
3808 // c_rarg1 - oldArr address
3809 // c_rarg2 - newIdx
3810 // c_rarg3 - shiftCount
3811 // c_rarg4 - numIter
3812 //
3813 address generate_bigIntegerLeftShift() {
3814 __ align(CodeEntryAlignment);
3815 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
3816 StubCodeMark mark(this, stub_id);
3817 address entry = __ pc();
3818
3819 Label loop, exit;
3820
3821 Register newArr = c_rarg0;
3822 Register oldArr = c_rarg1;
3823 Register newIdx = c_rarg2;
3824 Register shiftCount = c_rarg3;
3825 Register numIter = c_rarg4;
3826
3827 Register shiftRevCount = c_rarg5;
3828 Register oldArrNext = t1;
3829
3830 __ beqz(numIter, exit);
3831 __ shadd(newArr, newIdx, newArr, t0, 2);
3832
3833 __ mv(shiftRevCount, 32);
3834 __ sub(shiftRevCount, shiftRevCount, shiftCount);
3835
3836 __ bind(loop);
3837 __ addi(oldArrNext, oldArr, 4);
3838 __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
3839 __ vle32_v(v0, oldArr);
3840 __ vle32_v(v4, oldArrNext);
3841 __ vsll_vx(v0, v0, shiftCount);
3842 __ vsrl_vx(v4, v4, shiftRevCount);
3843 __ vor_vv(v0, v0, v4);
3844 __ vse32_v(v0, newArr);
3845 __ sub(numIter, numIter, t0);
3846 __ shadd(oldArr, t0, oldArr, t1, 2);
3847 __ shadd(newArr, t0, newArr, t1, 2);
3848 __ bnez(numIter, loop);
3849
3850 __ bind(exit);
3851 __ ret();
3852
3853 return entry;
3854 }
3855
3856 // Arguments:
3857 //
3858 // Input:
3859 // c_rarg0 - newArr address
3860 // c_rarg1 - oldArr address
3861 // c_rarg2 - newIdx
3862 // c_rarg3 - shiftCount
3863 // c_rarg4 - numIter
3864 //
3865 address generate_bigIntegerRightShift() {
3866 __ align(CodeEntryAlignment);
3867 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
3868 StubCodeMark mark(this, stub_id);
3869 address entry = __ pc();
3870
3871 Label loop, exit;
3872
3873 Register newArr = c_rarg0;
3874 Register oldArr = c_rarg1;
3875 Register newIdx = c_rarg2;
3876 Register shiftCount = c_rarg3;
3877 Register numIter = c_rarg4;
3878 Register idx = numIter;
3879
3880 Register shiftRevCount = c_rarg5;
3881 Register oldArrNext = c_rarg6;
3882 Register newArrCur = t0;
3883 Register oldArrCur = t1;
3884
3885 __ beqz(idx, exit);
3886 __ shadd(newArr, newIdx, newArr, t0, 2);
3887
3888 __ mv(shiftRevCount, 32);
3889 __ sub(shiftRevCount, shiftRevCount, shiftCount);
3890
3891 __ bind(loop);
3892 __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
3893 __ sub(idx, idx, t0);
3894 __ shadd(oldArrNext, idx, oldArr, t1, 2);
3895 __ shadd(newArrCur, idx, newArr, t1, 2);
3896 __ addi(oldArrCur, oldArrNext, 4);
3897 __ vle32_v(v0, oldArrCur);
3898 __ vle32_v(v4, oldArrNext);
3899 __ vsrl_vx(v0, v0, shiftCount);
3900 __ vsll_vx(v4, v4, shiftRevCount);
3901 __ vor_vv(v0, v0, v4);
3902 __ vse32_v(v0, newArrCur);
3903 __ bnez(idx, loop);
3904
3905 __ bind(exit);
3906 __ ret();
3907
3908 return entry;
3909 }
3910 #endif
3911
3912 #ifdef COMPILER2
3913 class MontgomeryMultiplyGenerator : public MacroAssembler {
3914
3915 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3916 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
3917
3918 RegSet _toSave;
3919 bool _squaring;
3920
3921 public:
3922 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3923 : MacroAssembler(as->code()), _squaring(squaring) {
3924
3925 // Register allocation
3926
3927 RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
3928 Pa_base = *regs; // Argument registers
3929 if (squaring) {
3930 Pb_base = Pa_base;
3931 } else {
3932 Pb_base = *++regs;
3933 }
3934 Pn_base = *++regs;
3935 Rlen= *++regs;
3936 inv = *++regs;
3937 Pm_base = *++regs;
3938
3939 // Working registers:
3940 Ra = *++regs; // The current digit of a, b, n, and m.
3941 Rb = *++regs;
3942 Rm = *++regs;
3943 Rn = *++regs;
3944
3945 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
3946 Pb = *++regs;
3947 Pm = *++regs;
3948 Pn = *++regs;
3949
3950 tmp0 = *++regs; // Three registers which form a
3951 tmp1 = *++regs; // triple-precision accumuator.
3952 tmp2 = *++regs;
3953
3954 Ri = x6; // Inner and outer loop indexes.
3955 Rj = x7;
3956
3957 Rhi_ab = x28; // Product registers: low and high parts
3958 Rlo_ab = x29; // of a*b and m*n.
3959 Rhi_mn = x30;
3960 Rlo_mn = x31;
3961
3962 // x18 and up are callee-saved.
3963 _toSave = RegSet::range(x18, *regs) + Pm_base;
3964 }
3965
3966 private:
3967 void save_regs() {
3968 push_reg(_toSave, sp);
3969 }
3970
3971 void restore_regs() {
3972 pop_reg(_toSave, sp);
3973 }
3974
3975 template <typename T>
3976 void unroll_2(Register count, T block) {
3977 Label loop, end, odd;
3978 beqz(count, end);
3979 test_bit(t0, count, 0);
3980 bnez(t0, odd);
3981 align(16);
3982 bind(loop);
3983 (this->*block)();
3984 bind(odd);
3985 (this->*block)();
3986 subi(count, count, 2);
3987 bgtz(count, loop);
3988 bind(end);
3989 }
3990
3991 template <typename T>
3992 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3993 Label loop, end, odd;
3994 beqz(count, end);
3995 test_bit(tmp, count, 0);
3996 bnez(tmp, odd);
3997 align(16);
3998 bind(loop);
3999 (this->*block)(d, s, tmp);
4000 bind(odd);
4001 (this->*block)(d, s, tmp);
4002 subi(count, count, 2);
4003 bgtz(count, loop);
4004 bind(end);
4005 }
4006
4007 void pre1(RegisterOrConstant i) {
4008 block_comment("pre1");
4009 // Pa = Pa_base;
4010 // Pb = Pb_base + i;
4011 // Pm = Pm_base;
4012 // Pn = Pn_base + i;
4013 // Ra = *Pa;
4014 // Rb = *Pb;
4015 // Rm = *Pm;
4016 // Rn = *Pn;
4017 if (i.is_register()) {
4018 slli(t0, i.as_register(), LogBytesPerWord);
4019 } else {
4020 mv(t0, i.as_constant());
4021 slli(t0, t0, LogBytesPerWord);
4022 }
4023
4024 mv(Pa, Pa_base);
4025 add(Pb, Pb_base, t0);
4026 mv(Pm, Pm_base);
4027 add(Pn, Pn_base, t0);
4028
4029 ld(Ra, Address(Pa));
4030 ld(Rb, Address(Pb));
4031 ld(Rm, Address(Pm));
4032 ld(Rn, Address(Pn));
4033
4034 // Zero the m*n result.
4035 mv(Rhi_mn, zr);
4036 mv(Rlo_mn, zr);
4037 }
4038
4039 // The core multiply-accumulate step of a Montgomery
4040 // multiplication. The idea is to schedule operations as a
4041 // pipeline so that instructions with long latencies (loads and
4042 // multiplies) have time to complete before their results are
4043 // used. This most benefits in-order implementations of the
4044 // architecture but out-of-order ones also benefit.
4045 void step() {
4046 block_comment("step");
4047 // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4048 // Ra = *++Pa;
4049 // Rb = *--Pb;
4050 mulhu(Rhi_ab, Ra, Rb);
4051 mul(Rlo_ab, Ra, Rb);
4052 addi(Pa, Pa, wordSize);
4053 ld(Ra, Address(Pa));
4054 subi(Pb, Pb, wordSize);
4055 ld(Rb, Address(Pb));
4056 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
4057 // previous iteration.
4058 // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4059 // Rm = *++Pm;
4060 // Rn = *--Pn;
4061 mulhu(Rhi_mn, Rm, Rn);
4062 mul(Rlo_mn, Rm, Rn);
4063 addi(Pm, Pm, wordSize);
4064 ld(Rm, Address(Pm));
4065 subi(Pn, Pn, wordSize);
4066 ld(Rn, Address(Pn));
4067 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4068 }
4069
4070 void post1() {
4071 block_comment("post1");
4072
4073 // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4074 // Ra = *++Pa;
4075 // Rb = *--Pb;
4076 mulhu(Rhi_ab, Ra, Rb);
4077 mul(Rlo_ab, Ra, Rb);
4078 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n
4079 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4080
4081 // *Pm = Rm = tmp0 * inv;
4082 mul(Rm, tmp0, inv);
4083 sd(Rm, Address(Pm));
4084
4085 // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4086 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
4087 mulhu(Rhi_mn, Rm, Rn);
4088
4089 #ifndef PRODUCT
4090 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
4091 {
4092 mul(Rlo_mn, Rm, Rn);
4093 add(Rlo_mn, tmp0, Rlo_mn);
4094 Label ok;
4095 beqz(Rlo_mn, ok);
4096 stop("broken Montgomery multiply");
4097 bind(ok);
4098 }
4099 #endif
4100 // We have very carefully set things up so that
4101 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
4102 // the lower half of Rm * Rn because we know the result already:
4103 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff
4104 // tmp0 != 0. So, rather than do a mul and an cad we just set
4105 // the carry flag iff tmp0 is nonzero.
4106 //
4107 // mul(Rlo_mn, Rm, Rn);
4108 // cad(zr, tmp0, Rlo_mn);
4109 subi(t0, tmp0, 1);
4110 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
4111 cadc(tmp0, tmp1, Rhi_mn, t0);
4112 adc(tmp1, tmp2, zr, t0);
4113 mv(tmp2, zr);
4114 }
4115
4116 void pre2(Register i, Register len) {
4117 block_comment("pre2");
4118 // Pa = Pa_base + i-len;
4119 // Pb = Pb_base + len;
4120 // Pm = Pm_base + i-len;
4121 // Pn = Pn_base + len;
4122
4123 sub(Rj, i, len);
4124 // Rj == i-len
4125
4126 // Ra as temp register
4127 slli(Ra, Rj, LogBytesPerWord);
4128 add(Pa, Pa_base, Ra);
4129 add(Pm, Pm_base, Ra);
4130 slli(Ra, len, LogBytesPerWord);
4131 add(Pb, Pb_base, Ra);
4132 add(Pn, Pn_base, Ra);
4133
4134 // Ra = *++Pa;
4135 // Rb = *--Pb;
4136 // Rm = *++Pm;
4137 // Rn = *--Pn;
4138 addi(Pa, Pa, wordSize);
4139 ld(Ra, Address(Pa));
4140 subi(Pb, Pb, wordSize);
4141 ld(Rb, Address(Pb));
4142 addi(Pm, Pm, wordSize);
4143 ld(Rm, Address(Pm));
4144 subi(Pn, Pn, wordSize);
4145 ld(Rn, Address(Pn));
4146
4147 mv(Rhi_mn, zr);
4148 mv(Rlo_mn, zr);
4149 }
4150
4151 void post2(Register i, Register len) {
4152 block_comment("post2");
4153 sub(Rj, i, len);
4154
4155 cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
4156
4157 // As soon as we know the least significant digit of our result,
4158 // store it.
4159 // Pm_base[i-len] = tmp0;
4160 // Rj as temp register
4161 slli(Rj, Rj, LogBytesPerWord);
4162 add(Rj, Pm_base, Rj);
4163 sd(tmp0, Address(Rj));
4164
4165 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
4166 cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
4167 adc(tmp1, tmp2, zr, t0);
4168 mv(tmp2, zr);
4169 }
4170
4171 // A carry in tmp0 after Montgomery multiplication means that we
4172 // should subtract multiples of n from our result in m. We'll
4173 // keep doing that until there is no carry.
4174 void normalize(Register len) {
4175 block_comment("normalize");
4176 // while (tmp0)
4177 // tmp0 = sub(Pm_base, Pn_base, tmp0, len);
4178 Label loop, post, again;
4179 Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
4180 beqz(tmp0, post); {
4181 bind(again); {
4182 mv(i, zr);
4183 mv(cnt, len);
4184 slli(Rn, i, LogBytesPerWord);
4185 add(Rm, Pm_base, Rn);
4186 ld(Rm, Address(Rm));
4187 add(Rn, Pn_base, Rn);
4188 ld(Rn, Address(Rn));
4189 mv(t0, 1); // set carry flag, i.e. no borrow
4190 align(16);
4191 bind(loop); {
4192 notr(Rn, Rn);
4193 add(Rm, Rm, t0);
4194 add(Rm, Rm, Rn);
4195 sltu(t0, Rm, Rn);
4196 slli(Rn, i, LogBytesPerWord); // Rn as temp register
4197 add(Rn, Pm_base, Rn);
4198 sd(Rm, Address(Rn));
4199 addi(i, i, 1);
4200 slli(Rn, i, LogBytesPerWord);
4201 add(Rm, Pm_base, Rn);
4202 ld(Rm, Address(Rm));
4203 add(Rn, Pn_base, Rn);
4204 ld(Rn, Address(Rn));
4205 subi(cnt, cnt, 1);
4206 } bnez(cnt, loop);
4207 subi(tmp0, tmp0, 1);
4208 add(tmp0, tmp0, t0);
4209 } bnez(tmp0, again);
4210 } bind(post);
4211 }
4212
4213 // Move memory at s to d, reversing words.
4214 // Increments d to end of copied memory
4215 // Destroys tmp1, tmp2
4216 // Preserves len
4217 // Leaves s pointing to the address which was in d at start
4218 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4219 assert(tmp1->encoding() < x28->encoding(), "register corruption");
4220 assert(tmp2->encoding() < x28->encoding(), "register corruption");
4221
4222 shadd(s, len, s, tmp1, LogBytesPerWord);
4223 mv(tmp1, len);
4224 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4225 slli(tmp1, len, LogBytesPerWord);
4226 sub(s, d, tmp1);
4227 }
4228 // [63...0] -> [31...0][63...32]
4229 void reverse1(Register d, Register s, Register tmp) {
4230 subi(s, s, wordSize);
4231 ld(tmp, Address(s));
4232 ror(tmp, tmp, 32, t0);
4233 sd(tmp, Address(d));
4234 addi(d, d, wordSize);
4235 }
4236
4237 void step_squaring() {
4238 // An extra ACC
4239 step();
4240 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4241 }
4242
4243 void last_squaring(Register i) {
4244 Label dont;
4245 // if ((i & 1) == 0) {
4246 test_bit(t0, i, 0);
4247 bnez(t0, dont); {
4248 // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4249 // Ra = *++Pa;
4250 // Rb = *--Pb;
4251 mulhu(Rhi_ab, Ra, Rb);
4252 mul(Rlo_ab, Ra, Rb);
4253 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4254 } bind(dont);
4255 }
4256
4257 void extra_step_squaring() {
4258 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n
4259
4260 // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4261 // Rm = *++Pm;
4262 // Rn = *--Pn;
4263 mulhu(Rhi_mn, Rm, Rn);
4264 mul(Rlo_mn, Rm, Rn);
4265 addi(Pm, Pm, wordSize);
4266 ld(Rm, Address(Pm));
4267 subi(Pn, Pn, wordSize);
4268 ld(Rn, Address(Pn));
4269 }
4270
4271 void post1_squaring() {
4272 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n
4273
4274 // *Pm = Rm = tmp0 * inv;
4275 mul(Rm, tmp0, inv);
4276 sd(Rm, Address(Pm));
4277
4278 // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4279 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
4280 mulhu(Rhi_mn, Rm, Rn);
4281
4282 #ifndef PRODUCT
4283 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
4284 {
4285 mul(Rlo_mn, Rm, Rn);
4286 add(Rlo_mn, tmp0, Rlo_mn);
4287 Label ok;
4288 beqz(Rlo_mn, ok); {
4289 stop("broken Montgomery multiply");
4290 } bind(ok);
4291 }
4292 #endif
4293 // We have very carefully set things up so that
4294 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
4295 // the lower half of Rm * Rn because we know the result already:
4296 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff
4297 // tmp0 != 0. So, rather than do a mul and a cad we just set
4298 // the carry flag iff tmp0 is nonzero.
4299 //
4300 // mul(Rlo_mn, Rm, Rn);
4301 // cad(zr, tmp, Rlo_mn);
4302 subi(t0, tmp0, 1);
4303 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
4304 cadc(tmp0, tmp1, Rhi_mn, t0);
4305 adc(tmp1, tmp2, zr, t0);
4306 mv(tmp2, zr);
4307 }
4308
4309 // use t0 as carry
4310 void acc(Register Rhi, Register Rlo,
4311 Register tmp0, Register tmp1, Register tmp2) {
4312 cad(tmp0, tmp0, Rlo, t0);
4313 cadc(tmp1, tmp1, Rhi, t0);
4314 adc(tmp2, tmp2, zr, t0);
4315 }
4316
4317 public:
4318 /**
4319 * Fast Montgomery multiplication. The derivation of the
4320 * algorithm is in A Cryptographic Library for the Motorola
4321 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4322 *
4323 * Arguments:
4324 *
4325 * Inputs for multiplication:
4326 * c_rarg0 - int array elements a
4327 * c_rarg1 - int array elements b
4328 * c_rarg2 - int array elements n (the modulus)
4329 * c_rarg3 - int length
4330 * c_rarg4 - int inv
4331 * c_rarg5 - int array elements m (the result)
4332 *
4333 * Inputs for squaring:
4334 * c_rarg0 - int array elements a
4335 * c_rarg1 - int array elements n (the modulus)
4336 * c_rarg2 - int length
4337 * c_rarg3 - int inv
4338 * c_rarg4 - int array elements m (the result)
4339 *
4340 */
4341 address generate_multiply() {
4342 Label argh, nothing;
4343 bind(argh);
4344 stop("MontgomeryMultiply total_allocation must be <= 8192");
4345
4346 align(CodeEntryAlignment);
4347 address entry = pc();
4348
4349 beqz(Rlen, nothing);
4350
4351 enter();
4352
4353 // Make room.
4354 mv(Ra, 512);
4355 bgt(Rlen, Ra, argh);
4356 slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
4357 sub(Ra, sp, Ra);
4358 andi(sp, Ra, -2 * wordSize);
4359
4360 srliw(Rlen, Rlen, 1); // length in longwords = len/2
4361
4362 {
4363 // Copy input args, reversing as we go. We use Ra as a
4364 // temporary variable.
4365 reverse(Ra, Pa_base, Rlen, Ri, Rj);
4366 if (!_squaring)
4367 reverse(Ra, Pb_base, Rlen, Ri, Rj);
4368 reverse(Ra, Pn_base, Rlen, Ri, Rj);
4369 }
4370
4371 // Push all call-saved registers and also Pm_base which we'll need
4372 // at the end.
4373 save_regs();
4374
4375 #ifndef PRODUCT
4376 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4377 {
4378 ld(Rn, Address(Pn_base));
4379 mul(Rlo_mn, Rn, inv);
4380 mv(t0, -1);
4381 Label ok;
4382 beq(Rlo_mn, t0, ok);
4383 stop("broken inverse in Montgomery multiply");
4384 bind(ok);
4385 }
4386 #endif
4387
4388 mv(Pm_base, Ra);
4389
4390 mv(tmp0, zr);
4391 mv(tmp1, zr);
4392 mv(tmp2, zr);
4393
4394 block_comment("for (int i = 0; i < len; i++) {");
4395 mv(Ri, zr); {
4396 Label loop, end;
4397 bge(Ri, Rlen, end);
4398
4399 bind(loop);
4400 pre1(Ri);
4401
4402 block_comment(" for (j = i; j; j--) {"); {
4403 mv(Rj, Ri);
4404 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4405 } block_comment(" } // j");
4406
4407 post1();
4408 addiw(Ri, Ri, 1);
4409 blt(Ri, Rlen, loop);
4410 bind(end);
4411 block_comment("} // i");
4412 }
4413
4414 block_comment("for (int i = len; i < 2*len; i++) {");
4415 mv(Ri, Rlen); {
4416 Label loop, end;
4417 slli(t0, Rlen, 1);
4418 bge(Ri, t0, end);
4419
4420 bind(loop);
4421 pre2(Ri, Rlen);
4422
4423 block_comment(" for (j = len*2-i-1; j; j--) {"); {
4424 slliw(Rj, Rlen, 1);
4425 subw(Rj, Rj, Ri);
4426 subiw(Rj, Rj, 1);
4427 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4428 } block_comment(" } // j");
4429
4430 post2(Ri, Rlen);
4431 addiw(Ri, Ri, 1);
4432 slli(t0, Rlen, 1);
4433 blt(Ri, t0, loop);
4434 bind(end);
4435 }
4436 block_comment("} // i");
4437
4438 normalize(Rlen);
4439
4440 mv(Ra, Pm_base); // Save Pm_base in Ra
4441 restore_regs(); // Restore caller's Pm_base
4442
4443 // Copy our result into caller's Pm_base
4444 reverse(Pm_base, Ra, Rlen, Ri, Rj);
4445
4446 leave();
4447 bind(nothing);
4448 ret();
4449
4450 return entry;
4451 }
4452
4453 /**
4454 *
4455 * Arguments:
4456 *
4457 * Inputs:
4458 * c_rarg0 - int array elements a
4459 * c_rarg1 - int array elements n (the modulus)
4460 * c_rarg2 - int length
4461 * c_rarg3 - int inv
4462 * c_rarg4 - int array elements m (the result)
4463 *
4464 */
4465 address generate_square() {
4466 Label argh;
4467 bind(argh);
4468 stop("MontgomeryMultiply total_allocation must be <= 8192");
4469
4470 align(CodeEntryAlignment);
4471 address entry = pc();
4472
4473 enter();
4474
4475 // Make room.
4476 mv(Ra, 512);
4477 bgt(Rlen, Ra, argh);
4478 slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
4479 sub(Ra, sp, Ra);
4480 andi(sp, Ra, -2 * wordSize);
4481
4482 srliw(Rlen, Rlen, 1); // length in longwords = len/2
4483
4484 {
4485 // Copy input args, reversing as we go. We use Ra as a
4486 // temporary variable.
4487 reverse(Ra, Pa_base, Rlen, Ri, Rj);
4488 reverse(Ra, Pn_base, Rlen, Ri, Rj);
4489 }
4490
4491 // Push all call-saved registers and also Pm_base which we'll need
4492 // at the end.
4493 save_regs();
4494
4495 mv(Pm_base, Ra);
4496
4497 mv(tmp0, zr);
4498 mv(tmp1, zr);
4499 mv(tmp2, zr);
4500
4501 block_comment("for (int i = 0; i < len; i++) {");
4502 mv(Ri, zr); {
4503 Label loop, end;
4504 bind(loop);
4505 bge(Ri, Rlen, end);
4506
4507 pre1(Ri);
4508
4509 block_comment("for (j = (i+1)/2; j; j--) {"); {
4510 addi(Rj, Ri, 1);
4511 srliw(Rj, Rj, 1);
4512 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4513 } block_comment(" } // j");
4514
4515 last_squaring(Ri);
4516
4517 block_comment(" for (j = i/2; j; j--) {"); {
4518 srliw(Rj, Ri, 1);
4519 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4520 } block_comment(" } // j");
4521
4522 post1_squaring();
4523 addi(Ri, Ri, 1);
4524 blt(Ri, Rlen, loop);
4525
4526 bind(end);
4527 block_comment("} // i");
4528 }
4529
4530 block_comment("for (int i = len; i < 2*len; i++) {");
4531 mv(Ri, Rlen); {
4532 Label loop, end;
4533 bind(loop);
4534 slli(t0, Rlen, 1);
4535 bge(Ri, t0, end);
4536
4537 pre2(Ri, Rlen);
4538
4539 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
4540 slli(Rj, Rlen, 1);
4541 sub(Rj, Rj, Ri);
4542 subi(Rj, Rj, 1);
4543 srliw(Rj, Rj, 1);
4544 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4545 } block_comment(" } // j");
4546
4547 last_squaring(Ri);
4548
4549 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
4550 slli(Rj, Rlen, 1);
4551 sub(Rj, Rj, Ri);
4552 srliw(Rj, Rj, 1);
4553 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4554 } block_comment(" } // j");
4555
4556 post2(Ri, Rlen);
4557 addi(Ri, Ri, 1);
4558 slli(t0, Rlen, 1);
4559 blt(Ri, t0, loop);
4560
4561 bind(end);
4562 block_comment("} // i");
4563 }
4564
4565 normalize(Rlen);
4566
4567 mv(Ra, Pm_base); // Save Pm_base in Ra
4568 restore_regs(); // Restore caller's Pm_base
4569
4570 // Copy our result into caller's Pm_base
4571 reverse(Pm_base, Ra, Rlen, Ri, Rj);
4572
4573 leave();
4574 ret();
4575
4576 return entry;
4577 }
4578 };
4579
4580 #endif // COMPILER2
4581
4582 address generate_cont_thaw(Continuation::thaw_kind kind) {
4583 bool return_barrier = Continuation::is_thaw_return_barrier(kind);
4584 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
4585
4586 address start = __ pc();
4587
4588 if (return_barrier) {
4589 __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
4590 }
4591
4592 #ifndef PRODUCT
4593 {
4594 Label OK;
4595 __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
4596 __ beq(sp, t0, OK);
4597 __ stop("incorrect sp");
4598 __ bind(OK);
4599 }
4600 #endif
4601
4602 if (return_barrier) {
4603 // preserve possible return value from a method returning to the return barrier
4604 __ subi(sp, sp, 2 * wordSize);
4605 __ fsd(f10, Address(sp, 0 * wordSize));
4606 __ sd(x10, Address(sp, 1 * wordSize));
4607 }
4608
4609 __ mv(c_rarg1, (return_barrier ? 1 : 0));
4610 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
4611 __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames
4612
4613 if (return_barrier) {
4614 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4615 __ ld(x10, Address(sp, 1 * wordSize));
4616 __ fld(f10, Address(sp, 0 * wordSize));
4617 __ addi(sp, sp, 2 * wordSize);
4618 }
4619
4620 #ifndef PRODUCT
4621 {
4622 Label OK;
4623 __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
4624 __ beq(sp, t0, OK);
4625 __ stop("incorrect sp");
4626 __ bind(OK);
4627 }
4628 #endif
4629
4630 Label thaw_success;
4631 // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
4632 __ bnez(t1, thaw_success);
4633 __ j(RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
4634 __ bind(thaw_success);
4635
4636 // make room for the thawed frames
4637 __ sub(t0, sp, t1);
4638 __ andi(sp, t0, -16); // align
4639
4640 if (return_barrier) {
4641 // save original return value -- again
4642 __ subi(sp, sp, 2 * wordSize);
4643 __ fsd(f10, Address(sp, 0 * wordSize));
4644 __ sd(x10, Address(sp, 1 * wordSize));
4645 }
4646
4647 // If we want, we can templatize thaw by kind, and have three different entries
4648 __ mv(c_rarg1, kind);
4649
4650 __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
4651 __ mv(t1, x10); // x10 is the sp of the yielding frame
4652
4653 if (return_barrier) {
4654 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4655 __ ld(x10, Address(sp, 1 * wordSize));
4656 __ fld(f10, Address(sp, 0 * wordSize));
4657 __ addi(sp, sp, 2 * wordSize);
4658 } else {
4659 __ mv(x10, zr); // return 0 (success) from doYield
4660 }
4661
4662 // we're now on the yield frame (which is in an address above us b/c sp has been pushed down)
4663 __ mv(fp, t1);
4664 __ subi(sp, t1, 2 * wordSize); // now pointing to fp spill
4665
4666 if (return_barrier_exception) {
4667 __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
4668 __ verify_oop(x10);
4669 __ mv(x9, x10); // save return value contaning the exception oop in callee-saved x9
4670
4671 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);
4672
4673 // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc
4674
4675 __ mv(x11, x10); // the exception handler
4676 __ mv(x10, x9); // restore return value contaning the exception oop
4677 __ verify_oop(x10);
4678
4679 __ leave();
4680 __ mv(x13, ra);
4681 __ jr(x11); // the exception handler
4682 } else {
4683 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
4684 __ leave();
4685 __ ret();
4686 }
4687
4688 return start;
4689 }
4690
4691 address generate_cont_thaw() {
4692 if (!Continuations::enabled()) return nullptr;
4693
4694 StubId stub_id = StubId::stubgen_cont_thaw_id;
4695 StubCodeMark mark(this, stub_id);
4696 address start = __ pc();
4697 generate_cont_thaw(Continuation::thaw_top);
4698 return start;
4699 }
4700
4701 address generate_cont_returnBarrier() {
4702 if (!Continuations::enabled()) return nullptr;
4703
4704 // TODO: will probably need multiple return barriers depending on return type
4705 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
4706 StubCodeMark mark(this, stub_id);
4707 address start = __ pc();
4708
4709 generate_cont_thaw(Continuation::thaw_return_barrier);
4710
4711 return start;
4712 }
4713
4714 address generate_cont_returnBarrier_exception() {
4715 if (!Continuations::enabled()) return nullptr;
4716
4717 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
4718 StubCodeMark mark(this, stub_id);
4719 address start = __ pc();
4720
4721 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
4722
4723 return start;
4724 }
4725
4726 address generate_cont_preempt_stub() {
4727 if (!Continuations::enabled()) return nullptr;
4728 StubId stub_id = StubId::stubgen_cont_preempt_id;
4729 StubCodeMark mark(this, stub_id);
4730 address start = __ pc();
4731
4732 __ reset_last_Java_frame(true);
4733
4734 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
4735 __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
4736
4737 Label preemption_cancelled;
4738 __ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset()));
4739 __ bnez(t0, preemption_cancelled);
4740
4741 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
4742 SharedRuntime::continuation_enter_cleanup(_masm);
4743 __ leave();
4744 __ ret();
4745
4746 // We acquired the monitor after freezing the frames so call thaw to continue execution.
4747 __ bind(preemption_cancelled);
4748 __ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset()));
4749 __ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize)));
4750 __ la(t1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
4751 __ ld(t1, Address(t1));
4752 __ jr(t1);
4753
4754 return start;
4755 }
4756
4757 #ifdef COMPILER2
4758
4759 #undef __
4760 #define __ this->
4761
4762 class Sha2Generator : public MacroAssembler {
4763 StubCodeGenerator* _cgen;
4764 public:
4765 Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
4766 address generate_sha256_implCompress(StubId stub_id) {
4767 return generate_sha2_implCompress(Assembler::e32, stub_id);
4768 }
4769 address generate_sha512_implCompress(StubId stub_id) {
4770 return generate_sha2_implCompress(Assembler::e64, stub_id);
4771 }
4772 private:
4773
4774 void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
4775 if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
4776 else __ vle64_v(vr, sr);
4777 }
4778
4779 void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
4780 if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
4781 else __ vse64_v(vr, sr);
4782 }
4783
4784 // Overview of the logic in each "quad round".
4785 //
4786 // The code below repeats 16/20 times the logic implementing four rounds
4787 // of the SHA-256/512 core loop as documented by NIST. 16/20 "quad rounds"
4788 // to implementing the 64/80 single rounds.
4789 //
4790 // // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
4791 // // Output:
4792 // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4793 // vl1reXX.v vTmp1, ofs
4794 //
4795 // // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
4796 // addi ofs, ofs, 16/32
4797 //
4798 // // Add constants to message schedule words:
4799 // // Input
4800 // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4801 // // vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
4802 // // Output
4803 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4804 // vadd.vv vTmp0, vTmp1, vW0
4805 //
4806 // // 2 rounds of working variables updates.
4807 // // vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
4808 // // Input:
4809 // // vState1 = {c[t],d[t],g[t],h[t]} " = vState1[t] "
4810 // // vState0 = {a[t],b[t],e[t],f[t]}
4811 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4812 // // Output:
4813 // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] "
4814 // // = {h[t+4],g[t+4],d[t+4],c[t+4]} " = vState1[t+4] "
4815 // vsha2cl.vv vState1, vState0, vTmp0
4816 //
4817 // // 2 rounds of working variables updates.
4818 // // vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
4819 // // Input
4820 // // vState0 = {a[t],b[t],e[t],f[t]} " = vState0[t] "
4821 // // = {h[t+2],g[t+2],d[t+2],c[t+2]} " = vState1[t+2] "
4822 // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] "
4823 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4824 // // Output:
4825 // // vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]} " = vState0[t+4] "
4826 // vsha2ch.vv vState0, vState1, vTmp0
4827 //
4828 // // Combine 2QW into 1QW
4829 // //
4830 // // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
4831 // // vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
4832 // // and it can only take 3 vectors as inputs. Hence we need to combine
4833 // // vW1[0] and vW2[1..3] in a single vector.
4834 // //
4835 // // vmerge Vt4, Vt1, Vt2, V0
4836 // // Input
// // V0 = mask // first word from vW1, 1..3 words from vW2
4838 // // vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
4839 // // vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
4840 // // Output
4841 // // Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
4842 // vmerge.vvm vTmp0, vW2, vW1, v0
4843 //
4844 // // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
4845 // // Input
4846 // // vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]} W[ 3: 0]
4847 // // vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]} W[15:12]
4848 // // vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]} W[11: 9,4]
4849 // // Output (next four message schedule words)
4850 // // vW0 = {W[t+19], W[t+18], W[t+17], W[t+16]} W[19:16]
4851 // vsha2ms.vv vW0, vTmp0, vW3
4852 //
4853 // BEFORE
4854 // vW0 - vW3 hold the message schedule words (initially the block words)
4855 // vW0 = W[ 3: 0] "oldest"
4856 // vW1 = W[ 7: 4]
4857 // vW2 = W[11: 8]
4858 // vW3 = W[15:12] "newest"
4859 //
// vState0 - vState1 hold the working state variables
4861 // vState0 = {a[t],b[t],e[t],f[t]} // initially {H5,H4,H1,H0}
4862 // vState1 = {c[t],d[t],g[t],h[t]} // initially {H7,H6,H3,H2}
4863 //
4864 // AFTER
4865 // vW0 - vW3 hold the message schedule words (initially the block words)
4866 // vW1 = W[ 7: 4] "oldest"
4867 // vW2 = W[11: 8]
4868 // vW3 = W[15:12]
4869 // vW0 = W[19:16] "newest"
4870 //
4871 // vState0 and vState1 hold the working state variables
4872 // vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
4873 // vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
4874 //
4875 // The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
4876 // hence the uses of those vectors rotate in each round, and we get back to the
4877 // initial configuration every 4 quad-rounds. We could avoid those changes at
4878 // the cost of moving those vectors at the end of each quad-rounds.
4879 void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
4880 Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
4881 bool gen_words = true, bool step_const = true) {
4882 __ vleXX_v(vset_sew, vtemp, scalarconst);
4883 if (step_const) {
4884 __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
4885 }
4886 __ vadd_vv(vtemp2, vtemp, rot1);
4887 __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
4888 __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
4889 if (gen_words) {
4890 __ vmerge_vvm(vtemp2, rot3, rot2);
4891 __ vsha2ms_vv(rot1, vtemp2, rot4);
4892 }
4893 }
4894
// Generates the SHA-256 (vset_sew == Assembler::e32) or SHA-512
// (vset_sew == Assembler::e64) compress stub identified by stub_id and
// returns the entry address of the generated code.
//
// Arguments:
//
// Inputs:
//   c_rarg0   - byte[]  source+offset
//   c_rarg1   - int[]   SHA.state
//   c_rarg2   - int     offset
//   c_rarg3   - int     limit
//
// c_rarg2 and c_rarg3 are only used by the multi-block stubs; those also
// return the updated offset in c_rarg0.
//
address generate_sha2_implCompress(Assembler::SEW vset_sew, StubId stub_id) {
  // Round constants K[0..63] used when vset_sew == e32.
  alignas(64) static const uint32_t round_consts_256[64] = {
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
  };
  // Round constants K[0..79] used when vset_sew == e64.
  alignas(64) static const uint64_t round_consts_512[80] = {
    0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
    0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
    0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
    0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
    0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
    0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
    0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
    0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
    0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
    0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
    0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
    0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
    0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
    0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
    0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
    0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
    0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
    0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
    0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
    0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
    0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
    0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
    0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
    0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
    0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
    0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
    0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
  };
  // Size in bytes of four elements (16 for e32, 32 for e64); used as the
  // stride through the message block and, halved, as the offset of 'c' in
  // the state array.
  const int const_add = vset_sew == Assembler::e32 ? 16 : 32;

  // Derive single-/multi-block mode from the stub id and check that it
  // agrees with the requested element width.
  bool multi_block;
  switch (stub_id) {
  case StubId::stubgen_sha256_implCompress_id:
    assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
    multi_block = false;
    break;
  case StubId::stubgen_sha256_implCompressMB_id:
    assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
    multi_block = true;
    break;
  case StubId::stubgen_sha512_implCompress_id:
    assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
    multi_block = false;
    break;
  case StubId::stubgen_sha512_implCompressMB_id:
    assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
    multi_block = true;
    break;
  default:
    ShouldNotReachHere();
  };
  __ align(CodeEntryAlignment);
  StubCodeMark mark(_cgen, stub_id);
  address start = __ pc();

  Register buf = c_rarg0;
  Register state = c_rarg1;
  Register ofs = c_rarg2;
  Register limit = c_rarg3;
  Register consts = t2; // caller saved
  Register state_c = x28; // caller saved
  VectorRegister vindex = v2;
  VectorRegister vW0 = v4;
  VectorRegister vW1 = v6;
  VectorRegister vW2 = v8;
  VectorRegister vW3 = v10;
  VectorRegister vState0 = v12;
  VectorRegister vState1 = v14;
  VectorRegister vHash0 = v16;
  VectorRegister vHash1 = v18;
  VectorRegister vTmp0 = v20;
  VectorRegister vTmp1 = v22;

  Label multi_block_loop;

  __ enter();

  // Point consts at the start of the round constant table for this variant.
  address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
  la(consts, ExternalAddress(constant_table));

  // Register use in this function:
  //
  // VECTORS
  //  vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/64 bits), hold the message
  //             schedule words (Wt). They start with the message block
  //             content (W0 to W15), then further words in the message
  //             schedule generated via vsha2ms from previous Wt.
  //   Initially:
  //     vW0 = W[  3:0] = { W3,  W2,  W1,  W0}
  //     vW1 = W[  7:4] = { W7,  W6,  W5,  W4}
  //     vW2 = W[ 11:8] = {W11, W10,  W9,  W8}
  //     vW3 = W[15:12] = {W15, W14, W13, W12}
  //
  //  vState0 - vState1 hold the working state variables (a, b, ..., h)
  //    vState0 = {f[t],e[t],b[t],a[t]}
  //    vState1 = {h[t],g[t],d[t],c[t]}
  //   Initially:
  //    vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1}
  //    vState1 = {H7i-1, H6i-1, H3i-1 , H2i-1}
  //
  //  v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
  //
  //  vTmp0 = temporary, Wt+Kt
  //  vTmp1 = temporary, Kt
  //
  //  vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
  //
  // During most of the function the vector state is configured so that each
  // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).

  // vsha2ch/vsha2cl uses EGW of 4*SEW.
  // SHA256 SEW = e32, EGW = 128-bits
  // SHA512 SEW = e64, EGW = 256-bits
  //
  // VLEN is required to be at least 128.
  // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
  //
  // m1/m2: LMUL=1 or 2 (m2 only for SHA512 with VLEN = 128, see below)
  // ta: tail agnostic (don't care about those lanes)
  // ma: mask agnostic (don't care about those lanes)
  // x0 is not written, we know the number of vector elements.

  if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
    __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
  } else {
    __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
  }

  // Byte offsets for the indexed state load/store below, packed one per byte:
  // 0x14,0x10,0x04,0x00 for e32 and 0x28,0x20,0x08,0x00 for e64.
  int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
  __ li(t0, indexes);
  __ vmv_v_x(vindex, t0);

  // Step-over a,b, so we are pointing to c.
  // const_add is equal to 4x state variable, div by 2 is thus 2, a,b
  __ addi(state_c, state, const_add/2);

  // Use index-load to get {f,e,b,a},{h,g,d,c}
  __ vluxei8_v(vState0, state, vindex);
  __ vluxei8_v(vState1, state_c, vindex);

  __ bind(multi_block_loop);

  // Capture the initial H values in vHash0 and vHash1 to allow for computing
  // the resulting H', since H' = H+{a',b',c',...,h'}.
  __ vmv_v_v(vHash0, vState0);
  __ vmv_v_v(vHash1, vState1);

  // Load the 512/1024-bits of the message block in vW0-vW3 and perform
  // an endian swap on each 4/8 bytes element.
  //
  // If Zvkb is not implemented one can use vrgather
  // with an index sequence to byte-swap.
  //  sequence = [3 2 1 0   7 6 5 4  11 10 9 8   15 14 13 12]
  //   <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
  //  this sequence. 'vid' gives us the N.
  __ vleXX_v(vset_sew, vW0, buf);
  __ vrev8_v(vW0, vW0);
  __ addi(buf, buf, const_add);
  __ vleXX_v(vset_sew, vW1, buf);
  __ vrev8_v(vW1, vW1);
  __ addi(buf, buf, const_add);
  __ vleXX_v(vset_sew, vW2, buf);
  __ vrev8_v(vW2, vW2);
  __ addi(buf, buf, const_add);
  __ vleXX_v(vset_sew, vW3, buf);
  __ vrev8_v(vW3, vW3);
  __ addi(buf, buf, const_add);

  // Set v0 up for the vmerge that replaces the first word (idx==0)
  __ vid_v(v0);
  __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0)

  VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
  int rot_pos = 0;
  // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
  // (numbering shown for SHA-256; for SHA-512 this loop emits quad-rounds #0 ... #15)
  const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
  for (int i = 0; i < qr_end; i++) {
    sha2_quad_round(vset_sew,
                    rotation_regs[(rot_pos + 0) & 0x3],
                    rotation_regs[(rot_pos + 1) & 0x3],
                    rotation_regs[(rot_pos + 2) & 0x3],
                    rotation_regs[(rot_pos + 3) & 0x3],
                    consts,
                    vTmp1, vTmp0, vState0, vState1);
    ++rot_pos;
  }
  // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2)
  // (SHA-512: quad-rounds #16 ... #19)
  // Note that we stop generating new message schedule words (Wt, vW0-vW3)
  // as we already generated all the words we end up consuming (i.e., W[63:60]).
  // The constant pointer is not advanced after the very last quad-round.
  const int qr_c_end = qr_end + 4;
  for (int i = qr_end; i < qr_c_end; i++) {
    sha2_quad_round(vset_sew,
                    rotation_regs[(rot_pos + 0) & 0x3],
                    rotation_regs[(rot_pos + 1) & 0x3],
                    rotation_regs[(rot_pos + 2) & 0x3],
                    rotation_regs[(rot_pos + 3) & 0x3],
                    consts,
                    vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
    ++rot_pos;
  }

  //--------------------------------------------------------------------------------
  // Compute the updated hash value H'
  //   H' = H + {h',g',...,b',a'}
  //      = {h,g,...,b,a} + {h',g',...,b',a'}
  //      = {h+h',g+g',...,b+b',a+a'}

  // H' = H+{a',b',c',...,h'}
  __ vadd_vv(vState0, vHash0, vState0);
  __ vadd_vv(vState1, vHash1, vState1);

  if (multi_block) {
    // Rewind consts to the start of the table: it was advanced after every
    // quad-round but the last, i.e. 15 * 16 = 240 bytes (e32) or
    // 19 * 32 = 608 bytes (e64).
    int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
    __ subi(consts, consts, total_adds);
    // One message block is 64 (e32) or 128 (e64) bytes.
    __ addi(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
    // Process another block while ofs <= limit.
    __ ble(ofs, limit, multi_block_loop);
    __ mv(c_rarg0, ofs); // return ofs
  }

  // Store H[0..7] = {a,b,c,d,e,f,g,h} from
  //  vState0 = {f,e,b,a}
  //  vState1 = {h,g,d,c}
  __ vsuxei8_v(vState0, state, vindex);
  __ vsuxei8_v(vState1, state_c, vindex);

  __ leave();
  __ ret();

  return start;
}
5153 };
5154
// From here on the `__` shorthand expands to `_masm->`, replacing whatever
// definition was in effect above.
#undef __
#define __ _masm->
5157
5158 // Set of L registers that correspond to a contiguous memory area.
5159 // Each 64-bit register typically corresponds to 2 32-bit integers.
5160 template <uint L>
5161 class RegCache {
5162 private:
5163 MacroAssembler *_masm;
5164 Register _regs[L];
5165
5166 public:
5167 RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
5168 assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
5169 auto it = rs.begin();
5170 for (auto &r: _regs) {
5171 r = *it;
5172 ++it;
5173 }
5174 }
5175
5176 // generate load for the i'th register
5177 void gen_load(uint i, Register base) {
5178 assert(i < L, "invalid i: %u", i);
5179 __ ld(_regs[i], Address(base, 8 * i));
5180 }
5181
5182 // add i'th 32-bit integer to dest
5183 void add_u32(const Register dest, uint i, const Register rtmp = t0) {
5184 assert(i < 2 * L, "invalid i: %u", i);
5185
5186 if (is_even(i)) {
5187 // Use the bottom 32 bits. No need to mask off the top 32 bits
5188 // as addw will do the right thing.
5189 __ addw(dest, dest, _regs[i / 2]);
5190 } else {
5191 // Use the top 32 bits by right-shifting them.
5192 __ srli(rtmp, _regs[i / 2], 32);
5193 __ addw(dest, dest, rtmp);
5194 }
5195 }
5196 };
5197
// Cache of eight 64-bit registers, i.e. room for sixteen 32-bit words.
typedef RegCache<8> BufRegCache;
5199
// Common tail of the four MD5 step helpers (md5_FF/GG/HH/II):
//   a += value + x + ac;
//   a = Integer.rotateLeft(a, s) + b;
//
//   reg_cache - cached message words; x is its k'th 32-bit word
//   a         - state register updated in place
//   b         - state register added after the rotation
//   c, d      - not referenced here; present so that all step helpers share
//               one argument list
//   k         - index of message word x (add_u32 requires k < 16)
//   s         - left-rotate amount
//   t         - the additive constant ac
//   value     - register holding the round function result computed by the caller
//
// NOTE(review): t1 is handed to addw together with the immediate, presumably
// as scratch for materializing it; add_u32 may write its default temp t0.
void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
                             Register a, Register b, Register c, Register d,
                             int k, int s, int t,
                             Register value) {
  // a += ac
  __ addw(a, a, t, t1);

  // a += x;
  reg_cache.add_u32(a, k);
  // a += value;
  __ addw(a, a, value);

  // a = Integer.rotateLeft(a, s) + b;
  __ rolw(a, a, s);
  __ addw(a, a, b);
}
5218
// MD5 round 1 step:
//   a += ((b & c) | ((~b) & d)) + x + ac;
//   a = Integer.rotateLeft(a, s) + b;
//
// x is the k'th 32-bit word held in reg_cache, t is the constant ac and s the
// rotate amount. rtmp1 and rtmp2 are scratch registers and are overwritten.
void md5_FF(BufRegCache& reg_cache,
            Register a, Register b, Register c, Register d,
            int k, int s, int t,
            Register rtmp1, Register rtmp2) {
  // rtmp1 = b & c
  __ andr(rtmp1, b, c);

  // rtmp2 = (~b) & d
  __ andn(rtmp2, d, b);

  // rtmp1 = (b & c) | ((~b) & d)
  __ orr(rtmp1, rtmp1, rtmp2);

  // add x and ac, rotate, add b
  m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
}
5236
// MD5 round 2 step:
//   a += ((b & d) | (c & (~d))) + x + ac;
//   a = Integer.rotateLeft(a, s) + b;
//
// x is the k'th 32-bit word held in reg_cache, t is the constant ac and s the
// rotate amount. rtmp1 and rtmp2 are scratch registers and are overwritten.
void md5_GG(BufRegCache& reg_cache,
            Register a, Register b, Register c, Register d,
            int k, int s, int t,
            Register rtmp1, Register rtmp2) {
  // rtmp1 = b & d
  __ andr(rtmp1, b, d);

  // rtmp2 = c & (~d)
  __ andn(rtmp2, c, d);

  // rtmp1 = (b & d) | (c & (~d))
  __ orr(rtmp1, rtmp1, rtmp2);

  // add x and ac, rotate, add b
  m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
}
5254
// MD5 round 3 step:
//   a += ((b ^ c) ^ d) + x + ac;
//   a = Integer.rotateLeft(a, s) + b;
//
// x is the k'th 32-bit word held in reg_cache, t is the constant ac and s the
// rotate amount. rtmp1 and rtmp2 are scratch registers and are overwritten.
void md5_HH(BufRegCache& reg_cache,
            Register a, Register b, Register c, Register d,
            int k, int s, int t,
            Register rtmp1, Register rtmp2) {
  // rtmp2 = b ^ c
  // rtmp1 = (b ^ c) ^ d
  __ xorr(rtmp2, b, c);
  __ xorr(rtmp1, rtmp2, d);

  // add x and ac, rotate, add b
  m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
}
5267
// MD5 round 4 step:
//   a += (c ^ (b | (~d))) + x + ac;
//   a = Integer.rotateLeft(a, s) + b;
//
// x is the k'th 32-bit word held in reg_cache, t is the constant ac and s the
// rotate amount. rtmp1 and rtmp2 are scratch registers and are overwritten.
void md5_II(BufRegCache& reg_cache,
            Register a, Register b, Register c, Register d,
            int k, int s, int t,
            Register rtmp1, Register rtmp2) {
  // rtmp2 = b | (~d)
  // rtmp1 = c ^ (b | (~d))
  __ orn(rtmp2, b, d);
  __ xorr(rtmp1, c, rtmp2);

  // add x and ac, rotate, add b
  m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
}
5280
// Generates the MD5 compress stub selected by stub_id (single block or
// multi block) and returns its entry address.
//
// Arguments:
//
// Inputs:
//   c_rarg0   - byte[]  source+offset
//   c_rarg1   - int[]   SHA.state
//   c_rarg2   - int     offset  (multi_block == True)
//   c_rarg3   - int     limit   (multi_block == True)
//
// Registers:
//    x0   zero  (zero)
//    x1     ra  (return address)
//    x2     sp  (stack pointer)
//    x3     gp  (global pointer)
//    x4     tp  (thread pointer)
//    x5     t0  (tmp register)
//    x6     t1  (tmp register)
//    x7     t2  state0
//    x8  fp/s0  (frame pointer)
//    x9     s1
//   x10     a0  rtmp1 / c_rarg0
//   x11     a1  rtmp2 / c_rarg1
//   x12     a2  a     / c_rarg2
//   x13     a3  b     / c_rarg3
//   x14     a4  c
//   x15     a5  d
//   x16     a6  buf
//   x17     a7  state
//   x18     s2  ofs     [saved-reg]  (multi_block == True)
//   x19     s3  limit   [saved-reg]  (multi_block == True)
//   x20     s4  state1  [saved-reg]
//   x21     s5  state2  [saved-reg]
//   x22     s6  state3  [saved-reg]
//   x23     s7
//   x24     s8  buf0    [saved-reg]
//   x25     s9  buf1    [saved-reg]
//   x26    s10  buf2    [saved-reg]
//   x27    s11  buf3    [saved-reg]
//   x28     t3  buf4
//   x29     t4  buf5
//   x30     t5  buf6
//   x31     t6  buf7
address generate_md5_implCompress(StubId stub_id) {
  __ align(CodeEntryAlignment);
  // Derive single-/multi-block mode from the stub id.
  bool multi_block;
  switch (stub_id) {
  case StubId::stubgen_md5_implCompress_id:
    multi_block = false;
    break;
  case StubId::stubgen_md5_implCompressMB_id:
    multi_block = true;
    break;
  default:
    ShouldNotReachHere();
  };
  StubCodeMark mark(this, stub_id);
  address start = __ pc();

  // rotation constants
  const int S11 = 7;
  const int S12 = 12;
  const int S13 = 17;
  const int S14 = 22;
  const int S21 = 5;
  const int S22 = 9;
  const int S23 = 14;
  const int S24 = 20;
  const int S31 = 4;
  const int S32 = 11;
  const int S33 = 16;
  const int S34 = 23;
  const int S41 = 6;
  const int S42 = 10;
  const int S43 = 15;
  const int S44 = 21;

  // mask selecting the low 32 bits, used when packing the state for write-back
  const int64_t mask32 = 0xffffffff;

  Register buf_arg   = c_rarg0; // a0
  Register state_arg = c_rarg1; // a1
  Register ofs_arg   = c_rarg2; // a2
  Register limit_arg = c_rarg3; // a3

  // we'll copy the args to these registers to free up a0-a3
  // to use for other values manipulated by instructions
  // that can be compressed
  Register buf       = x16; // a6
  Register state     = x17; // a7
  Register ofs       = x18; // s2
  Register limit     = x19; // s3

  // using x12->15 to allow compressed instructions
  Register a         = x12; // a2
  Register b         = x13; // a3
  Register c         = x14; // a4
  Register d         = x15; // a5

  Register state0    = x7;  // t2
  Register state1    = x20; // s4
  Register state2    = x21; // s5
  Register state3    = x22; // s6

  // using x10->x11 to allow compressed instructions
  Register rtmp1     = x10; // a0
  Register rtmp2     = x11; // a1

  // The eight registers caching the 64-byte block; only the s-registers
  // among them have to be saved and restored by this stub.
  RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
  RegSet reg_cache_regs;
  reg_cache_regs += reg_cache_saved_regs;
  reg_cache_regs += RegSet::of(t3, t4, t5, t6);
  BufRegCache reg_cache(_masm, reg_cache_regs);

  RegSet saved_regs;
  if (multi_block) {
    saved_regs += RegSet::of(ofs, limit);
  }
  saved_regs += RegSet::of(state1, state2, state3);
  saved_regs += reg_cache_saved_regs;

  __ push_reg(saved_regs, sp);

  __ mv(buf, buf_arg);
  __ mv(state, state_arg);
  if (multi_block) {
    __ mv(ofs, ofs_arg);
    __ mv(limit, limit_arg);
  }

  // to minimize the number of memory operations:
  // read the 4 state 4-byte values in pairs, with a single ld,
  // and split them into 2 registers.
  //
  // And, as the core algorithm of md5 works on 32-bits words, so
  // in the following code, it does not care about the content of
  // higher 32-bits in state[x]. Based on this observation,
  // we can apply further optimization, which is to just ignore the
  // higher 32-bits in state0/state2, rather than set the higher
  // 32-bits of state0/state2 to zero explicitly with extra instructions.
  __ ld(state0, Address(state));
  __ srli(state1, state0, 32);
  __ ld(state2, Address(state, 8));
  __ srli(state3, state2, 32);

  Label md5_loop;
  __ BIND(md5_loop);

  // Working copies of the state for this block.
  __ mv(a, state0);
  __ mv(b, state1);
  __ mv(c, state2);
  __ mv(d, state3);

  // Round 1
  // Each 64-bit load brings in two message words just before their first use.
  reg_cache.gen_load(0, buf);
  md5_FF(reg_cache, a, b, c, d,  0, S11, 0xd76aa478, rtmp1, rtmp2);
  md5_FF(reg_cache, d, a, b, c,  1, S12, 0xe8c7b756, rtmp1, rtmp2);
  reg_cache.gen_load(1, buf);
  md5_FF(reg_cache, c, d, a, b,  2, S13, 0x242070db, rtmp1, rtmp2);
  md5_FF(reg_cache, b, c, d, a,  3, S14, 0xc1bdceee, rtmp1, rtmp2);
  reg_cache.gen_load(2, buf);
  md5_FF(reg_cache, a, b, c, d,  4, S11, 0xf57c0faf, rtmp1, rtmp2);
  md5_FF(reg_cache, d, a, b, c,  5, S12, 0x4787c62a, rtmp1, rtmp2);
  reg_cache.gen_load(3, buf);
  md5_FF(reg_cache, c, d, a, b,  6, S13, 0xa8304613, rtmp1, rtmp2);
  md5_FF(reg_cache, b, c, d, a,  7, S14, 0xfd469501, rtmp1, rtmp2);
  reg_cache.gen_load(4, buf);
  md5_FF(reg_cache, a, b, c, d,  8, S11, 0x698098d8, rtmp1, rtmp2);
  md5_FF(reg_cache, d, a, b, c,  9, S12, 0x8b44f7af, rtmp1, rtmp2);
  reg_cache.gen_load(5, buf);
  md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
  md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
  reg_cache.gen_load(6, buf);
  md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
  md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
  reg_cache.gen_load(7, buf);
  md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
  md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);

  // Round 2
  md5_GG(reg_cache, a, b, c, d,  1, S21, 0xf61e2562, rtmp1, rtmp2);
  md5_GG(reg_cache, d, a, b, c,  6, S22, 0xc040b340, rtmp1, rtmp2);
  md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
  md5_GG(reg_cache, b, c, d, a,  0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
  md5_GG(reg_cache, a, b, c, d,  5, S21, 0xd62f105d, rtmp1, rtmp2);
  md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
  md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
  md5_GG(reg_cache, b, c, d, a,  4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
  md5_GG(reg_cache, a, b, c, d,  9, S21, 0x21e1cde6, rtmp1, rtmp2);
  md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
  md5_GG(reg_cache, c, d, a, b,  3, S23, 0xf4d50d87, rtmp1, rtmp2);
  md5_GG(reg_cache, b, c, d, a,  8, S24, 0x455a14ed, rtmp1, rtmp2);
  md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
  md5_GG(reg_cache, d, a, b, c,  2, S22, 0xfcefa3f8, rtmp1, rtmp2);
  md5_GG(reg_cache, c, d, a, b,  7, S23, 0x676f02d9, rtmp1, rtmp2);
  md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);

  // Round 3
  md5_HH(reg_cache, a, b, c, d,  5, S31, 0xfffa3942, rtmp1, rtmp2);
  md5_HH(reg_cache, d, a, b, c,  8, S32, 0x8771f681, rtmp1, rtmp2);
  md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
  md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
  md5_HH(reg_cache, a, b, c, d,  1, S31, 0xa4beea44, rtmp1, rtmp2);
  md5_HH(reg_cache, d, a, b, c,  4, S32, 0x4bdecfa9, rtmp1, rtmp2);
  md5_HH(reg_cache, c, d, a, b,  7, S33, 0xf6bb4b60, rtmp1, rtmp2);
  md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
  md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
  md5_HH(reg_cache, d, a, b, c,  0, S32, 0xeaa127fa, rtmp1, rtmp2);
  md5_HH(reg_cache, c, d, a, b,  3, S33, 0xd4ef3085, rtmp1, rtmp2);
  md5_HH(reg_cache, b, c, d, a,  6, S34, 0x04881d05, rtmp1, rtmp2);
  md5_HH(reg_cache, a, b, c, d,  9, S31, 0xd9d4d039, rtmp1, rtmp2);
  md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
  md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
  md5_HH(reg_cache, b, c, d, a,  2, S34, 0xc4ac5665, rtmp1, rtmp2);

  // Round 4
  md5_II(reg_cache, a, b, c, d,  0, S41, 0xf4292244, rtmp1, rtmp2);
  md5_II(reg_cache, d, a, b, c,  7, S42, 0x432aff97, rtmp1, rtmp2);
  md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
  md5_II(reg_cache, b, c, d, a,  5, S44, 0xfc93a039, rtmp1, rtmp2);
  md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
  md5_II(reg_cache, d, a, b, c,  3, S42, 0x8f0ccc92, rtmp1, rtmp2);
  md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
  md5_II(reg_cache, b, c, d, a,  1, S44, 0x85845dd1, rtmp1, rtmp2);
  md5_II(reg_cache, a, b, c, d,  8, S41, 0x6fa87e4f, rtmp1, rtmp2);
  md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
  md5_II(reg_cache, c, d, a, b,  6, S43, 0xa3014314, rtmp1, rtmp2);
  md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
  md5_II(reg_cache, a, b, c, d,  4, S41, 0xf7537e82, rtmp1, rtmp2);
  md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
  md5_II(reg_cache, c, d, a, b,  2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
  md5_II(reg_cache, b, c, d, a,  9, S44, 0xeb86d391, rtmp1, rtmp2);

  // Fold the block result into the running state.
  __ addw(state0, state0, a);
  __ addw(state1, state1, b);
  __ addw(state2, state2, c);
  __ addw(state3, state3, d);

  if (multi_block) {
    __ addi(buf, buf, 64);
    __ addi(ofs, ofs, 64);
    // if (ofs <= limit) goto md5_loop
    __ bge(limit, ofs, md5_loop);
    __ mv(c_rarg0, ofs); // return ofs
  }

  // to minimize the number of memory operations:
  // write back the 4 state 4-byte values in pairs, with a single sd
  __ mv(t0, mask32);
  __ andr(state0, state0, t0);
  __ slli(state1, state1, 32);
  __ orr(state0, state0, state1);
  __ sd(state0, Address(state));
  __ andr(state2, state2, t0);
  __ slli(state3, state3, 32);
  __ orr(state2, state2, state3);
  __ sd(state2, Address(state, 8));

  __ pop_reg(saved_regs, sp);
  __ ret();

  return (address) start;
}
5541
5542 /**
5543 * Perform the quarter round calculations on values contained within four vector registers.
5544 *
5545 * @param aVec the SIMD register containing only the "a" values
5546 * @param bVec the SIMD register containing only the "b" values
5547 * @param cVec the SIMD register containing only the "c" values
5548 * @param dVec the SIMD register containing only the "d" values
5549 * @param tmp_vr temporary vector register holds intermedia values.
5550 */
5551 void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
5552 VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
5553 // a += b, d ^= a, d <<<= 16
5554 __ vadd_vv(aVec, aVec, bVec);
5555 __ vxor_vv(dVec, dVec, aVec);
5556 __ vrole32_vi(dVec, 16, tmp_vr);
5557
5558 // c += d, b ^= c, b <<<= 12
5559 __ vadd_vv(cVec, cVec, dVec);
5560 __ vxor_vv(bVec, bVec, cVec);
5561 __ vrole32_vi(bVec, 12, tmp_vr);
5562
5563 // a += b, d ^= a, d <<<= 8
5564 __ vadd_vv(aVec, aVec, bVec);
5565 __ vxor_vv(dVec, dVec, aVec);
5566 __ vrole32_vi(dVec, 8, tmp_vr);
5567
5568 // c += d, b ^= c, b <<<= 7
5569 __ vadd_vv(cVec, cVec, dVec);
5570 __ vxor_vv(bVec, bVec, cVec);
5571 __ vrole32_vi(bVec, 7, tmp_vr);
5572 }
5573
5574 /**
5575 * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
5576 *
5577 * Input arguments:
5578 * c_rarg0 - state, the starting state
5579 * c_rarg1 - key_stream, the array that will hold the result of the ChaCha20 block function
5580 *
5581 * Implementation Note:
5582 * Parallelization is achieved by loading individual state elements into vectors for N blocks.
5583 * N depends on single vector register length.
5584 */
5585 address generate_chacha20Block() {
5586 Label L_Rounds;
5587
5588 __ align(CodeEntryAlignment);
5589 StubId stub_id = StubId::stubgen_chacha20Block_id;
5590 StubCodeMark mark(this, stub_id);
5591 address start = __ pc();
5592 __ enter();
5593
5594 const int states_len = 16;
5595 const int step = 4;
5596 const Register state = c_rarg0;
5597 const Register key_stream = c_rarg1;
5598 const Register tmp_addr = t0;
5599 const Register length = t1;
5600
5601 // Organize vector registers in an array that facilitates
5602 // putting repetitive opcodes into loop structures below.
5603 const VectorRegister work_vrs[16] = {
5604 v0, v1, v2, v3, v4, v5, v6, v7,
5605 v8, v9, v10, v11, v12, v13, v14, v15
5606 };
5607 const VectorRegister tmp_vr = v16;
5608 const VectorRegister counter_vr = v17;
5609
5610 {
5611 // Put 16 here, as com.sun.crypto.providerChaCha20Cipher.KS_MAX_LEN is 1024
5612 // in java level.
5613 __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
5614 }
5615
5616 // Load from source state.
5617 // Every element in source state is duplicated to all elements in the corresponding vector.
5618 __ mv(tmp_addr, state);
5619 for (int i = 0; i < states_len; i += 1) {
5620 __ vlse32_v(work_vrs[i], tmp_addr, zr);
5621 __ addi(tmp_addr, tmp_addr, step);
5622 }
5623 // Adjust counter for every individual block.
5624 __ vid_v(counter_vr);
5625 __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
5626
5627 // Perform 10 iterations of the 8 quarter round set
5628 {
5629 const Register loop = t2; // share t2 with other non-overlapping usages.
5630 __ mv(loop, 10);
5631 __ BIND(L_Rounds);
5632
5633 chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8], work_vrs[12], tmp_vr);
5634 chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9], work_vrs[13], tmp_vr);
5635 chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
5636 chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
5637
5638 chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
5639 chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
5640 chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8], work_vrs[13], tmp_vr);
5641 chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9], work_vrs[14], tmp_vr);
5642
5643 __ subi(loop, loop, 1);
5644 __ bnez(loop, L_Rounds);
5645 }
5646
5647 // Add the original state into the end working state.
5648 // We do this by first duplicating every element in source state array to the corresponding
5649 // vector, then adding it to the post-loop working state.
5650 __ mv(tmp_addr, state);
5651 for (int i = 0; i < states_len; i += 1) {
5652 __ vlse32_v(tmp_vr, tmp_addr, zr);
5653 __ addi(tmp_addr, tmp_addr, step);
5654 __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
5655 }
5656 // Add the counter overlay onto work_vrs[12] at the end.
5657 __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
5658
5659 // Store result to key stream.
5660 {
5661 const Register stride = t2; // share t2 with other non-overlapping usages.
5662 // Every block occupies 64 bytes, so we use 64 as stride of the vector store.
5663 __ mv(stride, 64);
5664 for (int i = 0; i < states_len; i += 1) {
5665 __ vsse32_v(work_vrs[i], key_stream, stride);
5666 __ addi(key_stream, key_stream, step);
5667 }
5668 }
5669
5670 // Return length of output key_stream
5671 __ slli(c_rarg0, length, 6);
5672
5673 __ leave();
5674 __ ret();
5675
5676 return (address) start;
5677 }
5678
5679
5680 // ------------------------ SHA-1 intrinsic ------------------------
5681
5682 // K't =
5683 // 5a827999, 0 <= t <= 19
5684 // 6ed9eba1, 20 <= t <= 39
5685 // 8f1bbcdc, 40 <= t <= 59
5686 // ca62c1d6, 60 <= t <= 79
5687 void sha1_prepare_k(Register cur_k, int round) {
5688 assert(round >= 0 && round < 80, "must be");
5689
5690 static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
5691 if ((round % 20) == 0) {
5692 __ mv(cur_k, ks[round/20]);
5693 }
5694 }
5695
5696 // W't =
5697 // M't, 0 <= t <= 15
5698 // ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
5699 void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
5700 assert(round >= 0 && round < 80, "must be");
5701
5702 if (round < 16) {
5703 // in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
5704 // in ws[0], high part contains W't-0, low part contains W't-1,
5705 // in ws[1], high part contains W't-2, low part contains W't-3,
5706 // ...
5707 // in ws[7], high part contains W't-14, low part contains W't-15.
5708
5709 if ((round % 2) == 0) {
5710 __ ld(ws[round/2], Address(buf, (round/2) * 8));
5711 // reverse bytes, as SHA-1 is defined in big-endian.
5712 __ revb(ws[round/2], ws[round/2]);
5713 __ srli(cur_w, ws[round/2], 32);
5714 } else {
5715 __ mv(cur_w, ws[round/2]);
5716 }
5717
5718 return;
5719 }
5720
5721 if ((round % 2) == 0) {
5722 int idx = 16;
5723 // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
5724 __ srli(t1, ws[(idx-8)/2], 32);
5725 __ xorr(t0, ws[(idx-3)/2], t1);
5726
5727 __ srli(t1, ws[(idx-14)/2], 32);
5728 __ srli(cur_w, ws[(idx-16)/2], 32);
5729 __ xorr(cur_w, cur_w, t1);
5730
5731 __ xorr(cur_w, cur_w, t0);
5732 __ rolw(cur_w, cur_w, 1, t0);
5733
5734 // copy the cur_w value to ws[8].
5735 // now, valid w't values are at:
5736 // w0: ws[0]'s lower 32 bits
5737 // w1 ~ w14: ws[1] ~ ws[7]
5738 // w15: ws[8]'s higher 32 bits
5739 __ slli(ws[idx/2], cur_w, 32);
5740
5741 return;
5742 }
5743
5744 int idx = 17;
5745 // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
5746 __ srli(t1, ws[(idx-3)/2], 32);
5747 __ xorr(t0, t1, ws[(idx-8)/2]);
5748
5749 __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);
5750
5751 __ xorr(cur_w, cur_w, t0);
5752 __ rolw(cur_w, cur_w, 1, t0);
5753
5754 // copy the cur_w value to ws[8]
5755 __ zext(cur_w, cur_w, 32);
5756 __ orr(ws[idx/2], ws[idx/2], cur_w);
5757
5758 // shift the w't registers, so they start from ws[0] again.
5759 // now, valid w't values are at:
5760 // w0 ~ w15: ws[0] ~ ws[7]
5761 Register ws_0 = ws[0];
5762 for (int i = 0; i < 16/2; i++) {
5763 ws[i] = ws[i+1];
5764 }
5765 ws[8] = ws_0;
5766 }
5767
5768 // f't(x, y, z) =
5769 // Ch(x, y, z) = (x & y) ^ (~x & z) , 0 <= t <= 19
5770 // Parity(x, y, z) = x ^ y ^ z , 20 <= t <= 39
5771 // Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) , 40 <= t <= 59
5772 // Parity(x, y, z) = x ^ y ^ z , 60 <= t <= 79
5773 void sha1_f(Register dst, Register x, Register y, Register z, int round) {
5774 assert(round >= 0 && round < 80, "must be");
5775 assert_different_registers(dst, x, y, z, t0, t1);
5776
5777 if (round < 20) {
5778 // (x & y) ^ (~x & z)
5779 __ andr(t0, x, y);
5780 __ andn(dst, z, x);
5781 __ xorr(dst, dst, t0);
5782 } else if (round >= 40 && round < 60) {
5783 // (x & y) ^ (x & z) ^ (y & z)
5784 __ andr(t0, x, y);
5785 __ andr(t1, x, z);
5786 __ andr(dst, y, z);
5787 __ xorr(dst, dst, t0);
5788 __ xorr(dst, dst, t1);
5789 } else {
5790 // x ^ y ^ z
5791 __ xorr(dst, x, y);
5792 __ xorr(dst, dst, z);
5793 }
5794 }
5795
5796 // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5797 // e = d
5798 // d = c
5799 // c = ROTL'30(b)
5800 // b = a
5801 // a = T
5802 void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
5803 Register cur_k, Register cur_w, Register tmp, int round) {
5804 assert(round >= 0 && round < 80, "must be");
5805 assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);
5806
5807 // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5808
5809 // cur_w will be recalculated at the beginning of each round,
5810 // so, we can reuse it as a temp register here.
5811 Register tmp2 = cur_w;
5812
5813 // reuse e as a temporary register, as we will mv new value into it later
5814 Register tmp3 = e;
5815 __ add(tmp2, cur_k, tmp2);
5816 __ add(tmp3, tmp3, tmp2);
5817 __ rolw(tmp2, a, 5, t0);
5818
5819 sha1_f(tmp, b, c, d, round);
5820
5821 __ add(tmp2, tmp2, tmp);
5822 __ add(tmp2, tmp2, tmp3);
5823
5824 // e = d
5825 // d = c
5826 // c = ROTL'30(b)
5827 // b = a
5828 // a = T
5829 __ mv(e, d);
5830 __ mv(d, c);
5831
5832 __ rolw(c, b, 30);
5833 __ mv(b, a);
5834 __ mv(a, tmp2);
5835 }
5836
5837 // H(i)0 = a + H(i-1)0
5838 // H(i)1 = b + H(i-1)1
5839 // H(i)2 = c + H(i-1)2
5840 // H(i)3 = d + H(i-1)3
5841 // H(i)4 = e + H(i-1)4
5842 void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
5843 Register prev_ab, Register prev_cd, Register prev_e) {
5844 assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5845
5846 __ add(a, a, prev_ab);
5847 __ srli(prev_ab, prev_ab, 32);
5848 __ add(b, b, prev_ab);
5849
5850 __ add(c, c, prev_cd);
5851 __ srli(prev_cd, prev_cd, 32);
5852 __ add(d, d, prev_cd);
5853
5854 __ add(e, e, prev_e);
5855 }
5856
5857 void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
5858 Register prev_ab, Register prev_cd, Register prev_e) {
5859 assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
5860
5861 __ slli(t0, b, 32);
5862 __ zext(prev_ab, a, 32);
5863 __ orr(prev_ab, prev_ab, t0);
5864
5865 __ slli(t0, d, 32);
5866 __ zext(prev_cd, c, 32);
5867 __ orr(prev_cd, prev_cd, t0);
5868
5869 __ mv(prev_e, e);
5870 }
5871
5872 // Intrinsic for:
5873 // void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
5874 // void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
5875 //
5876 // Arguments:
5877 //
5878 // Inputs:
5879 // c_rarg0: byte[] src array + offset
5880 // c_rarg1: int[] SHA.state
5881 // - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5882 // c_rarg2: int offset
5883 // c_rarg3: int limit
5884 //
5885 // Outputs:
5886 // - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5887 // c_rarg0: int offset, when (multi_block == true)
5888 //
5889 address generate_sha1_implCompress(StubId stub_id) {
5890 bool multi_block;
5891 switch (stub_id) {
5892 case StubId::stubgen_sha1_implCompress_id:
5893 multi_block = false;
5894 break;
5895 case StubId::stubgen_sha1_implCompressMB_id:
5896 multi_block = true;
5897 break;
5898 default:
5899 ShouldNotReachHere();
5900 };
5901 __ align(CodeEntryAlignment);
5902 StubCodeMark mark(this, stub_id);
5903
5904 address start = __ pc();
5905 __ enter();
5906
5907 RegSet saved_regs = RegSet::range(x18, x27);
5908 if (multi_block) {
5909 // use x9 as src below.
5910 saved_regs += RegSet::of(x9);
5911 }
5912 __ push_reg(saved_regs, sp);
5913
5914 // c_rarg0 - c_rarg3: x10 - x13
5915 Register buf = c_rarg0;
5916 Register state = c_rarg1;
5917 Register offset = c_rarg2;
5918 Register limit = c_rarg3;
5919 // use src to contain the original start point of the array.
5920 Register src = x9;
5921
5922 if (multi_block) {
5923 __ sub(limit, limit, offset);
5924 __ add(limit, limit, buf);
5925 __ sub(src, buf, offset);
5926 }
5927
5928 // [args-reg]: x14 - x17
5929 // [temp-reg]: x28 - x31
5930 // [saved-reg]: x18 - x27
5931
5932 // h0/1/2/3/4
5933 const Register a = x14, b = x15, c = x16, d = x17, e = x28;
5934 // w0, w1, ... w15
5935 // put two adjecent w's in one register:
5936 // one at high word part, another at low word part
5937 // at different round (even or odd), w't value reside in different items in ws[].
5938 // w0 ~ w15, either reside in
5939 // ws[0] ~ ws[7], where
5940 // w0 at higher 32 bits of ws[0],
5941 // w1 at lower 32 bits of ws[0],
5942 // ...
5943 // w14 at higher 32 bits of ws[7],
5944 // w15 at lower 32 bits of ws[7].
5945 // or, reside in
5946 // w0: ws[0]'s lower 32 bits
5947 // w1 ~ w14: ws[1] ~ ws[7]
5948 // w15: ws[8]'s higher 32 bits
5949 Register ws[9] = {x29, x30, x31, x18,
5950 x19, x20, x21, x22,
5951 x23}; // auxiliary register for calculating w's value
5952 // current k't's value
5953 const Register cur_k = x24;
5954 // current w't's value
5955 const Register cur_w = x25;
5956 // values of a, b, c, d, e in the previous round
5957 const Register prev_ab = x26, prev_cd = x27;
5958 const Register prev_e = offset; // reuse offset/c_rarg2
5959
5960 // load 5 words state into a, b, c, d, e.
5961 //
5962 // To minimize the number of memory operations, we apply following
5963 // optimization: read the states (a/b/c/d) of 4-byte values in pairs,
5964 // with a single ld, and split them into 2 registers.
5965 //
5966 // And, as the core algorithm of SHA-1 works on 32-bits words, so
5967 // in the following code, it does not care about the content of
5968 // higher 32-bits in a/b/c/d/e. Based on this observation,
5969 // we can apply further optimization, which is to just ignore the
5970 // higher 32-bits in a/c/e, rather than set the higher
5971 // 32-bits of a/c/e to zero explicitly with extra instructions.
5972 __ ld(a, Address(state, 0));
5973 __ srli(b, a, 32);
5974 __ ld(c, Address(state, 8));
5975 __ srli(d, c, 32);
5976 __ lw(e, Address(state, 16));
5977
5978 Label L_sha1_loop;
5979 if (multi_block) {
5980 __ BIND(L_sha1_loop);
5981 }
5982
5983 sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5984
5985 for (int round = 0; round < 80; round++) {
5986 // prepare K't value
5987 sha1_prepare_k(cur_k, round);
5988
5989 // prepare W't value
5990 sha1_prepare_w(cur_w, ws, buf, round);
5991
5992 // one round process
5993 sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
5994 }
5995
5996 // compute the intermediate hash value
5997 sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5998
5999 if (multi_block) {
6000 int64_t block_bytes = 16 * 4;
6001 __ addi(buf, buf, block_bytes);
6002
6003 __ bge(limit, buf, L_sha1_loop, true);
6004 }
6005
6006 // store back the state.
6007 __ zext(a, a, 32);
6008 __ slli(b, b, 32);
6009 __ orr(a, a, b);
6010 __ sd(a, Address(state, 0));
6011 __ zext(c, c, 32);
6012 __ slli(d, d, 32);
6013 __ orr(c, c, d);
6014 __ sd(c, Address(state, 8));
6015 __ sw(e, Address(state, 16));
6016
6017 // return offset
6018 if (multi_block) {
6019 __ sub(c_rarg0, buf, src);
6020 }
6021
6022 __ pop_reg(saved_regs, sp);
6023
6024 __ leave();
6025 __ ret();
6026
6027 return (address) start;
6028 }
6029
/**
 * vector registers:
 * input VectorRegister's: inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
 * index VectorRegister's: idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
 * output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
 *
 * NOTE: each field will occupy a vector register group
 */
6038 void base64_vector_encode_round(Register src, Register dst, Register codec,
6039 Register size, Register stepSrc, Register stepDst,
6040 VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
6041 VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
6042 VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
6043 Assembler::LMUL lmul) {
6044 // set vector register type/len
6045 __ vsetvli(x0, size, Assembler::e8, lmul);
6046
6047 // segmented load src into v registers: mem(src) => vr(3)
6048 __ vlseg3e8_v(inputV1, src);
6049
6050 // src = src + register_group_len_bytes * 3
6051 __ add(src, src, stepSrc);
6052
6053 // encoding
6054 // 1. compute index into lookup table: vr(3) => vr(4)
6055 __ vsrl_vi(idxV1, inputV1, 2);
6056
6057 __ vsrl_vi(idxV2, inputV2, 2);
6058 __ vsll_vi(inputV1, inputV1, 6);
6059 __ vor_vv(idxV2, idxV2, inputV1);
6060 __ vsrl_vi(idxV2, idxV2, 2);
6061
6062 __ vsrl_vi(idxV3, inputV3, 4);
6063 __ vsll_vi(inputV2, inputV2, 4);
6064 __ vor_vv(idxV3, inputV2, idxV3);
6065 __ vsrl_vi(idxV3, idxV3, 2);
6066
6067 __ vsll_vi(idxV4, inputV3, 2);
6068 __ vsrl_vi(idxV4, idxV4, 2);
6069
6070 // 2. indexed load: vr(4) => vr(4)
6071 __ vluxei8_v(outputV1, codec, idxV1);
6072 __ vluxei8_v(outputV2, codec, idxV2);
6073 __ vluxei8_v(outputV3, codec, idxV3);
6074 __ vluxei8_v(outputV4, codec, idxV4);
6075
6076 // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
6077 __ vsseg4e8_v(outputV1, dst);
6078
6079 // dst = dst + register_group_len_bytes * 4
6080 __ add(dst, dst, stepDst);
6081 }
6082
6083 /**
6084 * void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
6085 *
6086 * Input arguments:
6087 * c_rarg0 - src, source array
6088 * c_rarg1 - sp, src start offset
6089 * c_rarg2 - sl, src end offset
6090 * c_rarg3 - dst, dest array
6091 * c_rarg4 - dp, dst start offset
6092 * c_rarg5 - isURL, Base64 or URL character set
6093 */
6094 address generate_base64_encodeBlock() {
6095 alignas(64) static const char toBase64[64] = {
6096 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6097 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6098 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6099 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6100 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6101 };
6102
6103 alignas(64) static const char toBase64URL[64] = {
6104 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6105 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6106 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6107 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6108 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6109 };
6110
6111 __ align(CodeEntryAlignment);
6112 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
6113 StubCodeMark mark(this, stub_id);
6114 address start = __ pc();
6115 __ enter();
6116
6117 Register src = c_rarg0;
6118 Register soff = c_rarg1;
6119 Register send = c_rarg2;
6120 Register dst = c_rarg3;
6121 Register doff = c_rarg4;
6122 Register isURL = c_rarg5;
6123
6124 Register codec = c_rarg6;
6125 Register length = c_rarg7; // total length of src data in bytes
6126
6127 Label ProcessData, Exit;
6128
6129 // length should be multiple of 3
6130 __ sub(length, send, soff);
6131 // real src/dst to process data
6132 __ add(src, src, soff);
6133 __ add(dst, dst, doff);
6134
6135 // load the codec base address
6136 __ la(codec, ExternalAddress((address) toBase64));
6137 __ beqz(isURL, ProcessData);
6138 __ la(codec, ExternalAddress((address) toBase64URL));
6139 __ BIND(ProcessData);
6140
6141 // vector version
6142 if (UseRVV) {
6143 Label ProcessM2, ProcessM1, ProcessScalar;
6144
6145 Register size = soff;
6146 Register stepSrcM1 = send;
6147 Register stepSrcM2 = doff;
6148 Register stepDst = isURL;
6149
6150 __ mv(size, MaxVectorSize * 2);
6151 __ mv(stepSrcM1, MaxVectorSize * 3);
6152 __ slli(stepSrcM2, stepSrcM1, 1);
6153 __ mv(stepDst, MaxVectorSize * 2 * 4);
6154
6155 __ blt(length, stepSrcM2, ProcessM1);
6156
6157 __ BIND(ProcessM2);
6158 base64_vector_encode_round(src, dst, codec,
6159 size, stepSrcM2, stepDst,
6160 v2, v4, v6, // inputs
6161 v8, v10, v12, v14, // indexes
6162 v16, v18, v20, v22, // outputs
6163 Assembler::m2);
6164
6165 __ sub(length, length, stepSrcM2);
6166 __ bge(length, stepSrcM2, ProcessM2);
6167
6168 __ BIND(ProcessM1);
6169 __ blt(length, stepSrcM1, ProcessScalar);
6170
6171 __ srli(size, size, 1);
6172 __ srli(stepDst, stepDst, 1);
6173 base64_vector_encode_round(src, dst, codec,
6174 size, stepSrcM1, stepDst,
6175 v1, v2, v3, // inputs
6176 v4, v5, v6, v7, // indexes
6177 v8, v9, v10, v11, // outputs
6178 Assembler::m1);
6179 __ sub(length, length, stepSrcM1);
6180
6181 __ BIND(ProcessScalar);
6182 }
6183
6184 // scalar version
6185 {
6186 Register byte1 = soff, byte0 = send, byte2 = doff;
6187 Register combined24Bits = isURL;
6188
6189 __ beqz(length, Exit);
6190
6191 Label ScalarLoop;
6192 __ BIND(ScalarLoop);
6193 {
6194 // plain: [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
6195 // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]
6196
6197 // load 3 bytes src data
6198 __ lbu(byte0, Address(src, 0));
6199 __ lbu(byte1, Address(src, 1));
6200 __ lbu(byte2, Address(src, 2));
6201 __ addi(src, src, 3);
6202
6203 // construct 24 bits from 3 bytes
6204 __ slliw(byte0, byte0, 16);
6205 __ slliw(byte1, byte1, 8);
6206 __ orr(combined24Bits, byte0, byte1);
6207 __ orr(combined24Bits, combined24Bits, byte2);
6208
6209 // get codec index and encode(ie. load from codec by index)
6210 __ slliw(byte0, combined24Bits, 8);
6211 __ srliw(byte0, byte0, 26);
6212 __ add(byte0, codec, byte0);
6213 __ lbu(byte0, byte0);
6214
6215 __ slliw(byte1, combined24Bits, 14);
6216 __ srliw(byte1, byte1, 26);
6217 __ add(byte1, codec, byte1);
6218 __ lbu(byte1, byte1);
6219
6220 __ slliw(byte2, combined24Bits, 20);
6221 __ srliw(byte2, byte2, 26);
6222 __ add(byte2, codec, byte2);
6223 __ lbu(byte2, byte2);
6224
6225 __ andi(combined24Bits, combined24Bits, 0x3f);
6226 __ add(combined24Bits, codec, combined24Bits);
6227 __ lbu(combined24Bits, combined24Bits);
6228
6229 // store 4 bytes encoded data
6230 __ sb(byte0, Address(dst, 0));
6231 __ sb(byte1, Address(dst, 1));
6232 __ sb(byte2, Address(dst, 2));
6233 __ sb(combined24Bits, Address(dst, 3));
6234
6235 __ subi(length, length, 3);
6236 __ addi(dst, dst, 4);
6237 // loop back
6238 __ bnez(length, ScalarLoop);
6239 }
6240 }
6241
6242 __ BIND(Exit);
6243
6244 __ leave();
6245 __ ret();
6246
6247 return (address) start;
6248 }
6249
/**
 * vector registers:
 * input VectorRegister's: inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
 * index VectorRegister's: idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
 * output VectorRegister's: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
 *
 * NOTE: each field will occupy a single vector register group
 * NOTE: v0 is written as the mask of the invalid-input check
 */
6258 void base64_vector_decode_round(Register src, Register dst, Register codec,
6259 Register size, Register stepSrc, Register stepDst, Register failedIdx,
6260 VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
6261 VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
6262 VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
6263 Assembler::LMUL lmul) {
6264 // set vector register type/len
6265 __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
6266
6267 // segmented load src into v registers: mem(src) => vr(4)
6268 __ vlseg4e8_v(inputV1, src);
6269
6270 // src = src + register_group_len_bytes * 4
6271 __ add(src, src, stepSrc);
6272
6273 // decoding
6274 // 1. indexed load: vr(4) => vr(4)
6275 __ vluxei8_v(idxV1, codec, inputV1);
6276 __ vluxei8_v(idxV2, codec, inputV2);
6277 __ vluxei8_v(idxV3, codec, inputV3);
6278 __ vluxei8_v(idxV4, codec, inputV4);
6279
6280 // 2. check wrong data
6281 __ vor_vv(outputV1, idxV1, idxV2);
6282 __ vor_vv(outputV2, idxV3, idxV4);
6283 __ vor_vv(outputV1, outputV1, outputV2);
6284 __ vmseq_vi(v0, outputV1, -1);
6285 __ vfirst_m(failedIdx, v0);
6286 Label NoFailure, FailureAtIdx0;
6287 // valid value can only be -1 when < 0
6288 __ bltz(failedIdx, NoFailure);
6289 // when the first data (at index 0) fails, no need to process data anymore
6290 __ beqz(failedIdx, FailureAtIdx0);
6291 __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
6292 __ slli(stepDst, failedIdx, 1);
6293 __ add(stepDst, failedIdx, stepDst);
6294 __ BIND(NoFailure);
6295
6296 // 3. compute the decoded data: vr(4) => vr(3)
6297 __ vsll_vi(idxV1, idxV1, 2);
6298 __ vsrl_vi(outputV1, idxV2, 4);
6299 __ vor_vv(outputV1, outputV1, idxV1);
6300
6301 __ vsll_vi(idxV2, idxV2, 4);
6302 __ vsrl_vi(outputV2, idxV3, 2);
6303 __ vor_vv(outputV2, outputV2, idxV2);
6304
6305 __ vsll_vi(idxV3, idxV3, 6);
6306 __ vor_vv(outputV3, idxV4, idxV3);
6307
6308 // segmented store encoded data in v registers back to dst: vr(3) => mem(dst)
6309 __ vsseg3e8_v(outputV1, dst);
6310
6311 // dst = dst + register_group_len_bytes * 3
6312 __ add(dst, dst, stepDst);
6313 __ BIND(FailureAtIdx0);
6314 }
6315
6316 /**
6317 * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
6318 *
6319 * Input arguments:
6320 * c_rarg0 - src, source array
6321 * c_rarg1 - sp, src start offset
6322 * c_rarg2 - sl, src end offset
6323 * c_rarg3 - dst, dest array
6324 * c_rarg4 - dp, dst start offset
6325 * c_rarg5 - isURL, Base64 or URL character set
6326 * c_rarg6 - isMIME, Decoding MIME block
6327 */
6328 address generate_base64_decodeBlock() {
6329
6330 static const uint8_t fromBase64[256] = {
6331 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6332 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6333 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
6334 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
6335 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
6336 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
6337 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
6338 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
6339 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6340 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6341 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6342 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6343 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6344 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6345 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6346 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6347 };
6348
6349 static const uint8_t fromBase64URL[256] = {
6350 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6351 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6352 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
6353 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
6354 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
6355 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
6356 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
6357 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
6358 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6359 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6360 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6361 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6362 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6363 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6364 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6365 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6366 };
6367
6368 __ align(CodeEntryAlignment);
6369 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
6370 StubCodeMark mark(this, stub_id);
6371 address start = __ pc();
6372 __ enter();
6373
6374 Register src = c_rarg0;
6375 Register soff = c_rarg1;
6376 Register send = c_rarg2;
6377 Register dst = c_rarg3;
6378 Register doff = c_rarg4;
6379 Register isURL = c_rarg5;
6380 Register isMIME = c_rarg6;
6381
6382 Register codec = c_rarg7;
6383 Register dstBackup = t6;
6384 Register length = t3; // total length of src data in bytes
6385
6386 Label ProcessData, Exit;
6387 Label ProcessScalar, ScalarLoop;
6388
6389 // passed in length (send - soff) is guaranteed to be > 4,
6390 // and in this intrinsic we only process data of length in multiple of 4,
6391 // it's not guaranteed to be multiple of 4 by java level, so do it explicitly
6392 __ sub(length, send, soff);
6393 __ andi(length, length, -4);
6394 // real src/dst to process data
6395 __ add(src, src, soff);
6396 __ add(dst, dst, doff);
6397 // backup of dst, used to calculate the return value at exit
6398 __ mv(dstBackup, dst);
6399
6400 // load the codec base address
6401 __ la(codec, ExternalAddress((address) fromBase64));
6402 __ beqz(isURL, ProcessData);
6403 __ la(codec, ExternalAddress((address) fromBase64URL));
6404 __ BIND(ProcessData);
6405
6406 // vector version
6407 if (UseRVV) {
6408 // for MIME case, it has a default length limit of 76 which could be
6409 // different(smaller) from (send - soff), so in MIME case, we go through
6410 // the scalar code path directly.
6411 __ bnez(isMIME, ScalarLoop);
6412
6413 Label ProcessM1, ProcessM2;
6414
6415 Register failedIdx = soff;
6416 Register stepSrcM1 = send;
6417 Register stepSrcM2 = doff;
6418 Register stepDst = isURL;
6419 Register size = t4;
6420
6421 __ mv(size, MaxVectorSize * 2);
6422 __ mv(stepSrcM1, MaxVectorSize * 4);
6423 __ slli(stepSrcM2, stepSrcM1, 1);
6424 __ mv(stepDst, MaxVectorSize * 2 * 3);
6425
6426 __ blt(length, stepSrcM2, ProcessM1);
6427
6428
6429 // Assembler::m2
6430 __ BIND(ProcessM2);
6431 base64_vector_decode_round(src, dst, codec,
6432 size, stepSrcM2, stepDst, failedIdx,
6433 v2, v4, v6, v8, // inputs
6434 v10, v12, v14, v16, // indexes
6435 v18, v20, v22, // outputs
6436 Assembler::m2);
6437 __ sub(length, length, stepSrcM2);
6438
6439 // error check
6440 // valid value of failedIdx can only be -1 when < 0
6441 __ bgez(failedIdx, Exit);
6442
6443 __ bge(length, stepSrcM2, ProcessM2);
6444
6445
6446 // Assembler::m1
6447 __ BIND(ProcessM1);
6448 __ blt(length, stepSrcM1, ProcessScalar);
6449
6450 __ srli(size, size, 1);
6451 __ srli(stepDst, stepDst, 1);
6452 base64_vector_decode_round(src, dst, codec,
6453 size, stepSrcM1, stepDst, failedIdx,
6454 v1, v2, v3, v4, // inputs
6455 v5, v6, v7, v8, // indexes
6456 v9, v10, v11, // outputs
6457 Assembler::m1);
6458 __ sub(length, length, stepSrcM1);
6459
6460 // error check
6461 // valid value of failedIdx can only be -1 when < 0
6462 __ bgez(failedIdx, Exit);
6463
6464 __ BIND(ProcessScalar);
6465 __ beqz(length, Exit);
6466 }
6467
6468 // scalar version
6469 {
6470 Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
6471 Register combined32Bits = t4;
6472
6473 // encoded: [byte0[5:0] : byte1[5:0] : byte2[5:0]] : byte3[5:0]] =>
6474 // plain: [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
6475 __ BIND(ScalarLoop);
6476
6477 // load 4 bytes encoded src data
6478 __ lbu(byte0, Address(src, 0));
6479 __ lbu(byte1, Address(src, 1));
6480 __ lbu(byte2, Address(src, 2));
6481 __ lbu(byte3, Address(src, 3));
6482 __ addi(src, src, 4);
6483
6484 // get codec index and decode (ie. load from codec by index)
6485 __ add(byte0, codec, byte0);
6486 __ add(byte1, codec, byte1);
6487 __ lb(byte0, Address(byte0, 0));
6488 __ lb(byte1, Address(byte1, 0));
6489 __ add(byte2, codec, byte2);
6490 __ add(byte3, codec, byte3);
6491 __ lb(byte2, Address(byte2, 0));
6492 __ lb(byte3, Address(byte3, 0));
6493 __ slliw(byte0, byte0, 18);
6494 __ slliw(byte1, byte1, 12);
6495 __ orr(byte0, byte0, byte1);
6496 __ orr(byte0, byte0, byte3);
6497 __ slliw(byte2, byte2, 6);
6498 // For performance consideration, `combined32Bits` is constructed for 2 purposes at the same time,
6499 // 1. error check below
6500 // 2. decode below
6501 __ orr(combined32Bits, byte0, byte2);
6502
6503 // error check
6504 __ bltz(combined32Bits, Exit);
6505
6506 // store 3 bytes decoded data
6507 __ sraiw(byte0, combined32Bits, 16);
6508 __ sraiw(byte1, combined32Bits, 8);
6509 __ sb(byte0, Address(dst, 0));
6510 __ sb(byte1, Address(dst, 1));
6511 __ sb(combined32Bits, Address(dst, 2));
6512
6513 __ subi(length, length, 4);
6514 __ addi(dst, dst, 3);
6515 // loop back
6516 __ bnez(length, ScalarLoop);
6517 }
6518
6519 __ BIND(Exit);
6520 __ sub(c_rarg0, dst, dstBackup);
6521
6522 __ leave();
6523 __ ret();
6524
6525 return (address) start;
6526 }
6527
6528 void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
6529 VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
6530 Register temp0, Register temp1, Register temp2, Register temp3,
6531 VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
6532
6533 assert((lmul == Assembler::m4 && step == 64) ||
6534 (lmul == Assembler::m2 && step == 32) ||
6535 (lmul == Assembler::m1 && step == 16),
6536 "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
6537 // Below is function for calculating Adler32 checksum with 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used.
6538 // The results are in v12, v13, ..., v22, v23. Example below is for 64-byte step case.
6539 // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
6540 // In non-vectorized code, we update s1 and s2 as:
6541 // s1 <- s1 + b1
6542 // s2 <- s2 + s1
6543 // s1 <- s1 + b2
6544 // s2 <- s2 + b1
6545 // ...
6546 // s1 <- s1 + b64
6547 // s2 <- s2 + s1
6548 // Putting above assignments together, we have:
6549 // s1_new = s1 + b1 + b2 + ... + b64
6550 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
6551 // = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
6552 // = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)
6553
6554 __ mv(temp3, step);
6555 // Load data
6556 __ vsetvli(temp0, temp3, Assembler::e8, lmul);
6557 __ vle8_v(vbytes, buff);
6558 __ addi(buff, buff, step);
6559
6560 // Upper bound reduction sum for s1_new:
6561 // 0xFF * 64 = 0x3FC0, so:
6562 // 1. Need to do vector-widening reduction sum
6563 // 2. It is safe to perform sign-extension during vmv.x.s with 16-bits elements
6564 __ vwredsumu_vs(vs1acc, vbytes, vzero);
6565 // Multiplication for s2_new
6566 __ vwmulu_vv(vs2acc, vtable, vbytes);
6567
6568 // s2 = s2 + s1 * log2(step)
6569 __ slli(temp1, s1, exact_log2(step));
6570 __ add(s2, s2, temp1);
6571
6572 // Summing up calculated results for s2_new
6573 if (MaxVectorSize > 16) {
6574 __ vsetvli(temp0, temp3, Assembler::e16, lmul);
6575 } else {
6576 // Half of vector-widening multiplication result is in successor of vs2acc
6577 // group for vlen == 16, in which case we need to double vector register
6578 // group width in order to reduction sum all of them
6579 Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
6580 (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
6581 __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
6582 }
6583 // Upper bound for reduction sum:
6584 // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for whole register group, so:
6585 // 1. Need to do vector-widening reduction sum
6586 // 2. It is safe to perform sign-extension during vmv.x.s with 32-bits elements
6587 __ vwredsumu_vs(vtemp1, vs2acc, vzero);
6588
6589 // Extracting results for:
6590 // s1_new
6591 __ vmv_x_s(temp0, vs1acc);
6592 __ add(s1, s1, temp0);
6593 // s2_new
6594 __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
6595 __ vmv_x_s(temp1, vtemp1);
6596 __ add(s2, s2, temp1);
6597 }
6598
6599 /***
6600 * int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
6601 *
6602 * Arguments:
6603 *
6604 * Inputs:
6605 * c_rarg0 - int adler
6606 * c_rarg1 - byte* buff (b + off)
6607 * c_rarg2 - int len
6608 *
6609 * Output:
6610 * c_rarg0 - int adler result
6611 */
6612 address generate_updateBytesAdler32() {
6613 __ align(CodeEntryAlignment);
6614 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
6615 StubCodeMark mark(this, stub_id);
6616 address start = __ pc();
6617
6618 Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
6619 L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;
6620
6621 // Aliases
6622 Register adler = c_rarg0;
6623 Register s1 = c_rarg0;
6624 Register s2 = c_rarg3;
6625 Register buff = c_rarg1;
6626 Register len = c_rarg2;
6627 Register nmax = c_rarg4;
6628 Register base = c_rarg5;
6629 Register count = c_rarg6;
6630 Register temp0 = t3;
6631 Register temp1 = t4;
6632 Register temp2 = t5;
6633 Register temp3 = t6;
6634
6635 VectorRegister vzero = v31;
6636 VectorRegister vbytes = v8; // group: v8, v9, v10, v11
6637 VectorRegister vs1acc = v12; // group: v12, v13, v14, v15
6638 VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
6639 VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
6640 VectorRegister vtable_32 = v4; // group: v4, v5
6641 VectorRegister vtable_16 = v30;
6642 VectorRegister vtemp1 = v28;
6643 VectorRegister vtemp2 = v29;
6644
6645 // Max number of bytes we can process before having to take the mod
6646 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
6647 const uint64_t BASE = 0xfff1;
6648 const uint64_t NMAX = 0x15B0;
6649
6650 // Loops steps
6651 int step_64 = 64;
6652 int step_32 = 32;
6653 int step_16 = 16;
6654 int step_1 = 1;
6655
6656 __ enter(); // Required for proper stackwalking of RuntimeStub frame
6657 __ mv(temp1, 64);
6658 __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);
6659
6660 // Generating accumulation coefficients for further calculations
6661 // vtable_64:
6662 __ vid_v(vtemp1);
6663 __ vrsub_vx(vtable_64, vtemp1, temp1);
6664 // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }
6665
6666 // vtable_32:
6667 __ mv(temp1, 32);
6668 __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
6669 __ vid_v(vtemp1);
6670 __ vrsub_vx(vtable_32, vtemp1, temp1);
6671 // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }
6672
6673 __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
6674 // vtable_16:
6675 __ mv(temp1, 16);
6676 __ vid_v(vtemp1);
6677 __ vrsub_vx(vtable_16, vtemp1, temp1);
6678 // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }
6679
6680 __ vmv_v_i(vzero, 0);
6681
6682 __ mv(base, BASE);
6683 __ mv(nmax, NMAX);
6684
6685 // s1 is initialized to the lower 16 bits of adler
6686 // s2 is initialized to the upper 16 bits of adler
6687 __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff)
6688 __ zext(s1, adler, 16); // s1 = (adler & 0xffff)
6689
6690 // The pipelined loop needs at least 16 elements for 1 iteration
6691 // It does check this, but it is more effective to skip to the cleanup loop
6692 __ mv(temp0, step_16);
6693 __ bgeu(len, temp0, L_nmax);
6694 __ beqz(len, L_combine);
6695
6696 // Jumping to L_by1_loop
6697 __ subi(len, len, step_1);
6698 __ j(L_by1_loop);
6699
6700 __ bind(L_nmax);
6701 __ sub(len, len, nmax);
6702 __ subi(count, nmax, 16);
6703 __ bltz(len, L_by16);
6704
6705 // Align L_nmax loop by 64
6706 __ bind(L_nmax_loop_entry);
6707 __ subi(count, count, 32);
6708
6709 __ bind(L_nmax_loop);
6710 adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
6711 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6712 vtemp1, vtemp2, step_64, Assembler::m4);
6713 __ subi(count, count, step_64);
6714 __ bgtz(count, L_nmax_loop);
6715
6716 // There are three iterations left to do
6717 adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
6718 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6719 vtemp1, vtemp2, step_32, Assembler::m2);
6720 adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
6721 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6722 vtemp1, vtemp2, step_16, Assembler::m1);
6723
6724 // s1 = s1 % BASE
6725 __ remuw(s1, s1, base);
6726 // s2 = s2 % BASE
6727 __ remuw(s2, s2, base);
6728
6729 __ sub(len, len, nmax);
6730 __ subi(count, nmax, 16);
6731 __ bgez(len, L_nmax_loop_entry);
6732
6733 __ bind(L_by16);
6734 __ add(len, len, count);
6735 __ bltz(len, L_by1);
6736 // Trying to unroll
6737 __ mv(temp3, step_64);
6738 __ blt(len, temp3, L_by16_loop);
6739
6740 __ bind(L_by16_loop_unroll);
6741 adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
6742 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6743 vtemp1, vtemp2, step_64, Assembler::m4);
6744 __ subi(len, len, step_64);
6745 // By now the temp3 should still be 64
6746 __ bge(len, temp3, L_by16_loop_unroll);
6747
6748 __ bind(L_by16_loop);
6749 adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
6750 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6751 vtemp1, vtemp2, step_16, Assembler::m1);
6752 __ subi(len, len, step_16);
6753 __ bgez(len, L_by16_loop);
6754
6755 __ bind(L_by1);
6756 __ addi(len, len, 15);
6757 __ bltz(len, L_do_mod);
6758
6759 __ bind(L_by1_loop);
6760 __ lbu(temp0, Address(buff, 0));
6761 __ addi(buff, buff, step_1);
6762 __ add(s1, temp0, s1);
6763 __ add(s2, s2, s1);
6764 __ subi(len, len, step_1);
6765 __ bgez(len, L_by1_loop);
6766
6767 __ bind(L_do_mod);
6768 // s1 = s1 % BASE
6769 __ remuw(s1, s1, base);
6770 // s2 = s2 % BASE
6771 __ remuw(s2, s2, base);
6772
6773 // Combine lower bits and higher bits
6774 // adler = s1 | (s2 << 16)
6775 __ bind(L_combine);
6776 __ slli(s2, s2, 16);
6777 __ orr(s1, s1, s2);
6778
6779 __ leave(); // Required for proper stackwalking of RuntimeStub frame
6780 __ ret();
6781
6782 return start;
6783 }
6784
6785 #endif // COMPILER2
6786
6787 // x10 = input (float16)
6788 // f10 = result (float)
6789 // t1 = temporary register
6790 address generate_float16ToFloat() {
6791 __ align(CodeEntryAlignment);
6792 StubId stub_id = StubId::stubgen_hf2f_id;
6793 StubCodeMark mark(this, stub_id);
6794 address entry = __ pc();
6795 BLOCK_COMMENT("float16ToFloat:");
6796
6797 FloatRegister dst = f10;
6798 Register src = x10;
6799 Label NaN_SLOW;
6800
6801 assert(VM_Version::supports_float16_float_conversion(), "must");
6802
6803 // On riscv, NaN needs a special process as fcvt does not work in that case.
6804 // On riscv, Inf does not need a special process as fcvt can handle it correctly.
6805 // but we consider to get the slow path to process NaN and Inf at the same time,
6806 // as both of them are rare cases, and if we try to get the slow path to handle
6807 // only NaN case it would sacrifise the performance for normal cases,
6808 // i.e. non-NaN and non-Inf cases.
6809
6810 // check whether it's a NaN or +/- Inf.
6811 __ mv(t0, 0x7c00);
6812 __ andr(t1, src, t0);
6813 // jump to stub processing NaN and Inf cases.
6814 __ beq(t0, t1, NaN_SLOW);
6815
6816 // non-NaN or non-Inf cases, just use built-in instructions.
6817 __ fmv_h_x(dst, src);
6818 __ fcvt_s_h(dst, dst);
6819 __ ret();
6820
6821 __ bind(NaN_SLOW);
6822 // following instructions mainly focus on NaN, as riscv does not handle
6823 // NaN well with fcvt, but the code also works for Inf at the same time.
6824
6825 // construct a NaN in 32 bits from the NaN in 16 bits,
6826 // we need the payloads of non-canonical NaNs to be preserved.
6827 __ mv(t1, 0x7f800000);
6828 // sign-bit was already set via sign-extension if necessary.
6829 __ slli(t0, src, 13);
6830 __ orr(t1, t0, t1);
6831 __ fmv_w_x(dst, t1);
6832
6833 __ ret();
6834 return entry;
6835 }
6836
6837 // f10 = input (float)
6838 // x10 = result (float16)
6839 // f11 = temporary float register
6840 // t1 = temporary register
6841 address generate_floatToFloat16() {
6842 __ align(CodeEntryAlignment);
6843 StubId stub_id = StubId::stubgen_f2hf_id;
6844 StubCodeMark mark(this, stub_id);
6845 address entry = __ pc();
6846 BLOCK_COMMENT("floatToFloat16:");
6847
6848 Register dst = x10;
6849 FloatRegister src = f10, ftmp = f11;
6850 Label NaN_SLOW;
6851
6852 assert(VM_Version::supports_float16_float_conversion(), "must");
6853
6854 // On riscv, NaN needs a special process as fcvt does not work in that case.
6855
6856 // check whether it's a NaN.
6857 // replace fclass with feq as performance optimization.
6858 __ feq_s(t0, src, src);
6859 // jump to stub processing NaN cases.
6860 __ beqz(t0, NaN_SLOW);
6861
6862 // non-NaN cases, just use built-in instructions.
6863 __ fcvt_h_s(ftmp, src);
6864 __ fmv_x_h(dst, ftmp);
6865 __ ret();
6866
6867 __ bind(NaN_SLOW);
6868
6869 __ float_to_float16_NaN(dst, src, t0, t1);
6870
6871 __ ret();
6872 return entry;
6873 }
6874
6875 #ifdef COMPILER2
6876
// Bit masks used as andi operands by the Poly1305 code in this file to
// isolate the lowest two / three bits of a register (presumably 0x3 and 0x7;
// right_n_bits is defined elsewhere).
static const int64_t right_2_bits = right_n_bits(2);
static const int64_t right_3_bits = right_n_bits(3);
6879
6880 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
6881 // are represented as long[5], with BITS_PER_LIMB = 26.
6882 // Pack five 26-bit limbs into three 64-bit registers.
6883 void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
6884 assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);
6885
6886 // The goal is to have 128-bit value in dest2:dest1:dest0
6887 __ ld(dest0, Address(src, 0)); // 26 bits in dest0
6888
6889 __ ld(tmp1, Address(src, sizeof(jlong)));
6890 __ slli(tmp1, tmp1, 26);
6891 __ add(dest0, dest0, tmp1); // 52 bits in dest0
6892
6893 __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
6894 __ slli(tmp1, tmp2, 52);
6895 __ add(dest0, dest0, tmp1); // dest0 is full
6896
6897 __ srli(dest1, tmp2, 12); // 14-bit in dest1
6898
6899 __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
6900 __ slli(tmp1, tmp1, 14);
6901 __ add(dest1, dest1, tmp1); // 40-bit in dest1
6902
6903 __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
6904 __ slli(tmp2, tmp1, 40);
6905 __ add(dest1, dest1, tmp2); // dest1 is full
6906
6907 if (dest2->is_valid()) {
6908 __ srli(tmp1, tmp1, 24);
6909 __ mv(dest2, tmp1); // 2 bits in dest2
6910 } else {
6911 #ifdef ASSERT
6912 Label OK;
6913 __ srli(tmp1, tmp1, 24);
6914 __ beq(zr, tmp1, OK); // 2 bits
6915 __ stop("high bits of Poly1305 integer should be zero");
6916 __ should_not_reach_here();
6917 __ bind(OK);
6918 #endif
6919 }
6920 }
6921
6922 // As above, but return only a 128-bit integer, packed into two
6923 // 64-bit registers.
6924 void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
6925 poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2);
6926 }
6927
// U_2:U_1:U_0: += (U_2 >> 2) * 5
//
// Emits the partial reduction step used by the Poly1305 stub: everything in
// U_2 above its two lowest bits is removed from U_2 and added back, times
// five, at the bottom of the three-word value. The multiplication by five is
// done as x + (x << 2), in two carry-propagating passes.
//
// Parameters:
//   U_2, U_1, U_0 - the three 64-bit words of the value, updated in place
//   tmp1, tmp2    - scratch registers, clobbered
void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
  assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);

  // First, U_2:U_1:U_0 += (U_2 >> 2)
  __ srli(tmp1, U_2, 2);
  __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
  __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
  __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
  __ add(U_2, U_2, tmp2);

  // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
  // (tmp1 still holds the original U_2 >> 2 from the first pass)
  __ slli(tmp1, tmp1, 2);
  __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
  __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
  __ add(U_2, U_2, tmp2);
}
6945
6946 // Poly1305, RFC 7539
6947 // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
6948
6949 // Arguments:
6950 // c_rarg0: input_start -- where the input is stored
6951 // c_rarg1: length
6952 // c_rarg2: acc_start -- where the output will be stored
6953 // c_rarg3: r_start -- where the randomly generated 128-bit key is stored
6954
6955 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
6956 // description of the tricks used to simplify and accelerate this
6957 // computation.
6958
6959 address generate_poly1305_processBlocks() {
6960 __ align(CodeEntryAlignment);
6961 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
6962 StubCodeMark mark(this, stub_id);
6963 address start = __ pc();
6964 __ enter();
6965 Label here;
6966
6967 RegSet saved_regs = RegSet::range(x18, x21);
6968 RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
6969 __ push_reg(saved_regs, sp);
6970
6971 // Arguments
6972 const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;
6973
6974 // R_n is the 128-bit randomly-generated key, packed into two
6975 // registers. The caller passes this key to us as long[5], with
6976 // BITS_PER_LIMB = 26.
6977 const Register R_0 = *regs, R_1 = *++regs;
6978 poly1305_pack_26(R_0, R_1, r_start, t1, t2);
6979
6980 // RR_n is (R_n >> 2) * 5
6981 const Register RR_0 = *++regs, RR_1 = *++regs;
6982 __ srli(t1, R_0, 2);
6983 __ shadd(RR_0, t1, t1, t2, 2);
6984 __ srli(t1, R_1, 2);
6985 __ shadd(RR_1, t1, t1, t2, 2);
6986
6987 // U_n is the current checksum
6988 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
6989 poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);
6990
6991 static constexpr int BLOCK_LENGTH = 16;
6992 Label DONE, LOOP;
6993
6994 __ mv(t1, BLOCK_LENGTH);
6995 __ blt(length, t1, DONE); {
6996 __ bind(LOOP);
6997
6998 // S_n is to be the sum of U_n and the next block of data
6999 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7000 __ ld(S_0, Address(input_start, 0));
7001 __ ld(S_1, Address(input_start, wordSize));
7002
7003 __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1
7004 __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
7005 __ add(S_2, U_2, t1);
7006
7007 __ addi(S_2, S_2, 1);
7008
7009 const Register U_0HI = *++regs, U_1HI = *++regs;
7010
7011 // NB: this logic depends on some of the special properties of
7012 // Poly1305 keys. In particular, because we know that the top
7013 // four bits of R_0 and R_1 are zero, we can add together
7014 // partial products without any risk of needing to propagate a
7015 // carry out.
7016 __ wide_mul(U_0, U_0HI, S_0, R_0);
7017 __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
7018 __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);
7019
7020 __ wide_mul(U_1, U_1HI, S_0, R_1);
7021 __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
7022 __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);
7023
7024 __ andi(U_2, R_0, right_2_bits);
7025 __ mul(U_2, S_2, U_2);
7026
7027 // Partial reduction mod 2**130 - 5
7028 __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
7029 __ adc(U_2, U_2, U_1HI, t1);
7030 // Sum is now in U_2:U_1:U_0.
7031
7032 // U_2:U_1:U_0: += (U_2 >> 2) * 5
7033 poly1305_reduce(U_2, U_1, U_0, t1, t2);
7034
7035 __ subi(length, length, BLOCK_LENGTH);
7036 __ addi(input_start, input_start, BLOCK_LENGTH);
7037 __ mv(t1, BLOCK_LENGTH);
7038 __ bge(length, t1, LOOP);
7039 }
7040
7041 // Further reduce modulo 2^130 - 5
7042 poly1305_reduce(U_2, U_1, U_0, t1, t2);
7043
7044 // Unpack the sum into five 26-bit limbs and write to memory.
7045 // First 26 bits is the first limb
7046 __ slli(t1, U_0, 38); // Take lowest 26 bits
7047 __ srli(t1, t1, 38);
7048 __ sd(t1, Address(acc_start)); // First 26-bit limb
7049
7050 // 27-52 bits of U_0 is the second limb
7051 __ slli(t1, U_0, 12); // Take next 27-52 bits
7052 __ srli(t1, t1, 38);
7053 __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb
7054
7055 // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register
7056 __ srli(t1, U_0, 52);
7057 __ slli(t2, U_1, 50);
7058 __ srli(t2, t2, 38);
7059 __ add(t1, t1, t2);
7060 __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb
7061
7062 // Storing 15-40 bits of U_1
7063 __ slli(t1, U_1, 24); // Already used up 14 bits
7064 __ srli(t1, t1, 38); // Clear all other bits from t1
7065 __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb
7066
7067 // Storing 41-64 bits of U_1 and first three bits from U_2 in one register
7068 __ srli(t1, U_1, 40);
7069 __ andi(t2, U_2, right_3_bits);
7070 __ slli(t2, t2, 24);
7071 __ add(t1, t1, t2);
7072 __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb
7073
7074 __ bind(DONE);
7075 __ pop_reg(saved_regs, sp);
7076 __ leave(); // Required for proper stackwalking
7077 __ ret();
7078
7079 return start;
7080 }
7081
7082 address generate_arrays_hashcode_powers_of_31() {
7083 assert(UseRVV, "sanity");
7084 const int lmul = 2;
7085 const int stride = MaxVectorSize / sizeof(jint) * lmul;
7086 __ align(CodeEntryAlignment);
7087 StubCodeMark mark(this, "StubRoutines", "arrays_hashcode_powers_of_31");
7088 address start = __ pc();
7089 for (int i = stride; i >= 0; i--) {
7090 jint power_of_31 = 1;
7091 for (int j = i; j > 0; j--) {
7092 power_of_31 = java_multiply(power_of_31, 31);
7093 }
7094 __ emit_int32(power_of_31);
7095 }
7096
7097 return start;
7098 }
7099
7100 #endif // COMPILER2
7101
7102 /**
7103 * Arguments:
7104 *
7105 * Inputs:
7106 * c_rarg0 - int crc
7107 * c_rarg1 - byte* buf
7108 * c_rarg2 - int length
7109 *
7110 * Output:
7111 * c_rarg0 - int crc result
7112 */
7113 address generate_updateBytesCRC32() {
7114 assert(UseCRC32Intrinsics, "what are we doing here?");
7115
7116 __ align(CodeEntryAlignment);
7117 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7118 StubCodeMark mark(this, stub_id);
7119
7120 address start = __ pc();
7121
7122 // input parameters
7123 const Register crc = c_rarg0; // crc
7124 const Register buf = c_rarg1; // source java byte array address
7125 const Register len = c_rarg2; // length
7126
7127 BLOCK_COMMENT("Entry:");
7128 __ enter(); // required for proper stackwalking of RuntimeStub frame
7129
7130 __ kernel_crc32(crc, buf, len,
7131 c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables
7132 c_rarg7, t2, t3, t4, t5, t6); // misc tmps
7133
7134 __ leave(); // required for proper stackwalking of RuntimeStub frame
7135 __ ret();
7136
7137 return start;
7138 }
7139
7140 // exception handler for upcall stubs
7141 address generate_upcall_stub_exception_handler() {
7142 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
7143 StubCodeMark mark(this, stub_id);
7144 address start = __ pc();
7145
7146 // Native caller has no idea how to handle exceptions,
7147 // so we just crash here. Up to callee to catch exceptions.
7148 __ verify_oop(x10); // return a exception oop in a0
7149 __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
7150 __ should_not_reach_here();
7151
7152 return start;
7153 }
7154
7155 // load Method* target of MethodHandle
7156 // j_rarg0 = jobject receiver
7157 // xmethod = Method* result
7158 address generate_upcall_stub_load_target() {
7159
7160 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
7161 StubCodeMark mark(this, stub_id);
7162 address start = __ pc();
7163
7164 __ resolve_global_jobject(j_rarg0, t0, t1);
7165 // Load target method from receiver
7166 __ load_heap_oop(xmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), t0, t1);
7167 __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()), t0, t1);
7168 __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_MemberName::method_offset()), t0, t1);
7169 __ access_load_at(T_ADDRESS, IN_HEAP, xmethod,
7170 Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
7171 noreg, noreg);
7172 __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
7173
7174 __ ret();
7175
7176 return start;
7177 }
7178
7179 #undef __
7180
// Initialization

// Stub group selected by BlobId::stubgen_preuniverse_id. Intentionally empty.
void generate_preuniverse_stubs() {
  // preuniverse stubs are not needed for riscv
}
7185
7186 void generate_initial_stubs() {
7187 // Generate initial stubs and initializes the entry points
7188
7189 // entry points that exist in all platforms Note: This is code
7190 // that could be shared among different platforms - however the
7191 // benefit seems to be smaller than the disadvantage of having a
7192 // much more complicated generator structure. See also comment in
7193 // stubRoutines.hpp.
7194
7195 StubRoutines::_forward_exception_entry = generate_forward_exception();
7196
7197 if (UnsafeMemoryAccess::_table == nullptr) {
7198 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
7199 }
7200
7201 StubRoutines::_call_stub_entry =
7202 generate_call_stub(StubRoutines::_call_stub_return_address);
7203
7204 // is referenced by megamorphic call
7205 StubRoutines::_catch_exception_entry = generate_catch_exception();
7206
7207 if (UseCRC32Intrinsics) {
7208 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7209 }
7210
7211 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
7212 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
7213 StubRoutines::_hf2f = generate_float16ToFloat();
7214 StubRoutines::_f2hf = generate_floatToFloat16();
7215 }
7216 }
7217
// Generates the continuation stub group (thaw, the two return barriers and
// the preempt stub) and publishes the entry points in StubRoutines.
void generate_continuation_stubs() {
  // Continuation stubs:
  StubRoutines::_cont_thaw = generate_cont_thaw();
  StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
  StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
  StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
}
7225
// Generates the final stub group and publishes the entry points in
// StubRoutines. The last step marks the riscv stub routines as completed.
void generate_final_stubs() {
  // support for verify_oop (must happen after universe_init)
  if (VerifyOops) {
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
  }

  // arraycopy stubs used by compilers
  generate_arraycopy_stubs();

  StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

#ifdef COMPILER2
  // Secondary supers table lookup: the slow path stub is generated whenever
  // the table is in use, the lookup stub itself only when the test is not
  // inlined.
  if (UseSecondarySupersTable) {
    StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
    if (!InlineSecondarySupersTest) {
      generate_lookup_secondary_supers_table_stub();
    }
  }
#endif // COMPILER2

  // upcall support
  StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
  StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

  StubRoutines::riscv::set_completed();
}
7251
// Generates the compiler stub group. The whole body is compiled only under
// COMPILER2; each stub (or pair of stubs) is additionally gated on the flag
// of the intrinsic it implements. Entry points are published in StubRoutines.
void generate_compiler_stubs() {
#ifdef COMPILER2
  // BigInteger arithmetic
  if (UseMulAddIntrinsic) {
    StubRoutines::_mulAdd = generate_mulAdd();
  }

  if (UseMultiplyToLenIntrinsic) {
    StubRoutines::_multiplyToLen = generate_multiplyToLen();
  }

  if (UseSquareToLenIntrinsic) {
    StubRoutines::_squareToLen = generate_squareToLen();
  }

  // The Montgomery stubs come from a separate generator class, so the
  // StubCodeMark is created here rather than inside a generate_* method.
  if (UseMontgomeryMultiplyIntrinsic) {
    StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
    StubCodeMark mark(this, stub_id);
    MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
    StubRoutines::_montgomeryMultiply = g.generate_multiply();
  }

  if (UseMontgomerySquareIntrinsic) {
    StubId stub_id = StubId::stubgen_montgomerySquare_id;
    StubCodeMark mark(this, stub_id);
    MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
    StubRoutines::_montgomerySquare = g.generate_square();
  }

  // AES
  if (UseAESIntrinsics) {
    StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
    StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
    StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
    StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
  }

  if (UseAESCTRIntrinsics) {
    StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
  }

  if (UseGHASHIntrinsics) {
    StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
  }

  if (UsePoly1305Intrinsics) {
    StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
  }

  // The BigInteger shift workers are generated only when RVV is in use.
  if (UseRVV) {
    StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
  }

  // Table of powers of 31 for the vectorized hash code intrinsic
  if (UseVectorizedHashCodeIntrinsic && UseRVV) {
    StubRoutines::riscv::_arrays_hashcode_powers_of_31 = generate_arrays_hashcode_powers_of_31();
  }

  // Message digests: each has a single-block and a multi-block (MB) variant
  if (UseSHA256Intrinsics) {
    Sha2Generator sha2(_masm, this);
    StubRoutines::_sha256_implCompress = sha2.generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
    StubRoutines::_sha256_implCompressMB = sha2.generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
  }

  if (UseSHA512Intrinsics) {
    Sha2Generator sha2(_masm, this);
    StubRoutines::_sha512_implCompress = sha2.generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
    StubRoutines::_sha512_implCompressMB = sha2.generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
  }

  if (UseMD5Intrinsics) {
    StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
    StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
  }

  if (UseChaCha20Intrinsics) {
    StubRoutines::_chacha20Block = generate_chacha20Block();
  }

  if (UseSHA1Intrinsics) {
    StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
    StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
  }

  if (UseBASE64Intrinsics) {
    StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
  }

  if (UseAdler32Intrinsics) {
    StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
  }

  // String stubs are generated unconditionally
  generate_compare_long_strings();

  generate_string_indexof_stubs();

#endif // COMPILER2
}
7349
7350 public:
7351 StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
7352 switch(blob_id) {
7353 case BlobId::stubgen_preuniverse_id:
7354 generate_preuniverse_stubs();
7355 break;
7356 case BlobId::stubgen_initial_id:
7357 generate_initial_stubs();
7358 break;
7359 case BlobId::stubgen_continuation_id:
7360 generate_continuation_stubs();
7361 break;
7362 case BlobId::stubgen_compiler_id:
7363 generate_compiler_stubs();
7364 break;
7365 case BlobId::stubgen_final_id:
7366 generate_final_stubs();
7367 break;
7368 default:
7369 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
7370 break;
7371 };
7372 }
7373 }; // end class declaration
7374
7375 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
7376 StubGenerator g(code, blob_id, stub_data);
7377 }