1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * Copyright (c) 2020, 2025, Huawei Technologies Co., Ltd. All rights reserved.
5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
6 *
7 * This code is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License version 2 only, as
9 * published by the Free Software Foundation.
10 *
11 * This code is distributed in the hope that it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 * version 2 for more details (a copy is included in the LICENSE file that
15 * accompanied this code).
16 *
17 * You should have received a copy of the GNU General Public License version
18 * 2 along with this work; if not, write to the Free Software Foundation,
19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
20 *
21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
22 * or visit www.oracle.com if you need additional information or have any
23 * questions.
24 *
25 */
26
27 #include "asm/macroAssembler.hpp"
28 #include "asm/macroAssembler.inline.hpp"
29 #include "compiler/oopMap.hpp"
30 #include "gc/shared/barrierSet.hpp"
31 #include "gc/shared/barrierSetAssembler.hpp"
32 #include "interpreter/interpreter.hpp"
33 #include "memory/universe.hpp"
34 #include "nativeInst_riscv.hpp"
35 #include "oops/instanceOop.hpp"
36 #include "oops/method.hpp"
37 #include "oops/objArrayKlass.hpp"
38 #include "oops/oop.inline.hpp"
39 #include "prims/methodHandles.hpp"
40 #include "prims/upcallLinker.hpp"
41 #include "runtime/continuation.hpp"
42 #include "runtime/continuationEntry.inline.hpp"
43 #include "runtime/frame.inline.hpp"
44 #include "runtime/handles.inline.hpp"
45 #include "runtime/javaThread.hpp"
46 #include "runtime/sharedRuntime.hpp"
47 #include "runtime/stubCodeGenerator.hpp"
48 #include "runtime/stubRoutines.hpp"
49 #include "utilities/align.hpp"
50 #include "utilities/powerOfTwo.hpp"
51 #ifdef COMPILER2
52 #include "opto/runtime.hpp"
53 #endif
54
55 // Declaration and definition of StubGenerator (no .hpp file).
56 // For a more detailed description of the stub routine structure
57 // see the comment in stubRoutines.hpp
58
59 #undef __
60 #define __ _masm->
61
62 #ifdef PRODUCT
63 #define BLOCK_COMMENT(str) /* nothing */
64 #else
65 #define BLOCK_COMMENT(str) __ block_comment(str)
66 #endif
67
68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
69
70 // Stub Code definitions
71
72 class StubGenerator: public StubCodeGenerator {
73 private:
74
75 #ifdef PRODUCT
76 #define inc_counter_np(counter) ((void)0)
77 #else
78 void inc_counter_np_(uint& counter) {
79 __ incrementw(ExternalAddress((address)&counter));
80 }
81 #define inc_counter_np(counter) \
82 BLOCK_COMMENT("inc_counter " #counter); \
83 inc_counter_np_(counter);
84 #endif
85
86 // Call stubs are used to call Java from C
87 //
88 // Arguments:
89 // c_rarg0: call wrapper address address
90 // c_rarg1: result address
91 // c_rarg2: result type BasicType
92 // c_rarg3: method Method*
93 // c_rarg4: (interpreter) entry point address
94 // c_rarg5: parameters intptr_t*
95 // c_rarg6: parameter size (in words) int
96 // c_rarg7: thread Thread*
97 //
98 // There is no return from the stub itself as any Java result
99 // is written to result
100 //
101 // we save x1 (ra) as the return PC at the base of the frame and
102 // link x8 (fp) below it as the frame pointer installing sp (x2)
103 // into fp.
104 //
105 // we save x10-x17, which accounts for all the c arguments.
106 //
107 // TODO: strictly do we need to save them all? they are treated as
108 // volatile by C so could we omit saving the ones we are going to
109 // place in global registers (thread? method?) or those we only use
110 // during setup of the Java call?
111 //
// we don't need to save x5 (t0), which the RISC-V C calling convention
// treats as a caller-saved temporary register.
114 //
115 // we don't need to save x6-x7 and x28-x31 which both C and Java treat as
116 // volatile
117 //
118 // we save x9, x18-x27, f8-f9, and f18-f27 which Java uses as temporary
119 // registers and C expects to be callee-save
120 //
121 // so the stub frame looks like this when we enter Java code
122 //
123 // [ return_from_Java ] <--- sp
124 // [ argument word n ]
125 // ...
126 // -35 [ argument word 1 ]
127 // -34 [ saved FRM in Floating-point Control and Status Register ] <--- sp_after_call
128 // -33 [ saved f27 ]
129 // -32 [ saved f26 ]
130 // -31 [ saved f25 ]
131 // -30 [ saved f24 ]
132 // -29 [ saved f23 ]
133 // -28 [ saved f22 ]
134 // -27 [ saved f21 ]
135 // -26 [ saved f20 ]
136 // -25 [ saved f19 ]
137 // -24 [ saved f18 ]
138 // -23 [ saved f9 ]
139 // -22 [ saved f8 ]
140 // -21 [ saved x27 ]
141 // -20 [ saved x26 ]
142 // -19 [ saved x25 ]
143 // -18 [ saved x24 ]
144 // -17 [ saved x23 ]
145 // -16 [ saved x22 ]
146 // -15 [ saved x21 ]
147 // -14 [ saved x20 ]
148 // -13 [ saved x19 ]
149 // -12 [ saved x18 ]
150 // -11 [ saved x9 ]
151 // -10 [ call wrapper (x10) ]
152 // -9 [ result (x11) ]
153 // -8 [ result type (x12) ]
154 // -7 [ method (x13) ]
155 // -6 [ entry point (x14) ]
156 // -5 [ parameters (x15) ]
157 // -4 [ parameter size (x16) ]
158 // -3 [ thread (x17) ]
159 // -2 [ saved fp (x8) ]
160 // -1 [ saved ra (x1) ]
161 // 0 [ ] <--- fp == saved sp (x2)
162
// Call stub frame layout: offset of each slot from fp, counted in words.
// Slots are listed starting next to fp and moving down towards sp.
enum call_stub_layout {
  // return address and previous frame pointer
  retaddr_off        = -1,
  fp_f               = -2,

  // the eight incoming C arguments (x17 down to x10)
  thread_off         = -3,
  parameter_size_off = -4,
  parameters_off     = -5,
  entry_point_off    = -6,
  method_off         = -7,
  result_type_off    = -8,
  result_off         = -9,
  call_wrapper_off   = -10,

  // saved integer registers x9 and x18-x27
  x9_off             = -11,
  x18_off            = -12,
  x19_off            = -13,
  x20_off            = -14,
  x21_off            = -15,
  x22_off            = -16,
  x23_off            = -17,
  x24_off            = -18,
  x25_off            = -19,
  x26_off            = -20,
  x27_off            = -21,

  // saved floating-point registers f8-f9 and f18-f27
  f8_off             = -22,
  f9_off             = -23,
  f18_off            = -24,
  f19_off            = -25,
  f20_off            = -26,
  f21_off            = -27,
  f22_off            = -28,
  f23_off            = -29,
  f24_off            = -30,
  f25_off            = -31,
  f26_off            = -32,
  f27_off            = -33,

  // saved floating-point rounding mode; this is the lowest slot of the
  // save area, so it also marks where sp sits once the frame is set up
  frm_off            = -34,
  sp_after_call_off  = frm_off,
};
204
205 address generate_call_stub(address& return_address) {
206 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
207 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
208 "adjust this code");
209
210 StubId stub_id = StubId::stubgen_call_stub_id;
211 StubCodeMark mark(this, stub_id);
212 address start = __ pc();
213
214 const Address sp_after_call (fp, sp_after_call_off * wordSize);
215
216 const Address frm_save (fp, frm_off * wordSize);
217 const Address call_wrapper (fp, call_wrapper_off * wordSize);
218 const Address result (fp, result_off * wordSize);
219 const Address result_type (fp, result_type_off * wordSize);
220 const Address method (fp, method_off * wordSize);
221 const Address entry_point (fp, entry_point_off * wordSize);
222 const Address parameters (fp, parameters_off * wordSize);
223 const Address parameter_size(fp, parameter_size_off * wordSize);
224
225 const Address thread (fp, thread_off * wordSize);
226
227 const Address f27_save (fp, f27_off * wordSize);
228 const Address f26_save (fp, f26_off * wordSize);
229 const Address f25_save (fp, f25_off * wordSize);
230 const Address f24_save (fp, f24_off * wordSize);
231 const Address f23_save (fp, f23_off * wordSize);
232 const Address f22_save (fp, f22_off * wordSize);
233 const Address f21_save (fp, f21_off * wordSize);
234 const Address f20_save (fp, f20_off * wordSize);
235 const Address f19_save (fp, f19_off * wordSize);
236 const Address f18_save (fp, f18_off * wordSize);
237 const Address f9_save (fp, f9_off * wordSize);
238 const Address f8_save (fp, f8_off * wordSize);
239
240 const Address x27_save (fp, x27_off * wordSize);
241 const Address x26_save (fp, x26_off * wordSize);
242 const Address x25_save (fp, x25_off * wordSize);
243 const Address x24_save (fp, x24_off * wordSize);
244 const Address x23_save (fp, x23_off * wordSize);
245 const Address x22_save (fp, x22_off * wordSize);
246 const Address x21_save (fp, x21_off * wordSize);
247 const Address x20_save (fp, x20_off * wordSize);
248 const Address x19_save (fp, x19_off * wordSize);
249 const Address x18_save (fp, x18_off * wordSize);
250
251 const Address x9_save (fp, x9_off * wordSize);
252
253 // stub code
254
255 address riscv_entry = __ pc();
256
257 // set up frame and move sp to end of save area
258 __ enter();
259 __ addi(sp, fp, sp_after_call_off * wordSize);
260
261 // save register parameters and Java temporary/global registers
262 // n.b. we save thread even though it gets installed in
263 // xthread because we want to sanity check tp later
264 __ sd(c_rarg7, thread);
265 __ sw(c_rarg6, parameter_size);
266 __ sd(c_rarg5, parameters);
267 __ sd(c_rarg4, entry_point);
268 __ sd(c_rarg3, method);
269 __ sd(c_rarg2, result_type);
270 __ sd(c_rarg1, result);
271 __ sd(c_rarg0, call_wrapper);
272
273 __ sd(x9, x9_save);
274
275 __ sd(x18, x18_save);
276 __ sd(x19, x19_save);
277 __ sd(x20, x20_save);
278 __ sd(x21, x21_save);
279 __ sd(x22, x22_save);
280 __ sd(x23, x23_save);
281 __ sd(x24, x24_save);
282 __ sd(x25, x25_save);
283 __ sd(x26, x26_save);
284 __ sd(x27, x27_save);
285
286 __ fsd(f8, f8_save);
287 __ fsd(f9, f9_save);
288 __ fsd(f18, f18_save);
289 __ fsd(f19, f19_save);
290 __ fsd(f20, f20_save);
291 __ fsd(f21, f21_save);
292 __ fsd(f22, f22_save);
293 __ fsd(f23, f23_save);
294 __ fsd(f24, f24_save);
295 __ fsd(f25, f25_save);
296 __ fsd(f26, f26_save);
297 __ fsd(f27, f27_save);
298
299 __ frrm(t0);
300 __ sd(t0, frm_save);
301 // Set frm to the state we need. We do want Round to Nearest. We
302 // don't want non-IEEE rounding modes.
303 Label skip_fsrmi;
304 guarantee(__ RoundingMode::rne == 0, "must be");
305 __ beqz(t0, skip_fsrmi);
306 __ fsrmi(__ RoundingMode::rne);
307 __ bind(skip_fsrmi);
308
309 // install Java thread in global register now we have saved
310 // whatever value it held
311 __ mv(xthread, c_rarg7);
312
313 // And method
314 __ mv(xmethod, c_rarg3);
315
316 // set up the heapbase register
317 __ reinit_heapbase();
318
319 #ifdef ASSERT
320 // make sure we have no pending exceptions
321 {
322 Label L;
323 __ ld(t0, Address(xthread, in_bytes(Thread::pending_exception_offset())));
324 __ beqz(t0, L);
325 __ stop("StubRoutines::call_stub: entered with pending exception");
326 __ BIND(L);
327 }
328 #endif
329 // pass parameters if any
330 __ mv(esp, sp);
331 __ slli(t0, c_rarg6, LogBytesPerWord);
332 __ sub(t0, sp, t0); // Move SP out of the way
333 __ andi(sp, t0, -2 * wordSize);
334
335 BLOCK_COMMENT("pass parameters if any");
336 Label parameters_done;
337 // parameter count is still in c_rarg6
338 // and parameter pointer identifying param 1 is in c_rarg5
339 __ beqz(c_rarg6, parameters_done);
340
341 address loop = __ pc();
342 __ ld(t0, Address(c_rarg5, 0));
343 __ addi(c_rarg5, c_rarg5, wordSize);
344 __ subi(c_rarg6, c_rarg6, 1);
345 __ push_reg(t0);
346 __ bgtz(c_rarg6, loop);
347
348 __ BIND(parameters_done);
349
350 // call Java entry -- passing methdoOop, and current sp
351 // xmethod: Method*
352 // x19_sender_sp: sender sp
353 BLOCK_COMMENT("call Java function");
354 __ mv(x19_sender_sp, sp);
355 __ jalr(c_rarg4);
356
357 // save current address for use by exception handling code
358
359 return_address = __ pc();
360
361 // store result depending on type (everything that is not
362 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
363 // n.b. this assumes Java returns an integral result in x10
364 // and a floating result in j_farg0
365 __ ld(j_rarg2, result);
366 Label is_long, is_float, is_double, exit;
367 __ ld(j_rarg1, result_type);
368 __ mv(t0, (u1)T_OBJECT);
369 __ beq(j_rarg1, t0, is_long);
370 __ mv(t0, (u1)T_LONG);
371 __ beq(j_rarg1, t0, is_long);
372 __ mv(t0, (u1)T_FLOAT);
373 __ beq(j_rarg1, t0, is_float);
374 __ mv(t0, (u1)T_DOUBLE);
375 __ beq(j_rarg1, t0, is_double);
376
377 // handle T_INT case
378 __ sw(x10, Address(j_rarg2));
379
380 __ BIND(exit);
381
382 // pop parameters
383 __ addi(esp, fp, sp_after_call_off * wordSize);
384
385 #ifdef ASSERT
386 // verify that threads correspond
387 {
388 Label L, S;
389 __ ld(t0, thread);
390 __ bne(xthread, t0, S);
391 __ get_thread(t0);
392 __ beq(xthread, t0, L);
393 __ BIND(S);
394 __ stop("StubRoutines::call_stub: threads must correspond");
395 __ BIND(L);
396 }
397 #endif
398
399 __ pop_cont_fastpath(xthread);
400
401 // restore callee-save registers
402 __ fld(f27, f27_save);
403 __ fld(f26, f26_save);
404 __ fld(f25, f25_save);
405 __ fld(f24, f24_save);
406 __ fld(f23, f23_save);
407 __ fld(f22, f22_save);
408 __ fld(f21, f21_save);
409 __ fld(f20, f20_save);
410 __ fld(f19, f19_save);
411 __ fld(f18, f18_save);
412 __ fld(f9, f9_save);
413 __ fld(f8, f8_save);
414
415 __ ld(x27, x27_save);
416 __ ld(x26, x26_save);
417 __ ld(x25, x25_save);
418 __ ld(x24, x24_save);
419 __ ld(x23, x23_save);
420 __ ld(x22, x22_save);
421 __ ld(x21, x21_save);
422 __ ld(x20, x20_save);
423 __ ld(x19, x19_save);
424 __ ld(x18, x18_save);
425
426 __ ld(x9, x9_save);
427
428 // restore frm
429 Label skip_fsrm;
430 __ ld(t0, frm_save);
431 __ frrm(t1);
432 __ beq(t0, t1, skip_fsrm);
433 __ fsrm(t0);
434 __ bind(skip_fsrm);
435
436 __ ld(c_rarg0, call_wrapper);
437 __ ld(c_rarg1, result);
438 __ ld(c_rarg2, result_type);
439 __ ld(c_rarg3, method);
440 __ ld(c_rarg4, entry_point);
441 __ ld(c_rarg5, parameters);
442 __ ld(c_rarg6, parameter_size);
443 __ ld(c_rarg7, thread);
444
445 // leave frame and return to caller
446 __ leave();
447 __ ret();
448
449 // handle return types different from T_INT
450
451 __ BIND(is_long);
452 __ sd(x10, Address(j_rarg2, 0));
453 __ j(exit);
454
455 __ BIND(is_float);
456 __ fsw(j_farg0, Address(j_rarg2, 0), t0);
457 __ j(exit);
458
459 __ BIND(is_double);
460 __ fsd(j_farg0, Address(j_rarg2, 0), t0);
461 __ j(exit);
462
463 return start;
464 }
465
466 // Return point for a Java call if there's an exception thrown in
467 // Java code. The exception is caught and transformed into a
468 // pending exception stored in JavaThread that can be tested from
469 // within the VM.
470 //
471 // Note: Usually the parameters are removed by the callee. In case
472 // of an exception crossing an activation frame boundary, that is
473 // not the case if the callee is compiled code => need to setup the
474 // sp.
475 //
476 // x10: exception oop
477
478 address generate_catch_exception() {
479 StubId stub_id = StubId::stubgen_catch_exception_id;
480 StubCodeMark mark(this, stub_id);
481 address start = __ pc();
482
483 // same as in generate_call_stub():
484 const Address thread(fp, thread_off * wordSize);
485
486 #ifdef ASSERT
487 // verify that threads correspond
488 {
489 Label L, S;
490 __ ld(t0, thread);
491 __ bne(xthread, t0, S);
492 __ get_thread(t0);
493 __ beq(xthread, t0, L);
494 __ bind(S);
495 __ stop("StubRoutines::catch_exception: threads must correspond");
496 __ bind(L);
497 }
498 #endif
499
500 // set pending exception
501 __ verify_oop(x10);
502
503 __ sd(x10, Address(xthread, Thread::pending_exception_offset()));
504 __ mv(t0, (address)__FILE__);
505 __ sd(t0, Address(xthread, Thread::exception_file_offset()));
506 __ mv(t0, (int)__LINE__);
507 __ sw(t0, Address(xthread, Thread::exception_line_offset()));
508
509 // complete return to VM
510 assert(StubRoutines::_call_stub_return_address != nullptr,
511 "_call_stub_return_address must have been generated before");
512 __ j(RuntimeAddress(StubRoutines::_call_stub_return_address));
513
514 return start;
515 }
516
517 // Continuation point for runtime calls returning with a pending
518 // exception. The pending exception check happened in the runtime
519 // or native call stub. The pending exception in Thread is
520 // converted into a Java-level exception.
521 //
522 // Contract with Java-level exception handlers:
523 // x10: exception
524 // x13: throwing pc
525 //
526 // NOTE: At entry of this stub, exception-pc must be in RA !!
527
528 // NOTE: this is always used as a jump target within generated code
529 // so it just needs to be generated code with no x86 prolog
530
531 address generate_forward_exception() {
532 StubId stub_id = StubId::stubgen_forward_exception_id;
533 StubCodeMark mark(this, stub_id);
534 address start = __ pc();
535
536 // Upon entry, RA points to the return address returning into
537 // Java (interpreted or compiled) code; i.e., the return address
538 // becomes the throwing pc.
539 //
540 // Arguments pushed before the runtime call are still on the stack
541 // but the exception handler will reset the stack pointer ->
542 // ignore them. A potential result in registers can be ignored as
543 // well.
544
545 #ifdef ASSERT
546 // make sure this code is only executed if there is a pending exception
547 {
548 Label L;
549 __ ld(t0, Address(xthread, Thread::pending_exception_offset()));
550 __ bnez(t0, L);
551 __ stop("StubRoutines::forward exception: no pending exception (1)");
552 __ bind(L);
553 }
554 #endif
555
556 // compute exception handler into x9
557
558 // call the VM to find the handler address associated with the
559 // caller address. pass thread in x10 and caller pc (ret address)
560 // in x11. n.b. the caller pc is in ra, unlike x86 where it is on
561 // the stack.
562 __ mv(c_rarg1, ra);
563 // ra will be trashed by the VM call so we move it to x9
564 // (callee-saved) because we also need to pass it to the handler
565 // returned by this call.
566 __ mv(x9, ra);
567 BLOCK_COMMENT("call exception_handler_for_return_address");
568 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
569 SharedRuntime::exception_handler_for_return_address),
570 xthread, c_rarg1);
571 // we should not really care that ra is no longer the callee
572 // address. we saved the value the handler needs in x9 so we can
573 // just copy it to x13. however, the C2 handler will push its own
574 // frame and then calls into the VM and the VM code asserts that
575 // the PC for the frame above the handler belongs to a compiled
576 // Java method. So, we restore ra here to satisfy that assert.
577 __ mv(ra, x9);
578 // setup x10 & x13 & clear pending exception
579 __ mv(x13, x9);
580 __ mv(x9, x10);
581 __ ld(x10, Address(xthread, Thread::pending_exception_offset()));
582 __ sd(zr, Address(xthread, Thread::pending_exception_offset()));
583
584 #ifdef ASSERT
585 // make sure exception is set
586 {
587 Label L;
588 __ bnez(x10, L);
589 __ stop("StubRoutines::forward exception: no pending exception (2)");
590 __ bind(L);
591 }
592 #endif
593
594 // continue at exception handler
595 // x10: exception
596 // x13: throwing pc
597 // x9: exception handler
598 __ verify_oop(x10);
599 __ jr(x9);
600
601 return start;
602 }
603
604 // Non-destructive plausibility checks for oops
605 //
606 // Arguments:
607 // x10: oop to verify
608 // t0: error message
609 //
610 // Stack after saving c_rarg3:
611 // [tos + 0]: saved c_rarg3
612 // [tos + 1]: saved c_rarg2
613 // [tos + 2]: saved ra
614 // [tos + 3]: saved t1
615 // [tos + 4]: saved x10
616 // [tos + 5]: saved t0
617 address generate_verify_oop() {
618
619 StubId stub_id = StubId::stubgen_verify_oop_id;
620 StubCodeMark mark(this, stub_id);
621 address start = __ pc();
622
623 Label exit, error;
624
625 __ push_reg(RegSet::of(c_rarg2, c_rarg3), sp); // save c_rarg2 and c_rarg3
626
627 __ la(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
628 __ ld(c_rarg3, Address(c_rarg2));
629 __ addi(c_rarg3, c_rarg3, 1);
630 __ sd(c_rarg3, Address(c_rarg2));
631
632 // object is in x10
633 // make sure object is 'reasonable'
634 __ beqz(x10, exit); // if obj is null it is OK
635
636 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
637 bs_asm->check_oop(_masm, x10, c_rarg2, c_rarg3, error);
638
639 // return if everything seems ok
640 __ bind(exit);
641
642 __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
643 __ ret();
644
645 // handle errors
646 __ bind(error);
647 __ pop_reg(RegSet::of(c_rarg2, c_rarg3), sp); // pop c_rarg2 and c_rarg3
648
649 __ push_reg(RegSet::range(x0, x31), sp);
650 // debug(char* msg, int64_t pc, int64_t regs[])
651 __ mv(c_rarg0, t0); // pass address of error message
652 __ mv(c_rarg1, ra); // pass return address
653 __ mv(c_rarg2, sp); // pass address of regs on stack
654 #ifndef PRODUCT
655 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
656 #endif
657 BLOCK_COMMENT("call MacroAssembler::debug");
658 __ rt_call(CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
659 __ ebreak();
660
661 return start;
662 }
663
664 // The inner part of zero_words().
665 //
666 // Inputs:
667 // x28: the HeapWord-aligned base address of an array to zero.
668 // x29: the count in HeapWords, x29 > 0.
669 //
670 // Returns x28 and x29, adjusted for the caller to clear.
671 // x28: the base address of the tail of words left to clear.
672 // x29: the number of words in the tail.
673 // x29 < MacroAssembler::zero_words_block_size.
674
675 address generate_zero_blocks() {
676 Label done;
677
678 const Register base = x28, cnt = x29, tmp1 = x30, tmp2 = x31;
679
680 __ align(CodeEntryAlignment);
681 StubId stub_id = StubId::stubgen_zero_blocks_id;
682 StubCodeMark mark(this, stub_id);
683 address start = __ pc();
684
685 if (UseBlockZeroing) {
686 int zicboz_block_size = VM_Version::zicboz_block_size.value();
687 // Ensure count >= 2 * zicboz_block_size so that it still deserves
688 // a cbo.zero after alignment.
689 Label small;
690 int low_limit = MAX2(2 * zicboz_block_size, (int)BlockZeroingLowLimit) / wordSize;
691 __ mv(tmp1, low_limit);
692 __ blt(cnt, tmp1, small);
693 __ zero_dcache_blocks(base, cnt, tmp1, tmp2);
694 __ bind(small);
695 }
696
697 {
698 // Clear the remaining blocks.
699 Label loop;
700 __ mv(tmp1, MacroAssembler::zero_words_block_size);
701 __ blt(cnt, tmp1, done);
702 __ bind(loop);
703 for (int i = 0; i < MacroAssembler::zero_words_block_size; i++) {
704 __ sd(zr, Address(base, i * wordSize));
705 }
706 __ addi(base, base, MacroAssembler::zero_words_block_size * wordSize);
707 __ subi(cnt, cnt, MacroAssembler::zero_words_block_size);
708 __ bge(cnt, tmp1, loop);
709 __ bind(done);
710 }
711
712 __ ret();
713
714 return start;
715 }
716
// Direction of a copy. The value is the sign applied to the per-step
// address increment: +1 for ascending, -1 for descending addresses.
enum copy_direction {
  copy_forwards  =  1,
  copy_backwards = -1
};
721
722 // Bulk copy of blocks of 8 words.
723 //
724 // count is a count of words.
725 //
726 // Precondition: count >= 8
727 //
728 // Postconditions:
729 //
730 // The least significant bit of count contains the remaining count
731 // of words to copy. The rest of count is trash.
732 //
733 // s and d are adjusted to point to the remaining words to copy
734 //
735 address generate_copy_longs(StubId stub_id, Register s, Register d, Register count) {
736 BasicType type;
737 copy_direction direction;
738 switch (stub_id) {
739 case StubId::stubgen_copy_byte_f_id:
740 direction = copy_forwards;
741 type = T_BYTE;
742 break;
743 case StubId::stubgen_copy_byte_b_id:
744 direction = copy_backwards;
745 type = T_BYTE;
746 break;
747 default:
748 ShouldNotReachHere();
749 }
750 int unit = wordSize * direction;
751 int bias = wordSize;
752
753 const Register tmp_reg0 = x13, tmp_reg1 = x14, tmp_reg2 = x15, tmp_reg3 = x16,
754 tmp_reg4 = x17, tmp_reg5 = x7, tmp_reg6 = x28, tmp_reg7 = x29;
755
756 const Register stride = x30;
757
758 assert_different_registers(t0, tmp_reg0, tmp_reg1, tmp_reg2, tmp_reg3,
759 tmp_reg4, tmp_reg5, tmp_reg6, tmp_reg7);
760 assert_different_registers(s, d, count, t0);
761
762 Label again, drain;
763 StubCodeMark mark(this, stub_id);
764 __ align(CodeEntryAlignment);
765 address start = __ pc();
766
767 if (direction == copy_forwards) {
768 __ sub(s, s, bias);
769 __ sub(d, d, bias);
770 }
771
772 #ifdef ASSERT
773 // Make sure we are never given < 8 words
774 {
775 Label L;
776
777 __ mv(t0, 8);
778 __ bge(count, t0, L);
779 __ stop("genrate_copy_longs called with < 8 words");
780 __ bind(L);
781 }
782 #endif
783
784 __ ld(tmp_reg0, Address(s, 1 * unit));
785 __ ld(tmp_reg1, Address(s, 2 * unit));
786 __ ld(tmp_reg2, Address(s, 3 * unit));
787 __ ld(tmp_reg3, Address(s, 4 * unit));
788 __ ld(tmp_reg4, Address(s, 5 * unit));
789 __ ld(tmp_reg5, Address(s, 6 * unit));
790 __ ld(tmp_reg6, Address(s, 7 * unit));
791 __ ld(tmp_reg7, Address(s, 8 * unit));
792 __ addi(s, s, 8 * unit);
793
794 __ subi(count, count, 16);
795 __ bltz(count, drain);
796
797 __ bind(again);
798
799 __ sd(tmp_reg0, Address(d, 1 * unit));
800 __ sd(tmp_reg1, Address(d, 2 * unit));
801 __ sd(tmp_reg2, Address(d, 3 * unit));
802 __ sd(tmp_reg3, Address(d, 4 * unit));
803 __ sd(tmp_reg4, Address(d, 5 * unit));
804 __ sd(tmp_reg5, Address(d, 6 * unit));
805 __ sd(tmp_reg6, Address(d, 7 * unit));
806 __ sd(tmp_reg7, Address(d, 8 * unit));
807
808 __ ld(tmp_reg0, Address(s, 1 * unit));
809 __ ld(tmp_reg1, Address(s, 2 * unit));
810 __ ld(tmp_reg2, Address(s, 3 * unit));
811 __ ld(tmp_reg3, Address(s, 4 * unit));
812 __ ld(tmp_reg4, Address(s, 5 * unit));
813 __ ld(tmp_reg5, Address(s, 6 * unit));
814 __ ld(tmp_reg6, Address(s, 7 * unit));
815 __ ld(tmp_reg7, Address(s, 8 * unit));
816
817 __ addi(s, s, 8 * unit);
818 __ addi(d, d, 8 * unit);
819
820 __ subi(count, count, 8);
821 __ bgez(count, again);
822
823 // Drain
824 __ bind(drain);
825
826 __ sd(tmp_reg0, Address(d, 1 * unit));
827 __ sd(tmp_reg1, Address(d, 2 * unit));
828 __ sd(tmp_reg2, Address(d, 3 * unit));
829 __ sd(tmp_reg3, Address(d, 4 * unit));
830 __ sd(tmp_reg4, Address(d, 5 * unit));
831 __ sd(tmp_reg5, Address(d, 6 * unit));
832 __ sd(tmp_reg6, Address(d, 7 * unit));
833 __ sd(tmp_reg7, Address(d, 8 * unit));
834 __ addi(d, d, 8 * unit);
835
836 {
837 Label L1, L2;
838 __ test_bit(t0, count, 2);
839 __ beqz(t0, L1);
840
841 __ ld(tmp_reg0, Address(s, 1 * unit));
842 __ ld(tmp_reg1, Address(s, 2 * unit));
843 __ ld(tmp_reg2, Address(s, 3 * unit));
844 __ ld(tmp_reg3, Address(s, 4 * unit));
845 __ addi(s, s, 4 * unit);
846
847 __ sd(tmp_reg0, Address(d, 1 * unit));
848 __ sd(tmp_reg1, Address(d, 2 * unit));
849 __ sd(tmp_reg2, Address(d, 3 * unit));
850 __ sd(tmp_reg3, Address(d, 4 * unit));
851 __ addi(d, d, 4 * unit);
852
853 __ bind(L1);
854
855 if (direction == copy_forwards) {
856 __ addi(s, s, bias);
857 __ addi(d, d, bias);
858 }
859
860 __ test_bit(t0, count, 1);
861 __ beqz(t0, L2);
862 if (direction == copy_backwards) {
863 __ addi(s, s, 2 * unit);
864 __ ld(tmp_reg0, Address(s));
865 __ ld(tmp_reg1, Address(s, wordSize));
866 __ addi(d, d, 2 * unit);
867 __ sd(tmp_reg0, Address(d));
868 __ sd(tmp_reg1, Address(d, wordSize));
869 } else {
870 __ ld(tmp_reg0, Address(s));
871 __ ld(tmp_reg1, Address(s, wordSize));
872 __ addi(s, s, 2 * unit);
873 __ sd(tmp_reg0, Address(d));
874 __ sd(tmp_reg1, Address(d, wordSize));
875 __ addi(d, d, 2 * unit);
876 }
877 __ bind(L2);
878 }
879
880 __ ret();
881
882 return start;
883 }
884
885 typedef void (MacroAssembler::*copy_insn)(Register Rd, const Address &adr, Register temp);
886
887 void copy_memory_v(Register s, Register d, Register count, int step) {
888 bool is_backward = step < 0;
889 int granularity = g_uabs(step);
890
891 const Register src = x30, dst = x31, vl = x14, cnt = x15, tmp1 = x16, tmp2 = x17;
892 assert_different_registers(s, d, cnt, vl, tmp1, tmp2);
893 Assembler::SEW sew = Assembler::elembytes_to_sew(granularity);
894 Label loop_forward, loop_backward, done;
895
896 __ mv(dst, d);
897 __ mv(src, s);
898 __ mv(cnt, count);
899
900 __ bind(loop_forward);
901 __ vsetvli(vl, cnt, sew, Assembler::m8);
902 if (is_backward) {
903 __ bne(vl, cnt, loop_backward);
904 }
905
906 __ vlex_v(v0, src, sew);
907 __ sub(cnt, cnt, vl);
908 if (sew != Assembler::e8) {
909 // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
910 __ slli(vl, vl, sew);
911 }
912 __ add(src, src, vl);
913
914 __ vsex_v(v0, dst, sew);
915 __ add(dst, dst, vl);
916 __ bnez(cnt, loop_forward);
917
918 if (is_backward) {
919 __ j(done);
920
921 __ bind(loop_backward);
922 __ sub(t0, cnt, vl);
923 if (sew != Assembler::e8) {
924 // when sew == e8 (e.g., elem size is 1 byte), slli R, R, 0 is a nop and unnecessary
925 __ slli(t0, t0, sew);
926 }
927 __ add(tmp1, s, t0);
928 __ vlex_v(v0, tmp1, sew);
929 __ add(tmp2, d, t0);
930 __ vsex_v(v0, tmp2, sew);
931 __ sub(cnt, cnt, vl);
932 __ bnez(cnt, loop_forward);
933 __ bind(done);
934 }
935 }
936
937 // All-singing all-dancing memory copy.
938 //
939 // Copy count units of memory from s to d. The size of a unit is
940 // step, which can be positive or negative depending on the direction
941 // of copy.
942 //
943 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
944 Register s, Register d, Register count, int step) {
945 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
946 if (UseRVV && (!is_reference_type(type) || bs_asm->supports_rvv_arraycopy())) {
947 return copy_memory_v(s, d, count, step);
948 }
949
950 bool is_backwards = step < 0;
951 int granularity = g_uabs(step);
952
953 const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;
954 const Register gct1 = x28, gct2 = x29, gct3 = t2;
955
956 Label same_aligned;
957 Label copy_big, copy32_loop, copy8_loop, copy_small, done;
958
959 // The size of copy32_loop body increases significantly with ZGC GC barriers.
960 // Need conditional far branches to reach a point beyond the loop in this case.
961 bool is_far = UseZGC;
962
963 __ beqz(count, done, is_far);
964 __ slli(cnt, count, exact_log2(granularity));
965 if (is_backwards) {
966 __ add(src, s, cnt);
967 __ add(dst, d, cnt);
968 } else {
969 __ mv(src, s);
970 __ mv(dst, d);
971 }
972
973 if (is_aligned) {
974 __ subi(t0, cnt, 32);
975 __ bgez(t0, copy32_loop);
976 __ subi(t0, cnt, 8);
977 __ bgez(t0, copy8_loop, is_far);
978 __ j(copy_small);
979 } else {
980 __ mv(t0, 16);
981 __ blt(cnt, t0, copy_small, is_far);
982
983 __ xorr(t0, src, dst);
984 __ andi(t0, t0, 0b111);
985 __ bnez(t0, copy_small, is_far);
986
987 __ bind(same_aligned);
988 __ andi(t0, src, 0b111);
989 __ beqz(t0, copy_big);
990 if (is_backwards) {
991 __ addi(src, src, step);
992 __ addi(dst, dst, step);
993 }
994 bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
995 bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
996 if (!is_backwards) {
997 __ addi(src, src, step);
998 __ addi(dst, dst, step);
999 }
1000 __ subi(cnt, cnt, granularity);
1001 __ beqz(cnt, done, is_far);
1002 __ j(same_aligned);
1003
1004 __ bind(copy_big);
1005 __ mv(t0, 32);
1006 __ blt(cnt, t0, copy8_loop, is_far);
1007 }
1008
1009 __ bind(copy32_loop);
1010 if (is_backwards) {
1011 __ subi(src, src, wordSize * 4);
1012 __ subi(dst, dst, wordSize * 4);
1013 }
1014 // we first load 32 bytes, then write it, so the direction here doesn't matter
1015 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
1016 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp4, Address(src, 8), gct1);
1017 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp5, Address(src, 16), gct1);
1018 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp6, Address(src, 24), gct1);
1019
1020 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
1021 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 8), tmp4, gct1, gct2, gct3);
1022 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 16), tmp5, gct1, gct2, gct3);
1023 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst, 24), tmp6, gct1, gct2, gct3);
1024
1025 if (!is_backwards) {
1026 __ addi(src, src, wordSize * 4);
1027 __ addi(dst, dst, wordSize * 4);
1028 }
1029 __ subi(t0, cnt, 32 + wordSize * 4);
1030 __ subi(cnt, cnt, wordSize * 4);
1031 __ bgez(t0, copy32_loop); // cnt >= 32, do next loop
1032
1033 __ beqz(cnt, done); // if that's all - done
1034
1035 __ subi(t0, cnt, 8); // if not - copy the reminder
1036 __ bltz(t0, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop
1037
1038 __ bind(copy8_loop);
1039 if (is_backwards) {
1040 __ subi(src, src, wordSize);
1041 __ subi(dst, dst, wordSize);
1042 }
1043 bs_asm->copy_load_at(_masm, decorators, type, 8, tmp3, Address(src), gct1);
1044 bs_asm->copy_store_at(_masm, decorators, type, 8, Address(dst), tmp3, gct1, gct2, gct3);
1045
1046 if (!is_backwards) {
1047 __ addi(src, src, wordSize);
1048 __ addi(dst, dst, wordSize);
1049 }
1050 __ subi(t0, cnt, 8 + wordSize);
1051 __ subi(cnt, cnt, wordSize);
1052 __ bgez(t0, copy8_loop); // cnt >= 8, do next loop
1053
1054 __ beqz(cnt, done); // if that's all - done
1055
1056 __ bind(copy_small);
1057 if (is_backwards) {
1058 __ addi(src, src, step);
1059 __ addi(dst, dst, step);
1060 }
1061
1062 bs_asm->copy_load_at(_masm, decorators, type, granularity, tmp3, Address(src), gct1);
1063 bs_asm->copy_store_at(_masm, decorators, type, granularity, Address(dst), tmp3, gct1, gct2, gct3);
1064
1065 if (!is_backwards) {
1066 __ addi(src, src, step);
1067 __ addi(dst, dst, step);
1068 }
1069 __ subi(cnt, cnt, granularity);
1070 __ bgtz(cnt, copy_small);
1071
1072 __ bind(done);
1073 }
1074
1075 // Scan over array at a for count oops, verifying each one.
1076 // Preserves a and count, clobbers t0 and t1.
1077 void verify_oop_array(size_t size, Register a, Register count, Register temp) {
1078 Label loop, end;
1079 __ mv(t1, zr);
1080 __ slli(t0, count, exact_log2(size));
1081 __ bind(loop);
1082 __ bgeu(t1, t0, end);
1083
1084 __ add(temp, a, t1);
1085 if (size == (size_t)wordSize) {
1086 __ ld(temp, Address(temp, 0));
1087 __ verify_oop(temp);
1088 } else {
1089 __ lwu(temp, Address(temp, 0));
1090 __ decode_heap_oop(temp); // calls verify_oop
1091 }
1092 __ add(t1, t1, size);
1093 __ j(loop);
1094 __ bind(end);
1095 }
1096
1097 // Arguments:
1098 // stub_id - is used to name the stub and identify all details of
1099 // how to perform the copy.
1100 //
1101 // nopush_entry - is assigned to the stub's post push entry point
1102 // unless it is null
1103 //
1104 // Inputs:
1105 // c_rarg0 - source array address
1106 // c_rarg1 - destination array address
1107 // c_rarg2 - element count, treated as ssize_t, can be zero
1108 //
1109 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1110 // the hardware handle it. The two dwords within qwords that span
1111 // cache line boundaries will still be loaded and stored atomically.
1112 //
1113 // Side Effects: nopush_entry is set to the (post push) entry point
1114 // so it can be used by the corresponding conjoint
1115 // copy method
1116 //
1117 address generate_disjoint_copy(StubId stub_id, address* nopush_entry) {
1118 size_t size;
1119 bool aligned;
1120 bool is_oop;
1121 bool dest_uninitialized;
1122 switch (stub_id) {
1123 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1124 size = sizeof(jbyte);
1125 aligned = false;
1126 is_oop = false;
1127 dest_uninitialized = false;
1128 break;
1129 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1130 size = sizeof(jbyte);
1131 aligned = true;
1132 is_oop = false;
1133 dest_uninitialized = false;
1134 break;
1135 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1136 size = sizeof(jshort);
1137 aligned = false;
1138 is_oop = false;
1139 dest_uninitialized = false;
1140 break;
1141 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1142 size = sizeof(jshort);
1143 aligned = true;
1144 is_oop = false;
1145 dest_uninitialized = false;
1146 break;
1147 case StubId::stubgen_jint_disjoint_arraycopy_id:
1148 size = sizeof(jint);
1149 aligned = false;
1150 is_oop = false;
1151 dest_uninitialized = false;
1152 break;
1153 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1154 size = sizeof(jint);
1155 aligned = true;
1156 is_oop = false;
1157 dest_uninitialized = false;
1158 break;
1159 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1160 // since this is always aligned we can (should!) use the same
1161 // stub as for case arrayof_jlong_disjoint_arraycopy
1162 ShouldNotReachHere();
1163 break;
1164 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1165 size = sizeof(jlong);
1166 aligned = true;
1167 is_oop = false;
1168 dest_uninitialized = false;
1169 break;
1170 case StubId::stubgen_oop_disjoint_arraycopy_id:
1171 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1172 aligned = !UseCompressedOops;
1173 is_oop = true;
1174 dest_uninitialized = false;
1175 break;
1176 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1177 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1178 aligned = !UseCompressedOops;
1179 is_oop = true;
1180 dest_uninitialized = false;
1181 break;
1182 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1183 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1184 aligned = !UseCompressedOops;
1185 is_oop = true;
1186 dest_uninitialized = true;
1187 break;
1188 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1189 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1190 aligned = !UseCompressedOops;
1191 is_oop = true;
1192 dest_uninitialized = true;
1193 break;
1194 default:
1195 ShouldNotReachHere();
1196 break;
1197 }
1198
1199 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1200 RegSet saved_reg = RegSet::of(s, d, count);
1201 __ align(CodeEntryAlignment);
1202 StubCodeMark mark(this, stub_id);
1203 address start = __ pc();
1204 __ enter();
1205
1206 if (nopush_entry != nullptr) {
1207 *nopush_entry = __ pc();
1208 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1209 BLOCK_COMMENT("Entry:");
1210 }
1211
1212 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1213 if (dest_uninitialized) {
1214 decorators |= IS_DEST_UNINITIALIZED;
1215 }
1216 if (aligned) {
1217 decorators |= ARRAYCOPY_ALIGNED;
1218 }
1219
1220 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1221 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1222
1223 if (is_oop) {
1224 // save regs before copy_memory
1225 __ push_reg(RegSet::of(d, count), sp);
1226 }
1227
1228 {
1229 // UnsafeMemoryAccess page error: continue after unsafe access
1230 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1231 UnsafeMemoryAccessMark umam(this, add_entry, true);
1232 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1233 }
1234
1235 if (is_oop) {
1236 __ pop_reg(RegSet::of(d, count), sp);
1237 if (VerifyOops) {
1238 verify_oop_array(size, d, count, t2);
1239 }
1240 }
1241
1242 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0);
1243
1244 __ leave();
1245 __ mv(x10, zr); // return 0
1246 __ ret();
1247 return start;
1248 }
1249
1250 // Arguments:
1251 // stub_id - is used to name the stub and identify all details of
1252 // how to perform the copy.
1253 //
1254 // nooverlap_target - identifies the (post push) entry for the
1255 // corresponding disjoint copy routine which can be
1256 // jumped to if the ranges do not actually overlap
1257 //
1258 // nopush_entry - is assigned to the stub's post push entry point
1259 // unless it is null
1260 //
1261 // Inputs:
1262 // c_rarg0 - source array address
1263 // c_rarg1 - destination array address
1264 // c_rarg2 - element count, treated as ssize_t, can be zero
1265 //
1266 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1267 // the hardware handle it. The two dwords within qwords that span
1268 // cache line boundaries will still be loaded and stored atomically.
1269 //
1270 // Side Effects:
1271 // nopush_entry is set to the no-overlap entry point so it can be
1272 // used by some other conjoint copy method
1273 //
1274 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1275 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1276 RegSet saved_regs = RegSet::of(s, d, count);
1277 int size;
1278 bool aligned;
1279 bool is_oop;
1280 bool dest_uninitialized;
1281 switch (stub_id) {
1282 case StubId::stubgen_jbyte_arraycopy_id:
1283 size = sizeof(jbyte);
1284 aligned = false;
1285 is_oop = false;
1286 dest_uninitialized = false;
1287 break;
1288 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1289 size = sizeof(jbyte);
1290 aligned = true;
1291 is_oop = false;
1292 dest_uninitialized = false;
1293 break;
1294 case StubId::stubgen_jshort_arraycopy_id:
1295 size = sizeof(jshort);
1296 aligned = false;
1297 is_oop = false;
1298 dest_uninitialized = false;
1299 break;
1300 case StubId::stubgen_arrayof_jshort_arraycopy_id:
1301 size = sizeof(jshort);
1302 aligned = true;
1303 is_oop = false;
1304 dest_uninitialized = false;
1305 break;
1306 case StubId::stubgen_jint_arraycopy_id:
1307 size = sizeof(jint);
1308 aligned = false;
1309 is_oop = false;
1310 dest_uninitialized = false;
1311 break;
1312 case StubId::stubgen_arrayof_jint_arraycopy_id:
1313 size = sizeof(jint);
1314 aligned = true;
1315 is_oop = false;
1316 dest_uninitialized = false;
1317 break;
1318 case StubId::stubgen_jlong_arraycopy_id:
1319 // since this is always aligned we can (should!) use the same
1320 // stub as for case arrayof_jlong_disjoint_arraycopy
1321 ShouldNotReachHere();
1322 break;
1323 case StubId::stubgen_arrayof_jlong_arraycopy_id:
1324 size = sizeof(jlong);
1325 aligned = true;
1326 is_oop = false;
1327 dest_uninitialized = false;
1328 break;
1329 case StubId::stubgen_oop_arraycopy_id:
1330 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1331 aligned = !UseCompressedOops;
1332 is_oop = true;
1333 dest_uninitialized = false;
1334 break;
1335 case StubId::stubgen_arrayof_oop_arraycopy_id:
1336 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1337 aligned = !UseCompressedOops;
1338 is_oop = true;
1339 dest_uninitialized = false;
1340 break;
1341 case StubId::stubgen_oop_arraycopy_uninit_id:
1342 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1343 aligned = !UseCompressedOops;
1344 is_oop = true;
1345 dest_uninitialized = true;
1346 break;
1347 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1348 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1349 aligned = !UseCompressedOops;
1350 is_oop = true;
1351 dest_uninitialized = true;
1352 break;
1353 default:
1354 ShouldNotReachHere();
1355 }
1356
1357 StubCodeMark mark(this, stub_id);
1358 address start = __ pc();
1359 __ enter();
1360
1361 if (nopush_entry != nullptr) {
1362 *nopush_entry = __ pc();
1363 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1364 BLOCK_COMMENT("Entry:");
1365 }
1366
1367 // use fwd copy when (d-s) above_equal (count*size)
1368 __ sub(t0, d, s);
1369 __ slli(t1, count, exact_log2(size));
1370 Label L_continue;
1371 __ bltu(t0, t1, L_continue);
1372 __ j(RuntimeAddress(nooverlap_target));
1373 __ bind(L_continue);
1374
1375 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1376 if (dest_uninitialized) {
1377 decorators |= IS_DEST_UNINITIALIZED;
1378 }
1379 if (aligned) {
1380 decorators |= ARRAYCOPY_ALIGNED;
1381 }
1382
1383 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1384 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1385
1386 if (is_oop) {
1387 // save regs before copy_memory
1388 __ push_reg(RegSet::of(d, count), sp);
1389 }
1390
1391 {
1392 // UnsafeMemoryAccess page error: continue after unsafe access
1393 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1394 UnsafeMemoryAccessMark umam(this, add_entry, true);
1395 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1396 }
1397
1398 if (is_oop) {
1399 __ pop_reg(RegSet::of(d, count), sp);
1400 if (VerifyOops) {
1401 verify_oop_array(size, d, count, t2);
1402 }
1403 }
1404 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, t0);
1405 __ leave();
1406 __ mv(x10, zr); // return 0
1407 __ ret();
1408 return start;
1409 }
1410
1411 // Helper for generating a dynamic type check.
1412 // Smashes t0, t1.
1413 void generate_type_check(Register sub_klass,
1414 Register super_check_offset,
1415 Register super_klass,
1416 Register result,
1417 Register tmp1,
1418 Register tmp2,
1419 Label& L_success) {
1420 assert_different_registers(sub_klass, super_check_offset, super_klass);
1421
1422 BLOCK_COMMENT("type_check:");
1423
1424 Label L_miss;
1425
1426 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr, super_check_offset);
1427 __ check_klass_subtype_slow_path(sub_klass, super_klass, tmp1, tmp2, &L_success, nullptr);
1428
1429 // Fall through on failure!
1430 __ BIND(L_miss);
1431 }
1432
1433 //
1434 // Generate checkcasting array copy stub
1435 //
1436 // Input:
1437 // c_rarg0 - source array address
1438 // c_rarg1 - destination array address
1439 // c_rarg2 - element count, treated as ssize_t, can be zero
1440 // c_rarg3 - size_t ckoff (super_check_offset)
1441 // c_rarg4 - oop ckval (super_klass)
1442 //
1443 // Output:
1444 // x10 == 0 - success
1445 // x10 == -1^K - failure, where K is partial transfer count
1446 //
1447 address generate_checkcast_copy(StubId stub_id, address* nopush_entry) {
1448 bool dest_uninitialized;
1449 switch (stub_id) {
1450 case StubId::stubgen_checkcast_arraycopy_id:
1451 dest_uninitialized = false;
1452 break;
1453 case StubId::stubgen_checkcast_arraycopy_uninit_id:
1454 dest_uninitialized = true;
1455 break;
1456 default:
1457 ShouldNotReachHere();
1458 }
1459
1460 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1461
1462 // Input registers (after setup_arg_regs)
1463 const Register from = c_rarg0; // source array address
1464 const Register to = c_rarg1; // destination array address
1465 const Register count = c_rarg2; // elementscount
1466 const Register ckoff = c_rarg3; // super_check_offset
1467 const Register ckval = c_rarg4; // super_klass
1468
1469 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1470
1471 // Registers used as temps (x7, x9, x18 are save-on-entry)
1472 const Register count_save = x19; // orig elementscount
1473 const Register start_to = x18; // destination array start address
1474 const Register copied_oop = x7; // actual oop copied
1475 const Register r9_klass = x9; // oop._klass
1476
1477 // Registers used as gc temps (x15, x16, x17 are save-on-call)
1478 const Register gct1 = x15, gct2 = x16, gct3 = x17;
1479
1480 //---------------------------------------------------------------
1481 // Assembler stub will be used for this call to arraycopy
1482 // if the two arrays are subtypes of Object[] but the
1483 // destination array type is not equal to or a supertype
1484 // of the source type. Each element must be separately
1485 // checked.
1486
1487 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1488 copied_oop, r9_klass, count_save);
1489
1490 __ align(CodeEntryAlignment);
1491 StubCodeMark mark(this, stub_id);
1492 address start = __ pc();
1493
1494 __ enter(); // required for proper stackwalking of RuntimeStub frame
1495
1496 // Caller of this entry point must set up the argument registers.
1497 if (nopush_entry != nullptr) {
1498 *nopush_entry = __ pc();
1499 BLOCK_COMMENT("Entry:");
1500 }
1501
1502 // Empty array: Nothing to do
1503 __ beqz(count, L_done);
1504
1505 __ push_reg(RegSet::of(x7, x9, x18, x19), sp);
1506
1507 #ifdef ASSERT
1508 BLOCK_COMMENT("assert consistent ckoff/ckval");
1509 // The ckoff and ckval must be mutually consistent,
1510 // even though caller generates both.
1511 { Label L;
1512 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1513 __ lwu(start_to, Address(ckval, sco_offset));
1514 __ beq(ckoff, start_to, L);
1515 __ stop("super_check_offset inconsistent");
1516 __ bind(L);
1517 }
1518 #endif //ASSERT
1519
1520 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1521 if (dest_uninitialized) {
1522 decorators |= IS_DEST_UNINITIALIZED;
1523 }
1524
1525 bool is_oop = true;
1526 int element_size = UseCompressedOops ? 4 : 8;
1527
1528 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1529 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1530
1531 // save the original count
1532 __ mv(count_save, count);
1533
1534 // Copy from low to high addresses
1535 __ mv(start_to, to); // Save destination array start address
1536 __ j(L_load_element);
1537
1538 // ======== begin loop ========
1539 // (Loop is rotated; its entry is L_load_element.)
1540 // Loop control:
1541 // for count to 0 do
1542 // copied_oop = load_heap_oop(from++)
1543 // ... generate_type_check ...
1544 // store_heap_oop(to++, copied_oop)
1545 // end
1546
1547 __ align(OptoLoopAlignment);
1548
1549 __ BIND(L_store_element);
1550 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1551 Address(to, 0), copied_oop,
1552 gct1, gct2, gct3);
1553 __ addi(to, to, UseCompressedOops ? 4 : 8);
1554 __ subi(count, count, 1);
1555 __ beqz(count, L_do_card_marks);
1556
1557 // ======== loop entry is here ========
1558 __ BIND(L_load_element);
1559 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1560 copied_oop, Address(from, 0),
1561 gct1);
1562 __ addi(from, from, UseCompressedOops ? 4 : 8);
1563 __ beqz(copied_oop, L_store_element);
1564
1565 __ load_klass(r9_klass, copied_oop);// query the object klass
1566
1567 BLOCK_COMMENT("type_check:");
1568 generate_type_check(r9_klass, /*sub_klass*/
1569 ckoff, /*super_check_offset*/
1570 ckval, /*super_klass*/
1571 x10, /*result*/
1572 gct1, /*tmp1*/
1573 gct2, /*tmp2*/
1574 L_store_element);
1575
1576 // Fall through on failure!
1577
1578 // ======== end loop ========
1579
1580 // It was a real error; we must depend on the caller to finish the job.
1581 // Register count = remaining oops, count_orig = total oops.
1582 // Emit GC store barriers for the oops we have copied and report
1583 // their number to the caller.
1584
1585 __ sub(count, count_save, count); // K = partially copied oop count
1586 __ xori(count, count, -1); // report (-1^K) to caller
1587 __ beqz(count, L_done_pop);
1588
1589 __ BIND(L_do_card_marks);
1590 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, t0);
1591
1592 __ bind(L_done_pop);
1593 __ pop_reg(RegSet::of(x7, x9, x18, x19), sp);
1594 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1595
1596 __ bind(L_done);
1597 __ mv(x10, count);
1598 __ leave();
1599 __ ret();
1600
1601 return start;
1602 }
1603
1604 // Perform range checks on the proposed arraycopy.
1605 // Kills temp, but nothing else.
1606 // Also clears the upper 32 bits of src_pos and dst_pos (zero-extension).
1607 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
1608 Register src_pos, // source position (c_rarg1)
1609 Register dst, // destination array oo (c_rarg2)
1610 Register dst_pos, // destination position (c_rarg3)
1611 Register length,
1612 Register temp,
1613 Label& L_failed) {
1614 BLOCK_COMMENT("arraycopy_range_checks:");
1615
1616 assert_different_registers(t0, temp);
1617
1618 // if [src_pos + length > arrayOop(src)->length()] then FAIL
1619 __ lwu(t0, Address(src, arrayOopDesc::length_offset_in_bytes()));
1620 __ addw(temp, length, src_pos);
1621 __ bgtu(temp, t0, L_failed);
1622
1623 // if [dst_pos + length > arrayOop(dst)->length()] then FAIL
1624 __ lwu(t0, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1625 __ addw(temp, length, dst_pos);
1626 __ bgtu(temp, t0, L_failed);
1627
1628 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1629 __ zext(src_pos, src_pos, 32);
1630 __ zext(dst_pos, dst_pos, 32);
1631
1632 BLOCK_COMMENT("arraycopy_range_checks done");
1633 }
1634
1635 address generate_unsafecopy_common_error_exit() {
1636 address start = __ pc();
1637 __ mv(x10, 0);
1638 __ leave();
1639 __ ret();
1640 return start;
1641 }
1642
1643 //
1644 // Generate 'unsafe' set memory stub
1645 // Though just as safe as the other stubs, it takes an unscaled
1646 // size_t (# bytes) argument instead of an element count.
1647 //
1648 // Input:
1649 // c_rarg0 - destination array address
1650 // c_rarg1 - byte count (size_t)
1651 // c_rarg2 - byte value
1652 //
// Emits the 'unsafe' set-memory stub and returns its entry address.
// The generated code fills 'count' bytes starting at 'dest' with the low
// byte of 'value'. Counts below 8 go straight to the byte-store tail;
// larger counts first align 'dest' to 8 bytes, fill whole 8-byte words,
// and then finish the remaining 0-7 bytes in the same tail.
1653 address generate_unsafe_setmemory() {
1654 __ align(CodeEntryAlignment);
1655 StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
1656 StubCodeMark mark(this, stub_id);
1657 address start = __ pc();
1658
1659 // bump this on entry, not on exit:
1660 // inc_counter_np(SharedRuntime::_unsafe_set_memory_ctr);
1661
1662 Label L_fill_elements;
1663
1664 const Register dest = c_rarg0;
1665 const Register count = c_rarg1;
1666 const Register value = c_rarg2;
1667 const Register cnt_words = x28; // temp register: number of 8-byte words to fill
1668 const Register tmp_reg = x29; // temp register: scratch for constants, bit tests and shifts
1669
1670 // Mark remaining code as such which performs Unsafe accesses.
1671 UnsafeMemoryAccessMark umam(this, true, false);
1672
1673 __ enter(); // required for proper stackwalking of RuntimeStub frame
1674
1675 // if count < 8, jump to L_fill_elements
// (unsigned compare; the tail only uses byte stores, so 'value' does not
// need to be replicated on this path)
1676 __ mv(tmp_reg, 8); // 8 bytes fill by element
1677 __ bltu(count, tmp_reg, L_fill_elements);
1678
1679 // Propagate byte to 64-bit width
1680 // 8 bit -> 16 bit
1681 __ zext(value, value, 8);
1682 __ slli(tmp_reg, value, 8);
1683 __ orr(value, value, tmp_reg);
1684 // 16 bit -> 32 bit
1685 __ slli(tmp_reg, value, 16);
1686 __ orr(value, value, tmp_reg);
1687 // 32 bit -> 64 bit
1688 __ slli(tmp_reg, value, 32);
1689 __ orr(value, value, tmp_reg);
1690
1691 // Align destination address at 8 bytes address boundary.
// count >= 8 on this path, so the at most 1 + 2 + 4 = 7 bytes stored
// below cannot make it underflow.
1692 Label L_skip_align1, L_skip_align2, L_skip_align4;
1693 // One byte misalignment happens.
1694 __ test_bit(tmp_reg, dest, 0);
1695 __ beqz(tmp_reg, L_skip_align1);
1696 __ sb(value, Address(dest, 0));
1697 __ addi(dest, dest, 1);
1698 __ subi(count, count, 1);
1699
1700 __ bind(L_skip_align1);
1701 // Two bytes misalignment happens.
1702 __ test_bit(tmp_reg, dest, 1);
1703 __ beqz(tmp_reg, L_skip_align2);
1704 __ sh(value, Address(dest, 0));
1705 __ addi(dest, dest, 2);
1706 __ subi(count, count, 2);
1707
1708 __ bind(L_skip_align2);
1709 // Four bytes misalignment happens.
1710 __ test_bit(tmp_reg, dest, 2);
1711 __ beqz(tmp_reg, L_skip_align4);
1712 __ sw(value, Address(dest, 0));
1713 __ addi(dest, dest, 4);
1714 __ subi(count, count, 4);
1715 __ bind(L_skip_align4);
1716
1717 // Fill large chunks
// cnt_words = count / 8; count is reduced to the remainder (count % 8).
1718 __ srli(cnt_words, count, 3); // number of words
1719 __ slli(tmp_reg, cnt_words, 3);
1720 __ sub(count, count, tmp_reg);
1721 {
// NOTE(review): the tail below keeps storing through 'dest', so fill_words
// is presumably expected to advance 'dest' past the filled words -- confirm
// against MacroAssembler::fill_words.
1722 __ fill_words(dest, cnt_words, value);
1723 }
1724
1725 // Handle copies less than 8 bytes
// Bits 2, 1 and 0 of count select a 4-, 2- and 1-byte fill respectively,
// each done with individual byte stores.
1726 __ bind(L_fill_elements);
1727 Label L_fill_2, L_fill_1, L_exit;
1728 __ test_bit(tmp_reg, count, 2);
1729 __ beqz(tmp_reg, L_fill_2);
1730 __ sb(value, Address(dest, 0));
1731 __ sb(value, Address(dest, 1));
1732 __ sb(value, Address(dest, 2));
1733 __ sb(value, Address(dest, 3));
1734 __ addi(dest, dest, 4);
1735
1736 __ bind(L_fill_2);
1737 __ test_bit(tmp_reg, count, 1);
1738 __ beqz(tmp_reg, L_fill_1);
1739 __ sb(value, Address(dest, 0));
1740 __ sb(value, Address(dest, 1));
1741 __ addi(dest, dest, 2);
1742
1743 __ bind(L_fill_1);
1744 __ test_bit(tmp_reg, count, 0);
1745 __ beqz(tmp_reg, L_exit);
1746 __ sb(value, Address(dest, 0));
1747
1748 __ bind(L_exit);
1749 __ leave();
1750 __ ret();
1751
1752 return start;
1753 }
1754
1755 //
1756 // Generate 'unsafe' array copy stub
1757 // Though just as safe as the other stubs, it takes an unscaled
1758 // size_t argument instead of an element count.
1759 //
1760 // Input:
1761 // c_rarg0 - source array address
1762 // c_rarg1 - destination array address
1763 // c_rarg2 - byte count, treated as ssize_t, can be zero
1764 //
1765 // Examines the alignment of the operands and dispatches
1766 // to a long, int, short, or byte copy loop.
1767 //
1768 address generate_unsafe_copy(address byte_copy_entry,
1769 address short_copy_entry,
1770 address int_copy_entry,
1771 address long_copy_entry) {
1772 assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1773 int_copy_entry != nullptr && long_copy_entry != nullptr);
1774 Label L_long_aligned, L_int_aligned, L_short_aligned;
1775 const Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1776
1777 __ align(CodeEntryAlignment);
1778 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
1779 StubCodeMark mark(this, stub_id);
1780 address start = __ pc();
1781 __ enter(); // required for proper stackwalking of RuntimeStub frame
1782
1783 // bump this on entry, not on exit:
1784 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1785
1786 __ orr(t0, s, d);
1787 __ orr(t0, t0, count);
1788
1789 __ andi(t0, t0, BytesPerLong - 1);
1790 __ beqz(t0, L_long_aligned);
1791 __ andi(t0, t0, BytesPerInt - 1);
1792 __ beqz(t0, L_int_aligned);
1793 __ test_bit(t0, t0, 0);
1794 __ beqz(t0, L_short_aligned);
1795 __ j(RuntimeAddress(byte_copy_entry));
1796
1797 __ BIND(L_short_aligned);
1798 __ srli(count, count, LogBytesPerShort); // size => short_count
1799 __ j(RuntimeAddress(short_copy_entry));
1800 __ BIND(L_int_aligned);
1801 __ srli(count, count, LogBytesPerInt); // size => int_count
1802 __ j(RuntimeAddress(int_copy_entry));
1803 __ BIND(L_long_aligned);
1804 __ srli(count, count, LogBytesPerLong); // size => long_count
1805 __ j(RuntimeAddress(long_copy_entry));
1806
1807 return start;
1808 }
1809
1810 //
1811 // Generate generic array copy stubs
1812 //
1813 // Input:
1814 // c_rarg0 - src oop
1815 // c_rarg1 - src_pos (32-bits)
1816 // c_rarg2 - dst oop
1817 // c_rarg3 - dst_pos (32-bits)
1818 // c_rarg4 - element count (32-bits)
1819 //
1820 // Output:
1821 // x10 == 0 - success
1822 // x10 == -1^K - failure, where K is partial transfer count
1823 //
1824 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
1825 address int_copy_entry, address oop_copy_entry,
1826 address long_copy_entry, address checkcast_copy_entry) {
1827 assert_cond(byte_copy_entry != nullptr && short_copy_entry != nullptr &&
1828 int_copy_entry != nullptr && oop_copy_entry != nullptr &&
1829 long_copy_entry != nullptr && checkcast_copy_entry != nullptr);
1830 Label L_failed, L_failed_0, L_objArray;
1831 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1832
1833 // Input registers
1834 const Register src = c_rarg0; // source array oop
1835 const Register src_pos = c_rarg1; // source position
1836 const Register dst = c_rarg2; // destination array oop
1837 const Register dst_pos = c_rarg3; // destination position
1838 const Register length = c_rarg4;
1839
1840 // Registers used as temps
1841 const Register dst_klass = c_rarg5;
1842
1843 __ align(CodeEntryAlignment);
1844
1845 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
1846 StubCodeMark mark(this, stub_id);
1847
1848 address start = __ pc();
1849
1850 __ enter(); // required for proper stackwalking of RuntimeStub frame
1851
1852 // bump this on entry, not on exit:
1853 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1854
1855 //-----------------------------------------------------------------------
1856 // Assembler stub will be used for this call to arraycopy
1857 // if the following conditions are met:
1858 //
1859 // (1) src and dst must not be null.
1860 // (2) src_pos must not be negative.
1861 // (3) dst_pos must not be negative.
1862 // (4) length must not be negative.
1863 // (5) src klass and dst klass should be the same and not null.
1864 // (6) src and dst should be arrays.
1865 // (7) src_pos + length must not exceed length of src.
1866 // (8) dst_pos + length must not exceed length of dst.
1867 //
1868
1869 // if src is null then return -1
1870 __ beqz(src, L_failed);
1871
1872 // if [src_pos < 0] then return -1
1873 __ sext(t0, src_pos, 32);
1874 __ bltz(t0, L_failed);
1875
1876 // if dst is null then return -1
1877 __ beqz(dst, L_failed);
1878
1879 // if [dst_pos < 0] then return -1
1880 __ sext(t0, dst_pos, 32);
1881 __ bltz(t0, L_failed);
1882
1883 // registers used as temp
1884 const Register scratch_length = x28; // elements count to copy
1885 const Register scratch_src_klass = x29; // array klass
1886 const Register lh = x30; // layout helper
1887
1888 // if [length < 0] then return -1
1889 __ sext(scratch_length, length, 32); // length (elements count, 32-bits value)
1890 __ bltz(scratch_length, L_failed);
1891
1892 __ load_klass(scratch_src_klass, src);
1893 #ifdef ASSERT
1894 {
1895 BLOCK_COMMENT("assert klasses not null {");
1896 Label L1, L2;
1897 __ bnez(scratch_src_klass, L2); // it is broken if klass is null
1898 __ bind(L1);
1899 __ stop("broken null klass");
1900 __ bind(L2);
1901 __ load_klass(t0, dst, t1);
1902 __ beqz(t0, L1); // this would be broken also
1903 BLOCK_COMMENT("} assert klasses not null done");
1904 }
1905 #endif
1906
1907 // Load layout helper (32-bits)
1908 //
1909 // |array_tag| | header_size | element_type | |log2_element_size|
1910 // 32 30 24 16 8 2 0
1911 //
1912 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1913 //
1914
1915 const int lh_offset = in_bytes(Klass::layout_helper_offset());
1916
1917 // Handle objArrays completely differently...
1918 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1919 __ lw(lh, Address(scratch_src_klass, lh_offset));
1920 __ mv(t0, objArray_lh);
1921 __ beq(lh, t0, L_objArray);
1922
1923 // if [src->klass() != dst->klass()] then return -1
1924 __ load_klass(t1, dst);
1925 __ bne(t1, scratch_src_klass, L_failed);
1926
1927 // Check for flat inline type array -> return -1
1928 __ test_flat_array_oop(src, t1, L_failed);
1929
1930 // Check for null-free (non-flat) inline type array -> handle as object array
1931 __ test_null_free_array_oop(src, t1, L_objArray);
1932
1933 // if src->is_Array() isn't null then return -1
1934 // i.e. (lh >= 0)
1935 __ bgez(lh, L_failed);
1936
1937 // At this point, it is known to be a typeArray (array_tag 0x3).
1938 #ifdef ASSERT
1939 {
1940 BLOCK_COMMENT("assert primitive array {");
1941 Label L;
1942 __ mv(t1, (int32_t)(Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift));
1943 __ bge(lh, t1, L);
1944 __ stop("must be a primitive array");
1945 __ bind(L);
1946 BLOCK_COMMENT("} assert primitive array done");
1947 }
1948 #endif
1949
1950 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1951 t1, L_failed);
1952
1953 // TypeArrayKlass
1954 //
1955 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize)
1956 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize)
1957 //
1958
1959 const Register t0_offset = t0; // array offset
1960 const Register x30_elsize = lh; // element size
1961
1962 // Get array_header_in_bytes()
1963 int lh_header_size_width = exact_log2(Klass::_lh_header_size_mask + 1);
1964 int lh_header_size_msb = Klass::_lh_header_size_shift + lh_header_size_width;
1965 __ slli(t0_offset, lh, XLEN - lh_header_size_msb); // left shift to remove 24 ~ 32;
1966 __ srli(t0_offset, t0_offset, XLEN - lh_header_size_width); // array_offset
1967
1968 __ add(src, src, t0_offset); // src array offset
1969 __ add(dst, dst, t0_offset); // dst array offset
1970 BLOCK_COMMENT("choose copy loop based on element size");
1971
1972 // next registers should be set before the jump to corresponding stub
1973 const Register from = c_rarg0; // source array address
1974 const Register to = c_rarg1; // destination array address
1975 const Register count = c_rarg2; // elements count
1976
1977 // 'from', 'to', 'count' registers should be set in such order
1978 // since they are the same as 'src', 'src_pos', 'dst'.
1979
1980 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1981
1982 // The possible values of elsize are 0-3, i.e. exact_log2(element
1983 // size in bytes). We do a simple bitwise binary search.
1984 __ BIND(L_copy_bytes);
1985 __ test_bit(t0, x30_elsize, 1);
1986 __ bnez(t0, L_copy_ints);
1987 __ test_bit(t0, x30_elsize, 0);
1988 __ bnez(t0, L_copy_shorts);
1989 __ add(from, src, src_pos); // src_addr
1990 __ add(to, dst, dst_pos); // dst_addr
1991 __ sext(count, scratch_length, 32); // length
1992 __ j(RuntimeAddress(byte_copy_entry));
1993
1994 __ BIND(L_copy_shorts);
1995 __ shadd(from, src_pos, src, t0, 1); // src_addr
1996 __ shadd(to, dst_pos, dst, t0, 1); // dst_addr
1997 __ sext(count, scratch_length, 32); // length
1998 __ j(RuntimeAddress(short_copy_entry));
1999
2000 __ BIND(L_copy_ints);
2001 __ test_bit(t0, x30_elsize, 0);
2002 __ bnez(t0, L_copy_longs);
2003 __ shadd(from, src_pos, src, t0, 2); // src_addr
2004 __ shadd(to, dst_pos, dst, t0, 2); // dst_addr
2005 __ sext(count, scratch_length, 32); // length
2006 __ j(RuntimeAddress(int_copy_entry));
2007
2008 __ BIND(L_copy_longs);
2009 #ifdef ASSERT
2010 {
2011 BLOCK_COMMENT("assert long copy {");
2012 Label L;
2013 __ andi(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> x30_elsize
2014 __ sext(lh, lh, 32);
2015 __ mv(t0, LogBytesPerLong);
2016 __ beq(x30_elsize, t0, L);
2017 __ stop("must be long copy, but elsize is wrong");
2018 __ bind(L);
2019 BLOCK_COMMENT("} assert long copy done");
2020 }
2021 #endif
2022 __ shadd(from, src_pos, src, t0, 3); // src_addr
2023 __ shadd(to, dst_pos, dst, t0, 3); // dst_addr
2024 __ sext(count, scratch_length, 32); // length
2025 __ j(RuntimeAddress(long_copy_entry));
2026
2027 // ObjArrayKlass
2028 __ BIND(L_objArray);
2029 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2030
2031 Label L_plain_copy, L_checkcast_copy;
2032 // test array classes for subtyping
2033 __ load_klass(t2, dst);
2034 __ bne(scratch_src_klass, t2, L_checkcast_copy); // usual case is exact equality
2035
2036 // Identically typed arrays can be copied without element-wise checks.
2037 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2038 t1, L_failed);
2039
2040 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2041 __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2042 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2043 __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2044 __ sext(count, scratch_length, 32); // length
2045 __ BIND(L_plain_copy);
2046 __ j(RuntimeAddress(oop_copy_entry));
2047
2048 __ BIND(L_checkcast_copy);
2049 // live at this point: scratch_src_klass, scratch_length, t2 (dst_klass)
2050 {
2051 // Before looking at dst.length, make sure dst is also an objArray.
2052 __ lwu(t0, Address(t2, lh_offset));
2053 __ mv(t1, objArray_lh);
2054 __ bne(t0, t1, L_failed);
2055
2056 // It is safe to examine both src.length and dst.length.
2057 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2058 t2, L_failed);
2059
2060 __ load_klass(dst_klass, dst); // reload
2061
2062 // Marshal the base address arguments now, freeing registers.
2063 __ shadd(from, src_pos, src, t0, LogBytesPerHeapOop);
2064 __ addi(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2065 __ shadd(to, dst_pos, dst, t0, LogBytesPerHeapOop);
2066 __ addi(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2067 __ sext(count, length, 32); // length (reloaded)
2068 const Register sco_temp = c_rarg3; // this register is free now
2069 assert_different_registers(from, to, count, sco_temp,
2070 dst_klass, scratch_src_klass);
2071
2072 // Generate the type check.
2073 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2074 __ lwu(sco_temp, Address(dst_klass, sco_offset));
2075
2076 // Smashes t0, t1
2077 generate_type_check(scratch_src_klass, sco_temp, dst_klass, noreg, noreg, noreg, L_plain_copy);
2078
2079 // Fetch destination element klass from the ObjArrayKlass header.
2080 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2081 __ ld(dst_klass, Address(dst_klass, ek_offset));
2082 __ lwu(sco_temp, Address(dst_klass, sco_offset));
2083
2084 // the checkcast_copy loop needs two extra arguments:
2085 assert(c_rarg3 == sco_temp, "#3 already in place");
2086 // Set up arguments for checkcast_copy_entry.
2087 __ mv(c_rarg4, dst_klass); // dst.klass.element_klass
2088 __ j(RuntimeAddress(checkcast_copy_entry));
2089 }
2090
2091 __ BIND(L_failed);
2092 __ mv(x10, -1);
2093 __ leave(); // required for proper stackwalking of RuntimeStub frame
2094 __ ret();
2095
2096 return start;
2097 }
2098
2099 //
2100 // Generate stub for array fill. If "aligned" is true, the
2101 // "to" address is assumed to be heapword aligned.
2102 //
2103 // Arguments for generated stub:
2104 // to: c_rarg0
2105 // value: c_rarg1
2106 // count: c_rarg2 treated as signed
2107 //
2108 address generate_fill(StubId stub_id) {
2109 BasicType t;
2110 bool aligned;
2111
2112 switch (stub_id) {
2113 case StubId::stubgen_jbyte_fill_id:
2114 t = T_BYTE;
2115 aligned = false;
2116 break;
2117 case StubId::stubgen_jshort_fill_id:
2118 t = T_SHORT;
2119 aligned = false;
2120 break;
2121 case StubId::stubgen_jint_fill_id:
2122 t = T_INT;
2123 aligned = false;
2124 break;
2125 case StubId::stubgen_arrayof_jbyte_fill_id:
2126 t = T_BYTE;
2127 aligned = true;
2128 break;
2129 case StubId::stubgen_arrayof_jshort_fill_id:
2130 t = T_SHORT;
2131 aligned = true;
2132 break;
2133 case StubId::stubgen_arrayof_jint_fill_id:
2134 t = T_INT;
2135 aligned = true;
2136 break;
2137 default:
2138 ShouldNotReachHere();
2139 };
2140
2141 __ align(CodeEntryAlignment);
2142 StubCodeMark mark(this, stub_id);
2143 address start = __ pc();
2144
2145 BLOCK_COMMENT("Entry:");
2146
2147 const Register to = c_rarg0; // source array address
2148 const Register value = c_rarg1; // value
2149 const Register count = c_rarg2; // elements count
2150
2151 const Register bz_base = x28; // base for block_zero routine
2152 const Register cnt_words = x29; // temp register
2153 const Register tmp_reg = t1;
2154
2155 __ enter();
2156
2157 Label L_fill_elements;
2158
2159 int shift = -1;
2160 switch (t) {
2161 case T_BYTE:
2162 shift = 0;
2163 // Short arrays (< 8 bytes) fill by element
2164 __ mv(tmp_reg, 8 >> shift);
2165 __ bltu(count, tmp_reg, L_fill_elements);
2166
2167 // Zero extend value
2168 // 8 bit -> 16 bit
2169 __ zext(value, value, 8);
2170 __ slli(tmp_reg, value, 8);
2171 __ orr(value, value, tmp_reg);
2172
2173 // 16 bit -> 32 bit
2174 __ slli(tmp_reg, value, 16);
2175 __ orr(value, value, tmp_reg);
2176 break;
2177 case T_SHORT:
2178 shift = 1;
2179 // Short arrays (< 8 bytes) fill by element
2180 __ mv(tmp_reg, 8 >> shift);
2181 __ bltu(count, tmp_reg, L_fill_elements);
2182
2183 // Zero extend value
2184 // 16 bit -> 32 bit
2185 __ zext(value, value, 16);
2186 __ slli(tmp_reg, value, 16);
2187 __ orr(value, value, tmp_reg);
2188 break;
2189 case T_INT:
2190 shift = 2;
2191 // Short arrays (< 8 bytes) fill by element
2192 __ mv(tmp_reg, 8 >> shift);
2193 __ bltu(count, tmp_reg, L_fill_elements);
2194 break;
2195 default: ShouldNotReachHere();
2196 }
2197
2198 // Align source address at 8 bytes address boundary.
2199 Label L_skip_align1, L_skip_align2, L_skip_align4;
2200 if (!aligned) {
2201 switch (t) {
2202 case T_BYTE:
2203 // One byte misalignment happens only for byte arrays.
2204 __ test_bit(tmp_reg, to, 0);
2205 __ beqz(tmp_reg, L_skip_align1);
2206 __ sb(value, Address(to, 0));
2207 __ addi(to, to, 1);
2208 __ subiw(count, count, 1);
2209 __ bind(L_skip_align1);
2210 // Fallthrough
2211 case T_SHORT:
2212 // Two bytes misalignment happens only for byte and short (char) arrays.
2213 __ test_bit(tmp_reg, to, 1);
2214 __ beqz(tmp_reg, L_skip_align2);
2215 __ sh(value, Address(to, 0));
2216 __ addi(to, to, 2);
2217 __ subiw(count, count, 2 >> shift);
2218 __ bind(L_skip_align2);
2219 // Fallthrough
2220 case T_INT:
2221 // Align to 8 bytes, we know we are 4 byte aligned to start.
2222 __ test_bit(tmp_reg, to, 2);
2223 __ beqz(tmp_reg, L_skip_align4);
2224 __ sw(value, Address(to, 0));
2225 __ addi(to, to, 4);
2226 __ subiw(count, count, 4 >> shift);
2227 __ bind(L_skip_align4);
2228 break;
2229 default: ShouldNotReachHere();
2230 }
2231 }
2232
2233 //
2234 // Fill large chunks
2235 //
2236 __ srliw(cnt_words, count, 3 - shift); // number of words
2237
2238 // 32 bit -> 64 bit
2239 __ zext(value, value, 32);
2240 __ slli(tmp_reg, value, 32);
2241 __ orr(value, value, tmp_reg);
2242
2243 __ slli(tmp_reg, cnt_words, 3 - shift);
2244 __ subw(count, count, tmp_reg);
2245 {
2246 __ fill_words(to, cnt_words, value);
2247 }
2248
2249 // Handle copies less than 8 bytes.
2250 // Address may not be heapword aligned.
2251 Label L_fill_1, L_fill_2, L_exit;
2252 __ bind(L_fill_elements);
2253 switch (t) {
2254 case T_BYTE:
2255 __ test_bit(tmp_reg, count, 2);
2256 __ beqz(tmp_reg, L_fill_2);
2257 __ sb(value, Address(to, 0));
2258 __ sb(value, Address(to, 1));
2259 __ sb(value, Address(to, 2));
2260 __ sb(value, Address(to, 3));
2261 __ addi(to, to, 4);
2262
2263 __ bind(L_fill_2);
2264 __ test_bit(tmp_reg, count, 1);
2265 __ beqz(tmp_reg, L_fill_1);
2266 __ sb(value, Address(to, 0));
2267 __ sb(value, Address(to, 1));
2268 __ addi(to, to, 2);
2269
2270 __ bind(L_fill_1);
2271 __ test_bit(tmp_reg, count, 0);
2272 __ beqz(tmp_reg, L_exit);
2273 __ sb(value, Address(to, 0));
2274 break;
2275 case T_SHORT:
2276 __ test_bit(tmp_reg, count, 1);
2277 __ beqz(tmp_reg, L_fill_2);
2278 __ sh(value, Address(to, 0));
2279 __ sh(value, Address(to, 2));
2280 __ addi(to, to, 4);
2281
2282 __ bind(L_fill_2);
2283 __ test_bit(tmp_reg, count, 0);
2284 __ beqz(tmp_reg, L_exit);
2285 __ sh(value, Address(to, 0));
2286 break;
2287 case T_INT:
2288 __ beqz(count, L_exit);
2289 __ sw(value, Address(to, 0));
2290 break;
2291 default: ShouldNotReachHere();
2292 }
2293 __ bind(L_exit);
2294 __ leave();
2295 __ ret();
2296
2297 return start;
2298 }
2299
// Generate every arraycopy, fill and setmemory stub and publish the
// resulting entry points in StubRoutines. The statement order matters:
// 'nopush_entry' is a single out-slot that each generator call
// overwrites, so it must be copied to its StubRoutines field before
// the next generator call, and the fallback entries must be published
// before the stubs that take them as arguments are generated.
void generate_arraycopy_stubs() {
  // Some copy stubs publish a normal entry and then a 2nd 'fallback'
  // entry immediately following their stack push. This can be used
  // as a post-push branch target for compatible stubs when they
  // identify a special case that can be handled by the fallback
  // stub e.g. a disjoint copy stub may be used as a special case
  // fallback for its compatible conjoint copy stub.
  //
  // A no push entry is always returned in the following local and
  // then published by assigning to the appropriate entry field in
  // class StubRoutines. The entry value is then passed to the
  // generator for the compatible stub. That means the entry must be
  // listed when saving to/restoring from the AOT cache, ensuring
  // that the inter-stub jumps are noted at AOT-cache save and
  // relocated at AOT cache load.
  address nopush_entry = nullptr;

  // generate the common exit first so later stubs can rely on it if
  // they want an UnsafeMemoryAccess exit non-local to the stub
  StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
  // register the stub as the default exit with class UnsafeMemoryAccess
  UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);

  // generate and publish riscv-specific bulk copy routines first
  // so we can call them from other copy stubs
  StubRoutines::riscv::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, c_rarg0, c_rarg1, t1);
  StubRoutines::riscv::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, c_rarg0, c_rarg1, t1);

  StubRoutines::riscv::_zero_blocks = generate_zero_blocks();

  //*** jbyte
  // Always need aligned and unaligned versions
  StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
  // disjoint arrayof nopush entry is needed by conjoint copy
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
  // no nopush entry is requested (nullptr) for the arrayof conjoint stub
  StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);

  //*** jshort
  // Always need aligned and unaligned versions
  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is used by generic/unsafe copy
  StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
  StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
  // disjoint arrayof nopush entry is needed by conjoint copy
  StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
  // no nopush entry is requested (nullptr) for the arrayof conjoint stub
  StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);

  //*** jint
  // Aligned versions
  StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
  // disjoint arrayof nopush entry is needed by conjoint copy
  StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
  // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
  // entry_jint_arraycopy always points to the unaligned version
  StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jint_arraycopy_nopush = nopush_entry;

  //*** jlong
  // It is always aligned
  StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
  // disjoint arrayof nopush entry is needed by conjoint copy
  StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
  // disjoint normal/nopush and conjoint normal entries are not
  // generated since the arrayof versions are the same
  StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
  StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
  StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;

  //*** oops
  StubRoutines::_arrayof_oop_disjoint_arraycopy
    = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
  // disjoint arrayof nopush entry is needed by conjoint copy
  StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_arrayof_oop_arraycopy
    = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint arrayof nopush entry is needed by generic/unsafe copy
  StubRoutines::_oop_arraycopy_nopush = nopush_entry;
  // Aligned versions without pre-barriers
  StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
    = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
  // disjoint arrayof+uninit nopush entry is needed by conjoint copy
  StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;

  // note that we don't need a returned nopush entry because the
  // generic/unsafe copy does not cater for uninit arrays.
  StubRoutines::_arrayof_oop_arraycopy_uninit
    = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);

  // for oop copies reuse arrayof entries for non-arrayof cases
  StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
  StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
  StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
  StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
  StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
  StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;

  StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
  // checkcast nopush entry is needed by generic copy
  StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
  // note that we don't need a returned nopush entry because the
  // generic copy does not cater for uninit arrays.
  StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);


  // unsafe arraycopy may fallback on conjoint stubs
  StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                         StubRoutines::_jshort_arraycopy_nopush,
                                                         StubRoutines::_jint_arraycopy_nopush,
                                                         StubRoutines::_jlong_arraycopy_nopush);

  // generic arraycopy may fallback on conjoint stubs
  StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                           StubRoutines::_jshort_arraycopy_nopush,
                                                           StubRoutines::_jint_arraycopy_nopush,
                                                           StubRoutines::_oop_arraycopy_nopush,
                                                           StubRoutines::_jlong_arraycopy_nopush,
                                                           StubRoutines::_checkcast_arraycopy_nopush);

  // fill stubs: unaligned and arrayof (aligned) flavours per element type
  StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
  StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
  StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
  StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
  StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
  StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);

  StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
}
2444
2445 void aes_load_keys(const Register &key, VectorRegister *working_vregs, int rounds) {
2446 const int step = 16;
2447 for (int i = 0; i < rounds; i++) {
2448 __ vle32_v(working_vregs[i], key);
2449 // The keys are stored in little-endian array, while we need
2450 // to operate in big-endian.
2451 // So performing an endian-swap here with vrev8.v instruction
2452 __ vrev8_v(working_vregs[i], working_vregs[i]);
2453 __ addi(key, key, step);
2454 }
2455 }
2456
2457 void aes_encrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2458 assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2459
2460 __ vxor_vv(res, res, working_vregs[0]);
2461 for (int i = 1; i < rounds - 1; i++) {
2462 __ vaesem_vv(res, working_vregs[i]);
2463 }
2464 __ vaesef_vv(res, working_vregs[rounds - 1]);
2465 }
2466
2467 // Arguments:
2468 //
2469 // Inputs:
2470 // c_rarg0 - source byte array address
2471 // c_rarg1 - destination byte array address
2472 // c_rarg2 - sessionKe (key) in little endian int array
2473 //
2474 address generate_aescrypt_encryptBlock() {
2475 assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2476
2477 __ align(CodeEntryAlignment);
2478 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2479 StubCodeMark mark(this, stub_id);
2480
2481 Label L_aes128, L_aes192;
2482
2483 const Register from = c_rarg0; // source array address
2484 const Register to = c_rarg1; // destination array address
2485 const Register key = c_rarg2; // key array address
2486 const Register keylen = c_rarg3;
2487
2488 VectorRegister working_vregs[] = {
2489 v4, v5, v6, v7, v8, v9, v10, v11,
2490 v12, v13, v14, v15, v16, v17, v18
2491 };
2492 const VectorRegister res = v19;
2493
2494 address start = __ pc();
2495 __ enter();
2496
2497 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2498
2499 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2500 __ vle32_v(res, from);
2501
2502 __ mv(t2, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2503 __ bltu(keylen, t2, L_aes128);
2504 __ beq(keylen, t2, L_aes192);
2505 // Else we fallthrough to the biggest case (256-bit key size)
2506
2507 // Note: the following function performs key += 15*16
2508 aes_load_keys(key, working_vregs, 15);
2509 aes_encrypt(res, working_vregs, 15);
2510 __ vse32_v(res, to);
2511 __ mv(c_rarg0, 0);
2512 __ leave();
2513 __ ret();
2514
2515 __ bind(L_aes192);
2516 // Note: the following function performs key += 13*16
2517 aes_load_keys(key, working_vregs, 13);
2518 aes_encrypt(res, working_vregs, 13);
2519 __ vse32_v(res, to);
2520 __ mv(c_rarg0, 0);
2521 __ leave();
2522 __ ret();
2523
2524 __ bind(L_aes128);
2525 // Note: the following function performs key += 11*16
2526 aes_load_keys(key, working_vregs, 11);
2527 aes_encrypt(res, working_vregs, 11);
2528 __ vse32_v(res, to);
2529 __ mv(c_rarg0, 0);
2530 __ leave();
2531 __ ret();
2532
2533 return start;
2534 }
2535
2536 void aes_decrypt(const VectorRegister &res, VectorRegister *working_vregs, int rounds) {
2537 assert(rounds <= 15, "rounds should be less than or equal to working_vregs size");
2538
2539 __ vxor_vv(res, res, working_vregs[rounds - 1]);
2540 for (int i = rounds - 2; i > 0; i--) {
2541 __ vaesdm_vv(res, working_vregs[i]);
2542 }
2543 __ vaesdf_vv(res, working_vregs[0]);
2544 }
2545
2546 // Arguments:
2547 //
2548 // Inputs:
2549 // c_rarg0 - source byte array address
2550 // c_rarg1 - destination byte array address
2551 // c_rarg2 - sessionKe (key) in little endian int array
2552 //
2553 address generate_aescrypt_decryptBlock() {
2554 assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2555
2556 __ align(CodeEntryAlignment);
2557 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2558 StubCodeMark mark(this, stub_id);
2559
2560 Label L_aes128, L_aes192;
2561
2562 const Register from = c_rarg0; // source array address
2563 const Register to = c_rarg1; // destination array address
2564 const Register key = c_rarg2; // key array address
2565 const Register keylen = c_rarg3;
2566
2567 VectorRegister working_vregs[] = {
2568 v4, v5, v6, v7, v8, v9, v10, v11,
2569 v12, v13, v14, v15, v16, v17, v18
2570 };
2571 const VectorRegister res = v19;
2572
2573 address start = __ pc();
2574 __ enter(); // required for proper stackwalking of RuntimeStub frame
2575
2576 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2577
2578 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2579 __ vle32_v(res, from);
2580
2581 __ mv(t2, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2582 __ bltu(keylen, t2, L_aes128);
2583 __ beq(keylen, t2, L_aes192);
2584 // Else we fallthrough to the biggest case (256-bit key size)
2585
2586 // Note: the following function performs key += 15*16
2587 aes_load_keys(key, working_vregs, 15);
2588 aes_decrypt(res, working_vregs, 15);
2589 __ vse32_v(res, to);
2590 __ mv(c_rarg0, 0);
2591 __ leave();
2592 __ ret();
2593
2594 __ bind(L_aes192);
2595 // Note: the following function performs key += 13*16
2596 aes_load_keys(key, working_vregs, 13);
2597 aes_decrypt(res, working_vregs, 13);
2598 __ vse32_v(res, to);
2599 __ mv(c_rarg0, 0);
2600 __ leave();
2601 __ ret();
2602
2603 __ bind(L_aes128);
2604 // Note: the following function performs key += 11*16
2605 aes_load_keys(key, working_vregs, 11);
2606 aes_decrypt(res, working_vregs, 11);
2607 __ vse32_v(res, to);
2608 __ mv(c_rarg0, 0);
2609 __ leave();
2610 __ ret();
2611
2612 return start;
2613 }
2614
2615 void cipherBlockChaining_encryptAESCrypt(int round, Register from, Register to, Register key,
2616 Register rvec, Register input_len) {
2617 const Register len = x29;
2618
2619 VectorRegister working_vregs[] = {
2620 v1, v2, v3, v4, v5, v6, v7, v8,
2621 v9, v10, v11, v12, v13, v14, v15
2622 };
2623
2624 const unsigned int BLOCK_SIZE = 16;
2625
2626 __ mv(len, input_len);
2627 // load init rvec
2628 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2629 __ vle32_v(v16, rvec);
2630
2631 aes_load_keys(key, working_vregs, round);
2632 Label L_enc_loop;
2633 __ bind(L_enc_loop);
2634 // Encrypt from source by block size
2635 __ vle32_v(v17, from);
2636 __ addi(from, from, BLOCK_SIZE);
2637 __ vxor_vv(v16, v16, v17);
2638 aes_encrypt(v16, working_vregs, round);
2639 __ vse32_v(v16, to);
2640 __ addi(to, to, BLOCK_SIZE);
2641 __ subi(len, len, BLOCK_SIZE);
2642 __ bnez(len, L_enc_loop);
2643
2644 // save current rvec and return
2645 __ vse32_v(v16, rvec);
2646 __ mv(x10, input_len);
2647 __ leave();
2648 __ ret();
2649 }
2650
2651 // Arguments:
2652 //
2653 // Inputs:
2654 // c_rarg0 - source byte array address
2655 // c_rarg1 - destination byte array address
2656 // c_rarg2 - K (key) in little endian int array
2657 // c_rarg3 - r vector byte array address
2658 // c_rarg4 - input length
2659 //
2660 // Output:
2661 // x10 - input length
2662 //
2663 address generate_cipherBlockChaining_encryptAESCrypt() {
2664 assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2665 __ align(CodeEntryAlignment);
2666 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
2667 StubCodeMark mark(this, stub_id);
2668
2669 const Register from = c_rarg0;
2670 const Register to = c_rarg1;
2671 const Register key = c_rarg2;
2672 const Register rvec = c_rarg3;
2673 const Register input_len = c_rarg4;
2674
2675 const Register keylen = x28;
2676
2677 address start = __ pc();
2678 __ enter();
2679
2680 Label L_aes128, L_aes192;
2681 // Compute #rounds for AES based on the length of the key array
2682 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2683 __ mv(t0, 52);
2684 __ bltu(keylen, t0, L_aes128);
2685 __ beq(keylen, t0, L_aes192);
2686 // Else we fallthrough to the biggest case (256-bit key size)
2687
2688 // Note: the following function performs key += 15*16
2689 cipherBlockChaining_encryptAESCrypt(15, from, to, key, rvec, input_len);
2690
2691 // Note: the following function performs key += 11*16
2692 __ bind(L_aes128);
2693 cipherBlockChaining_encryptAESCrypt(11, from, to, key, rvec, input_len);
2694
2695 // Note: the following function performs key += 13*16
2696 __ bind(L_aes192);
2697 cipherBlockChaining_encryptAESCrypt(13, from, to, key, rvec, input_len);
2698
2699 return start;
2700 }
2701
2702 void cipherBlockChaining_decryptAESCrypt(int round, Register from, Register to, Register key,
2703 Register rvec, Register input_len) {
2704 const Register len = x29;
2705
2706 VectorRegister working_vregs[] = {
2707 v1, v2, v3, v4, v5, v6, v7, v8,
2708 v9, v10, v11, v12, v13, v14, v15
2709 };
2710
2711 const unsigned int BLOCK_SIZE = 16;
2712
2713 __ mv(len, input_len);
2714 // load init rvec
2715 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2716 __ vle32_v(v16, rvec);
2717
2718 aes_load_keys(key, working_vregs, round);
2719 Label L_dec_loop;
2720 // Decrypt from source by block size
2721 __ bind(L_dec_loop);
2722 __ vle32_v(v17, from);
2723 __ addi(from, from, BLOCK_SIZE);
2724 __ vmv_v_v(v18, v17);
2725 aes_decrypt(v17, working_vregs, round);
2726 __ vxor_vv(v17, v17, v16);
2727 __ vse32_v(v17, to);
2728 __ vmv_v_v(v16, v18);
2729 __ addi(to, to, BLOCK_SIZE);
2730 __ subi(len, len, BLOCK_SIZE);
2731 __ bnez(len, L_dec_loop);
2732
2733 // save current rvec and return
2734 __ vse32_v(v16, rvec);
2735 __ mv(x10, input_len);
2736 __ leave();
2737 __ ret();
2738 }
2739
2740 // Arguments:
2741 //
2742 // Inputs:
2743 // c_rarg0 - source byte array address
2744 // c_rarg1 - destination byte array address
2745 // c_rarg2 - K (key) in little endian int array
2746 // c_rarg3 - r vector byte array address
2747 // c_rarg4 - input length
2748 //
2749 // Output:
2750 // x10 - input length
2751 //
2752 address generate_cipherBlockChaining_decryptAESCrypt() {
2753 assert(UseAESIntrinsics, "need AES instructions (Zvkned extension) support");
2754 __ align(CodeEntryAlignment);
2755 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
2756 StubCodeMark mark(this, stub_id);
2757
2758 const Register from = c_rarg0;
2759 const Register to = c_rarg1;
2760 const Register key = c_rarg2;
2761 const Register rvec = c_rarg3;
2762 const Register input_len = c_rarg4;
2763
2764 const Register keylen = x28;
2765
2766 address start = __ pc();
2767 __ enter();
2768
2769 Label L_aes128, L_aes192, L_aes128_loop, L_aes192_loop, L_aes256_loop;
2770 // Compute #rounds for AES based on the length of the key array
2771 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2772 __ mv(t0, 52);
2773 __ bltu(keylen, t0, L_aes128);
2774 __ beq(keylen, t0, L_aes192);
2775 // Else we fallthrough to the biggest case (256-bit key size)
2776
2777 // Note: the following function performs key += 15*16
2778 cipherBlockChaining_decryptAESCrypt(15, from, to, key, rvec, input_len);
2779
2780 // Note: the following function performs key += 11*16
2781 __ bind(L_aes128);
2782 cipherBlockChaining_decryptAESCrypt(11, from, to, key, rvec, input_len);
2783
2784 // Note: the following function performs key += 13*16
2785 __ bind(L_aes192);
2786 cipherBlockChaining_decryptAESCrypt(13, from, to, key, rvec, input_len);
2787
2788 return start;
2789 }
2790
2791 // Load big-endian 128-bit from memory.
2792 void be_load_counter_128(Register counter_hi, Register counter_lo, Register counter) {
2793 __ ld(counter_lo, Address(counter, 8)); // Load 128-bits from counter
2794 __ ld(counter_hi, Address(counter));
2795 __ rev8(counter_lo, counter_lo); // Convert big-endian to little-endian
2796 __ rev8(counter_hi, counter_hi);
2797 }
2798
2799 // Little-endian 128-bit + 64-bit -> 128-bit addition.
2800 void add_counter_128(Register counter_hi, Register counter_lo) {
2801 assert_different_registers(counter_hi, counter_lo, t0);
2802 __ addi(counter_lo, counter_lo, 1);
2803 __ seqz(t0, counter_lo); // Check for result overflow
2804 __ add(counter_hi, counter_hi, t0); // Add 1 if overflow otherwise 0
2805 }
2806
2807 // Store big-endian 128-bit to memory.
2808 void be_store_counter_128(Register counter_hi, Register counter_lo, Register counter) {
2809 assert_different_registers(counter_hi, counter_lo, t0, t1);
2810 __ rev8(t0, counter_lo); // Convert little-endian to big-endian
2811 __ rev8(t1, counter_hi);
2812 __ sd(t0, Address(counter, 8)); // Store 128-bits to counter
2813 __ sd(t1, Address(counter));
2814 }
2815
2816 void counterMode_AESCrypt(int round, Register in, Register out, Register key, Register counter,
2817 Register input_len, Register saved_encrypted_ctr, Register used_ptr) {
2818 // Algorithm:
2819 //
2820 // aes_load_keys();
2821 // load_counter_128(counter_hi, counter_lo, counter);
2822 //
2823 // L_next:
2824 // if (used >= BLOCK_SIZE) goto L_main_loop;
2825 //
2826 // L_encrypt_next:
2827 // *out = *in ^ saved_encrypted_ctr[used]);
2828 // out++; in++; used++; len--;
2829 // if (len == 0) goto L_exit;
2830 // goto L_next;
2831 //
2832 // L_main_loop:
2833 // if (len == 0) goto L_exit;
2834 // saved_encrypted_ctr = aes_encrypt(counter);
2835 //
2836 // add_counter_128(counter_hi, counter_lo);
2837 // be_store_counter_128(counter_hi, counter_lo, counter);
2838 // used = 0;
2839 //
2840 // if(len < BLOCK_SIZE) goto L_encrypt_next;
2841 //
2842 // v_in = load_16Byte(in);
2843 // v_out = load_16Byte(out);
2844 // v_saved_encrypted_ctr = load_16Byte(saved_encrypted_ctr);
2845 // v_out = v_in ^ v_saved_encrypted_ctr;
2846 // out += BLOCK_SIZE;
2847 // in += BLOCK_SIZE;
2848 // len -= BLOCK_SIZE;
2849 // used = BLOCK_SIZE;
2850 // goto L_main_loop;
2851 //
2852 //
2853 // L_exit:
2854 // store(used);
2855 // result = input_len
2856 // return result;
2857
2858 const Register used = x28;
2859 const Register len = x29;
2860 const Register counter_hi = x30;
2861 const Register counter_lo = x31;
2862 const Register block_size = t2;
2863
2864 const unsigned int BLOCK_SIZE = 16;
2865
2866 VectorRegister working_vregs[] = {
2867 v1, v2, v3, v4, v5, v6, v7, v8,
2868 v9, v10, v11, v12, v13, v14, v15
2869 };
2870
2871 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
2872
2873 __ lwu(used, Address(used_ptr));
2874 __ mv(len, input_len);
2875 __ mv(block_size, BLOCK_SIZE);
2876
2877 // load keys to working_vregs according to round
2878 aes_load_keys(key, working_vregs, round);
2879
2880 // 128-bit big-endian load
2881 be_load_counter_128(counter_hi, counter_lo, counter);
2882
2883 Label L_next, L_encrypt_next, L_main_loop, L_exit;
2884 // Check the last saved_encrypted_ctr used value, we fall through
2885 // to L_encrypt_next when the used value lower than block_size
2886 __ bind(L_next);
2887 __ bgeu(used, block_size, L_main_loop);
2888
2889 // There is still data left fewer than block_size after L_main_loop
2890 // or last used, we encrypt them one by one.
2891 __ bind(L_encrypt_next);
2892 __ add(t0, saved_encrypted_ctr, used);
2893 __ lbu(t1, Address(t0));
2894 __ lbu(t0, Address(in));
2895 __ xorr(t1, t1, t0);
2896 __ sb(t1, Address(out));
2897 __ addi(in, in, 1);
2898 __ addi(out, out, 1);
2899 __ addi(used, used, 1);
2900 __ subi(len, len, 1);
2901 __ beqz(len, L_exit);
2902 __ j(L_next);
2903
2904 // We will calculate the next saved_encrypted_ctr and encrypt the blocks of data
2905 // one by one until there is less than a full block remaining if len not zero
2906 __ bind(L_main_loop);
2907 __ beqz(len, L_exit);
2908 __ vle32_v(v16, counter);
2909
2910 // encrypt counter according to round
2911 aes_encrypt(v16, working_vregs, round);
2912
2913 __ vse32_v(v16, saved_encrypted_ctr);
2914
2915 // 128-bit little-endian increment
2916 add_counter_128(counter_hi, counter_lo);
2917 // 128-bit big-endian store
2918 be_store_counter_128(counter_hi, counter_lo, counter);
2919
2920 __ mv(used, 0);
2921 // Check if we have a full block_size
2922 __ bltu(len, block_size, L_encrypt_next);
2923
2924 // We have one full block to encrypt at least
2925 __ vle32_v(v17, in);
2926 __ vxor_vv(v16, v16, v17);
2927 __ vse32_v(v16, out);
2928 __ add(out, out, block_size);
2929 __ add(in, in, block_size);
2930 __ sub(len, len, block_size);
2931 __ mv(used, block_size);
2932 __ j(L_main_loop);
2933
2934 __ bind(L_exit);
2935 __ sw(used, Address(used_ptr));
2936 __ mv(x10, input_len);
2937 __ leave();
2938 __ ret();
2939 };
2940
2941 // CTR AES crypt.
2942 // Arguments:
2943 //
2944 // Inputs:
2945 // c_rarg0 - source byte array address
2946 // c_rarg1 - destination byte array address
2947 // c_rarg2 - K (key) in little endian int array
2948 // c_rarg3 - counter vector byte array address
2949 // c_rarg4 - input length
2950 // c_rarg5 - saved encryptedCounter start
2951 // c_rarg6 - saved used length
2952 //
2953 // Output:
2954 // x10 - input length
2955 //
2956 address generate_counterMode_AESCrypt() {
2957 assert(UseAESCTRIntrinsics, "need AES instructions (Zvkned extension) and Zbb extension support");
2958
2959 __ align(CodeEntryAlignment);
2960 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
2961 StubCodeMark mark(this, stub_id);
2962
2963 const Register in = c_rarg0;
2964 const Register out = c_rarg1;
2965 const Register key = c_rarg2;
2966 const Register counter = c_rarg3;
2967 const Register input_len = c_rarg4;
2968 const Register saved_encrypted_ctr = c_rarg5;
2969 const Register used_len_ptr = c_rarg6;
2970
2971 const Register keylen = c_rarg7; // temporary register
2972
2973 const address start = __ pc();
2974 __ enter();
2975
2976 Label L_exit;
2977 __ beqz(input_len, L_exit);
2978
2979 Label L_aes128, L_aes192;
2980 // Compute #rounds for AES based on the length of the key array
2981 __ lwu(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2982 __ mv(t0, 52); // key length could be only {11, 13, 15} * 4 = {44, 52, 60}
2983 __ bltu(keylen, t0, L_aes128);
2984 __ beq(keylen, t0, L_aes192);
2985 // Else we fallthrough to the biggest case (256-bit key size)
2986
2987 // Note: the following function performs crypt with key += 15*16
2988 counterMode_AESCrypt(15, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2989
2990 // Note: the following function performs crypt with key += 13*16
2991 __ bind(L_aes192);
2992 counterMode_AESCrypt(13, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2993
2994 // Note: the following function performs crypt with key += 11*16
2995 __ bind(L_aes128);
2996 counterMode_AESCrypt(11, in, out, key, counter, input_len, saved_encrypted_ctr, used_len_ptr);
2997
2998 __ bind(L_exit);
2999 __ mv(x10, input_len);
3000 __ leave();
3001 __ ret();
3002
3003 return start;
3004 }
3005
3006 void ghash_loop(Register state, Register subkeyH, Register data, Register blocks,
3007 VectorRegister vtmp1, VectorRegister vtmp2, VectorRegister vtmp3) {
3008 VectorRegister partial_hash = vtmp1;
3009 VectorRegister hash_subkey = vtmp2;
3010 VectorRegister cipher_text = vtmp3;
3011
3012 const unsigned int BLOCK_SIZE = 16;
3013
3014 __ vsetivli(x0, 2, Assembler::e64, Assembler::m1);
3015 __ vle64_v(hash_subkey, subkeyH);
3016 __ vrev8_v(hash_subkey, hash_subkey);
3017 __ vle64_v(partial_hash, state);
3018 __ vrev8_v(partial_hash, partial_hash);
3019
3020 __ vsetivli(x0, 4, Assembler::e32, Assembler::m1);
3021 Label L_ghash_loop;
3022 __ bind(L_ghash_loop);
3023 __ vle32_v(cipher_text, data);
3024 __ addi(data, data, BLOCK_SIZE);
3025 __ vghsh_vv(partial_hash, hash_subkey, cipher_text);
3026 __ subi(blocks, blocks, 1);
3027 __ bnez(blocks, L_ghash_loop);
3028
3029 __ vsetivli(x0, 2, Assembler::e64, Assembler::m1);
3030 __ vrev8_v(partial_hash, partial_hash);
3031 __ vse64_v(partial_hash, state);
3032 }
3033
3034 /**
3035 * Arguments:
3036 *
3037 * Input:
3038 * c_rarg0 - current state address
3039 * c_rarg1 - H key address
3040 * c_rarg2 - data address
3041 * c_rarg3 - number of blocks
3042 *
3043 * Output:
3044 * Updated state at c_rarg0
3045 */
3046 address generate_ghash_processBlocks() {
3047 assert(UseGHASHIntrinsics, "need GHASH instructions (Zvkg extension) and Zvbb support");
3048
3049 __ align(CodeEntryAlignment);
3050 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
3051 StubCodeMark mark(this, stub_id);
3052
3053 address start = __ pc();
3054 __ enter();
3055
3056 Register state = c_rarg0;
3057 Register subkeyH = c_rarg1;
3058 Register data = c_rarg2;
3059 Register blocks = c_rarg3;
3060
3061 VectorRegister vtmp1 = v1;
3062 VectorRegister vtmp2 = v2;
3063 VectorRegister vtmp3 = v3;
3064
3065 ghash_loop(state, subkeyH, data, blocks, vtmp1, vtmp2, vtmp3);
3066
3067 __ leave();
3068 __ ret();
3069
3070 return start;
3071 }
3072
3073 // code for comparing 8 characters of strings with Latin1 and Utf16 encoding
3074 void compare_string_8_x_LU(Register tmpL, Register tmpU,
3075 Register strL, Register strU, Label& DIFF) {
3076 const Register tmp = x30, tmpLval = x12;
3077
3078 int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
3079 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
3080
3081 #ifdef ASSERT
3082 if (AvoidUnalignedAccesses) {
3083 Label align_ok;
3084 __ andi(t0, strL, 0x7);
3085 __ beqz(t0, align_ok);
3086 __ stop("bad alignment");
3087 __ bind(align_ok);
3088 }
3089 #endif
3090 __ ld(tmpLval, Address(strL));
3091 __ addi(strL, strL, wordSize);
3092
3093 // compare first 4 characters
3094 __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
3095 __ addi(strU, strU, wordSize);
3096 __ inflate_lo32(tmpL, tmpLval);
3097 __ xorr(tmp, tmpU, tmpL);
3098 __ bnez(tmp, DIFF);
3099
3100 // compare second 4 characters
3101 __ load_long_misaligned(tmpU, Address(strU), tmp, (base_offset % 8) != 0 ? 4 : 8);
3102 __ addi(strU, strU, wordSize);
3103 __ inflate_hi32(tmpL, tmpLval);
3104 __ xorr(tmp, tmpU, tmpL);
3105 __ bnez(tmp, DIFF);
3106 }
3107
3108 // x10 = result
3109 // x11 = str1
3110 // x12 = cnt1
3111 // x13 = str2
3112 // x14 = cnt2
3113 // x28 = tmp1
3114 // x29 = tmp2
3115 // x30 = tmp3
3116 address generate_compare_long_string_different_encoding(StubId stub_id) {
3117 bool isLU;
3118 switch (stub_id) {
3119 case StubId::stubgen_compare_long_string_LU_id:
3120 isLU = true;
3121 break;
3122 case StubId::stubgen_compare_long_string_UL_id:
3123 isLU = false;
3124 break;
3125 default:
3126 ShouldNotReachHere();
3127 };
3128 __ align(CodeEntryAlignment);
3129 StubCodeMark mark(this, stub_id);
3130 address entry = __ pc();
3131 Label SMALL_LOOP, TAIL, LOAD_LAST, DONE, CALCULATE_DIFFERENCE;
3132 const Register result = x10, str1 = x11, str2 = x13, cnt2 = x14,
3133 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x12;
3134
3135 int base_offset = arrayOopDesc::base_offset_in_bytes(T_BYTE);
3136 assert((base_offset % (UseCompactObjectHeaders ? 4 : 8)) == 0, "Must be");
3137
3138 Register strU = isLU ? str2 : str1,
3139 strL = isLU ? str1 : str2,
3140 tmpU = isLU ? tmp2 : tmp1, // where to keep U for comparison
3141 tmpL = isLU ? tmp1 : tmp2; // where to keep L for comparison
3142
3143 if (AvoidUnalignedAccesses && (base_offset % 8) != 0) {
3144 // Load 4 bytes from strL to make sure main loop is 8-byte aligned
3145 // cnt2 is >= 68 here, no need to check it for >= 0
3146 __ lwu(tmpL, Address(strL));
3147 __ addi(strL, strL, wordSize / 2);
3148 __ load_long_misaligned(tmpU, Address(strU), tmp4, (base_offset % 8) != 0 ? 4 : 8);
3149 __ addi(strU, strU, wordSize);
3150 __ inflate_lo32(tmp3, tmpL);
3151 __ mv(tmpL, tmp3);
3152 __ xorr(tmp3, tmpU, tmpL);
3153 __ bnez(tmp3, CALCULATE_DIFFERENCE);
3154 __ subi(cnt2, cnt2, wordSize / 2);
3155 }
3156
3157 // we are now 8-bytes aligned on strL when AvoidUnalignedAccesses is true
3158 __ subi(cnt2, cnt2, wordSize * 2);
3159 __ bltz(cnt2, TAIL);
3160 __ bind(SMALL_LOOP); // smaller loop
3161 __ subi(cnt2, cnt2, wordSize * 2);
3162 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3163 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3164 __ bgez(cnt2, SMALL_LOOP);
3165 __ addi(t0, cnt2, wordSize * 2);
3166 __ beqz(t0, DONE);
3167 __ bind(TAIL); // 1..15 characters left
3168 // Aligned access. Load bytes in portions - 4, 2, 1.
3169
3170 __ addi(t0, cnt2, wordSize);
3171 __ addi(cnt2, cnt2, wordSize * 2); // amount of characters left to process
3172 __ bltz(t0, LOAD_LAST);
3173 // remaining characters are greater than or equals to 8, we can do one compare_string_8_x_LU
3174 compare_string_8_x_LU(tmpL, tmpU, strL, strU, CALCULATE_DIFFERENCE);
3175 __ subi(cnt2, cnt2, wordSize);
3176 __ beqz(cnt2, DONE); // no character left
3177 __ bind(LOAD_LAST); // cnt2 = 1..7 characters left
3178
3179 __ subi(cnt2, cnt2, wordSize); // cnt2 is now an offset in strL which points to last 8 bytes
3180 __ slli(t0, cnt2, 1); // t0 is now an offset in strU which points to last 16 bytes
3181 __ add(strL, strL, cnt2); // Address of last 8 bytes in Latin1 string
3182 __ add(strU, strU, t0); // Address of last 16 bytes in UTF-16 string
3183 __ load_int_misaligned(tmpL, Address(strL), t0, false);
3184 __ load_long_misaligned(tmpU, Address(strU), t0, 2);
3185 __ inflate_lo32(tmp3, tmpL);
3186 __ mv(tmpL, tmp3);
3187 __ xorr(tmp3, tmpU, tmpL);
3188 __ bnez(tmp3, CALCULATE_DIFFERENCE);
3189
3190 __ addi(strL, strL, wordSize / 2); // Address of last 4 bytes in Latin1 string
3191 __ addi(strU, strU, wordSize); // Address of last 8 bytes in UTF-16 string
3192 __ load_int_misaligned(tmpL, Address(strL), t0, false);
3193 __ load_long_misaligned(tmpU, Address(strU), t0, 2);
3194 __ inflate_lo32(tmp3, tmpL);
3195 __ mv(tmpL, tmp3);
3196 __ xorr(tmp3, tmpU, tmpL);
3197 __ bnez(tmp3, CALCULATE_DIFFERENCE);
3198 __ j(DONE); // no character left
3199
3200 // Find the first different characters in the longwords and
3201 // compute their difference.
3202 __ bind(CALCULATE_DIFFERENCE);
3203 // count bits of trailing zero chars
3204 __ ctzc_bits(tmp4, tmp3);
3205 __ srl(tmp1, tmp1, tmp4);
3206 __ srl(tmp2, tmp2, tmp4);
3207 __ zext(tmp1, tmp1, 16);
3208 __ zext(tmp2, tmp2, 16);
3209 __ sub(result, tmp1, tmp2);
3210 __ bind(DONE);
3211 __ ret();
3212 return entry;
3213 }
3214
3215 address generate_method_entry_barrier() {
3216 __ align(CodeEntryAlignment);
3217 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
3218 StubCodeMark mark(this, stub_id);
3219
3220 Label deoptimize_label;
3221
3222 address start = __ pc();
3223
3224 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
3225
3226 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
3227 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
3228 Address thread_epoch_addr(xthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
3229 __ la(t1, ExternalAddress(bs_asm->patching_epoch_addr()));
3230 __ lwu(t1, t1);
3231 __ sw(t1, thread_epoch_addr);
3232 // There are two ways this can work:
3233 // - The writer did system icache shootdown after the instruction stream update.
3234 // Hence do nothing.
3235 // - The writer trust us to make sure our icache is in sync before entering.
3236 // Hence use cmodx fence (fence.i, may change).
3237 if (UseCtxFencei) {
3238 __ cmodx_fence();
3239 }
3240 __ membar(__ LoadLoad);
3241 }
3242
3243 __ set_last_Java_frame(sp, fp, ra);
3244
3245 __ enter();
3246 __ addi(t1, sp, wordSize);
3247
3248 __ subi(sp, sp, 4 * wordSize);
3249
3250 __ push_call_clobbered_registers();
3251
3252 __ mv(c_rarg0, t1);
3253 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
3254
3255 __ reset_last_Java_frame(true);
3256
3257 __ mv(t0, x10);
3258
3259 __ pop_call_clobbered_registers();
3260
3261 __ bnez(t0, deoptimize_label);
3262
3263 __ leave();
3264 __ ret();
3265
3266 __ BIND(deoptimize_label);
3267
3268 __ ld(t0, Address(sp, 0));
3269 __ ld(fp, Address(sp, wordSize));
3270 __ ld(ra, Address(sp, wordSize * 2));
3271 __ ld(t1, Address(sp, wordSize * 3));
3272
3273 __ mv(sp, t0);
3274 __ jr(t1);
3275
3276 return start;
3277 }
3278
3279 // x10 = result
3280 // x11 = str1
3281 // x12 = cnt1
3282 // x13 = str2
3283 // x14 = cnt2
3284 // x28 = tmp1
3285 // x29 = tmp2
3286 // x30 = tmp3
3287 // x31 = tmp4
3288 address generate_compare_long_string_same_encoding(StubId stub_id) {
3289 bool isLL;
3290 switch (stub_id) {
3291 case StubId::stubgen_compare_long_string_LL_id:
3292 isLL = true;
3293 break;
3294 case StubId::stubgen_compare_long_string_UU_id:
3295 isLL = false;
3296 break;
3297 default:
3298 ShouldNotReachHere();
3299 };
3300 __ align(CodeEntryAlignment);
3301 StubCodeMark mark(this, stub_id);
3302 address entry = __ pc();
3303 Label SMALL_LOOP, CHECK_LAST, DIFF2, TAIL,
3304 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF;
3305 const Register result = x10, str1 = x11, cnt1 = x12, str2 = x13, cnt2 = x14,
3306 tmp1 = x28, tmp2 = x29, tmp3 = x30, tmp4 = x7, tmp5 = x31;
3307 RegSet spilled_regs = RegSet::of(tmp4, tmp5);
3308
3309 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used
3310 // update cnt2 counter with already loaded 8 bytes
3311 __ subi(cnt2, cnt2, wordSize / (isLL ? 1 : 2));
3312 // update pointers, because of previous read
3313 __ addi(str1, str1, wordSize);
3314 __ addi(str2, str2, wordSize);
3315 // less than 16 bytes left?
3316 __ subi(cnt2, cnt2, isLL ? 16 : 8);
3317 __ push_reg(spilled_regs, sp);
3318 __ bltz(cnt2, TAIL);
3319 __ bind(SMALL_LOOP);
3320 // compare 16 bytes of strings with same encoding
3321 __ ld(tmp5, Address(str1));
3322 __ addi(str1, str1, 8);
3323 __ xorr(tmp4, tmp1, tmp2);
3324 __ ld(cnt1, Address(str2));
3325 __ addi(str2, str2, 8);
3326 __ bnez(tmp4, DIFF);
3327 __ ld(tmp1, Address(str1));
3328 __ addi(str1, str1, 8);
3329 __ xorr(tmp4, tmp5, cnt1);
3330 __ ld(tmp2, Address(str2));
3331 __ addi(str2, str2, 8);
3332 __ bnez(tmp4, DIFF2);
3333
3334 __ subi(cnt2, cnt2, isLL ? 16 : 8);
3335 __ bgez(cnt2, SMALL_LOOP);
3336 __ bind(TAIL);
3337 __ addi(cnt2, cnt2, isLL ? 16 : 8);
3338 __ beqz(cnt2, LAST_CHECK_AND_LENGTH_DIFF);
3339 __ subi(cnt2, cnt2, isLL ? 8 : 4);
3340 __ blez(cnt2, CHECK_LAST);
3341 __ xorr(tmp4, tmp1, tmp2);
3342 __ bnez(tmp4, DIFF);
3343 __ ld(tmp1, Address(str1));
3344 __ addi(str1, str1, 8);
3345 __ ld(tmp2, Address(str2));
3346 __ addi(str2, str2, 8);
3347 __ subi(cnt2, cnt2, isLL ? 8 : 4);
3348 __ bind(CHECK_LAST);
3349 if (!isLL) {
3350 __ add(cnt2, cnt2, cnt2); // now in bytes
3351 }
3352 __ xorr(tmp4, tmp1, tmp2);
3353 __ bnez(tmp4, DIFF);
3354 __ add(str1, str1, cnt2);
3355 __ load_long_misaligned(tmp5, Address(str1), tmp3, isLL ? 1 : 2);
3356 __ add(str2, str2, cnt2);
3357 __ load_long_misaligned(cnt1, Address(str2), tmp3, isLL ? 1 : 2);
3358 __ xorr(tmp4, tmp5, cnt1);
3359 __ beqz(tmp4, LENGTH_DIFF);
3360 // Find the first different characters in the longwords and
3361 // compute their difference.
3362 __ bind(DIFF2);
3363 // count bits of trailing zero chars
3364 __ ctzc_bits(tmp3, tmp4, isLL);
3365 __ srl(tmp5, tmp5, tmp3);
3366 __ srl(cnt1, cnt1, tmp3);
3367 if (isLL) {
3368 __ zext(tmp5, tmp5, 8);
3369 __ zext(cnt1, cnt1, 8);
3370 } else {
3371 __ zext(tmp5, tmp5, 16);
3372 __ zext(cnt1, cnt1, 16);
3373 }
3374 __ sub(result, tmp5, cnt1);
3375 __ j(LENGTH_DIFF);
3376 __ bind(DIFF);
3377 // count bits of trailing zero chars
3378 __ ctzc_bits(tmp3, tmp4, isLL);
3379 __ srl(tmp1, tmp1, tmp3);
3380 __ srl(tmp2, tmp2, tmp3);
3381 if (isLL) {
3382 __ zext(tmp1, tmp1, 8);
3383 __ zext(tmp2, tmp2, 8);
3384 } else {
3385 __ zext(tmp1, tmp1, 16);
3386 __ zext(tmp2, tmp2, 16);
3387 }
3388 __ sub(result, tmp1, tmp2);
3389 __ j(LENGTH_DIFF);
3390 __ bind(LAST_CHECK_AND_LENGTH_DIFF);
3391 __ xorr(tmp4, tmp1, tmp2);
3392 __ bnez(tmp4, DIFF);
3393 __ bind(LENGTH_DIFF);
3394 __ pop_reg(spilled_regs, sp);
3395 __ ret();
3396 return entry;
3397 }
3398
3399 void generate_compare_long_strings() {
3400 StubRoutines::riscv::_compare_long_string_LL = generate_compare_long_string_same_encoding(StubId::stubgen_compare_long_string_LL_id);
3401 StubRoutines::riscv::_compare_long_string_UU = generate_compare_long_string_same_encoding(StubId::stubgen_compare_long_string_UU_id);
3402 StubRoutines::riscv::_compare_long_string_LU = generate_compare_long_string_different_encoding(StubId::stubgen_compare_long_string_LU_id);
3403 StubRoutines::riscv::_compare_long_string_UL = generate_compare_long_string_different_encoding(StubId::stubgen_compare_long_string_UL_id);
3404 }
3405
3406 // x10 result
3407 // x11 src
3408 // x12 src count
3409 // x13 pattern
3410 // x14 pattern count
3411 address generate_string_indexof_linear(StubId stub_id)
3412 {
3413 bool needle_isL;
3414 bool haystack_isL;
3415 switch (stub_id) {
3416 case StubId::stubgen_string_indexof_linear_ll_id:
3417 needle_isL = true;
3418 haystack_isL = true;
3419 break;
3420 case StubId::stubgen_string_indexof_linear_ul_id:
3421 needle_isL = true;
3422 haystack_isL = false;
3423 break;
3424 case StubId::stubgen_string_indexof_linear_uu_id:
3425 needle_isL = false;
3426 haystack_isL = false;
3427 break;
3428 default:
3429 ShouldNotReachHere();
3430 };
3431
3432 __ align(CodeEntryAlignment);
3433 StubCodeMark mark(this, stub_id);
3434 address entry = __ pc();
3435
3436 int needle_chr_size = needle_isL ? 1 : 2;
3437 int haystack_chr_size = haystack_isL ? 1 : 2;
3438 int needle_chr_shift = needle_isL ? 0 : 1;
3439 int haystack_chr_shift = haystack_isL ? 0 : 1;
3440 bool isL = needle_isL && haystack_isL;
3441 // parameters
3442 Register result = x10, haystack = x11, haystack_len = x12, needle = x13, needle_len = x14;
3443 // temporary registers
3444 Register mask1 = x20, match_mask = x21, first = x22, trailing_zeros = x23, mask2 = x24, tmp = x25;
3445 // redefinitions
3446 Register ch1 = x28, ch2 = x29;
3447 RegSet spilled_regs = RegSet::range(x20, x25) + RegSet::range(x28, x29);
3448
3449 __ push_reg(spilled_regs, sp);
3450
3451 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
3452 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
3453 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
3454 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
3455 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
3456 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
3457
3458 __ ld(ch1, Address(needle));
3459 __ ld(ch2, Address(haystack));
3460 // src.length - pattern.length
3461 __ sub(haystack_len, haystack_len, needle_len);
3462
3463 // first is needle[0]
3464 __ zext(first, ch1, needle_isL ? 8 : 16);
3465
3466 uint64_t mask0101 = UCONST64(0x0101010101010101);
3467 uint64_t mask0001 = UCONST64(0x0001000100010001);
3468 __ mv(mask1, haystack_isL ? mask0101 : mask0001);
3469 __ mul(first, first, mask1);
3470 uint64_t mask7f7f = UCONST64(0x7f7f7f7f7f7f7f7f);
3471 uint64_t mask7fff = UCONST64(0x7fff7fff7fff7fff);
3472 __ mv(mask2, haystack_isL ? mask7f7f : mask7fff);
3473 if (needle_isL != haystack_isL) {
3474 __ mv(tmp, ch1);
3475 }
3476 __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size - 1);
3477 __ blez(haystack_len, L_SMALL);
3478
3479 if (needle_isL != haystack_isL) {
3480 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
3481 }
3482 // xorr, sub, orr, notr, andr
3483 // compare and set match_mask[i] with 0x80/0x8000 (Latin1/UTF16) if ch2[i] == first[i]
3484 // eg:
3485 // first: aa aa aa aa aa aa aa aa
3486 // ch2: aa aa li nx jd ka aa aa
3487 // match_mask: 80 80 00 00 00 00 80 80
3488 __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
3489
3490 // search first char of needle, if success, goto L_HAS_ZERO;
3491 __ bnez(match_mask, L_HAS_ZERO);
3492 __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
3493 __ addi(result, result, wordSize / haystack_chr_size);
3494 __ addi(haystack, haystack, wordSize);
3495 __ bltz(haystack_len, L_POST_LOOP);
3496
3497 __ bind(L_LOOP);
3498 __ ld(ch2, Address(haystack));
3499 __ compute_match_mask(ch2, first, match_mask, mask1, mask2);
3500 __ bnez(match_mask, L_HAS_ZERO);
3501
3502 __ bind(L_LOOP_PROCEED);
3503 __ subi(haystack_len, haystack_len, wordSize / haystack_chr_size);
3504 __ addi(haystack, haystack, wordSize);
3505 __ addi(result, result, wordSize / haystack_chr_size);
3506 __ bgez(haystack_len, L_LOOP);
3507
3508 __ bind(L_POST_LOOP);
3509 __ mv(ch2, -wordSize / haystack_chr_size);
3510 __ ble(haystack_len, ch2, NOMATCH); // no extra characters to check
3511 __ ld(ch2, Address(haystack));
3512 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
3513 __ neg(haystack_len, haystack_len);
3514 __ xorr(ch2, first, ch2);
3515 __ sub(match_mask, ch2, mask1);
3516 __ orr(ch2, ch2, mask2);
3517 __ mv(trailing_zeros, -1); // all bits set
3518 __ j(L_SMALL_PROCEED);
3519
3520 __ align(OptoLoopAlignment);
3521 __ bind(L_SMALL);
3522 __ slli(haystack_len, haystack_len, LogBitsPerByte + haystack_chr_shift);
3523 __ neg(haystack_len, haystack_len);
3524 if (needle_isL != haystack_isL) {
3525 __ inflate_lo32(ch1, tmp, match_mask, trailing_zeros);
3526 }
3527 __ xorr(ch2, first, ch2);
3528 __ sub(match_mask, ch2, mask1);
3529 __ orr(ch2, ch2, mask2);
3530 __ mv(trailing_zeros, -1); // all bits set
3531
3532 __ bind(L_SMALL_PROCEED);
3533 __ srl(trailing_zeros, trailing_zeros, haystack_len); // mask. zeroes on useless bits.
3534 __ notr(ch2, ch2);
3535 __ andr(match_mask, match_mask, ch2);
3536 __ andr(match_mask, match_mask, trailing_zeros); // clear useless bits and check
3537 __ beqz(match_mask, NOMATCH);
3538
3539 __ bind(L_SMALL_HAS_ZERO_LOOP);
3540 // count bits of trailing zero chars
3541 __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, ch2, tmp);
3542 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3543 __ mv(ch2, wordSize / haystack_chr_size);
3544 __ ble(needle_len, ch2, L_SMALL_CMP_LOOP_LAST_CMP2);
3545 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3546 __ mv(trailing_zeros, wordSize / haystack_chr_size);
3547 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3548
3549 __ bind(L_SMALL_CMP_LOOP);
3550 __ shadd(first, trailing_zeros, needle, first, needle_chr_shift);
3551 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
3552 needle_isL ? __ lbu(first, Address(first)) : __ lhu(first, Address(first));
3553 haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
3554 __ addi(trailing_zeros, trailing_zeros, 1);
3555 __ bge(trailing_zeros, needle_len, L_SMALL_CMP_LOOP_LAST_CMP);
3556 __ beq(first, ch2, L_SMALL_CMP_LOOP);
3557
3558 __ bind(L_SMALL_CMP_LOOP_NOMATCH);
3559 __ beqz(match_mask, NOMATCH);
3560 // count bits of trailing zero chars
3561 __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
3562 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3563 __ addi(result, result, 1);
3564 __ addi(haystack, haystack, haystack_chr_size);
3565 __ j(L_SMALL_HAS_ZERO_LOOP);
3566
3567 __ align(OptoLoopAlignment);
3568 __ bind(L_SMALL_CMP_LOOP_LAST_CMP);
3569 __ bne(first, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3570 __ j(DONE);
3571
3572 __ align(OptoLoopAlignment);
3573 __ bind(L_SMALL_CMP_LOOP_LAST_CMP2);
3574 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3575 __ bne(ch1, ch2, L_SMALL_CMP_LOOP_NOMATCH);
3576 __ j(DONE);
3577
3578 __ align(OptoLoopAlignment);
3579 __ bind(L_HAS_ZERO);
3580 // count bits of trailing zero chars
3581 __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, tmp, ch2);
3582 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3583 __ slli(needle_len, needle_len, BitsPerByte * wordSize / 2);
3584 __ orr(haystack_len, haystack_len, needle_len); // restore needle_len(32bits)
3585 __ subi(result, result, 1); // array index from 0, so result -= 1
3586
3587 __ bind(L_HAS_ZERO_LOOP);
3588 __ mv(needle_len, wordSize / haystack_chr_size);
3589 __ srli(ch2, haystack_len, BitsPerByte * wordSize / 2);
3590 __ bge(needle_len, ch2, L_CMP_LOOP_LAST_CMP2);
3591 // load next 8 bytes from haystack, and increase result index
3592 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3593 __ addi(result, result, 1);
3594 __ mv(trailing_zeros, wordSize / haystack_chr_size);
3595 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
3596
3597 // compare one char
3598 __ bind(L_CMP_LOOP);
3599 __ shadd(needle_len, trailing_zeros, needle, needle_len, needle_chr_shift);
3600 needle_isL ? __ lbu(needle_len, Address(needle_len)) : __ lhu(needle_len, Address(needle_len));
3601 __ shadd(ch2, trailing_zeros, haystack, ch2, haystack_chr_shift);
3602 haystack_isL ? __ lbu(ch2, Address(ch2)) : __ lhu(ch2, Address(ch2));
3603 __ addi(trailing_zeros, trailing_zeros, 1); // next char index
3604 __ srli(tmp, haystack_len, BitsPerByte * wordSize / 2);
3605 __ bge(trailing_zeros, tmp, L_CMP_LOOP_LAST_CMP);
3606 __ beq(needle_len, ch2, L_CMP_LOOP);
3607
3608 __ bind(L_CMP_LOOP_NOMATCH);
3609 __ beqz(match_mask, L_HAS_ZERO_LOOP_NOMATCH);
3610 // count bits of trailing zero chars
3611 __ ctzc_bits(trailing_zeros, match_mask, haystack_isL, needle_len, ch2);
3612 __ addi(trailing_zeros, trailing_zeros, haystack_isL ? 7 : 15);
3613 __ addi(haystack, haystack, haystack_chr_size);
3614 __ j(L_HAS_ZERO_LOOP);
3615
3616 __ align(OptoLoopAlignment);
3617 __ bind(L_CMP_LOOP_LAST_CMP);
3618 __ bne(needle_len, ch2, L_CMP_LOOP_NOMATCH);
3619 __ j(DONE);
3620
3621 __ align(OptoLoopAlignment);
3622 __ bind(L_CMP_LOOP_LAST_CMP2);
3623 __ compute_index(haystack, trailing_zeros, match_mask, result, ch2, tmp, haystack_isL);
3624 __ addi(result, result, 1);
3625 __ bne(ch1, ch2, L_CMP_LOOP_NOMATCH);
3626 __ j(DONE);
3627
3628 __ align(OptoLoopAlignment);
3629 __ bind(L_HAS_ZERO_LOOP_NOMATCH);
3630 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
3631 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
3632 // so, result was increased at max by wordSize/str2_chr_size - 1, so,
3633 // respective high bit wasn't changed. L_LOOP_PROCEED will increase
3634 // result by analyzed characters value, so, we can just reset lower bits
3635 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
3636 // 2) restore needle_len and haystack_len values from "compressed" haystack_len
3637 // 3) advance haystack value to represent next haystack octet. result & 7/3 is
3638 // index of last analyzed substring inside current octet. So, haystack in at
3639 // respective start address. We need to advance it to next octet
3640 __ andi(match_mask, result, wordSize / haystack_chr_size - 1);
3641 __ srli(needle_len, haystack_len, BitsPerByte * wordSize / 2);
3642 __ andi(result, result, haystack_isL ? -8 : -4);
3643 __ slli(tmp, match_mask, haystack_chr_shift);
3644 __ sub(haystack, haystack, tmp);
3645 __ sext(haystack_len, haystack_len, 32);
3646 __ j(L_LOOP_PROCEED);
3647
3648 __ align(OptoLoopAlignment);
3649 __ bind(NOMATCH);
3650 __ mv(result, -1);
3651
3652 __ bind(DONE);
3653 __ pop_reg(spilled_regs, sp);
3654 __ ret();
3655 return entry;
3656 }
3657
3658 void generate_string_indexof_stubs()
3659 {
3660 StubRoutines::riscv::_string_indexof_linear_ll = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_ll_id);
3661 StubRoutines::riscv::_string_indexof_linear_uu = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_uu_id);
3662 StubRoutines::riscv::_string_indexof_linear_ul = generate_string_indexof_linear(StubId::stubgen_string_indexof_linear_ul_id);
3663 }
3664
3665 #ifdef COMPILER2
3666 void generate_lookup_secondary_supers_table_stub() {
3667 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
3668 StubCodeMark mark(this, stub_id);
3669
3670 const Register
3671 r_super_klass = x10,
3672 r_array_base = x11,
3673 r_array_length = x12,
3674 r_array_index = x13,
3675 r_sub_klass = x14,
3676 result = x15,
3677 r_bitmap = x16;
3678
3679 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
3680 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
3681 Label L_success;
3682 __ enter();
3683 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass, result,
3684 r_array_base, r_array_length, r_array_index,
3685 r_bitmap, slot, /*stub_is_near*/true);
3686 __ leave();
3687 __ ret();
3688 }
3689 }
3690
3691 // Slow path implementation for UseSecondarySupersTable.
3692 address generate_lookup_secondary_supers_table_slow_path_stub() {
3693 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
3694 StubCodeMark mark(this, stub_id);
3695
3696 address start = __ pc();
3697 const Register
3698 r_super_klass = x10, // argument
3699 r_array_base = x11, // argument
3700 temp1 = x12, // tmp
3701 r_array_index = x13, // argument
3702 result = x15, // argument
3703 r_bitmap = x16; // argument
3704
3705
3706 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, result, temp1);
3707 __ ret();
3708
3709 return start;
3710 }
3711
3712 address generate_mulAdd()
3713 {
3714 __ align(CodeEntryAlignment);
3715 StubId stub_id = StubId::stubgen_mulAdd_id;
3716 StubCodeMark mark(this, stub_id);
3717
3718 address entry = __ pc();
3719
3720 const Register out = x10;
3721 const Register in = x11;
3722 const Register offset = x12;
3723 const Register len = x13;
3724 const Register k = x14;
3725 const Register tmp = x28;
3726
3727 BLOCK_COMMENT("Entry:");
3728 __ enter();
3729 __ mul_add(out, in, offset, len, k, tmp);
3730 __ leave();
3731 __ ret();
3732
3733 return entry;
3734 }
3735
3736 /**
3737 * Arguments:
3738 *
3739 * Input:
3740 * c_rarg0 - x address
3741 * c_rarg1 - x length
3742 * c_rarg2 - y address
3743 * c_rarg3 - y length
3744 * c_rarg4 - z address
3745 */
3746 address generate_multiplyToLen()
3747 {
3748 __ align(CodeEntryAlignment);
3749 StubId stub_id = StubId::stubgen_multiplyToLen_id;
3750 StubCodeMark mark(this, stub_id);
3751 address entry = __ pc();
3752
3753 const Register x = x10;
3754 const Register xlen = x11;
3755 const Register y = x12;
3756 const Register ylen = x13;
3757 const Register z = x14;
3758
3759 const Register tmp0 = x15;
3760 const Register tmp1 = x16;
3761 const Register tmp2 = x17;
3762 const Register tmp3 = x7;
3763 const Register tmp4 = x28;
3764 const Register tmp5 = x29;
3765 const Register tmp6 = x30;
3766 const Register tmp7 = x31;
3767
3768 BLOCK_COMMENT("Entry:");
3769 __ enter(); // required for proper stackwalking of RuntimeStub frame
3770 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3771 __ leave(); // required for proper stackwalking of RuntimeStub frame
3772 __ ret();
3773
3774 return entry;
3775 }
3776
3777 address generate_squareToLen()
3778 {
3779 __ align(CodeEntryAlignment);
3780 StubId stub_id = StubId::stubgen_squareToLen_id;
3781 StubCodeMark mark(this, stub_id);
3782 address entry = __ pc();
3783
3784 const Register x = x10;
3785 const Register xlen = x11;
3786 const Register z = x12;
3787 const Register y = x14; // == x
3788 const Register ylen = x15; // == xlen
3789
3790 const Register tmp0 = x13; // zlen, unused
3791 const Register tmp1 = x16;
3792 const Register tmp2 = x17;
3793 const Register tmp3 = x7;
3794 const Register tmp4 = x28;
3795 const Register tmp5 = x29;
3796 const Register tmp6 = x30;
3797 const Register tmp7 = x31;
3798
3799 BLOCK_COMMENT("Entry:");
3800 __ enter();
3801 __ mv(y, x);
3802 __ mv(ylen, xlen);
3803 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3804 __ leave();
3805 __ ret();
3806
3807 return entry;
3808 }
3809
3810 // Arguments:
3811 //
3812 // Input:
3813 // c_rarg0 - newArr address
3814 // c_rarg1 - oldArr address
3815 // c_rarg2 - newIdx
3816 // c_rarg3 - shiftCount
3817 // c_rarg4 - numIter
3818 //
3819 address generate_bigIntegerLeftShift() {
3820 __ align(CodeEntryAlignment);
3821 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
3822 StubCodeMark mark(this, stub_id);
3823 address entry = __ pc();
3824
3825 Label loop, exit;
3826
3827 Register newArr = c_rarg0;
3828 Register oldArr = c_rarg1;
3829 Register newIdx = c_rarg2;
3830 Register shiftCount = c_rarg3;
3831 Register numIter = c_rarg4;
3832
3833 Register shiftRevCount = c_rarg5;
3834 Register oldArrNext = t1;
3835
3836 __ beqz(numIter, exit);
3837 __ shadd(newArr, newIdx, newArr, t0, 2);
3838
3839 __ mv(shiftRevCount, 32);
3840 __ sub(shiftRevCount, shiftRevCount, shiftCount);
3841
3842 __ bind(loop);
3843 __ addi(oldArrNext, oldArr, 4);
3844 __ vsetvli(t0, numIter, Assembler::e32, Assembler::m4);
3845 __ vle32_v(v0, oldArr);
3846 __ vle32_v(v4, oldArrNext);
3847 __ vsll_vx(v0, v0, shiftCount);
3848 __ vsrl_vx(v4, v4, shiftRevCount);
3849 __ vor_vv(v0, v0, v4);
3850 __ vse32_v(v0, newArr);
3851 __ sub(numIter, numIter, t0);
3852 __ shadd(oldArr, t0, oldArr, t1, 2);
3853 __ shadd(newArr, t0, newArr, t1, 2);
3854 __ bnez(numIter, loop);
3855
3856 __ bind(exit);
3857 __ ret();
3858
3859 return entry;
3860 }
3861
3862 // Arguments:
3863 //
3864 // Input:
3865 // c_rarg0 - newArr address
3866 // c_rarg1 - oldArr address
3867 // c_rarg2 - newIdx
3868 // c_rarg3 - shiftCount
3869 // c_rarg4 - numIter
3870 //
3871 address generate_bigIntegerRightShift() {
3872 __ align(CodeEntryAlignment);
3873 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
3874 StubCodeMark mark(this, stub_id);
3875 address entry = __ pc();
3876
3877 Label loop, exit;
3878
3879 Register newArr = c_rarg0;
3880 Register oldArr = c_rarg1;
3881 Register newIdx = c_rarg2;
3882 Register shiftCount = c_rarg3;
3883 Register numIter = c_rarg4;
3884 Register idx = numIter;
3885
3886 Register shiftRevCount = c_rarg5;
3887 Register oldArrNext = c_rarg6;
3888 Register newArrCur = t0;
3889 Register oldArrCur = t1;
3890
3891 __ beqz(idx, exit);
3892 __ shadd(newArr, newIdx, newArr, t0, 2);
3893
3894 __ mv(shiftRevCount, 32);
3895 __ sub(shiftRevCount, shiftRevCount, shiftCount);
3896
3897 __ bind(loop);
3898 __ vsetvli(t0, idx, Assembler::e32, Assembler::m4);
3899 __ sub(idx, idx, t0);
3900 __ shadd(oldArrNext, idx, oldArr, t1, 2);
3901 __ shadd(newArrCur, idx, newArr, t1, 2);
3902 __ addi(oldArrCur, oldArrNext, 4);
3903 __ vle32_v(v0, oldArrCur);
3904 __ vle32_v(v4, oldArrNext);
3905 __ vsrl_vx(v0, v0, shiftCount);
3906 __ vsll_vx(v4, v4, shiftRevCount);
3907 __ vor_vv(v0, v0, v4);
3908 __ vse32_v(v0, newArrCur);
3909 __ bnez(idx, loop);
3910
3911 __ bind(exit);
3912 __ ret();
3913
3914 return entry;
3915 }
3916 #endif
3917
3918 #ifdef COMPILER2
3919 class MontgomeryMultiplyGenerator : public MacroAssembler {
3920
3921 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3922 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2, Ri, Rj;
3923
3924 RegSet _toSave;
3925 bool _squaring;
3926
3927 public:
3928 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3929 : MacroAssembler(as->code()), _squaring(squaring) {
3930
3931 // Register allocation
3932
3933 RegSetIterator<Register> regs = RegSet::range(x10, x26).begin();
3934 Pa_base = *regs; // Argument registers
3935 if (squaring) {
3936 Pb_base = Pa_base;
3937 } else {
3938 Pb_base = *++regs;
3939 }
3940 Pn_base = *++regs;
3941 Rlen= *++regs;
3942 inv = *++regs;
3943 Pm_base = *++regs;
3944
3945 // Working registers:
3946 Ra = *++regs; // The current digit of a, b, n, and m.
3947 Rb = *++regs;
3948 Rm = *++regs;
3949 Rn = *++regs;
3950
3951 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
3952 Pb = *++regs;
3953 Pm = *++regs;
3954 Pn = *++regs;
3955
3956 tmp0 = *++regs; // Three registers which form a
3957 tmp1 = *++regs; // triple-precision accumuator.
3958 tmp2 = *++regs;
3959
3960 Ri = x6; // Inner and outer loop indexes.
3961 Rj = x7;
3962
3963 Rhi_ab = x28; // Product registers: low and high parts
3964 Rlo_ab = x29; // of a*b and m*n.
3965 Rhi_mn = x30;
3966 Rlo_mn = x31;
3967
3968 // x18 and up are callee-saved.
3969 _toSave = RegSet::range(x18, *regs) + Pm_base;
3970 }
3971
3972 private:
3973 void save_regs() {
3974 push_reg(_toSave, sp);
3975 }
3976
3977 void restore_regs() {
3978 pop_reg(_toSave, sp);
3979 }
3980
3981 template <typename T>
3982 void unroll_2(Register count, T block) {
3983 Label loop, end, odd;
3984 beqz(count, end);
3985 test_bit(t0, count, 0);
3986 bnez(t0, odd);
3987 align(16);
3988 bind(loop);
3989 (this->*block)();
3990 bind(odd);
3991 (this->*block)();
3992 subi(count, count, 2);
3993 bgtz(count, loop);
3994 bind(end);
3995 }
3996
3997 template <typename T>
3998 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3999 Label loop, end, odd;
4000 beqz(count, end);
4001 test_bit(tmp, count, 0);
4002 bnez(tmp, odd);
4003 align(16);
4004 bind(loop);
4005 (this->*block)(d, s, tmp);
4006 bind(odd);
4007 (this->*block)(d, s, tmp);
4008 subi(count, count, 2);
4009 bgtz(count, loop);
4010 bind(end);
4011 }
4012
4013 void pre1(RegisterOrConstant i) {
4014 block_comment("pre1");
4015 // Pa = Pa_base;
4016 // Pb = Pb_base + i;
4017 // Pm = Pm_base;
4018 // Pn = Pn_base + i;
4019 // Ra = *Pa;
4020 // Rb = *Pb;
4021 // Rm = *Pm;
4022 // Rn = *Pn;
4023 if (i.is_register()) {
4024 slli(t0, i.as_register(), LogBytesPerWord);
4025 } else {
4026 mv(t0, i.as_constant());
4027 slli(t0, t0, LogBytesPerWord);
4028 }
4029
4030 mv(Pa, Pa_base);
4031 add(Pb, Pb_base, t0);
4032 mv(Pm, Pm_base);
4033 add(Pn, Pn_base, t0);
4034
4035 ld(Ra, Address(Pa));
4036 ld(Rb, Address(Pb));
4037 ld(Rm, Address(Pm));
4038 ld(Rn, Address(Pn));
4039
4040 // Zero the m*n result.
4041 mv(Rhi_mn, zr);
4042 mv(Rlo_mn, zr);
4043 }
4044
4045 // The core multiply-accumulate step of a Montgomery
4046 // multiplication. The idea is to schedule operations as a
4047 // pipeline so that instructions with long latencies (loads and
4048 // multiplies) have time to complete before their results are
4049 // used. This most benefits in-order implementations of the
4050 // architecture but out-of-order ones also benefit.
4051 void step() {
4052 block_comment("step");
4053 // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4054 // Ra = *++Pa;
4055 // Rb = *--Pb;
4056 mulhu(Rhi_ab, Ra, Rb);
4057 mul(Rlo_ab, Ra, Rb);
4058 addi(Pa, Pa, wordSize);
4059 ld(Ra, Address(Pa));
4060 subi(Pb, Pb, wordSize);
4061 ld(Rb, Address(Pb));
4062 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n from the
4063 // previous iteration.
4064 // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4065 // Rm = *++Pm;
4066 // Rn = *--Pn;
4067 mulhu(Rhi_mn, Rm, Rn);
4068 mul(Rlo_mn, Rm, Rn);
4069 addi(Pm, Pm, wordSize);
4070 ld(Rm, Address(Pm));
4071 subi(Pn, Pn, wordSize);
4072 ld(Rn, Address(Pn));
4073 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4074 }
4075
4076 void post1() {
4077 block_comment("post1");
4078
4079 // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4080 // Ra = *++Pa;
4081 // Rb = *--Pb;
4082 mulhu(Rhi_ab, Ra, Rb);
4083 mul(Rlo_ab, Ra, Rb);
4084 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n
4085 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4086
4087 // *Pm = Rm = tmp0 * inv;
4088 mul(Rm, tmp0, inv);
4089 sd(Rm, Address(Pm));
4090
4091 // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4092 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
4093 mulhu(Rhi_mn, Rm, Rn);
4094
4095 #ifndef PRODUCT
4096 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
4097 {
4098 mul(Rlo_mn, Rm, Rn);
4099 add(Rlo_mn, tmp0, Rlo_mn);
4100 Label ok;
4101 beqz(Rlo_mn, ok);
4102 stop("broken Montgomery multiply");
4103 bind(ok);
4104 }
4105 #endif
4106 // We have very carefully set things up so that
4107 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
4108 // the lower half of Rm * Rn because we know the result already:
4109 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff
4110 // tmp0 != 0. So, rather than do a mul and an cad we just set
4111 // the carry flag iff tmp0 is nonzero.
4112 //
4113 // mul(Rlo_mn, Rm, Rn);
4114 // cad(zr, tmp0, Rlo_mn);
4115 subi(t0, tmp0, 1);
4116 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
4117 cadc(tmp0, tmp1, Rhi_mn, t0);
4118 adc(tmp1, tmp2, zr, t0);
4119 mv(tmp2, zr);
4120 }
4121
4122 void pre2(Register i, Register len) {
4123 block_comment("pre2");
4124 // Pa = Pa_base + i-len;
4125 // Pb = Pb_base + len;
4126 // Pm = Pm_base + i-len;
4127 // Pn = Pn_base + len;
4128
4129 sub(Rj, i, len);
4130 // Rj == i-len
4131
4132 // Ra as temp register
4133 slli(Ra, Rj, LogBytesPerWord);
4134 add(Pa, Pa_base, Ra);
4135 add(Pm, Pm_base, Ra);
4136 slli(Ra, len, LogBytesPerWord);
4137 add(Pb, Pb_base, Ra);
4138 add(Pn, Pn_base, Ra);
4139
4140 // Ra = *++Pa;
4141 // Rb = *--Pb;
4142 // Rm = *++Pm;
4143 // Rn = *--Pn;
4144 addi(Pa, Pa, wordSize);
4145 ld(Ra, Address(Pa));
4146 subi(Pb, Pb, wordSize);
4147 ld(Rb, Address(Pb));
4148 addi(Pm, Pm, wordSize);
4149 ld(Rm, Address(Pm));
4150 subi(Pn, Pn, wordSize);
4151 ld(Rn, Address(Pn));
4152
4153 mv(Rhi_mn, zr);
4154 mv(Rlo_mn, zr);
4155 }
4156
4157 void post2(Register i, Register len) {
4158 block_comment("post2");
4159 sub(Rj, i, len);
4160
4161 cad(tmp0, tmp0, Rlo_mn, t0); // The pending m*n, low part
4162
4163 // As soon as we know the least significant digit of our result,
4164 // store it.
4165 // Pm_base[i-len] = tmp0;
4166 // Rj as temp register
4167 slli(Rj, Rj, LogBytesPerWord);
4168 add(Rj, Pm_base, Rj);
4169 sd(tmp0, Address(Rj));
4170
4171 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
4172 cadc(tmp0, tmp1, Rhi_mn, t0); // The pending m*n, high part
4173 adc(tmp1, tmp2, zr, t0);
4174 mv(tmp2, zr);
4175 }
4176
4177 // A carry in tmp0 after Montgomery multiplication means that we
4178 // should subtract multiples of n from our result in m. We'll
4179 // keep doing that until there is no carry.
4180 void normalize(Register len) {
4181 block_comment("normalize");
4182 // while (tmp0)
4183 // tmp0 = sub(Pm_base, Pn_base, tmp0, len);
4184 Label loop, post, again;
4185 Register cnt = tmp1, i = tmp2; // Re-use registers; we're done with them now
4186 beqz(tmp0, post); {
4187 bind(again); {
4188 mv(i, zr);
4189 mv(cnt, len);
4190 slli(Rn, i, LogBytesPerWord);
4191 add(Rm, Pm_base, Rn);
4192 ld(Rm, Address(Rm));
4193 add(Rn, Pn_base, Rn);
4194 ld(Rn, Address(Rn));
4195 mv(t0, 1); // set carry flag, i.e. no borrow
4196 align(16);
4197 bind(loop); {
4198 notr(Rn, Rn);
4199 add(Rm, Rm, t0);
4200 add(Rm, Rm, Rn);
4201 sltu(t0, Rm, Rn);
4202 slli(Rn, i, LogBytesPerWord); // Rn as temp register
4203 add(Rn, Pm_base, Rn);
4204 sd(Rm, Address(Rn));
4205 addi(i, i, 1);
4206 slli(Rn, i, LogBytesPerWord);
4207 add(Rm, Pm_base, Rn);
4208 ld(Rm, Address(Rm));
4209 add(Rn, Pn_base, Rn);
4210 ld(Rn, Address(Rn));
4211 subi(cnt, cnt, 1);
4212 } bnez(cnt, loop);
4213 subi(tmp0, tmp0, 1);
4214 add(tmp0, tmp0, t0);
4215 } bnez(tmp0, again);
4216 } bind(post);
4217 }
4218
4219 // Move memory at s to d, reversing words.
4220 // Increments d to end of copied memory
4221 // Destroys tmp1, tmp2
4222 // Preserves len
4223 // Leaves s pointing to the address which was in d at start
4224 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4225 assert(tmp1->encoding() < x28->encoding(), "register corruption");
4226 assert(tmp2->encoding() < x28->encoding(), "register corruption");
4227
4228 shadd(s, len, s, tmp1, LogBytesPerWord);
4229 mv(tmp1, len);
4230 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4231 slli(tmp1, len, LogBytesPerWord);
4232 sub(s, d, tmp1);
4233 }
4234 // [63...0] -> [31...0][63...32]
4235 void reverse1(Register d, Register s, Register tmp) {
4236 subi(s, s, wordSize);
4237 ld(tmp, Address(s));
4238 ror(tmp, tmp, 32, t0);
4239 sd(tmp, Address(d));
4240 addi(d, d, wordSize);
4241 }
4242
4243 void step_squaring() {
4244 // An extra ACC
4245 step();
4246 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4247 }
4248
4249 void last_squaring(Register i) {
4250 Label dont;
4251 // if ((i & 1) == 0) {
4252 test_bit(t0, i, 0);
4253 bnez(t0, dont); {
4254 // MACC(Ra, Rb, tmp0, tmp1, tmp2);
4255 // Ra = *++Pa;
4256 // Rb = *--Pb;
4257 mulhu(Rhi_ab, Ra, Rb);
4258 mul(Rlo_ab, Ra, Rb);
4259 acc(Rhi_ab, Rlo_ab, tmp0, tmp1, tmp2);
4260 } bind(dont);
4261 }
4262
4263 void extra_step_squaring() {
4264 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n
4265
4266 // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4267 // Rm = *++Pm;
4268 // Rn = *--Pn;
4269 mulhu(Rhi_mn, Rm, Rn);
4270 mul(Rlo_mn, Rm, Rn);
4271 addi(Pm, Pm, wordSize);
4272 ld(Rm, Address(Pm));
4273 subi(Pn, Pn, wordSize);
4274 ld(Rn, Address(Pn));
4275 }
4276
4277 void post1_squaring() {
4278 acc(Rhi_mn, Rlo_mn, tmp0, tmp1, tmp2); // The pending m*n
4279
4280 // *Pm = Rm = tmp0 * inv;
4281 mul(Rm, tmp0, inv);
4282 sd(Rm, Address(Pm));
4283
4284 // MACC(Rm, Rn, tmp0, tmp1, tmp2);
4285 // tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0;
4286 mulhu(Rhi_mn, Rm, Rn);
4287
4288 #ifndef PRODUCT
4289 // assert(m[i] * n[0] + tmp0 == 0, "broken Montgomery multiply");
4290 {
4291 mul(Rlo_mn, Rm, Rn);
4292 add(Rlo_mn, tmp0, Rlo_mn);
4293 Label ok;
4294 beqz(Rlo_mn, ok); {
4295 stop("broken Montgomery multiply");
4296 } bind(ok);
4297 }
4298 #endif
4299 // We have very carefully set things up so that
4300 // m[i]*n[0] + tmp0 == 0 (mod b), so we don't have to calculate
4301 // the lower half of Rm * Rn because we know the result already:
4302 // it must be -tmp0. tmp0 + (-tmp0) must generate a carry iff
4303 // tmp0 != 0. So, rather than do a mul and a cad we just set
4304 // the carry flag iff tmp0 is nonzero.
4305 //
4306 // mul(Rlo_mn, Rm, Rn);
4307 // cad(zr, tmp, Rlo_mn);
4308 subi(t0, tmp0, 1);
4309 sltu(t0, t0, tmp0); // Set carry iff tmp0 is nonzero
4310 cadc(tmp0, tmp1, Rhi_mn, t0);
4311 adc(tmp1, tmp2, zr, t0);
4312 mv(tmp2, zr);
4313 }
4314
4315 // use t0 as carry
4316 void acc(Register Rhi, Register Rlo,
4317 Register tmp0, Register tmp1, Register tmp2) {
4318 cad(tmp0, tmp0, Rlo, t0);
4319 cadc(tmp1, tmp1, Rhi, t0);
4320 adc(tmp2, tmp2, zr, t0);
4321 }
4322
4323 public:
4324 /**
4325 * Fast Montgomery multiplication. The derivation of the
4326 * algorithm is in A Cryptographic Library for the Motorola
4327 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4328 *
4329 * Arguments:
4330 *
4331 * Inputs for multiplication:
4332 * c_rarg0 - int array elements a
4333 * c_rarg1 - int array elements b
4334 * c_rarg2 - int array elements n (the modulus)
4335 * c_rarg3 - int length
4336 * c_rarg4 - int inv
4337 * c_rarg5 - int array elements m (the result)
4338 *
4339 * Inputs for squaring:
4340 * c_rarg0 - int array elements a
4341 * c_rarg1 - int array elements n (the modulus)
4342 * c_rarg2 - int length
4343 * c_rarg3 - int inv
4344 * c_rarg4 - int array elements m (the result)
4345 *
4346 */
4347 address generate_multiply() {
4348 Label argh, nothing;
4349 bind(argh);
4350 stop("MontgomeryMultiply total_allocation must be <= 8192");
4351
4352 align(CodeEntryAlignment);
4353 address entry = pc();
4354
4355 beqz(Rlen, nothing);
4356
4357 enter();
4358
4359 // Make room.
4360 mv(Ra, 512);
4361 bgt(Rlen, Ra, argh);
4362 slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
4363 sub(Ra, sp, Ra);
4364 andi(sp, Ra, -2 * wordSize);
4365
4366 srliw(Rlen, Rlen, 1); // length in longwords = len/2
4367
4368 {
4369 // Copy input args, reversing as we go. We use Ra as a
4370 // temporary variable.
4371 reverse(Ra, Pa_base, Rlen, Ri, Rj);
4372 if (!_squaring)
4373 reverse(Ra, Pb_base, Rlen, Ri, Rj);
4374 reverse(Ra, Pn_base, Rlen, Ri, Rj);
4375 }
4376
4377 // Push all call-saved registers and also Pm_base which we'll need
4378 // at the end.
4379 save_regs();
4380
4381 #ifndef PRODUCT
4382 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4383 {
4384 ld(Rn, Address(Pn_base));
4385 mul(Rlo_mn, Rn, inv);
4386 mv(t0, -1);
4387 Label ok;
4388 beq(Rlo_mn, t0, ok);
4389 stop("broken inverse in Montgomery multiply");
4390 bind(ok);
4391 }
4392 #endif
4393
4394 mv(Pm_base, Ra);
4395
4396 mv(tmp0, zr);
4397 mv(tmp1, zr);
4398 mv(tmp2, zr);
4399
4400 block_comment("for (int i = 0; i < len; i++) {");
4401 mv(Ri, zr); {
4402 Label loop, end;
4403 bge(Ri, Rlen, end);
4404
4405 bind(loop);
4406 pre1(Ri);
4407
4408 block_comment(" for (j = i; j; j--) {"); {
4409 mv(Rj, Ri);
4410 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4411 } block_comment(" } // j");
4412
4413 post1();
4414 addiw(Ri, Ri, 1);
4415 blt(Ri, Rlen, loop);
4416 bind(end);
4417 block_comment("} // i");
4418 }
4419
4420 block_comment("for (int i = len; i < 2*len; i++) {");
4421 mv(Ri, Rlen); {
4422 Label loop, end;
4423 slli(t0, Rlen, 1);
4424 bge(Ri, t0, end);
4425
4426 bind(loop);
4427 pre2(Ri, Rlen);
4428
4429 block_comment(" for (j = len*2-i-1; j; j--) {"); {
4430 slliw(Rj, Rlen, 1);
4431 subw(Rj, Rj, Ri);
4432 subiw(Rj, Rj, 1);
4433 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4434 } block_comment(" } // j");
4435
4436 post2(Ri, Rlen);
4437 addiw(Ri, Ri, 1);
4438 slli(t0, Rlen, 1);
4439 blt(Ri, t0, loop);
4440 bind(end);
4441 }
4442 block_comment("} // i");
4443
4444 normalize(Rlen);
4445
4446 mv(Ra, Pm_base); // Save Pm_base in Ra
4447 restore_regs(); // Restore caller's Pm_base
4448
4449 // Copy our result into caller's Pm_base
4450 reverse(Pm_base, Ra, Rlen, Ri, Rj);
4451
4452 leave();
4453 bind(nothing);
4454 ret();
4455
4456 return entry;
4457 }
4458
4459 /**
4460 *
4461 * Arguments:
4462 *
4463 * Inputs:
4464 * c_rarg0 - int array elements a
4465 * c_rarg1 - int array elements n (the modulus)
4466 * c_rarg2 - int length
4467 * c_rarg3 - int inv
4468 * c_rarg4 - int array elements m (the result)
4469 *
4470 */
4471 address generate_square() {
4472 Label argh;
4473 bind(argh);
4474 stop("MontgomeryMultiply total_allocation must be <= 8192");
4475
4476 align(CodeEntryAlignment);
4477 address entry = pc();
4478
4479 enter();
4480
4481 // Make room.
4482 mv(Ra, 512);
4483 bgt(Rlen, Ra, argh);
4484 slli(Ra, Rlen, exact_log2(4 * sizeof(jint)));
4485 sub(Ra, sp, Ra);
4486 andi(sp, Ra, -2 * wordSize);
4487
4488 srliw(Rlen, Rlen, 1); // length in longwords = len/2
4489
4490 {
4491 // Copy input args, reversing as we go. We use Ra as a
4492 // temporary variable.
4493 reverse(Ra, Pa_base, Rlen, Ri, Rj);
4494 reverse(Ra, Pn_base, Rlen, Ri, Rj);
4495 }
4496
4497 // Push all call-saved registers and also Pm_base which we'll need
4498 // at the end.
4499 save_regs();
4500
4501 mv(Pm_base, Ra);
4502
4503 mv(tmp0, zr);
4504 mv(tmp1, zr);
4505 mv(tmp2, zr);
4506
4507 block_comment("for (int i = 0; i < len; i++) {");
4508 mv(Ri, zr); {
4509 Label loop, end;
4510 bind(loop);
4511 bge(Ri, Rlen, end);
4512
4513 pre1(Ri);
4514
4515 block_comment("for (j = (i+1)/2; j; j--) {"); {
4516 addi(Rj, Ri, 1);
4517 srliw(Rj, Rj, 1);
4518 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4519 } block_comment(" } // j");
4520
4521 last_squaring(Ri);
4522
4523 block_comment(" for (j = i/2; j; j--) {"); {
4524 srliw(Rj, Ri, 1);
4525 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4526 } block_comment(" } // j");
4527
4528 post1_squaring();
4529 addi(Ri, Ri, 1);
4530 blt(Ri, Rlen, loop);
4531
4532 bind(end);
4533 block_comment("} // i");
4534 }
4535
4536 block_comment("for (int i = len; i < 2*len; i++) {");
4537 mv(Ri, Rlen); {
4538 Label loop, end;
4539 bind(loop);
4540 slli(t0, Rlen, 1);
4541 bge(Ri, t0, end);
4542
4543 pre2(Ri, Rlen);
4544
4545 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
4546 slli(Rj, Rlen, 1);
4547 sub(Rj, Rj, Ri);
4548 subi(Rj, Rj, 1);
4549 srliw(Rj, Rj, 1);
4550 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4551 } block_comment(" } // j");
4552
4553 last_squaring(Ri);
4554
4555 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
4556 slli(Rj, Rlen, 1);
4557 sub(Rj, Rj, Ri);
4558 srliw(Rj, Rj, 1);
4559 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4560 } block_comment(" } // j");
4561
4562 post2(Ri, Rlen);
4563 addi(Ri, Ri, 1);
4564 slli(t0, Rlen, 1);
4565 blt(Ri, t0, loop);
4566
4567 bind(end);
4568 block_comment("} // i");
4569 }
4570
4571 normalize(Rlen);
4572
4573 mv(Ra, Pm_base); // Save Pm_base in Ra
4574 restore_regs(); // Restore caller's Pm_base
4575
4576 // Copy our result into caller's Pm_base
4577 reverse(Pm_base, Ra, Rlen, Ri, Rj);
4578
4579 leave();
4580 ret();
4581
4582 return entry;
4583 }
4584 };
4585
4586 #endif // COMPILER2
4587
4588 address generate_cont_thaw(Continuation::thaw_kind kind) {
4589 bool return_barrier = Continuation::is_thaw_return_barrier(kind);
4590 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
4591
4592 address start = __ pc();
4593
4594 if (return_barrier) {
4595 __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
4596 }
4597
4598 #ifndef PRODUCT
4599 {
4600 Label OK;
4601 __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
4602 __ beq(sp, t0, OK);
4603 __ stop("incorrect sp");
4604 __ bind(OK);
4605 }
4606 #endif
4607
4608 if (return_barrier) {
4609 // preserve possible return value from a method returning to the return barrier
4610 __ subi(sp, sp, 2 * wordSize);
4611 __ fsd(f10, Address(sp, 0 * wordSize));
4612 __ sd(x10, Address(sp, 1 * wordSize));
4613 }
4614
4615 __ mv(c_rarg1, (return_barrier ? 1 : 0));
4616 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), xthread, c_rarg1);
4617 __ mv(t1, x10); // x10 contains the size of the frames to thaw, 0 if overflow or no more frames
4618
4619 if (return_barrier) {
4620 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4621 __ ld(x10, Address(sp, 1 * wordSize));
4622 __ fld(f10, Address(sp, 0 * wordSize));
4623 __ addi(sp, sp, 2 * wordSize);
4624 }
4625
4626 #ifndef PRODUCT
4627 {
4628 Label OK;
4629 __ ld(t0, Address(xthread, JavaThread::cont_entry_offset()));
4630 __ beq(sp, t0, OK);
4631 __ stop("incorrect sp");
4632 __ bind(OK);
4633 }
4634 #endif
4635
4636 Label thaw_success;
4637 // t1 contains the size of the frames to thaw, 0 if overflow or no more frames
4638 __ bnez(t1, thaw_success);
4639 __ j(RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
4640 __ bind(thaw_success);
4641
4642 // make room for the thawed frames
4643 __ sub(t0, sp, t1);
4644 __ andi(sp, t0, -16); // align
4645
4646 if (return_barrier) {
4647 // save original return value -- again
4648 __ subi(sp, sp, 2 * wordSize);
4649 __ fsd(f10, Address(sp, 0 * wordSize));
4650 __ sd(x10, Address(sp, 1 * wordSize));
4651 }
4652
4653 // If we want, we can templatize thaw by kind, and have three different entries
4654 __ mv(c_rarg1, kind);
4655
4656 __ call_VM_leaf(Continuation::thaw_entry(), xthread, c_rarg1);
4657 __ mv(t1, x10); // x10 is the sp of the yielding frame
4658
4659 if (return_barrier) {
4660 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
4661 __ ld(x10, Address(sp, 1 * wordSize));
4662 __ fld(f10, Address(sp, 0 * wordSize));
4663 __ addi(sp, sp, 2 * wordSize);
4664 } else {
4665 __ mv(x10, zr); // return 0 (success) from doYield
4666 }
4667
4668 // we're now on the yield frame (which is in an address above us b/c sp has been pushed down)
4669 __ mv(fp, t1);
4670 __ subi(sp, t1, 2 * wordSize); // now pointing to fp spill
4671
4672 if (return_barrier_exception) {
4673 __ ld(c_rarg1, Address(fp, -1 * wordSize)); // return address
4674 __ verify_oop(x10);
4675 __ mv(x9, x10); // save return value contaning the exception oop in callee-saved x9
4676
4677 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), xthread, c_rarg1);
4678
4679 // see OptoRuntime::generate_exception_blob: x10 -- exception oop, x13 -- exception pc
4680
4681 __ mv(x11, x10); // the exception handler
4682 __ mv(x10, x9); // restore return value contaning the exception oop
4683 __ verify_oop(x10);
4684
4685 __ leave();
4686 __ mv(x13, ra);
4687 __ jr(x11); // the exception handler
4688 } else {
4689 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
4690 __ leave();
4691 __ ret();
4692 }
4693
4694 return start;
4695 }
4696
4697 address generate_cont_thaw() {
4698 if (!Continuations::enabled()) return nullptr;
4699
4700 StubId stub_id = StubId::stubgen_cont_thaw_id;
4701 StubCodeMark mark(this, stub_id);
4702 address start = __ pc();
4703 generate_cont_thaw(Continuation::thaw_top);
4704 return start;
4705 }
4706
4707 address generate_cont_returnBarrier() {
4708 if (!Continuations::enabled()) return nullptr;
4709
4710 // TODO: will probably need multiple return barriers depending on return type
4711 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
4712 StubCodeMark mark(this, stub_id);
4713 address start = __ pc();
4714
4715 generate_cont_thaw(Continuation::thaw_return_barrier);
4716
4717 return start;
4718 }
4719
4720 address generate_cont_returnBarrier_exception() {
4721 if (!Continuations::enabled()) return nullptr;
4722
4723 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
4724 StubCodeMark mark(this, stub_id);
4725 address start = __ pc();
4726
4727 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
4728
4729 return start;
4730 }
4731
4732 address generate_cont_preempt_stub() {
4733 if (!Continuations::enabled()) return nullptr;
4734 StubId stub_id = StubId::stubgen_cont_preempt_id;
4735 StubCodeMark mark(this, stub_id);
4736 address start = __ pc();
4737
4738 __ reset_last_Java_frame(true);
4739
4740 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
4741 __ ld(sp, Address(xthread, JavaThread::cont_entry_offset()));
4742
4743 Label preemption_cancelled;
4744 __ lbu(t0, Address(xthread, JavaThread::preemption_cancelled_offset()));
4745 __ bnez(t0, preemption_cancelled);
4746
4747 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
4748 SharedRuntime::continuation_enter_cleanup(_masm);
4749 __ leave();
4750 __ ret();
4751
4752 // We acquired the monitor after freezing the frames so call thaw to continue execution.
4753 __ bind(preemption_cancelled);
4754 __ sb(zr, Address(xthread, JavaThread::preemption_cancelled_offset()));
4755 __ la(fp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size() + 2 * wordSize)));
4756 __ la(t1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
4757 __ ld(t1, Address(t1));
4758 __ jr(t1);
4759
4760 return start;
4761 }
4762
4763 #ifdef COMPILER2
4764
4765 #undef __
4766 #define __ this->
4767
4768 class Sha2Generator : public MacroAssembler {
4769 StubCodeGenerator* _cgen;
4770 public:
4771 Sha2Generator(MacroAssembler* masm, StubCodeGenerator* cgen) : MacroAssembler(masm->code()), _cgen(cgen) {}
4772 address generate_sha256_implCompress(StubId stub_id) {
4773 return generate_sha2_implCompress(Assembler::e32, stub_id);
4774 }
4775 address generate_sha512_implCompress(StubId stub_id) {
4776 return generate_sha2_implCompress(Assembler::e64, stub_id);
4777 }
4778 private:
4779
4780 void vleXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
4781 if (vset_sew == Assembler::e32) __ vle32_v(vr, sr);
4782 else __ vle64_v(vr, sr);
4783 }
4784
4785 void vseXX_v(Assembler::SEW vset_sew, VectorRegister vr, Register sr) {
4786 if (vset_sew == Assembler::e32) __ vse32_v(vr, sr);
4787 else __ vse64_v(vr, sr);
4788 }
4789
4790 // Overview of the logic in each "quad round".
4791 //
// The code below repeats 16/20 times the logic implementing four rounds
// of the SHA-256/512 core loop as documented by NIST. Together, the 16/20
// "quad rounds" implement the 64/80 single rounds.
4795 //
4796 // // Load four word (u32/64) constants (K[t+3], K[t+2], K[t+1], K[t+0])
4797 // // Output:
4798 // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4799 // vl1reXX.v vTmp1, ofs
4800 //
4801 // // Increment word constant address by stride (16/32 bytes, 4*4B/8B, 128b/256b)
4802 // addi ofs, ofs, 16/32
4803 //
4804 // // Add constants to message schedule words:
4805 // // Input
4806 // // vTmp1 = {K[t+3], K[t+2], K[t+1], K[t+0]}
4807 // // vW0 = {W[t+3], W[t+2], W[t+1], W[t+0]}; // Vt0 = W[3:0];
4808 // // Output
4809 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4810 // vadd.vv vTmp0, vTmp1, vW0
4811 //
4812 // // 2 rounds of working variables updates.
4813 // // vState1[t+4] <- vState1[t], vState0[t], vTmp0[t]
4814 // // Input:
4815 // // vState1 = {c[t],d[t],g[t],h[t]} " = vState1[t] "
4816 // // vState0 = {a[t],b[t],e[t],f[t]}
4817 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4818 // // Output:
4819 // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] "
4820 // // = {h[t+4],g[t+4],d[t+4],c[t+4]} " = vState1[t+4] "
4821 // vsha2cl.vv vState1, vState0, vTmp0
4822 //
4823 // // 2 rounds of working variables updates.
4824 // // vState0[t+4] <- vState0[t], vState0[t+2], vTmp0[t]
4825 // // Input
4826 // // vState0 = {a[t],b[t],e[t],f[t]} " = vState0[t] "
4827 // // = {h[t+2],g[t+2],d[t+2],c[t+2]} " = vState1[t+2] "
4828 // // vState1 = {f[t+2],e[t+2],b[t+2],a[t+2]} " = vState0[t+2] "
4829 // // vTmp0 = {W[t+3]+K[t+3], W[t+2]+K[t+2], W[t+1]+K[t+1], W[t+0]+K[t+0]}
4830 // // Output:
4831 // // vState0 = {f[t+4],e[t+4],b[t+4],a[t+4]} " = vState0[t+4] "
4832 // vsha2ch.vv vState0, vState1, vTmp0
4833 //
4834 // // Combine 2QW into 1QW
4835 // //
4836 // // To generate the next 4 words, "new_vW0"/"vTmp0" from vW0-vW3, vsha2ms needs
4837 // // vW0[0..3], vW1[0], vW2[1..3], vW3[0, 2..3]
4838 // // and it can only take 3 vectors as inputs. Hence we need to combine
4839 // // vW1[0] and vW2[1..3] in a single vector.
4840 // //
4841 // // vmerge Vt4, Vt1, Vt2, V0
4842 // // Input
4843 // // V0 = mask // first word from vW2, 1..3 words from vW1
4844 // // vW2 = {Wt-8, Wt-7, Wt-6, Wt-5}
4845 // // vW1 = {Wt-12, Wt-11, Wt-10, Wt-9}
4846 // // Output
4847 // // Vt4 = {Wt-12, Wt-7, Wt-6, Wt-5}
4848 // vmerge.vvm vTmp0, vW2, vW1, v0
4849 //
4850 // // Generate next Four Message Schedule Words (hence allowing for 4 more rounds)
4851 // // Input
4852 // // vW0 = {W[t+ 3], W[t+ 2], W[t+ 1], W[t+ 0]} W[ 3: 0]
4853 // // vW3 = {W[t+15], W[t+14], W[t+13], W[t+12]} W[15:12]
4854 // // vTmp0 = {W[t+11], W[t+10], W[t+ 9], W[t+ 4]} W[11: 9,4]
4855 // // Output (next four message schedule words)
4856 // // vW0 = {W[t+19], W[t+18], W[t+17], W[t+16]} W[19:16]
4857 // vsha2ms.vv vW0, vTmp0, vW3
4858 //
4859 // BEFORE
4860 // vW0 - vW3 hold the message schedule words (initially the block words)
4861 // vW0 = W[ 3: 0] "oldest"
4862 // vW1 = W[ 7: 4]
4863 // vW2 = W[11: 8]
4864 // vW3 = W[15:12] "newest"
4865 //
4866 // vt6 - vt7 hold the working state variables
4867 // vState0 = {a[t],b[t],e[t],f[t]} // initially {H5,H4,H1,H0}
4868 // vState1 = {c[t],d[t],g[t],h[t]} // initially {H7,H6,H3,H2}
4869 //
4870 // AFTER
4871 // vW0 - vW3 hold the message schedule words (initially the block words)
4872 // vW1 = W[ 7: 4] "oldest"
4873 // vW2 = W[11: 8]
4874 // vW3 = W[15:12]
4875 // vW0 = W[19:16] "newest"
4876 //
4877 // vState0 and vState1 hold the working state variables
4878 // vState0 = {a[t+4],b[t+4],e[t+4],f[t+4]}
4879 // vState1 = {c[t+4],d[t+4],g[t+4],h[t+4]}
4880 //
4881 // The group of vectors vW0,vW1,vW2,vW3 is "rotated" by one in each quad-round,
4882 // hence the uses of those vectors rotate in each round, and we get back to the
4883 // initial configuration every 4 quad-rounds. We could avoid those changes at
4884 // the cost of moving those vectors at the end of each quad-rounds.
4885 void sha2_quad_round(Assembler::SEW vset_sew, VectorRegister rot1, VectorRegister rot2, VectorRegister rot3, VectorRegister rot4,
4886 Register scalarconst, VectorRegister vtemp, VectorRegister vtemp2, VectorRegister v_abef, VectorRegister v_cdgh,
4887 bool gen_words = true, bool step_const = true) {
4888 __ vleXX_v(vset_sew, vtemp, scalarconst);
4889 if (step_const) {
4890 __ addi(scalarconst, scalarconst, vset_sew == Assembler::e32 ? 16 : 32);
4891 }
4892 __ vadd_vv(vtemp2, vtemp, rot1);
4893 __ vsha2cl_vv(v_cdgh, v_abef, vtemp2);
4894 __ vsha2ch_vv(v_abef, v_cdgh, vtemp2);
4895 if (gen_words) {
4896 __ vmerge_vvm(vtemp2, rot3, rot2);
4897 __ vsha2ms_vv(rot1, vtemp2, rot4);
4898 }
4899 }
4900
4901 // Arguments:
4902 //
4903 // Inputs:
4904 // c_rarg0 - byte[] source+offset
4905 // c_rarg1 - int[] SHA.state
4906 // c_rarg2 - int offset
4907 // c_rarg3 - int limit
4908 //
4909 address generate_sha2_implCompress(Assembler::SEW vset_sew, StubId stub_id) {
4910 alignas(64) static const uint32_t round_consts_256[64] = {
4911 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
4912 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
4913 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
4914 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
4915 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
4916 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
4917 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
4918 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
4919 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
4920 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
4921 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
4922 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
4923 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
4924 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
4925 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
4926 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
4927 };
4928 alignas(64) static const uint64_t round_consts_512[80] = {
4929 0x428a2f98d728ae22l, 0x7137449123ef65cdl, 0xb5c0fbcfec4d3b2fl,
4930 0xe9b5dba58189dbbcl, 0x3956c25bf348b538l, 0x59f111f1b605d019l,
4931 0x923f82a4af194f9bl, 0xab1c5ed5da6d8118l, 0xd807aa98a3030242l,
4932 0x12835b0145706fbel, 0x243185be4ee4b28cl, 0x550c7dc3d5ffb4e2l,
4933 0x72be5d74f27b896fl, 0x80deb1fe3b1696b1l, 0x9bdc06a725c71235l,
4934 0xc19bf174cf692694l, 0xe49b69c19ef14ad2l, 0xefbe4786384f25e3l,
4935 0x0fc19dc68b8cd5b5l, 0x240ca1cc77ac9c65l, 0x2de92c6f592b0275l,
4936 0x4a7484aa6ea6e483l, 0x5cb0a9dcbd41fbd4l, 0x76f988da831153b5l,
4937 0x983e5152ee66dfabl, 0xa831c66d2db43210l, 0xb00327c898fb213fl,
4938 0xbf597fc7beef0ee4l, 0xc6e00bf33da88fc2l, 0xd5a79147930aa725l,
4939 0x06ca6351e003826fl, 0x142929670a0e6e70l, 0x27b70a8546d22ffcl,
4940 0x2e1b21385c26c926l, 0x4d2c6dfc5ac42aedl, 0x53380d139d95b3dfl,
4941 0x650a73548baf63del, 0x766a0abb3c77b2a8l, 0x81c2c92e47edaee6l,
4942 0x92722c851482353bl, 0xa2bfe8a14cf10364l, 0xa81a664bbc423001l,
4943 0xc24b8b70d0f89791l, 0xc76c51a30654be30l, 0xd192e819d6ef5218l,
4944 0xd69906245565a910l, 0xf40e35855771202al, 0x106aa07032bbd1b8l,
4945 0x19a4c116b8d2d0c8l, 0x1e376c085141ab53l, 0x2748774cdf8eeb99l,
4946 0x34b0bcb5e19b48a8l, 0x391c0cb3c5c95a63l, 0x4ed8aa4ae3418acbl,
4947 0x5b9cca4f7763e373l, 0x682e6ff3d6b2b8a3l, 0x748f82ee5defb2fcl,
4948 0x78a5636f43172f60l, 0x84c87814a1f0ab72l, 0x8cc702081a6439ecl,
4949 0x90befffa23631e28l, 0xa4506cebde82bde9l, 0xbef9a3f7b2c67915l,
4950 0xc67178f2e372532bl, 0xca273eceea26619cl, 0xd186b8c721c0c207l,
4951 0xeada7dd6cde0eb1el, 0xf57d4f7fee6ed178l, 0x06f067aa72176fbal,
4952 0x0a637dc5a2c898a6l, 0x113f9804bef90dael, 0x1b710b35131c471bl,
4953 0x28db77f523047d84l, 0x32caab7b40c72493l, 0x3c9ebe0a15c9bebcl,
4954 0x431d67c49c100d4cl, 0x4cc5d4becb3e42b6l, 0x597f299cfc657e2al,
4955 0x5fcb6fab3ad6faecl, 0x6c44198c4a475817l
4956 };
4957 const int const_add = vset_sew == Assembler::e32 ? 16 : 32;
4958
4959 bool multi_block;
4960 switch (stub_id) {
4961 case StubId::stubgen_sha256_implCompress_id:
4962 assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
4963 multi_block = false;
4964 break;
4965 case StubId::stubgen_sha256_implCompressMB_id:
4966 assert (vset_sew == Assembler::e32, "wrong macroassembler for stub");
4967 multi_block = true;
4968 break;
4969 case StubId::stubgen_sha512_implCompress_id:
4970 assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
4971 multi_block = false;
4972 break;
4973 case StubId::stubgen_sha512_implCompressMB_id:
4974 assert (vset_sew == Assembler::e64, "wrong macroassembler for stub");
4975 multi_block = true;
4976 break;
4977 default:
4978 ShouldNotReachHere();
4979 };
4980 __ align(CodeEntryAlignment);
4981 StubCodeMark mark(_cgen, stub_id);
4982 address start = __ pc();
4983
4984 Register buf = c_rarg0;
4985 Register state = c_rarg1;
4986 Register ofs = c_rarg2;
4987 Register limit = c_rarg3;
4988 Register consts = t2; // caller saved
4989 Register state_c = x28; // caller saved
4990 VectorRegister vindex = v2;
4991 VectorRegister vW0 = v4;
4992 VectorRegister vW1 = v6;
4993 VectorRegister vW2 = v8;
4994 VectorRegister vW3 = v10;
4995 VectorRegister vState0 = v12;
4996 VectorRegister vState1 = v14;
4997 VectorRegister vHash0 = v16;
4998 VectorRegister vHash1 = v18;
4999 VectorRegister vTmp0 = v20;
5000 VectorRegister vTmp1 = v22;
5001
5002 Label multi_block_loop;
5003
5004 __ enter();
5005
5006 address constant_table = vset_sew == Assembler::e32 ? (address)round_consts_256 : (address)round_consts_512;
5007 la(consts, ExternalAddress(constant_table));
5008
5009 // Register use in this function:
5010 //
5011 // VECTORS
5012 // vW0 - vW3 (512/1024-bits / 4*128/256 bits / 4*4*32/65 bits), hold the message
5013 // schedule words (Wt). They start with the message block
5014 // content (W0 to W15), then further words in the message
5015 // schedule generated via vsha2ms from previous Wt.
5016 // Initially:
5017 // vW0 = W[ 3:0] = { W3, W2, W1, W0}
5018 // vW1 = W[ 7:4] = { W7, W6, W5, W4}
5019 // vW2 = W[ 11:8] = {W11, W10, W9, W8}
5020 // vW3 = W[15:12] = {W15, W14, W13, W12}
5021 //
5022 // vState0 - vState1 hold the working state variables (a, b, ..., h)
5023 // vState0 = {f[t],e[t],b[t],a[t]}
5024 // vState1 = {h[t],g[t],d[t],c[t]}
5025 // Initially:
5026 // vState0 = {H5i-1, H4i-1, H1i-1 , H0i-1}
5027 // vState1 = {H7i-i, H6i-1, H3i-1 , H2i-1}
5028 //
5029 // v0 = masks for vrgather/vmerge. Single value during the 16 rounds.
5030 //
5031 // vTmp0 = temporary, Wt+Kt
5032 // vTmp1 = temporary, Kt
5033 //
5034 // vHash0/vHash1 = hold the initial values of the hash, byte-swapped.
5035 //
5036 // During most of the function the vector state is configured so that each
5037 // vector is interpreted as containing four 32/64 bits (e32/e64) elements (128/256 bits).
5038
5039 // vsha2ch/vsha2cl uses EGW of 4*SEW.
5040 // SHA256 SEW = e32, EGW = 128-bits
5041 // SHA512 SEW = e64, EGW = 256-bits
5042 //
5043 // VLEN is required to be at least 128.
5044 // For the case of VLEN=128 and SHA512 we need LMUL=2 to work with 4*e64 (EGW = 256)
5045 //
5046 // m1: LMUL=1/2
5047 // ta: tail agnostic (don't care about those lanes)
5048 // ma: mask agnostic (don't care about those lanes)
5049 // x0 is not written, we known the number of vector elements.
5050
5051 if (vset_sew == Assembler::e64 && MaxVectorSize == 16) { // SHA512 and VLEN = 128
5052 __ vsetivli(x0, 4, vset_sew, Assembler::m2, Assembler::ma, Assembler::ta);
5053 } else {
5054 __ vsetivli(x0, 4, vset_sew, Assembler::m1, Assembler::ma, Assembler::ta);
5055 }
5056
5057 int64_t indexes = vset_sew == Assembler::e32 ? 0x00041014ul : 0x00082028ul;
5058 __ li(t0, indexes);
5059 __ vmv_v_x(vindex, t0);
5060
5061 // Step-over a,b, so we are pointing to c.
5062 // const_add is equal to 4x state variable, div by 2 is thus 2, a,b
5063 __ addi(state_c, state, const_add/2);
5064
5065 // Use index-load to get {f,e,b,a},{h,g,d,c}
5066 __ vluxei8_v(vState0, state, vindex);
5067 __ vluxei8_v(vState1, state_c, vindex);
5068
5069 __ bind(multi_block_loop);
5070
5071 // Capture the initial H values in vHash0 and vHash1 to allow for computing
5072 // the resulting H', since H' = H+{a',b',c',...,h'}.
5073 __ vmv_v_v(vHash0, vState0);
5074 __ vmv_v_v(vHash1, vState1);
5075
5076 // Load the 512/1024-bits of the message block in vW0-vW3 and perform
5077 // an endian swap on each 4/8 bytes element.
5078 //
5079 // If Zvkb is not implemented one can use vrgather
5080 // with an index sequence to byte-swap.
5081 // sequence = [3 2 1 0 7 6 5 4 11 10 9 8 15 14 13 12]
5082 // <https://oeis.org/A004444> gives us "N ^ 3" as a nice formula to generate
5083 // this sequence. 'vid' gives us the N.
5084 __ vleXX_v(vset_sew, vW0, buf);
5085 __ vrev8_v(vW0, vW0);
5086 __ addi(buf, buf, const_add);
5087 __ vleXX_v(vset_sew, vW1, buf);
5088 __ vrev8_v(vW1, vW1);
5089 __ addi(buf, buf, const_add);
5090 __ vleXX_v(vset_sew, vW2, buf);
5091 __ vrev8_v(vW2, vW2);
5092 __ addi(buf, buf, const_add);
5093 __ vleXX_v(vset_sew, vW3, buf);
5094 __ vrev8_v(vW3, vW3);
5095 __ addi(buf, buf, const_add);
5096
5097 // Set v0 up for the vmerge that replaces the first word (idx==0)
5098 __ vid_v(v0);
5099 __ vmseq_vi(v0, v0, 0x0); // v0.mask[i] = (i == 0 ? 1 : 0)
5100
5101 VectorRegister rotation_regs[] = {vW0, vW1, vW2, vW3};
5102 int rot_pos = 0;
5103 // Quad-round #0 (+0, vW0->vW1->vW2->vW3) ... #11 (+3, vW3->vW0->vW1->vW2)
5104 const int qr_end = vset_sew == Assembler::e32 ? 12 : 16;
5105 for (int i = 0; i < qr_end; i++) {
5106 sha2_quad_round(vset_sew,
5107 rotation_regs[(rot_pos + 0) & 0x3],
5108 rotation_regs[(rot_pos + 1) & 0x3],
5109 rotation_regs[(rot_pos + 2) & 0x3],
5110 rotation_regs[(rot_pos + 3) & 0x3],
5111 consts,
5112 vTmp1, vTmp0, vState0, vState1);
5113 ++rot_pos;
5114 }
5115 // Quad-round #12 (+0, vW0->vW1->vW2->vW3) ... #15 (+3, vW3->vW0->vW1->vW2)
5116 // Note that we stop generating new message schedule words (Wt, vW0-13)
5117 // as we already generated all the words we end up consuming (i.e., W[63:60]).
5118 const int qr_c_end = qr_end + 4;
5119 for (int i = qr_end; i < qr_c_end; i++) {
5120 sha2_quad_round(vset_sew,
5121 rotation_regs[(rot_pos + 0) & 0x3],
5122 rotation_regs[(rot_pos + 1) & 0x3],
5123 rotation_regs[(rot_pos + 2) & 0x3],
5124 rotation_regs[(rot_pos + 3) & 0x3],
5125 consts,
5126 vTmp1, vTmp0, vState0, vState1, false, i < (qr_c_end-1));
5127 ++rot_pos;
5128 }
5129
5130 //--------------------------------------------------------------------------------
5131 // Compute the updated hash value H'
5132 // H' = H + {h',g',...,b',a'}
5133 // = {h,g,...,b,a} + {h',g',...,b',a'}
5134 // = {h+h',g+g',...,b+b',a+a'}
5135
5136 // H' = H+{a',b',c',...,h'}
5137 __ vadd_vv(vState0, vHash0, vState0);
5138 __ vadd_vv(vState1, vHash1, vState1);
5139
5140 if (multi_block) {
5141 int total_adds = vset_sew == Assembler::e32 ? 240 : 608;
5142 __ subi(consts, consts, total_adds);
5143 __ addi(ofs, ofs, vset_sew == Assembler::e32 ? 64 : 128);
5144 __ ble(ofs, limit, multi_block_loop);
5145 __ mv(c_rarg0, ofs); // return ofs
5146 }
5147
5148 // Store H[0..8] = {a,b,c,d,e,f,g,h} from
5149 // vState0 = {f,e,b,a}
5150 // vState1 = {h,g,d,c}
5151 __ vsuxei8_v(vState0, state, vindex);
5152 __ vsuxei8_v(vState1, state_c, vindex);
5153
5154 __ leave();
5155 __ ret();
5156
5157 return start;
5158 }
5159 };
5160
5161 #undef __
5162 #define __ _masm->
5163
5164 // Set of L registers that correspond to a contiguous memory area.
5165 // Each 64-bit register typically corresponds to 2 32-bit integers.
5166 template <uint L>
5167 class RegCache {
5168 private:
5169 MacroAssembler *_masm;
5170 Register _regs[L];
5171
5172 public:
5173 RegCache(MacroAssembler *masm, RegSet rs): _masm(masm) {
5174 assert(rs.size() == L, "%u registers are used to cache %u 4-byte data", rs.size(), 2 * L);
5175 auto it = rs.begin();
5176 for (auto &r: _regs) {
5177 r = *it;
5178 ++it;
5179 }
5180 }
5181
5182 // generate load for the i'th register
5183 void gen_load(uint i, Register base) {
5184 assert(i < L, "invalid i: %u", i);
5185 __ ld(_regs[i], Address(base, 8 * i));
5186 }
5187
5188 // add i'th 32-bit integer to dest
5189 void add_u32(const Register dest, uint i, const Register rtmp = t0) {
5190 assert(i < 2 * L, "invalid i: %u", i);
5191
5192 if (is_even(i)) {
5193 // Use the bottom 32 bits. No need to mask off the top 32 bits
5194 // as addw will do the right thing.
5195 __ addw(dest, dest, _regs[i / 2]);
5196 } else {
5197 // Use the top 32 bits by right-shifting them.
5198 __ srli(rtmp, _regs[i / 2], 32);
5199 __ addw(dest, dest, rtmp);
5200 }
5201 }
5202 };
5203
5204 typedef RegCache<8> BufRegCache;
5205
5206 // a += value + x + ac;
5207 // a = Integer.rotateLeft(a, s) + b;
5208 void m5_FF_GG_HH_II_epilogue(BufRegCache& reg_cache,
5209 Register a, Register b, Register c, Register d,
5210 int k, int s, int t,
5211 Register value) {
5212 // a += ac
5213 __ addw(a, a, t, t1);
5214
5215 // a += x;
5216 reg_cache.add_u32(a, k);
5217 // a += value;
5218 __ addw(a, a, value);
5219
5220 // a = Integer.rotateLeft(a, s) + b;
5221 __ rolw(a, a, s);
5222 __ addw(a, a, b);
5223 }
5224
5225 // a += ((b & c) | ((~b) & d)) + x + ac;
5226 // a = Integer.rotateLeft(a, s) + b;
5227 void md5_FF(BufRegCache& reg_cache,
5228 Register a, Register b, Register c, Register d,
5229 int k, int s, int t,
5230 Register rtmp1, Register rtmp2) {
5231 // rtmp1 = b & c
5232 __ andr(rtmp1, b, c);
5233
5234 // rtmp2 = (~b) & d
5235 __ andn(rtmp2, d, b);
5236
5237 // rtmp1 = (b & c) | ((~b) & d)
5238 __ orr(rtmp1, rtmp1, rtmp2);
5239
5240 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
5241 }
5242
5243 // a += ((b & d) | (c & (~d))) + x + ac;
5244 // a = Integer.rotateLeft(a, s) + b;
5245 void md5_GG(BufRegCache& reg_cache,
5246 Register a, Register b, Register c, Register d,
5247 int k, int s, int t,
5248 Register rtmp1, Register rtmp2) {
5249 // rtmp1 = b & d
5250 __ andr(rtmp1, b, d);
5251
5252 // rtmp2 = c & (~d)
5253 __ andn(rtmp2, c, d);
5254
5255 // rtmp1 = (b & d) | (c & (~d))
5256 __ orr(rtmp1, rtmp1, rtmp2);
5257
5258 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
5259 }
5260
5261 // a += ((b ^ c) ^ d) + x + ac;
5262 // a = Integer.rotateLeft(a, s) + b;
5263 void md5_HH(BufRegCache& reg_cache,
5264 Register a, Register b, Register c, Register d,
5265 int k, int s, int t,
5266 Register rtmp1, Register rtmp2) {
5267 // rtmp1 = (b ^ c) ^ d
5268 __ xorr(rtmp2, b, c);
5269 __ xorr(rtmp1, rtmp2, d);
5270
5271 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
5272 }
5273
5274 // a += (c ^ (b | (~d))) + x + ac;
5275 // a = Integer.rotateLeft(a, s) + b;
5276 void md5_II(BufRegCache& reg_cache,
5277 Register a, Register b, Register c, Register d,
5278 int k, int s, int t,
5279 Register rtmp1, Register rtmp2) {
5280 // rtmp1 = c ^ (b | (~d))
5281 __ orn(rtmp2, b, d);
5282 __ xorr(rtmp1, c, rtmp2);
5283
5284 m5_FF_GG_HH_II_epilogue(reg_cache, a, b, c, d, k, s, t, rtmp1);
5285 }
5286
5287 // Arguments:
5288 //
5289 // Inputs:
5290 // c_rarg0 - byte[] source+offset
5291 // c_rarg1 - int[] SHA.state
5292 // c_rarg2 - int offset (multi_block == True)
5293 // c_rarg3 - int limit (multi_block == True)
5294 //
5295 // Registers:
5296 // x0 zero (zero)
5297 // x1 ra (return address)
5298 // x2 sp (stack pointer)
5299 // x3 gp (global pointer)
5300 // x4 tp (thread pointer)
5301 // x5 t0 (tmp register)
5302 // x6 t1 (tmp register)
5303 // x7 t2 state0
//   x8  fp/s0 (frame pointer)
5305 // x9 s1
5306 // x10 a0 rtmp1 / c_rarg0
5307 // x11 a1 rtmp2 / c_rarg1
5308 // x12 a2 a / c_rarg2
5309 // x13 a3 b / c_rarg3
5310 // x14 a4 c
5311 // x15 a5 d
5312 // x16 a6 buf
5313 // x17 a7 state
5314 // x18 s2 ofs [saved-reg] (multi_block == True)
5315 // x19 s3 limit [saved-reg] (multi_block == True)
5316 // x20 s4 state1 [saved-reg]
5317 // x21 s5 state2 [saved-reg]
5318 // x22 s6 state3 [saved-reg]
5319 // x23 s7
5320 // x24 s8 buf0 [saved-reg]
5321 // x25 s9 buf1 [saved-reg]
5322 // x26 s10 buf2 [saved-reg]
5323 // x27 s11 buf3 [saved-reg]
5324 // x28 t3 buf4
5325 // x29 t4 buf5
5326 // x30 t5 buf6
5327 // x31 t6 buf7
5328 address generate_md5_implCompress(StubId stub_id) {
5329 __ align(CodeEntryAlignment);
5330 bool multi_block;
5331 switch (stub_id) {
5332 case StubId::stubgen_md5_implCompress_id:
5333 multi_block = false;
5334 break;
5335 case StubId::stubgen_md5_implCompressMB_id:
5336 multi_block = true;
5337 break;
5338 default:
5339 ShouldNotReachHere();
5340 };
5341 StubCodeMark mark(this, stub_id);
5342 address start = __ pc();
5343
5344 // rotation constants
5345 const int S11 = 7;
5346 const int S12 = 12;
5347 const int S13 = 17;
5348 const int S14 = 22;
5349 const int S21 = 5;
5350 const int S22 = 9;
5351 const int S23 = 14;
5352 const int S24 = 20;
5353 const int S31 = 4;
5354 const int S32 = 11;
5355 const int S33 = 16;
5356 const int S34 = 23;
5357 const int S41 = 6;
5358 const int S42 = 10;
5359 const int S43 = 15;
5360 const int S44 = 21;
5361
5362 const int64_t mask32 = 0xffffffff;
5363
5364 Register buf_arg = c_rarg0; // a0
5365 Register state_arg = c_rarg1; // a1
5366 Register ofs_arg = c_rarg2; // a2
5367 Register limit_arg = c_rarg3; // a3
5368
5369 // we'll copy the args to these registers to free up a0-a3
5370 // to use for other values manipulated by instructions
5371 // that can be compressed
5372 Register buf = x16; // a6
5373 Register state = x17; // a7
5374 Register ofs = x18; // s2
5375 Register limit = x19; // s3
5376
5377 // using x12->15 to allow compressed instructions
5378 Register a = x12; // a2
5379 Register b = x13; // a3
5380 Register c = x14; // a4
5381 Register d = x15; // a5
5382
5383 Register state0 = x7; // t2
5384 Register state1 = x20; // s4
5385 Register state2 = x21; // s5
5386 Register state3 = x22; // s6
5387
5388 // using x10->x11 to allow compressed instructions
5389 Register rtmp1 = x10; // a0
5390 Register rtmp2 = x11; // a1
5391
5392 RegSet reg_cache_saved_regs = RegSet::of(x24, x25, x26, x27); // s8, s9, s10, s11
5393 RegSet reg_cache_regs;
5394 reg_cache_regs += reg_cache_saved_regs;
5395 reg_cache_regs += RegSet::of(t3, t4, t5, t6);
5396 BufRegCache reg_cache(_masm, reg_cache_regs);
5397
5398 RegSet saved_regs;
5399 if (multi_block) {
5400 saved_regs += RegSet::of(ofs, limit);
5401 }
5402 saved_regs += RegSet::of(state1, state2, state3);
5403 saved_regs += reg_cache_saved_regs;
5404
5405 __ push_reg(saved_regs, sp);
5406
5407 __ mv(buf, buf_arg);
5408 __ mv(state, state_arg);
5409 if (multi_block) {
5410 __ mv(ofs, ofs_arg);
5411 __ mv(limit, limit_arg);
5412 }
5413
5414 // to minimize the number of memory operations:
5415 // read the 4 state 4-byte values in pairs, with a single ld,
5416 // and split them into 2 registers.
5417 //
5418 // And, as the core algorithm of md5 works on 32-bits words, so
5419 // in the following code, it does not care about the content of
5420 // higher 32-bits in state[x]. Based on this observation,
5421 // we can apply further optimization, which is to just ignore the
5422 // higher 32-bits in state0/state2, rather than set the higher
5423 // 32-bits of state0/state2 to zero explicitly with extra instructions.
5424 __ ld(state0, Address(state));
5425 __ srli(state1, state0, 32);
5426 __ ld(state2, Address(state, 8));
5427 __ srli(state3, state2, 32);
5428
5429 Label md5_loop;
5430 __ BIND(md5_loop);
5431
5432 __ mv(a, state0);
5433 __ mv(b, state1);
5434 __ mv(c, state2);
5435 __ mv(d, state3);
5436
5437 // Round 1
5438 reg_cache.gen_load(0, buf);
5439 md5_FF(reg_cache, a, b, c, d, 0, S11, 0xd76aa478, rtmp1, rtmp2);
5440 md5_FF(reg_cache, d, a, b, c, 1, S12, 0xe8c7b756, rtmp1, rtmp2);
5441 reg_cache.gen_load(1, buf);
5442 md5_FF(reg_cache, c, d, a, b, 2, S13, 0x242070db, rtmp1, rtmp2);
5443 md5_FF(reg_cache, b, c, d, a, 3, S14, 0xc1bdceee, rtmp1, rtmp2);
5444 reg_cache.gen_load(2, buf);
5445 md5_FF(reg_cache, a, b, c, d, 4, S11, 0xf57c0faf, rtmp1, rtmp2);
5446 md5_FF(reg_cache, d, a, b, c, 5, S12, 0x4787c62a, rtmp1, rtmp2);
5447 reg_cache.gen_load(3, buf);
5448 md5_FF(reg_cache, c, d, a, b, 6, S13, 0xa8304613, rtmp1, rtmp2);
5449 md5_FF(reg_cache, b, c, d, a, 7, S14, 0xfd469501, rtmp1, rtmp2);
5450 reg_cache.gen_load(4, buf);
5451 md5_FF(reg_cache, a, b, c, d, 8, S11, 0x698098d8, rtmp1, rtmp2);
5452 md5_FF(reg_cache, d, a, b, c, 9, S12, 0x8b44f7af, rtmp1, rtmp2);
5453 reg_cache.gen_load(5, buf);
5454 md5_FF(reg_cache, c, d, a, b, 10, S13, 0xffff5bb1, rtmp1, rtmp2);
5455 md5_FF(reg_cache, b, c, d, a, 11, S14, 0x895cd7be, rtmp1, rtmp2);
5456 reg_cache.gen_load(6, buf);
5457 md5_FF(reg_cache, a, b, c, d, 12, S11, 0x6b901122, rtmp1, rtmp2);
5458 md5_FF(reg_cache, d, a, b, c, 13, S12, 0xfd987193, rtmp1, rtmp2);
5459 reg_cache.gen_load(7, buf);
5460 md5_FF(reg_cache, c, d, a, b, 14, S13, 0xa679438e, rtmp1, rtmp2);
5461 md5_FF(reg_cache, b, c, d, a, 15, S14, 0x49b40821, rtmp1, rtmp2);
5462
5463 // Round 2
5464 md5_GG(reg_cache, a, b, c, d, 1, S21, 0xf61e2562, rtmp1, rtmp2);
5465 md5_GG(reg_cache, d, a, b, c, 6, S22, 0xc040b340, rtmp1, rtmp2);
5466 md5_GG(reg_cache, c, d, a, b, 11, S23, 0x265e5a51, rtmp1, rtmp2);
5467 md5_GG(reg_cache, b, c, d, a, 0, S24, 0xe9b6c7aa, rtmp1, rtmp2);
5468 md5_GG(reg_cache, a, b, c, d, 5, S21, 0xd62f105d, rtmp1, rtmp2);
5469 md5_GG(reg_cache, d, a, b, c, 10, S22, 0x02441453, rtmp1, rtmp2);
5470 md5_GG(reg_cache, c, d, a, b, 15, S23, 0xd8a1e681, rtmp1, rtmp2);
5471 md5_GG(reg_cache, b, c, d, a, 4, S24, 0xe7d3fbc8, rtmp1, rtmp2);
5472 md5_GG(reg_cache, a, b, c, d, 9, S21, 0x21e1cde6, rtmp1, rtmp2);
5473 md5_GG(reg_cache, d, a, b, c, 14, S22, 0xc33707d6, rtmp1, rtmp2);
5474 md5_GG(reg_cache, c, d, a, b, 3, S23, 0xf4d50d87, rtmp1, rtmp2);
5475 md5_GG(reg_cache, b, c, d, a, 8, S24, 0x455a14ed, rtmp1, rtmp2);
5476 md5_GG(reg_cache, a, b, c, d, 13, S21, 0xa9e3e905, rtmp1, rtmp2);
5477 md5_GG(reg_cache, d, a, b, c, 2, S22, 0xfcefa3f8, rtmp1, rtmp2);
5478 md5_GG(reg_cache, c, d, a, b, 7, S23, 0x676f02d9, rtmp1, rtmp2);
5479 md5_GG(reg_cache, b, c, d, a, 12, S24, 0x8d2a4c8a, rtmp1, rtmp2);
5480
5481 // Round 3
5482 md5_HH(reg_cache, a, b, c, d, 5, S31, 0xfffa3942, rtmp1, rtmp2);
5483 md5_HH(reg_cache, d, a, b, c, 8, S32, 0x8771f681, rtmp1, rtmp2);
5484 md5_HH(reg_cache, c, d, a, b, 11, S33, 0x6d9d6122, rtmp1, rtmp2);
5485 md5_HH(reg_cache, b, c, d, a, 14, S34, 0xfde5380c, rtmp1, rtmp2);
5486 md5_HH(reg_cache, a, b, c, d, 1, S31, 0xa4beea44, rtmp1, rtmp2);
5487 md5_HH(reg_cache, d, a, b, c, 4, S32, 0x4bdecfa9, rtmp1, rtmp2);
5488 md5_HH(reg_cache, c, d, a, b, 7, S33, 0xf6bb4b60, rtmp1, rtmp2);
5489 md5_HH(reg_cache, b, c, d, a, 10, S34, 0xbebfbc70, rtmp1, rtmp2);
5490 md5_HH(reg_cache, a, b, c, d, 13, S31, 0x289b7ec6, rtmp1, rtmp2);
5491 md5_HH(reg_cache, d, a, b, c, 0, S32, 0xeaa127fa, rtmp1, rtmp2);
5492 md5_HH(reg_cache, c, d, a, b, 3, S33, 0xd4ef3085, rtmp1, rtmp2);
5493 md5_HH(reg_cache, b, c, d, a, 6, S34, 0x04881d05, rtmp1, rtmp2);
5494 md5_HH(reg_cache, a, b, c, d, 9, S31, 0xd9d4d039, rtmp1, rtmp2);
5495 md5_HH(reg_cache, d, a, b, c, 12, S32, 0xe6db99e5, rtmp1, rtmp2);
5496 md5_HH(reg_cache, c, d, a, b, 15, S33, 0x1fa27cf8, rtmp1, rtmp2);
5497 md5_HH(reg_cache, b, c, d, a, 2, S34, 0xc4ac5665, rtmp1, rtmp2);
5498
5499 // Round 4
5500 md5_II(reg_cache, a, b, c, d, 0, S41, 0xf4292244, rtmp1, rtmp2);
5501 md5_II(reg_cache, d, a, b, c, 7, S42, 0x432aff97, rtmp1, rtmp2);
5502 md5_II(reg_cache, c, d, a, b, 14, S43, 0xab9423a7, rtmp1, rtmp2);
5503 md5_II(reg_cache, b, c, d, a, 5, S44, 0xfc93a039, rtmp1, rtmp2);
5504 md5_II(reg_cache, a, b, c, d, 12, S41, 0x655b59c3, rtmp1, rtmp2);
5505 md5_II(reg_cache, d, a, b, c, 3, S42, 0x8f0ccc92, rtmp1, rtmp2);
5506 md5_II(reg_cache, c, d, a, b, 10, S43, 0xffeff47d, rtmp1, rtmp2);
5507 md5_II(reg_cache, b, c, d, a, 1, S44, 0x85845dd1, rtmp1, rtmp2);
5508 md5_II(reg_cache, a, b, c, d, 8, S41, 0x6fa87e4f, rtmp1, rtmp2);
5509 md5_II(reg_cache, d, a, b, c, 15, S42, 0xfe2ce6e0, rtmp1, rtmp2);
5510 md5_II(reg_cache, c, d, a, b, 6, S43, 0xa3014314, rtmp1, rtmp2);
5511 md5_II(reg_cache, b, c, d, a, 13, S44, 0x4e0811a1, rtmp1, rtmp2);
5512 md5_II(reg_cache, a, b, c, d, 4, S41, 0xf7537e82, rtmp1, rtmp2);
5513 md5_II(reg_cache, d, a, b, c, 11, S42, 0xbd3af235, rtmp1, rtmp2);
5514 md5_II(reg_cache, c, d, a, b, 2, S43, 0x2ad7d2bb, rtmp1, rtmp2);
5515 md5_II(reg_cache, b, c, d, a, 9, S44, 0xeb86d391, rtmp1, rtmp2);
5516
5517 __ addw(state0, state0, a);
5518 __ addw(state1, state1, b);
5519 __ addw(state2, state2, c);
5520 __ addw(state3, state3, d);
5521
5522 if (multi_block) {
5523 __ addi(buf, buf, 64);
5524 __ addi(ofs, ofs, 64);
5525 // if (ofs <= limit) goto m5_loop
5526 __ bge(limit, ofs, md5_loop);
5527 __ mv(c_rarg0, ofs); // return ofs
5528 }
5529
5530 // to minimize the number of memory operations:
5531 // write back the 4 state 4-byte values in pairs, with a single sd
5532 __ mv(t0, mask32);
5533 __ andr(state0, state0, t0);
5534 __ slli(state1, state1, 32);
5535 __ orr(state0, state0, state1);
5536 __ sd(state0, Address(state));
5537 __ andr(state2, state2, t0);
5538 __ slli(state3, state3, 32);
5539 __ orr(state2, state2, state3);
5540 __ sd(state2, Address(state, 8));
5541
5542 __ pop_reg(saved_regs, sp);
5543 __ ret();
5544
5545 return (address) start;
5546 }
5547
5548 /**
5549 * Perform the quarter round calculations on values contained within four vector registers.
5550 *
5551 * @param aVec the SIMD register containing only the "a" values
5552 * @param bVec the SIMD register containing only the "b" values
5553 * @param cVec the SIMD register containing only the "c" values
5554 * @param dVec the SIMD register containing only the "d" values
5555 * @param tmp_vr temporary vector register holds intermedia values.
5556 */
5557 void chacha20_quarter_round(VectorRegister aVec, VectorRegister bVec,
5558 VectorRegister cVec, VectorRegister dVec, VectorRegister tmp_vr) {
5559 // a += b, d ^= a, d <<<= 16
5560 __ vadd_vv(aVec, aVec, bVec);
5561 __ vxor_vv(dVec, dVec, aVec);
5562 __ vrole32_vi(dVec, 16, tmp_vr);
5563
5564 // c += d, b ^= c, b <<<= 12
5565 __ vadd_vv(cVec, cVec, dVec);
5566 __ vxor_vv(bVec, bVec, cVec);
5567 __ vrole32_vi(bVec, 12, tmp_vr);
5568
5569 // a += b, d ^= a, d <<<= 8
5570 __ vadd_vv(aVec, aVec, bVec);
5571 __ vxor_vv(dVec, dVec, aVec);
5572 __ vrole32_vi(dVec, 8, tmp_vr);
5573
5574 // c += d, b ^= c, b <<<= 7
5575 __ vadd_vv(cVec, cVec, dVec);
5576 __ vxor_vv(bVec, bVec, cVec);
5577 __ vrole32_vi(bVec, 7, tmp_vr);
5578 }
5579
5580 /**
5581 * int com.sun.crypto.provider.ChaCha20Cipher.implChaCha20Block(int[] initState, byte[] result)
5582 *
5583 * Input arguments:
5584 * c_rarg0 - state, the starting state
5585 * c_rarg1 - key_stream, the array that will hold the result of the ChaCha20 block function
5586 *
5587 * Implementation Note:
5588 * Parallelization is achieved by loading individual state elements into vectors for N blocks.
5589 * N depends on single vector register length.
5590 */
5591 address generate_chacha20Block() {
5592 Label L_Rounds;
5593
5594 __ align(CodeEntryAlignment);
5595 StubId stub_id = StubId::stubgen_chacha20Block_id;
5596 StubCodeMark mark(this, stub_id);
5597 address start = __ pc();
5598 __ enter();
5599
5600 const int states_len = 16;
5601 const int step = 4;
5602 const Register state = c_rarg0;
5603 const Register key_stream = c_rarg1;
5604 const Register tmp_addr = t0;
5605 const Register length = t1;
5606
5607 // Organize vector registers in an array that facilitates
5608 // putting repetitive opcodes into loop structures below.
5609 const VectorRegister work_vrs[16] = {
5610 v0, v1, v2, v3, v4, v5, v6, v7,
5611 v8, v9, v10, v11, v12, v13, v14, v15
5612 };
5613 const VectorRegister tmp_vr = v16;
5614 const VectorRegister counter_vr = v17;
5615
5616 {
5617 // Put 16 here, as com.sun.crypto.providerChaCha20Cipher.KS_MAX_LEN is 1024
5618 // in java level.
5619 __ vsetivli(length, 16, Assembler::e32, Assembler::m1);
5620 }
5621
5622 // Load from source state.
5623 // Every element in source state is duplicated to all elements in the corresponding vector.
5624 __ mv(tmp_addr, state);
5625 for (int i = 0; i < states_len; i += 1) {
5626 __ vlse32_v(work_vrs[i], tmp_addr, zr);
5627 __ addi(tmp_addr, tmp_addr, step);
5628 }
5629 // Adjust counter for every individual block.
5630 __ vid_v(counter_vr);
5631 __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
5632
5633 // Perform 10 iterations of the 8 quarter round set
5634 {
5635 const Register loop = t2; // share t2 with other non-overlapping usages.
5636 __ mv(loop, 10);
5637 __ BIND(L_Rounds);
5638
5639 chacha20_quarter_round(work_vrs[0], work_vrs[4], work_vrs[8], work_vrs[12], tmp_vr);
5640 chacha20_quarter_round(work_vrs[1], work_vrs[5], work_vrs[9], work_vrs[13], tmp_vr);
5641 chacha20_quarter_round(work_vrs[2], work_vrs[6], work_vrs[10], work_vrs[14], tmp_vr);
5642 chacha20_quarter_round(work_vrs[3], work_vrs[7], work_vrs[11], work_vrs[15], tmp_vr);
5643
5644 chacha20_quarter_round(work_vrs[0], work_vrs[5], work_vrs[10], work_vrs[15], tmp_vr);
5645 chacha20_quarter_round(work_vrs[1], work_vrs[6], work_vrs[11], work_vrs[12], tmp_vr);
5646 chacha20_quarter_round(work_vrs[2], work_vrs[7], work_vrs[8], work_vrs[13], tmp_vr);
5647 chacha20_quarter_round(work_vrs[3], work_vrs[4], work_vrs[9], work_vrs[14], tmp_vr);
5648
5649 __ subi(loop, loop, 1);
5650 __ bnez(loop, L_Rounds);
5651 }
5652
5653 // Add the original state into the end working state.
5654 // We do this by first duplicating every element in source state array to the corresponding
5655 // vector, then adding it to the post-loop working state.
5656 __ mv(tmp_addr, state);
5657 for (int i = 0; i < states_len; i += 1) {
5658 __ vlse32_v(tmp_vr, tmp_addr, zr);
5659 __ addi(tmp_addr, tmp_addr, step);
5660 __ vadd_vv(work_vrs[i], work_vrs[i], tmp_vr);
5661 }
5662 // Add the counter overlay onto work_vrs[12] at the end.
5663 __ vadd_vv(work_vrs[12], work_vrs[12], counter_vr);
5664
5665 // Store result to key stream.
5666 {
5667 const Register stride = t2; // share t2 with other non-overlapping usages.
5668 // Every block occupies 64 bytes, so we use 64 as stride of the vector store.
5669 __ mv(stride, 64);
5670 for (int i = 0; i < states_len; i += 1) {
5671 __ vsse32_v(work_vrs[i], key_stream, stride);
5672 __ addi(key_stream, key_stream, step);
5673 }
5674 }
5675
5676 // Return length of output key_stream
5677 __ slli(c_rarg0, length, 6);
5678
5679 __ leave();
5680 __ ret();
5681
5682 return (address) start;
5683 }
5684
5685
5686 // ------------------------ SHA-1 intrinsic ------------------------
5687
5688 // K't =
5689 // 5a827999, 0 <= t <= 19
5690 // 6ed9eba1, 20 <= t <= 39
5691 // 8f1bbcdc, 40 <= t <= 59
5692 // ca62c1d6, 60 <= t <= 79
5693 void sha1_prepare_k(Register cur_k, int round) {
5694 assert(round >= 0 && round < 80, "must be");
5695
5696 static const int64_t ks[] = {0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6};
5697 if ((round % 20) == 0) {
5698 __ mv(cur_k, ks[round/20]);
5699 }
5700 }
5701
5702 // W't =
5703 // M't, 0 <= t <= 15
5704 // ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
5705 void sha1_prepare_w(Register cur_w, Register ws[], Register buf, int round) {
5706 assert(round >= 0 && round < 80, "must be");
5707
5708 if (round < 16) {
5709 // in the first 16 rounds, in ws[], every register contains 2 W't, e.g.
5710 // in ws[0], high part contains W't-0, low part contains W't-1,
5711 // in ws[1], high part contains W't-2, low part contains W't-3,
5712 // ...
5713 // in ws[7], high part contains W't-14, low part contains W't-15.
5714
5715 if ((round % 2) == 0) {
5716 __ ld(ws[round/2], Address(buf, (round/2) * 8));
5717 // reverse bytes, as SHA-1 is defined in big-endian.
5718 __ revb(ws[round/2], ws[round/2]);
5719 __ srli(cur_w, ws[round/2], 32);
5720 } else {
5721 __ mv(cur_w, ws[round/2]);
5722 }
5723
5724 return;
5725 }
5726
5727 if ((round % 2) == 0) {
5728 int idx = 16;
5729 // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
5730 __ srli(t1, ws[(idx-8)/2], 32);
5731 __ xorr(t0, ws[(idx-3)/2], t1);
5732
5733 __ srli(t1, ws[(idx-14)/2], 32);
5734 __ srli(cur_w, ws[(idx-16)/2], 32);
5735 __ xorr(cur_w, cur_w, t1);
5736
5737 __ xorr(cur_w, cur_w, t0);
5738 __ rolw(cur_w, cur_w, 1, t0);
5739
5740 // copy the cur_w value to ws[8].
5741 // now, valid w't values are at:
5742 // w0: ws[0]'s lower 32 bits
5743 // w1 ~ w14: ws[1] ~ ws[7]
5744 // w15: ws[8]'s higher 32 bits
5745 __ slli(ws[idx/2], cur_w, 32);
5746
5747 return;
5748 }
5749
5750 int idx = 17;
5751 // W't = ROTL'1(W't-3 ^ W't-8 ^ W't-14 ^ W't-16), 16 <= t <= 79
5752 __ srli(t1, ws[(idx-3)/2], 32);
5753 __ xorr(t0, t1, ws[(idx-8)/2]);
5754
5755 __ xorr(cur_w, ws[(idx-16)/2], ws[(idx-14)/2]);
5756
5757 __ xorr(cur_w, cur_w, t0);
5758 __ rolw(cur_w, cur_w, 1, t0);
5759
5760 // copy the cur_w value to ws[8]
5761 __ zext(cur_w, cur_w, 32);
5762 __ orr(ws[idx/2], ws[idx/2], cur_w);
5763
5764 // shift the w't registers, so they start from ws[0] again.
5765 // now, valid w't values are at:
5766 // w0 ~ w15: ws[0] ~ ws[7]
5767 Register ws_0 = ws[0];
5768 for (int i = 0; i < 16/2; i++) {
5769 ws[i] = ws[i+1];
5770 }
5771 ws[8] = ws_0;
5772 }
5773
5774 // f't(x, y, z) =
5775 // Ch(x, y, z) = (x & y) ^ (~x & z) , 0 <= t <= 19
5776 // Parity(x, y, z) = x ^ y ^ z , 20 <= t <= 39
5777 // Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) , 40 <= t <= 59
5778 // Parity(x, y, z) = x ^ y ^ z , 60 <= t <= 79
5779 void sha1_f(Register dst, Register x, Register y, Register z, int round) {
5780 assert(round >= 0 && round < 80, "must be");
5781 assert_different_registers(dst, x, y, z, t0, t1);
5782
5783 if (round < 20) {
5784 // (x & y) ^ (~x & z)
5785 __ andr(t0, x, y);
5786 __ andn(dst, z, x);
5787 __ xorr(dst, dst, t0);
5788 } else if (round >= 40 && round < 60) {
5789 // (x & y) ^ (x & z) ^ (y & z)
5790 __ andr(t0, x, y);
5791 __ andr(t1, x, z);
5792 __ andr(dst, y, z);
5793 __ xorr(dst, dst, t0);
5794 __ xorr(dst, dst, t1);
5795 } else {
5796 // x ^ y ^ z
5797 __ xorr(dst, x, y);
5798 __ xorr(dst, dst, z);
5799 }
5800 }
5801
5802 // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5803 // e = d
5804 // d = c
5805 // c = ROTL'30(b)
5806 // b = a
5807 // a = T
5808 void sha1_process_round(Register a, Register b, Register c, Register d, Register e,
5809 Register cur_k, Register cur_w, Register tmp, int round) {
5810 assert(round >= 0 && round < 80, "must be");
5811 assert_different_registers(a, b, c, d, e, cur_w, cur_k, tmp, t0);
5812
5813 // T = ROTL'5(a) + f't(b, c, d) + e + K't + W't
5814
5815 // cur_w will be recalculated at the beginning of each round,
5816 // so, we can reuse it as a temp register here.
5817 Register tmp2 = cur_w;
5818
5819 // reuse e as a temporary register, as we will mv new value into it later
5820 Register tmp3 = e;
5821 __ add(tmp2, cur_k, tmp2);
5822 __ add(tmp3, tmp3, tmp2);
5823 __ rolw(tmp2, a, 5, t0);
5824
5825 sha1_f(tmp, b, c, d, round);
5826
5827 __ add(tmp2, tmp2, tmp);
5828 __ add(tmp2, tmp2, tmp3);
5829
5830 // e = d
5831 // d = c
5832 // c = ROTL'30(b)
5833 // b = a
5834 // a = T
5835 __ mv(e, d);
5836 __ mv(d, c);
5837
5838 __ rolw(c, b, 30);
5839 __ mv(b, a);
5840 __ mv(a, tmp2);
5841 }
5842
5843 // H(i)0 = a + H(i-1)0
5844 // H(i)1 = b + H(i-1)1
5845 // H(i)2 = c + H(i-1)2
5846 // H(i)3 = d + H(i-1)3
5847 // H(i)4 = e + H(i-1)4
5848 void sha1_calculate_im_hash(Register a, Register b, Register c, Register d, Register e,
5849 Register prev_ab, Register prev_cd, Register prev_e) {
5850 assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5851
5852 __ add(a, a, prev_ab);
5853 __ srli(prev_ab, prev_ab, 32);
5854 __ add(b, b, prev_ab);
5855
5856 __ add(c, c, prev_cd);
5857 __ srli(prev_cd, prev_cd, 32);
5858 __ add(d, d, prev_cd);
5859
5860 __ add(e, e, prev_e);
5861 }
5862
5863 void sha1_preserve_prev_abcde(Register a, Register b, Register c, Register d, Register e,
5864 Register prev_ab, Register prev_cd, Register prev_e) {
5865 assert_different_registers(a, b, c, d, e, prev_ab, prev_cd, prev_e, t0);
5866
5867 __ slli(t0, b, 32);
5868 __ zext(prev_ab, a, 32);
5869 __ orr(prev_ab, prev_ab, t0);
5870
5871 __ slli(t0, d, 32);
5872 __ zext(prev_cd, c, 32);
5873 __ orr(prev_cd, prev_cd, t0);
5874
5875 __ mv(prev_e, e);
5876 }
5877
5878 // Intrinsic for:
5879 // void sun.security.provider.SHA.implCompress0(byte[] buf, int ofs)
5880 // void sun.security.provider.DigestBase.implCompressMultiBlock0(byte[] b, int ofs, int limit)
5881 //
5882 // Arguments:
5883 //
5884 // Inputs:
5885 // c_rarg0: byte[] src array + offset
5886 // c_rarg1: int[] SHA.state
5887 // - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5888 // c_rarg2: int offset
5889 // c_rarg3: int limit
5890 //
5891 // Outputs:
5892 // - - - - - - below are only for implCompressMultiBlock0 - - - - - -
5893 // c_rarg0: int offset, when (multi_block == true)
5894 //
5895 address generate_sha1_implCompress(StubId stub_id) {
5896 bool multi_block;
5897 switch (stub_id) {
5898 case StubId::stubgen_sha1_implCompress_id:
5899 multi_block = false;
5900 break;
5901 case StubId::stubgen_sha1_implCompressMB_id:
5902 multi_block = true;
5903 break;
5904 default:
5905 ShouldNotReachHere();
5906 };
5907 __ align(CodeEntryAlignment);
5908 StubCodeMark mark(this, stub_id);
5909
5910 address start = __ pc();
5911 __ enter();
5912
5913 RegSet saved_regs = RegSet::range(x18, x27);
5914 if (multi_block) {
5915 // use x9 as src below.
5916 saved_regs += RegSet::of(x9);
5917 }
5918 __ push_reg(saved_regs, sp);
5919
5920 // c_rarg0 - c_rarg3: x10 - x13
5921 Register buf = c_rarg0;
5922 Register state = c_rarg1;
5923 Register offset = c_rarg2;
5924 Register limit = c_rarg3;
5925 // use src to contain the original start point of the array.
5926 Register src = x9;
5927
5928 if (multi_block) {
5929 __ sub(limit, limit, offset);
5930 __ add(limit, limit, buf);
5931 __ sub(src, buf, offset);
5932 }
5933
5934 // [args-reg]: x14 - x17
5935 // [temp-reg]: x28 - x31
5936 // [saved-reg]: x18 - x27
5937
5938 // h0/1/2/3/4
5939 const Register a = x14, b = x15, c = x16, d = x17, e = x28;
5940 // w0, w1, ... w15
5941 // put two adjecent w's in one register:
5942 // one at high word part, another at low word part
5943 // at different round (even or odd), w't value reside in different items in ws[].
5944 // w0 ~ w15, either reside in
5945 // ws[0] ~ ws[7], where
5946 // w0 at higher 32 bits of ws[0],
5947 // w1 at lower 32 bits of ws[0],
5948 // ...
5949 // w14 at higher 32 bits of ws[7],
5950 // w15 at lower 32 bits of ws[7].
5951 // or, reside in
5952 // w0: ws[0]'s lower 32 bits
5953 // w1 ~ w14: ws[1] ~ ws[7]
5954 // w15: ws[8]'s higher 32 bits
5955 Register ws[9] = {x29, x30, x31, x18,
5956 x19, x20, x21, x22,
5957 x23}; // auxiliary register for calculating w's value
5958 // current k't's value
5959 const Register cur_k = x24;
5960 // current w't's value
5961 const Register cur_w = x25;
5962 // values of a, b, c, d, e in the previous round
5963 const Register prev_ab = x26, prev_cd = x27;
5964 const Register prev_e = offset; // reuse offset/c_rarg2
5965
5966 // load 5 words state into a, b, c, d, e.
5967 //
5968 // To minimize the number of memory operations, we apply following
5969 // optimization: read the states (a/b/c/d) of 4-byte values in pairs,
5970 // with a single ld, and split them into 2 registers.
5971 //
5972 // And, as the core algorithm of SHA-1 works on 32-bits words, so
5973 // in the following code, it does not care about the content of
5974 // higher 32-bits in a/b/c/d/e. Based on this observation,
5975 // we can apply further optimization, which is to just ignore the
5976 // higher 32-bits in a/c/e, rather than set the higher
5977 // 32-bits of a/c/e to zero explicitly with extra instructions.
5978 __ ld(a, Address(state, 0));
5979 __ srli(b, a, 32);
5980 __ ld(c, Address(state, 8));
5981 __ srli(d, c, 32);
5982 __ lw(e, Address(state, 16));
5983
5984 Label L_sha1_loop;
5985 if (multi_block) {
5986 __ BIND(L_sha1_loop);
5987 }
5988
5989 sha1_preserve_prev_abcde(a, b, c, d, e, prev_ab, prev_cd, prev_e);
5990
5991 for (int round = 0; round < 80; round++) {
5992 // prepare K't value
5993 sha1_prepare_k(cur_k, round);
5994
5995 // prepare W't value
5996 sha1_prepare_w(cur_w, ws, buf, round);
5997
5998 // one round process
5999 sha1_process_round(a, b, c, d, e, cur_k, cur_w, t2, round);
6000 }
6001
6002 // compute the intermediate hash value
6003 sha1_calculate_im_hash(a, b, c, d, e, prev_ab, prev_cd, prev_e);
6004
6005 if (multi_block) {
6006 int64_t block_bytes = 16 * 4;
6007 __ addi(buf, buf, block_bytes);
6008
6009 __ bge(limit, buf, L_sha1_loop, /* is_far */ true);
6010 }
6011
6012 // store back the state.
6013 __ zext(a, a, 32);
6014 __ slli(b, b, 32);
6015 __ orr(a, a, b);
6016 __ sd(a, Address(state, 0));
6017 __ zext(c, c, 32);
6018 __ slli(d, d, 32);
6019 __ orr(c, c, d);
6020 __ sd(c, Address(state, 8));
6021 __ sw(e, Address(state, 16));
6022
6023 // return offset
6024 if (multi_block) {
6025 __ sub(c_rarg0, buf, src);
6026 }
6027
6028 __ pop_reg(saved_regs, sp);
6029
6030 __ leave();
6031 __ ret();
6032
6033 return (address) start;
6034 }
6035
/**
 * vector registers:
 * input VectorRegister's:  inputV1-V3, for m2 they could be v2, v4, v6, for m1 they could be v1, v2, v3
 * index VectorRegister's:  idxV1-V4, for m2 they could be v8, v10, v12, v14, for m1 they could be v4, v5, v6, v7
 * output VectorRegister's: outputV1-V4, for m2 they could be v16, v18, v20, v22, for m1 they could be v8, v9, v10, v11
 *
 * NOTE: each field will occupy a vector register group
 */
6044 void base64_vector_encode_round(Register src, Register dst, Register codec,
6045 Register size, Register stepSrc, Register stepDst,
6046 VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3,
6047 VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
6048 VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3, VectorRegister outputV4,
6049 Assembler::LMUL lmul) {
6050 // set vector register type/len
6051 __ vsetvli(x0, size, Assembler::e8, lmul);
6052
6053 // segmented load src into v registers: mem(src) => vr(3)
6054 __ vlseg3e8_v(inputV1, src);
6055
6056 // src = src + register_group_len_bytes * 3
6057 __ add(src, src, stepSrc);
6058
6059 // encoding
6060 // 1. compute index into lookup table: vr(3) => vr(4)
6061 __ vsrl_vi(idxV1, inputV1, 2);
6062
6063 __ vsrl_vi(idxV2, inputV2, 2);
6064 __ vsll_vi(inputV1, inputV1, 6);
6065 __ vor_vv(idxV2, idxV2, inputV1);
6066 __ vsrl_vi(idxV2, idxV2, 2);
6067
6068 __ vsrl_vi(idxV3, inputV3, 4);
6069 __ vsll_vi(inputV2, inputV2, 4);
6070 __ vor_vv(idxV3, inputV2, idxV3);
6071 __ vsrl_vi(idxV3, idxV3, 2);
6072
6073 __ vsll_vi(idxV4, inputV3, 2);
6074 __ vsrl_vi(idxV4, idxV4, 2);
6075
6076 // 2. indexed load: vr(4) => vr(4)
6077 __ vluxei8_v(outputV1, codec, idxV1);
6078 __ vluxei8_v(outputV2, codec, idxV2);
6079 __ vluxei8_v(outputV3, codec, idxV3);
6080 __ vluxei8_v(outputV4, codec, idxV4);
6081
6082 // segmented store encoded data in v registers back to dst: vr(4) => mem(dst)
6083 __ vsseg4e8_v(outputV1, dst);
6084
6085 // dst = dst + register_group_len_bytes * 4
6086 __ add(dst, dst, stepDst);
6087 }
6088
6089 /**
6090 * void j.u.Base64.Encoder.encodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL)
6091 *
6092 * Input arguments:
6093 * c_rarg0 - src, source array
6094 * c_rarg1 - sp, src start offset
6095 * c_rarg2 - sl, src end offset
6096 * c_rarg3 - dst, dest array
6097 * c_rarg4 - dp, dst start offset
6098 * c_rarg5 - isURL, Base64 or URL character set
6099 */
6100 address generate_base64_encodeBlock() {
6101 alignas(64) static const char toBase64[64] = {
6102 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6103 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6104 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6105 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6106 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
6107 };
6108
6109 alignas(64) static const char toBase64URL[64] = {
6110 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
6111 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
6112 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
6113 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
6114 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
6115 };
6116
6117 __ align(CodeEntryAlignment);
6118 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
6119 StubCodeMark mark(this, stub_id);
6120 address start = __ pc();
6121 __ enter();
6122
6123 Register src = c_rarg0;
6124 Register soff = c_rarg1;
6125 Register send = c_rarg2;
6126 Register dst = c_rarg3;
6127 Register doff = c_rarg4;
6128 Register isURL = c_rarg5;
6129
6130 Register codec = c_rarg6;
6131 Register length = c_rarg7; // total length of src data in bytes
6132
6133 Label ProcessData, Exit;
6134
6135 // length should be multiple of 3
6136 __ sub(length, send, soff);
6137 // real src/dst to process data
6138 __ add(src, src, soff);
6139 __ add(dst, dst, doff);
6140
6141 // load the codec base address
6142 __ la(codec, ExternalAddress((address) toBase64));
6143 __ beqz(isURL, ProcessData);
6144 __ la(codec, ExternalAddress((address) toBase64URL));
6145 __ BIND(ProcessData);
6146
6147 // vector version
6148 if (UseRVV) {
6149 Label ProcessM2, ProcessM1, ProcessScalar;
6150
6151 Register size = soff;
6152 Register stepSrcM1 = send;
6153 Register stepSrcM2 = doff;
6154 Register stepDst = isURL;
6155
6156 __ mv(size, MaxVectorSize * 2);
6157 __ mv(stepSrcM1, MaxVectorSize * 3);
6158 __ slli(stepSrcM2, stepSrcM1, 1);
6159 __ mv(stepDst, MaxVectorSize * 2 * 4);
6160
6161 __ blt(length, stepSrcM2, ProcessM1);
6162
6163 __ BIND(ProcessM2);
6164 base64_vector_encode_round(src, dst, codec,
6165 size, stepSrcM2, stepDst,
6166 v2, v4, v6, // inputs
6167 v8, v10, v12, v14, // indexes
6168 v16, v18, v20, v22, // outputs
6169 Assembler::m2);
6170
6171 __ sub(length, length, stepSrcM2);
6172 __ bge(length, stepSrcM2, ProcessM2);
6173
6174 __ BIND(ProcessM1);
6175 __ blt(length, stepSrcM1, ProcessScalar);
6176
6177 __ srli(size, size, 1);
6178 __ srli(stepDst, stepDst, 1);
6179 base64_vector_encode_round(src, dst, codec,
6180 size, stepSrcM1, stepDst,
6181 v1, v2, v3, // inputs
6182 v4, v5, v6, v7, // indexes
6183 v8, v9, v10, v11, // outputs
6184 Assembler::m1);
6185 __ sub(length, length, stepSrcM1);
6186
6187 __ BIND(ProcessScalar);
6188 }
6189
6190 // scalar version
6191 {
6192 Register byte1 = soff, byte0 = send, byte2 = doff;
6193 Register combined24Bits = isURL;
6194
6195 __ beqz(length, Exit);
6196
6197 Label ScalarLoop;
6198 __ BIND(ScalarLoop);
6199 {
6200 // plain: [byte0[7:0] : byte1[7:0] : byte2[7:0]] =>
6201 // encoded: [byte0[7:2] : byte0[1:0]+byte1[7:4] : byte1[3:0]+byte2[7:6] : byte2[5:0]]
6202
6203 // load 3 bytes src data
6204 __ lbu(byte0, Address(src, 0));
6205 __ lbu(byte1, Address(src, 1));
6206 __ lbu(byte2, Address(src, 2));
6207 __ addi(src, src, 3);
6208
6209 // construct 24 bits from 3 bytes
6210 __ slliw(byte0, byte0, 16);
6211 __ slliw(byte1, byte1, 8);
6212 __ orr(combined24Bits, byte0, byte1);
6213 __ orr(combined24Bits, combined24Bits, byte2);
6214
6215 // get codec index and encode(ie. load from codec by index)
6216 __ slliw(byte0, combined24Bits, 8);
6217 __ srliw(byte0, byte0, 26);
6218 __ add(byte0, codec, byte0);
6219 __ lbu(byte0, byte0);
6220
6221 __ slliw(byte1, combined24Bits, 14);
6222 __ srliw(byte1, byte1, 26);
6223 __ add(byte1, codec, byte1);
6224 __ lbu(byte1, byte1);
6225
6226 __ slliw(byte2, combined24Bits, 20);
6227 __ srliw(byte2, byte2, 26);
6228 __ add(byte2, codec, byte2);
6229 __ lbu(byte2, byte2);
6230
6231 __ andi(combined24Bits, combined24Bits, 0x3f);
6232 __ add(combined24Bits, codec, combined24Bits);
6233 __ lbu(combined24Bits, combined24Bits);
6234
6235 // store 4 bytes encoded data
6236 __ sb(byte0, Address(dst, 0));
6237 __ sb(byte1, Address(dst, 1));
6238 __ sb(byte2, Address(dst, 2));
6239 __ sb(combined24Bits, Address(dst, 3));
6240
6241 __ subi(length, length, 3);
6242 __ addi(dst, dst, 4);
6243 // loop back
6244 __ bnez(length, ScalarLoop);
6245 }
6246 }
6247
6248 __ BIND(Exit);
6249
6250 __ leave();
6251 __ ret();
6252
6253 return (address) start;
6254 }
6255
/**
 * vector registers:
 * input VectorRegister's:  inputV1-V4, for m2 they could be v2, v4, v6, v8, for m1 they could be v1, v2, v3, v4
 * index VectorRegister's:  idxV1-V4, for m2 they could be v10, v12, v14, v16, for m1 they could be v5, v6, v7, v8
 * output VectorRegister's: outputV1-V3, for m2 they could be v18, v20, v22, for m1 they could be v9, v10, v11
 *
 * NOTE: each field will occupy a single vector register group
 */
6264 void base64_vector_decode_round(Register src, Register dst, Register codec,
6265 Register size, Register stepSrc, Register stepDst, Register failedIdx,
6266 VectorRegister inputV1, VectorRegister inputV2, VectorRegister inputV3, VectorRegister inputV4,
6267 VectorRegister idxV1, VectorRegister idxV2, VectorRegister idxV3, VectorRegister idxV4,
6268 VectorRegister outputV1, VectorRegister outputV2, VectorRegister outputV3,
6269 Assembler::LMUL lmul) {
6270 // set vector register type/len
6271 __ vsetvli(x0, size, Assembler::e8, lmul, Assembler::ma, Assembler::ta);
6272
6273 // segmented load src into v registers: mem(src) => vr(4)
6274 __ vlseg4e8_v(inputV1, src);
6275
6276 // src = src + register_group_len_bytes * 4
6277 __ add(src, src, stepSrc);
6278
6279 // decoding
6280 // 1. indexed load: vr(4) => vr(4)
6281 __ vluxei8_v(idxV1, codec, inputV1);
6282 __ vluxei8_v(idxV2, codec, inputV2);
6283 __ vluxei8_v(idxV3, codec, inputV3);
6284 __ vluxei8_v(idxV4, codec, inputV4);
6285
6286 // 2. check wrong data
6287 __ vor_vv(outputV1, idxV1, idxV2);
6288 __ vor_vv(outputV2, idxV3, idxV4);
6289 __ vor_vv(outputV1, outputV1, outputV2);
6290 __ vmseq_vi(v0, outputV1, -1);
6291 __ vfirst_m(failedIdx, v0);
6292 Label NoFailure, FailureAtIdx0;
6293 // valid value can only be -1 when < 0
6294 __ bltz(failedIdx, NoFailure);
6295 // when the first data (at index 0) fails, no need to process data anymore
6296 __ beqz(failedIdx, FailureAtIdx0);
6297 __ vsetvli(x0, failedIdx, Assembler::e8, lmul, Assembler::mu, Assembler::tu);
6298 __ slli(stepDst, failedIdx, 1);
6299 __ add(stepDst, failedIdx, stepDst);
6300 __ BIND(NoFailure);
6301
6302 // 3. compute the decoded data: vr(4) => vr(3)
6303 __ vsll_vi(idxV1, idxV1, 2);
6304 __ vsrl_vi(outputV1, idxV2, 4);
6305 __ vor_vv(outputV1, outputV1, idxV1);
6306
6307 __ vsll_vi(idxV2, idxV2, 4);
6308 __ vsrl_vi(outputV2, idxV3, 2);
6309 __ vor_vv(outputV2, outputV2, idxV2);
6310
6311 __ vsll_vi(idxV3, idxV3, 6);
6312 __ vor_vv(outputV3, idxV4, idxV3);
6313
6314 // segmented store encoded data in v registers back to dst: vr(3) => mem(dst)
6315 __ vsseg3e8_v(outputV1, dst);
6316
6317 // dst = dst + register_group_len_bytes * 3
6318 __ add(dst, dst, stepDst);
6319 __ BIND(FailureAtIdx0);
6320 }
6321
6322 /**
6323 * int j.u.Base64.Decoder.decodeBlock(byte[] src, int sp, int sl, byte[] dst, int dp, boolean isURL, boolean isMIME)
6324 *
6325 * Input arguments:
6326 * c_rarg0 - src, source array
6327 * c_rarg1 - sp, src start offset
6328 * c_rarg2 - sl, src end offset
6329 * c_rarg3 - dst, dest array
6330 * c_rarg4 - dp, dst start offset
6331 * c_rarg5 - isURL, Base64 or URL character set
6332 * c_rarg6 - isMIME, Decoding MIME block
6333 */
6334 address generate_base64_decodeBlock() {
6335
6336 static const uint8_t fromBase64[256] = {
6337 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6338 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6339 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
6340 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
6341 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
6342 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
6343 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
6344 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
6345 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6346 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6347 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6348 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6349 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6350 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6351 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6352 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6353 };
6354
6355 static const uint8_t fromBase64URL[256] = {
6356 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6357 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6358 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
6359 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
6360 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
6361 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
6362 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
6363 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
6364 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6365 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6366 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6367 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6368 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6369 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6370 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6371 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
6372 };
6373
6374 __ align(CodeEntryAlignment);
6375 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
6376 StubCodeMark mark(this, stub_id);
6377 address start = __ pc();
6378 __ enter();
6379
6380 Register src = c_rarg0;
6381 Register soff = c_rarg1;
6382 Register send = c_rarg2;
6383 Register dst = c_rarg3;
6384 Register doff = c_rarg4;
6385 Register isURL = c_rarg5;
6386 Register isMIME = c_rarg6;
6387
6388 Register codec = c_rarg7;
6389 Register dstBackup = t6;
6390 Register length = t3; // total length of src data in bytes
6391
6392 Label ProcessData, Exit;
6393 Label ProcessScalar, ScalarLoop;
6394
6395 // passed in length (send - soff) is guaranteed to be > 4,
6396 // and in this intrinsic we only process data of length in multiple of 4,
6397 // it's not guaranteed to be multiple of 4 by java level, so do it explicitly
6398 __ sub(length, send, soff);
6399 __ andi(length, length, -4);
6400 // real src/dst to process data
6401 __ add(src, src, soff);
6402 __ add(dst, dst, doff);
6403 // backup of dst, used to calculate the return value at exit
6404 __ mv(dstBackup, dst);
6405
6406 // load the codec base address
6407 __ la(codec, ExternalAddress((address) fromBase64));
6408 __ beqz(isURL, ProcessData);
6409 __ la(codec, ExternalAddress((address) fromBase64URL));
6410 __ BIND(ProcessData);
6411
6412 // vector version
6413 if (UseRVV) {
6414 // for MIME case, it has a default length limit of 76 which could be
6415 // different(smaller) from (send - soff), so in MIME case, we go through
6416 // the scalar code path directly.
6417 __ bnez(isMIME, ScalarLoop);
6418
6419 Label ProcessM1, ProcessM2;
6420
6421 Register failedIdx = soff;
6422 Register stepSrcM1 = send;
6423 Register stepSrcM2 = doff;
6424 Register stepDst = isURL;
6425 Register size = t4;
6426
6427 __ mv(size, MaxVectorSize * 2);
6428 __ mv(stepSrcM1, MaxVectorSize * 4);
6429 __ slli(stepSrcM2, stepSrcM1, 1);
6430 __ mv(stepDst, MaxVectorSize * 2 * 3);
6431
6432 __ blt(length, stepSrcM2, ProcessM1);
6433
6434
6435 // Assembler::m2
6436 __ BIND(ProcessM2);
6437 base64_vector_decode_round(src, dst, codec,
6438 size, stepSrcM2, stepDst, failedIdx,
6439 v2, v4, v6, v8, // inputs
6440 v10, v12, v14, v16, // indexes
6441 v18, v20, v22, // outputs
6442 Assembler::m2);
6443 __ sub(length, length, stepSrcM2);
6444
6445 // error check
6446 // valid value of failedIdx can only be -1 when < 0
6447 __ bgez(failedIdx, Exit);
6448
6449 __ bge(length, stepSrcM2, ProcessM2);
6450
6451
6452 // Assembler::m1
6453 __ BIND(ProcessM1);
6454 __ blt(length, stepSrcM1, ProcessScalar);
6455
6456 __ srli(size, size, 1);
6457 __ srli(stepDst, stepDst, 1);
6458 base64_vector_decode_round(src, dst, codec,
6459 size, stepSrcM1, stepDst, failedIdx,
6460 v1, v2, v3, v4, // inputs
6461 v5, v6, v7, v8, // indexes
6462 v9, v10, v11, // outputs
6463 Assembler::m1);
6464 __ sub(length, length, stepSrcM1);
6465
6466 // error check
6467 // valid value of failedIdx can only be -1 when < 0
6468 __ bgez(failedIdx, Exit);
6469
6470 __ BIND(ProcessScalar);
6471 __ beqz(length, Exit);
6472 }
6473
6474 // scalar version
6475 {
6476 Register byte0 = soff, byte1 = send, byte2 = doff, byte3 = isURL;
6477 Register combined32Bits = t4;
6478
6479 // encoded: [byte0[5:0] : byte1[5:0] : byte2[5:0]] : byte3[5:0]] =>
6480 // plain: [byte0[5:0]+byte1[5:4] : byte1[3:0]+byte2[5:2] : byte2[1:0]+byte3[5:0]]
6481 __ BIND(ScalarLoop);
6482
6483 // load 4 bytes encoded src data
6484 __ lbu(byte0, Address(src, 0));
6485 __ lbu(byte1, Address(src, 1));
6486 __ lbu(byte2, Address(src, 2));
6487 __ lbu(byte3, Address(src, 3));
6488 __ addi(src, src, 4);
6489
6490 // get codec index and decode (ie. load from codec by index)
6491 __ add(byte0, codec, byte0);
6492 __ add(byte1, codec, byte1);
6493 __ lb(byte0, Address(byte0, 0));
6494 __ lb(byte1, Address(byte1, 0));
6495 __ add(byte2, codec, byte2);
6496 __ add(byte3, codec, byte3);
6497 __ lb(byte2, Address(byte2, 0));
6498 __ lb(byte3, Address(byte3, 0));
6499 __ slliw(byte0, byte0, 18);
6500 __ slliw(byte1, byte1, 12);
6501 __ orr(byte0, byte0, byte1);
6502 __ orr(byte0, byte0, byte3);
6503 __ slliw(byte2, byte2, 6);
6504 // For performance consideration, `combined32Bits` is constructed for 2 purposes at the same time,
6505 // 1. error check below
6506 // 2. decode below
6507 __ orr(combined32Bits, byte0, byte2);
6508
6509 // error check
6510 __ bltz(combined32Bits, Exit);
6511
6512 // store 3 bytes decoded data
6513 __ sraiw(byte0, combined32Bits, 16);
6514 __ sraiw(byte1, combined32Bits, 8);
6515 __ sb(byte0, Address(dst, 0));
6516 __ sb(byte1, Address(dst, 1));
6517 __ sb(combined32Bits, Address(dst, 2));
6518
6519 __ subi(length, length, 4);
6520 __ addi(dst, dst, 3);
6521 // loop back
6522 __ bnez(length, ScalarLoop);
6523 }
6524
6525 __ BIND(Exit);
6526 __ sub(c_rarg0, dst, dstBackup);
6527
6528 __ leave();
6529 __ ret();
6530
6531 return (address) start;
6532 }
6533
6534 void adler32_process_bytes(Register buff, Register s1, Register s2, VectorRegister vtable,
6535 VectorRegister vzero, VectorRegister vbytes, VectorRegister vs1acc, VectorRegister vs2acc,
6536 Register temp0, Register temp1, Register temp2, Register temp3,
6537 VectorRegister vtemp1, VectorRegister vtemp2, int step, Assembler::LMUL lmul) {
6538
6539 assert((lmul == Assembler::m4 && step == 64) ||
6540 (lmul == Assembler::m2 && step == 32) ||
6541 (lmul == Assembler::m1 && step == 16),
6542 "LMUL should be aligned with step: m4 and 64, m2 and 32 or m1 and 16");
6543 // Below is function for calculating Adler32 checksum with 64-, 32- or 16-byte step. LMUL=m4, m2 or m1 is used.
6544 // The results are in v12, v13, ..., v22, v23. Example below is for 64-byte step case.
6545 // We use b1, b2, ..., b64 to denote the 64 bytes loaded in each iteration.
6546 // In non-vectorized code, we update s1 and s2 as:
6547 // s1 <- s1 + b1
6548 // s2 <- s2 + s1
6549 // s1 <- s1 + b2
6550 // s2 <- s2 + b1
6551 // ...
6552 // s1 <- s1 + b64
6553 // s2 <- s2 + s1
6554 // Putting above assignments together, we have:
6555 // s1_new = s1 + b1 + b2 + ... + b64
6556 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b64) =
6557 // = s2 + s1 * 64 + (b1 * 64 + b2 * 63 + ... + b64 * 1) =
6558 // = s2 + s1 * 64 + (b1, b2, ... b64) dot (64, 63, ... 1)
6559
6560 __ mv(temp3, step);
6561 // Load data
6562 __ vsetvli(temp0, temp3, Assembler::e8, lmul);
6563 __ vle8_v(vbytes, buff);
6564 __ addi(buff, buff, step);
6565
6566 // Upper bound reduction sum for s1_new:
6567 // 0xFF * 64 = 0x3FC0, so:
6568 // 1. Need to do vector-widening reduction sum
6569 // 2. It is safe to perform sign-extension during vmv.x.s with 16-bits elements
6570 __ vwredsumu_vs(vs1acc, vbytes, vzero);
6571 // Multiplication for s2_new
6572 __ vwmulu_vv(vs2acc, vtable, vbytes);
6573
6574 // s2 = s2 + s1 * log2(step)
6575 __ slli(temp1, s1, exact_log2(step));
6576 __ add(s2, s2, temp1);
6577
6578 // Summing up calculated results for s2_new
6579 if (MaxVectorSize > 16) {
6580 __ vsetvli(temp0, temp3, Assembler::e16, lmul);
6581 } else {
6582 // Half of vector-widening multiplication result is in successor of vs2acc
6583 // group for vlen == 16, in which case we need to double vector register
6584 // group width in order to reduction sum all of them
6585 Assembler::LMUL lmulx2 = (lmul == Assembler::m1) ? Assembler::m2 :
6586 (lmul == Assembler::m2) ? Assembler::m4 : Assembler::m8;
6587 __ vsetvli(temp0, temp3, Assembler::e16, lmulx2);
6588 }
6589 // Upper bound for reduction sum:
6590 // 0xFF * (64 + 63 + ... + 2 + 1) = 0x817E0 max for whole register group, so:
6591 // 1. Need to do vector-widening reduction sum
6592 // 2. It is safe to perform sign-extension during vmv.x.s with 32-bits elements
6593 __ vwredsumu_vs(vtemp1, vs2acc, vzero);
6594
6595 // Extracting results for:
6596 // s1_new
6597 __ vmv_x_s(temp0, vs1acc);
6598 __ add(s1, s1, temp0);
6599 // s2_new
6600 __ vsetvli(temp0, temp3, Assembler::e32, Assembler::m1);
6601 __ vmv_x_s(temp1, vtemp1);
6602 __ add(s2, s2, temp1);
6603 }
6604
6605 /***
6606 * int java.util.zip.Adler32.updateBytes(int adler, byte[] b, int off, int len)
6607 *
6608 * Arguments:
6609 *
6610 * Inputs:
6611 * c_rarg0 - int adler
6612 * c_rarg1 - byte* buff (b + off)
6613 * c_rarg2 - int len
6614 *
6615 * Output:
6616 * c_rarg0 - int adler result
6617 */
6618 address generate_updateBytesAdler32() {
6619 __ align(CodeEntryAlignment);
6620 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
6621 StubCodeMark mark(this, stub_id);
6622 address start = __ pc();
6623
6624 Label L_nmax, L_nmax_loop, L_nmax_loop_entry, L_by16, L_by16_loop,
6625 L_by16_loop_unroll, L_by1_loop, L_do_mod, L_combine, L_by1;
6626
6627 // Aliases
6628 Register adler = c_rarg0;
6629 Register s1 = c_rarg0;
6630 Register s2 = c_rarg3;
6631 Register buff = c_rarg1;
6632 Register len = c_rarg2;
6633 Register nmax = c_rarg4;
6634 Register base = c_rarg5;
6635 Register count = c_rarg6;
6636 Register temp0 = t3;
6637 Register temp1 = t4;
6638 Register temp2 = t5;
6639 Register temp3 = t6;
6640
6641 VectorRegister vzero = v31;
6642 VectorRegister vbytes = v8; // group: v8, v9, v10, v11
6643 VectorRegister vs1acc = v12; // group: v12, v13, v14, v15
6644 VectorRegister vs2acc = v16; // group: v16, v17, v18, v19, v20, v21, v22, v23
6645 VectorRegister vtable_64 = v24; // group: v24, v25, v26, v27
6646 VectorRegister vtable_32 = v4; // group: v4, v5
6647 VectorRegister vtable_16 = v30;
6648 VectorRegister vtemp1 = v28;
6649 VectorRegister vtemp2 = v29;
6650
6651 // Max number of bytes we can process before having to take the mod
6652 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
6653 const uint64_t BASE = 0xfff1;
6654 const uint64_t NMAX = 0x15B0;
6655
6656 // Loops steps
6657 int step_64 = 64;
6658 int step_32 = 32;
6659 int step_16 = 16;
6660 int step_1 = 1;
6661
6662 __ enter(); // Required for proper stackwalking of RuntimeStub frame
6663 __ mv(temp1, 64);
6664 __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m4);
6665
6666 // Generating accumulation coefficients for further calculations
6667 // vtable_64:
6668 __ vid_v(vtemp1);
6669 __ vrsub_vx(vtable_64, vtemp1, temp1);
6670 // vtable_64 group now contains { 0x40, 0x3f, 0x3e, ..., 0x3, 0x2, 0x1 }
6671
6672 // vtable_32:
6673 __ mv(temp1, 32);
6674 __ vsetvli(temp0, temp1, Assembler::e8, Assembler::m2);
6675 __ vid_v(vtemp1);
6676 __ vrsub_vx(vtable_32, vtemp1, temp1);
6677 // vtable_32 group now contains { 0x20, 0x1f, 0x1e, ..., 0x3, 0x2, 0x1 }
6678
6679 __ vsetivli(temp0, 16, Assembler::e8, Assembler::m1);
6680 // vtable_16:
6681 __ mv(temp1, 16);
6682 __ vid_v(vtemp1);
6683 __ vrsub_vx(vtable_16, vtemp1, temp1);
6684 // vtable_16 now contains { 0x10, 0xf, 0xe, ..., 0x3, 0x2, 0x1 }
6685
6686 __ vmv_v_i(vzero, 0);
6687
6688 __ mv(base, BASE);
6689 __ mv(nmax, NMAX);
6690
6691 // s1 is initialized to the lower 16 bits of adler
6692 // s2 is initialized to the upper 16 bits of adler
6693 __ srliw(s2, adler, 16); // s2 = ((adler >> 16) & 0xffff)
6694 __ zext(s1, adler, 16); // s1 = (adler & 0xffff)
6695
6696 // The pipelined loop needs at least 16 elements for 1 iteration
6697 // It does check this, but it is more effective to skip to the cleanup loop
6698 __ mv(temp0, step_16);
6699 __ bgeu(len, temp0, L_nmax);
6700 __ beqz(len, L_combine);
6701
6702 // Jumping to L_by1_loop
6703 __ subi(len, len, step_1);
6704 __ j(L_by1_loop);
6705
6706 __ bind(L_nmax);
6707 __ sub(len, len, nmax);
6708 __ subi(count, nmax, 16);
6709 __ bltz(len, L_by16);
6710
6711 // Align L_nmax loop by 64
6712 __ bind(L_nmax_loop_entry);
6713 __ subi(count, count, 32);
6714
6715 __ bind(L_nmax_loop);
6716 adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
6717 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6718 vtemp1, vtemp2, step_64, Assembler::m4);
6719 __ subi(count, count, step_64);
6720 __ bgtz(count, L_nmax_loop);
6721
6722 // There are three iterations left to do
6723 adler32_process_bytes(buff, s1, s2, vtable_32, vzero,
6724 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6725 vtemp1, vtemp2, step_32, Assembler::m2);
6726 adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
6727 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6728 vtemp1, vtemp2, step_16, Assembler::m1);
6729
6730 // s1 = s1 % BASE
6731 __ remuw(s1, s1, base);
6732 // s2 = s2 % BASE
6733 __ remuw(s2, s2, base);
6734
6735 __ sub(len, len, nmax);
6736 __ subi(count, nmax, 16);
6737 __ bgez(len, L_nmax_loop_entry);
6738
6739 __ bind(L_by16);
6740 __ add(len, len, count);
6741 __ bltz(len, L_by1);
6742 // Trying to unroll
6743 __ mv(temp3, step_64);
6744 __ blt(len, temp3, L_by16_loop);
6745
6746 __ bind(L_by16_loop_unroll);
6747 adler32_process_bytes(buff, s1, s2, vtable_64, vzero,
6748 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6749 vtemp1, vtemp2, step_64, Assembler::m4);
6750 __ subi(len, len, step_64);
6751 // By now the temp3 should still be 64
6752 __ bge(len, temp3, L_by16_loop_unroll);
6753
6754 __ bind(L_by16_loop);
6755 adler32_process_bytes(buff, s1, s2, vtable_16, vzero,
6756 vbytes, vs1acc, vs2acc, temp0, temp1, temp2, temp3,
6757 vtemp1, vtemp2, step_16, Assembler::m1);
6758 __ subi(len, len, step_16);
6759 __ bgez(len, L_by16_loop);
6760
6761 __ bind(L_by1);
6762 __ addi(len, len, 15);
6763 __ bltz(len, L_do_mod);
6764
6765 __ bind(L_by1_loop);
6766 __ lbu(temp0, Address(buff, 0));
6767 __ addi(buff, buff, step_1);
6768 __ add(s1, temp0, s1);
6769 __ add(s2, s2, s1);
6770 __ subi(len, len, step_1);
6771 __ bgez(len, L_by1_loop);
6772
6773 __ bind(L_do_mod);
6774 // s1 = s1 % BASE
6775 __ remuw(s1, s1, base);
6776 // s2 = s2 % BASE
6777 __ remuw(s2, s2, base);
6778
6779 // Combine lower bits and higher bits
6780 // adler = s1 | (s2 << 16)
6781 __ bind(L_combine);
6782 __ slli(s2, s2, 16);
6783 __ orr(s1, s1, s2);
6784
6785 __ leave(); // Required for proper stackwalking of RuntimeStub frame
6786 __ ret();
6787
6788 return start;
6789 }
6790
6791 #endif // COMPILER2
6792
6793 // x10 = input (float16)
6794 // f10 = result (float)
6795 // t1 = temporary register
6796 address generate_float16ToFloat() {
6797 __ align(CodeEntryAlignment);
6798 StubId stub_id = StubId::stubgen_hf2f_id;
6799 StubCodeMark mark(this, stub_id);
6800 address entry = __ pc();
6801 BLOCK_COMMENT("float16ToFloat:");
6802
6803 FloatRegister dst = f10;
6804 Register src = x10;
6805 Label NaN_SLOW;
6806
6807 assert(VM_Version::supports_float16_float_conversion(), "must");
6808
6809 // On riscv, NaN needs a special process as fcvt does not work in that case.
6810 // On riscv, Inf does not need a special process as fcvt can handle it correctly.
6811 // but we consider to get the slow path to process NaN and Inf at the same time,
6812 // as both of them are rare cases, and if we try to get the slow path to handle
6813 // only NaN case it would sacrifise the performance for normal cases,
6814 // i.e. non-NaN and non-Inf cases.
6815
6816 // check whether it's a NaN or +/- Inf.
6817 __ mv(t0, 0x7c00);
6818 __ andr(t1, src, t0);
6819 // jump to stub processing NaN and Inf cases.
6820 __ beq(t0, t1, NaN_SLOW);
6821
6822 // non-NaN or non-Inf cases, just use built-in instructions.
6823 __ fmv_h_x(dst, src);
6824 __ fcvt_s_h(dst, dst);
6825 __ ret();
6826
6827 __ bind(NaN_SLOW);
6828 // following instructions mainly focus on NaN, as riscv does not handle
6829 // NaN well with fcvt, but the code also works for Inf at the same time.
6830
6831 // construct a NaN in 32 bits from the NaN in 16 bits,
6832 // we need the payloads of non-canonical NaNs to be preserved.
6833 __ mv(t1, 0x7f800000);
6834 // sign-bit was already set via sign-extension if necessary.
6835 __ slli(t0, src, 13);
6836 __ orr(t1, t0, t1);
6837 __ fmv_w_x(dst, t1);
6838
6839 __ ret();
6840 return entry;
6841 }
6842
6843 // f10 = input (float)
6844 // x10 = result (float16)
6845 // f11 = temporary float register
6846 // t1 = temporary register
6847 address generate_floatToFloat16() {
6848 __ align(CodeEntryAlignment);
6849 StubId stub_id = StubId::stubgen_f2hf_id;
6850 StubCodeMark mark(this, stub_id);
6851 address entry = __ pc();
6852 BLOCK_COMMENT("floatToFloat16:");
6853
6854 Register dst = x10;
6855 FloatRegister src = f10, ftmp = f11;
6856 Label NaN_SLOW;
6857
6858 assert(VM_Version::supports_float16_float_conversion(), "must");
6859
6860 // On riscv, NaN needs a special process as fcvt does not work in that case.
6861
6862 // check whether it's a NaN.
6863 // replace fclass with feq as performance optimization.
6864 __ feq_s(t0, src, src);
6865 // jump to stub processing NaN cases.
6866 __ beqz(t0, NaN_SLOW);
6867
6868 // non-NaN cases, just use built-in instructions.
6869 __ fcvt_h_s(ftmp, src);
6870 __ fmv_x_h(dst, ftmp);
6871 __ ret();
6872
6873 __ bind(NaN_SLOW);
6874
6875 __ float_to_float16_NaN(dst, src, t0, t1);
6876
6877 __ ret();
6878 return entry;
6879 }
6880
6881 #ifdef COMPILER2
6882
// Bit masks used by the Poly1305 code below: right_2_bits keeps the two
// lowest bits of a value, right_3_bits the three lowest bits.
static const int64_t right_2_bits = right_n_bits(2);
static const int64_t right_3_bits = right_n_bits(3);
6885
6886 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
6887 // are represented as long[5], with BITS_PER_LIMB = 26.
6888 // Pack five 26-bit limbs into three 64-bit registers.
6889 void poly1305_pack_26(Register dest0, Register dest1, Register dest2, Register src, Register tmp1, Register tmp2) {
6890 assert_different_registers(dest0, dest1, dest2, src, tmp1, tmp2);
6891
6892 // The goal is to have 128-bit value in dest2:dest1:dest0
6893 __ ld(dest0, Address(src, 0)); // 26 bits in dest0
6894
6895 __ ld(tmp1, Address(src, sizeof(jlong)));
6896 __ slli(tmp1, tmp1, 26);
6897 __ add(dest0, dest0, tmp1); // 52 bits in dest0
6898
6899 __ ld(tmp2, Address(src, 2 * sizeof(jlong)));
6900 __ slli(tmp1, tmp2, 52);
6901 __ add(dest0, dest0, tmp1); // dest0 is full
6902
6903 __ srli(dest1, tmp2, 12); // 14-bit in dest1
6904
6905 __ ld(tmp1, Address(src, 3 * sizeof(jlong)));
6906 __ slli(tmp1, tmp1, 14);
6907 __ add(dest1, dest1, tmp1); // 40-bit in dest1
6908
6909 __ ld(tmp1, Address(src, 4 * sizeof(jlong)));
6910 __ slli(tmp2, tmp1, 40);
6911 __ add(dest1, dest1, tmp2); // dest1 is full
6912
6913 if (dest2->is_valid()) {
6914 __ srli(tmp1, tmp1, 24);
6915 __ mv(dest2, tmp1); // 2 bits in dest2
6916 } else {
6917 #ifdef ASSERT
6918 Label OK;
6919 __ srli(tmp1, tmp1, 24);
6920 __ beq(zr, tmp1, OK); // 2 bits
6921 __ stop("high bits of Poly1305 integer should be zero");
6922 __ should_not_reach_here();
6923 __ bind(OK);
6924 #endif
6925 }
6926 }
6927
// As above, but return only a 128-bit integer, packed into two
// 64-bit registers.
// Forwards to the three-register version with noreg as dest2, so no
// register receives the bits above 128; in ASSERT builds that version
// emits a check that those bits are zero.
void poly1305_pack_26(Register dest0, Register dest1, Register src, Register tmp1, Register tmp2) {
  poly1305_pack_26(dest0, dest1, noreg, src, tmp1, tmp2);
}
6933
6934 // U_2:U_1:U_0: += (U_2 >> 2) * 5
6935 void poly1305_reduce(Register U_2, Register U_1, Register U_0, Register tmp1, Register tmp2) {
6936 assert_different_registers(U_2, U_1, U_0, tmp1, tmp2);
6937
6938 // First, U_2:U_1:U_0 += (U_2 >> 2)
6939 __ srli(tmp1, U_2, 2);
6940 __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
6941 __ andi(U_2, U_2, right_2_bits); // Clear U_2 except for the lowest two bits
6942 __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
6943 __ add(U_2, U_2, tmp2);
6944
6945 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
6946 __ slli(tmp1, tmp1, 2);
6947 __ cad(U_0, U_0, tmp1, tmp2); // Add tmp1 to U_0 with carry output to tmp2
6948 __ cad(U_1, U_1, tmp2, tmp2); // Add carry to U_1 with carry output to tmp2
6949 __ add(U_2, U_2, tmp2);
6950 }
6951
6952 // Poly1305, RFC 7539
6953 // void com.sun.crypto.provider.Poly1305.processMultipleBlocks(byte[] input, int offset, int length, long[] aLimbs, long[] rLimbs)
6954
6955 // Arguments:
6956 // c_rarg0: input_start -- where the input is stored
6957 // c_rarg1: length
6958 // c_rarg2: acc_start -- where the output will be stored
6959 // c_rarg3: r_start -- where the randomly generated 128-bit key is stored
6960
6961 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
6962 // description of the tricks used to simplify and accelerate this
6963 // computation.
6964
6965 address generate_poly1305_processBlocks() {
6966 __ align(CodeEntryAlignment);
6967 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
6968 StubCodeMark mark(this, stub_id);
6969 address start = __ pc();
6970 __ enter();
6971 Label here;
6972
6973 RegSet saved_regs = RegSet::range(x18, x21);
6974 RegSetIterator<Register> regs = (RegSet::range(x14, x31) - RegSet::range(x22, x27)).begin();
6975 __ push_reg(saved_regs, sp);
6976
6977 // Arguments
6978 const Register input_start = c_rarg0, length = c_rarg1, acc_start = c_rarg2, r_start = c_rarg3;
6979
6980 // R_n is the 128-bit randomly-generated key, packed into two
6981 // registers. The caller passes this key to us as long[5], with
6982 // BITS_PER_LIMB = 26.
6983 const Register R_0 = *regs, R_1 = *++regs;
6984 poly1305_pack_26(R_0, R_1, r_start, t1, t2);
6985
6986 // RR_n is (R_n >> 2) * 5
6987 const Register RR_0 = *++regs, RR_1 = *++regs;
6988 __ srli(t1, R_0, 2);
6989 __ shadd(RR_0, t1, t1, t2, 2);
6990 __ srli(t1, R_1, 2);
6991 __ shadd(RR_1, t1, t1, t2, 2);
6992
6993 // U_n is the current checksum
6994 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
6995 poly1305_pack_26(U_0, U_1, U_2, acc_start, t1, t2);
6996
6997 static constexpr int BLOCK_LENGTH = 16;
6998 Label DONE, LOOP;
6999
7000 __ mv(t1, BLOCK_LENGTH);
7001 __ blt(length, t1, DONE); {
7002 __ bind(LOOP);
7003
7004 // S_n is to be the sum of U_n and the next block of data
7005 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
7006 __ ld(S_0, Address(input_start, 0));
7007 __ ld(S_1, Address(input_start, wordSize));
7008
7009 __ cad(S_0, S_0, U_0, t1); // Add U_0 to S_0 with carry output to t1
7010 __ cadc(S_1, S_1, U_1, t1); // Add U_1 with carry to S_1 with carry output to t1
7011 __ add(S_2, U_2, t1);
7012
7013 __ addi(S_2, S_2, 1);
7014
7015 const Register U_0HI = *++regs, U_1HI = *++regs;
7016
7017 // NB: this logic depends on some of the special properties of
7018 // Poly1305 keys. In particular, because we know that the top
7019 // four bits of R_0 and R_1 are zero, we can add together
7020 // partial products without any risk of needing to propagate a
7021 // carry out.
7022 __ wide_mul(U_0, U_0HI, S_0, R_0);
7023 __ wide_madd(U_0, U_0HI, S_1, RR_1, t1, t2);
7024 __ wide_madd(U_0, U_0HI, S_2, RR_0, t1, t2);
7025
7026 __ wide_mul(U_1, U_1HI, S_0, R_1);
7027 __ wide_madd(U_1, U_1HI, S_1, R_0, t1, t2);
7028 __ wide_madd(U_1, U_1HI, S_2, RR_1, t1, t2);
7029
7030 __ andi(U_2, R_0, right_2_bits);
7031 __ mul(U_2, S_2, U_2);
7032
7033 // Partial reduction mod 2**130 - 5
7034 __ cad(U_1, U_1, U_0HI, t1); // Add U_0HI to U_1 with carry output to t1
7035 __ adc(U_2, U_2, U_1HI, t1);
7036 // Sum is now in U_2:U_1:U_0.
7037
7038 // U_2:U_1:U_0: += (U_2 >> 2) * 5
7039 poly1305_reduce(U_2, U_1, U_0, t1, t2);
7040
7041 __ subi(length, length, BLOCK_LENGTH);
7042 __ addi(input_start, input_start, BLOCK_LENGTH);
7043 __ mv(t1, BLOCK_LENGTH);
7044 __ bge(length, t1, LOOP);
7045 }
7046
7047 // Further reduce modulo 2^130 - 5
7048 poly1305_reduce(U_2, U_1, U_0, t1, t2);
7049
7050 // Unpack the sum into five 26-bit limbs and write to memory.
7051 // First 26 bits is the first limb
7052 __ slli(t1, U_0, 38); // Take lowest 26 bits
7053 __ srli(t1, t1, 38);
7054 __ sd(t1, Address(acc_start)); // First 26-bit limb
7055
7056 // 27-52 bits of U_0 is the second limb
7057 __ slli(t1, U_0, 12); // Take next 27-52 bits
7058 __ srli(t1, t1, 38);
7059 __ sd(t1, Address(acc_start, sizeof (jlong))); // Second 26-bit limb
7060
7061 // Getting 53-64 bits of U_0 and 1-14 bits of U_1 in one register
7062 __ srli(t1, U_0, 52);
7063 __ slli(t2, U_1, 50);
7064 __ srli(t2, t2, 38);
7065 __ add(t1, t1, t2);
7066 __ sd(t1, Address(acc_start, 2 * sizeof (jlong))); // Third 26-bit limb
7067
7068 // Storing 15-40 bits of U_1
7069 __ slli(t1, U_1, 24); // Already used up 14 bits
7070 __ srli(t1, t1, 38); // Clear all other bits from t1
7071 __ sd(t1, Address(acc_start, 3 * sizeof (jlong))); // Fourth 26-bit limb
7072
7073 // Storing 41-64 bits of U_1 and first three bits from U_2 in one register
7074 __ srli(t1, U_1, 40);
7075 __ andi(t2, U_2, right_3_bits);
7076 __ slli(t2, t2, 24);
7077 __ add(t1, t1, t2);
7078 __ sd(t1, Address(acc_start, 4 * sizeof (jlong))); // Fifth 26-bit limb
7079
7080 __ bind(DONE);
7081 __ pop_reg(saved_regs, sp);
7082 __ leave(); // Required for proper stackwalking
7083 __ ret();
7084
7085 return start;
7086 }
7087
7088 address generate_arrays_hashcode_powers_of_31() {
7089 assert(UseRVV, "sanity");
7090 const int lmul = 2;
7091 const int stride = MaxVectorSize / sizeof(jint) * lmul;
7092 __ align(CodeEntryAlignment);
7093 StubCodeMark mark(this, "StubRoutines", "arrays_hashcode_powers_of_31");
7094 address start = __ pc();
7095 for (int i = stride; i >= 0; i--) {
7096 jint power_of_31 = 1;
7097 for (int j = i; j > 0; j--) {
7098 power_of_31 = java_multiply(power_of_31, 31);
7099 }
7100 __ emit_int32(power_of_31);
7101 }
7102
7103 return start;
7104 }
7105
7106 #endif // COMPILER2
7107
7108 /**
7109 * Arguments:
7110 *
7111 * Inputs:
7112 * c_rarg0 - int crc
7113 * c_rarg1 - byte* buf
7114 * c_rarg2 - int length
7115 *
7116 * Output:
7117 * c_rarg0 - int crc result
7118 */
7119 address generate_updateBytesCRC32() {
7120 assert(UseCRC32Intrinsics, "what are we doing here?");
7121
7122 __ align(CodeEntryAlignment);
7123 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7124 StubCodeMark mark(this, stub_id);
7125
7126 address start = __ pc();
7127
7128 // input parameters
7129 const Register crc = c_rarg0; // crc
7130 const Register buf = c_rarg1; // source java byte array address
7131 const Register len = c_rarg2; // length
7132
7133 BLOCK_COMMENT("Entry:");
7134 __ enter(); // required for proper stackwalking of RuntimeStub frame
7135
7136 __ kernel_crc32(crc, buf, len,
7137 c_rarg3, c_rarg4, c_rarg5, c_rarg6, // tmp's for tables
7138 c_rarg7, t2, t3, t4, t5, t6); // misc tmps
7139
7140 __ leave(); // required for proper stackwalking of RuntimeStub frame
7141 __ ret();
7142
7143 return start;
7144 }
7145
7146 // exception handler for upcall stubs
7147 address generate_upcall_stub_exception_handler() {
7148 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
7149 StubCodeMark mark(this, stub_id);
7150 address start = __ pc();
7151
7152 // Native caller has no idea how to handle exceptions,
7153 // so we just crash here. Up to callee to catch exceptions.
7154 __ verify_oop(x10); // return a exception oop in a0
7155 __ rt_call(CAST_FROM_FN_PTR(address, UpcallLinker::handle_uncaught_exception));
7156 __ should_not_reach_here();
7157
7158 return start;
7159 }
7160
7161 // load Method* target of MethodHandle
7162 // j_rarg0 = jobject receiver
7163 // xmethod = Method* result
7164 address generate_upcall_stub_load_target() {
7165
7166 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
7167 StubCodeMark mark(this, stub_id);
7168 address start = __ pc();
7169
7170 __ resolve_global_jobject(j_rarg0, t0, t1);
7171 // Load target method from receiver
7172 __ load_heap_oop(xmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), t0, t1);
7173 __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_LambdaForm::vmentry_offset()), t0, t1);
7174 __ load_heap_oop(xmethod, Address(xmethod, java_lang_invoke_MemberName::method_offset()), t0, t1);
7175 __ access_load_at(T_ADDRESS, IN_HEAP, xmethod,
7176 Address(xmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
7177 noreg, noreg);
7178 __ sd(xmethod, Address(xthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
7179
7180 __ ret();
7181
7182 return start;
7183 }
7184
7185 #undef __
7186
7187 // Initialization
// Generator for the preuniverse stub blob. Intentionally empty.
void generate_preuniverse_stubs() {
  // preuniverse stubs are not needed for riscv
}
7191
7192 void generate_initial_stubs() {
7193 // Generate initial stubs and initializes the entry points
7194
7195 // entry points that exist in all platforms Note: This is code
7196 // that could be shared among different platforms - however the
7197 // benefit seems to be smaller than the disadvantage of having a
7198 // much more complicated generator structure. See also comment in
7199 // stubRoutines.hpp.
7200
7201 StubRoutines::_forward_exception_entry = generate_forward_exception();
7202
7203 if (UnsafeMemoryAccess::_table == nullptr) {
7204 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
7205 }
7206
7207 StubRoutines::_call_stub_entry =
7208 generate_call_stub(StubRoutines::_call_stub_return_address);
7209
7210 // is referenced by megamorphic call
7211 StubRoutines::_catch_exception_entry = generate_catch_exception();
7212
7213 if (UseCRC32Intrinsics) {
7214 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
7215 }
7216
7217 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
7218 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
7219 StubRoutines::_hf2f = generate_float16ToFloat();
7220 StubRoutines::_f2hf = generate_floatToFloat16();
7221 }
7222 }
7223
// Generates the continuation stubs (thaw, the two return barriers and the
// preempt stub) and records their entry points in StubRoutines.
void generate_continuation_stubs() {
  // Continuation stubs:
  StubRoutines::_cont_thaw = generate_cont_thaw();
  StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
  StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
  StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
}
7231
// Generates the final stub blob: the optional verify_oop stub, the arraycopy
// stubs, the method entry barrier, the secondary-supers lookup stubs
// (COMPILER2 builds only) and the upcall support stubs. The last step marks
// the riscv stub routines as completed.
void generate_final_stubs() {
  // support for verify_oop (must happen after universe_init)
  if (VerifyOops) {
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
  }

  // arraycopy stubs used by compilers
  generate_arraycopy_stubs();

  StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

#ifdef COMPILER2
  if (UseSecondarySupersTable) {
    StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
    // The lookup stub itself is only generated when the test is not inlined
    if (!InlineSecondarySupersTest) {
      generate_lookup_secondary_supers_table_stub();
    }
  }
#endif // COMPILER2

  StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
  StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

  StubRoutines::riscv::set_completed();
}
7257
// Generates the compiler stub blob. Every stub is guarded by the flag that
// enables the corresponding intrinsic; in builds without COMPILER2 nothing
// is generated.
void generate_compiler_stubs() {
#ifdef COMPILER2
  // BigInteger arithmetic
  if (UseMulAddIntrinsic) {
    StubRoutines::_mulAdd = generate_mulAdd();
  }

  if (UseMultiplyToLenIntrinsic) {
    StubRoutines::_multiplyToLen = generate_multiplyToLen();
  }

  if (UseSquareToLenIntrinsic) {
    StubRoutines::_squareToLen = generate_squareToLen();
  }

  if (UseMontgomeryMultiplyIntrinsic) {
    StubCodeMark mark(this, StubId::stubgen_montgomeryMultiply_id);
    MontgomeryMultiplyGenerator multiply_generator(_masm, /*squaring*/false);
    StubRoutines::_montgomeryMultiply = multiply_generator.generate_multiply();
  }

  if (UseMontgomerySquareIntrinsic) {
    StubCodeMark mark(this, StubId::stubgen_montgomerySquare_id);
    MontgomeryMultiplyGenerator square_generator(_masm, /*squaring*/true);
    StubRoutines::_montgomerySquare = square_generator.generate_square();
  }

  // AES, GHASH and Poly1305
  if (UseAESIntrinsics) {
    StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
    StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
    StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
    StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
  }

  if (UseAESCTRIntrinsics) {
    StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
  }

  if (UseGHASHIntrinsics) {
    StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
  }

  if (UsePoly1305Intrinsics) {
    StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
  }

  // Stubs that depend on the vector extension
  if (UseRVV) {
    StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
  }

  if (UseVectorizedHashCodeIntrinsic && UseRVV) {
    StubRoutines::riscv::_arrays_hashcode_powers_of_31 = generate_arrays_hashcode_powers_of_31();
  }

  // Message digests: each comes as a single-block and a multi-block (MB) stub
  if (UseSHA256Intrinsics) {
    Sha2Generator sha256_generator(_masm, this);
    StubRoutines::_sha256_implCompress = sha256_generator.generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
    StubRoutines::_sha256_implCompressMB = sha256_generator.generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
  }

  if (UseSHA512Intrinsics) {
    Sha2Generator sha512_generator(_masm, this);
    StubRoutines::_sha512_implCompress = sha512_generator.generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
    StubRoutines::_sha512_implCompressMB = sha512_generator.generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
  }

  if (UseMD5Intrinsics) {
    StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
    StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
  }

  if (UseChaCha20Intrinsics) {
    StubRoutines::_chacha20Block = generate_chacha20Block();
  }

  if (UseSHA1Intrinsics) {
    StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
    StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
  }

  // Base64 and Adler32
  if (UseBASE64Intrinsics) {
    StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
  }

  if (UseAdler32Intrinsics) {
    StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
  }

  // String stubs are generated unconditionally
  generate_compare_long_strings();

  generate_string_indexof_stubs();

#endif // COMPILER2
}
7355
7356 public:
7357 StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
7358 switch(blob_id) {
7359 case BlobId::stubgen_preuniverse_id:
7360 generate_preuniverse_stubs();
7361 break;
7362 case BlobId::stubgen_initial_id:
7363 generate_initial_stubs();
7364 break;
7365 case BlobId::stubgen_continuation_id:
7366 generate_continuation_stubs();
7367 break;
7368 case BlobId::stubgen_compiler_id:
7369 generate_compiler_stubs();
7370 break;
7371 case BlobId::stubgen_final_id:
7372 generate_final_stubs();
7373 break;
7374 default:
7375 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
7376 break;
7377 };
7378 }
7379 }; // end class declaration
7380
7381 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
7382 StubGenerator g(code, blob_id, stub_data);
7383 }