1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "compiler/oopMap.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/interpreter.hpp"
36 #include "memory/universe.hpp"
37 #include "nativeInst_aarch64.hpp"
38 #include "oops/instanceOop.hpp"
39 #include "oops/method.hpp"
40 #include "oops/objArrayKlass.hpp"
41 #include "oops/oop.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "prims/upcallLinker.hpp"
44 #include "runtime/arguments.hpp"
45 #include "runtime/atomicAccess.hpp"
46 #include "runtime/continuation.hpp"
47 #include "runtime/continuationEntry.inline.hpp"
48 #include "runtime/frame.inline.hpp"
49 #include "runtime/handles.inline.hpp"
50 #include "runtime/javaThread.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubCodeGenerator.hpp"
53 #include "runtime/stubRoutines.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/checkedCast.hpp"
56 #include "utilities/debug.hpp"
57 #include "utilities/globalDefinitions.hpp"
58 #include "utilities/intpow.hpp"
59 #include "utilities/powerOfTwo.hpp"
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_ZGC
64 #include "gc/z/zThreadLocalData.hpp"
65 #endif
66
67 // Declaration and definition of StubGenerator (no .hpp file).
68 // For a more detailed description of the stub routine structure
69 // see the comment in stubRoutines.hpp
70
71 #undef __
72 #define __ _masm->
73
74 #ifdef PRODUCT
75 #define BLOCK_COMMENT(str) /* nothing */
76 #else
77 #define BLOCK_COMMENT(str) __ block_comment(str)
78 #endif
79
80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
81
82 // Stub Code definitions
83
84 class StubGenerator: public StubCodeGenerator {
85 private:
86
87 #ifdef PRODUCT
88 #define inc_counter_np(counter) ((void)0)
89 #else
90 void inc_counter_np_(uint& counter) {
91 __ incrementw(ExternalAddress((address)&counter));
92 }
93 #define inc_counter_np(counter) \
94 BLOCK_COMMENT("inc_counter " #counter); \
95 inc_counter_np_(counter);
96 #endif
97
98 // Call stubs are used to call Java from C
99 //
100 // Arguments:
101 // c_rarg0: call wrapper address address
102 // c_rarg1: result address
103 // c_rarg2: result type BasicType
104 // c_rarg3: method Method*
105 // c_rarg4: (interpreter) entry point address
106 // c_rarg5: parameters intptr_t*
107 // c_rarg6: parameter size (in words) int
108 // c_rarg7: thread Thread*
109 //
110 // There is no return from the stub itself as any Java result
111 // is written to result
112 //
113 // we save r30 (lr) as the return PC at the base of the frame and
114 // link r29 (fp) below it as the frame pointer installing sp (r31)
115 // into fp.
116 //
117 // we save r0-r7, which accounts for all the c arguments.
118 //
119 // TODO: strictly do we need to save them all? they are treated as
120 // volatile by C so could we omit saving the ones we are going to
121 // place in global registers (thread? method?) or those we only use
122 // during setup of the Java call?
123 //
124 // we don't need to save r8 which C uses as an indirect result location
125 // return register.
126 //
127 // we don't need to save r9-r15 which both C and Java treat as
128 // volatile
129 //
130 // we don't need to save r16-18 because Java does not use them
131 //
132 // we save r19-r28 which Java uses as scratch registers and C
133 // expects to be callee-save
134 //
135 // we save the bottom 64 bits of each value stored in v8-v15; it is
136 // the responsibility of the caller to preserve larger values.
137 //
138 // so the stub frame looks like this when we enter Java code
139 //
140 // [ return_from_Java ] <--- sp
141 // [ argument word n ]
142 // ...
143 // -29 [ argument word 1 ]
144 // -28 [ saved Floating-point Control Register ]
145 // -26 [ saved v15 ] <--- sp_after_call
146 // -25 [ saved v14 ]
147 // -24 [ saved v13 ]
148 // -23 [ saved v12 ]
149 // -22 [ saved v11 ]
150 // -21 [ saved v10 ]
151 // -20 [ saved v9 ]
152 // -19 [ saved v8 ]
153 // -18 [ saved r28 ]
154 // -17 [ saved r27 ]
155 // -16 [ saved r26 ]
156 // -15 [ saved r25 ]
157 // -14 [ saved r24 ]
158 // -13 [ saved r23 ]
159 // -12 [ saved r22 ]
160 // -11 [ saved r21 ]
161 // -10 [ saved r20 ]
162 // -9 [ saved r19 ]
163 // -8 [ call wrapper (r0) ]
164 // -7 [ result (r1) ]
165 // -6 [ result type (r2) ]
166 // -5 [ method (r3) ]
167 // -4 [ entry point (r4) ]
168 // -3 [ parameters (r5) ]
169 // -2 [ parameter size (r6) ]
170 // -1 [ thread (r7) ]
171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
172 // 1 [ saved lr (r30) ]
173
174 // Call stub stack layout word offsets from fp
175 enum call_stub_layout {
176 sp_after_call_off = -28,
177
178 fpcr_off = sp_after_call_off,
179 d15_off = -26,
180 d13_off = -24,
181 d11_off = -22,
182 d9_off = -20,
183
184 r28_off = -18,
185 r26_off = -16,
186 r24_off = -14,
187 r22_off = -12,
188 r20_off = -10,
189 call_wrapper_off = -8,
190 result_off = -7,
191 result_type_off = -6,
192 method_off = -5,
193 entry_point_off = -4,
194 parameter_size_off = -2,
195 thread_off = -1,
196 fp_f = 0,
197 retaddr_off = 1,
198 };
199
200 address generate_call_stub(address& return_address) {
201 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
202 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
203 "adjust this code");
204
205 StubId stub_id = StubId::stubgen_call_stub_id;
206 StubCodeMark mark(this, stub_id);
207 address start = __ pc();
208
209 const Address sp_after_call (rfp, sp_after_call_off * wordSize);
210
211 const Address fpcr_save (rfp, fpcr_off * wordSize);
212 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
213 const Address result (rfp, result_off * wordSize);
214 const Address result_type (rfp, result_type_off * wordSize);
215 const Address method (rfp, method_off * wordSize);
216 const Address entry_point (rfp, entry_point_off * wordSize);
217 const Address parameter_size(rfp, parameter_size_off * wordSize);
218
219 const Address thread (rfp, thread_off * wordSize);
220
221 const Address d15_save (rfp, d15_off * wordSize);
222 const Address d13_save (rfp, d13_off * wordSize);
223 const Address d11_save (rfp, d11_off * wordSize);
224 const Address d9_save (rfp, d9_off * wordSize);
225
226 const Address r28_save (rfp, r28_off * wordSize);
227 const Address r26_save (rfp, r26_off * wordSize);
228 const Address r24_save (rfp, r24_off * wordSize);
229 const Address r22_save (rfp, r22_off * wordSize);
230 const Address r20_save (rfp, r20_off * wordSize);
231
232 // stub code
233
234 address aarch64_entry = __ pc();
235
236 // set up frame and move sp to end of save area
237 __ enter();
238 __ sub(sp, rfp, -sp_after_call_off * wordSize);
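    // sp now equals sp_after_call, i.e. it points at the FPCR save
    // slot at the bottom of the register save area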
239
240 // save register parameters and Java scratch/global registers
241 // n.b. we save thread even though it gets installed in
242 // rthread because we want to sanity check rthread later
243 __ str(c_rarg7, thread);
244 __ strw(c_rarg6, parameter_size);
245 __ stp(c_rarg4, c_rarg5, entry_point);
246 __ stp(c_rarg2, c_rarg3, result_type);
247 __ stp(c_rarg0, c_rarg1, call_wrapper);
248
249 __ stp(r20, r19, r20_save);
250 __ stp(r22, r21, r22_save);
251 __ stp(r24, r23, r24_save);
252 __ stp(r26, r25, r26_save);
253 __ stp(r28, r27, r28_save);
254
255 __ stpd(v9, v8, d9_save);
256 __ stpd(v11, v10, d11_save);
257 __ stpd(v13, v12, d13_save);
258 __ stpd(v15, v14, d15_save);
259
260 __ get_fpcr(rscratch1);
261 __ str(rscratch1, fpcr_save);
262 // Set FPCR to the state we need. We do want Round to Nearest. We
263 // don't want non-IEEE rounding modes or floating-point traps.
264 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
265 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
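    // (FPCR layout: RMode is bits 23:22, FZ is bit 24, DN is bit 25;
    // the trap enable bits IOE/DZE/OFE/UFE/IXE are bits 12:8.)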
266 __ set_fpcr(rscratch1);
267
268 // install Java thread in global register now we have saved
269 // whatever value it held
270 __ mov(rthread, c_rarg7);
271 // And method
272 __ mov(rmethod, c_rarg3);
273
274 // set up the heapbase register
275 __ reinit_heapbase();
276
277 #ifdef ASSERT
278 // make sure we have no pending exceptions
279 {
280 Label L;
281 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
282 __ cmp(rscratch1, (u1)NULL_WORD);
283 __ br(Assembler::EQ, L);
284 __ stop("StubRoutines::call_stub: entered with pending exception");
285 __ BIND(L);
286 }
287 #endif
288 // pass parameters if any
289 __ mov(esp, sp);
290 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
291 __ andr(sp, rscratch1, -2 * wordSize);
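    // n.b. the andr rounds sp down so it stays 16-byte aligned as the
    // AArch64 ABI requires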
292
293 BLOCK_COMMENT("pass parameters if any");
294 Label parameters_done;
295 // parameter count is still in c_rarg6
296 // and parameter pointer identifying param 1 is in c_rarg5
297 __ cbzw(c_rarg6, parameters_done);
298
299 address loop = __ pc();
300 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
301 __ subsw(c_rarg6, c_rarg6, 1);
302 __ push(rscratch1);
303 __ br(Assembler::GT, loop);
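    // n.b. the single-word pushes go through esp rather than sp (sp
    // must stay 16-byte aligned, which is why it was moved out of the
    // way above), so parameter 1 ends up at the highest address and the
    // last parameter at the lowest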
304
305 __ BIND(parameters_done);
306
    // call Java entry -- passing Method* and current sp
308 // rmethod: Method*
309 // r19_sender_sp: sender sp
310 BLOCK_COMMENT("call Java function");
311 __ mov(r19_sender_sp, sp);
312 __ blr(c_rarg4);
313
314 // we do this here because the notify will already have been done
315 // if we get to the next instruction via an exception
316 //
317 // n.b. adding this instruction here affects the calculation of
318 // whether or not a routine returns to the call stub (used when
319 // doing stack walks) since the normal test is to check the return
320 // pc against the address saved below. so we may need to allow for
321 // this extra instruction in the check.
322
323 // save current address for use by exception handling code
324
325 return_address = __ pc();
326
327 // store result depending on type (everything that is not
328 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
329 // n.b. this assumes Java returns an integral result in r0
330 // and a floating result in j_farg0
331 __ ldr(j_rarg2, result);
332 Label is_long, is_float, is_double, exit;
333 __ ldr(j_rarg1, result_type);
334 __ cmp(j_rarg1, (u1)T_OBJECT);
335 __ br(Assembler::EQ, is_long);
336 __ cmp(j_rarg1, (u1)T_LONG);
337 __ br(Assembler::EQ, is_long);
338 __ cmp(j_rarg1, (u1)T_FLOAT);
339 __ br(Assembler::EQ, is_float);
340 __ cmp(j_rarg1, (u1)T_DOUBLE);
341 __ br(Assembler::EQ, is_double);
342
343 // handle T_INT case
344 __ strw(r0, Address(j_rarg2));
345
346 __ BIND(exit);
347
348 // pop parameters
349 __ sub(esp, rfp, -sp_after_call_off * wordSize);
350
351 #ifdef ASSERT
352 // verify that threads correspond
353 {
354 Label L, S;
355 __ ldr(rscratch1, thread);
356 __ cmp(rthread, rscratch1);
357 __ br(Assembler::NE, S);
358 __ get_thread(rscratch1);
359 __ cmp(rthread, rscratch1);
360 __ br(Assembler::EQ, L);
361 __ BIND(S);
362 __ stop("StubRoutines::call_stub: threads must correspond");
363 __ BIND(L);
364 }
365 #endif
366
367 __ pop_cont_fastpath(rthread);
368
369 // restore callee-save registers
370 __ ldpd(v15, v14, d15_save);
371 __ ldpd(v13, v12, d13_save);
372 __ ldpd(v11, v10, d11_save);
373 __ ldpd(v9, v8, d9_save);
374
375 __ ldp(r28, r27, r28_save);
376 __ ldp(r26, r25, r26_save);
377 __ ldp(r24, r23, r24_save);
378 __ ldp(r22, r21, r22_save);
379 __ ldp(r20, r19, r20_save);
380
381 // restore fpcr
382 __ ldr(rscratch1, fpcr_save);
383 __ set_fpcr(rscratch1);
384
385 __ ldp(c_rarg0, c_rarg1, call_wrapper);
386 __ ldrw(c_rarg2, result_type);
387 __ ldr(c_rarg3, method);
388 __ ldp(c_rarg4, c_rarg5, entry_point);
389 __ ldp(c_rarg6, c_rarg7, parameter_size);
390
391 // leave frame and return to caller
392 __ leave();
393 __ ret(lr);
394
395 // handle return types different from T_INT
396
397 __ BIND(is_long);
398 __ str(r0, Address(j_rarg2, 0));
399 __ br(Assembler::AL, exit);
400
401 __ BIND(is_float);
402 __ strs(j_farg0, Address(j_rarg2, 0));
403 __ br(Assembler::AL, exit);
404
405 __ BIND(is_double);
406 __ strd(j_farg0, Address(j_rarg2, 0));
407 __ br(Assembler::AL, exit);
408
409 return start;
410 }
411
412 // Return point for a Java call if there's an exception thrown in
413 // Java code. The exception is caught and transformed into a
414 // pending exception stored in JavaThread that can be tested from
415 // within the VM.
416 //
417 // Note: Usually the parameters are removed by the callee. In case
418 // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up
  // the sp.
421 //
422 // r0: exception oop
423
424 address generate_catch_exception() {
425 StubId stub_id = StubId::stubgen_catch_exception_id;
426 StubCodeMark mark(this, stub_id);
427 address start = __ pc();
428
429 // same as in generate_call_stub():
430 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
431 const Address thread (rfp, thread_off * wordSize);
432
433 #ifdef ASSERT
434 // verify that threads correspond
435 {
436 Label L, S;
437 __ ldr(rscratch1, thread);
438 __ cmp(rthread, rscratch1);
439 __ br(Assembler::NE, S);
440 __ get_thread(rscratch1);
441 __ cmp(rthread, rscratch1);
442 __ br(Assembler::EQ, L);
443 __ bind(S);
444 __ stop("StubRoutines::catch_exception: threads must correspond");
445 __ bind(L);
446 }
447 #endif
448
449 // set pending exception
450 __ verify_oop(r0);
451
452 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
453 __ mov(rscratch1, (address)__FILE__);
454 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
455 __ movw(rscratch1, (int)__LINE__);
456 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
457
458 // complete return to VM
459 assert(StubRoutines::_call_stub_return_address != nullptr,
460 "_call_stub_return_address must have been generated before");
461 __ b(StubRoutines::_call_stub_return_address);
462
463 return start;
464 }
465
466 // Continuation point for runtime calls returning with a pending
467 // exception. The pending exception check happened in the runtime
468 // or native call stub. The pending exception in Thread is
469 // converted into a Java-level exception.
470 //
471 // Contract with Java-level exception handlers:
472 // r0: exception
473 // r3: throwing pc
474 //
475 // NOTE: At entry of this stub, exception-pc must be in LR !!
476
477 // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
479
480 address generate_forward_exception() {
481 StubId stub_id = StubId::stubgen_forward_exception_id;
482 StubCodeMark mark(this, stub_id);
483 address start = __ pc();
484
485 // Upon entry, LR points to the return address returning into
486 // Java (interpreted or compiled) code; i.e., the return address
487 // becomes the throwing pc.
488 //
489 // Arguments pushed before the runtime call are still on the stack
490 // but the exception handler will reset the stack pointer ->
491 // ignore them. A potential result in registers can be ignored as
492 // well.
493
494 #ifdef ASSERT
495 // make sure this code is only executed if there is a pending exception
496 {
497 Label L;
498 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
499 __ cbnz(rscratch1, L);
500 __ stop("StubRoutines::forward exception: no pending exception (1)");
501 __ bind(L);
502 }
503 #endif
504
505 // compute exception handler into r19
506
507 // call the VM to find the handler address associated with the
508 // caller address. pass thread in r0 and caller pc (ret address)
509 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
510 // the stack.
511 __ mov(c_rarg1, lr);
512 // lr will be trashed by the VM call so we move it to R19
513 // (callee-saved) because we also need to pass it to the handler
514 // returned by this call.
515 __ mov(r19, lr);
516 BLOCK_COMMENT("call exception_handler_for_return_address");
517 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
518 SharedRuntime::exception_handler_for_return_address),
519 rthread, c_rarg1);
520 // Reinitialize the ptrue predicate register, in case the external runtime
521 // call clobbers ptrue reg, as we may return to SVE compiled code.
522 __ reinitialize_ptrue();
523
524 // we should not really care that lr is no longer the callee
525 // address. we saved the value the handler needs in r19 so we can
526 // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
528 // the PC for the frame above the handler belongs to a compiled
529 // Java method. So, we restore lr here to satisfy that assert.
530 __ mov(lr, r19);
531 // setup r0 & r3 & clear pending exception
532 __ mov(r3, r19);
533 __ mov(r19, r0);
534 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
535 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
536
537 #ifdef ASSERT
538 // make sure exception is set
539 {
540 Label L;
541 __ cbnz(r0, L);
542 __ stop("StubRoutines::forward exception: no pending exception (2)");
543 __ bind(L);
544 }
545 #endif
546
547 // continue at exception handler
548 // r0: exception
549 // r3: throwing pc
550 // r19: exception handler
551 __ verify_oop(r0);
552 __ br(r19);
553
554 return start;
555 }
556
557 // Non-destructive plausibility checks for oops
558 //
559 // Arguments:
560 // r0: oop to verify
561 // rscratch1: error message
562 //
563 // Stack after saving c_rarg3:
564 // [tos + 0]: saved c_rarg3
565 // [tos + 1]: saved c_rarg2
566 // [tos + 2]: saved lr
567 // [tos + 3]: saved rscratch2
568 // [tos + 4]: saved r0
569 // [tos + 5]: saved rscratch1
570 address generate_verify_oop() {
571 StubId stub_id = StubId::stubgen_verify_oop_id;
572 StubCodeMark mark(this, stub_id);
573 address start = __ pc();
574
575 Label exit, error;
576
577 // save c_rarg2 and c_rarg3
578 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
579
580 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
581 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
582 __ ldr(c_rarg3, Address(c_rarg2));
583 __ add(c_rarg3, c_rarg3, 1);
584 __ str(c_rarg3, Address(c_rarg2));
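    // n.b. this load/increment/store is not atomic; the verify_oop
    // count is only a diagnostic statistic so lost updates don't matter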
585
586 // object is in r0
587 // make sure object is 'reasonable'
588 __ cbz(r0, exit); // if obj is null it is OK
589
590 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
591 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
592
593 // return if everything seems ok
594 __ bind(exit);
595
596 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
597 __ ret(lr);
598
599 // handle errors
600 __ bind(error);
601 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
602
603 __ push(RegSet::range(r0, r29), sp);
604 // debug(char* msg, int64_t pc, int64_t regs[])
605 __ mov(c_rarg0, rscratch1); // pass address of error message
606 __ mov(c_rarg1, lr); // pass return address
607 __ mov(c_rarg2, sp); // pass address of regs on stack
608 #ifndef PRODUCT
609 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
610 #endif
611 BLOCK_COMMENT("call MacroAssembler::debug");
612 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
613 __ blr(rscratch1);
614 __ hlt(0);
615
616 return start;
617 }
618
619 // Generate indices for iota vector.
620 address generate_iota_indices(StubId stub_id) {
621 __ align(CodeEntryAlignment);
622 StubCodeMark mark(this, stub_id);
623 address start = __ pc();
624 // B
625 __ emit_data64(0x0706050403020100, relocInfo::none);
626 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
627 // H
628 __ emit_data64(0x0003000200010000, relocInfo::none);
629 __ emit_data64(0x0007000600050004, relocInfo::none);
630 // S
631 __ emit_data64(0x0000000100000000, relocInfo::none);
632 __ emit_data64(0x0000000300000002, relocInfo::none);
633 // D
634 __ emit_data64(0x0000000000000000, relocInfo::none);
635 __ emit_data64(0x0000000000000001, relocInfo::none);
636 // S - FP
637 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
638 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
639 // D - FP
640 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
641 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
642 return start;
643 }
644
645 // The inner part of zero_words(). This is the bulk operation,
646 // zeroing words in blocks, possibly using DC ZVA to do it. The
647 // caller is responsible for zeroing the last few words.
648 //
649 // Inputs:
650 // r10: the HeapWord-aligned base address of an array to zero.
651 // r11: the count in HeapWords, r11 > 0.
652 //
653 // Returns r10 and r11, adjusted for the caller to clear.
654 // r10: the base address of the tail of words left to clear.
655 // r11: the number of words in the tail.
656 // r11 < MacroAssembler::zero_words_block_size.
657
658 address generate_zero_blocks() {
659 Label done;
660 Label base_aligned;
661
662 Register base = r10, cnt = r11;
663
664 __ align(CodeEntryAlignment);
665 StubId stub_id = StubId::stubgen_zero_blocks_id;
666 StubCodeMark mark(this, stub_id);
667 address start = __ pc();
668
669 if (UseBlockZeroing) {
670 int zva_length = VM_Version::zva_length();
671
672 // Ensure ZVA length can be divided by 16. This is required by
673 // the subsequent operations.
674 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
675
676 __ tbz(base, 3, base_aligned);
677 __ str(zr, Address(__ post(base, 8)));
678 __ sub(cnt, cnt, 1);
679 __ bind(base_aligned);
680
681 // Ensure count >= zva_length * 2 so that it still deserves a zva after
682 // alignment.
683 Label small;
684 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
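      // low_limit is a byte count while cnt is in (8-byte) words, hence
      // the >> 3 in the comparison below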
685 __ subs(rscratch1, cnt, low_limit >> 3);
686 __ br(Assembler::LT, small);
687 __ zero_dcache_blocks(base, cnt);
688 __ bind(small);
689 }
690
691 {
692 // Number of stp instructions we'll unroll
693 const int unroll =
694 MacroAssembler::zero_words_block_size / 2;
695 // Clear the remaining blocks.
696 Label loop;
697 __ subs(cnt, cnt, unroll * 2);
698 __ br(Assembler::LT, done);
699 __ bind(loop);
700 for (int i = 0; i < unroll; i++)
701 __ stp(zr, zr, __ post(base, 16));
702 __ subs(cnt, cnt, unroll * 2);
703 __ br(Assembler::GE, loop);
704 __ bind(done);
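      // the final subs left cnt short by unroll * 2, so add that back
      // to leave the true residual word count for the caller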
705 __ add(cnt, cnt, unroll * 2);
706 }
707
708 __ ret(lr);
709
710 return start;
711 }
712
713
714 typedef enum {
715 copy_forwards = 1,
716 copy_backwards = -1
717 } copy_direction;
718
719 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
720 // for arraycopy stubs.
721 class ArrayCopyBarrierSetHelper : StackObj {
722 BarrierSetAssembler* _bs_asm;
723 MacroAssembler* _masm;
724 DecoratorSet _decorators;
725 BasicType _type;
726 Register _gct1;
727 Register _gct2;
728 Register _gct3;
729 FloatRegister _gcvt1;
730 FloatRegister _gcvt2;
731 FloatRegister _gcvt3;
732
733 public:
734 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
735 DecoratorSet decorators,
736 BasicType type,
737 Register gct1,
738 Register gct2,
739 Register gct3,
740 FloatRegister gcvt1,
741 FloatRegister gcvt2,
742 FloatRegister gcvt3)
743 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
744 _masm(masm),
745 _decorators(decorators),
746 _type(type),
747 _gct1(gct1),
748 _gct2(gct2),
749 _gct3(gct3),
750 _gcvt1(gcvt1),
751 _gcvt2(gcvt2),
752 _gcvt3(gcvt3) {
753 }
754
755 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
756 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
757 dst1, dst2, src,
758 _gct1, _gct2, _gcvt1);
759 }
760
761 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
762 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
763 dst, src1, src2,
764 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
765 }
766
767 void copy_load_at_16(Register dst1, Register dst2, Address src) {
768 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
769 dst1, dst2, src,
770 _gct1);
771 }
772
773 void copy_store_at_16(Address dst, Register src1, Register src2) {
774 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
775 dst, src1, src2,
776 _gct1, _gct2, _gct3);
777 }
778
779 void copy_load_at_8(Register dst, Address src) {
780 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
781 dst, noreg, src,
782 _gct1);
783 }
784
785 void copy_store_at_8(Address dst, Register src) {
786 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
787 dst, src, noreg,
788 _gct1, _gct2, _gct3);
789 }
790 };
791
792 // Bulk copy of blocks of 8 words.
793 //
794 // count is a count of words.
795 //
796 // Precondition: count >= 8
797 //
798 // Postconditions:
799 //
800 // The least significant bit of count contains the remaining count
801 // of words to copy. The rest of count is trash.
802 //
803 // s and d are adjusted to point to the remaining words to copy
804 //
805 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
806 BasicType type;
807 copy_direction direction;
808
809 switch (stub_id) {
810 case StubId::stubgen_copy_byte_f_id:
811 direction = copy_forwards;
812 type = T_BYTE;
813 break;
814 case StubId::stubgen_copy_byte_b_id:
815 direction = copy_backwards;
816 type = T_BYTE;
817 break;
818 case StubId::stubgen_copy_oop_f_id:
819 direction = copy_forwards;
820 type = T_OBJECT;
821 break;
822 case StubId::stubgen_copy_oop_b_id:
823 direction = copy_backwards;
824 type = T_OBJECT;
825 break;
826 case StubId::stubgen_copy_oop_uninit_f_id:
827 direction = copy_forwards;
828 type = T_OBJECT;
829 break;
830 case StubId::stubgen_copy_oop_uninit_b_id:
831 direction = copy_backwards;
832 type = T_OBJECT;
833 break;
834 default:
835 ShouldNotReachHere();
836 }
837
838 int unit = wordSize * direction;
839 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
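    // bias pre-adjusts s and d (forwards copy only) so the main loop can
    // address each 64 byte block with positive offsets and a single
    // pre-indexed update: 4 words with 32-byte SIMD accesses, 2 otherwise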
840
841 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
842 t4 = r7, t5 = r11, t6 = r12, t7 = r13;
843 const Register stride = r14;
844 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
845 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
846 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
847
848 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
849 assert_different_registers(s, d, count, rscratch1, rscratch2);
850
851 Label again, drain;
852
853 __ align(CodeEntryAlignment);
854
855 StubCodeMark mark(this, stub_id);
856
857 address start = __ pc();
858
859 Label unaligned_copy_long;
860 if (AvoidUnalignedAccesses) {
861 __ tbnz(d, 3, unaligned_copy_long);
862 }
863
864 if (direction == copy_forwards) {
865 __ sub(s, s, bias);
866 __ sub(d, d, bias);
867 }
868
869 #ifdef ASSERT
870 // Make sure we are never given < 8 words
871 {
872 Label L;
873 __ cmp(count, (u1)8);
874 __ br(Assembler::GE, L);
875 __ stop("genrate_copy_longs called with < 8 words");
876 __ bind(L);
877 }
878 #endif
879
880 // Fill 8 registers
881 if (UseSIMDForMemoryOps) {
882 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
883 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
884 } else {
885 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
886 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
887 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
888 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
889 }
890
891 __ subs(count, count, 16);
892 __ br(Assembler::LO, drain);
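    // 8 words are already loaded; if the total count is below 16, skip
    // the main loop and just drain the loaded registers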
893
894 int prefetch = PrefetchCopyIntervalInBytes;
895 bool use_stride = false;
896 if (direction == copy_backwards) {
897 use_stride = prefetch > 256;
898 prefetch = -prefetch;
899 if (use_stride) __ mov(stride, prefetch);
900 }
901
902 __ bind(again);
903
904 if (PrefetchCopyIntervalInBytes > 0)
905 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
906
907 if (UseSIMDForMemoryOps) {
908 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
909 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
910 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
911 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
912 } else {
913 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
914 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
915 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
916 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
917 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
918 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
919 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
920 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
921 }
922
923 __ subs(count, count, 8);
924 __ br(Assembler::HS, again);
925
926 // Drain
927 __ bind(drain);
928 if (UseSIMDForMemoryOps) {
929 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
930 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
931 } else {
932 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
933 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
934 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
935 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
936 }
937
938 {
939 Label L1, L2;
940 __ tbz(count, exact_log2(4), L1);
941 if (UseSIMDForMemoryOps) {
942 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
943 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
944 } else {
945 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
946 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
947 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
948 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
949 }
950 __ bind(L1);
951
952 if (direction == copy_forwards) {
953 __ add(s, s, bias);
954 __ add(d, d, bias);
955 }
956
957 __ tbz(count, 1, L2);
958 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
959 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
960 __ bind(L2);
961 }
962
963 __ ret(lr);
964
965 if (AvoidUnalignedAccesses) {
966 Label drain, again;
967 // Register order for storing. Order is different for backward copy.
968
969 __ bind(unaligned_copy_long);
970
971 // source address is even aligned, target odd aligned
972 //
973 // when forward copying word pairs we read long pairs at offsets
974 // {0, 2, 4, 6} (in long words). when backwards copying we read
975 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
976 // address by -2 in the forwards case so we can compute the
977 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
978 // or -1.
979 //
980 // when forward copying we need to store 1 word, 3 pairs and
981 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
984 //
      // When backwards copying we need to store 1 word, 3 pairs and
986 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
987 // offsets {1, 3, 5, 7, 8} * unit.
988
989 if (direction == copy_forwards) {
990 __ sub(s, s, 16);
991 __ sub(d, d, 8);
992 }
993
994 // Fill 8 registers
995 //
996 // for forwards copy s was offset by -16 from the original input
997 // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
999 // and so on for each successive 64 byte block when s is updated
1000 //
1001 // t0 at offset 0, t1 at offset 8
1002 // t2 at offset 16, t3 at offset 24
1003 // t4 at offset 32, t5 at offset 40
1004 // t6 at offset 48, t7 at offset 56
1005
1006 // for backwards copy s was not offset so the register contents
1007 // are at these offsets into the preceding 64 byte block
1008 // relative to that original input and so on for each successive
1009 // preceding 64 byte block when s is updated. this explains the
1010 // slightly counter-intuitive looking pattern of register usage
1011 // in the stp instructions for backwards copy.
1012 //
1013 // t0 at offset -16, t1 at offset -8
1014 // t2 at offset -32, t3 at offset -24
1015 // t4 at offset -48, t5 at offset -40
1016 // t6 at offset -64, t7 at offset -56
1017
1018 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1019 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1020 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1021 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1022
1023 __ subs(count, count, 16);
1024 __ br(Assembler::LO, drain);
1025
1026 int prefetch = PrefetchCopyIntervalInBytes;
1027 bool use_stride = false;
1028 if (direction == copy_backwards) {
1029 use_stride = prefetch > 256;
1030 prefetch = -prefetch;
1031 if (use_stride) __ mov(stride, prefetch);
1032 }
1033
1034 __ bind(again);
1035
1036 if (PrefetchCopyIntervalInBytes > 0)
1037 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1038
1039 if (direction == copy_forwards) {
1040 // allowing for the offset of -8 the store instructions place
        // registers into the target 64 byte block at the following
1042 // offsets
1043 //
1044 // t0 at offset 0
1045 // t1 at offset 8, t2 at offset 16
1046 // t3 at offset 24, t4 at offset 32
1047 // t5 at offset 40, t6 at offset 48
1048 // t7 at offset 56
1049
1050 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1051 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1052 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1053 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1054 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1055 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1056 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1057 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1058 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1059 } else {
1060 // d was not offset when we started so the registers are
        // written into the 64 byte block preceding d with the following
1062 // offsets
1063 //
1064 // t1 at offset -8
1065 // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
1067 // t7 at offset -56, t4 at offset -48
1068 // t6 at offset -64
1069 //
1070 // note that this matches the offsets previously noted for the
1071 // loads
1072
1073 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1074 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1075 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1076 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1077 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1078 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1079 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1080 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1081 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1082 }
1083
1084 __ subs(count, count, 8);
1085 __ br(Assembler::HS, again);
1086
1087 // Drain
1088 //
1089 // this uses the same pattern of offsets and register arguments
1090 // as above
1091 __ bind(drain);
1092 if (direction == copy_forwards) {
1093 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1094 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1095 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1096 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1097 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1098 } else {
1099 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1100 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1101 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1102 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1103 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1104 }
1105 // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
1107 // bits 2 and 1 in the count are the tell-tale for whether we
1108 // have each such subblock
1109 {
1110 Label L1, L2;
1111 __ tbz(count, exact_log2(4), L1);
1112 // this is the same as above but copying only 4 longs hence
1113 // with only one intervening stp between the str instructions
1114 // but note that the offsets and registers still follow the
1115 // same pattern
1116 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1117 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1118 if (direction == copy_forwards) {
1119 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1120 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1121 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1122 } else {
1123 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1124 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1125 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1126 }
1127 __ bind(L1);
1128
1129 __ tbz(count, 1, L2);
1130 // this is the same as above but copying only 2 longs hence
1131 // there is no intervening stp between the str instructions
1132 // but note that the offset and register patterns are still
1133 // the same
1134 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1135 if (direction == copy_forwards) {
1136 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1137 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1138 } else {
1139 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1140 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1141 }
1142 __ bind(L2);
1143
1144 // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written
1146
1147 if (direction == copy_forwards) {
1148 __ add(s, s, 16);
1149 __ add(d, d, 8);
1150 }
1151
1152 }
1153
1154 __ ret(lr);
1155 }
1156
1157 return start;
1158 }
1159
1160 // Small copy: less than 16 bytes.
1161 //
1162 // NB: Ignores all of the bits of count which represent more than 15
1163 // bytes, so a caller doesn't have to mask them.
1164
1165 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1166 bool is_backwards = step < 0;
1167 size_t granularity = g_uabs(step);
1168 int direction = is_backwards ? -1 : 1;
1169
1170 Label Lword, Lint, Lshort, Lbyte;
1171
1172 assert(granularity
1173 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1174
1175 const Register t0 = r3;
1176 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1177 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1178
1179 // ??? I don't know if this bit-test-and-branch is the right thing
1180 // to do. It does a lot of jumping, resulting in several
1181 // mispredicted branches. It might make more sense to do this
1182 // with something like Duff's device with a single computed branch.
1183
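    // each tbz below tests the bit of count (an element count) that
    // corresponds to an 8, 4, 2 or 1 byte chunk and copies that chunk
    // if it is present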
1184 __ tbz(count, 3 - exact_log2(granularity), Lword);
1185 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1186 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1187 __ bind(Lword);
1188
1189 if (granularity <= sizeof (jint)) {
1190 __ tbz(count, 2 - exact_log2(granularity), Lint);
1191 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1192 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1193 __ bind(Lint);
1194 }
1195
1196 if (granularity <= sizeof (jshort)) {
1197 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1198 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1199 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1200 __ bind(Lshort);
1201 }
1202
1203 if (granularity <= sizeof (jbyte)) {
1204 __ tbz(count, 0, Lbyte);
1205 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1206 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1207 __ bind(Lbyte);
1208 }
1209 }
1210
1211 // All-singing all-dancing memory copy.
1212 //
1213 // Copy count units of memory from s to d. The size of a unit is
1214 // step, which can be positive or negative depending on the direction
1215 // of copy. If is_aligned is false, we align the source address.
1216 //
1217
1218 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1219 Register s, Register d, Register count, int step) {
1220 copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1221 bool is_backwards = step < 0;
1222 unsigned int granularity = g_uabs(step);
1223 const Register t0 = r3, t1 = r4;
1224
    // copies of <= 80 (or 96 with SIMD) bytes are done inline. Direction
    // doesn't matter because we always load all the data before writing
    // anything
1227 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1228 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1229 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1230 const Register send = r17, dend = r16;
1231 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1232 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1233 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1234
1235 if (PrefetchCopyIntervalInBytes > 0)
1236 __ prfm(Address(s, 0), PLDL1KEEP);
1237 __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1238 __ br(Assembler::HI, copy_big);
1239
1240 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1241 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
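    // send/dend point just past the end of the source/destination. The
    // cases below copy a leading chunk from s/d and a trailing chunk
    // ending at send/dend; the chunks may overlap in the middle, which
    // is safe because all loads are issued before any stores.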
1242
1243 __ cmp(count, u1(16/granularity));
1244 __ br(Assembler::LS, copy16);
1245
1246 __ cmp(count, u1(64/granularity));
1247 __ br(Assembler::HI, copy80);
1248
1249 __ cmp(count, u1(32/granularity));
1250 __ br(Assembler::LS, copy32);
1251
1252 // 33..64 bytes
1253 if (UseSIMDForMemoryOps) {
1254 bs.copy_load_at_32(v0, v1, Address(s, 0));
1255 bs.copy_load_at_32(v2, v3, Address(send, -32));
1256 bs.copy_store_at_32(Address(d, 0), v0, v1);
1257 bs.copy_store_at_32(Address(dend, -32), v2, v3);
1258 } else {
1259 bs.copy_load_at_16(t0, t1, Address(s, 0));
1260 bs.copy_load_at_16(t2, t3, Address(s, 16));
1261 bs.copy_load_at_16(t4, t5, Address(send, -32));
1262 bs.copy_load_at_16(t6, t7, Address(send, -16));
1263
1264 bs.copy_store_at_16(Address(d, 0), t0, t1);
1265 bs.copy_store_at_16(Address(d, 16), t2, t3);
1266 bs.copy_store_at_16(Address(dend, -32), t4, t5);
1267 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1268 }
1269 __ b(finish);
1270
1271 // 17..32 bytes
1272 __ bind(copy32);
1273 bs.copy_load_at_16(t0, t1, Address(s, 0));
1274 bs.copy_load_at_16(t6, t7, Address(send, -16));
1275
1276 bs.copy_store_at_16(Address(d, 0), t0, t1);
1277 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1278 __ b(finish);
1279
1280 // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1282 __ bind(copy80);
1283 if (UseSIMDForMemoryOps) {
1284 bs.copy_load_at_32(v0, v1, Address(s, 0));
1285 bs.copy_load_at_32(v2, v3, Address(s, 32));
1286 // Unaligned pointers can be an issue for copying.
      // The issue is more likely to occur when the granularity of the data
      // is less than 4 (sizeof(jint)). Pointers for arrays of jint are at
      // least 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The largest performance drop has been seen for the range 65-80 bytes.
1291 // For such cases using the pair of ldp/stp instead of the third pair of
1292 // ldpq/stpq fixes the performance issue.
1293 if (granularity < sizeof (jint)) {
1294 Label copy96;
1295 __ cmp(count, u1(80/granularity));
1296 __ br(Assembler::HI, copy96);
1297 bs.copy_load_at_16(t0, t1, Address(send, -16));
1298
1299 bs.copy_store_at_32(Address(d, 0), v0, v1);
1300 bs.copy_store_at_32(Address(d, 32), v2, v3);
1301
1302 bs.copy_store_at_16(Address(dend, -16), t0, t1);
1303 __ b(finish);
1304
1305 __ bind(copy96);
1306 }
1307 bs.copy_load_at_32(v4, v5, Address(send, -32));
1308
1309 bs.copy_store_at_32(Address(d, 0), v0, v1);
1310 bs.copy_store_at_32(Address(d, 32), v2, v3);
1311
1312 bs.copy_store_at_32(Address(dend, -32), v4, v5);
1313 } else {
1314 bs.copy_load_at_16(t0, t1, Address(s, 0));
1315 bs.copy_load_at_16(t2, t3, Address(s, 16));
1316 bs.copy_load_at_16(t4, t5, Address(s, 32));
1317 bs.copy_load_at_16(t6, t7, Address(s, 48));
1318 bs.copy_load_at_16(t8, t9, Address(send, -16));
1319
1320 bs.copy_store_at_16(Address(d, 0), t0, t1);
1321 bs.copy_store_at_16(Address(d, 16), t2, t3);
1322 bs.copy_store_at_16(Address(d, 32), t4, t5);
1323 bs.copy_store_at_16(Address(d, 48), t6, t7);
1324 bs.copy_store_at_16(Address(dend, -16), t8, t9);
1325 }
1326 __ b(finish);
1327
1328 // 0..16 bytes
1329 __ bind(copy16);
1330 __ cmp(count, u1(8/granularity));
1331 __ br(Assembler::LO, copy8);
1332
1333 // 8..16 bytes
1334 bs.copy_load_at_8(t0, Address(s, 0));
1335 bs.copy_load_at_8(t1, Address(send, -8));
1336 bs.copy_store_at_8(Address(d, 0), t0);
1337 bs.copy_store_at_8(Address(dend, -8), t1);
1338 __ b(finish);
1339
1340 if (granularity < 8) {
1341 // 4..7 bytes
1342 __ bind(copy8);
1343 __ tbz(count, 2 - exact_log2(granularity), copy4);
1344 __ ldrw(t0, Address(s, 0));
1345 __ ldrw(t1, Address(send, -4));
1346 __ strw(t0, Address(d, 0));
1347 __ strw(t1, Address(dend, -4));
1348 __ b(finish);
1349 if (granularity < 4) {
1350 // 0..3 bytes
1351 __ bind(copy4);
1352 __ cbz(count, finish); // get rid of 0 case
1353 if (granularity == 2) {
1354 __ ldrh(t0, Address(s, 0));
1355 __ strh(t0, Address(d, 0));
1356 } else { // granularity == 1
1357 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1358 // the first and last byte.
1359 // Handle the 3 byte case by loading and storing base + count/2
1360 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the same
1362 // byte 3 times.
1363 __ lsr(count, count, 1);
1364 __ ldrb(t0, Address(s, 0));
1365 __ ldrb(t1, Address(send, -1));
1366 __ ldrb(t2, Address(s, count));
1367 __ strb(t0, Address(d, 0));
1368 __ strb(t1, Address(dend, -1));
1369 __ strb(t2, Address(d, count));
1370 }
1371 __ b(finish);
1372 }
1373 }
1374
1375 __ bind(copy_big);
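    // for a backwards copy step is negative, so advance s and d to one
    // element past the end of the data; the copy then proceeds downwards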
1376 if (is_backwards) {
1377 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1378 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1379 }
1380
1381 // Now we've got the small case out of the way we can align the
1382 // source address on a 2-word boundary.
1383
1384 // Here we will materialize a count in r15, which is used by copy_memory_small
1385 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1386 // Up until here, we have used t9, which aliases r15, but from here on, that register
1387 // can not be used as a temp register, as it contains the count.
1388
1389 Label aligned;
1390
1391 if (is_aligned) {
1392 // We may have to adjust by 1 word to get s 2-word-aligned.
1393 __ tbz(s, exact_log2(wordSize), aligned);
1394 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1395 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1396 __ sub(count, count, wordSize/granularity);
1397 } else {
1398 if (is_backwards) {
1399 __ andr(r15, s, 2 * wordSize - 1);
1400 } else {
1401 __ neg(r15, s);
1402 __ andr(r15, r15, 2 * wordSize - 1);
1403 }
1404 // r15 is the byte adjustment needed to align s.
1405 __ cbz(r15, aligned);
1406 int shift = exact_log2(granularity);
1407 if (shift > 0) {
1408 __ lsr(r15, r15, shift);
1409 }
1410 __ sub(count, count, r15);
1411
1412 #if 0
1413 // ?? This code is only correct for a disjoint copy. It may or
1414 // may not make sense to use it in that case.
1415
1416 // Copy the first pair; s and d may not be aligned.
1417 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1418 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1419
1420 // Align s and d, adjust count
1421 if (is_backwards) {
1422 __ sub(s, s, r15);
1423 __ sub(d, d, r15);
1424 } else {
1425 __ add(s, s, r15);
1426 __ add(d, d, r15);
1427 }
1428 #else
1429 copy_memory_small(decorators, type, s, d, r15, step);
1430 #endif
1431 }
1432
1433 __ bind(aligned);
1434
1435 // s is now 2-word-aligned.
1436
1437 // We have a count of units and some trailing bytes. Adjust the
1438 // count and do a bulk copy of words. If the shift is zero
1439 // perform a move instead to benefit from zero latency moves.
1440 int shift = exact_log2(wordSize/granularity);
1441 if (shift > 0) {
1442 __ lsr(r15, count, shift);
1443 } else {
1444 __ mov(r15, count);
1445 }
1446 if (direction == copy_forwards) {
1447 if (type != T_OBJECT) {
1448 __ bl(StubRoutines::aarch64::copy_byte_f());
1449 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1450 __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
1451 } else {
1452 __ bl(StubRoutines::aarch64::copy_oop_f());
1453 }
1454 } else {
1455 if (type != T_OBJECT) {
1456 __ bl(StubRoutines::aarch64::copy_byte_b());
1457 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1458 __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
1459 } else {
1460 __ bl(StubRoutines::aarch64::copy_oop_b());
1461 }
1462 }
1463
1464 // And the tail.
1465 copy_memory_small(decorators, type, s, d, count, step);
1466
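    // when the element size is 4 or 8 bytes the copy4 (and, for 8 byte
    // elements, copy8) cases above are only reached with nothing left to
    // copy, so bind the labels here to fall straight through to finish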
1467 if (granularity >= 8) __ bind(copy8);
1468 if (granularity >= 4) __ bind(copy4);
1469 __ bind(finish);
1470 }
1471
1472
1473 void clobber_registers() {
1474 #ifdef ASSERT
1475 RegSet clobbered
1476 = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1477 __ mov(rscratch1, (uint64_t)0xdeadbeef);
1478 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1479 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1480 __ mov(*it, rscratch1);
1481 }
1482 #endif
1483
1484 }
1485
1486 // Scan over array at a for count oops, verifying each one.
1487 // Preserves a and count, clobbers rscratch1 and rscratch2.
1488 void verify_oop_array (int size, Register a, Register count, Register temp) {
1489 Label loop, end;
1490 __ mov(rscratch1, a);
1491 __ mov(rscratch2, zr);
1492 __ bind(loop);
1493 __ cmp(rscratch2, count);
1494 __ br(Assembler::HS, end);
1495 if (size == wordSize) {
1496 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1497 __ verify_oop(temp);
1498 } else {
1499 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1500 __ decode_heap_oop(temp); // calls verify_oop
1501 }
1502 __ add(rscratch2, rscratch2, 1);
1503 __ b(loop);
1504 __ bind(end);
1505 }
1506
1507 // Arguments:
1508 // stub_id - is used to name the stub and identify all details of
1509 // how to perform the copy.
1510 //
1511 // entry - is assigned to the stub's post push entry point unless
1512 // it is null
1513 //
1514 // Inputs:
1515 // c_rarg0 - source array address
1516 // c_rarg1 - destination array address
1517 // c_rarg2 - element count, treated as ssize_t, can be zero
1518 //
1519 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1520 // the hardware handle it. The two dwords within qwords that span
1521 // cache line boundaries will still be loaded and stored atomically.
1522 //
1523 // Side Effects: nopush_entry is set to the (post push) entry point
1524 // so it can be used by the corresponding conjoint
1525 // copy method
1526 //
1527 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
1528 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1529 RegSet saved_reg = RegSet::of(s, d, count);
1530 int size;
1531 bool aligned;
1532 bool is_oop;
1533 bool dest_uninitialized;
1534 switch (stub_id) {
1535 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1536 size = sizeof(jbyte);
1537 aligned = false;
1538 is_oop = false;
1539 dest_uninitialized = false;
1540 break;
1541 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1542 size = sizeof(jbyte);
1543 aligned = true;
1544 is_oop = false;
1545 dest_uninitialized = false;
1546 break;
1547 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1548 size = sizeof(jshort);
1549 aligned = false;
1550 is_oop = false;
1551 dest_uninitialized = false;
1552 break;
1553 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1554 size = sizeof(jshort);
1555 aligned = true;
1556 is_oop = false;
1557 dest_uninitialized = false;
1558 break;
1559 case StubId::stubgen_jint_disjoint_arraycopy_id:
1560 size = sizeof(jint);
1561 aligned = false;
1562 is_oop = false;
1563 dest_uninitialized = false;
1564 break;
1565 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1566 size = sizeof(jint);
1567 aligned = true;
1568 is_oop = false;
1569 dest_uninitialized = false;
1570 break;
1571 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1572 // since this is always aligned we can (should!) use the same
1573 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
1574 ShouldNotReachHere();
1575 break;
1576 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1577 size = sizeof(jlong);
1578 aligned = true;
1579 is_oop = false;
1580 dest_uninitialized = false;
1581 break;
1582 case StubId::stubgen_oop_disjoint_arraycopy_id:
1583 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1584 aligned = !UseCompressedOops;
1585 is_oop = true;
1586 dest_uninitialized = false;
1587 break;
1588 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1589 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1590 aligned = !UseCompressedOops;
1591 is_oop = true;
1592 dest_uninitialized = false;
1593 break;
1594 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1595 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1596 aligned = !UseCompressedOops;
1597 is_oop = true;
1598 dest_uninitialized = true;
1599 break;
1600 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1601 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1602 aligned = !UseCompressedOops;
1603 is_oop = true;
1604 dest_uninitialized = true;
1605 break;
1606 default:
1607 ShouldNotReachHere();
1608 break;
1609 }
1610
1611 __ align(CodeEntryAlignment);
1612 StubCodeMark mark(this, stub_id);
1613 address start = __ pc();
1614 __ enter();
1615
1616 if (nopush_entry != nullptr) {
1617 *nopush_entry = __ pc();
1618 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1619 BLOCK_COMMENT("Entry:");
1620 }
1621
1622 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1623 if (dest_uninitialized) {
1624 decorators |= IS_DEST_UNINITIALIZED;
1625 }
1626 if (aligned) {
1627 decorators |= ARRAYCOPY_ALIGNED;
1628 }
1629
1630 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1631 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1632
1633 if (is_oop) {
1634 // save regs before copy_memory
1635 __ push(RegSet::of(d, count), sp);
1636 }
1637 {
1638 // UnsafeMemoryAccess page error: continue after unsafe access
1639 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1640 UnsafeMemoryAccessMark umam(this, add_entry, true);
1641 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1642 }
1643
1644 if (is_oop) {
1645 __ pop(RegSet::of(d, count), sp);
1646 if (VerifyOops)
1647 verify_oop_array(size, d, count, r16);
1648 }
1649
1650 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1651
1652 __ leave();
1653 __ mov(r0, zr); // return 0
1654 __ ret(lr);
1655 return start;
1656 }
1657
1658 // Arguments:
1659 // stub_id - is used to name the stub and identify all details of
1660 // how to perform the copy.
1661 //
1662 // nooverlap_target - identifes the (post push) entry for the
1663 // corresponding disjoint copy routine which can be
1664 // jumped to if the ranges do not actually overlap
1665 //
1666 // entry - is assigned to the stub's post push entry point unless
1667 // it is null
1668 //
1669 //
1670 // Inputs:
1671 // c_rarg0 - source array address
1672 // c_rarg1 - destination array address
1673 // c_rarg2 - element count, treated as ssize_t, can be zero
1674 //
1675 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1676 // the hardware handle it. The two dwords within qwords that span
1677 // cache line boundaries will still be loaded and stored atomically.
1678 //
1679 // Side Effects:
1680 // nopush_entry is set to the no-overlap entry point so it can be
1681 // used by some other conjoint copy method
1682 //
1683 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1684 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1685 RegSet saved_regs = RegSet::of(s, d, count);
1686 int size;
1687 bool aligned;
1688 bool is_oop;
1689 bool dest_uninitialized;
1690 switch (stub_id) {
1691 case StubId::stubgen_jbyte_arraycopy_id:
1692 size = sizeof(jbyte);
1693 aligned = false;
1694 is_oop = false;
1695 dest_uninitialized = false;
1696 break;
1697 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1698 size = sizeof(jbyte);
1699 aligned = true;
1700 is_oop = false;
1701 dest_uninitialized = false;
1702 break;
1703 case StubId::stubgen_jshort_arraycopy_id:
1704 size = sizeof(jshort);
1705 aligned = false;
1706 is_oop = false;
1707 dest_uninitialized = false;
1708 break;
1709 case StubId::stubgen_arrayof_jshort_arraycopy_id:
1710 size = sizeof(jshort);
1711 aligned = true;
1712 is_oop = false;
1713 dest_uninitialized = false;
1714 break;
1715 case StubId::stubgen_jint_arraycopy_id:
1716 size = sizeof(jint);
1717 aligned = false;
1718 is_oop = false;
1719 dest_uninitialized = false;
1720 break;
1721 case StubId::stubgen_arrayof_jint_arraycopy_id:
1722 size = sizeof(jint);
1723 aligned = true;
1724 is_oop = false;
1725 dest_uninitialized = false;
1726 break;
1727 case StubId::stubgen_jlong_arraycopy_id:
1728 // since this is always aligned we can (should!) use the same
1729 // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
1730 ShouldNotReachHere();
1731 break;
1732 case StubId::stubgen_arrayof_jlong_arraycopy_id:
1733 size = sizeof(jlong);
1734 aligned = true;
1735 is_oop = false;
1736 dest_uninitialized = false;
1737 break;
1738 case StubId::stubgen_oop_arraycopy_id:
1739 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1740 aligned = !UseCompressedOops;
1741 is_oop = true;
1742 dest_uninitialized = false;
1743 break;
1744 case StubId::stubgen_arrayof_oop_arraycopy_id:
1745 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1746 aligned = !UseCompressedOops;
1747 is_oop = true;
1748 dest_uninitialized = false;
1749 break;
1750 case StubId::stubgen_oop_arraycopy_uninit_id:
1751 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1752 aligned = !UseCompressedOops;
1753 is_oop = true;
1754 dest_uninitialized = true;
1755 break;
1756 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1757 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1758 aligned = !UseCompressedOops;
1759 is_oop = true;
1760 dest_uninitialized = true;
1761 break;
1762 default:
1763 ShouldNotReachHere();
1764 }
1765
1766 StubCodeMark mark(this, stub_id);
1767 address start = __ pc();
1768 __ enter();
1769
1770 if (nopush_entry != nullptr) {
1771 *nopush_entry = __ pc();
1772 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1773 BLOCK_COMMENT("Entry:");
1774 }
1775
1776 // use fwd copy when (d-s) above_equal (count*size)
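// Added commentary (not generated code): a single unsigned compare covers
// both safe cases because (d - s) wraps around when d < s. Assuming, say,
// size == 4 and count == 8 (so the byte length is 32):
//   d = s + 64 -> d - s = 64, above_equal 32        -> no overlap, forward copy
//   d = s - 64 -> d - s wraps to a huge value >= 32 -> dest below src, forward copy is safe
//   d = s + 16 -> d - s = 16, below 32              -> overlapping, fall through to backward copy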
1777 Label L_overlapping;
1778 __ sub(rscratch1, d, s);
1779 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1780 __ br(Assembler::LO, L_overlapping);
1781 __ b(RuntimeAddress(nooverlap_target));
1782 __ bind(L_overlapping);
1783
1784 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1785 if (dest_uninitialized) {
1786 decorators |= IS_DEST_UNINITIALIZED;
1787 }
1788 if (aligned) {
1789 decorators |= ARRAYCOPY_ALIGNED;
1790 }
1791
1792 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1793 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1794
1795 if (is_oop) {
1796 // save regs before copy_memory
1797 __ push(RegSet::of(d, count), sp);
1798 }
1799 {
1800 // UnsafeMemoryAccess page error: continue after unsafe access
1801 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1802 UnsafeMemoryAccessMark umam(this, add_entry, true);
1803 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1804 }
1805 if (is_oop) {
1806 __ pop(RegSet::of(d, count), sp);
1807 if (VerifyOops)
1808 verify_oop_array(size, d, count, r16);
1809 }
1810 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1811 __ leave();
1812 __ mov(r0, zr); // return 0
1813 __ ret(lr);
1814 return start;
1815 }
1816
1817 // Helper for generating a dynamic type check.
1818 // Smashes rscratch1, rscratch2.
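// Added sketch (commentary only, not the emitted code) of what this helper
// checks; the real work is done by check_klass_subtype_fast_path/_slow_path:
//   if (sub_klass == super_klass)                                  goto L_success;
//   if (*(address(sub_klass) + super_check_offset) == super_klass) goto L_success; // fast path
//   if (super_klass is in sub_klass's secondary supers)            goto L_success; // slow path
//   // otherwise fall through to L_miss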
1819 void generate_type_check(Register sub_klass,
1820 Register super_check_offset,
1821 Register super_klass,
1822 Register temp1,
1823 Register temp2,
1824 Register result,
1825 Label& L_success) {
1826 assert_different_registers(sub_klass, super_check_offset, super_klass);
1827
1828 BLOCK_COMMENT("type_check:");
1829
1830 Label L_miss;
1831
1832 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
1833 super_check_offset);
1834 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
1835
1836 // Fall through on failure!
1837 __ BIND(L_miss);
1838 }
1839
1840 //
1841 // Generate checkcasting array copy stub
1842 //
1843 // Input:
1844 // c_rarg0 - source array address
1845 // c_rarg1 - destination array address
1846 // c_rarg2 - element count, treated as ssize_t, can be zero
1847 // c_rarg3 - size_t ckoff (super_check_offset)
1848 // c_rarg4 - oop ckval (super_klass)
1849 //
1850 // Output:
1851 // r0 == 0 - success
1852 // r0 == -1^K - failure, where K is partial transfer count
1853 //
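// Added note: the failure value is the bitwise complement of the number of
// elements successfully copied, so, for example, copying 3 of 10 elements
// before a type-check failure returns ~3 == -4 and the caller can recover
// K as ~r0; a fully successful copy returns 0.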
1854 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
1855 bool dest_uninitialized;
1856 switch (stub_id) {
1857 case StubId::stubgen_checkcast_arraycopy_id:
1858 dest_uninitialized = false;
1859 break;
1860 case StubId::stubgen_checkcast_arraycopy_uninit_id:
1861 dest_uninitialized = true;
1862 break;
1863 default:
1864 ShouldNotReachHere();
1865 }
1866
1867 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1868
1869 // Input registers (after setup_arg_regs)
1870 const Register from = c_rarg0; // source array address
1871 const Register to = c_rarg1; // destination array address
1872 const Register count = c_rarg2; // elements count
1873 const Register ckoff = c_rarg3; // super_check_offset
1874 const Register ckval = c_rarg4; // super_klass
1875
1876 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1877
1878 // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1879 const Register copied_oop = r22; // actual oop copied
1880 const Register count_save = r21; // orig elements count
1881 const Register start_to = r20; // destination array start address
1882 const Register r19_klass = r19; // oop._klass
1883
1884 // Registers used as gc temps (r5, r6, r7 are save-on-call)
1885 const Register gct1 = r5, gct2 = r6, gct3 = r7;
1886
1887 //---------------------------------------------------------------
1888 // Assembler stub will be used for this call to arraycopy
1889 // if the two arrays are subtypes of Object[] but the
1890 // destination array type is not equal to or a supertype
1891 // of the source type. Each element must be separately
1892 // checked.
1893
1894 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1895 copied_oop, r19_klass, count_save);
1896
1897 __ align(CodeEntryAlignment);
1898 StubCodeMark mark(this, stub_id);
1899 address start = __ pc();
1900
1901 __ enter(); // required for proper stackwalking of RuntimeStub frame
1902
1903 #ifdef ASSERT
1904 // caller guarantees that the arrays really are different
1905 // otherwise, we would have to make conjoint checks
1906 { Label L;
1907 __ b(L); // conjoint check not yet implemented
1908 __ stop("checkcast_copy within a single array");
1909 __ bind(L);
1910 }
1911 #endif //ASSERT
1912
1913 // Caller of this entry point must set up the argument registers.
1914 if (nopush_entry != nullptr) {
1915 *nopush_entry = __ pc();
1916 BLOCK_COMMENT("Entry:");
1917 }
1918
1919 // Empty array: Nothing to do.
1920 __ cbz(count, L_done);
1921 __ push(RegSet::of(r19, r20, r21, r22), sp);
1922
1923 #ifdef ASSERT
1924 BLOCK_COMMENT("assert consistent ckoff/ckval");
1925 // The ckoff and ckval must be mutually consistent,
1926 // even though caller generates both.
1927 { Label L;
1928 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1929 __ ldrw(start_to, Address(ckval, sco_offset));
1930 __ cmpw(ckoff, start_to);
1931 __ br(Assembler::EQ, L);
1932 __ stop("super_check_offset inconsistent");
1933 __ bind(L);
1934 }
1935 #endif //ASSERT
1936
1937 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1938 bool is_oop = true;
1939 int element_size = UseCompressedOops ? 4 : 8;
1940 if (dest_uninitialized) {
1941 decorators |= IS_DEST_UNINITIALIZED;
1942 }
1943
1944 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1945 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1946
1947 // save the original count
1948 __ mov(count_save, count);
1949
1950 // Copy from low to high addresses
1951 __ mov(start_to, to); // Save destination array start address
1952 __ b(L_load_element);
1953
1954 // ======== begin loop ========
1955 // (Loop is rotated; its entry is L_load_element.)
1956 // Loop control:
1957 // for (; count != 0; count--) {
1958 // copied_oop = load_heap_oop(from++);
1959 // ... generate_type_check ...;
1960 // store_heap_oop(to++, copied_oop);
1961 // }
1962 __ align(OptoLoopAlignment);
1963
1964 __ BIND(L_store_element);
1965 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1966 __ post(to, element_size), copied_oop, noreg,
1967 gct1, gct2, gct3);
1968 __ sub(count, count, 1);
1969 __ cbz(count, L_do_card_marks);
1970
1971 // ======== loop entry is here ========
1972 __ BIND(L_load_element);
1973 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1974 copied_oop, noreg, __ post(from, element_size),
1975 gct1);
1976 __ cbz(copied_oop, L_store_element);
1977
1978 __ load_klass(r19_klass, copied_oop);// query the object klass
1979
1980 BLOCK_COMMENT("type_check:");
1981 generate_type_check(/*sub_klass*/r19_klass,
1982 /*super_check_offset*/ckoff,
1983 /*super_klass*/ckval,
1984 /*r_array_base*/gct1,
1985 /*temp2*/gct2,
1986 /*result*/r10, L_store_element);
1987
1988 // Fall through on failure!
1989
1990 // ======== end loop ========
1991
1992 // It was a real error; we must depend on the caller to finish the job.
1993 // Register count = remaining oops, count_orig = total oops.
1994 // Emit GC store barriers for the oops we have copied and report
1995 // their number to the caller.
1996
1997 __ subs(count, count_save, count); // K = partially copied oop count
1998 __ eon(count, count, zr); // report (-1^K) to caller
1999 __ br(Assembler::EQ, L_done_pop);
2000
2001 __ BIND(L_do_card_marks);
2002 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
2003
2004 __ bind(L_done_pop);
2005 __ pop(RegSet::of(r19, r20, r21, r22), sp);
2006 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2007
2008 __ bind(L_done);
2009 __ mov(r0, count);
2010 __ leave();
2011 __ ret(lr);
2012
2013 return start;
2014 }
2015
2016 // Perform range checks on the proposed arraycopy.
2017 // Kills temp, but nothing else.
2018 // Also, clean the sign bits of src_pos and dst_pos.
2019 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2020 Register src_pos, // source position (c_rarg1)
2021 Register dst, // destination array oop (c_rarg2)
2022 Register dst_pos, // destination position (c_rarg3)
2023 Register length,
2024 Register temp,
2025 Label& L_failed) {
2026 BLOCK_COMMENT("arraycopy_range_checks:");
2027
2028 assert_different_registers(rscratch1, temp);
2029
2030 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2031 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2032 __ addw(temp, length, src_pos);
2033 __ cmpw(temp, rscratch1);
2034 __ br(Assembler::HI, L_failed);
2035
2036 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2037 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2038 __ addw(temp, length, dst_pos);
2039 __ cmpw(temp, rscratch1);
2040 __ br(Assembler::HI, L_failed);
2041
2042 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2043 __ movw(src_pos, src_pos);
2044 __ movw(dst_pos, dst_pos);
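// Added note: a 32-bit register write zero-extends on AArch64, so each
// "movw reg, reg" above clears bits 63:32, e.g. a src_pos register holding
// 0xffffffff00000007 becomes 0x0000000000000007 before it is used in the
// 64-bit address arithmetic of the copy stubs.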
2045
2046 BLOCK_COMMENT("arraycopy_range_checks done");
2047 }
2048
2049 // These stubs get called from some dumb test routine.
2050 // I'll write them properly when they're called from
2051 // something that's actually doing something.
2052 static void fake_arraycopy_stub(address src, address dst, int count) {
2053 assert(count == 0, "huh?");
2054 }
2055
2056
2057 //
2058 // Generate 'unsafe' array copy stub
2059 // Though just as safe as the other stubs, it takes an unscaled
2060 // size_t argument instead of an element count.
2061 //
2062 // Input:
2063 // c_rarg0 - source array address
2064 // c_rarg1 - destination array address
2065 // c_rarg2 - byte count, treated as ssize_t, can be zero
2066 //
2067 // Examines the alignment of the operands and dispatches
2068 // to a long, int, short, or byte copy loop.
2069 //
2070 address generate_unsafe_copy(address byte_copy_entry,
2071 address short_copy_entry,
2072 address int_copy_entry,
2073 address long_copy_entry) {
2074 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2075
2076 Label L_long_aligned, L_int_aligned, L_short_aligned;
2077 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2078
2079 __ align(CodeEntryAlignment);
2080 StubCodeMark mark(this, stub_id);
2081 address start = __ pc();
2082 __ enter(); // required for proper stackwalking of RuntimeStub frame
2083
2084 // bump this on entry, not on exit:
2085 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2086
2087 __ orr(rscratch1, s, d);
2088 __ orr(rscratch1, rscratch1, count);
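// Added note: a low-order bit of the OR is clear only if it is clear in
// s, d and count alike, so the low bits of rscratch1 give the largest
// element size that divides both addresses and the byte count. For
// example (hypothetical values) s = 0x1018, d = 0x2040, count = 0x30
// leaves the low three bits clear, so the long copy loop is chosen.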
2089
2090 __ andr(rscratch1, rscratch1, BytesPerLong-1);
2091 __ cbz(rscratch1, L_long_aligned);
2092 __ andr(rscratch1, rscratch1, BytesPerInt-1);
2093 __ cbz(rscratch1, L_int_aligned);
2094 __ tbz(rscratch1, 0, L_short_aligned);
2095 __ b(RuntimeAddress(byte_copy_entry));
2096
2097 __ BIND(L_short_aligned);
2098 __ lsr(count, count, LogBytesPerShort); // size => short_count
2099 __ b(RuntimeAddress(short_copy_entry));
2100 __ BIND(L_int_aligned);
2101 __ lsr(count, count, LogBytesPerInt); // size => int_count
2102 __ b(RuntimeAddress(int_copy_entry));
2103 __ BIND(L_long_aligned);
2104 __ lsr(count, count, LogBytesPerLong); // size => long_count
2105 __ b(RuntimeAddress(long_copy_entry));
2106
2107 return start;
2108 }
2109
2110 //
2111 // Generate generic array copy stubs
2112 //
2113 // Input:
2114 // c_rarg0 - src oop
2115 // c_rarg1 - src_pos (32-bits)
2116 // c_rarg2 - dst oop
2117 // c_rarg3 - dst_pos (32-bits)
2118 // c_rarg4 - element count (32-bits)
2119 //
2120 // Output:
2121 // r0 == 0 - success
2122 // r0 == -1^K - failure, where K is partial transfer count
2123 //
2124 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2125 address int_copy_entry, address oop_copy_entry,
2126 address long_copy_entry, address checkcast_copy_entry) {
2127 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2128
2129 Label L_failed, L_objArray;
2130 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2131
2132 // Input registers
2133 const Register src = c_rarg0; // source array oop
2134 const Register src_pos = c_rarg1; // source position
2135 const Register dst = c_rarg2; // destination array oop
2136 const Register dst_pos = c_rarg3; // destination position
2137 const Register length = c_rarg4;
2138
2139
2140 // Registers used as temps
2141 const Register dst_klass = c_rarg5;
2142
2143 __ align(CodeEntryAlignment);
2144
2145 StubCodeMark mark(this, stub_id);
2146
2147 address start = __ pc();
2148
2149 __ enter(); // required for proper stackwalking of RuntimeStub frame
2150
2151 // bump this on entry, not on exit:
2152 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2153
2154 //-----------------------------------------------------------------------
2155 // Assembler stub will be used for this call to arraycopy
2156 // if the following conditions are met:
2157 //
2158 // (1) src and dst must not be null.
2159 // (2) src_pos must not be negative.
2160 // (3) dst_pos must not be negative.
2161 // (4) length must not be negative.
2162 // (5) src klass and dst klass should be the same and not null.
2163 // (6) src and dst should be arrays.
2164 // (7) src_pos + length must not exceed length of src.
2165 // (8) dst_pos + length must not exceed length of dst.
2166 //
2167
2168 // if (src == nullptr) return -1;
2169 __ cbz(src, L_failed);
2170
2171 // if (src_pos < 0) return -1;
2172 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2173
2174 // if (dst == nullptr) return -1;
2175 __ cbz(dst, L_failed);
2176
2177 // if (dst_pos < 0) return -1;
2178 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2179
2180 // registers used as temp
2181 const Register scratch_length = r16; // elements count to copy
2182 const Register scratch_src_klass = r17; // array klass
2183 const Register lh = r15; // layout helper
2184
2185 // if (length < 0) return -1;
2186 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2187 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2188
2189 __ load_klass(scratch_src_klass, src);
2190 #ifdef ASSERT
2191 // assert(src->klass() != nullptr);
2192 {
2193 BLOCK_COMMENT("assert klasses not null {");
2194 Label L1, L2;
2195 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2196 __ bind(L1);
2197 __ stop("broken null klass");
2198 __ bind(L2);
2199 __ load_klass(rscratch1, dst);
2200 __ cbz(rscratch1, L1); // this would be broken also
2201 BLOCK_COMMENT("} assert klasses not null done");
2202 }
2203 #endif
2204
2205 // Load layout helper (32-bits)
2206 //
2207 //  |array_tag|     | header_size | element_type |     |log2_element_size|
2208 // 32        30    24            16              8     2                 0
2209 //
2210 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2211 //
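// Added sketch (commentary only): the copy-loop selection below needs just
// two of these fields, roughly
//   log2_elsize = lh & Klass::_lh_log2_element_size_mask;   // 0..3
//   hdr_size    = (lh >> Klass::_lh_header_size_shift)
//                 & Klass::_lh_header_size_mask;            // offset of element 0
// the ubfx/tbnz sequences further down extract exactly these bit fields.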
2212
2213 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2214
2215 // Handle objArrays completely differently...
2216 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2217 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2218 __ movw(rscratch1, objArray_lh);
2219 __ eorw(rscratch2, lh, rscratch1);
2220 __ cbzw(rscratch2, L_objArray);
2221
2222 // if (src->klass() != dst->klass()) return -1;
2223 __ load_klass(rscratch2, dst);
2224 __ eor(rscratch2, rscratch2, scratch_src_klass);
2225 __ cbnz(rscratch2, L_failed);
2226
2227 // if (!src->is_Array()) return -1;
2228 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2229
2230 // At this point, it is known to be a typeArray (array_tag 0x3).
2231 #ifdef ASSERT
2232 {
2233 BLOCK_COMMENT("assert primitive array {");
2234 Label L;
2235 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2236 __ cmpw(lh, rscratch2);
2237 __ br(Assembler::GE, L);
2238 __ stop("must be a primitive array");
2239 __ bind(L);
2240 BLOCK_COMMENT("} assert primitive array done");
2241 }
2242 #endif
2243
2244 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2245 rscratch2, L_failed);
2246
2247 // TypeArrayKlass
2248 //
2249 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2250 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2251 //
2252
2253 const Register rscratch1_offset = rscratch1; // array offset
2254 const Register r15_elsize = lh; // element size
2255
2256 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2257 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2258 __ add(src, src, rscratch1_offset); // src array offset
2259 __ add(dst, dst, rscratch1_offset); // dst array offset
2260 BLOCK_COMMENT("choose copy loop based on element size");
2261
2262 // next registers should be set before the jump to corresponding stub
2263 const Register from = c_rarg0; // source array address
2264 const Register to = c_rarg1; // destination array address
2265 const Register count = c_rarg2; // elements count
2266
2267 // 'from', 'to', 'count' registers should be set in such order
2268 // since they are the same as 'src', 'src_pos', 'dst'.
2269
2270 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2271
2272 // The possible values of elsize are 0-3, i.e. exact_log2(element
2273 // size in bytes). We do a simple bitwise binary search.
2274 __ BIND(L_copy_bytes);
2275 __ tbnz(r15_elsize, 1, L_copy_ints);
2276 __ tbnz(r15_elsize, 0, L_copy_shorts);
2277 __ lea(from, Address(src, src_pos));// src_addr
2278 __ lea(to, Address(dst, dst_pos));// dst_addr
2279 __ movw(count, scratch_length); // length
2280 __ b(RuntimeAddress(byte_copy_entry));
2281
2282 __ BIND(L_copy_shorts);
2283 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2284 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2285 __ movw(count, scratch_length); // length
2286 __ b(RuntimeAddress(short_copy_entry));
2287
2288 __ BIND(L_copy_ints);
2289 __ tbnz(r15_elsize, 0, L_copy_longs);
2290 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2291 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2292 __ movw(count, scratch_length); // length
2293 __ b(RuntimeAddress(int_copy_entry));
2294
2295 __ BIND(L_copy_longs);
2296 #ifdef ASSERT
2297 {
2298 BLOCK_COMMENT("assert long copy {");
2299 Label L;
2300 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2301 __ cmpw(r15_elsize, LogBytesPerLong);
2302 __ br(Assembler::EQ, L);
2303 __ stop("must be long copy, but elsize is wrong");
2304 __ bind(L);
2305 BLOCK_COMMENT("} assert long copy done");
2306 }
2307 #endif
2308 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2309 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2310 __ movw(count, scratch_length); // length
2311 __ b(RuntimeAddress(long_copy_entry));
2312
2313 // ObjArrayKlass
2314 __ BIND(L_objArray);
2315 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2316
2317 Label L_plain_copy, L_checkcast_copy;
2318 // test array classes for subtyping
2319 __ load_klass(r15, dst);
2320 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2321 __ br(Assembler::NE, L_checkcast_copy);
2322
2323 // Identically typed arrays can be copied without element-wise checks.
2324 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2325 rscratch2, L_failed);
2326
2327 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2328 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2329 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2330 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2331 __ movw(count, scratch_length); // length
2332 __ BIND(L_plain_copy);
2333 __ b(RuntimeAddress(oop_copy_entry));
2334
2335 __ BIND(L_checkcast_copy);
2336 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2337 {
2338 // Before looking at dst.length, make sure dst is also an objArray.
2339 __ ldrw(rscratch1, Address(r15, lh_offset));
2340 __ movw(rscratch2, objArray_lh);
2341 __ eorw(rscratch1, rscratch1, rscratch2);
2342 __ cbnzw(rscratch1, L_failed);
2343
2344 // It is safe to examine both src.length and dst.length.
2345 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2346 r15, L_failed);
2347
2348 __ load_klass(dst_klass, dst); // reload
2349
2350 // Marshal the base address arguments now, freeing registers.
2351 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2352 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2353 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2354 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2355 __ movw(count, length); // length (reloaded)
2356 Register sco_temp = c_rarg3; // this register is free now
2357 assert_different_registers(from, to, count, sco_temp,
2358 dst_klass, scratch_src_klass);
2359 // assert_clean_int(count, sco_temp);
2360
2361 // Generate the type check.
2362 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2363 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2364
2365 // Smashes rscratch1, rscratch2
2366 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2367 L_plain_copy);
2368
2369 // Fetch destination element klass from the ObjArrayKlass header.
2370 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2371 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2372 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2373
2374 // the checkcast_copy loop needs two extra arguments:
2375 assert(c_rarg3 == sco_temp, "#3 already in place");
2376 // Set up arguments for checkcast_copy_entry.
2377 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2378 __ b(RuntimeAddress(checkcast_copy_entry));
2379 }
2380
2381 __ BIND(L_failed);
2382 __ mov(r0, -1);
2383 __ leave(); // required for proper stackwalking of RuntimeStub frame
2384 __ ret(lr);
2385
2386 return start;
2387 }
2388
2389 //
2390 // Generate stub for array fill. If "aligned" is true, the
2391 // "to" address is assumed to be heapword aligned.
2392 //
2393 // Arguments for generated stub:
2394 // to: c_rarg0
2395 // value: c_rarg1
2396 // count: c_rarg2 treated as signed
2397 //
2398 address generate_fill(StubId stub_id) {
2399 BasicType t;
2400 bool aligned;
2401
2402 switch (stub_id) {
2403 case StubId::stubgen_jbyte_fill_id:
2404 t = T_BYTE;
2405 aligned = false;
2406 break;
2407 case StubId::stubgen_jshort_fill_id:
2408 t = T_SHORT;
2409 aligned = false;
2410 break;
2411 case StubId::stubgen_jint_fill_id:
2412 t = T_INT;
2413 aligned = false;
2414 break;
2415 case StubId::stubgen_arrayof_jbyte_fill_id:
2416 t = T_BYTE;
2417 aligned = true;
2418 break;
2419 case StubId::stubgen_arrayof_jshort_fill_id:
2420 t = T_SHORT;
2421 aligned = true;
2422 break;
2423 case StubId::stubgen_arrayof_jint_fill_id:
2424 t = T_INT;
2425 aligned = true;
2426 break;
2427 default:
2428 ShouldNotReachHere();
2429 };
2430
2431 __ align(CodeEntryAlignment);
2432 StubCodeMark mark(this, stub_id);
2433 address start = __ pc();
2434
2435 BLOCK_COMMENT("Entry:");
2436
2437 const Register to = c_rarg0; // destination array address
2438 const Register value = c_rarg1; // value
2439 const Register count = c_rarg2; // elements count
2440
2441 const Register bz_base = r10; // base for block_zero routine
2442 const Register cnt_words = r11; // temp register
2443
2444 __ enter();
2445
2446 Label L_fill_elements, L_exit1;
2447
2448 int shift = -1;
2449 switch (t) {
2450 case T_BYTE:
2451 shift = 0;
2452 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2453 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2454 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2455 __ br(Assembler::LO, L_fill_elements);
2456 break;
2457 case T_SHORT:
2458 shift = 1;
2459 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2460 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2461 __ br(Assembler::LO, L_fill_elements);
2462 break;
2463 case T_INT:
2464 shift = 2;
2465 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2466 __ br(Assembler::LO, L_fill_elements);
2467 break;
2468 default: ShouldNotReachHere();
2469 }
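// Added worked example: for a byte fill with value == 0xAB the bfi steps
// above replicate it to 0xABAB and then 0xABABABAB; the later
// bfi(value, value, 32, 32) widens that to 0xABABABABABABABAB so a single
// 64-bit store writes eight identical byte lanes.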
2470
2471 // Align source address at 8 bytes address boundary.
2472 Label L_skip_align1, L_skip_align2, L_skip_align4;
2473 if (!aligned) {
2474 switch (t) {
2475 case T_BYTE:
2476 // One byte misalignment happens only for byte arrays.
2477 __ tbz(to, 0, L_skip_align1);
2478 __ strb(value, Address(__ post(to, 1)));
2479 __ subw(count, count, 1);
2480 __ bind(L_skip_align1);
2481 // Fallthrough
2482 case T_SHORT:
2483 // Two bytes misalignment happens only for byte and short (char) arrays.
2484 __ tbz(to, 1, L_skip_align2);
2485 __ strh(value, Address(__ post(to, 2)));
2486 __ subw(count, count, 2 >> shift);
2487 __ bind(L_skip_align2);
2488 // Fallthrough
2489 case T_INT:
2490 // Align to 8 bytes, we know we are 4 byte aligned to start.
2491 __ tbz(to, 2, L_skip_align4);
2492 __ strw(value, Address(__ post(to, 4)));
2493 __ subw(count, count, 4 >> shift);
2494 __ bind(L_skip_align4);
2495 break;
2496 default: ShouldNotReachHere();
2497 }
2498 }
2499
2500 //
2501 // Fill large chunks
2502 //
2503 __ lsrw(cnt_words, count, 3 - shift); // number of words
2504 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2505 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2506 if (UseBlockZeroing) {
2507 Label non_block_zeroing, rest;
2508 // If the fill value is zero we can use the fast zero_words().
2509 __ cbnz(value, non_block_zeroing);
2510 __ mov(bz_base, to);
2511 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2512 address tpc = __ zero_words(bz_base, cnt_words);
2513 if (tpc == nullptr) {
2514 fatal("CodeCache is full at generate_fill");
2515 }
2516 __ b(rest);
2517 __ bind(non_block_zeroing);
2518 __ fill_words(to, cnt_words, value);
2519 __ bind(rest);
2520 } else {
2521 __ fill_words(to, cnt_words, value);
2522 }
2523
2524 // Remaining count is less than 8 bytes. Fill it by a single store.
2525 // Note that the total length is no less than 8 bytes.
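// Added note: the str below is placed at (end - 8), so it may rewrite up
// to 7 bytes that the main loop already filled; that is harmless because
// every byte holds the same replicated value, and it lets the tail be
// finished with one unconditional 64-bit store.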
2526 if (t == T_BYTE || t == T_SHORT) {
2527 Label L_exit1;
2528 __ cbzw(count, L_exit1);
2529 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2530 __ str(value, Address(to, -8)); // overwrite some elements
2531 __ bind(L_exit1);
2532 __ leave();
2533 __ ret(lr);
2534 }
2535
2536 // Handle fills of less than 8 bytes.
2537 Label L_fill_2, L_fill_4, L_exit2;
2538 __ bind(L_fill_elements);
2539 switch (t) {
2540 case T_BYTE:
2541 __ tbz(count, 0, L_fill_2);
2542 __ strb(value, Address(__ post(to, 1)));
2543 __ bind(L_fill_2);
2544 __ tbz(count, 1, L_fill_4);
2545 __ strh(value, Address(__ post(to, 2)));
2546 __ bind(L_fill_4);
2547 __ tbz(count, 2, L_exit2);
2548 __ strw(value, Address(to));
2549 break;
2550 case T_SHORT:
2551 __ tbz(count, 0, L_fill_4);
2552 __ strh(value, Address(__ post(to, 2)));
2553 __ bind(L_fill_4);
2554 __ tbz(count, 1, L_exit2);
2555 __ strw(value, Address(to));
2556 break;
2557 case T_INT:
2558 __ cbzw(count, L_exit2);
2559 __ strw(value, Address(to));
2560 break;
2561 default: ShouldNotReachHere();
2562 }
2563 __ bind(L_exit2);
2564 __ leave();
2565 __ ret(lr);
2566 return start;
2567 }
2568
2569 address generate_unsafecopy_common_error_exit() {
2570 address start_pc = __ pc();
2571 __ leave();
2572 __ mov(r0, 0);
2573 __ ret(lr);
2574 return start_pc;
2575 }
2576
2577 //
2578 // Generate 'unsafe' set memory stub
2579 // Though just as safe as the other stubs, it takes an unscaled
2580 // size_t (# bytes) argument instead of an element count.
2581 //
2582 // This fill operation is atomicity preserving: as long as the
2583 // address supplied is sufficiently aligned, all writes of up to 64
2584 // bits in size are single-copy atomic.
2585 //
2586 // Input:
2587 // c_rarg0 - destination array address
2588 // c_rarg1 - byte count (size_t)
2589 // c_rarg2 - byte value
2590 //
2591 address generate_unsafe_setmemory() {
2592 __ align(CodeEntryAlignment);
2593 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
2594 address start = __ pc();
2595
2596 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
2597 Label tail;
2598
2599 UnsafeMemoryAccessMark umam(this, true, false);
2600
2601 __ enter(); // required for proper stackwalking of RuntimeStub frame
2602
2603 __ dup(v0, __ T16B, value);
2604
2605 if (AvoidUnalignedAccesses) {
2606 __ cmp(count, (u1)16);
2607 __ br(__ LO, tail);
2608
2609 __ mov(rscratch1, 16);
2610 __ andr(rscratch2, dest, 15);
2611 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
2612 __ strq(v0, Address(dest));
2613 __ sub(count, count, rscratch1);
2614 __ add(dest, dest, rscratch1);
2615 }
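// Added note on the alignment prologue above: it stores a full 16-byte
// vector at the original (possibly unaligned) dest and then advances dest
// to the next 16-byte boundary, so up to 15 bytes may be written twice;
// both writes carry the same replicated value, and the 64-byte main loop
// below then runs on a 16-byte-aligned dest.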
2616
2617 __ subs(count, count, (u1)64);
2618 __ br(__ LO, tail);
2619 {
2620 Label again;
2621 __ bind(again);
2622 __ stpq(v0, v0, Address(dest));
2623 __ stpq(v0, v0, Address(dest, 32));
2624
2625 __ subs(count, count, 64);
2626 __ add(dest, dest, 64);
2627 __ br(__ HS, again);
2628 }
2629
2630 __ bind(tail);
2631 // The count of bytes is off by 64, but we don't need to correct
2632 // it because we're only going to use the least-significant few
2633 // count bits from here on.
2634 // __ add(count, count, 64);
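// Added note: the tail peels the residual byte count one power of two at
// a time via tbz, e.g. (hypothetically) a residual of 27 bytes is stored
// as 16 + 8 + 2 + 1 bytes, each chunk guarded by the corresponding bit of
// count.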
2635
2636 {
2637 Label dont;
2638 __ tbz(count, exact_log2(32), dont);
2639 __ stpq(v0, v0, __ post(dest, 32));
2640 __ bind(dont);
2641 }
2642 {
2643 Label dont;
2644 __ tbz(count, exact_log2(16), dont);
2645 __ strq(v0, __ post(dest, 16));
2646 __ bind(dont);
2647 }
2648 {
2649 Label dont;
2650 __ tbz(count, exact_log2(8), dont);
2651 __ strd(v0, __ post(dest, 8));
2652 __ bind(dont);
2653 }
2654
2655 Label finished;
2656 __ tst(count, 7);
2657 __ br(__ EQ, finished);
2658
2659 {
2660 Label dont;
2661 __ tbz(count, exact_log2(4), dont);
2662 __ strs(v0, __ post(dest, 4));
2663 __ bind(dont);
2664 }
2665 {
2666 Label dont;
2667 __ tbz(count, exact_log2(2), dont);
2668 __ bfi(value, value, 8, 8);
2669 __ strh(value, __ post(dest, 2));
2670 __ bind(dont);
2671 }
2672 {
2673 Label dont;
2674 __ tbz(count, exact_log2(1), dont);
2675 __ strb(value, Address(dest));
2676 __ bind(dont);
2677 }
2678
2679 __ bind(finished);
2680 __ leave();
2681 __ ret(lr);
2682
2683 return start;
2684 }
2685
2686 address generate_data_cache_writeback() {
2687 const Register line = c_rarg0; // address of line to write back
2688
2689 __ align(CodeEntryAlignment);
2690
2691 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
2692 StubCodeMark mark(this, stub_id);
2693
2694 address start = __ pc();
2695 __ enter();
2696 __ cache_wb(Address(line, 0));
2697 __ leave();
2698 __ ret(lr);
2699
2700 return start;
2701 }
2702
2703 address generate_data_cache_writeback_sync() {
2704 const Register is_pre = c_rarg0; // pre or post sync
2705
2706 __ align(CodeEntryAlignment);
2707
2708 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
2709 StubCodeMark mark(this, stub_id);
2710
2711 // pre wbsync is a no-op
2712 // post wbsync translates to a data memory barrier (the AArch64 analogue of an sfence)
2713
2714 Label skip;
2715 address start = __ pc();
2716 __ enter();
2717 __ cbnz(is_pre, skip);
2718 __ cache_wbsync(false);
2719 __ bind(skip);
2720 __ leave();
2721 __ ret(lr);
2722
2723 return start;
2724 }
2725
2726 void generate_arraycopy_stubs() {
2727 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
2728 // entry immediately following their stack push. This can be used
2729 // as a post-push branch target for compatible stubs when they
2730 // identify a special case that can be handled by the fallback
2731 // stub, e.g. a disjoint copy stub may be used as a special case
2732 // fallback for its compatible conjoint copy stub.
2733 //
2734 // A nopush entry is always returned in the following local and
2735 // then published by assigning to the appropriate entry field in
2736 // class StubRoutines. The entry value is then passed to the
2737 // generator for the compatible stub. That means the entry must be
2738 // listed when saving to/restoring from the AOT cache, ensuring
2739 // that the inter-stub jumps are noted at AOT-cache save and
2740 // relocated at AOT cache load.
2741 address nopush_entry;
2742
2743 // generate the common exit first so later stubs can rely on it if
2744 // they want an UnsafeMemoryAccess exit non-local to the stub
2745 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2746 // register the stub as the default exit with class UnsafeMemoryAccess
2747 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2748
2749 // generate and publish aarch64-specific bulk copy routines first
2750 // so we can call them from other copy stubs
2751 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2752 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2753
2754 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2755 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2756
2757 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2758 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2759
2760 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2761
2762 //*** jbyte
2763 // Always need aligned and unaligned versions
2764 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2765 // disjoint nopush entry is needed by conjoint copy
2766 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2767 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2768 // conjoint nopush entry is needed by generic/unsafe copy
2769 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2770 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2771 // disjoint arrayof nopush entry is needed by conjoint copy
2772 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2773 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2774
2775 //*** jshort
2776 // Always need aligned and unaligned versions
2777 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2778 // disjoint nopush entry is needed by conjoint copy
2779 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
2780 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
2781 // conjoint nopush entry is used by generic/unsafe copy
2782 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
2783 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
2784 // disjoint arrayof nopush entry is needed by conjoint copy
2785 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
2786 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
2787
2788 //*** jint
2789 // Aligned versions
2790 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
2791 // disjoint arrayof nopush entry is needed by conjoint copy
2792 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
2793 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
2794 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2795 // jint_arraycopy_nopush always points to the unaligned version
2796 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
2797 // disjoint nopush entry is needed by conjoint copy
2798 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
2799 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
2800 // conjoint nopush entry is needed by generic/unsafe copy
2801 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
2802
2803 //*** jlong
2804 // It is always aligned
2805 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
2806 // disjoint arrayof nopush entry is needed by conjoint copy
2807 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
2808 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
2809 // conjoint nopush entry is needed by generic/unsafe copy
2810 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
2811 // disjoint normal/nopush and conjoint normal entries are not
2812 // generated since the arrayof versions are the same
2813 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2814 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
2815 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
2816
2817 //*** oops
2818 {
2819 StubRoutines::_arrayof_oop_disjoint_arraycopy
2820 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
2821 // disjoint arrayof nopush entry is needed by conjoint copy
2822 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
2823 StubRoutines::_arrayof_oop_arraycopy
2824 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
2825 // conjoint arrayof nopush entry is needed by generic/unsafe copy
2826 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
2827 // Aligned versions without pre-barriers
2828 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2829 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
2830 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
2831 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
2832 // note that we don't need a returned nopush entry because the
2833 // generic/unsafe copy does not cater for uninit arrays.
2834 StubRoutines::_arrayof_oop_arraycopy_uninit
2835 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
2836 }
2837
2838 // for oop copies reuse arrayof entries for non-arrayof cases
2839 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2840 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
2841 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
2842 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2843 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
2844 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
2845
2846 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
2847 // checkcast nopush entry is needed by generic copy
2848 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
2849 // note that we don't need a returned nopush entry because the
2850 // generic copy does not cater for uninit arrays.
2851 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
2852
2853 // unsafe arraycopy may fallback on conjoint stubs
2854 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
2855 StubRoutines::_jshort_arraycopy_nopush,
2856 StubRoutines::_jint_arraycopy_nopush,
2857 StubRoutines::_jlong_arraycopy_nopush);
2858
2859 // generic arraycopy may fallback on conjoint stubs
2860 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
2861 StubRoutines::_jshort_arraycopy_nopush,
2862 StubRoutines::_jint_arraycopy_nopush,
2863 StubRoutines::_oop_arraycopy_nopush,
2864 StubRoutines::_jlong_arraycopy_nopush,
2865 StubRoutines::_checkcast_arraycopy_nopush);
2866
2867 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
2868 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
2869 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
2870 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
2871 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
2872 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
2873 }
2874
2875 void generate_math_stubs() { Unimplemented(); }
2876
2877 // Arguments:
2878 //
2879 // Inputs:
2880 // c_rarg0 - source byte array address
2881 // c_rarg1 - destination byte array address
2882 // c_rarg2 - sessionKe (key) in little endian int array
2883 //
2884 address generate_aescrypt_encryptBlock() {
2885 __ align(CodeEntryAlignment);
2886 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2887 StubCodeMark mark(this, stub_id);
2888
2889 const Register from = c_rarg0; // source array address
2890 const Register to = c_rarg1; // destination array address
2891 const Register key = c_rarg2; // key array address
2892 const Register keylen = rscratch1;
2893
2894 address start = __ pc();
2895 __ enter();
2896
2897 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2898
2899 __ aesenc_loadkeys(key, keylen);
2900 __ aesecb_encrypt(from, to, keylen);
2901
2902 __ mov(r0, 0);
2903
2904 __ leave();
2905 __ ret(lr);
2906
2907 return start;
2908 }
2909
2910 // Arguments:
2911 //
2912 // Inputs:
2913 // c_rarg0 - source byte array address
2914 // c_rarg1 - destination byte array address
2915 // c_rarg2 - sessionKd (key) in little endian int array
2916 //
2917 address generate_aescrypt_decryptBlock() {
2918 assert(UseAES, "need AES cryptographic extension support");
2919 __ align(CodeEntryAlignment);
2920 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2921 StubCodeMark mark(this, stub_id);
2922 Label L_doLast;
2923
2924 const Register from = c_rarg0; // source array address
2925 const Register to = c_rarg1; // destination array address
2926 const Register key = c_rarg2; // key array address
2927 const Register keylen = rscratch1;
2928
2929 address start = __ pc();
2930 __ enter(); // required for proper stackwalking of RuntimeStub frame
2931
2932 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2933
2934 __ aesecb_decrypt(from, to, key, keylen);
2935
2936 __ mov(r0, 0);
2937
2938 __ leave();
2939 __ ret(lr);
2940
2941 return start;
2942 }
2943
2944 // Arguments:
2945 //
2946 // Inputs:
2947 // c_rarg0 - source byte array address
2948 // c_rarg1 - destination byte array address
2949 // c_rarg2 - sessionKe (key) in little endian int array
2950 // c_rarg3 - r vector byte array address
2951 // c_rarg4 - input length
2952 //
2953 // Output:
2954 // x0 - input length
2955 //
2956 address generate_cipherBlockChaining_encryptAESCrypt() {
2957 assert(UseAES, "need AES cryptographic extension support");
2958 __ align(CodeEntryAlignment);
2959 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
2960 StubCodeMark mark(this, stub_id);
2961
2962 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2963
2964 const Register from = c_rarg0; // source array address
2965 const Register to = c_rarg1; // destination array address
2966 const Register key = c_rarg2; // key array address
2967 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
2968 // and left with the results of the last encryption block
2969 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
2970 const Register keylen = rscratch1;
2971
2972 address start = __ pc();
2973
2974 __ enter();
2975
2976 __ movw(rscratch2, len_reg);
2977
2978 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2979
2980 __ ld1(v0, __ T16B, rvec);
2981
2982 __ cmpw(keylen, 52);
2983 __ br(Assembler::CC, L_loadkeys_44);
2984 __ br(Assembler::EQ, L_loadkeys_52);
2985
2986 __ ld1(v17, v18, __ T16B, __ post(key, 32));
2987 __ rev32(v17, __ T16B, v17);
2988 __ rev32(v18, __ T16B, v18);
2989 __ BIND(L_loadkeys_52);
2990 __ ld1(v19, v20, __ T16B, __ post(key, 32));
2991 __ rev32(v19, __ T16B, v19);
2992 __ rev32(v20, __ T16B, v20);
2993 __ BIND(L_loadkeys_44);
2994 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2995 __ rev32(v21, __ T16B, v21);
2996 __ rev32(v22, __ T16B, v22);
2997 __ rev32(v23, __ T16B, v23);
2998 __ rev32(v24, __ T16B, v24);
2999 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3000 __ rev32(v25, __ T16B, v25);
3001 __ rev32(v26, __ T16B, v26);
3002 __ rev32(v27, __ T16B, v27);
3003 __ rev32(v28, __ T16B, v28);
3004 __ ld1(v29, v30, v31, __ T16B, key);
3005 __ rev32(v29, __ T16B, v29);
3006 __ rev32(v30, __ T16B, v30);
3007 __ rev32(v31, __ T16B, v31);
3008
3009 __ BIND(L_aes_loop);
3010 __ ld1(v1, __ T16B, __ post(from, 16));
3011 __ eor(v0, __ T16B, v0, v1);
3012
3013 __ br(Assembler::CC, L_rounds_44);
3014 __ br(Assembler::EQ, L_rounds_52);
3015
3016 __ aese(v0, v17); __ aesmc(v0, v0);
3017 __ aese(v0, v18); __ aesmc(v0, v0);
3018 __ BIND(L_rounds_52);
3019 __ aese(v0, v19); __ aesmc(v0, v0);
3020 __ aese(v0, v20); __ aesmc(v0, v0);
3021 __ BIND(L_rounds_44);
3022 __ aese(v0, v21); __ aesmc(v0, v0);
3023 __ aese(v0, v22); __ aesmc(v0, v0);
3024 __ aese(v0, v23); __ aesmc(v0, v0);
3025 __ aese(v0, v24); __ aesmc(v0, v0);
3026 __ aese(v0, v25); __ aesmc(v0, v0);
3027 __ aese(v0, v26); __ aesmc(v0, v0);
3028 __ aese(v0, v27); __ aesmc(v0, v0);
3029 __ aese(v0, v28); __ aesmc(v0, v0);
3030 __ aese(v0, v29); __ aesmc(v0, v0);
3031 __ aese(v0, v30);
3032 __ eor(v0, __ T16B, v0, v31);
3033
3034 __ st1(v0, __ T16B, __ post(to, 16));
3035
3036 __ subw(len_reg, len_reg, 16);
3037 __ cbnzw(len_reg, L_aes_loop);
3038
3039 __ st1(v0, __ T16B, rvec);
3040
3041 __ mov(r0, rscratch2);
3042
3043 __ leave();
3044 __ ret(lr);
3045
3046 return start;
3047 }
3048
3049 // Arguments:
3050 //
3051 // Inputs:
3052 // c_rarg0 - source byte array address
3053 // c_rarg1 - destination byte array address
3054 // c_rarg2 - sessionKd (key) in little endian int array
3055 // c_rarg3 - r vector byte array address
3056 // c_rarg4 - input length
3057 //
3058 // Output:
3059 // r0 - input length
3060 //
3061 address generate_cipherBlockChaining_decryptAESCrypt() {
3062 assert(UseAES, "need AES cryptographic extension support");
3063 __ align(CodeEntryAlignment);
3064 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
3065 StubCodeMark mark(this, stub_id);
3066
3067 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3068
3069 const Register from = c_rarg0; // source array address
3070 const Register to = c_rarg1; // destination array address
3071 const Register key = c_rarg2; // key array address
3072 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3073 // and left with the results of the last encryption block
3074 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3075 const Register keylen = rscratch1;
3076
3077 address start = __ pc();
3078
3079 __ enter();
3080
3081 __ movw(rscratch2, len_reg);
3082
3083 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3084
3085 __ ld1(v2, __ T16B, rvec);
3086
3087 __ ld1(v31, __ T16B, __ post(key, 16));
3088 __ rev32(v31, __ T16B, v31);
3089
3090 __ cmpw(keylen, 52);
3091 __ br(Assembler::CC, L_loadkeys_44);
3092 __ br(Assembler::EQ, L_loadkeys_52);
3093
3094 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3095 __ rev32(v17, __ T16B, v17);
3096 __ rev32(v18, __ T16B, v18);
3097 __ BIND(L_loadkeys_52);
3098 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3099 __ rev32(v19, __ T16B, v19);
3100 __ rev32(v20, __ T16B, v20);
3101 __ BIND(L_loadkeys_44);
3102 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3103 __ rev32(v21, __ T16B, v21);
3104 __ rev32(v22, __ T16B, v22);
3105 __ rev32(v23, __ T16B, v23);
3106 __ rev32(v24, __ T16B, v24);
3107 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3108 __ rev32(v25, __ T16B, v25);
3109 __ rev32(v26, __ T16B, v26);
3110 __ rev32(v27, __ T16B, v27);
3111 __ rev32(v28, __ T16B, v28);
3112 __ ld1(v29, v30, __ T16B, key);
3113 __ rev32(v29, __ T16B, v29);
3114 __ rev32(v30, __ T16B, v30);
3115
3116 __ BIND(L_aes_loop);
3117 __ ld1(v0, __ T16B, __ post(from, 16));
3118 __ orr(v1, __ T16B, v0, v0);
3119
3120 __ br(Assembler::CC, L_rounds_44);
3121 __ br(Assembler::EQ, L_rounds_52);
3122
3123 __ aesd(v0, v17); __ aesimc(v0, v0);
3124 __ aesd(v0, v18); __ aesimc(v0, v0);
3125 __ BIND(L_rounds_52);
3126 __ aesd(v0, v19); __ aesimc(v0, v0);
3127 __ aesd(v0, v20); __ aesimc(v0, v0);
3128 __ BIND(L_rounds_44);
3129 __ aesd(v0, v21); __ aesimc(v0, v0);
3130 __ aesd(v0, v22); __ aesimc(v0, v0);
3131 __ aesd(v0, v23); __ aesimc(v0, v0);
3132 __ aesd(v0, v24); __ aesimc(v0, v0);
3133 __ aesd(v0, v25); __ aesimc(v0, v0);
3134 __ aesd(v0, v26); __ aesimc(v0, v0);
3135 __ aesd(v0, v27); __ aesimc(v0, v0);
3136 __ aesd(v0, v28); __ aesimc(v0, v0);
3137 __ aesd(v0, v29); __ aesimc(v0, v0);
3138 __ aesd(v0, v30);
3139 __ eor(v0, __ T16B, v0, v31);
3140 __ eor(v0, __ T16B, v0, v2);
3141
3142 __ st1(v0, __ T16B, __ post(to, 16));
3143 __ orr(v2, __ T16B, v1, v1);
3144
3145 __ subw(len_reg, len_reg, 16);
3146 __ cbnzw(len_reg, L_aes_loop);
3147
3148 __ st1(v2, __ T16B, rvec);
3149
3150 __ mov(r0, rscratch2);
3151
3152 __ leave();
3153 __ ret(lr);
3154
3155 return start;
3156 }
3157
3158 // Big-endian 128-bit + 64-bit -> 128-bit addition.
3159 // Inputs: 128-bits. in is preserved.
3160 // The least-significant 64-bit word is in the upper dword of each vector.
3161 // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
3162 // Output: result
3163 void be_add_128_64(FloatRegister result, FloatRegister in,
3164 FloatRegister inc, FloatRegister tmp) {
3165 assert_different_registers(result, tmp, inc);
3166
3167 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
3168 // input
3169 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
3170 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
3171 // MSD == 0 (must be!) to LSD
3172 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
3173 }
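// Added worked example (commentary only): if the counter's low 64 bits are
// 0xFFFFFFFFFFFFFFFF and inc == 1, the addv above wraps that word to 0;
// the unsigned compare (inc >u result) then produces an all-ones mask in
// that lane, ext shifts the mask across to the lane holding the upper
// 64 bits, and the final subv subtracts -1 there, carrying the overflow
// into the high word. Without overflow the mask is zero and subv is a no-op.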
3174
3175 // CTR AES crypt.
3176 // Arguments:
3177 //
3178 // Inputs:
3179 // c_rarg0 - source byte array address
3180 // c_rarg1 - destination byte array address
3181 // c_rarg2 - sessionKe (key) in little endian int array
3182 // c_rarg3 - counter vector byte array address
3183 // c_rarg4 - input length
3184 // c_rarg5 - saved encryptedCounter start
3185 // c_rarg6 - saved used length
3186 //
3187 // Output:
3188 // r0 - input length
3189 //
3190 address generate_counterMode_AESCrypt() {
3191 const Register in = c_rarg0;
3192 const Register out = c_rarg1;
3193 const Register key = c_rarg2;
3194 const Register counter = c_rarg3;
3195 const Register saved_len = c_rarg4, len = r10;
3196 const Register saved_encrypted_ctr = c_rarg5;
3197 const Register used_ptr = c_rarg6, used = r12;
3198
3199 const Register offset = r7;
3200 const Register keylen = r11;
3201
3202 const unsigned char block_size = 16;
3203 const int bulk_width = 4;
3204 // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3205 // performance with larger data sizes, but it also means that the
3206 // fast path isn't used until you have at least 8 blocks, and up
3207 // to 127 bytes of data will be executed on the slow path. For
3208 // that reason, and also so as not to blow away too much icache, 4
3209 // blocks seems like a sensible compromise.
3210
3211 // Algorithm:
3212 //
3213 // if (len == 0) {
3214 // goto DONE;
3215 // }
3216 // int result = len;
3217 // do {
3218 // if (used >= blockSize) {
3219 // if (len >= bulk_width * blockSize) {
3220 // CTR_large_block();
3221 // if (len == 0)
3222 // goto DONE;
3223 // }
3224 // for (;;) {
3225 // 16ByteVector v0 = counter;
3226 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3227 // used = 0;
3228 // if (len < blockSize)
3229 // break; /* goto NEXT */
3230 // 16ByteVector v1 = load16Bytes(in, offset);
3231 // v1 = v1 ^ encryptedCounter;
3232 // store16Bytes(out, offset);
3233 // used = blockSize;
3234 // offset += blockSize;
3235 // len -= blockSize;
3236 // if (len == 0)
3237 // goto DONE;
3238 // }
3239 // }
3240 // NEXT:
3241 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3242 // len--;
3243 // } while (len != 0);
3244 // DONE:
3245 // return result;
3246 //
3247 // CTR_large_block()
3248 // Wide bulk encryption of whole blocks.
3249
3250 __ align(CodeEntryAlignment);
3251 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
3252 StubCodeMark mark(this, stub_id);
3253 const address start = __ pc();
3254 __ enter();
3255
3256 Label DONE, CTR_large_block, large_block_return;
3257 __ ldrw(used, Address(used_ptr));
3258 __ cbzw(saved_len, DONE);
3259
3260 __ mov(len, saved_len);
3261 __ mov(offset, 0);
3262
3263 // Compute #rounds for AES based on the length of the key array
3264 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3265
3266 __ aesenc_loadkeys(key, keylen);
3267
3268 {
3269 Label L_CTR_loop, NEXT;
3270
3271 __ bind(L_CTR_loop);
3272
3273 __ cmp(used, block_size);
3274 __ br(__ LO, NEXT);
3275
3276 // Maybe we have a lot of data
3277 __ subsw(rscratch1, len, bulk_width * block_size);
3278 __ br(__ HS, CTR_large_block);
3279 __ BIND(large_block_return);
3280 __ cbzw(len, DONE);
3281
3282 // Setup the counter
3283 __ movi(v4, __ T4S, 0);
3284 __ movi(v5, __ T4S, 1);
3285 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3286
3287 // 128-bit big-endian increment
3288 __ ld1(v0, __ T16B, counter);
3289 __ rev64(v16, __ T16B, v0);
3290 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3291 __ rev64(v16, __ T16B, v16);
3292 __ st1(v16, __ T16B, counter);
3293 // Previous counter value is in v0
3294 // v4 contains { 0, 1 }
3295
3296 {
3297 // We have fewer than bulk_width blocks of data left. Encrypt
3298 // them one by one until there is less than a full block
3299 // remaining, being careful to save both the encrypted counter
3300 // and the counter.
3301
3302 Label inner_loop;
3303 __ bind(inner_loop);
3304 // Counter to encrypt is in v0
3305 __ aesecb_encrypt(noreg, noreg, keylen);
3306 __ st1(v0, __ T16B, saved_encrypted_ctr);
3307
3308 // Do we have a remaining full block?
3309
3310 __ mov(used, 0);
3311 __ cmp(len, block_size);
3312 __ br(__ LO, NEXT);
3313
3314 // Yes, we have a full block
3315 __ ldrq(v1, Address(in, offset));
3316 __ eor(v1, __ T16B, v1, v0);
3317 __ strq(v1, Address(out, offset));
3318 __ mov(used, block_size);
3319 __ add(offset, offset, block_size);
3320
3321 __ subw(len, len, block_size);
3322 __ cbzw(len, DONE);
3323
3324 // Increment the counter, store it back
3325 __ orr(v0, __ T16B, v16, v16);
3326 __ rev64(v16, __ T16B, v16);
3327 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3328 __ rev64(v16, __ T16B, v16);
3329 __ st1(v16, __ T16B, counter); // Save the incremented counter back
3330
3331 __ b(inner_loop);
3332 }
3333
3334 __ BIND(NEXT);
3335
3336 // Encrypt a single byte, and loop.
3337 // We expect this to be a rare event.
3338 __ ldrb(rscratch1, Address(in, offset));
3339 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3340 __ eor(rscratch1, rscratch1, rscratch2);
3341 __ strb(rscratch1, Address(out, offset));
3342 __ add(offset, offset, 1);
3343 __ add(used, used, 1);
3344 __ subw(len, len, 1);
3345 __ cbnzw(len, L_CTR_loop);
3346 }
3347
3348 __ bind(DONE);
3349 __ strw(used, Address(used_ptr));
3350 __ mov(r0, saved_len);
3351
3352 __ leave(); // required for proper stackwalking of RuntimeStub frame
3353 __ ret(lr);
3354
3355 // Bulk encryption
3356
3357 __ BIND(CTR_large_block);
3358 assert(bulk_width == 4 || bulk_width == 8, "must be");
3359
3360 if (bulk_width == 8) {
3361 __ sub(sp, sp, 4 * 16);
3362 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3363 }
3364 __ sub(sp, sp, 4 * 16);
3365 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3366 RegSet saved_regs = (RegSet::of(in, out, offset)
3367 + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3368 __ push(saved_regs, sp);
3369 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption
3370 __ add(in, in, offset);
3371 __ add(out, out, offset);
3372
3373 // Keys should already be loaded into the correct registers
3374
3375 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3376 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3377
3378 // AES/CTR loop
3379 {
3380 Label L_CTR_loop;
3381 __ BIND(L_CTR_loop);
3382
3383 // Setup the counters
3384 __ movi(v8, __ T4S, 0);
3385 __ movi(v9, __ T4S, 1);
3386 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3387
3388 for (int i = 0; i < bulk_width; i++) {
3389 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3390 __ rev64(v0_ofs, __ T16B, v16);
3391 be_add_128_64(v16, v16, v8, /*tmp*/v9);
3392 }
3393
3394 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3395
3396 // Encrypt the counters
3397 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3398
3399 if (bulk_width == 8) {
3400 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3401 }
3402
3403 // XOR the encrypted counters with the inputs
3404 for (int i = 0; i < bulk_width; i++) {
3405 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3406 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3407 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3408 }
3409
3410 // Write the encrypted data
3411 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3412 if (bulk_width == 8) {
3413 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3414 }
3415
3416 __ subw(len, len, 16 * bulk_width);
3417 __ cbnzw(len, L_CTR_loop);
3418 }
3419
3420 // Save the counter back where it goes
3421 __ rev64(v16, __ T16B, v16);
3422 __ st1(v16, __ T16B, counter);
3423
3424 __ pop(saved_regs, sp);
3425
3426 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3427 if (bulk_width == 8) {
3428 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3429 }
3430
3431 __ andr(rscratch1, len, -16 * bulk_width);
3432 __ sub(len, len, rscratch1);
3433 __ add(offset, offset, rscratch1);
3434 __ mov(used, 16);
3435 __ strw(used, Address(used_ptr));
3436 __ b(large_block_return);
3437
3438 return start;
3439 }
3440
3441 // Vector AES Galois Counter Mode implementation. Parameters:
3442 //
3443 // in = c_rarg0
3444 // len = c_rarg1
3445 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3446 // out = c_rarg3
3447 // key = c_rarg4
3448 // state = c_rarg5 - GHASH.state
3449 // subkeyHtbl = c_rarg6 - powers of H
3450 // counter = c_rarg7 - 16 bytes of CTR
3451 // return - number of processed bytes
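// Rough outline of the stub below (a sketch of the flow, not the exact
// instruction sequence):
//
//   len &= -(16 * 8);                  // only whole 8-block bundles are handled here
//   while (len > 0) {
//     encrypt 8 counter blocks, XOR with 128 bytes of input, store the output;
//     len -= 16 * 8;
//   }
//   ghash_processBlocks_wide(...);     // fold the ciphertext into GHASH.state
//   return the rounded-down byte count;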
3452 address generate_galoisCounterMode_AESCrypt() {
3453 Label ghash_polynomial; // local data generated after code
3454
3455 __ align(CodeEntryAlignment);
3456 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
3457 StubCodeMark mark(this, stub_id);
3458 address start = __ pc();
3459 __ enter();
3460
3461 const Register in = c_rarg0;
3462 const Register len = c_rarg1;
3463 const Register ct = c_rarg2;
3464 const Register out = c_rarg3;
3465 // (the counter, c_rarg7 below, is updated with the incremented counter at the end)
3466
3467 const Register key = c_rarg4;
3468 const Register state = c_rarg5;
3469
3470 const Register subkeyHtbl = c_rarg6;
3471
3472 const Register counter = c_rarg7;
3473
3474 const Register keylen = r10;
3475 // Save state before entering routine
3476 __ sub(sp, sp, 4 * 16);
3477 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3478 __ sub(sp, sp, 4 * 16);
3479 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3480
3481 // __ andr(len, len, -512);
3482 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
3483 __ str(len, __ pre(sp, -2 * wordSize));
3484
3485 Label DONE;
3486 __ cbz(len, DONE);
3487
3488 // Compute #rounds for AES based on the length of the key array
3489 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3490
3491 __ aesenc_loadkeys(key, keylen);
3492 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3493 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3494
3495 // AES/CTR loop
3496 {
3497 Label L_CTR_loop;
3498 __ BIND(L_CTR_loop);
3499
3500 // Setup the counters
3501 __ movi(v8, __ T4S, 0);
3502 __ movi(v9, __ T4S, 1);
3503 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3504
3505 assert(v0->encoding() < v8->encoding(), "");
3506 for (int i = v0->encoding(); i < v8->encoding(); i++) {
3507 FloatRegister f = as_FloatRegister(i);
3508 __ rev32(f, __ T16B, v16);
3509 __ addv(v16, __ T4S, v16, v8);
3510 }
3511
3512 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3513
3514 // Encrypt the counters
3515 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3516
3517 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3518
3519 // XOR the encrypted counters with the inputs
3520 for (int i = 0; i < 8; i++) {
3521 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3522 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3523 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3524 }
3525 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3526 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3527
3528 __ subw(len, len, 16 * 8);
3529 __ cbnzw(len, L_CTR_loop);
3530 }
3531
3532 __ rev32(v16, __ T16B, v16);
3533 __ st1(v16, __ T16B, counter);
3534
3535 __ ldr(len, Address(sp));
3536 __ lsr(len, len, exact_log2(16)); // We want the count of blocks
3537
3538 // GHASH/CTR loop
3539 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3540 len, /*unrolls*/4);
3541
3542 #ifdef ASSERT
3543 { Label L;
3544 __ cmp(len, (unsigned char)0);
3545 __ br(Assembler::EQ, L);
3546 __ stop("stubGenerator: abort");
3547 __ bind(L);
3548 }
3549 #endif
3550
3551 __ bind(DONE);
3552 // Return the number of bytes processed
3553 __ ldr(r0, __ post(sp, 2 * wordSize));
3554
3555 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3556 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3557
3558 __ leave(); // required for proper stackwalking of RuntimeStub frame
3559 __ ret(lr);
3560
3561 // bind label and generate polynomial data
3562 __ align(wordSize * 2);
3563 __ bind(ghash_polynomial);
3564 __ emit_int64(0x87); // The low-order bits of the field
3565 // polynomial (i.e. p = z^7+z^2+z+1)
3566 // repeated in the low and high parts of a
3567 // 128-bit vector
3568 __ emit_int64(0x87);
3569
3570 return start;
3571 }
3572
3573 class Cached64Bytes {
3574 private:
3575 MacroAssembler *_masm;
3576 Register _regs[8];
3577
3578 public:
3579 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3580 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3581 auto it = rs.begin();
3582 for (auto &r: _regs) {
3583 r = *it;
3584 ++it;
3585 }
3586 }
3587
3588 void gen_loads(Register base) {
3589 for (int i = 0; i < 8; i += 2) {
3590 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3591 }
3592 }
3593
3594 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3595 void extract_u32(Register dest, int i) {
3596 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3597 }
3598 };
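// Usage sketch for the cache above: word i of the 64-byte block lives in
// _regs[i / 2], bits [32 * (i % 2), 32 * (i % 2) + 32), so for example
//
//   reg_cache.extract_u32(dest, 5);   // emits __ ubfx(dest, _regs[2], 32, 32)
//
// after gen_loads(base) has filled the eight registers from ldp pairs.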
3599
3600 // Utility routines for md5.
3601 // Clobbers r10 and r11.
3602 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3603 int k, int s, int t) {
3604 Register rscratch3 = r10;
3605 Register rscratch4 = r11;
3606
3607 __ eorw(rscratch3, r3, r4);
3608 __ movw(rscratch2, t);
3609 __ andw(rscratch3, rscratch3, r2);
3610 __ addw(rscratch4, r1, rscratch2);
3611 reg_cache.extract_u32(rscratch1, k);
3612 __ eorw(rscratch3, rscratch3, r4);
3613 __ addw(rscratch4, rscratch4, rscratch1);
3614 __ addw(rscratch3, rscratch3, rscratch4);
3615 __ rorw(rscratch2, rscratch3, 32 - s);
3616 __ addw(r1, rscratch2, r2);
3617 }
3618
3619 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3620 int k, int s, int t) {
3621 Register rscratch3 = r10;
3622 Register rscratch4 = r11;
3623
3624 reg_cache.extract_u32(rscratch1, k);
3625 __ movw(rscratch2, t);
3626 __ addw(rscratch4, r1, rscratch2);
3627 __ addw(rscratch4, rscratch4, rscratch1);
3628 __ bicw(rscratch2, r3, r4);
3629 __ andw(rscratch3, r2, r4);
3630 __ addw(rscratch2, rscratch2, rscratch4);
3631 __ addw(rscratch2, rscratch2, rscratch3);
3632 __ rorw(rscratch2, rscratch2, 32 - s);
3633 __ addw(r1, rscratch2, r2);
3634 }
3635
3636 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3637 int k, int s, int t) {
3638 Register rscratch3 = r10;
3639 Register rscratch4 = r11;
3640
3641 __ eorw(rscratch3, r3, r4);
3642 __ movw(rscratch2, t);
3643 __ addw(rscratch4, r1, rscratch2);
3644 reg_cache.extract_u32(rscratch1, k);
3645 __ eorw(rscratch3, rscratch3, r2);
3646 __ addw(rscratch4, rscratch4, rscratch1);
3647 __ addw(rscratch3, rscratch3, rscratch4);
3648 __ rorw(rscratch2, rscratch3, 32 - s);
3649 __ addw(r1, rscratch2, r2);
3650 }
3651
3652 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3653 int k, int s, int t) {
3654 Register rscratch3 = r10;
3655 Register rscratch4 = r11;
3656
3657 __ movw(rscratch3, t);
3658 __ ornw(rscratch2, r2, r4);
3659 __ addw(rscratch4, r1, rscratch3);
3660 reg_cache.extract_u32(rscratch1, k);
3661 __ eorw(rscratch3, rscratch2, r3);
3662 __ addw(rscratch4, rscratch4, rscratch1);
3663 __ addw(rscratch3, rscratch3, rscratch4);
3664 __ rorw(rscratch2, rscratch3, 32 - s);
3665 __ addw(r1, rscratch2, r2);
3666 }
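// For reference, the four helpers above implement the standard MD5 round
// functions (RFC 1321), with the boolean parts rearranged to use eor/bic/orn.
// A scalar sketch:
//
//   uint32_t F(uint32_t b, uint32_t c, uint32_t d) { return ((c ^ d) & b) ^ d; } // == (b & c) | (~b & d)
//   uint32_t G(uint32_t b, uint32_t c, uint32_t d) { return (b & d) | (c & ~d); }
//   uint32_t H(uint32_t b, uint32_t c, uint32_t d) { return b ^ c ^ d; }
//   uint32_t I(uint32_t b, uint32_t c, uint32_t d) { return c ^ (b | ~d); }
//
// Each step then computes r1 = r2 + rol(r1 + f(r2, r3, r4) + x[k] + t, s).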
3667
3668 // Arguments:
3669 //
3670 // Inputs:
3671 // c_rarg0 - byte[] source+offset
3672 // c_rarg1 - int[] MD5.state
3673 // c_rarg2 - int offset
3674 // c_rarg3 - int limit
3675 //
3676 address generate_md5_implCompress(StubId stub_id) {
3677 bool multi_block;
3678 switch (stub_id) {
3679 case StubId::stubgen_md5_implCompress_id:
3680 multi_block = false;
3681 break;
3682 case StubId::stubgen_md5_implCompressMB_id:
3683 multi_block = true;
3684 break;
3685 default:
3686 ShouldNotReachHere();
3687 }
3688 __ align(CodeEntryAlignment);
3689
3690 StubCodeMark mark(this, stub_id);
3691 address start = __ pc();
3692
3693 Register buf = c_rarg0;
3694 Register state = c_rarg1;
3695 Register ofs = c_rarg2;
3696 Register limit = c_rarg3;
3697 Register a = r4;
3698 Register b = r5;
3699 Register c = r6;
3700 Register d = r7;
3701 Register rscratch3 = r10;
3702 Register rscratch4 = r11;
3703
3704 Register state_regs[2] = { r12, r13 };
3705 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3706 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
3707
3708 __ push(saved_regs, sp);
3709
3710 __ ldp(state_regs[0], state_regs[1], Address(state));
3711 __ ubfx(a, state_regs[0], 0, 32);
3712 __ ubfx(b, state_regs[0], 32, 32);
3713 __ ubfx(c, state_regs[1], 0, 32);
3714 __ ubfx(d, state_regs[1], 32, 32);
3715
3716 Label md5_loop;
3717 __ BIND(md5_loop);
3718
3719 reg_cache.gen_loads(buf);
3720
3721 // Round 1
3722 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
3723 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
3724 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
3725 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
3726 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
3727 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
3728 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
3729 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
3730 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
3731 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
3732 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3733 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3734 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
3735 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3736 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3737 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3738
3739 // Round 2
3740 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
3741 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
3742 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3743 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
3744 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
3745 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
3746 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3747 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
3748 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
3749 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
3750 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
3751 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
3752 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
3753 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
3754 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
3755 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3756
3757 // Round 3
3758 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
3759 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
3760 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3761 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3762 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
3763 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
3764 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
3765 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3766 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
3767 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
3768 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
3769 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
3770 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
3771 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3772 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3773 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
3774
3775 // Round 4
3776 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
3777 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
3778 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3779 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
3780 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
3781 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
3782 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3783 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
3784 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
3785 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3786 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
3787 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3788 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
3789 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3790 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
3791 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
3792
3793 __ addw(a, state_regs[0], a);
3794 __ ubfx(rscratch2, state_regs[0], 32, 32);
3795 __ addw(b, rscratch2, b);
3796 __ addw(c, state_regs[1], c);
3797 __ ubfx(rscratch4, state_regs[1], 32, 32);
3798 __ addw(d, rscratch4, d);
3799
3800 __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3801 __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3802
3803 if (multi_block) {
3804 __ add(buf, buf, 64);
3805 __ add(ofs, ofs, 64);
3806 __ cmp(ofs, limit);
3807 __ br(Assembler::LE, md5_loop);
3808 __ mov(c_rarg0, ofs); // return ofs
3809 }
3810
3811 // write hash values back in the correct order
3812 __ stp(state_regs[0], state_regs[1], Address(state));
3813
3814 __ pop(saved_regs, sp);
3815
3816 __ ret(lr);
3817
3818 return start;
3819 }
3820
3821 // Arguments:
3822 //
3823 // Inputs:
3824 // c_rarg0 - byte[] source+offset
3825 // c_rarg1 - int[] SHA.state
3826 // c_rarg2 - int offset
3827 // c_rarg3 - int limit
3828 //
3829 address generate_sha1_implCompress(StubId stub_id) {
3830 bool multi_block;
3831 switch (stub_id) {
3832 case StubId::stubgen_sha1_implCompress_id:
3833 multi_block = false;
3834 break;
3835 case StubId::stubgen_sha1_implCompressMB_id:
3836 multi_block = true;
3837 break;
3838 default:
3839 ShouldNotReachHere();
3840 }
3841
3842 __ align(CodeEntryAlignment);
3843
3844 StubCodeMark mark(this, stub_id);
3845 address start = __ pc();
3846
3847 Register buf = c_rarg0;
3848 Register state = c_rarg1;
3849 Register ofs = c_rarg2;
3850 Register limit = c_rarg3;
3851
3852 Label keys;
3853 Label sha1_loop;
3854
3855 // load the keys into v0..v3
3856 __ adr(rscratch1, keys);
3857 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3858 // load 5 words state into v6, v7
3859 __ ldrq(v6, Address(state, 0));
3860 __ ldrs(v7, Address(state, 16));
3861
3862
3863 __ BIND(sha1_loop);
3864 // load 64 bytes of data into v16..v19
3865 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3866 __ rev32(v16, __ T16B, v16);
3867 __ rev32(v17, __ T16B, v17);
3868 __ rev32(v18, __ T16B, v18);
3869 __ rev32(v19, __ T16B, v19);
3870
3871 // do the sha1
3872 __ addv(v4, __ T4S, v16, v0);
3873 __ orr(v20, __ T16B, v6, v6);
3874
3875 FloatRegister d0 = v16;
3876 FloatRegister d1 = v17;
3877 FloatRegister d2 = v18;
3878 FloatRegister d3 = v19;
3879
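// The loop below processes the 80 SHA-1 rounds four at a time (20 iterations).
// As a reminder, the fixed function applied per round group follows the
// standard SHA-1 schedule:
//   rounds  0..19: Ch(b,c,d)     = (b & c) | (~b & d)             -> sha1c
//   rounds 20..39: Parity(b,c,d) = b ^ c ^ d                      -> sha1p
//   rounds 40..59: Maj(b,c,d)    = (b & c) | (b & d) | (c & d)    -> sha1m
//   rounds 60..79: Parity again                                   -> sha1p
// hence sha1c for round < 5, sha1m for 10 <= round < 15, sha1p otherwise.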
3880 for (int round = 0; round < 20; round++) {
3881 FloatRegister tmp1 = (round & 1) ? v4 : v5;
3882 FloatRegister tmp2 = (round & 1) ? v21 : v22;
3883 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3884 FloatRegister tmp4 = (round & 1) ? v5 : v4;
3885 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3886
3887 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3888 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3889 __ sha1h(tmp2, __ T4S, v20);
3890 if (round < 5)
3891 __ sha1c(v20, __ T4S, tmp3, tmp4);
3892 else if (round < 10 || round >= 15)
3893 __ sha1p(v20, __ T4S, tmp3, tmp4);
3894 else
3895 __ sha1m(v20, __ T4S, tmp3, tmp4);
3896 if (round < 16) __ sha1su1(d0, __ T4S, d3);
3897
3898 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3899 }
3900
3901 __ addv(v7, __ T2S, v7, v21);
3902 __ addv(v6, __ T4S, v6, v20);
3903
3904 if (multi_block) {
3905 __ add(ofs, ofs, 64);
3906 __ cmp(ofs, limit);
3907 __ br(Assembler::LE, sha1_loop);
3908 __ mov(c_rarg0, ofs); // return ofs
3909 }
3910
3911 __ strq(v6, Address(state, 0));
3912 __ strs(v7, Address(state, 16));
3913
3914 __ ret(lr);
3915
3916 __ bind(keys);
3917 __ emit_int32(0x5a827999);
3918 __ emit_int32(0x6ed9eba1);
3919 __ emit_int32(0x8f1bbcdc);
3920 __ emit_int32(0xca62c1d6);
3921
3922 return start;
3923 }
3924
3925
3926 // Arguments:
3927 //
3928 // Inputs:
3929 // c_rarg0 - byte[] source+offset
3930 // c_rarg1 - int[] SHA.state
3931 // c_rarg2 - int offset
3932 // c_rarg3 - int limit
3933 //
3934 address generate_sha256_implCompress(StubId stub_id) {
3935 bool multi_block;
3936 switch (stub_id) {
3937 case StubId::stubgen_sha256_implCompress_id:
3938 multi_block = false;
3939 break;
3940 case StubId::stubgen_sha256_implCompressMB_id:
3941 multi_block = true;
3942 break;
3943 default:
3944 ShouldNotReachHere();
3945 }
3946
3947 static const uint32_t round_consts[64] = {
3948 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3949 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3950 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3951 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3952 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3953 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3954 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3955 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3956 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3957 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3958 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3959 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3960 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3961 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3962 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3963 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3964 };
3965
3966 __ align(CodeEntryAlignment);
3967
3968 StubCodeMark mark(this, stub_id);
3969 address start = __ pc();
3970
3971 Register buf = c_rarg0;
3972 Register state = c_rarg1;
3973 Register ofs = c_rarg2;
3974 Register limit = c_rarg3;
3975
3976 Label sha1_loop;
3977
3978 __ stpd(v8, v9, __ pre(sp, -32));
3979 __ stpd(v10, v11, Address(sp, 16));
3980
3981 // dga == v0
3982 // dgb == v1
3983 // dg0 == v2
3984 // dg1 == v3
3985 // dg2 == v4
3986 // t0 == v6
3987 // t1 == v7
3988
3989 // load 16 keys to v16..v31
3990 __ lea(rscratch1, ExternalAddress((address)round_consts));
3991 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3992 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3993 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3994 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3995
3996 // load 8 words (256 bits) state
3997 __ ldpq(v0, v1, state);
3998
3999 __ BIND(sha1_loop);
4000 // load 64 bytes of data into v8..v11
4001 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
4002 __ rev32(v8, __ T16B, v8);
4003 __ rev32(v9, __ T16B, v9);
4004 __ rev32(v10, __ T16B, v10);
4005 __ rev32(v11, __ T16B, v11);
4006
4007 __ addv(v6, __ T4S, v8, v16);
4008 __ orr(v2, __ T16B, v0, v0);
4009 __ orr(v3, __ T16B, v1, v1);
4010
4011 FloatRegister d0 = v8;
4012 FloatRegister d1 = v9;
4013 FloatRegister d2 = v10;
4014 FloatRegister d3 = v11;
4015
4016
4017 for (int round = 0; round < 16; round++) {
4018 FloatRegister tmp1 = (round & 1) ? v6 : v7;
4019 FloatRegister tmp2 = (round & 1) ? v7 : v6;
4020 FloatRegister tmp3 = (round & 1) ? v2 : v4;
4021 FloatRegister tmp4 = (round & 1) ? v4 : v2;
4022
4023 if (round < 12) __ sha256su0(d0, __ T4S, d1);
4024 __ orr(v4, __ T16B, v2, v2);
4025 if (round < 15)
4026 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
4027 __ sha256h(v2, __ T4S, v3, tmp2);
4028 __ sha256h2(v3, __ T4S, v4, tmp2);
4029 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
4030
4031 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4032 }
4033
4034 __ addv(v0, __ T4S, v0, v2);
4035 __ addv(v1, __ T4S, v1, v3);
4036
4037 if (multi_block) {
4038 __ add(ofs, ofs, 64);
4039 __ cmp(ofs, limit);
4040 __ br(Assembler::LE, sha1_loop);
4041 __ mov(c_rarg0, ofs); // return ofs
4042 }
4043
4044 __ ldpd(v10, v11, Address(sp, 16));
4045 __ ldpd(v8, v9, __ post(sp, 32));
4046
4047 __ stpq(v0, v1, state);
4048
4049 __ ret(lr);
4050
4051 return start;
4052 }
4053
4054 // Double rounds for sha512.
4055 void sha512_dround(int dr,
4056 FloatRegister vi0, FloatRegister vi1,
4057 FloatRegister vi2, FloatRegister vi3,
4058 FloatRegister vi4, FloatRegister vrc0,
4059 FloatRegister vrc1, FloatRegister vin0,
4060 FloatRegister vin1, FloatRegister vin2,
4061 FloatRegister vin3, FloatRegister vin4) {
4062 if (dr < 36) {
4063 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
4064 }
4065 __ addv(v5, __ T2D, vrc0, vin0);
4066 __ ext(v6, __ T16B, vi2, vi3, 8);
4067 __ ext(v5, __ T16B, v5, v5, 8);
4068 __ ext(v7, __ T16B, vi1, vi2, 8);
4069 __ addv(vi3, __ T2D, vi3, v5);
4070 if (dr < 32) {
4071 __ ext(v5, __ T16B, vin3, vin4, 8);
4072 __ sha512su0(vin0, __ T2D, vin1);
4073 }
4074 __ sha512h(vi3, __ T2D, v6, v7);
4075 if (dr < 32) {
4076 __ sha512su1(vin0, __ T2D, vin2, v5);
4077 }
4078 __ addv(vi4, __ T2D, vi1, vi3);
4079 __ sha512h2(vi3, __ T2D, vi1, vi0);
4080 }
4081
4082 // Arguments:
4083 //
4084 // Inputs:
4085 // c_rarg0 - byte[] source+offset
4086 // c_rarg1 - int[] SHA.state
4087 // c_rarg2 - int offset
4088 // c_rarg3 - int limit
4089 //
4090 address generate_sha512_implCompress(StubId stub_id) {
4091 bool multi_block;
4092 switch (stub_id) {
4093 case StubId::stubgen_sha512_implCompress_id:
4094 multi_block = false;
4095 break;
4096 case StubId::stubgen_sha512_implCompressMB_id:
4097 multi_block = true;
4098 break;
4099 default:
4100 ShouldNotReachHere();
4101 }
4102
4103 static const uint64_t round_consts[80] = {
4104 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
4105 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
4106 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
4107 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
4108 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
4109 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
4110 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
4111 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
4112 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
4113 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
4114 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
4115 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
4116 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
4117 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
4118 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
4119 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
4120 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
4121 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
4122 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
4123 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
4124 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
4125 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
4126 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
4127 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
4128 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
4129 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
4130 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
4131 };
4132
4133 __ align(CodeEntryAlignment);
4134
4135 StubCodeMark mark(this, stub_id);
4136 address start = __ pc();
4137
4138 Register buf = c_rarg0;
4139 Register state = c_rarg1;
4140 Register ofs = c_rarg2;
4141 Register limit = c_rarg3;
4142
4143 __ stpd(v8, v9, __ pre(sp, -64));
4144 __ stpd(v10, v11, Address(sp, 16));
4145 __ stpd(v12, v13, Address(sp, 32));
4146 __ stpd(v14, v15, Address(sp, 48));
4147
4148 Label sha512_loop;
4149
4150 // load state
4151 __ ld1(v8, v9, v10, v11, __ T2D, state);
4152
4153 // load first 4 round constants
4154 __ lea(rscratch1, ExternalAddress((address)round_consts));
4155 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
4156
4157 __ BIND(sha512_loop);
4158 // load 128B of data into v12..v19
4159 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
4160 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
4161 __ rev64(v12, __ T16B, v12);
4162 __ rev64(v13, __ T16B, v13);
4163 __ rev64(v14, __ T16B, v14);
4164 __ rev64(v15, __ T16B, v15);
4165 __ rev64(v16, __ T16B, v16);
4166 __ rev64(v17, __ T16B, v17);
4167 __ rev64(v18, __ T16B, v18);
4168 __ rev64(v19, __ T16B, v19);
4169
4170 __ mov(rscratch2, rscratch1);
4171
4172 __ mov(v0, __ T16B, v8);
4173 __ mov(v1, __ T16B, v9);
4174 __ mov(v2, __ T16B, v10);
4175 __ mov(v3, __ T16B, v11);
4176
4177 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4178 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4179 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4180 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4181 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4182 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4183 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4184 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4185 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4186 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4187 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4188 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4189 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4190 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4191 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4192 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4193 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4194 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4195 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4196 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4197 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4198 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4199 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4200 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4201 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4202 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4203 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4204 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4205 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4206 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4207 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4208 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4209 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
4210 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4211 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4212 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4213 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4214 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4215 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4216 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4217
4218 __ addv(v8, __ T2D, v8, v0);
4219 __ addv(v9, __ T2D, v9, v1);
4220 __ addv(v10, __ T2D, v10, v2);
4221 __ addv(v11, __ T2D, v11, v3);
4222
4223 if (multi_block) {
4224 __ add(ofs, ofs, 128);
4225 __ cmp(ofs, limit);
4226 __ br(Assembler::LE, sha512_loop);
4227 __ mov(c_rarg0, ofs); // return ofs
4228 }
4229
4230 __ st1(v8, v9, v10, v11, __ T2D, state);
4231
4232 __ ldpd(v14, v15, Address(sp, 48));
4233 __ ldpd(v12, v13, Address(sp, 32));
4234 __ ldpd(v10, v11, Address(sp, 16));
4235 __ ldpd(v8, v9, __ post(sp, 64));
4236
4237 __ ret(lr);
4238
4239 return start;
4240 }
4241
4242 // Execute one keccak round on two computations in parallel.
4243 // One of the states should be loaded into the lower halves of
4244 // the vector registers v0-v24; the other should be loaded into
4245 // the upper halves of those registers. The ld1r instruction loads
4246 // the round constant into both halves of register v31.
4247 // Intermediate results c0...c4 and d0...d4 are computed
4248 // in registers v25...v30.
4249 // All vector instructions that are used operate on both register
4250 // halves in parallel.
4251 // If only a single computation is needed, one can load only the lower halves.
4252 void keccak_round(Register rscratch1) {
4253 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4254 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
4255 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4256 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4257 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4258 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4259 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4260 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4261 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4262 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4263
4264 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4265 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4266 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4267 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4268 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4269
4270 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4271 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4272 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4273 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4274 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4275 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4276 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4277 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4278 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4279 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4280 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4281 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4282 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4283 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4284 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4285 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4286 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4287 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4288 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4289 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4290 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4291 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4292 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4293 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4294 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4295
4296 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22')
4297 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4298 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4299 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4300 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4301
4302 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4303
4304 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4305 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4306 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4307 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4308 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
4309
4310 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
4311 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
4312 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
4313 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
4314 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
4315
4316 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
4317 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
4318 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
4319 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7)
4320 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8')
4321
4322 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
4323 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
4324 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
4325 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3)
4326 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4')
4327
4328 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
4329 }
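// For reference, a scalar sketch of what one such round computes per 64-bit
// lane a[0..24] (indexing a[x + 5*y], matching the comments above):
//
//   // theta
//   for (x = 0; x < 5; x++) c[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20];
//   for (x = 0; x < 5; x++) d[x] = c[(x+4)%5] ^ rol(c[(x+1)%5], 1);
//   for (i = 0; i < 25; i++) a[i] ^= d[i%5];
//   // rho + pi: each lane is rotated by a fixed offset and moved to a new
//   // position (the offsets/positions are hard-coded in the xar calls above)
//   // chi: within each row of five lanes, a[i] = a[i] ^ (~a[i+1] & a[i+2])  (bcax)
//   // iota
//   a[0] ^= round_constant[round];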
4330
4331 // Arguments:
4332 //
4333 // Inputs:
4334 // c_rarg0 - byte[] source+offset
4335 // c_rarg1 - byte[] SHA.state
4336 // c_rarg2 - int block_size
4337 // c_rarg3 - int offset
4338 // c_rarg4 - int limit
4339 //
4340 address generate_sha3_implCompress(StubId stub_id) {
4341 bool multi_block;
4342 switch (stub_id) {
4343 case StubId::stubgen_sha3_implCompress_id:
4344 multi_block = false;
4345 break;
4346 case StubId::stubgen_sha3_implCompressMB_id:
4347 multi_block = true;
4348 break;
4349 default:
4350 ShouldNotReachHere();
4351 }
4352
4353 static const uint64_t round_consts[24] = {
4354 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4355 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4356 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4357 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4358 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4359 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4360 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4361 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4362 };
4363
4364 __ align(CodeEntryAlignment);
4365
4366 StubCodeMark mark(this, stub_id);
4367 address start = __ pc();
4368
4369 Register buf = c_rarg0;
4370 Register state = c_rarg1;
4371 Register block_size = c_rarg2;
4372 Register ofs = c_rarg3;
4373 Register limit = c_rarg4;
4374
4375 Label sha3_loop, rounds24_loop;
4376 Label sha3_512_or_sha3_384, shake128;
4377
4378 __ stpd(v8, v9, __ pre(sp, -64));
4379 __ stpd(v10, v11, Address(sp, 16));
4380 __ stpd(v12, v13, Address(sp, 32));
4381 __ stpd(v14, v15, Address(sp, 48));
4382
4383 // load state
4384 __ add(rscratch1, state, 32);
4385 __ ld1(v0, v1, v2, v3, __ T1D, state);
4386 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4387 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4388 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4389 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4390 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4391 __ ld1(v24, __ T1D, rscratch1);
4392
4393 __ BIND(sha3_loop);
4394
4395 // 24 keccak rounds
4396 __ movw(rscratch2, 24);
4397
4398 // load round_constants base
4399 __ lea(rscratch1, ExternalAddress((address) round_consts));
4400
4401 // load input
4402 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4403 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4404 __ eor(v0, __ T8B, v0, v25);
4405 __ eor(v1, __ T8B, v1, v26);
4406 __ eor(v2, __ T8B, v2, v27);
4407 __ eor(v3, __ T8B, v3, v28);
4408 __ eor(v4, __ T8B, v4, v29);
4409 __ eor(v5, __ T8B, v5, v30);
4410 __ eor(v6, __ T8B, v6, v31);
4411
4412 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4413 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4414
4415 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4416 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4417 __ eor(v7, __ T8B, v7, v25);
4418 __ eor(v8, __ T8B, v8, v26);
4419 __ eor(v9, __ T8B, v9, v27);
4420 __ eor(v10, __ T8B, v10, v28);
4421 __ eor(v11, __ T8B, v11, v29);
4422 __ eor(v12, __ T8B, v12, v30);
4423 __ eor(v13, __ T8B, v13, v31);
4424
4425 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4426 __ eor(v14, __ T8B, v14, v25);
4427 __ eor(v15, __ T8B, v15, v26);
4428 __ eor(v16, __ T8B, v16, v27);
4429
4430 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4431 __ andw(c_rarg5, block_size, 48);
4432 __ cbzw(c_rarg5, rounds24_loop);
4433
4434 __ tbnz(block_size, 5, shake128);
4435 // block_size == 144, bit5 == 0, SHA3-224
4436 __ ldrd(v28, __ post(buf, 8));
4437 __ eor(v17, __ T8B, v17, v28);
4438 __ b(rounds24_loop);
4439
4440 __ BIND(shake128);
4441 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4442 __ eor(v17, __ T8B, v17, v28);
4443 __ eor(v18, __ T8B, v18, v29);
4444 __ eor(v19, __ T8B, v19, v30);
4445 __ eor(v20, __ T8B, v20, v31);
4446 __ b(rounds24_loop); // block_size == 168, SHAKE128
4447
4448 __ BIND(sha3_512_or_sha3_384);
4449 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4450 __ eor(v7, __ T8B, v7, v25);
4451 __ eor(v8, __ T8B, v8, v26);
4452 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4453
4454 // SHA3-384
4455 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4456 __ eor(v9, __ T8B, v9, v27);
4457 __ eor(v10, __ T8B, v10, v28);
4458 __ eor(v11, __ T8B, v11, v29);
4459 __ eor(v12, __ T8B, v12, v30);
4460
4461 __ BIND(rounds24_loop);
4462 __ subw(rscratch2, rscratch2, 1);
4463
4464 keccak_round(rscratch1);
4465
4466 __ cbnzw(rscratch2, rounds24_loop);
4467
4468 if (multi_block) {
4469 __ add(ofs, ofs, block_size);
4470 __ cmp(ofs, limit);
4471 __ br(Assembler::LE, sha3_loop);
4472 __ mov(c_rarg0, ofs); // return ofs
4473 }
4474
4475 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
4476 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4477 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4478 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4479 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4480 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4481 __ st1(v24, __ T1D, state);
4482
4483 // restore callee-saved registers
4484 __ ldpd(v14, v15, Address(sp, 48));
4485 __ ldpd(v12, v13, Address(sp, 32));
4486 __ ldpd(v10, v11, Address(sp, 16));
4487 __ ldpd(v8, v9, __ post(sp, 64));
4488
4489 __ ret(lr);
4490
4491 return start;
4492 }
4493
4494 // Inputs:
4495 // c_rarg0 - long[] state0
4496 // c_rarg1 - long[] state1
4497 address generate_double_keccak() {
4498 static const uint64_t round_consts[24] = {
4499 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4500 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4501 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4502 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4503 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4504 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4505 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4506 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4507 };
4508
4509 // Implements the double_keccak() method of the
4510 // sun.security.provider.SHA3Parallel class
4511 __ align(CodeEntryAlignment);
4512 StubCodeMark mark(this, "StubRoutines", "double_keccak");
4513 address start = __ pc();
4514 __ enter();
4515
4516 Register state0 = c_rarg0;
4517 Register state1 = c_rarg1;
4518
4519 Label rounds24_loop;
4520
4521 // save callee-saved registers
4522 __ stpd(v8, v9, __ pre(sp, -64));
4523 __ stpd(v10, v11, Address(sp, 16));
4524 __ stpd(v12, v13, Address(sp, 32));
4525 __ stpd(v14, v15, Address(sp, 48));
4526
4527 // load states
4528 __ add(rscratch1, state0, 32);
4529 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
4530 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
4531 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
4532 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
4533 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
4534 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
4535 __ ld1(v24, __ D, 0, rscratch1);
4536 __ add(rscratch1, state1, 32);
4537 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
4538 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
4539 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
4540 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
4541 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
4542 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
4543 __ ld1(v24, __ D, 1, rscratch1);
4544
4545 // 24 keccak rounds
4546 __ movw(rscratch2, 24);
4547
4548 // load round_constants base
4549 __ lea(rscratch1, ExternalAddress((address) round_consts));
4550
4551 __ BIND(rounds24_loop);
4552 __ subw(rscratch2, rscratch2, 1);
4553 keccak_round(rscratch1);
4554 __ cbnzw(rscratch2, rounds24_loop);
4555
4556 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
4557 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
4558 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
4559 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
4560 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
4561 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
4562 __ st1(v24, __ D, 0, state0);
4563 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
4564 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
4565 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
4566 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
4567 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
4568 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
4569 __ st1(v24, __ D, 1, state1);
4570
4571 // restore callee-saved vector registers
4572 __ ldpd(v14, v15, Address(sp, 48));
4573 __ ldpd(v12, v13, Address(sp, 32));
4574 __ ldpd(v10, v11, Address(sp, 16));
4575 __ ldpd(v8, v9, __ post(sp, 64));
4576
4577 __ leave(); // required for proper stackwalking of RuntimeStub frame
4578 __ mov(r0, zr); // return 0
4579 __ ret(lr);
4580
4581 return start;
4582 }
4583
4584 // ChaCha20 block function. This version parallelizes the 32-bit
4585 // state elements on each of 16 vectors, producing 4 blocks of
4586 // keystream at a time.
4587 //
4588 // state (int[16]) = c_rarg0
4589 // keystream (byte[256]) = c_rarg1
4590 // return - number of bytes of produced keystream (always 256)
4591 //
4592 // This implementation takes each 32-bit integer from the state
4593 // array and broadcasts it across all 4 32-bit lanes of a vector register
4594 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
4595 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
4596 // the quarter round schedule is implemented as outlined in RFC 7539 section
4597 // 2.3. However, instead of sequentially processing the 3 quarter round
4598 // operations represented by one QUARTERROUND function, we instead stack all
4599 // the adds, xors and left-rotations from the first 4 quarter rounds together
4600 // and then do the same for the second set of 4 quarter rounds. This removes
4601 // some latency that would otherwise be incurred by waiting for an add to
4602 // complete before performing an xor (which depends on the result of the
4603 // add), etc. An adjustment happens between the first and second groups of 4
4604 // quarter rounds, but this is done only in the inputs to the macro functions
4605 // that generate the assembly instructions - these adjustments themselves are
4606 // not part of the resulting assembly.
4607 // The 4 registers v0-v3 are used during the quarter round operations as
4608 // scratch registers. Once the 20 rounds are complete, these 4 scratch
4609 // registers become the vectors involved in adding the start state back onto
4610 // the post-QR working state. After the adds are complete, each of the 16
4611 // vectors write their first lane back to the keystream buffer, followed
4612 // by the second lane from all vectors and so on.
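// For reference, the scalar quarter round being vectorized here is the one
// from RFC 7539 section 2.1 (a sketch; a, b, c, d are 32-bit words):
//
//   a += b; d ^= a; d = rol(d, 16);
//   c += d; b ^= c; b = rol(b, 12);
//   a += b; d ^= a; d = rol(d, 8);
//   c += d; b ^= c; b = rol(b, 7);
//
// Below, each of these lines is applied to four (a, b, c, d) register groups
// at once via the cc20_qr_* helpers.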
4613 address generate_chacha20Block_blockpar() {
4614 Label L_twoRounds, L_cc20_const;
4615 __ align(CodeEntryAlignment);
4616 StubId stub_id = StubId::stubgen_chacha20Block_id;
4617 StubCodeMark mark(this, stub_id);
4618 address start = __ pc();
4619 __ enter();
4620
4621 int i, j;
4622 const Register state = c_rarg0;
4623 const Register keystream = c_rarg1;
4624 const Register loopCtr = r10;
4625 const Register tmpAddr = r11;
4626 const FloatRegister ctrAddOverlay = v28;
4627 const FloatRegister lrot8Tbl = v29;
4628
4629 // Organize SIMD registers in an array that facilitates
4630 // putting repetitive opcodes into loop structures. It is
4631 // important that each grouping of 4 registers is monotonically
4632 // increasing to support the requirements of multi-register
4633 // instructions (e.g. ld4r, st4, etc.)
4634 const FloatRegister workSt[16] = {
4635 v4, v5, v6, v7, v16, v17, v18, v19,
4636 v20, v21, v22, v23, v24, v25, v26, v27
4637 };
4638
4639 // Pull in constant data. The first 16 bytes are the add overlay
4640 // which is applied to the vector holding the counter (state[12]).
4641 // The second 16 bytes is the index register for the 8-bit left
4642 // rotation tbl instruction.
4643 __ adr(tmpAddr, L_cc20_const);
4644 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
4645
4646 // Load from memory and interlace across 16 SIMD registers,
4647 // With each word from memory being broadcast to all lanes of
4648 // each successive SIMD register.
4649 // Addr(0) -> All lanes in workSt[i]
4650 // Addr(4) -> All lanes workSt[i + 1], etc.
4651 __ mov(tmpAddr, state);
4652 for (i = 0; i < 16; i += 4) {
4653 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4654 __ post(tmpAddr, 16));
4655 }
4656 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4657
4658 // Before entering the loop, create 5 4-register arrays. These
4659 // will hold the 4 registers that represent the a/b/c/d fields
4660 // in the quarter round operation. For instance the "b" field
4661 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
4662 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
4663 // since it is part of a diagonal organization. The aSet and scratch
4664 // register sets are defined at declaration time because they do not change
4665 // organization at any point during the 20-round processing.
4666 FloatRegister aSet[4] = { v4, v5, v6, v7 };
4667 FloatRegister bSet[4];
4668 FloatRegister cSet[4];
4669 FloatRegister dSet[4];
4670 FloatRegister scratch[4] = { v0, v1, v2, v3 };
4671
4672 // Set up the 10 iteration loop and perform all 8 quarter round ops
4673 __ mov(loopCtr, 10);
4674 __ BIND(L_twoRounds);
4675
4676 // Set to columnar organization and do the following 4 quarter-rounds:
4677 // QUARTERROUND(0, 4, 8, 12)
4678 // QUARTERROUND(1, 5, 9, 13)
4679 // QUARTERROUND(2, 6, 10, 14)
4680 // QUARTERROUND(3, 7, 11, 15)
4681 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
4682 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
4683 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
4684
4685 __ cc20_qr_add4(aSet, bSet); // a += b
4686 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4687 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4688
4689 __ cc20_qr_add4(cSet, dSet); // c += d
4690 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4691 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4692
4693 __ cc20_qr_add4(aSet, bSet); // a += b
4694 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4695 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4696
4697 __ cc20_qr_add4(cSet, dSet); // c += d
4698 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4699 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
4700
4701 // Set to diagonal organization and do the next 4 quarter-rounds:
4702 // QUARTERROUND(0, 5, 10, 15)
4703 // QUARTERROUND(1, 6, 11, 12)
4704 // QUARTERROUND(2, 7, 8, 13)
4705 // QUARTERROUND(3, 4, 9, 14)
4706 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
4707 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
4708 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
4709
4710 __ cc20_qr_add4(aSet, bSet); // a += b
4711 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4712 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4713
4714 __ cc20_qr_add4(cSet, dSet); // c += d
4715 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4716 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4717
4718 __ cc20_qr_add4(aSet, bSet); // a += b
4719 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4720 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4721
4722 __ cc20_qr_add4(cSet, dSet); // c += d
4723 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4724 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
4725
4726 // Decrement and iterate
4727 __ sub(loopCtr, loopCtr, 1);
4728 __ cbnz(loopCtr, L_twoRounds);
4729
4730 __ mov(tmpAddr, state);
4731
4732 // Add the starting state back to the post-loop keystream
4733 // state. We read/interlace the state array from memory into
4734 // 4 registers similar to what we did in the beginning. Then
4735 // add the counter overlay onto workSt[12] at the end.
4736 for (i = 0; i < 16; i += 4) {
4737 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
4738 __ addv(workSt[i], __ T4S, workSt[i], v0);
4739 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
4740 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
4741 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
4742 }
4743 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4744
4745 // Write working state into the keystream buffer. This is accomplished
4746 // by taking lane "i" from each group of four vectors and writing
4747 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
4748 // repeating with the next 4 vectors until all 16 vectors have been used.
4749 // Then move to the next lane and repeat the process until all lanes have
4750 // been written.
4751 for (i = 0; i < 4; i++) {
4752 for (j = 0; j < 16; j += 4) {
4753 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4754 __ post(keystream, 16));
4755 }
4756 }
4757
4758 __ mov(r0, 256); // Return length of output keystream
4759 __ leave();
4760 __ ret(lr);
4761
4762 // bind label and generate local constant data used by this stub
4763 // The constant data is broken into two 128-bit segments to be loaded
4764 // onto FloatRegisters. The first 128 bits are a counter add overlay
4765 // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4766 // The second 128 bits are a table constant used for 8-bit left rotations.
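// (Viewed as a byte-shuffle index vector, the rotation table maps each
// output 32-bit lane to input bytes 3,0,1,2 of the same lane, which is
// exactly a left rotation of that lane by 8 bits.)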
4767 __ BIND(L_cc20_const);
4768 __ emit_int64(0x0000000100000000UL);
4769 __ emit_int64(0x0000000300000002UL);
4770 __ emit_int64(0x0605040702010003UL);
4771 __ emit_int64(0x0E0D0C0F0A09080BUL);
4772
4773 return start;
4774 }
4775
4776 // Helpers to schedule parallel operation bundles across vector
4777 // register sequences of size 2, 4 or 8.
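//
// Each vs_<op> helper simply expands to N copies of the corresponding
// SIMD instruction, one per register of the sequence; for example an
// addv across VSeq<4> arguments emits four addv instructions. The
// asserts in each helper guard against an output register being
// written before a pending input has been consumed and against a
// multi-register output being mapped onto a constant (zero-delta)
// sequence.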
4778
4779 // Implement various primitive computations across vector sequences
4780
4781 template<int N>
4782 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4783 const VSeq<N>& v1, const VSeq<N>& v2) {
4784 // output must not be constant
4785 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4786 // output cannot overwrite pending inputs
4787 assert(!vs_write_before_read(v, v1), "output overwrites input");
4788 assert(!vs_write_before_read(v, v2), "output overwrites input");
4789 for (int i = 0; i < N; i++) {
4790 __ addv(v[i], T, v1[i], v2[i]);
4791 }
4792 }
4793
4794 template<int N>
4795 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4796 const VSeq<N>& v1, const VSeq<N>& v2) {
4797 // output must not be constant
4798 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4799 // output cannot overwrite pending inputs
4800 assert(!vs_write_before_read(v, v1), "output overwrites input");
4801 assert(!vs_write_before_read(v, v2), "output overwrites input");
4802 for (int i = 0; i < N; i++) {
4803 __ subv(v[i], T, v1[i], v2[i]);
4804 }
4805 }
4806
4807 template<int N>
4808 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4809 const VSeq<N>& v1, const VSeq<N>& v2) {
4810 // output must not be constant
4811 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4812 // output cannot overwrite pending inputs
4813 assert(!vs_write_before_read(v, v1), "output overwrites input");
4814 assert(!vs_write_before_read(v, v2), "output overwrites input");
4815 for (int i = 0; i < N; i++) {
4816 __ mulv(v[i], T, v1[i], v2[i]);
4817 }
4818 }
4819
4820 template<int N>
4821 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
4822 // output must not be constant
4823 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4824 // output cannot overwrite pending inputs
4825 assert(!vs_write_before_read(v, v1), "output overwrites input");
4826 for (int i = 0; i < N; i++) {
4827 __ negr(v[i], T, v1[i]);
4828 }
4829 }
4830
4831 template<int N>
4832 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4833 const VSeq<N>& v1, int shift) {
4834 // output must not be constant
4835 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4836 // output cannot overwrite pending inputs
4837 assert(!vs_write_before_read(v, v1), "output overwrites input");
4838 for (int i = 0; i < N; i++) {
4839 __ sshr(v[i], T, v1[i], shift);
4840 }
4841 }
4842
4843 template<int N>
4844 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4845 // output must not be constant
4846 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4847 // output cannot overwrite pending inputs
4848 assert(!vs_write_before_read(v, v1), "output overwrites input");
4849 assert(!vs_write_before_read(v, v2), "output overwrites input");
4850 for (int i = 0; i < N; i++) {
4851 __ andr(v[i], __ T16B, v1[i], v2[i]);
4852 }
4853 }
4854
4855 template<int N>
4856 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4857 // output must not be constant
4858 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4859 // output cannot overwrite pending inputs
4860 assert(!vs_write_before_read(v, v1), "output overwrites input");
4861 assert(!vs_write_before_read(v, v2), "output overwrites input");
4862 for (int i = 0; i < N; i++) {
4863 __ orr(v[i], __ T16B, v1[i], v2[i]);
4864 }
4865 }
4866
4867 template<int N>
4868 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
4869 // output must not be constant
4870 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4871 // output cannot overwrite pending inputs
4872 assert(!vs_write_before_read(v, v1), "output overwrites input");
4873 for (int i = 0; i < N; i++) {
4874 __ notr(v[i], __ T16B, v1[i]);
4875 }
4876 }
4877
4878 template<int N>
4879 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
4880 // output must not be constant
4881 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4882 // output cannot overwrite pending inputs
4883 assert(!vs_write_before_read(v, v1), "output overwrites input");
4884 assert(!vs_write_before_read(v, v2), "output overwrites input");
4885 for (int i = 0; i < N; i++) {
4886 __ sqdmulh(v[i], T, v1[i], v2[i]);
4887 }
4888 }
4889
4890 template<int N>
4891 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
4892 // output must not be constant
4893 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4894 // output cannot overwrite pending inputs
4895 assert(!vs_write_before_read(v, v1), "output overwrites input");
4896 assert(!vs_write_before_read(v, v2), "output overwrites input");
4897 for (int i = 0; i < N; i++) {
4898 __ mlsv(v[i], T, v1[i], v2[i]);
4899 }
4900 }
4901
4902 // load N/2 successive pairs of quadword values from memory in order
4903 // into N successive vector registers of the sequence via the
4904 // address supplied in base.
4905 template<int N>
4906 void vs_ldpq(const VSeq<N>& v, Register base) {
4907 for (int i = 0; i < N; i += 2) {
4908 __ ldpq(v[i], v[i+1], Address(base, 16 * i));
4909 }
4910 }
4911
4912 // load N/2 successive pairs of quadword values from memory in order
4913 // into N vector registers of the sequence via the address supplied
4914 // in base using post-increment addressing
4915 template<int N>
4916 void vs_ldpq_post(const VSeq<N>& v, Register base) {
4917 static_assert(N == ((N / 2) * 2), "sequence length must be even");
4918 for (int i = 0; i < N; i += 2) {
4919 __ ldpq(v[i], v[i+1], __ post(base, 32));
4920 }
4921 }
4922
4923 // store N successive vector registers of the sequence into N/2
4924 // successive pairs of quadword memory locations via the address
4925 // supplied in base using post-increment addressing
4926 template<int N>
4927 void vs_stpq_post(const VSeq<N>& v, Register base) {
4928 static_assert(N == ((N / 2) * 2), "sequence length must be even");
4929 for (int i = 0; i < N; i += 2) {
4930 __ stpq(v[i], v[i+1], __ post(base, 32));
4931 }
4932 }
4933
4934 // load N/2 pairs of quadword values from memory de-interleaved into
4935 // N vector registers 2 at a time via the address supplied in base
4936 // using post-increment addressing.
4937 template<int N>
4938 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4939 static_assert(N == ((N / 2) * 2), "sequence length must be even");
4940 for (int i = 0; i < N; i += 2) {
4941 __ ld2(v[i], v[i+1], T, __ post(base, 32));
4942 }
4943 }
4944
4945 // store N vector registers interleaved into N/2 pairs of quadword
4946 // memory locations via the address supplied in base using
4947 // post-increment addressing.
4948 template<int N>
4949 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4950 static_assert(N == ((N / 2) * 2), "sequence length must be even");
4951 for (int i = 0; i < N; i += 2) {
4952 __ st2(v[i], v[i+1], T, __ post(base, 32));
4953 }
4954 }
4955
4956 // load N quadword values from memory de-interleaved into N vector
4957 // registers 3 elements at a time via the address supplied in base.
4958 template<int N>
4959 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4960 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4961 for (int i = 0; i < N; i += 3) {
4962 __ ld3(v[i], v[i+1], v[i+2], T, base);
4963 }
4964 }
4965
4966 // load N quadword values from memory de-interleaved into N vector
4967 // registers 3 elements at a time via the address supplied in base
4968 // using post-increment addressing.
4969 template<int N>
4970 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4971 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4972 for (int i = 0; i < N; i += 3) {
4973 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
4974 }
4975 }
4976
4977 // load N/2 pairs of quadword values from memory into N vector
4978 // registers via the address supplied in base with each pair indexed
4979 // using the start offset plus the corresponding entry in the
4980 // offsets array
4981 template<int N>
4982 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
4983 for (int i = 0; i < N/2; i++) {
4984 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4985 }
4986 }
4987
4988 // store N vector registers into N/2 pairs of quadword memory
4989 // locations via the address supplied in base with each pair indexed
4990 // using the start offset plus the corresponding entry in the
4991 // offsets array
4992 template<int N>
4993 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
4994 for (int i = 0; i < N/2; i++) {
4995 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4996 }
4997 }
4998
4999 // load N single quadword values from memory into N vector registers
5000 // via the address supplied in base with each value indexed using
5001 // the start offset plus the corresponding entry in the offsets
5002 // array
5003 template<int N>
5004 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5005 int start, int (&offsets)[N]) {
5006 for (int i = 0; i < N; i++) {
5007 __ ldr(v[i], T, Address(base, start + offsets[i]));
5008 }
5009 }
5010
5011 // store N vector registers into N single quadword memory locations
5012 // via the address supplied in base with each value indexed using
5013 // the start offset plus the corresponding entry in the offsets
5014 // array
5015 template<int N>
5016 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5017 int start, int (&offsets)[N]) {
5018 for (int i = 0; i < N; i++) {
5019 __ str(v[i], T, Address(base, start + offsets[i]));
5020 }
5021 }
5022
5023 // load N/2 pairs of quadword values from memory de-interleaved into
5024 // N vector registers 2 at a time via the address supplied in base
5025 // with each pair indexed using the start offset plus the
5026 // corresponding entry in the offsets array
5027 template<int N>
5028 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5029 Register tmp, int start, int (&offsets)[N/2]) {
5030 for (int i = 0; i < N/2; i++) {
5031 __ add(tmp, base, start + offsets[i]);
5032 __ ld2(v[2*i], v[2*i+1], T, tmp);
5033 }
5034 }
5035
5036 // store N vector registers 2 at a time interleaved into N/2 pairs
5037 // of quadword memory locations via the address supplied in base
5038 // with each pair indexed using the start offset plus the
5039 // corresponding entry in the offsets array
5040 template<int N>
5041 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5042 Register tmp, int start, int (&offsets)[N/2]) {
5043 for (int i = 0; i < N/2; i++) {
5044 __ add(tmp, base, start + offsets[i]);
5045 __ st2(v[2*i], v[2*i+1], T, tmp);
5046 }
5047 }
5048
5049 // Helper routines for various flavours of Montgomery multiply
5050
5051 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
5052 // multiplications in parallel
5053 //
5054
5055 // See the montMul() method of the sun.security.provider.ML_DSA
5056 // class.
5057 //
5058 // Computes 4x4S results or 4x8H results
5059 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5060 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5061 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5062 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5063 // Outputs: va - 4x4S or 4x8H vector register sequences
5064 // vb, vc, vtmp and vq must all be disjoint
5065 // va must be disjoint from all other inputs/temps or must equal vc
5066 // va must have a non-zero delta i.e. it must not be a constant vseq.
5067 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
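// A rough per-lane scalar sketch of what is scheduled below (shown for
// the 16-bit case, R = 2^16; the 32-bit case is analogous):
//
//   aHigh = hi16(2 * b * c)          // sqdmulh
//   aLow  = lo16(b * c)              // mulv
//   m     = lo16(aLow * q_inv)       // mulv
//   n     = hi16(2 * m * q)          // sqdmulh
//   a     = (aHigh - n) / 2          // shsubv -- the Montgomery product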
5068 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5069 Assembler::SIMD_Arrangement T,
5070 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5071 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5072 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5073 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5074 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5075
5076 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5077 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5078
5079 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5080
5081 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5082 assert(vs_disjoint(va, vb), "va and vb overlap");
5083 assert(vs_disjoint(va, vq), "va and vq overlap");
5084 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5085 assert(!va.is_constant(), "output vector must identify 4 different registers");
5086
5087 // schedule 4 streams of instructions across the vector sequences
5088 for (int i = 0; i < 4; i++) {
5089 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5090 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5091 }
5092
5093 for (int i = 0; i < 4; i++) {
5094 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5095 }
5096
5097 for (int i = 0; i < 4; i++) {
5098 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5099 }
5100
5101 for (int i = 0; i < 4; i++) {
5102 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5103 }
5104 }
5105
5106 // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
5107 // multiplications in parallel
5108 //
5109
5110 // See the montMul() method of the sun.security.provider.ML_DSA
5111 // class.
5112 //
5113 // Computes 2x4S results or 2x8H results
5114 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5115 // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
5116 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5117 // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
5118 // Outputs: va - 2x4S or 2x8H vector register sequences
5119 // vb, vc, vtmp and vq must all be disjoint
5120 // va must be disjoint from all other inputs/temps or must equal vc
5121 // va must have a non-zero delta i.e. it must not be a constant vseq.
5122 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5123 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5124 Assembler::SIMD_Arrangement T,
5125 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5126 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5127 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5128 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5129 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5130
5131 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5132 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5133
5134 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5135
5136 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5137 assert(vs_disjoint(va, vb), "va and vb overlap");
5138 assert(vs_disjoint(va, vq), "va and vq overlap");
5139 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5140 assert(!va.is_constant(), "output vector must identify 2 different registers");
5141
5142 // schedule 2 streams of instructions across the vector sequences
5143 for (int i = 0; i < 2; i++) {
5144 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5145 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5146 }
5147
5148 for (int i = 0; i < 2; i++) {
5149 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5150 }
5151
5152 for (int i = 0; i < 2; i++) {
5153 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5154 }
5155
5156 for (int i = 0; i < 2; i++) {
5157 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5158 }
5159 }
5160
5161 // Perform 16 16-bit Montgomery multiplications in parallel.
5162 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5163 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5164 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5165 // It will assert that the register use is valid
5166 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5167 }
5168
5169 // Perform 32 16-bit Montgomery multiplications in parallel.
5170 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5171 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5172 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5173 // It will assert that the register use is valid
5174 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5175 }
5176
5177 // Perform 64 16-bit Montgomery multiplications in parallel.
5178 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
5179 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5180 // Schedule two successive 4x8H multiplies via the montmul helper
5181 // on the front and back halves of va, vb and vc. The helper will
5182 // assert that the register use has no overlap conflicts on each
5183 // individual call but we also need to ensure that the necessary
5184 // disjoint/equality constraints are met across both calls.
5185
5186 // vb, vc, vtmp and vq must be disjoint. va must either be
5187 // disjoint from all other registers or equal vc
5188
5189 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5190 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5191 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5192
5193 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5194 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5195
5196 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5197
5198 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5199 assert(vs_disjoint(va, vb), "va and vb overlap");
5200 assert(vs_disjoint(va, vq), "va and vq overlap");
5201 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5202
5203 // we multiply the front and back halves of each sequence 4 at a
5204 // time because
5205 //
5206 // 1) we are currently only able to get 4-way instruction
5207 // parallelism at best
5208 //
5209 // 2) we need registers for the constants in vq and temporary
5210 // scratch registers to hold intermediate results so vtmp can only
5211 // be a VSeq<4> which means we only have 4 scratch slots
5212
5213 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5214 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5215 }
5216
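// Montgomery multiply the 32 values in va1 by the corresponding values
// in vc (the zetas at the call sites below), then overwrite va1 with
// va0 - product and va0 with va0 + product. This is the add/sub step
// used by the later levels of the forward NTT. n.b. vc is used to hold
// the product and so is trashed by the call.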
5217 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5218 const VSeq<4>& vc,
5219 const VSeq<4>& vtmp,
5220 const VSeq<2>& vq) {
5221 // compute a = montmul(a1, c)
5222 kyber_montmul32(vc, va1, vc, vtmp, vq);
5223 // output a1 = a0 - a
5224 vs_subv(va1, __ T8H, va0, vc);
5225 // and a0 = a0 + a
5226 vs_addv(va0, __ T8H, va0, vc);
5227 }
5228
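// The inverse of the operation above: overwrite va0 with va0 + va1 and
// va1 with montmul(va0 - va1, vb). This is the add/sub step used by the
// early levels of the inverse NTT. vtmp1 holds the intermediate
// difference and vtmp2 is scratch for the montmul.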
5229 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5230 const VSeq<4>& vb,
5231 const VSeq<4>& vtmp1,
5232 const VSeq<4>& vtmp2,
5233 const VSeq<2>& vq) {
5234 // compute c = a0 - a1
5235 vs_subv(vtmp1, __ T8H, va0, va1);
5236 // output a0 = a0 + a1
5237 vs_addv(va0, __ T8H, va0, va1);
5238 // output a1 = b montmul c
5239 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5240 }
5241
5242 void load64shorts(const VSeq<8>& v, Register shorts) {
5243 vs_ldpq_post(v, shorts);
5244 }
5245
5246 void load32shorts(const VSeq<4>& v, Register shorts) {
5247 vs_ldpq_post(v, shorts);
5248 }
5249
5250 void store64shorts(VSeq<8> v, Register tmpAddr) {
5251 vs_stpq_post(v, tmpAddr);
5252 }
5253
5254 // Kyber NTT function.
5255 // Implements
5256 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5257 //
5258 // coeffs (short[256]) = c_rarg0
5259 // ntt_zetas (short[256]) = c_rarg1
5260 address generate_kyberNtt() {
5261
5262 __ align(CodeEntryAlignment);
5263 StubId stub_id = StubId::stubgen_kyberNtt_id;
5264 StubCodeMark mark(this, stub_id);
5265 address start = __ pc();
5266 __ enter();
5267
5268 const Register coeffs = c_rarg0;
5269 const Register zetas = c_rarg1;
5270
5271 const Register kyberConsts = r10;
5272 const Register tmpAddr = r11;
5273
5274 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5275 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5276 VSeq<2> vq(30); // n.b. constants overlap vs3
5277
5278 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5279 // load the montmul constants
5280 vs_ldpq(vq, kyberConsts);
5281
5282 // Each level corresponds to an iteration of the outermost loop of the
5283 // Java method seilerNTT(int[] coeffs). There are some differences
5284 // from what is done in the seilerNTT() method, though:
5285 // 1. The computation uses 16-bit signed values; we do not convert them
5286 // to ints here.
5287 // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
5288 // this array for each level, which makes it easier to fill up the
5289 // vector registers.
5290 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5291 // multiplications (that way there should not be any overflow during
5292 // the inverse NTT computation); here we use R = 2^16 so that we can
5293 // use the 16-bit arithmetic in the vector unit.
5294 //
5295 // On each level, we fill up the vector registers in such a way that the
5296 // array elements that need to be multiplied by the zetas go into one
5297 // set of vector registers while the corresponding ones that don't need to
5298 // be multiplied, go into another set.
5299 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5300 // registers interleaving the steps of 4 identical computations,
5301 // each done on 8 16-bit values per register.
5302
5303 // At levels 0-3 the coefficients multiplied by or added/subtracted
5304 // to the zetas occur in discrete blocks whose size is some multiple
5305 // of 32.
5306
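// At level 0 the butterflies pair coefficient j with coefficient
// j + 128. The coefficients are 16-bit, so the paired blocks sit 256
// bytes apart, which is why the loads and stores below address
// coeffs + 0/256 and then coeffs + 128/384.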
5307 // level 0
5308 __ add(tmpAddr, coeffs, 256);
5309 load64shorts(vs1, tmpAddr);
5310 load64shorts(vs2, zetas);
5311 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5312 __ add(tmpAddr, coeffs, 0);
5313 load64shorts(vs1, tmpAddr);
5314 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5315 vs_addv(vs1, __ T8H, vs1, vs2);
5316 __ add(tmpAddr, coeffs, 0);
5317 vs_stpq_post(vs1, tmpAddr);
5318 __ add(tmpAddr, coeffs, 256);
5319 vs_stpq_post(vs3, tmpAddr);
5320 // restore montmul constants
5321 vs_ldpq(vq, kyberConsts);
5322 load64shorts(vs1, tmpAddr);
5323 load64shorts(vs2, zetas);
5324 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5325 __ add(tmpAddr, coeffs, 128);
5326 load64shorts(vs1, tmpAddr);
5327 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5328 vs_addv(vs1, __ T8H, vs1, vs2);
5329 __ add(tmpAddr, coeffs, 128);
5330 store64shorts(vs1, tmpAddr);
5331 __ add(tmpAddr, coeffs, 384);
5332 store64shorts(vs3, tmpAddr);
5333
5334 // level 1
5335 // restore montmul constants
5336 vs_ldpq(vq, kyberConsts);
5337 __ add(tmpAddr, coeffs, 128);
5338 load64shorts(vs1, tmpAddr);
5339 load64shorts(vs2, zetas);
5340 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5341 __ add(tmpAddr, coeffs, 0);
5342 load64shorts(vs1, tmpAddr);
5343 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5344 vs_addv(vs1, __ T8H, vs1, vs2);
5345 __ add(tmpAddr, coeffs, 0);
5346 store64shorts(vs1, tmpAddr);
5347 store64shorts(vs3, tmpAddr);
5348 vs_ldpq(vq, kyberConsts);
5349 __ add(tmpAddr, coeffs, 384);
5350 load64shorts(vs1, tmpAddr);
5351 load64shorts(vs2, zetas);
5352 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5353 __ add(tmpAddr, coeffs, 256);
5354 load64shorts(vs1, tmpAddr);
5355 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5356 vs_addv(vs1, __ T8H, vs1, vs2);
5357 __ add(tmpAddr, coeffs, 256);
5358 store64shorts(vs1, tmpAddr);
5359 store64shorts(vs3, tmpAddr);
5360
5361 // level 2
5362 vs_ldpq(vq, kyberConsts);
5363 int offsets1[4] = { 0, 32, 128, 160 };
5364 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5365 load64shorts(vs2, zetas);
5366 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5367 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5368 // kyber_subv_addv64();
5369 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5370 vs_addv(vs1, __ T8H, vs1, vs2);
5371 __ add(tmpAddr, coeffs, 0);
5372 vs_stpq_post(vs_front(vs1), tmpAddr);
5373 vs_stpq_post(vs_front(vs3), tmpAddr);
5374 vs_stpq_post(vs_back(vs1), tmpAddr);
5375 vs_stpq_post(vs_back(vs3), tmpAddr);
5376 vs_ldpq(vq, kyberConsts);
5377 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5378 load64shorts(vs2, zetas);
5379 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5380 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5381 // kyber_subv_addv64();
5382 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5383 vs_addv(vs1, __ T8H, vs1, vs2);
5384 __ add(tmpAddr, coeffs, 256);
5385 vs_stpq_post(vs_front(vs1), tmpAddr);
5386 vs_stpq_post(vs_front(vs3), tmpAddr);
5387 vs_stpq_post(vs_back(vs1), tmpAddr);
5388 vs_stpq_post(vs_back(vs3), tmpAddr);
5389
5390 // level 3
5391 vs_ldpq(vq, kyberConsts);
5392 int offsets2[4] = { 0, 64, 128, 192 };
5393 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5394 load64shorts(vs2, zetas);
5395 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5396 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5397 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5398 vs_addv(vs1, __ T8H, vs1, vs2);
5399 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5400 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5401
5402 vs_ldpq(vq, kyberConsts);
5403 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5404 load64shorts(vs2, zetas);
5405 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5406 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5407 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5408 vs_addv(vs1, __ T8H, vs1, vs2);
5409 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5410 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5411
5412 // level 4
5413 // At level 4 coefficients occur in 8 discrete blocks of size 16
5414 // so they are loaded using an ldr at 8 distinct offsets.
5415
5416 vs_ldpq(vq, kyberConsts);
5417 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5418 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5419 load64shorts(vs2, zetas);
5420 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5421 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5422 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5423 vs_addv(vs1, __ T8H, vs1, vs2);
5424 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5425 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5426
5427 vs_ldpq(vq, kyberConsts);
5428 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5429 load64shorts(vs2, zetas);
5430 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5431 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5432 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5433 vs_addv(vs1, __ T8H, vs1, vs2);
5434 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5435 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5436
5437 // level 5
5438 // At level 5 related coefficients occur in discrete blocks of size 8 so
5439 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
5440
5441 vs_ldpq(vq, kyberConsts);
5442 int offsets4[4] = { 0, 32, 64, 96 };
5443 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5444 load32shorts(vs_front(vs2), zetas);
5445 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5446 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5447 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5448 load32shorts(vs_front(vs2), zetas);
5449 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5450 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5451 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5452 load32shorts(vs_front(vs2), zetas);
5453 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5454 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5455
5456 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5457 load32shorts(vs_front(vs2), zetas);
5458 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5459 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5460
5461 // level 6
5462 // At level 6 related coefficients occur in discrete blocks of size 4 so
5463 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
5464
5465 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5466 load32shorts(vs_front(vs2), zetas);
5467 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5468 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5469 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5470 // __ ldpq(v18, v19, __ post(zetas, 32));
5471 load32shorts(vs_front(vs2), zetas);
5472 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5473 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5474
5475 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5476 load32shorts(vs_front(vs2), zetas);
5477 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5478 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5479
5480 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5481 load32shorts(vs_front(vs2), zetas);
5482 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5483 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5484
5485 __ leave(); // required for proper stackwalking of RuntimeStub frame
5486 __ mov(r0, zr); // return 0
5487 __ ret(lr);
5488
5489 return start;
5490 }
5491
5492 // Kyber Inverse NTT function
5493 // Implements
5494 // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
5495 //
5496 // coeffs (short[256]) = c_rarg0
5497 // ntt_zetas (short[256]) = c_rarg1
5498 address generate_kyberInverseNtt() {
5499
5500 __ align(CodeEntryAlignment);
5501 StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
5502 StubCodeMark mark(this, stub_id);
5503 address start = __ pc();
5504 __ enter();
5505
5506 const Register coeffs = c_rarg0;
5507 const Register zetas = c_rarg1;
5508
5509 const Register kyberConsts = r10;
5510 const Register tmpAddr = r11;
5511 const Register tmpAddr2 = c_rarg2;
5512
5513 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5514 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5515 VSeq<2> vq(30); // n.b. constants overlap vs3
5516
5517 __ lea(kyberConsts,
5518 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5519
5520 // level 0
5521 // At level 0 related coefficients occur in discrete blocks of size 4 so
5522 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
5523
5524 vs_ldpq(vq, kyberConsts);
5525 int offsets4[4] = { 0, 32, 64, 96 };
5526 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5527 load32shorts(vs_front(vs2), zetas);
5528 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5529 vs_front(vs2), vs_back(vs2), vtmp, vq);
5530 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5531 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5532 load32shorts(vs_front(vs2), zetas);
5533 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5534 vs_front(vs2), vs_back(vs2), vtmp, vq);
5535 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5536 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5537 load32shorts(vs_front(vs2), zetas);
5538 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5539 vs_front(vs2), vs_back(vs2), vtmp, vq);
5540 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5541 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5542 load32shorts(vs_front(vs2), zetas);
5543 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5544 vs_front(vs2), vs_back(vs2), vtmp, vq);
5545 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5546
5547 // level 1
5548 // At level 1 related coefficients occur in discrete blocks of size 8 so
5549 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
5550
5551 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5552 load32shorts(vs_front(vs2), zetas);
5553 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5554 vs_front(vs2), vs_back(vs2), vtmp, vq);
5555 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5556 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5557 load32shorts(vs_front(vs2), zetas);
5558 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5559 vs_front(vs2), vs_back(vs2), vtmp, vq);
5560 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5561
5562 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5563 load32shorts(vs_front(vs2), zetas);
5564 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5565 vs_front(vs2), vs_back(vs2), vtmp, vq);
5566 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5567 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5568 load32shorts(vs_front(vs2), zetas);
5569 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5570 vs_front(vs2), vs_back(vs2), vtmp, vq);
5571 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5572
5573 // level 2
5574 // At level 2 coefficients occur in 8 discrete blocks of size 16
5575 // so they are loaded using an ldr at 8 distinct offsets.
5576
5577 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5578 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5579 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
5580 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5581 vs_subv(vs1, __ T8H, vs1, vs2);
5582 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
5583 load64shorts(vs2, zetas);
5584 vs_ldpq(vq, kyberConsts);
5585 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5586 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
5587
5588 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5589 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5590 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5591 vs_subv(vs1, __ T8H, vs1, vs2);
5592 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
5593 load64shorts(vs2, zetas);
5594 vs_ldpq(vq, kyberConsts);
5595 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5596 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5597
5598 // Barrett reduction at indexes where overflow may happen
5599
5600 // load q and the multiplier for the Barrett reduction
5601 __ add(tmpAddr, kyberConsts, 16);
5602 vs_ldpq(vq, tmpAddr);
5603
5604 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
5605 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
5606 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
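// Each reduction below computes, per 16-bit lane x,
//   t = hi16(2 * x * barrettMultiplier) >> 11   (an approximation of x / q)
//   x = x - t * q
// i.e. vq2 supplies the Barrett multiplier and vq1 supplies q, leaving
// a value congruent to x mod q that is small enough for the additions
// in the following levels.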
5607 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5608 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5609 vs_sshr(vs2, __ T8H, vs2, 11);
5610 vs_mlsv(vs1, __ T8H, vs2, vq1);
5611 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5612 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5613 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5614 vs_sshr(vs2, __ T8H, vs2, 11);
5615 vs_mlsv(vs1, __ T8H, vs2, vq1);
5616 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5617
5618 // level 3
5619 // From level 3 upwards coefficients occur in discrete blocks whose size is
5620 // some multiple of 32 so can be loaded using ldpq and suitable indexes.
5621
5622 int offsets2[4] = { 0, 64, 128, 192 };
5623 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5624 vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
5625 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5626 vs_subv(vs1, __ T8H, vs1, vs2);
5627 vs_stpq_indexed(vs3, coeffs, 0, offsets2);
5628 load64shorts(vs2, zetas);
5629 vs_ldpq(vq, kyberConsts);
5630 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5631 vs_stpq_indexed(vs2, coeffs, 32, offsets2);
5632
5633 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5634 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5635 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5636 vs_subv(vs1, __ T8H, vs1, vs2);
5637 vs_stpq_indexed(vs3, coeffs, 256, offsets2);
5638 load64shorts(vs2, zetas);
5639 vs_ldpq(vq, kyberConsts);
5640 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5641 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5642
5643 // level 4
5644
5645 int offsets1[4] = { 0, 32, 128, 160 };
5646 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5647 vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
5648 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5649 vs_subv(vs1, __ T8H, vs1, vs2);
5650 vs_stpq_indexed(vs3, coeffs, 0, offsets1);
5651 load64shorts(vs2, zetas);
5652 vs_ldpq(vq, kyberConsts);
5653 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5654 vs_stpq_indexed(vs2, coeffs, 64, offsets1);
5655
5656 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5657 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5658 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5659 vs_subv(vs1, __ T8H, vs1, vs2);
5660 vs_stpq_indexed(vs3, coeffs, 256, offsets1);
5661 load64shorts(vs2, zetas);
5662 vs_ldpq(vq, kyberConsts);
5663 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5664 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5665
5666 // level 5
5667
5668 __ add(tmpAddr, coeffs, 0);
5669 load64shorts(vs1, tmpAddr);
5670 __ add(tmpAddr, coeffs, 128);
5671 load64shorts(vs2, tmpAddr);
5672 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5673 vs_subv(vs1, __ T8H, vs1, vs2);
5674 __ add(tmpAddr, coeffs, 0);
5675 store64shorts(vs3, tmpAddr);
5676 load64shorts(vs2, zetas);
5677 vs_ldpq(vq, kyberConsts);
5678 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5679 __ add(tmpAddr, coeffs, 128);
5680 store64shorts(vs2, tmpAddr);
5681
5682 load64shorts(vs1, tmpAddr);
5683 __ add(tmpAddr, coeffs, 384);
5684 load64shorts(vs2, tmpAddr);
5685 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5686 vs_subv(vs1, __ T8H, vs1, vs2);
5687 __ add(tmpAddr, coeffs, 256);
5688 store64shorts(vs3, tmpAddr);
5689 load64shorts(vs2, zetas);
5690 vs_ldpq(vq, kyberConsts);
5691 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5692 __ add(tmpAddr, coeffs, 384);
5693 store64shorts(vs2, tmpAddr);
5694
5695 // Barrett reduction at indexes where overflow may happen
5696
5697 // load q and the multiplier for the Barrett reduction
5698 __ add(tmpAddr, kyberConsts, 16);
5699 vs_ldpq(vq, tmpAddr);
5700
5701 int offsets0[2] = { 0, 256 };
5702 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5703 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5704 vs_sshr(vs2, __ T8H, vs2, 11);
5705 vs_mlsv(vs1, __ T8H, vs2, vq1);
5706 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5707
5708 // level 6
5709
5710 __ add(tmpAddr, coeffs, 0);
5711 load64shorts(vs1, tmpAddr);
5712 __ add(tmpAddr, coeffs, 256);
5713 load64shorts(vs2, tmpAddr);
5714 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5715 vs_subv(vs1, __ T8H, vs1, vs2);
5716 __ add(tmpAddr, coeffs, 0);
5717 store64shorts(vs3, tmpAddr);
5718 load64shorts(vs2, zetas);
5719 vs_ldpq(vq, kyberConsts);
5720 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5721 __ add(tmpAddr, coeffs, 256);
5722 store64shorts(vs2, tmpAddr);
5723
5724 __ add(tmpAddr, coeffs, 128);
5725 load64shorts(vs1, tmpAddr);
5726 __ add(tmpAddr, coeffs, 384);
5727 load64shorts(vs2, tmpAddr);
5728 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5729 vs_subv(vs1, __ T8H, vs1, vs2);
5730 __ add(tmpAddr, coeffs, 128);
5731 store64shorts(vs3, tmpAddr);
5732 load64shorts(vs2, zetas);
5733 vs_ldpq(vq, kyberConsts);
5734 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5735 __ add(tmpAddr, coeffs, 384);
5736 store64shorts(vs2, tmpAddr);
5737
5738 // multiply by 2^-n
5739
5740 // load toMont(2^-n mod q)
5741 __ add(tmpAddr, kyberConsts, 48);
5742 __ ldr(v29, __ Q, tmpAddr);
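// Montgomery multiplying by toMont(2^-n mod q) scales each coefficient
// by 2^-n, the normalization required by the inverse NTT; the Montgomery
// factors cancel, so the result stays in the same representation as the
// input.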
5743
5744 vs_ldpq(vq, kyberConsts);
5745 __ add(tmpAddr, coeffs, 0);
5746 load64shorts(vs1, tmpAddr);
5747 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5748 __ add(tmpAddr, coeffs, 0);
5749 store64shorts(vs2, tmpAddr);
5750
5751 // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
5752 load64shorts(vs1, tmpAddr);
5753 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5754 __ add(tmpAddr, coeffs, 128);
5755 store64shorts(vs2, tmpAddr);
5756
5757 // now tmpAddr contains coeffs + 256
5758 load64shorts(vs1, tmpAddr);
5759 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5760 __ add(tmpAddr, coeffs, 256);
5761 store64shorts(vs2, tmpAddr);
5762
5763 // now tmpAddr contains coeffs + 384
5764 load64shorts(vs1, tmpAddr);
5765 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5766 __ add(tmpAddr, coeffs, 384);
5767 store64shorts(vs2, tmpAddr);
5768
5769 __ leave(); // required for proper stackwalking of RuntimeStub frame
5770 __ mov(r0, zr); // return 0
5771 __ ret(lr);
5772
5773 return start;
5774 }
5775
5776 // Kyber multiply polynomials in the NTT domain.
5777 // Implements
5778 // static int implKyberNttMult(
5779 // short[] result, short[] ntta, short[] nttb, short[] zetas) {}
5780 //
5781 // result (short[256]) = c_rarg0
5782 // ntta (short[256]) = c_rarg1
5783 // nttb (short[256]) = c_rarg2
5784 // zetas (short[128]) = c_rarg3
5785 address generate_kyberNttMult() {
5786
5787 __ align(CodeEntryAlignment);
5788 StubId stub_id = StubId::stubgen_kyberNttMult_id;
5789 StubCodeMark mark(this, stub_id);
5790 address start = __ pc();
5791 __ enter();
5792
5793 const Register result = c_rarg0;
5794 const Register ntta = c_rarg1;
5795 const Register nttb = c_rarg2;
5796 const Register zetas = c_rarg3;
5797
5798 const Register kyberConsts = r10;
5799 const Register limit = r11;
5800
5801 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
5802 VSeq<4> vs3(16), vs4(20);
5803 VSeq<2> vq(30); // pair of constants for montmul: q, qinv
5804 VSeq<2> vz(28); // pair of zetas
5805 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
5806
5807 __ lea(kyberConsts,
5808 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5809
5810 Label kyberNttMult_loop;
5811
5812 __ add(limit, result, 512);
5813
5814 // load q and qinv
5815 vs_ldpq(vq, kyberConsts);
5816
5817 // load R^2 mod q (to convert back from Montgomery representation)
5818 __ add(kyberConsts, kyberConsts, 64);
5819 __ ldr(v27, __ Q, kyberConsts);
5820
5821 __ BIND(kyberNttMult_loop);
5822
5823 // load 16 zetas
5824 vs_ldpq_post(vz, zetas);
5825
5826 // load 2 sets of 32 coefficients from the two input arrays
5827 // interleaved as shorts. i.e. pairs of shorts adjacent in memory
5828 // are striped across pairs of vector registers
5829 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
5830 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
5831 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
5832 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
5833
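// What follows implements, for each pair of adjacent coefficients, the
// Kyber base-case product in the NTT domain,
//   (a0 + a1*X) * (b0 + b1*X) mod (X^2 - zeta)
//       = (a0*b0 + a1*b1*zeta) + (a0*b1 + a1*b0)*X
// with every product computed as a Montgomery multiply.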
5834 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
5835 // i.e. montmul the first and second halves of vs1 in order and
5836 // then with one sequence reversed storing the two results in vs3
5837 //
5838 // vs3[0] <- montmul(a0, b0)
5839 // vs3[1] <- montmul(a1, b1)
5840 // vs3[2] <- montmul(a0, b1)
5841 // vs3[3] <- montmul(a1, b0)
5842 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
5843 kyber_montmul16(vs_back(vs3),
5844 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
5845
5846 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
5847 // i.e. montmul the first and second halves of vs4 in order and
5848 // then with one sequence reversed storing the two results in vs1
5849 //
5850 // vs1[0] <- montmul(a2, b2)
5851 // vs1[1] <- montmul(a3, b3)
5852 // vs1[2] <- montmul(a2, b3)
5853 // vs1[3] <- montmul(a3, b2)
5854 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
5855 kyber_montmul16(vs_back(vs1),
5856 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
5857
5858 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
5859 // We can schedule two montmuls at a time if we use a suitable vector
5860 // sequence <vs3[1], vs1[1]>.
5861 int delta = vs1[1]->encoding() - vs3[1]->encoding();
5862 VSeq<2> vs5(vs3[1], delta);
5863
5864 // vs3[1] <- montmul(montmul(a1, b1), z0)
5865 // vs1[1] <- montmul(montmul(a3, b3), z1)
5866 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
5867
5868 // add results in pairs storing in vs3
5869 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
5870 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
5871 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
5872
5873 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
5874 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
5875 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
5876
5877 // vs1 <- montmul(vs3, montRSquareModQ)
5878 kyber_montmul32(vs1, vs3, vc, vs2, vq);
5879
5880 // store back the two pairs of result vectors de-interleaved as 8H elements
5881 // i.e. storing each pairs of shorts striped across a register pair adjacent
5882 // in memory
5883 vs_st2_post(vs1, __ T8H, result);
5884
5885 __ cmp(result, limit);
5886 __ br(Assembler::NE, kyberNttMult_loop);
5887
5888 __ leave(); // required for proper stackwalking of RuntimeStub frame
5889 __ mov(r0, zr); // return 0
5890 __ ret(lr);
5891
5892 return start;
5893 }
5894
5895 // Kyber add 2 polynomials.
5896 // Implements
5897 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
5898 //
5899 // result (short[256]) = c_rarg0
5900 // a (short[256]) = c_rarg1
5901 // b (short[256]) = c_rarg2
5902 address generate_kyberAddPoly_2() {
5903
5904 __ align(CodeEntryAlignment);
5905 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
5906 StubCodeMark mark(this, stub_id);
5907 address start = __ pc();
5908 __ enter();
5909
5910 const Register result = c_rarg0;
5911 const Register a = c_rarg1;
5912 const Register b = c_rarg2;
5913
5914 const Register kyberConsts = r11;
5915
5916 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
5917 // So, we can load, add and store the data in 3 groups of 11,
5918 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
5919 // registers. A further constraint is that the mapping needs
5920 // to skip callee saves. So, we allocate the register
5921 // sequences using two 8 sequences, two 2 sequences and two
5922 // single registers.
5923 VSeq<8> vs1_1(0);
5924 VSeq<2> vs1_2(16);
5925 FloatRegister vs1_3 = v28;
5926 VSeq<8> vs2_1(18);
5927 VSeq<2> vs2_2(26);
5928 FloatRegister vs2_3 = v29;
5929
5930 // two constant vector sequences
5931 VSeq<8> vc_1(31, 0);
5932 VSeq<2> vc_2(31, 0);
5933
5934 FloatRegister vc_3 = v31;
5935 __ lea(kyberConsts,
5936 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5937
5938 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
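// n.b. the constant loaded above replicates q across all 8H lanes; it
// is added to every sum below, presumably to keep the results
// nonnegative when the inputs contain negative representatives.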
5939 for (int i = 0; i < 3; i++) {
5940 // load 80 or 88 values from a into vs1_1/2/3
5941 vs_ldpq_post(vs1_1, a);
5942 vs_ldpq_post(vs1_2, a);
5943 if (i < 2) {
5944 __ ldr(vs1_3, __ Q, __ post(a, 16));
5945 }
5946 // load 80 or 88 values from b into vs2_1/2/3
5947 vs_ldpq_post(vs2_1, b);
5948 vs_ldpq_post(vs2_2, b);
5949 if (i < 2) {
5950 __ ldr(vs2_3, __ Q, __ post(b, 16));
5951 }
5952 // sum 80 or 88 values across vs1 and vs2 into vs1
5953 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
5954 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
5955 if (i < 2) {
5956 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
5957 }
5958 // add constant to all 80 or 88 results
5959 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
5960 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
5961 if (i < 2) {
5962 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
5963 }
5964 // store 80 or 88 values
5965 vs_stpq_post(vs1_1, result);
5966 vs_stpq_post(vs1_2, result);
5967 if (i < 2) {
5968 __ str(vs1_3, __ Q, __ post(result, 16));
5969 }
5970 }
5971
5972 __ leave(); // required for proper stackwalking of RuntimeStub frame
5973 __ mov(r0, zr); // return 0
5974 __ ret(lr);
5975
5976 return start;
5977 }
5978
5979 // Kyber add 3 polynomials.
5980 // Implements
5981 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
5982 //
5983 // result (short[256]) = c_rarg0
5984 // a (short[256]) = c_rarg1
5985 // b (short[256]) = c_rarg2
5986 // c (short[256]) = c_rarg3
5987 address generate_kyberAddPoly_3() {
5988
5989 __ align(CodeEntryAlignment);
5990 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
5991 StubCodeMark mark(this, stub_id);
5992 address start = __ pc();
5993 __ enter();
5994
5995 const Register result = c_rarg0;
5996 const Register a = c_rarg1;
5997 const Register b = c_rarg2;
5998 const Register c = c_rarg3;
5999
6000 const Register kyberConsts = r11;
6001
6002 // As above we sum 256 sets of values in total i.e. 32 x 8H
6003 // quadwords. So, we can load, add and store the data in 3
6004 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6005 // of 10 or 11 registers. A further constraint is that the
6006 // mapping needs to skip callee saves. So, we allocate the
6007 // register sequences using two 8 sequences, two 2 sequences
6008 // and two single registers.
6009 VSeq<8> vs1_1(0);
6010 VSeq<2> vs1_2(16);
6011 FloatRegister vs1_3 = v28;
6012 VSeq<8> vs2_1(18);
6013 VSeq<2> vs2_2(26);
6014 FloatRegister vs2_3 = v29;
6015
6016 // two constant vector sequences
6017 VSeq<8> vc_1(31, 0);
6018 VSeq<2> vc_2(31, 0);
6019
6020 FloatRegister vc_3 = v31;
6021
6022 __ lea(kyberConsts,
6023 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6024
6025 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6026 for (int i = 0; i < 3; i++) {
6027 // load 80 or 88 values from a into vs1_1/2/3
6028 vs_ldpq_post(vs1_1, a);
6029 vs_ldpq_post(vs1_2, a);
6030 if (i < 2) {
6031 __ ldr(vs1_3, __ Q, __ post(a, 16));
6032 }
6033 // load 80 or 88 values from b into vs2_1/2/3
6034 vs_ldpq_post(vs2_1, b);
6035 vs_ldpq_post(vs2_2, b);
6036 if (i < 2) {
6037 __ ldr(vs2_3, __ Q, __ post(b, 16));
6038 }
6039 // sum 80 or 88 values across vs1 and vs2 into vs1
6040 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6041 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6042 if (i < 2) {
6043 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6044 }
6045 // load 80 or 88 values from c into vs2_1/2/3
6046 vs_ldpq_post(vs2_1, c);
6047 vs_ldpq_post(vs2_2, c);
6048 if (i < 2) {
6049 __ ldr(vs2_3, __ Q, __ post(c, 16));
6050 }
6051 // sum 80 or 88 values across vs1 and vs2 into vs1
6052 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6053 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6054 if (i < 2) {
6055 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6056 }
6057 // add constant to all 80 or 88 results
6058 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6059 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6060 if (i < 2) {
6061 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6062 }
6063 // store 80 or 88 values
6064 vs_stpq_post(vs1_1, result);
6065 vs_stpq_post(vs1_2, result);
6066 if (i < 2) {
6067 __ str(vs1_3, __ Q, __ post(result, 16));
6068 }
6069 }
6070
6071 __ leave(); // required for proper stackwalking of RuntimeStub frame
6072 __ mov(r0, zr); // return 0
6073 __ ret(lr);
6074
6075 return start;
6076 }
6077
6078 // Kyber parse XOF output to polynomial coefficient candidates
6079 // or decodePoly(12, ...).
6080 // Implements
6081 // static int implKyber12To16(
6082 // byte[] condensed, int index, short[] parsed, int parsedLength) {}
6083 //
6084 // (parsedLength or (parsedLength - 48) must be divisible by 64.)
6085 //
6086 // condensed (byte[]) = c_rarg0
6087 // condensedIndex = c_rarg1
6088 // parsed (short[112 or 256]) = c_rarg2
6089 // parsedLength (112 or 256) = c_rarg3
6090 address generate_kyber12To16() {
6091 Label L_F00, L_loop, L_end;
6092
6093 __ align(CodeEntryAlignment);
6094 StubId stub_id = StubId::stubgen_kyber12To16_id;
6095 StubCodeMark mark(this, stub_id);
6096 address start = __ pc();
6097 __ enter();
6098
6099 const Register condensed = c_rarg0;
6100 const Register condensedOffs = c_rarg1;
6101 const Register parsed = c_rarg2;
6102 const Register parsedLength = c_rarg3;
6103
6104 const Register tmpAddr = r11;
6105
6106 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
6107 // quadwords so we need a 6 vector sequence for the inputs.
6108 // Parsing produces 64 shorts, employing two 8 vector
6109 // sequences to store and combine the intermediate data.
6110 VSeq<6> vin(24);
6111 VSeq<8> va(0), vb(16);
6112
6113 __ adr(tmpAddr, L_F00);
6114 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6115 __ add(condensed, condensed, condensedOffs);
6116
6117 __ BIND(L_loop);
6118 // load 96 (6 x 16B) byte values
6119 vs_ld3_post(vin, __ T16B, condensed);
6120
6121 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6122 // holds 48 (16x3) contiguous bytes from memory striped
6123 // horizontally across each of the 16 byte lanes. Equivalently,
6124 // that is 16 pairs of 12-bit integers. Likewise the back half
6125 // holds the next 48 bytes in the same arrangement.
6126
6127 // Each vector in the front half can also be viewed as a vertical
6128 // strip across the 16 pairs of 12 bit integers. Each byte in
6129 // vin[0] stores the low 8 bits of the first int in a pair. Each
6130 // byte in vin[1] stores the high 4 bits of the first int and the
6131 // low 4 bits of the second int. Each byte in vin[2] stores the
6132 // high 8 bits of the second int. Likewise the vectors in second
6133 // half.
6134
6135 // Converting the data to 16-bit shorts requires first of all
6136 // expanding each of the 6 x 16B vectors into 6 corresponding
6137 // pairs of 8H vectors. Mask, shift and add operations on the
6138 // resulting vector pairs can be used to combine 4 and 8 bit
6139 // parts of related 8H vector elements.
6140 //
6141 // The middle vectors (vin[2] and vin[5]) are actually expanded
6142 // twice, one copy manipulated to provide the lower 4 bits
6143 // belonging to the first short in a pair and another copy
6144 // manipulated to provide the higher 4 bits belonging to the
6145 // second short in a pair. This is why the vector sequences va
6146 // and vb used to hold the expanded 8H elements are of length 8.
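// As a concrete example, for one byte triple (b0, b1, b2) the
// manipulations below produce
//   first  = b0 | ((b1 & 0xf) << 8)
//   second = (b1 >> 4) | (b2 << 4)
// i.e. the standard little-endian unpacking of two 12-bit values from
// three bytes.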
6147
6148 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6149 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6150 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6151 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6152 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6153 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6154 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6155 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6156
6157 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6158 // and vb[4:5]
6159 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6160 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6161 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6162 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6163 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6164 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6165
6166 // shift lo byte of copy 1 of the middle stripe into the high byte
6167 __ shl(va[2], __ T8H, va[2], 8);
6168 __ shl(va[3], __ T8H, va[3], 8);
6169 __ shl(vb[2], __ T8H, vb[2], 8);
6170 __ shl(vb[3], __ T8H, vb[3], 8);
6171
6172 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6173 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6174 // are in bit positions [4..11].
6175 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6176 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6177 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6178 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
6179
6180 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
6181 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
6182 // copy2
6183 __ andr(va[2], __ T16B, va[2], v31);
6184 __ andr(va[3], __ T16B, va[3], v31);
6185 __ ushr(va[4], __ T8H, va[4], 4);
6186 __ ushr(va[5], __ T8H, va[5], 4);
6187 __ andr(vb[2], __ T16B, vb[2], v31);
6188 __ andr(vb[3], __ T16B, vb[3], v31);
6189 __ ushr(vb[4], __ T8H, vb[4], 4);
6190 __ ushr(vb[5], __ T8H, vb[5], 4);
6191
6192 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
6193 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
6194 // n.b. the ordering ensures: i) inputs are consumed before they
6195 // are overwritten ii) the order of 16-bit results across successive
6196 // pairs of vectors in va and then vb reflects the order of the
6197 // corresponding 12-bit inputs
6198 __ addv(va[0], __ T8H, va[0], va[2]);
6199 __ addv(va[2], __ T8H, va[1], va[3]);
6200 __ addv(va[1], __ T8H, va[4], va[6]);
6201 __ addv(va[3], __ T8H, va[5], va[7]);
6202 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6203 __ addv(vb[2], __ T8H, vb[1], vb[3]);
6204 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6205 __ addv(vb[3], __ T8H, vb[5], vb[7]);
6206
6207 // store 64 results interleaved as shorts
6208 vs_st2_post(vs_front(va), __ T8H, parsed);
6209 vs_st2_post(vs_front(vb), __ T8H, parsed);
6210
6211 __ sub(parsedLength, parsedLength, 64);
6212 __ cmp(parsedLength, (u1)64);
6213 __ br(Assembler::GE, L_loop);
6214 __ cbz(parsedLength, L_end);
6215
// If anything is left it should be a final 72 bytes of input,
// i.e. a final 48 12-bit values. We handle this by loading
// 48 bytes into all 16B lanes of front(vin) and only 24
// bytes into the lower 8B lanes of back(vin).
6220 vs_ld3_post(vs_front(vin), __ T16B, condensed);
6221 vs_ld3(vs_back(vin), __ T8B, condensed);
6222
6223 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6224 // n.b. target elements 2 and 3 of va duplicate elements 4 and
6225 // 5 and target element 2 of vb duplicates element 4.
6226 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6227 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6228 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6229 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6230 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6231 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6232
6233 // This time expand just the lower 8 lanes
6234 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6235 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6236 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6237
6238 // shift lo byte of copy 1 of the middle stripe into the high byte
6239 __ shl(va[2], __ T8H, va[2], 8);
6240 __ shl(va[3], __ T8H, va[3], 8);
6241 __ shl(vb[2], __ T8H, vb[2], 8);
6242
6243 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
6244 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
6245 // int are in bit positions [4..11].
6246 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6247 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6248 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6249
6250 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
6251 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
6252 // copy2
6253 __ andr(va[2], __ T16B, va[2], v31);
6254 __ andr(va[3], __ T16B, va[3], v31);
6255 __ ushr(va[4], __ T8H, va[4], 4);
6256 __ ushr(va[5], __ T8H, va[5], 4);
6257 __ andr(vb[2], __ T16B, vb[2], v31);
6258 __ ushr(vb[4], __ T8H, vb[4], 4);
6259
6262 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
6263 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
6264
6265 // n.b. ordering ensures: i) inputs are consumed before they are
// overwritten ii) order of 16-bit results across successive
6267 // pairs of vectors in va and then lower half of vb reflects order
6268 // of corresponding 12-bit inputs
6269 __ addv(va[0], __ T8H, va[0], va[2]);
6270 __ addv(va[2], __ T8H, va[1], va[3]);
6271 __ addv(va[1], __ T8H, va[4], va[6]);
6272 __ addv(va[3], __ T8H, va[5], va[7]);
6273 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6274 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6275
6276 // store 48 results interleaved as shorts
6277 vs_st2_post(vs_front(va), __ T8H, parsed);
6278 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
6279
6280 __ BIND(L_end);
6281
6282 __ leave(); // required for proper stackwalking of RuntimeStub frame
6283 __ mov(r0, zr); // return 0
6284 __ ret(lr);
6285
6286 // bind label and generate constant data used by this stub
6287 __ BIND(L_F00);
6288 __ emit_int64(0x0f000f000f000f00);
6289 __ emit_int64(0x0f000f000f000f00);
6290
6291 return start;
6292 }
6293
6294 // Kyber Barrett reduce function.
6295 // Implements
6296 // static int implKyberBarrettReduce(short[] coeffs) {}
6297 //
6298 // coeffs (short[256]) = c_rarg0
6299 address generate_kyberBarrettReduce() {
6300
6301 __ align(CodeEntryAlignment);
6302 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6303 StubCodeMark mark(this, stub_id);
6304 address start = __ pc();
6305 __ enter();
6306
6307 const Register coeffs = c_rarg0;
6308
6309 const Register kyberConsts = r10;
6310 const Register result = r11;
6311
// As above we process a total of 256 values i.e. 32 x
// 8H quadwords. So, we load, reduce and store the data in 3
// groups of 11, 11 and 10 quadwords at a time i.e. we need to map sets
6315 // of 10 or 11 registers. A further constraint is that the
6316 // mapping needs to skip callee saves. So, we allocate the
6317 // register sequences using two 8 sequences, two 2 sequences
6318 // and two single registers.
6319 VSeq<8> vs1_1(0);
6320 VSeq<2> vs1_2(16);
6321 FloatRegister vs1_3 = v28;
6322 VSeq<8> vs2_1(18);
6323 VSeq<2> vs2_2(26);
6324 FloatRegister vs2_3 = v29;
6325
6326 // we also need a pair of corresponding constant sequences
6327
6328 VSeq<8> vc1_1(30, 0);
6329 VSeq<2> vc1_2(30, 0);
6330 FloatRegister vc1_3 = v30; // for kyber_q
6331
6332 VSeq<8> vc2_1(31, 0);
6333 VSeq<2> vc2_2(31, 0);
6334 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
6335
6336 __ add(result, coeffs, 0);
6337 __ lea(kyberConsts,
6338 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6339
6340 // load q and the multiplier for the Barrett reduction
6341 __ add(kyberConsts, kyberConsts, 16);
6342 __ ldpq(vc1_3, vc2_3, kyberConsts);
6343
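// For reference, the per-coefficient reduction performed by the loop
// below is, in scalar form (assuming the usual ML-KEM constants
// q = 3329 and barrettMultiplier = 20159 supplied via kyberConsts):
//   int16_t t = (int16_t)(((int32_t)c * barrettMultiplier) >> 26);
//   c = c - t * q;
// sqdmulh yields (2 * c * multiplier) >> 16, so the extra arithmetic
// shift right by 11 below gives (c * multiplier) >> 26.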
6344 for (int i = 0; i < 3; i++) {
6345 // load 80 or 88 coefficients
6346 vs_ldpq_post(vs1_1, coeffs);
6347 vs_ldpq_post(vs1_2, coeffs);
6348 if (i < 2) {
6349 __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
6350 }
6351
6352 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
6353 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
6354 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
6355 if (i < 2) {
6356 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
6357 }
6358
6359 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
6360 vs_sshr(vs2_1, __ T8H, vs2_1, 11);
6361 vs_sshr(vs2_2, __ T8H, vs2_2, 11);
6362 if (i < 2) {
6363 __ sshr(vs2_3, __ T8H, vs2_3, 11);
6364 }
6365
6366 // vs1 <- vs1 - vs2 * kyber_q
6367 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
6368 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
6369 if (i < 2) {
6370 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
6371 }
6372
6373 vs_stpq_post(vs1_1, result);
6374 vs_stpq_post(vs1_2, result);
6375 if (i < 2) {
6376 __ str(vs1_3, __ Q, __ post(result, 16));
6377 }
6378 }
6379
6380 __ leave(); // required for proper stackwalking of RuntimeStub frame
6381 __ mov(r0, zr); // return 0
6382 __ ret(lr);
6383
6384 return start;
6385 }
6386
6387
6388 // Dilithium-specific montmul helper routines that generate parallel
6389 // code for, respectively, a single 4x4s vector sequence montmul or
6390 // two such multiplies in a row.
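//
// For reference, the scalar operation being vectorized is the usual
// signed Montgomery reduction for the ML-DSA modulus (q = 8380417,
// R = 2^32; the names p, m and q_inv are illustrative only):
//   int64_t p = (int64_t)a * b;
//   int32_t m = (int32_t)p * q_inv;                     // low 32 bits (mod 2^32)
//   int32_t r = (int32_t)((p - (int64_t)m * q) >> 32);  // a * b * R^-1 mod q
// vq supplies q and q_inv to the vector helper.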
6391
6392 // Perform 16 32-bit Montgomery multiplications in parallel
6393 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
6394 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6395 // Use the helper routine to schedule a 4x4S Montgomery multiply.
6396 // It will assert that the register use is valid
6397 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
6398 }
6399
6400 // Perform 2x16 32-bit Montgomery multiplications in parallel
6401 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
6402 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6403 // Schedule two successive 4x4S multiplies via the montmul helper
6404 // on the front and back halves of va, vb and vc. The helper will
6405 // assert that the register use has no overlap conflicts on each
6406 // individual call but we also need to ensure that the necessary
6407 // disjoint/equality constraints are met across both calls.
6408
6409 // vb, vc, vtmp and vq must be disjoint. va must either be
6410 // disjoint from all other registers or equal vc
6411
6412 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6413 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6414 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6415
6416 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6417 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6418
6419 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6420
6421 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6422 assert(vs_disjoint(va, vb), "va and vb overlap");
6423 assert(vs_disjoint(va, vq), "va and vq overlap");
6424 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6425
6426 // We multiply the front and back halves of each sequence 4 at a
6427 // time because
6428 //
6429 // 1) we are currently only able to get 4-way instruction
6430 // parallelism at best
6431 //
6432 // 2) we need registers for the constants in vq and temporary
6433 // scratch registers to hold intermediate results so vtmp can only
6434 // be a VSeq<4> which means we only have 4 scratch slots.
6435
6436 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6437 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6438 }
6439
6440 // Perform combined montmul then add/sub on 4x4S vectors.
6441 void dilithium_montmul16_sub_add(
6442 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6443 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6444 // compute a = montmul(a1, c)
6445 dilithium_montmul16(vc, va1, vc, vtmp, vq);
// output a1 = a0 - a
6447 vs_subv(va1, __ T4S, va0, vc);
6448 // and a0 = a0 + a
6449 vs_addv(va0, __ T4S, va0, vc);
6450 }
6451
// Perform combined add/sub then montmul on 4x4S vectors.
6453 void dilithium_sub_add_montmul16(
6454 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6455 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6456 // compute c = a0 - a1
6457 vs_subv(vtmp1, __ T4S, va0, va1);
6458 // output a0 = a0 + a1
6459 vs_addv(va0, __ T4S, va0, va1);
6460 // output a1 = b montmul c
6461 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6462 }
6463
6464 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6465 // in the Java implementation come in sequences of at least 8, so we
6466 // can use ldpq to collect the corresponding data into pairs of vector
6467 // registers.
6468 // We collect the coefficients corresponding to the 'j+l' indexes into
6469 // the vector registers v0-v7, the zetas into the vector registers v16-v23
6470 // then we do the (Montgomery) multiplications by the zetas in parallel
6471 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6472 // v0-v7, then do the additions into v24-v31 and the subtractions into
6473 // v0-v7 and finally save the results back to the coeffs array.
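//
// In scalar terms each step is the usual forward-NTT (CT) butterfly,
// sketched here for reference:
//   t = montmul(zeta, coeffs[j + l]);
//   coeffs[j + l] = coeffs[j] - t;
//   coeffs[j]     = coeffs[j] + t;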
6474 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6475 const Register coeffs, const Register zetas) {
6476 int c1 = 0;
6477 int c2 = 512;
6478 int startIncr;
6479 // don't use callee save registers v8 - v15
6480 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6481 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6482 VSeq<2> vq(30); // n.b. constants overlap vs3
6483 int offsets[4] = { 0, 32, 64, 96 };
6484
6485 for (int level = 0; level < 5; level++) {
6486 int c1Start = c1;
6487 int c2Start = c2;
6488 if (level == 3) {
6489 offsets[1] = 32;
6490 offsets[2] = 128;
6491 offsets[3] = 160;
6492 } else if (level == 4) {
6493 offsets[1] = 64;
6494 offsets[2] = 128;
6495 offsets[3] = 192;
6496 }
6497
// For levels 0 - 4 we simply load 2 x 4 adjacent values at a
6499 // time at 4 different offsets and multiply them in order by the
6500 // next set of input values. So we employ indexed load and store
6501 // pair instructions with arrangement 4S.
6502 for (int i = 0; i < 4; i++) {
6503 // reload q and qinv
6504 vs_ldpq(vq, dilithiumConsts); // qInv, q
6505 // load 8x4S coefficients via second start pos == c2
6506 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
6507 // load next 8x4S inputs == b
6508 vs_ldpq_post(vs2, zetas);
6509 // compute a == c2 * b mod MONT_Q
6510 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6511 // load 8x4s coefficients via first start pos == c1
6512 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6513 // compute a1 = c1 + a
6514 vs_addv(vs3, __ T4S, vs1, vs2);
6515 // compute a2 = c1 - a
6516 vs_subv(vs1, __ T4S, vs1, vs2);
6517 // output a1 and a2
6518 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6519 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
6520
6521 int k = 4 * level + i;
6522
6523 if (k > 7) {
6524 startIncr = 256;
6525 } else if (k == 5) {
6526 startIncr = 384;
6527 } else {
6528 startIncr = 128;
6529 }
6530
6531 c1Start += startIncr;
6532 c2Start += startIncr;
6533 }
6534
6535 c2 /= 2;
6536 }
6537 }
6538
6539 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
6540 // Implements the method
//   static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
// of the sun.security.provider.ML_DSA class
6543 //
6544 // coeffs (int[256]) = c_rarg0
6545 // zetas (int[256]) = c_rarg1
6546 address generate_dilithiumAlmostNtt() {
6547
6548 __ align(CodeEntryAlignment);
6549 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
6550 StubCodeMark mark(this, stub_id);
6551 address start = __ pc();
6552 __ enter();
6553
6554 const Register coeffs = c_rarg0;
6555 const Register zetas = c_rarg1;
6556
6557 const Register tmpAddr = r9;
6558 const Register dilithiumConsts = r10;
6559 const Register result = r11;
6560 // don't use callee save registers v8 - v15
6561 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6562 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6563 VSeq<2> vq(30); // n.b. constants overlap vs3
6564 int offsets[4] = { 0, 32, 64, 96};
6565 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6566 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6567 __ add(result, coeffs, 0);
6568 __ lea(dilithiumConsts,
6569 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6570
6571 // Each level represents one iteration of the outer for loop of the Java version.
6572
6573 // level 0-4
6574 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
6575
6576 // level 5
6577
6578 // At level 5 the coefficients we need to combine with the zetas
6579 // are grouped in memory in blocks of size 4. So, for both sets of
6580 // coefficients we load 4 adjacent values at 8 different offsets
6581 // using an indexed ldr with register variant Q and multiply them
6582 // in sequence order by the next set of inputs. Likewise we store
// the results using an indexed str with register variant Q.
6584 for (int i = 0; i < 1024; i += 256) {
6585 // reload constants q, qinv each iteration as they get clobbered later
6586 vs_ldpq(vq, dilithiumConsts); // qInv, q
6587 // load 32 (8x4S) coefficients via first offsets = c1
6588 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6589 // load next 32 (8x4S) inputs = b
6590 vs_ldpq_post(vs2, zetas);
// a = b montmul c1
6592 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6593 // load 32 (8x4S) coefficients via second offsets = c2
6594 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
6595 // add/sub with result of multiply
vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a
vs_subv(vs1, __ T4S, vs1, vs2); // a0 = c2 - a
6598 // write back new coefficients using same offsets
6599 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
6600 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
6601 }
6602
6603 // level 6
6604 // At level 6 the coefficients we need to combine with the zetas
// are grouped in memory in pairs, the first pair being montmul
// inputs and the second pair add/sub inputs. We can still implement
6607 // the montmul+sub+add using 4-way parallelism but only if we
6608 // combine the coefficients with the zetas 16 at a time. We load 8
6609 // adjacent values at 4 different offsets using an ld2 load with
6610 // arrangement 2D. That interleaves the lower and upper halves of
6611 // each pair of quadwords into successive vector registers. We
6612 // then need to montmul the 4 even elements of the coefficients
6613 // register sequence by the zetas in order and then add/sub the 4
6614 // odd elements of the coefficients register sequence. We use an
6615 // equivalent st2 operation to store the results back into memory
6616 // de-interleaved.
6617 for (int i = 0; i < 1024; i += 128) {
6618 // reload constants q, qinv each iteration as they get clobbered later
6619 vs_ldpq(vq, dilithiumConsts); // qInv, q
6620 // load interleaved 16 (4x2D) coefficients via offsets
6621 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6622 // load next 16 (4x4S) inputs
6623 vs_ldpq_post(vs_front(vs2), zetas);
6624 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6625 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6626 vs_front(vs2), vtmp, vq);
6627 // store interleaved 16 (4x2D) coefficients via offsets
6628 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6629 }
6630
6631 // level 7
6632 // At level 7 the coefficients we need to combine with the zetas
// occur singly with montmul inputs alternating with add/sub
6634 // inputs. Once again we can use 4-way parallelism to combine 16
6635 // zetas at a time. However, we have to load 8 adjacent values at
6636 // 4 different offsets using an ld2 load with arrangement 4S. That
// interleaves the odd words of each pair into one
6638 // coefficients vector register and the even words of the pair
6639 // into the next register. We then need to montmul the 4 even
6640 // elements of the coefficients register sequence by the zetas in
6641 // order and then add/sub the 4 odd elements of the coefficients
6642 // register sequence. We use an equivalent st2 operation to store
6643 // the results back into memory de-interleaved.
6644
6645 for (int i = 0; i < 1024; i += 128) {
6646 // reload constants q, qinv each iteration as they get clobbered later
6647 vs_ldpq(vq, dilithiumConsts); // qInv, q
6648 // load interleaved 16 (4x4S) coefficients via offsets
6649 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6650 // load next 16 (4x4S) inputs
6651 vs_ldpq_post(vs_front(vs2), zetas);
6652 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6653 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6654 vs_front(vs2), vtmp, vq);
6655 // store interleaved 16 (4x4S) coefficients via offsets
6656 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6657 }
6658 __ leave(); // required for proper stackwalking of RuntimeStub frame
6659 __ mov(r0, zr); // return 0
6660 __ ret(lr);
6661
6662 return start;
6663 }
6664
6665 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6666 // in the Java implementation come in sequences of at least 8, so we
6667 // can use ldpq to collect the corresponding data into pairs of vector
6668 // registers
6669 // We collect the coefficients that correspond to the 'j's into vs1
// the coefficients that correspond to the 'j+l's into vs2 then
6671 // do the additions into vs3 and the subtractions into vs1 then
6672 // save the result of the additions, load the zetas into vs2
6673 // do the (Montgomery) multiplications by zeta in parallel into vs2
6674 // finally save the results back to the coeffs array
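//
// In scalar terms each step is the usual inverse-NTT (GS) butterfly,
// sketched here for reference:
//   t = coeffs[j];
//   coeffs[j]     = t + coeffs[j + l];
//   coeffs[j + l] = montmul(zeta, t - coeffs[j + l]);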
6675 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
6676 const Register coeffs, const Register zetas) {
6677 int c1 = 0;
6678 int c2 = 32;
6679 int startIncr;
6680 int offsets[4];
6681 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6682 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6683 VSeq<2> vq(30); // n.b. constants overlap vs3
6684
6685 offsets[0] = 0;
6686
6687 for (int level = 3; level < 8; level++) {
6688 int c1Start = c1;
6689 int c2Start = c2;
6690 if (level == 3) {
6691 offsets[1] = 64;
6692 offsets[2] = 128;
6693 offsets[3] = 192;
6694 } else if (level == 4) {
6695 offsets[1] = 32;
6696 offsets[2] = 128;
6697 offsets[3] = 160;
6698 } else {
6699 offsets[1] = 32;
6700 offsets[2] = 64;
6701 offsets[3] = 96;
6702 }
6703
6704 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
6705 // time at 4 different offsets and multiply them in order by the
6706 // next set of input values. So we employ indexed load and store
6707 // pair instructions with arrangement 4S.
6708 for (int i = 0; i < 4; i++) {
6709 // load v1 32 (8x4S) coefficients relative to first start index
6710 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6711 // load v2 32 (8x4S) coefficients relative to second start index
6712 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
// a0 = v1 + v2 -- n.b. clobbers vq which overlaps vs3
6714 vs_addv(vs3, __ T4S, vs1, vs2);
6715 // a1 = v1 - v2
6716 vs_subv(vs1, __ T4S, vs1, vs2);
// save a0 relative to first start index
6718 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6719 // load constants q, qinv each iteration as they get clobbered above
6720 vs_ldpq(vq, dilithiumConsts); // qInv, q
6721 // load b next 32 (8x4S) inputs
6722 vs_ldpq_post(vs2, zetas);
6723 // a = a1 montmul b
6724 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6725 // save a relative to second start index
6726 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
6727
6728 int k = 4 * level + i;
6729
6730 if (k < 24) {
6731 startIncr = 256;
6732 } else if (k == 25) {
6733 startIncr = 384;
6734 } else {
6735 startIncr = 128;
6736 }
6737
6738 c1Start += startIncr;
6739 c2Start += startIncr;
6740 }
6741
6742 c2 *= 2;
6743 }
6744 }
6745
6746 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
6747 // Implements the method
6748 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
6749 // the sun.security.provider.ML_DSA class.
6750 //
6751 // coeffs (int[256]) = c_rarg0
6752 // zetas (int[256]) = c_rarg1
6753 address generate_dilithiumAlmostInverseNtt() {
6754
6755 __ align(CodeEntryAlignment);
6756 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
6757 StubCodeMark mark(this, stub_id);
6758 address start = __ pc();
6759 __ enter();
6760
6761 const Register coeffs = c_rarg0;
6762 const Register zetas = c_rarg1;
6763
6764 const Register tmpAddr = r9;
6765 const Register dilithiumConsts = r10;
6766 const Register result = r11;
6767 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6768 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6769 VSeq<2> vq(30); // n.b. constants overlap vs3
6770 int offsets[4] = { 0, 32, 64, 96 };
6771 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6772 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6773
6774 __ add(result, coeffs, 0);
6775 __ lea(dilithiumConsts,
6776 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6777
6778 // Each level represents one iteration of the outer for loop of the Java version
6779
6780 // level 0
6781 // At level 0 we need to interleave adjacent quartets of
6782 // coefficients before we multiply and add/sub by the next 16
6783 // zetas just as we did for level 7 in the multiply code. So we
6784 // load and store the values using an ld2/st2 with arrangement 4S.
6785 for (int i = 0; i < 1024; i += 128) {
6786 // load constants q, qinv
6787 // n.b. this can be moved out of the loop as they do not get
6788 // clobbered by first two loops
6789 vs_ldpq(vq, dilithiumConsts); // qInv, q
6790 // a0/a1 load interleaved 32 (8x4S) coefficients
6791 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6792 // b load next 32 (8x4S) inputs
6793 vs_ldpq_post(vs_front(vs2), zetas);
6794 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6795 // n.b. second half of vs2 provides temporary register storage
6796 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6797 vs_front(vs2), vs_back(vs2), vtmp, vq);
6798 // a0/a1 store interleaved 32 (8x4S) coefficients
6799 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6800 }
6801
6802 // level 1
6803 // At level 1 we need to interleave pairs of adjacent pairs of
6804 // coefficients before we multiply by the next 16 zetas just as we
6805 // did for level 6 in the multiply code. So we load and store the
6806 // values an ld2/st2 with arrangement 2D.
6807 for (int i = 0; i < 1024; i += 128) {
6808 // a0/a1 load interleaved 32 (8x2D) coefficients
6809 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6810 // b load next 16 (4x4S) inputs
6811 vs_ldpq_post(vs_front(vs2), zetas);
6812 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6813 // n.b. second half of vs2 provides temporary register storage
6814 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6815 vs_front(vs2), vs_back(vs2), vtmp, vq);
6816 // a0/a1 store interleaved 32 (8x2D) coefficients
6817 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6818 }
6819
6820 // level 2
6821 // At level 2 coefficients come in blocks of 4. So, we load 4
6822 // adjacent coefficients at 8 distinct offsets for both the first
6823 // and second coefficient sequences, using an ldr with register
6824 // variant Q then combine them with next set of 32 zetas. Likewise
6825 // we store the results using an str with register variant Q.
6826 for (int i = 0; i < 1024; i += 256) {
6827 // c0 load 32 (8x4S) coefficients via first offsets
6828 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6829 // c1 load 32 (8x4S) coefficients via second offsets
vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
6831 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
6832 vs_addv(vs3, __ T4S, vs1, vs2);
6833 // c = c0 - c1
6834 vs_subv(vs1, __ T4S, vs1, vs2);
6835 // store a0 32 (8x4S) coefficients via first offsets
6836 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
6837 // b load 32 (8x4S) next inputs
6838 vs_ldpq_post(vs2, zetas);
6839 // reload constants q, qinv -- they were clobbered earlier
6840 vs_ldpq(vq, dilithiumConsts); // qInv, q
6841 // compute a1 = b montmul c
6842 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6843 // store a1 32 (8x4S) coefficients via second offsets
6844 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
6845 }
6846
6847 // level 3-7
6848 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
6849
6850 __ leave(); // required for proper stackwalking of RuntimeStub frame
6851 __ mov(r0, zr); // return 0
6852 __ ret(lr);
6853
6854 return start;
6855 }
6856
6857 // Dilithium multiply polynomials in the NTT domain.
6858 // Straightforward implementation of the method
6859 // static int implDilithiumNttMult(
//   int[] result, int[] ntta, int[] nttb) {} of
6861 // the sun.security.provider.ML_DSA class.
6862 //
6863 // result (int[256]) = c_rarg0
6864 // poly1 (int[256]) = c_rarg1
6865 // poly2 (int[256]) = c_rarg2
6866 address generate_dilithiumNttMult() {
6867
6868 __ align(CodeEntryAlignment);
6869 StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
6870 StubCodeMark mark(this, stub_id);
6871 address start = __ pc();
6872 __ enter();
6873
6874 Label L_loop;
6875
6876 const Register result = c_rarg0;
6877 const Register poly1 = c_rarg1;
6878 const Register poly2 = c_rarg2;
6879
6880 const Register dilithiumConsts = r10;
6881 const Register len = r11;
6882
6883 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6884 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6885 VSeq<2> vq(30); // n.b. constants overlap vs3
6886 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
6887
6888 __ lea(dilithiumConsts,
6889 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6890
6891 // load constants q, qinv
6892 vs_ldpq(vq, dilithiumConsts); // qInv, q
6893 // load constant rSquare into v29
6894 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
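// n.b. with montmul(x, y) == x * y * R^-1 mod q, chaining the product
// montmul(a, b) into a second montmul by rSquare == R^2 mod q cancels
// the extra R^-1 factor, so each output coefficient is a * b mod q (up
// to the usual signed Montgomery range).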
6895
6896 __ mov(len, zr);
6897 __ add(len, len, 1024);
6898
6899 __ BIND(L_loop);
6900
6901 // b load 32 (8x4S) next inputs from poly1
6902 vs_ldpq_post(vs1, poly1);
6903 // c load 32 (8x4S) next inputs from poly2
6904 vs_ldpq_post(vs2, poly2);
6905 // compute a = b montmul c
6906 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6907 // compute a = rsquare montmul a
6908 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
6909 // save a 32 (8x4S) results
6910 vs_stpq_post(vs2, result);
6911
6912 __ sub(len, len, 128);
6913 __ cmp(len, (u1)128);
6914 __ br(Assembler::GE, L_loop);
6915
6916 __ leave(); // required for proper stackwalking of RuntimeStub frame
6917 __ mov(r0, zr); // return 0
6918 __ ret(lr);
6919
6920 return start;
6921 }
6922
// Dilithium Montgomery multiply an array by a constant.
6924 // A straightforward implementation of the method
6925 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
// of the sun.security.provider.ML_DSA class
6927 //
6928 // coeffs (int[256]) = c_rarg0
6929 // constant (int) = c_rarg1
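//
// Each output coefficient is montmul(constant, coeffs[i]), i.e.
// coeffs[i] * constant * R^-1 mod q under the Montgomery convention
// sketched above.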
6930 address generate_dilithiumMontMulByConstant() {
6931
6932 __ align(CodeEntryAlignment);
6933 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
6934 StubCodeMark mark(this, stub_id);
6935 address start = __ pc();
6936 __ enter();
6937
6938 Label L_loop;
6939
6940 const Register coeffs = c_rarg0;
6941 const Register constant = c_rarg1;
6942
6943 const Register dilithiumConsts = r10;
6944 const Register result = r11;
6945 const Register len = r12;
6946
6947 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6948 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6949 VSeq<2> vq(30); // n.b. constants overlap vs3
6950 VSeq<8> vconst(29, 0); // for montmul by constant
6951
6952 // results track inputs
6953 __ add(result, coeffs, 0);
6954 __ lea(dilithiumConsts,
6955 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6956
// load constants q, qinv -- they do not get clobbered inside the loop
6958 vs_ldpq(vq, dilithiumConsts); // qInv, q
6959 // copy caller supplied constant across vconst
6960 __ dup(vconst[0], __ T4S, constant);
6961 __ mov(len, zr);
6962 __ add(len, len, 1024);
6963
6964 __ BIND(L_loop);
6965
6966 // load next 32 inputs
6967 vs_ldpq_post(vs2, coeffs);
6968 // mont mul by constant
6969 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
6970 // write next 32 results
6971 vs_stpq_post(vs2, result);
6972
6973 __ sub(len, len, 128);
6974 __ cmp(len, (u1)128);
6975 __ br(Assembler::GE, L_loop);
6976
6977 __ leave(); // required for proper stackwalking of RuntimeStub frame
6978 __ mov(r0, zr); // return 0
6979 __ ret(lr);
6980
6981 return start;
6982 }
6983
6984 // Dilithium decompose poly.
6985 // Implements the method
//   static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
//       int[] highPart, int twoGamma2, int multiplier) {}
6987 // of the sun.security.provider.ML_DSA class
6988 //
6989 // input (int[256]) = c_rarg0
6990 // lowPart (int[256]) = c_rarg1
6991 // highPart (int[256]) = c_rarg2
6992 // twoGamma2 (int) = c_rarg3
6993 // multiplier (int) = c_rarg4
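//
// For reference, decompose splits each coefficient r (mod q) as
// r = r1 * (2 * gamma2) + r0 with r0 in (-gamma2, gamma2], treating
// the corner case r - r0 == q - 1 specially (r1 = 0, r0 = r0 - 1).
// The loop below follows the branch-free formulation of the Java code.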
6994 address generate_dilithiumDecomposePoly() {
6995
6996 __ align(CodeEntryAlignment);
6997 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
6998 StubCodeMark mark(this, stub_id);
6999 address start = __ pc();
7000 Label L_loop;
7001
7002 const Register input = c_rarg0;
7003 const Register lowPart = c_rarg1;
7004 const Register highPart = c_rarg2;
7005 const Register twoGamma2 = c_rarg3;
7006 const Register multiplier = c_rarg4;
7007
7008 const Register len = r9;
7009 const Register dilithiumConsts = r10;
7010 const Register tmp = r11;
7011
7012 // 6 independent sets of 4x4s values
7013 VSeq<4> vs1(0), vs2(4), vs3(8);
7014 VSeq<4> vs4(12), vs5(16), vtmp(20);
7015
7016 // 7 constants for cross-multiplying
7017 VSeq<4> one(25, 0);
7018 VSeq<4> qminus1(26, 0);
7019 VSeq<4> g2(27, 0);
7020 VSeq<4> twog2(28, 0);
7021 VSeq<4> mult(29, 0);
7022 VSeq<4> q(30, 0);
7023 VSeq<4> qadd(31, 0);
7024
7025 __ enter();
7026
7027 __ lea(dilithiumConsts,
7028 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7029
// save callee-saved vector registers
7031 __ stpd(v8, v9, __ pre(sp, -64));
7032 __ stpd(v10, v11, Address(sp, 16));
7033 __ stpd(v12, v13, Address(sp, 32));
7034 __ stpd(v14, v15, Address(sp, 48));
7035
7036 // populate constant registers
7037 __ mov(tmp, zr);
7038 __ add(tmp, tmp, 1);
7039 __ dup(one[0], __ T4S, tmp); // 1
7040 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
7041 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
7042 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
__ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma2 reduce
7044 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
7045 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
7046
7047 __ mov(len, zr);
7048 __ add(len, len, 1024);
7049
7050 __ BIND(L_loop);
7051
7052 // load next 4x4S inputs interleaved: rplus --> vs1
7053 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
7054
7055 // rplus = rplus - ((rplus + qadd) >> 23) * q
7056 vs_addv(vtmp, __ T4S, vs1, qadd);
7057 vs_sshr(vtmp, __ T4S, vtmp, 23);
7058 vs_mulv(vtmp, __ T4S, vtmp, q);
7059 vs_subv(vs1, __ T4S, vs1, vtmp);
7060
7061 // rplus = rplus + ((rplus >> 31) & dilithium_q);
7062 vs_sshr(vtmp, __ T4S, vs1, 31);
7063 vs_andr(vtmp, vtmp, q);
7064 vs_addv(vs1, __ T4S, vs1, vtmp);
7065
7066 // quotient --> vs2
7067 // int quotient = (rplus * multiplier) >> 22;
7068 vs_mulv(vtmp, __ T4S, vs1, mult);
7069 vs_sshr(vs2, __ T4S, vtmp, 22);
7070
7071 // r0 --> vs3
7072 // int r0 = rplus - quotient * twoGamma2;
7073 vs_mulv(vtmp, __ T4S, vs2, twog2);
7074 vs_subv(vs3, __ T4S, vs1, vtmp);
7075
7076 // mask --> vs4
7077 // int mask = (twoGamma2 - r0) >> 22;
7078 vs_subv(vtmp, __ T4S, twog2, vs3);
7079 vs_sshr(vs4, __ T4S, vtmp, 22);
7080
7081 // r0 -= (mask & twoGamma2);
7082 vs_andr(vtmp, vs4, twog2);
7083 vs_subv(vs3, __ T4S, vs3, vtmp);
7084
7085 // quotient += (mask & 1);
7086 vs_andr(vtmp, vs4, one);
7087 vs_addv(vs2, __ T4S, vs2, vtmp);
7088
7089 // mask = (twoGamma2 / 2 - r0) >> 31;
7090 vs_subv(vtmp, __ T4S, g2, vs3);
7091 vs_sshr(vs4, __ T4S, vtmp, 31);
7092
7093 // r0 -= (mask & twoGamma2);
7094 vs_andr(vtmp, vs4, twog2);
7095 vs_subv(vs3, __ T4S, vs3, vtmp);
7096
7097 // quotient += (mask & 1);
7098 vs_andr(vtmp, vs4, one);
7099 vs_addv(vs2, __ T4S, vs2, vtmp);
7100
7101 // r1 --> vs5
7102 // int r1 = rplus - r0 - (dilithium_q - 1);
7103 vs_subv(vtmp, __ T4S, vs1, vs3);
7104 vs_subv(vs5, __ T4S, vtmp, qminus1);
7105
7106 // r1 --> vs1 (overwriting rplus)
7107 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
7108 vs_negr(vtmp, __ T4S, vs5);
7109 vs_orr(vtmp, vs5, vtmp);
7110 vs_sshr(vs1, __ T4S, vtmp, 31);
7111
7112 // r0 += ~r1;
7113 vs_notr(vtmp, vs1);
7114 vs_addv(vs3, __ T4S, vs3, vtmp);
7115
7116 // r1 = r1 & quotient;
7117 vs_andr(vs1, vs2, vs1);
7118
// store results interleaved
7120 // lowPart[m] = r0;
7121 // highPart[m] = r1;
7122 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
7123 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
7124
7125 __ sub(len, len, 64);
7126 __ cmp(len, (u1)64);
7127 __ br(Assembler::GE, L_loop);
7128
7129 // restore callee-saved vector registers
7130 __ ldpd(v14, v15, Address(sp, 48));
7131 __ ldpd(v12, v13, Address(sp, 32));
7132 __ ldpd(v10, v11, Address(sp, 16));
7133 __ ldpd(v8, v9, __ post(sp, 64));
7134
7135 __ leave(); // required for proper stackwalking of RuntimeStub frame
7136 __ mov(r0, zr); // return 0
7137 __ ret(lr);
7138
7139 return start;
7140 }
7141
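// Keccak chi step on one 5-lane row: a[i] ^= ~a[i+1] & a[i+2]
// (indices mod 5), using three scratch registers so that every input
// is read before it is overwritten.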
7142 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
7143 Register tmp0, Register tmp1, Register tmp2) {
7144 __ bic(tmp0, a2, a1); // for a0
7145 __ bic(tmp1, a3, a2); // for a1
7146 __ bic(tmp2, a4, a3); // for a2
7147 __ eor(a2, a2, tmp2);
7148 __ bic(tmp2, a0, a4); // for a3
7149 __ eor(a3, a3, tmp2);
7150 __ bic(tmp2, a1, a0); // for a4
7151 __ eor(a0, a0, tmp0);
7152 __ eor(a1, a1, tmp1);
7153 __ eor(a4, a4, tmp2);
7154 }
7155
7156 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
7157 Register a0, Register a1, Register a2, Register a3, Register a4,
7158 Register a5, Register a6, Register a7, Register a8, Register a9,
7159 Register a10, Register a11, Register a12, Register a13, Register a14,
7160 Register a15, Register a16, Register a17, Register a18, Register a19,
7161 Register a20, Register a21, Register a22, Register a23, Register a24,
7162 Register tmp0, Register tmp1, Register tmp2) {
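// One round of Keccak-f[1600] on the 25 lanes held in gprs: the
// eor3/rax1 block below is the theta step, the chain of rotates
// implements rho and pi combined, bcax5 applies chi row by row and
// the final eor with the loaded round constant is iota.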
7163 __ eor3(tmp1, a4, a9, a14);
7164 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
7165 __ eor3(tmp2, a1, a6, a11);
7166 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
7167 __ rax1(tmp2, tmp0, tmp1); // d0
7168 {
7169
7170 Register tmp3, tmp4;
7171 if (can_use_fp && can_use_r18) {
7172 tmp3 = rfp;
7173 tmp4 = r18_tls;
7174 } else {
7175 tmp3 = a4;
7176 tmp4 = a9;
7177 __ stp(tmp3, tmp4, __ pre(sp, -16));
7178 }
7179
7180 __ eor3(tmp3, a0, a5, a10);
7181 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
7182 __ eor(a0, a0, tmp2);
7183 __ eor(a5, a5, tmp2);
7184 __ eor(a10, a10, tmp2);
7185 __ eor(a15, a15, tmp2);
7186 __ eor(a20, a20, tmp2); // d0(tmp2)
7187 __ eor3(tmp3, a2, a7, a12);
7188 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
7189 __ rax1(tmp3, tmp4, tmp2); // d1
7190 __ eor(a1, a1, tmp3);
7191 __ eor(a6, a6, tmp3);
7192 __ eor(a11, a11, tmp3);
7193 __ eor(a16, a16, tmp3);
7194 __ eor(a21, a21, tmp3); // d1(tmp3)
7195 __ rax1(tmp3, tmp2, tmp0); // d3
7196 __ eor3(tmp2, a3, a8, a13);
7197 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
7198 __ eor(a3, a3, tmp3);
7199 __ eor(a8, a8, tmp3);
7200 __ eor(a13, a13, tmp3);
7201 __ eor(a18, a18, tmp3);
7202 __ eor(a23, a23, tmp3);
7203 __ rax1(tmp2, tmp1, tmp0); // d2
7204 __ eor(a2, a2, tmp2);
7205 __ eor(a7, a7, tmp2);
7206 __ eor(a12, a12, tmp2);
7207 __ rax1(tmp0, tmp0, tmp4); // d4
7208 if (!can_use_fp || !can_use_r18) {
7209 __ ldp(tmp3, tmp4, __ post(sp, 16));
7210 }
7211 __ eor(a17, a17, tmp2);
7212 __ eor(a22, a22, tmp2);
7213 __ eor(a4, a4, tmp0);
7214 __ eor(a9, a9, tmp0);
7215 __ eor(a14, a14, tmp0);
7216 __ eor(a19, a19, tmp0);
7217 __ eor(a24, a24, tmp0);
7218 }
7219
7220 __ rol(tmp0, a10, 3);
7221 __ rol(a10, a1, 1);
7222 __ rol(a1, a6, 44);
7223 __ rol(a6, a9, 20);
7224 __ rol(a9, a22, 61);
7225 __ rol(a22, a14, 39);
7226 __ rol(a14, a20, 18);
7227 __ rol(a20, a2, 62);
7228 __ rol(a2, a12, 43);
7229 __ rol(a12, a13, 25);
__ rol(a13, a19, 8);
7231 __ rol(a19, a23, 56);
7232 __ rol(a23, a15, 41);
7233 __ rol(a15, a4, 27);
7234 __ rol(a4, a24, 14);
7235 __ rol(a24, a21, 2);
7236 __ rol(a21, a8, 55);
7237 __ rol(a8, a16, 45);
7238 __ rol(a16, a5, 36);
7239 __ rol(a5, a3, 28);
7240 __ rol(a3, a18, 21);
7241 __ rol(a18, a17, 15);
7242 __ rol(a17, a11, 10);
7243 __ rol(a11, a7, 6);
7244 __ mov(a7, tmp0);
7245
7246 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
7247 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
7248 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
7249 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
7250 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
7251
7252 __ ldr(tmp1, __ post(rc, 8));
7253 __ eor(a0, a0, tmp1);
7254
7255 }
7256
7257 // Arguments:
7258 //
7259 // Inputs:
7260 // c_rarg0 - byte[] source+offset
7261 // c_rarg1 - byte[] SHA.state
7262 // c_rarg2 - int block_size
7263 // c_rarg3 - int offset
7264 // c_rarg4 - int limit
7265 //
7266 address generate_sha3_implCompress_gpr(StubId stub_id) {
7267 bool multi_block;
7268 switch (stub_id) {
7269 case StubId::stubgen_sha3_implCompress_id:
7270 multi_block = false;
7271 break;
7272 case StubId::stubgen_sha3_implCompressMB_id:
7273 multi_block = true;
7274 break;
7275 default:
7276 ShouldNotReachHere();
7277 }
7278
7279 static const uint64_t round_consts[24] = {
7280 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
7281 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
7282 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
7283 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
7284 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
7285 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
7286 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
7287 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
7288 };
7289
7290 __ align(CodeEntryAlignment);
7291 StubCodeMark mark(this, stub_id);
7292 address start = __ pc();
7293
7294 Register buf = c_rarg0;
7295 Register state = c_rarg1;
7296 Register block_size = c_rarg2;
7297 Register ofs = c_rarg3;
7298 Register limit = c_rarg4;
7299
// use r3..r17, r19..r28 to keep a0..a24.
7301 // a0..a24 are respective locals from SHA3.java
7302 Register a0 = r25,
7303 a1 = r26,
7304 a2 = r27,
7305 a3 = r3,
7306 a4 = r4,
7307 a5 = r5,
7308 a6 = r6,
7309 a7 = r7,
7310 a8 = rscratch1, // r8
7311 a9 = rscratch2, // r9
7312 a10 = r10,
7313 a11 = r11,
7314 a12 = r12,
7315 a13 = r13,
7316 a14 = r14,
7317 a15 = r15,
7318 a16 = r16,
7319 a17 = r17,
7320 a18 = r28,
7321 a19 = r19,
7322 a20 = r20,
7323 a21 = r21,
7324 a22 = r22,
7325 a23 = r23,
7326 a24 = r24;
7327
7328 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
7329
7330 Label sha3_loop, rounds24_preloop, loop_body;
7331 Label sha3_512_or_sha3_384, shake128;
7332
7333 bool can_use_r18 = false;
7334 #ifndef R18_RESERVED
7335 can_use_r18 = true;
7336 #endif
7337 bool can_use_fp = !PreserveFramePointer;
7338
7339 __ enter();
7340
// save almost all of the as-yet unsaved gpr registers on the stack
7342 __ str(block_size, __ pre(sp, -128));
7343 if (multi_block) {
7344 __ stpw(ofs, limit, Address(sp, 8));
7345 }
7346 // 8 bytes at sp+16 will be used to keep buf
7347 __ stp(r19, r20, Address(sp, 32));
7348 __ stp(r21, r22, Address(sp, 48));
7349 __ stp(r23, r24, Address(sp, 64));
7350 __ stp(r25, r26, Address(sp, 80));
7351 __ stp(r27, r28, Address(sp, 96));
7352 if (can_use_r18 && can_use_fp) {
7353 __ stp(r18_tls, state, Address(sp, 112));
7354 } else {
7355 __ str(state, Address(sp, 112));
7356 }
7357
// begin sha3 calculations: loading a0..a24 from state array
7359 __ ldp(a0, a1, state);
7360 __ ldp(a2, a3, Address(state, 16));
7361 __ ldp(a4, a5, Address(state, 32));
7362 __ ldp(a6, a7, Address(state, 48));
7363 __ ldp(a8, a9, Address(state, 64));
7364 __ ldp(a10, a11, Address(state, 80));
7365 __ ldp(a12, a13, Address(state, 96));
7366 __ ldp(a14, a15, Address(state, 112));
7367 __ ldp(a16, a17, Address(state, 128));
7368 __ ldp(a18, a19, Address(state, 144));
7369 __ ldp(a20, a21, Address(state, 160));
7370 __ ldp(a22, a23, Address(state, 176));
7371 __ ldr(a24, Address(state, 192));
7372
7373 __ BIND(sha3_loop);
7374
7375 // load input
7376 __ ldp(tmp3, tmp2, __ post(buf, 16));
7377 __ eor(a0, a0, tmp3);
7378 __ eor(a1, a1, tmp2);
7379 __ ldp(tmp3, tmp2, __ post(buf, 16));
7380 __ eor(a2, a2, tmp3);
7381 __ eor(a3, a3, tmp2);
7382 __ ldp(tmp3, tmp2, __ post(buf, 16));
7383 __ eor(a4, a4, tmp3);
7384 __ eor(a5, a5, tmp2);
7385 __ ldr(tmp3, __ post(buf, 8));
7386 __ eor(a6, a6, tmp3);
7387
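// Dispatch on the sponge rate (block_size, in bytes): 72 for SHA3-512,
// 104 for SHA3-384, 136 for SHA3-256 and SHAKE256, 144 for SHA3-224 and
// 168 for SHAKE128. Only the first block_size/8 lanes are xor-ed with
// input, so the remaining absorption code is selected accordingly.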
7388 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
7389 __ tbz(block_size, 7, sha3_512_or_sha3_384);
7390
7391 __ ldp(tmp3, tmp2, __ post(buf, 16));
7392 __ eor(a7, a7, tmp3);
7393 __ eor(a8, a8, tmp2);
7394 __ ldp(tmp3, tmp2, __ post(buf, 16));
7395 __ eor(a9, a9, tmp3);
7396 __ eor(a10, a10, tmp2);
7397 __ ldp(tmp3, tmp2, __ post(buf, 16));
7398 __ eor(a11, a11, tmp3);
7399 __ eor(a12, a12, tmp2);
7400 __ ldp(tmp3, tmp2, __ post(buf, 16));
7401 __ eor(a13, a13, tmp3);
7402 __ eor(a14, a14, tmp2);
7403 __ ldp(tmp3, tmp2, __ post(buf, 16));
7404 __ eor(a15, a15, tmp3);
7405 __ eor(a16, a16, tmp2);
7406
7407 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
7408 __ andw(tmp2, block_size, 48);
7409 __ cbzw(tmp2, rounds24_preloop);
7410 __ tbnz(block_size, 5, shake128);
// block_size == 144, bit5 == 0, SHA3-224
7412 __ ldr(tmp3, __ post(buf, 8));
7413 __ eor(a17, a17, tmp3);
7414 __ b(rounds24_preloop);
7415
7416 __ BIND(shake128);
7417 __ ldp(tmp3, tmp2, __ post(buf, 16));
7418 __ eor(a17, a17, tmp3);
7419 __ eor(a18, a18, tmp2);
7420 __ ldp(tmp3, tmp2, __ post(buf, 16));
7421 __ eor(a19, a19, tmp3);
7422 __ eor(a20, a20, tmp2);
7423 __ b(rounds24_preloop); // block_size == 168, SHAKE128
7424
7425 __ BIND(sha3_512_or_sha3_384);
7426 __ ldp(tmp3, tmp2, __ post(buf, 16));
7427 __ eor(a7, a7, tmp3);
7428 __ eor(a8, a8, tmp2);
7429 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
7430
7431 // SHA3-384
7432 __ ldp(tmp3, tmp2, __ post(buf, 16));
7433 __ eor(a9, a9, tmp3);
7434 __ eor(a10, a10, tmp2);
7435 __ ldp(tmp3, tmp2, __ post(buf, 16));
7436 __ eor(a11, a11, tmp3);
7437 __ eor(a12, a12, tmp2);
7438
7439 __ BIND(rounds24_preloop);
7440 __ fmovs(v0, 24.0); // float loop counter,
7441 __ fmovs(v1, 1.0); // exact representation
7442
7443 __ str(buf, Address(sp, 16));
7444 __ lea(tmp3, ExternalAddress((address) round_consts));
7445
7446 __ BIND(loop_body);
7447 keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
7448 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
7449 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
7450 tmp0, tmp1, tmp2);
7451 __ fsubs(v0, v0, v1);
7452 __ fcmps(v0, 0.0);
7453 __ br(__ NE, loop_body);
7454
7455 if (multi_block) {
7456 __ ldrw(block_size, sp); // block_size
7457 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
7458 __ addw(tmp2, tmp2, block_size);
7459 __ cmpw(tmp2, tmp1);
7460 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
7461 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
7462 __ br(Assembler::LE, sha3_loop);
7463 __ movw(c_rarg0, tmp2); // return offset
7464 }
7465 if (can_use_fp && can_use_r18) {
7466 __ ldp(r18_tls, state, Address(sp, 112));
7467 } else {
7468 __ ldr(state, Address(sp, 112));
7469 }
7470 // save calculated sha3 state
7471 __ stp(a0, a1, Address(state));
7472 __ stp(a2, a3, Address(state, 16));
7473 __ stp(a4, a5, Address(state, 32));
7474 __ stp(a6, a7, Address(state, 48));
7475 __ stp(a8, a9, Address(state, 64));
7476 __ stp(a10, a11, Address(state, 80));
7477 __ stp(a12, a13, Address(state, 96));
7478 __ stp(a14, a15, Address(state, 112));
7479 __ stp(a16, a17, Address(state, 128));
7480 __ stp(a18, a19, Address(state, 144));
7481 __ stp(a20, a21, Address(state, 160));
7482 __ stp(a22, a23, Address(state, 176));
7483 __ str(a24, Address(state, 192));
7484
7485 // restore required registers from stack
7486 __ ldp(r19, r20, Address(sp, 32));
7487 __ ldp(r21, r22, Address(sp, 48));
7488 __ ldp(r23, r24, Address(sp, 64));
7489 __ ldp(r25, r26, Address(sp, 80));
7490 __ ldp(r27, r28, Address(sp, 96));
7491 if (can_use_fp && can_use_r18) {
7492 __ add(rfp, sp, 128); // leave() will copy rfp to sp below
7493 } // else no need to recalculate rfp, since it wasn't changed
7494
7495 __ leave();
7496
7497 __ ret(lr);
7498
7499 return start;
7500 }
7501
7502 /**
7503 * Arguments:
7504 *
7505 * Inputs:
7506 * c_rarg0 - int crc
7507 * c_rarg1 - byte* buf
7508 * c_rarg2 - int length
7509 *
7510 * Output:
* r0 - int crc result
7512 */
7513 address generate_updateBytesCRC32() {
7514 assert(UseCRC32Intrinsics, "what are we doing here?");
7515
7516 __ align(CodeEntryAlignment);
7517 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7518 StubCodeMark mark(this, stub_id);
7519
7520 address start = __ pc();
7521
7522 const Register crc = c_rarg0; // crc
7523 const Register buf = c_rarg1; // source java byte array address
7524 const Register len = c_rarg2; // length
7525 const Register table0 = c_rarg3; // crc_table address
7526 const Register table1 = c_rarg4;
7527 const Register table2 = c_rarg5;
7528 const Register table3 = c_rarg6;
7529 const Register tmp3 = c_rarg7;
7530
7531 BLOCK_COMMENT("Entry:");
7532 __ enter(); // required for proper stackwalking of RuntimeStub frame
7533
7534 __ kernel_crc32(crc, buf, len,
7535 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7536
7537 __ leave(); // required for proper stackwalking of RuntimeStub frame
7538 __ ret(lr);
7539
7540 return start;
7541 }
7542
7543 /**
7544 * Arguments:
7545 *
7546 * Inputs:
7547 * c_rarg0 - int crc
7548 * c_rarg1 - byte* buf
7549 * c_rarg2 - int length
7550 * c_rarg3 - int* table
7551 *
7552 * Output:
7553 * r0 - int crc result
7554 */
7555 address generate_updateBytesCRC32C() {
7556 assert(UseCRC32CIntrinsics, "what are we doing here?");
7557
7558 __ align(CodeEntryAlignment);
7559 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
7560 StubCodeMark mark(this, stub_id);
7561
7562 address start = __ pc();
7563
7564 const Register crc = c_rarg0; // crc
7565 const Register buf = c_rarg1; // source java byte array address
7566 const Register len = c_rarg2; // length
7567 const Register table0 = c_rarg3; // crc_table address
7568 const Register table1 = c_rarg4;
7569 const Register table2 = c_rarg5;
7570 const Register table3 = c_rarg6;
7571 const Register tmp3 = c_rarg7;
7572
7573 BLOCK_COMMENT("Entry:");
7574 __ enter(); // required for proper stackwalking of RuntimeStub frame
7575
7576 __ kernel_crc32c(crc, buf, len,
7577 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7578
7579 __ leave(); // required for proper stackwalking of RuntimeStub frame
7580 __ ret(lr);
7581
7582 return start;
7583 }
7584
7585 /***
7586 * Arguments:
7587 *
7588 * Inputs:
7589 * c_rarg0 - int adler
7590 * c_rarg1 - byte* buff
7591 * c_rarg2 - int len
7592 *
7593 * Output:
7594 * c_rarg0 - int adler result
7595 */
7596 address generate_updateBytesAdler32() {
7597 __ align(CodeEntryAlignment);
7598 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
7599 StubCodeMark mark(this, stub_id);
7600 address start = __ pc();
7601
7602 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
7603
7604 // Aliases
7605 Register adler = c_rarg0;
7606 Register s1 = c_rarg0;
7607 Register s2 = c_rarg3;
7608 Register buff = c_rarg1;
7609 Register len = c_rarg2;
7610 Register nmax = r4;
7611 Register base = r5;
7612 Register count = r6;
7613 Register temp0 = rscratch1;
7614 Register temp1 = rscratch2;
7615 FloatRegister vbytes = v0;
7616 FloatRegister vs1acc = v1;
7617 FloatRegister vs2acc = v2;
7618 FloatRegister vtable = v3;
7619
7620 // Max number of bytes we can process before having to take the mod
7621 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
7622 uint64_t BASE = 0xfff1;
7623 uint64_t NMAX = 0x15B0;
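// n.b. the modular reductions below rely on 2^16 == 15 (mod BASE):
// s % BASE is computed by folding, s = (s >> 16) * 15 + (s & 0xffff),
// repeated as needed and followed by a conditional subtract of BASE.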
7624
7625 __ mov(base, BASE);
7626 __ mov(nmax, NMAX);
7627
7628 // Load accumulation coefficients for the upper 16 bits
7629 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
7630 __ ld1(vtable, __ T16B, Address(temp0));
7631
7632 // s1 is initialized to the lower 16 bits of adler
7633 // s2 is initialized to the upper 16 bits of adler
7634 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
7635 __ uxth(s1, adler); // s1 = (adler & 0xffff)
7636
7637 // The pipelined loop needs at least 16 elements for 1 iteration
// It does check this itself, but it is more efficient to branch straight to the cleanup loop
7639 __ cmp(len, (u1)16);
7640 __ br(Assembler::HS, L_nmax);
7641 __ cbz(len, L_combine);
7642
7643 __ bind(L_simple_by1_loop);
7644 __ ldrb(temp0, Address(__ post(buff, 1)));
7645 __ add(s1, s1, temp0);
7646 __ add(s2, s2, s1);
7647 __ subs(len, len, 1);
7648 __ br(Assembler::HI, L_simple_by1_loop);
7649
7650 // s1 = s1 % BASE
7651 __ subs(temp0, s1, base);
7652 __ csel(s1, temp0, s1, Assembler::HS);
7653
7654 // s2 = s2 % BASE
7655 __ lsr(temp0, s2, 16);
7656 __ lsl(temp1, temp0, 4);
7657 __ sub(temp1, temp1, temp0);
7658 __ add(s2, temp1, s2, ext::uxth);
7659
7660 __ subs(temp0, s2, base);
7661 __ csel(s2, temp0, s2, Assembler::HS);
7662
7663 __ b(L_combine);
7664
7665 __ bind(L_nmax);
7666 __ subs(len, len, nmax);
7667 __ sub(count, nmax, 16);
7668 __ br(Assembler::LO, L_by16);
7669
7670 __ bind(L_nmax_loop);
7671
7672 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7673 vbytes, vs1acc, vs2acc, vtable);
7674
7675 __ subs(count, count, 16);
7676 __ br(Assembler::HS, L_nmax_loop);
7677
7678 // s1 = s1 % BASE
7679 __ lsr(temp0, s1, 16);
7680 __ lsl(temp1, temp0, 4);
7681 __ sub(temp1, temp1, temp0);
7682 __ add(temp1, temp1, s1, ext::uxth);
7683
7684 __ lsr(temp0, temp1, 16);
7685 __ lsl(s1, temp0, 4);
7686 __ sub(s1, s1, temp0);
__ add(s1, s1, temp1, ext::uxth);
7688
7689 __ subs(temp0, s1, base);
7690 __ csel(s1, temp0, s1, Assembler::HS);
7691
7692 // s2 = s2 % BASE
7693 __ lsr(temp0, s2, 16);
7694 __ lsl(temp1, temp0, 4);
7695 __ sub(temp1, temp1, temp0);
7696 __ add(temp1, temp1, s2, ext::uxth);
7697
7698 __ lsr(temp0, temp1, 16);
7699 __ lsl(s2, temp0, 4);
7700 __ sub(s2, s2, temp0);
__ add(s2, s2, temp1, ext::uxth);
7702
7703 __ subs(temp0, s2, base);
7704 __ csel(s2, temp0, s2, Assembler::HS);
7705
7706 __ subs(len, len, nmax);
7707 __ sub(count, nmax, 16);
7708 __ br(Assembler::HS, L_nmax_loop);
7709
7710 __ bind(L_by16);
7711 __ adds(len, len, count);
7712 __ br(Assembler::LO, L_by1);
7713
7714 __ bind(L_by16_loop);
7715
7716 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7717 vbytes, vs1acc, vs2acc, vtable);
7718
7719 __ subs(len, len, 16);
7720 __ br(Assembler::HS, L_by16_loop);
7721
7722 __ bind(L_by1);
7723 __ adds(len, len, 15);
7724 __ br(Assembler::LO, L_do_mod);
7725
7726 __ bind(L_by1_loop);
7727 __ ldrb(temp0, Address(__ post(buff, 1)));
7728 __ add(s1, temp0, s1);
7729 __ add(s2, s2, s1);
7730 __ subs(len, len, 1);
7731 __ br(Assembler::HS, L_by1_loop);
7732
7733 __ bind(L_do_mod);
7734 // s1 = s1 % BASE
7735 __ lsr(temp0, s1, 16);
7736 __ lsl(temp1, temp0, 4);
7737 __ sub(temp1, temp1, temp0);
7738 __ add(temp1, temp1, s1, ext::uxth);
7739
7740 __ lsr(temp0, temp1, 16);
7741 __ lsl(s1, temp0, 4);
7742 __ sub(s1, s1, temp0);
__ add(s1, s1, temp1, ext::uxth);
7744
7745 __ subs(temp0, s1, base);
7746 __ csel(s1, temp0, s1, Assembler::HS);
7747
7748 // s2 = s2 % BASE
7749 __ lsr(temp0, s2, 16);
7750 __ lsl(temp1, temp0, 4);
7751 __ sub(temp1, temp1, temp0);
7752 __ add(temp1, temp1, s2, ext::uxth);
7753
7754 __ lsr(temp0, temp1, 16);
7755 __ lsl(s2, temp0, 4);
7756 __ sub(s2, s2, temp0);
__ add(s2, s2, temp1, ext::uxth);
7758
7759 __ subs(temp0, s2, base);
7760 __ csel(s2, temp0, s2, Assembler::HS);
7761
7762 // Combine lower bits and higher bits
7763 __ bind(L_combine);
7764 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
7765
7766 __ ret(lr);
7767
7768 return start;
7769 }
7770
7771 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
7772 Register temp0, Register temp1, FloatRegister vbytes,
7773 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
7774 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
7775 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
7776 // In non-vectorized code, we update s1 and s2 as:
7777 // s1 <- s1 + b1
7778 // s2 <- s2 + s1
7779 // s1 <- s1 + b2
// s2 <- s2 + s1
7781 // ...
7782 // s1 <- s1 + b16
7783 // s2 <- s2 + s1
7784 // Putting above assignments together, we have:
7785 // s1_new = s1 + b1 + b2 + ... + b16
7786 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
7787 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
7788 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
7789 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
7790
7791 // s2 = s2 + s1 * 16
7792 __ add(s2, s2, s1, Assembler::LSL, 4);
7793
7794 // vs1acc = b1 + b2 + b3 + ... + b16
7795 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
7796 __ umullv(vs2acc, __ T8B, vtable, vbytes);
7797 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
7798 __ uaddlv(vs1acc, __ T16B, vbytes);
7799 __ uaddlv(vs2acc, __ T8H, vs2acc);
7800
7801 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
7802 __ fmovd(temp0, vs1acc);
7803 __ fmovd(temp1, vs2acc);
7804 __ add(s1, s1, temp0);
7805 __ add(s2, s2, temp1);
7806 }
7807
7808 /**
7809 * Arguments:
7810 *
7811 * Input:
7812 * c_rarg0 - x address
7813 * c_rarg1 - x length
7814 * c_rarg2 - y address
7815 * c_rarg3 - y length
7816 * c_rarg4 - z address
7817 */
7818 address generate_multiplyToLen() {
7819 __ align(CodeEntryAlignment);
7820 StubId stub_id = StubId::stubgen_multiplyToLen_id;
7821 StubCodeMark mark(this, stub_id);
7822
7823 address start = __ pc();
7824 const Register x = r0;
7825 const Register xlen = r1;
7826 const Register y = r2;
7827 const Register ylen = r3;
7828 const Register z = r4;
7829
7830 const Register tmp0 = r5;
7831 const Register tmp1 = r10;
7832 const Register tmp2 = r11;
7833 const Register tmp3 = r12;
7834 const Register tmp4 = r13;
7835 const Register tmp5 = r14;
7836 const Register tmp6 = r15;
7837 const Register tmp7 = r16;
7838
7839 BLOCK_COMMENT("Entry:");
7840 __ enter(); // required for proper stackwalking of RuntimeStub frame
7841 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7842 __ leave(); // required for proper stackwalking of RuntimeStub frame
7843 __ ret(lr);
7844
7845 return start;
7846 }
7847
7848 address generate_squareToLen() {
7849     // The squareToLen algorithm for sizes 1..127 described in the Java code
7850     // runs faster than multiply_to_len on some CPUs and slower on others,
7851     // but multiply_to_len shows slightly better results overall.
7852 __ align(CodeEntryAlignment);
7853 StubId stub_id = StubId::stubgen_squareToLen_id;
7854 StubCodeMark mark(this, stub_id);
7855 address start = __ pc();
7856
7857 const Register x = r0;
7858 const Register xlen = r1;
7859 const Register z = r2;
7860 const Register y = r4; // == x
7861 const Register ylen = r5; // == xlen
7862
7863 const Register tmp0 = r3;
7864 const Register tmp1 = r10;
7865 const Register tmp2 = r11;
7866 const Register tmp3 = r12;
7867 const Register tmp4 = r13;
7868 const Register tmp5 = r14;
7869 const Register tmp6 = r15;
7870 const Register tmp7 = r16;
7871
7872 RegSet spilled_regs = RegSet::of(y, ylen);
7873 BLOCK_COMMENT("Entry:");
7874 __ enter();
7875 __ push(spilled_regs, sp);
7876 __ mov(y, x);
7877 __ mov(ylen, xlen);
7878 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7879 __ pop(spilled_regs, sp);
7880 __ leave();
7881 __ ret(lr);
7882 return start;
7883 }
7884
7885 address generate_mulAdd() {
7886 __ align(CodeEntryAlignment);
7887 StubId stub_id = StubId::stubgen_mulAdd_id;
7888 StubCodeMark mark(this, stub_id);
7889
7890 address start = __ pc();
7891
7892 const Register out = r0;
7893 const Register in = r1;
7894 const Register offset = r2;
7895 const Register len = r3;
7896 const Register k = r4;
7897
7898 BLOCK_COMMENT("Entry:");
7899 __ enter();
7900 __ mul_add(out, in, offset, len, k);
7901 __ leave();
7902 __ ret(lr);
7903
7904 return start;
7905 }
7906
7907 // Arguments:
7908 //
7909 // Input:
7910 // c_rarg0 - newArr address
7911 // c_rarg1 - oldArr address
7912 // c_rarg2 - newIdx
7913 // c_rarg3 - shiftCount
7914 // c_rarg4 - numIter
7915 //
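  // An illustrative scalar sketch (not generated code) of what this stub computes,
  // using the argument names above (unsigned 32-bit words, logical shifts) and
  // matching how the code below indexes the arrays (oldArr is read at indices
  // 0 .. numIter), assuming 0 < shiftCount < 32:
  //
  //   for (int i = numIter - 1; i >= 0; i--) {
  //     newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount) |
  //                          (oldArr[i]     << (32 - shiftCount));
  //   }
  //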
7916 address generate_bigIntegerRightShift() {
7917 __ align(CodeEntryAlignment);
7918 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
7919 StubCodeMark mark(this, stub_id);
7920 address start = __ pc();
7921
7922 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
7923
7924 Register newArr = c_rarg0;
7925 Register oldArr = c_rarg1;
7926 Register newIdx = c_rarg2;
7927 Register shiftCount = c_rarg3;
7928 Register numIter = c_rarg4;
7929 Register idx = numIter;
7930
7931 Register newArrCur = rscratch1;
7932 Register shiftRevCount = rscratch2;
7933 Register oldArrCur = r13;
7934 Register oldArrNext = r14;
7935
7936 FloatRegister oldElem0 = v0;
7937 FloatRegister oldElem1 = v1;
7938 FloatRegister newElem = v2;
7939 FloatRegister shiftVCount = v3;
7940 FloatRegister shiftVRevCount = v4;
7941
7942 __ cbz(idx, Exit);
7943
7944 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
7945
7946 // left shift count
7947 __ movw(shiftRevCount, 32);
7948 __ subw(shiftRevCount, shiftRevCount, shiftCount);
7949
7950     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar path
7951 __ cmp(numIter, (u1)4);
7952 __ br(Assembler::LT, ShiftThree);
7953
7954 __ dup(shiftVCount, __ T4S, shiftCount);
7955 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
7956 __ negr(shiftVCount, __ T4S, shiftVCount);
7957
7958 __ BIND(ShiftSIMDLoop);
7959
7960 // Calculate the load addresses
7961 __ sub(idx, idx, 4);
7962 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
7963 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
7964 __ add(oldArrCur, oldArrNext, 4);
7965
7966 // Load 4 words and process
7967 __ ld1(oldElem0, __ T4S, Address(oldArrCur));
7968 __ ld1(oldElem1, __ T4S, Address(oldArrNext));
7969 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
7970 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
7971 __ orr(newElem, __ T16B, oldElem0, oldElem1);
7972 __ st1(newElem, __ T4S, Address(newArrCur));
7973
7974 __ cmp(idx, (u1)4);
7975 __ br(Assembler::LT, ShiftTwoLoop);
7976 __ b(ShiftSIMDLoop);
7977
7978 __ BIND(ShiftTwoLoop);
7979 __ cbz(idx, Exit);
7980 __ cmp(idx, (u1)1);
7981 __ br(Assembler::EQ, ShiftOne);
7982
7983 // Calculate the load addresses
7984 __ sub(idx, idx, 2);
7985 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
7986 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
7987 __ add(oldArrCur, oldArrNext, 4);
7988
7989 // Load 2 words and process
7990 __ ld1(oldElem0, __ T2S, Address(oldArrCur));
7991 __ ld1(oldElem1, __ T2S, Address(oldArrNext));
7992 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
7993 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
7994 __ orr(newElem, __ T8B, oldElem0, oldElem1);
7995 __ st1(newElem, __ T2S, Address(newArrCur));
7996 __ b(ShiftTwoLoop);
7997
7998 __ BIND(ShiftThree);
7999 __ tbz(idx, 1, ShiftOne);
8000 __ tbz(idx, 0, ShiftTwo);
8001 __ ldrw(r10, Address(oldArr, 12));
8002 __ ldrw(r11, Address(oldArr, 8));
8003 __ lsrvw(r10, r10, shiftCount);
8004 __ lslvw(r11, r11, shiftRevCount);
8005 __ orrw(r12, r10, r11);
8006 __ strw(r12, Address(newArr, 8));
8007
8008 __ BIND(ShiftTwo);
8009 __ ldrw(r10, Address(oldArr, 8));
8010 __ ldrw(r11, Address(oldArr, 4));
8011 __ lsrvw(r10, r10, shiftCount);
8012 __ lslvw(r11, r11, shiftRevCount);
8013 __ orrw(r12, r10, r11);
8014 __ strw(r12, Address(newArr, 4));
8015
8016 __ BIND(ShiftOne);
8017 __ ldrw(r10, Address(oldArr, 4));
8018 __ ldrw(r11, Address(oldArr));
8019 __ lsrvw(r10, r10, shiftCount);
8020 __ lslvw(r11, r11, shiftRevCount);
8021 __ orrw(r12, r10, r11);
8022 __ strw(r12, Address(newArr));
8023
8024 __ BIND(Exit);
8025 __ ret(lr);
8026
8027 return start;
8028 }
8029
8030 // Arguments:
8031 //
8032 // Input:
8033 // c_rarg0 - newArr address
8034 // c_rarg1 - oldArr address
8035 // c_rarg2 - newIdx
8036 // c_rarg3 - shiftCount
8037 // c_rarg4 - numIter
8038 //
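  // An illustrative scalar sketch (not generated code) of what this stub computes,
  // using the argument names above (unsigned 32-bit words, logical shifts),
  // assuming 0 < shiftCount < 32; oldArr is read at indices 0 .. numIter:
  //
  //   for (int i = 0; i < numIter; i++) {
  //     newArr[newIdx + i] = (oldArr[i]     << shiftCount) |
  //                          (oldArr[i + 1] >> (32 - shiftCount));
  //   }
  //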
8039 address generate_bigIntegerLeftShift() {
8040 __ align(CodeEntryAlignment);
8041 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
8042 StubCodeMark mark(this, stub_id);
8043 address start = __ pc();
8044
8045 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
8046
8047 Register newArr = c_rarg0;
8048 Register oldArr = c_rarg1;
8049 Register newIdx = c_rarg2;
8050 Register shiftCount = c_rarg3;
8051 Register numIter = c_rarg4;
8052
8053 Register shiftRevCount = rscratch1;
8054 Register oldArrNext = rscratch2;
8055
8056 FloatRegister oldElem0 = v0;
8057 FloatRegister oldElem1 = v1;
8058 FloatRegister newElem = v2;
8059 FloatRegister shiftVCount = v3;
8060 FloatRegister shiftVRevCount = v4;
8061
8062 __ cbz(numIter, Exit);
8063
8064 __ add(oldArrNext, oldArr, 4);
8065 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8066
8067 // right shift count
8068 __ movw(shiftRevCount, 32);
8069 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8070
8071     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar path
8072 __ cmp(numIter, (u1)4);
8073 __ br(Assembler::LT, ShiftThree);
8074
8075 __ dup(shiftVCount, __ T4S, shiftCount);
8076 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8077 __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
8078
8079 __ BIND(ShiftSIMDLoop);
8080
8081 // load 4 words and process
8082 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
8083 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
8084 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8085 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8086 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8087 __ st1(newElem, __ T4S, __ post(newArr, 16));
8088 __ sub(numIter, numIter, 4);
8089
8090 __ cmp(numIter, (u1)4);
8091 __ br(Assembler::LT, ShiftTwoLoop);
8092 __ b(ShiftSIMDLoop);
8093
8094 __ BIND(ShiftTwoLoop);
8095 __ cbz(numIter, Exit);
8096 __ cmp(numIter, (u1)1);
8097 __ br(Assembler::EQ, ShiftOne);
8098
8099 // load 2 words and process
8100 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
8101 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
8102 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8103 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8104 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8105 __ st1(newElem, __ T2S, __ post(newArr, 8));
8106 __ sub(numIter, numIter, 2);
8107 __ b(ShiftTwoLoop);
8108
8109 __ BIND(ShiftThree);
8110 __ ldrw(r10, __ post(oldArr, 4));
8111 __ ldrw(r11, __ post(oldArrNext, 4));
8112 __ lslvw(r10, r10, shiftCount);
8113 __ lsrvw(r11, r11, shiftRevCount);
8114 __ orrw(r12, r10, r11);
8115 __ strw(r12, __ post(newArr, 4));
8116 __ tbz(numIter, 1, Exit);
8117 __ tbz(numIter, 0, ShiftOne);
8118
8119 __ BIND(ShiftTwo);
8120 __ ldrw(r10, __ post(oldArr, 4));
8121 __ ldrw(r11, __ post(oldArrNext, 4));
8122 __ lslvw(r10, r10, shiftCount);
8123 __ lsrvw(r11, r11, shiftRevCount);
8124 __ orrw(r12, r10, r11);
8125 __ strw(r12, __ post(newArr, 4));
8126
8127 __ BIND(ShiftOne);
8128 __ ldrw(r10, Address(oldArr));
8129 __ ldrw(r11, Address(oldArrNext));
8130 __ lslvw(r10, r10, shiftCount);
8131 __ lsrvw(r11, r11, shiftRevCount);
8132 __ orrw(r12, r10, r11);
8133 __ strw(r12, Address(newArr));
8134
8135 __ BIND(Exit);
8136 __ ret(lr);
8137
8138 return start;
8139 }
8140
8141 address generate_count_positives(address &count_positives_long) {
8142 const u1 large_loop_size = 64;
8143 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
8144 int dcache_line = VM_Version::dcache_line_size();
8145
8146 Register ary1 = r1, len = r2, result = r0;
8147
8148 __ align(CodeEntryAlignment);
8149
8150 StubId stub_id = StubId::stubgen_count_positives_id;
8151 StubCodeMark mark(this, stub_id);
8152
8153 address entry = __ pc();
8154
8155 __ enter();
8156 // precondition: a copy of len is already in result
8157 // __ mov(result, len);
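    // If no byte of ary1[0 .. len) has its sign bit set, the stub returns len;
    // otherwise it returns some value k with 0 <= k <= (index of the first
    // negative byte) - the chunked checks below may round k down to the start of
    // the chunk that contained the negative byte. An exact-count scalar sketch
    // (illustrative only; as noted, the stub itself may return less):
    //
    //   int count_positives(const int8_t* ary1, int len) {
    //     for (int i = 0; i < len; i++) {
    //       if (ary1[i] < 0) return i;   // first byte with bit 0x80 set
    //     }
    //     return len;
    //   }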
8158
8159 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
8160 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
8161
8162 __ cmp(len, (u1)15);
8163 __ br(Assembler::GT, LEN_OVER_15);
8164   // Execution falls into this code only when the pointer is near the end of a
8165   // memory page and we have to avoid reading past it into the next page.
8166 __ add(ary1, ary1, len);
8167 __ subs(len, len, 8);
8168 __ br(Assembler::GT, LEN_OVER_8);
8169 __ ldr(rscratch2, Address(ary1, -8));
8170 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
8171 __ lsrv(rscratch2, rscratch2, rscratch1);
8172 __ tst(rscratch2, UPPER_BIT_MASK);
8173 __ csel(result, zr, result, Assembler::NE);
8174 __ leave();
8175 __ ret(lr);
8176 __ bind(LEN_OVER_8);
8177 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
8178 __ sub(len, len, 8); // no data dep., then sub can be executed while loading
8179 __ tst(rscratch2, UPPER_BIT_MASK);
8180 __ br(Assembler::NE, RET_NO_POP);
8181 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
8182 __ lsrv(rscratch1, rscratch1, rscratch2);
8183 __ tst(rscratch1, UPPER_BIT_MASK);
8184 __ bind(RET_NO_POP);
8185 __ csel(result, zr, result, Assembler::NE);
8186 __ leave();
8187 __ ret(lr);
8188
8189 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
8190 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
8191
8192 count_positives_long = __ pc(); // 2nd entry point
8193
8194 __ enter();
8195
8196 __ bind(LEN_OVER_15);
8197 __ push(spilled_regs, sp);
8198 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
8199 __ cbz(rscratch2, ALIGNED);
8200 __ ldp(tmp6, tmp1, Address(ary1));
8201 __ mov(tmp5, 16);
8202 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
8203 __ add(ary1, ary1, rscratch1);
8204 __ orr(tmp6, tmp6, tmp1);
8205 __ tst(tmp6, UPPER_BIT_MASK);
8206 __ br(Assembler::NE, RET_ADJUST);
8207 __ sub(len, len, rscratch1);
8208
8209 __ bind(ALIGNED);
8210 __ cmp(len, large_loop_size);
8211 __ br(Assembler::LT, CHECK_16);
8212     // Perform a 16-byte load in the pre-loop as an early return, to handle the
8213     // case where an initially aligned large array has negative values in its
8214     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
8215     // worst case, which is slower. Cases with negative bytes further ahead are
8216     // not affected much; in fact they get faster thanks to the early loads,
8217     // fewer instructions and fewer branches in LARGE_LOOP.
8218 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
8219 __ sub(len, len, 16);
8220 __ orr(tmp6, tmp6, tmp1);
8221 __ tst(tmp6, UPPER_BIT_MASK);
8222 __ br(Assembler::NE, RET_ADJUST_16);
8223 __ cmp(len, large_loop_size);
8224 __ br(Assembler::LT, CHECK_16);
8225
8226 if (SoftwarePrefetchHintDistance >= 0
8227 && SoftwarePrefetchHintDistance >= dcache_line) {
8228 // initial prefetch
8229 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
8230 }
8231 __ bind(LARGE_LOOP);
8232 if (SoftwarePrefetchHintDistance >= 0) {
8233 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
8234 }
8235     // Issue the load instructions first, since this can save a few CPU/MEM cycles.
8236     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp),
8237     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which uses
8238     // 3 fewer instructions and has fewer branches. The drawback is that this
8239     // disables the early return, so all 64 bytes are loaded and checked every time.
8240 __ ldp(tmp2, tmp3, Address(ary1));
8241 __ ldp(tmp4, tmp5, Address(ary1, 16));
8242 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
8243 __ ldp(tmp6, tmp1, Address(ary1, 48));
8244 __ add(ary1, ary1, large_loop_size);
8245 __ sub(len, len, large_loop_size);
8246 __ orr(tmp2, tmp2, tmp3);
8247 __ orr(tmp4, tmp4, tmp5);
8248 __ orr(rscratch1, rscratch1, rscratch2);
8249 __ orr(tmp6, tmp6, tmp1);
8250 __ orr(tmp2, tmp2, tmp4);
8251 __ orr(rscratch1, rscratch1, tmp6);
8252 __ orr(tmp2, tmp2, rscratch1);
8253 __ tst(tmp2, UPPER_BIT_MASK);
8254 __ br(Assembler::NE, RET_ADJUST_LONG);
8255 __ cmp(len, large_loop_size);
8256 __ br(Assembler::GE, LARGE_LOOP);
8257
8258 __ bind(CHECK_16); // small 16-byte load pre-loop
8259 __ cmp(len, (u1)16);
8260 __ br(Assembler::LT, POST_LOOP16);
8261
8262 __ bind(LOOP16); // small 16-byte load loop
8263 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
8264 __ sub(len, len, 16);
8265 __ orr(tmp2, tmp2, tmp3);
8266 __ tst(tmp2, UPPER_BIT_MASK);
8267 __ br(Assembler::NE, RET_ADJUST_16);
8268 __ cmp(len, (u1)16);
8269 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
8270
8271 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
8272 __ cmp(len, (u1)8);
8273 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
8274 __ ldr(tmp3, Address(__ post(ary1, 8)));
8275 __ tst(tmp3, UPPER_BIT_MASK);
8276 __ br(Assembler::NE, RET_ADJUST);
8277 __ sub(len, len, 8);
8278
8279 __ bind(POST_LOOP16_LOAD_TAIL);
8280 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
8281 __ ldr(tmp1, Address(ary1));
8282 __ mov(tmp2, 64);
8283 __ sub(tmp4, tmp2, len, __ LSL, 3);
8284 __ lslv(tmp1, tmp1, tmp4);
8285 __ tst(tmp1, UPPER_BIT_MASK);
8286 __ br(Assembler::NE, RET_ADJUST);
8287 // Fallthrough
8288
8289 __ bind(RET_LEN);
8290 __ pop(spilled_regs, sp);
8291 __ leave();
8292 __ ret(lr);
8293
8294     // The difference result - len is the count of bytes guaranteed to be
8295     // positive.
8296
8297 __ bind(RET_ADJUST_LONG);
8298 __ add(len, len, (u1)(large_loop_size - 16));
8299 __ bind(RET_ADJUST_16);
8300 __ add(len, len, 16);
8301 __ bind(RET_ADJUST);
8302 __ pop(spilled_regs, sp);
8303 __ leave();
8304 __ sub(result, result, len);
8305 __ ret(lr);
8306
8307 return entry;
8308 }
8309
8310 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
8311 bool usePrefetch, Label &NOT_EQUAL) {
8312 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8313 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8314 tmp7 = r12, tmp8 = r13;
8315 Label LOOP;
8316
8317 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8318 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8319 __ bind(LOOP);
8320 if (usePrefetch) {
8321 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8322 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8323 }
8324 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8325 __ eor(tmp1, tmp1, tmp2);
8326 __ eor(tmp3, tmp3, tmp4);
8327 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8328 __ orr(tmp1, tmp1, tmp3);
8329 __ cbnz(tmp1, NOT_EQUAL);
8330 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8331 __ eor(tmp5, tmp5, tmp6);
8332 __ eor(tmp7, tmp7, tmp8);
8333 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8334 __ orr(tmp5, tmp5, tmp7);
8335 __ cbnz(tmp5, NOT_EQUAL);
8336 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8337 __ eor(tmp1, tmp1, tmp2);
8338 __ eor(tmp3, tmp3, tmp4);
8339 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8340 __ orr(tmp1, tmp1, tmp3);
8341 __ cbnz(tmp1, NOT_EQUAL);
8342 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8343 __ eor(tmp5, tmp5, tmp6);
8344 __ sub(cnt1, cnt1, 8 * wordSize);
8345 __ eor(tmp7, tmp7, tmp8);
8346 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8347     // tmp6 is not used. MacroAssembler::subs is used here (rather than
8348     // cmp) because subs allows an unlimited range for the immediate operand.
8349 __ subs(tmp6, cnt1, loopThreshold);
8350 __ orr(tmp5, tmp5, tmp7);
8351 __ cbnz(tmp5, NOT_EQUAL);
8352 __ br(__ GE, LOOP);
8353 // post-loop
8354 __ eor(tmp1, tmp1, tmp2);
8355 __ eor(tmp3, tmp3, tmp4);
8356 __ orr(tmp1, tmp1, tmp3);
8357 __ sub(cnt1, cnt1, 2 * wordSize);
8358 __ cbnz(tmp1, NOT_EQUAL);
8359 }
8360
8361 void generate_large_array_equals_loop_simd(int loopThreshold,
8362 bool usePrefetch, Label &NOT_EQUAL) {
8363 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8364 tmp2 = rscratch2;
8365 Label LOOP;
8366
8367 __ bind(LOOP);
8368 if (usePrefetch) {
8369 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8370 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8371 }
8372 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
8373 __ sub(cnt1, cnt1, 8 * wordSize);
8374 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
8375 __ subs(tmp1, cnt1, loopThreshold);
8376 __ eor(v0, __ T16B, v0, v4);
8377 __ eor(v1, __ T16B, v1, v5);
8378 __ eor(v2, __ T16B, v2, v6);
8379 __ eor(v3, __ T16B, v3, v7);
8380 __ orr(v0, __ T16B, v0, v1);
8381 __ orr(v1, __ T16B, v2, v3);
8382 __ orr(v0, __ T16B, v0, v1);
8383 __ umov(tmp1, v0, __ D, 0);
8384 __ umov(tmp2, v0, __ D, 1);
8385 __ orr(tmp1, tmp1, tmp2);
8386 __ cbnz(tmp1, NOT_EQUAL);
8387 __ br(__ GE, LOOP);
8388 }
8389
8390 // a1 = r1 - array1 address
8391 // a2 = r2 - array2 address
8392 // result = r0 - return value. Already contains "false"
8393 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
8394 // r3-r5 are reserved temporary registers
8395 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
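  // The loop bodies below use the usual word-at-a-time equality check: XOR the two
  // words (zero iff equal), OR several XOR results together and branch once per
  // group. An illustrative scalar sketch of one 16-byte step (a1/a2 viewed as
  // const uint64_t*):
  //
  //   uint64_t x0 = a1[i]     ^ a2[i];      // 0 iff these 8 bytes match
  //   uint64_t x1 = a1[i + 1] ^ a2[i + 1];  // 0 iff the next 8 bytes match
  //   if ((x0 | x1) != 0) return false;     // one branch covers both words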
8396 address generate_large_array_equals() {
8397 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8398 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8399 tmp7 = r12, tmp8 = r13;
8400 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
8401 SMALL_LOOP, POST_LOOP;
8402 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
8403 // calculate if at least 32 prefetched bytes are used
8404 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
8405 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
8406 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
8407 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
8408 tmp5, tmp6, tmp7, tmp8);
8409
8410 __ align(CodeEntryAlignment);
8411
8412 StubId stub_id = StubId::stubgen_large_array_equals_id;
8413 StubCodeMark mark(this, stub_id);
8414
8415 address entry = __ pc();
8416 __ enter();
8417 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
8418 // also advance pointers to use post-increment instead of pre-increment
8419 __ add(a1, a1, wordSize);
8420 __ add(a2, a2, wordSize);
8421 if (AvoidUnalignedAccesses) {
8422       // Both implementations (SIMD and non-SIMD) use relatively large load
8423       // instructions (ld1/ldp), which carry a large penalty (up to 2x execution
8424       // time) on some CPUs when the address is not at least 16-byte aligned.
8425       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
8426       // load if needed, at least for the first address, to make it 16-byte aligned.
8427 Label ALIGNED16;
8428 __ tbz(a1, 3, ALIGNED16);
8429 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8430 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8431 __ sub(cnt1, cnt1, wordSize);
8432 __ eor(tmp1, tmp1, tmp2);
8433 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
8434 __ bind(ALIGNED16);
8435 }
8436 if (UseSIMDForArrayEquals) {
8437 if (SoftwarePrefetchHintDistance >= 0) {
8438 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8439 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8440 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
8441 /* prfm = */ true, NOT_EQUAL);
8442 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8443 __ br(__ LT, TAIL);
8444 }
8445 __ bind(NO_PREFETCH_LARGE_LOOP);
8446 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
8447 /* prfm = */ false, NOT_EQUAL);
8448 } else {
8449 __ push(spilled_regs, sp);
8450 if (SoftwarePrefetchHintDistance >= 0) {
8451 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8452 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8453 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
8454 /* prfm = */ true, NOT_EQUAL);
8455 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8456 __ br(__ LT, TAIL);
8457 }
8458 __ bind(NO_PREFETCH_LARGE_LOOP);
8459 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
8460 /* prfm = */ false, NOT_EQUAL);
8461 }
8462 __ bind(TAIL);
8463 __ cbz(cnt1, EQUAL);
8464 __ subs(cnt1, cnt1, wordSize);
8465 __ br(__ LE, POST_LOOP);
8466 __ bind(SMALL_LOOP);
8467 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8468 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8469 __ subs(cnt1, cnt1, wordSize);
8470 __ eor(tmp1, tmp1, tmp2);
8471 __ cbnz(tmp1, NOT_EQUAL);
8472 __ br(__ GT, SMALL_LOOP);
8473 __ bind(POST_LOOP);
8474 __ ldr(tmp1, Address(a1, cnt1));
8475 __ ldr(tmp2, Address(a2, cnt1));
8476 __ eor(tmp1, tmp1, tmp2);
8477 __ cbnz(tmp1, NOT_EQUAL);
8478 __ bind(EQUAL);
8479 __ mov(result, true);
8480 __ bind(NOT_EQUAL);
8481 if (!UseSIMDForArrayEquals) {
8482 __ pop(spilled_regs, sp);
8483 }
8484 __ bind(NOT_EQUAL_NO_POP);
8485 __ leave();
8486 __ ret(lr);
8487 return entry;
8488 }
8489
8490 // result = r0 - return value. Contains initial hashcode value on entry.
8491 // ary = r1 - array address
8492 // cnt = r2 - elements count
8493 // Clobbers: v0-v13, rscratch1, rscratch2
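  // An illustrative scalar sketch (not generated code) of the value computed, with
  // `result` holding the initial hash on entry and `ary`/`cnt` as described above:
  //
  //   jint hash = result;
  //   for (int i = 0; i < cnt; i++) {
  //     hash = 31 * hash + ary[i];   // element width/signedness given by eltype
  //   }
  //   return hash;
  //
  // The vector code below keeps the running hash spread across SIMD lanes,
  // multiplies the lanes by the appropriate power of 31 for each loaded vector,
  // and finally folds the lanes together using the powers <31^3, 31^2, 31, 1>
  // held in vpow.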
8494 address generate_large_arrays_hashcode(BasicType eltype) {
8495 const Register result = r0, ary = r1, cnt = r2;
8496 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
8497 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
8498 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
8499 const FloatRegister vpowm = v13;
8500
8501 ARRAYS_HASHCODE_REGISTERS;
8502
8503 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
8504
8505 unsigned int vf; // vectorization factor
8506 bool multiply_by_halves;
8507 Assembler::SIMD_Arrangement load_arrangement;
8508 switch (eltype) {
8509 case T_BOOLEAN:
8510 case T_BYTE:
8511 load_arrangement = Assembler::T8B;
8512 multiply_by_halves = true;
8513 vf = 8;
8514 break;
8515 case T_CHAR:
8516 case T_SHORT:
8517 load_arrangement = Assembler::T8H;
8518 multiply_by_halves = true;
8519 vf = 8;
8520 break;
8521 case T_INT:
8522 load_arrangement = Assembler::T4S;
8523 multiply_by_halves = false;
8524 vf = 4;
8525 break;
8526 default:
8527 ShouldNotReachHere();
8528 }
8529
8530 // Unroll factor
8531 const unsigned uf = 4;
8532
8533 // Effective vectorization factor
8534 const unsigned evf = vf * uf;
8535
8536 __ align(CodeEntryAlignment);
8537
8538 StubId stub_id;
8539 switch (eltype) {
8540 case T_BOOLEAN:
8541 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
8542 break;
8543 case T_BYTE:
8544 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
8545 break;
8546 case T_CHAR:
8547 stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
8548 break;
8549 case T_SHORT:
8550 stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
8551 break;
8552 case T_INT:
8553 stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
8554 break;
8555 default:
8556 stub_id = StubId::NO_STUBID;
8557 ShouldNotReachHere();
8558 };
8559
8560 StubCodeMark mark(this, stub_id);
8561
8562 address entry = __ pc();
8563 __ enter();
8564
8565     // Pack the 0th-3rd powers of 31 into a single SIMD register. The register is used in
8566     // the SMALL and LARGE loops' epilogues. The initialization is hoisted here and the register's
8567     // value must not change throughout either loop.
8568 __ movw(rscratch1, intpow(31U, 3));
8569 __ mov(vpow, Assembler::S, 0, rscratch1);
8570 __ movw(rscratch1, intpow(31U, 2));
8571 __ mov(vpow, Assembler::S, 1, rscratch1);
8572 __ movw(rscratch1, intpow(31U, 1));
8573 __ mov(vpow, Assembler::S, 2, rscratch1);
8574 __ movw(rscratch1, intpow(31U, 0));
8575 __ mov(vpow, Assembler::S, 3, rscratch1);
8576
8577 __ mov(vmul0, Assembler::T16B, 0);
8578 __ mov(vmul0, Assembler::S, 3, result);
8579
8580 __ andr(rscratch2, cnt, (uf - 1) * vf);
8581 __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
8582
8583 __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
8584 __ mov(vpowm, Assembler::S, 0, rscratch1);
8585
8586 // SMALL LOOP
8587 __ bind(SMALL_LOOP);
8588
8589 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
8590 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8591 __ subsw(rscratch2, rscratch2, vf);
8592
8593 if (load_arrangement == Assembler::T8B) {
8594 // Extend 8B to 8H to be able to use vector multiply
8595 // instructions
8596 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8597 if (is_signed_subword_type(eltype)) {
8598 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8599 } else {
8600 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8601 }
8602 }
8603
8604 switch (load_arrangement) {
8605 case Assembler::T4S:
8606 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8607 break;
8608 case Assembler::T8B:
8609 case Assembler::T8H:
8610 assert(is_subword_type(eltype), "subword type expected");
8611 if (is_signed_subword_type(eltype)) {
8612 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8613 } else {
8614 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8615 }
8616 break;
8617 default:
8618 __ should_not_reach_here();
8619 }
8620
8621 // Process the upper half of a vector
8622 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8623 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8624 if (is_signed_subword_type(eltype)) {
8625 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8626 } else {
8627 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8628 }
8629 }
8630
8631 __ br(Assembler::HI, SMALL_LOOP);
8632
8633     // SMALL LOOP'S EPILOGUE
8634 __ lsr(rscratch2, cnt, exact_log2(evf));
8635 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
8636
8637 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8638 __ addv(vmul0, Assembler::T4S, vmul0);
8639 __ umov(result, vmul0, Assembler::S, 0);
8640
8641 // TAIL
8642 __ bind(TAIL);
8643
8644     // The andr computes cnt % vf. The subtract, shifted by 3, offsets past
8645     // vf - 1 - (cnt % vf) pairs of load + madd insns, i.e. only cnt % vf load + madd pairs execute.
8646 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
8647 __ andr(rscratch2, cnt, vf - 1);
8648 __ bind(TAIL_SHORTCUT);
8649 __ adr(rscratch1, BR_BASE);
8650 // For Cortex-A53 offset is 4 because 2 nops are generated.
8651 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
8652 __ movw(rscratch2, 0x1f);
8653 __ br(rscratch1);
8654
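    // Each tail iteration emitted below is 1 << 3 == 8 bytes of code (1 << 4 == 16
    // bytes on Cortex-A53, where maddw's implicit nop plus the explicit nop add two
    // extra instructions), so the br above lands exactly (cnt % vf) iterations
    // before BR_BASE.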
8655 for (size_t i = 0; i < vf - 1; ++i) {
8656 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
8657 eltype);
8658 __ maddw(result, result, rscratch2, rscratch1);
8659 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
8660 // Generate 2nd nop to have 4 instructions per iteration.
8661 if (VM_Version::supports_a53mac()) {
8662 __ nop();
8663 }
8664 }
8665 __ bind(BR_BASE);
8666
8667 __ leave();
8668 __ ret(lr);
8669
8670 // LARGE LOOP
8671 __ bind(LARGE_LOOP_PREHEADER);
8672
8673 __ lsr(rscratch2, cnt, exact_log2(evf));
8674
8675 if (multiply_by_halves) {
8676 // 31^4 - multiplier between lower and upper parts of a register
8677 __ movw(rscratch1, intpow(31U, vf / 2));
8678 __ mov(vpowm, Assembler::S, 1, rscratch1);
8679       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
8680 __ movw(rscratch1, intpow(31U, evf - vf / 2));
8681 __ mov(vpowm, Assembler::S, 0, rscratch1);
8682 } else {
8683 // 31^16
8684 __ movw(rscratch1, intpow(31U, evf));
8685 __ mov(vpowm, Assembler::S, 0, rscratch1);
8686 }
8687
8688 __ mov(vmul3, Assembler::T16B, 0);
8689 __ mov(vmul2, Assembler::T16B, 0);
8690 __ mov(vmul1, Assembler::T16B, 0);
8691
8692 __ bind(LARGE_LOOP);
8693
8694 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
8695 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
8696 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
8697 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8698
8699 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
8700 Address(__ post(ary, evf * type2aelembytes(eltype))));
8701
8702 if (load_arrangement == Assembler::T8B) {
8703 // Extend 8B to 8H to be able to use vector multiply
8704 // instructions
8705 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8706 if (is_signed_subword_type(eltype)) {
8707 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8708 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8709 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8710 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8711 } else {
8712 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8713 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8714 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8715 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8716 }
8717 }
8718
8719 switch (load_arrangement) {
8720 case Assembler::T4S:
8721 __ addv(vmul3, load_arrangement, vmul3, vdata3);
8722 __ addv(vmul2, load_arrangement, vmul2, vdata2);
8723 __ addv(vmul1, load_arrangement, vmul1, vdata1);
8724 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8725 break;
8726 case Assembler::T8B:
8727 case Assembler::T8H:
8728 assert(is_subword_type(eltype), "subword type expected");
8729 if (is_signed_subword_type(eltype)) {
8730 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8731 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8732 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8733 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8734 } else {
8735 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8736 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8737 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8738 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8739 }
8740 break;
8741 default:
8742 __ should_not_reach_here();
8743 }
8744
8745 // Process the upper half of a vector
8746 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8747 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
8748 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
8749 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
8750 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
8751 if (is_signed_subword_type(eltype)) {
8752 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8753 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8754 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8755 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8756 } else {
8757 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8758 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8759 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8760 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8761 }
8762 }
8763
8764 __ subsw(rscratch2, rscratch2, 1);
8765 __ br(Assembler::HI, LARGE_LOOP);
8766
8767 __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
8768 __ addv(vmul3, Assembler::T4S, vmul3);
8769 __ umov(result, vmul3, Assembler::S, 0);
8770
8771 __ mov(rscratch2, intpow(31U, vf));
8772
8773 __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
8774 __ addv(vmul2, Assembler::T4S, vmul2);
8775 __ umov(rscratch1, vmul2, Assembler::S, 0);
8776 __ maddw(result, result, rscratch2, rscratch1);
8777
8778 __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
8779 __ addv(vmul1, Assembler::T4S, vmul1);
8780 __ umov(rscratch1, vmul1, Assembler::S, 0);
8781 __ maddw(result, result, rscratch2, rscratch1);
8782
8783 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8784 __ addv(vmul0, Assembler::T4S, vmul0);
8785 __ umov(rscratch1, vmul0, Assembler::S, 0);
8786 __ maddw(result, result, rscratch2, rscratch1);
8787
8788 __ andr(rscratch2, cnt, vf - 1);
8789 __ cbnz(rscratch2, TAIL_SHORTCUT);
8790
8791 __ leave();
8792 __ ret(lr);
8793
8794 return entry;
8795 }
8796
8797 address generate_dsin_dcos(bool isCos) {
8798 __ align(CodeEntryAlignment);
8799 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
8800 StubCodeMark mark(this, stub_id);
8801 address start = __ pc();
8802 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
8803 (address)StubRoutines::aarch64::_two_over_pi,
8804 (address)StubRoutines::aarch64::_pio2,
8805 (address)StubRoutines::aarch64::_dsin_coef,
8806 (address)StubRoutines::aarch64::_dcos_coef);
8807 return start;
8808 }
8809
8810   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings.
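  // Latin1 bytes are widened to UTF-16 on the fly: zip1/zip2 interleave the loaded
  // bytes with the zero register vtmpZ, so each byte b becomes the little-endian
  // halfword 0x00bb, and both strings can then be compared as 64-bit chunks of
  // UTF-16 characters.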
8811 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
8812 Label &DIFF2) {
8813 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
8814 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
8815
8816 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
8817 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8818 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
8819 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
8820
8821 __ fmovd(tmpL, vtmp3);
8822 __ eor(rscratch2, tmp3, tmpL);
8823 __ cbnz(rscratch2, DIFF2);
8824
8825 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8826 __ umov(tmpL, vtmp3, __ D, 1);
8827 __ eor(rscratch2, tmpU, tmpL);
8828 __ cbnz(rscratch2, DIFF1);
8829
8830 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
8831 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8832 __ fmovd(tmpL, vtmp);
8833 __ eor(rscratch2, tmp3, tmpL);
8834 __ cbnz(rscratch2, DIFF2);
8835
8836 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8837 __ umov(tmpL, vtmp, __ D, 1);
8838 __ eor(rscratch2, tmpU, tmpL);
8839 __ cbnz(rscratch2, DIFF1);
8840 }
8841
8842 // r0 = result
8843 // r1 = str1
8844 // r2 = cnt1
8845 // r3 = str2
8846 // r4 = cnt2
8847 // r10 = tmp1
8848 // r11 = tmp2
8849 address generate_compare_long_string_different_encoding(bool isLU) {
8850 __ align(CodeEntryAlignment);
8851 StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
8852 StubCodeMark mark(this, stub_id);
8853 address entry = __ pc();
8854 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
8855 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
8856 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
8857 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
8858 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
8859 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
8860 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
8861
8862 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
8863
8864 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
8865     // cnt2 == number of characters left to compare
8866     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
8867 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8868 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
8869 __ add(str2, str2, isLU ? wordSize : wordSize/2);
8870 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
8871 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
8872 __ eor(rscratch2, tmp1, tmp2);
8873 __ mov(rscratch1, tmp2);
8874 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
8875 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
8876 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
8877 __ push(spilled_regs, sp);
8878 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
8879 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
8880
8881 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8882
8883 if (SoftwarePrefetchHintDistance >= 0) {
8884 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8885 __ br(__ LT, NO_PREFETCH);
8886 __ bind(LARGE_LOOP_PREFETCH);
8887 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
8888 __ mov(tmp4, 2);
8889 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8890 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
8891 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8892 __ subs(tmp4, tmp4, 1);
8893 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
8894 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8895 __ mov(tmp4, 2);
8896 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
8897 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8898 __ subs(tmp4, tmp4, 1);
8899 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
8900 __ sub(cnt2, cnt2, 64);
8901 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8902 __ br(__ GE, LARGE_LOOP_PREFETCH);
8903 }
8904 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
8905 __ bind(NO_PREFETCH);
8906 __ subs(cnt2, cnt2, 16);
8907 __ br(__ LT, TAIL);
8908 __ align(OptoLoopAlignment);
8909 __ bind(SMALL_LOOP); // smaller loop
8910 __ subs(cnt2, cnt2, 16);
8911 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8912 __ br(__ GE, SMALL_LOOP);
8913 __ cmn(cnt2, (u1)16);
8914 __ br(__ EQ, LOAD_LAST);
8915 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
8916 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
8917 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
8918 __ ldr(tmp3, Address(cnt1, -8));
8919 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
8920 __ b(LOAD_LAST);
8921 __ bind(DIFF2);
8922 __ mov(tmpU, tmp3);
8923 __ bind(DIFF1);
8924 __ pop(spilled_regs, sp);
8925 __ b(CALCULATE_DIFFERENCE);
8926 __ bind(LOAD_LAST);
8927     // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
8928     // No need to load them again.
8929 __ mov(tmpU, tmp3);
8930 __ pop(spilled_regs, sp);
8931
8932 // tmp2 points to the address of the last 4 Latin1 characters right now
8933 __ ldrs(vtmp, Address(tmp2));
8934 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8935 __ fmovd(tmpL, vtmp);
8936
8937 __ eor(rscratch2, tmpU, tmpL);
8938 __ cbz(rscratch2, DONE);
8939
8940 // Find the first different characters in the longwords and
8941 // compute their difference.
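  // rscratch2 holds the XOR of the two 64-bit chunks that differ. Because the
  // chunks were loaded little-endian, rev + clz produces a bit index inside the
  // lowest differing byte; rounding it down to a multiple of 16 (andr with -16)
  // gives the bit offset of the first differing 16-bit character, so the lsrv /
  // uxthw pairs below extract exactly that character from each string.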
8942 __ bind(CALCULATE_DIFFERENCE);
8943 __ rev(rscratch2, rscratch2);
8944 __ clz(rscratch2, rscratch2);
8945 __ andr(rscratch2, rscratch2, -16);
8946 __ lsrv(tmp1, tmp1, rscratch2);
8947 __ uxthw(tmp1, tmp1);
8948 __ lsrv(rscratch1, rscratch1, rscratch2);
8949 __ uxthw(rscratch1, rscratch1);
8950 __ subw(result, tmp1, rscratch1);
8951 __ bind(DONE);
8952 __ ret(lr);
8953 return entry;
8954 }
8955
8956 // r0 = input (float16)
8957 // v0 = result (float)
8958 // v1 = temporary float register
8959 address generate_float16ToFloat() {
8960 __ align(CodeEntryAlignment);
8961 StubId stub_id = StubId::stubgen_hf2f_id;
8962 StubCodeMark mark(this, stub_id);
8963 address entry = __ pc();
8964 BLOCK_COMMENT("Entry:");
8965 __ flt16_to_flt(v0, r0, v1);
8966 __ ret(lr);
8967 return entry;
8968 }
8969
8970 // v0 = input (float)
8971 // r0 = result (float16)
8972 // v1 = temporary float register
8973 address generate_floatToFloat16() {
8974 __ align(CodeEntryAlignment);
8975 StubId stub_id = StubId::stubgen_f2hf_id;
8976 StubCodeMark mark(this, stub_id);
8977 address entry = __ pc();
8978 BLOCK_COMMENT("Entry:");
8979 __ flt_to_flt16(r0, v0, v1);
8980 __ ret(lr);
8981 return entry;
8982 }
8983
8984 address generate_method_entry_barrier() {
8985 __ align(CodeEntryAlignment);
8986 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
8987 StubCodeMark mark(this, stub_id);
8988
8989 Label deoptimize_label;
8990
8991 address start = __ pc();
8992
8993 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
8994
8995 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
8996 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8997 // We can get here despite the nmethod being good, if we have not
8998 // yet applied our cross modification fence (or data fence).
8999 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
9000 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
9001 __ ldrw(rscratch2, rscratch2);
9002 __ strw(rscratch2, thread_epoch_addr);
9003 __ isb();
9004 __ membar(__ LoadLoad);
9005 }
9006
9007 __ set_last_Java_frame(sp, rfp, lr, rscratch1);
9008
9009 __ enter();
9010 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
9011
9012 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
9013
9014 __ push_call_clobbered_registers();
9015
9016 __ mov(c_rarg0, rscratch2);
9017 __ call_VM_leaf
9018 (CAST_FROM_FN_PTR
9019 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
9020
9021 __ reset_last_Java_frame(true);
9022
9023 __ mov(rscratch1, r0);
9024
9025 __ pop_call_clobbered_registers();
9026
9027 __ cbnz(rscratch1, deoptimize_label);
9028
9029 __ leave();
9030 __ ret(lr);
9031
9032 __ BIND(deoptimize_label);
9033
9034 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
9035 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
9036
9037 __ mov(sp, rscratch1);
9038 __ br(rscratch2);
9039
9040 return start;
9041 }
9042
9043 // r0 = result
9044 // r1 = str1
9045 // r2 = cnt1
9046 // r3 = str2
9047 // r4 = cnt2
9048 // r10 = tmp1
9049 // r11 = tmp2
9050 address generate_compare_long_string_same_encoding(bool isLL) {
9051 __ align(CodeEntryAlignment);
9052 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
9053 StubCodeMark mark(this, stub_id);
9054 address entry = __ pc();
9055 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9056 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
9057
9058 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
9059
9060     // Exit the large loop when fewer than 64 bytes are left to read, or when we
9061     // are about to prefetch memory beyond the array boundary.
9062 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
9063
9064     // The caller pre-loads 8 bytes before jumping to the stub, so do the comparison directly.
9065 __ eor(rscratch2, tmp1, tmp2);
9066 __ cbnz(rscratch2, CAL_DIFFERENCE);
9067
9068 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
9069 // update pointers, because of previous read
9070 __ add(str1, str1, wordSize);
9071 __ add(str2, str2, wordSize);
9072 if (SoftwarePrefetchHintDistance >= 0) {
9073 __ align(OptoLoopAlignment);
9074 __ bind(LARGE_LOOP_PREFETCH);
9075 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
9076 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
9077
9078 for (int i = 0; i < 4; i++) {
9079 __ ldp(tmp1, tmp1h, Address(str1, i * 16));
9080 __ ldp(tmp2, tmp2h, Address(str2, i * 16));
9081 __ cmp(tmp1, tmp2);
9082 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9083 __ br(Assembler::NE, DIFF);
9084 }
9085 __ sub(cnt2, cnt2, isLL ? 64 : 32);
9086 __ add(str1, str1, 64);
9087 __ add(str2, str2, 64);
9088 __ subs(rscratch2, cnt2, largeLoopExitCondition);
9089 __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
9090 __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
9091 }
9092
9093 __ subs(rscratch1, cnt2, isLL ? 16 : 8);
9094 __ br(Assembler::LE, LESS16);
9095 __ align(OptoLoopAlignment);
9096 __ bind(LOOP_COMPARE16);
9097 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9098 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9099 __ cmp(tmp1, tmp2);
9100 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9101 __ br(Assembler::NE, DIFF);
9102 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9103 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9104 __ br(Assembler::LT, LESS16);
9105
9106 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9107 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9108 __ cmp(tmp1, tmp2);
9109 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9110 __ br(Assembler::NE, DIFF);
9111 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9112 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9113 __ br(Assembler::GE, LOOP_COMPARE16);
9114 __ cbz(cnt2, LENGTH_DIFF);
9115
9116 __ bind(LESS16);
9117 // each 8 compare
9118 __ subs(cnt2, cnt2, isLL ? 8 : 4);
9119 __ br(Assembler::LE, LESS8);
9120 __ ldr(tmp1, Address(__ post(str1, 8)));
9121 __ ldr(tmp2, Address(__ post(str2, 8)));
9122 __ eor(rscratch2, tmp1, tmp2);
9123 __ cbnz(rscratch2, CAL_DIFFERENCE);
9124 __ sub(cnt2, cnt2, isLL ? 8 : 4);
9125
9126 __ bind(LESS8); // directly load last 8 bytes
9127 if (!isLL) {
9128 __ add(cnt2, cnt2, cnt2);
9129 }
9130 __ ldr(tmp1, Address(str1, cnt2));
9131 __ ldr(tmp2, Address(str2, cnt2));
9132 __ eor(rscratch2, tmp1, tmp2);
9133 __ cbz(rscratch2, LENGTH_DIFF);
9134 __ b(CAL_DIFFERENCE);
9135
9136 __ bind(DIFF);
9137 __ cmp(tmp1, tmp2);
9138 __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
9139 __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
9140 // reuse rscratch2 register for the result of eor instruction
9141 __ eor(rscratch2, tmp1, tmp2);
9142
9143 __ bind(CAL_DIFFERENCE);
9144 __ rev(rscratch2, rscratch2);
9145 __ clz(rscratch2, rscratch2);
9146 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
9147 __ lsrv(tmp1, tmp1, rscratch2);
9148 __ lsrv(tmp2, tmp2, rscratch2);
9149 if (isLL) {
9150 __ uxtbw(tmp1, tmp1);
9151 __ uxtbw(tmp2, tmp2);
9152 } else {
9153 __ uxthw(tmp1, tmp1);
9154 __ uxthw(tmp2, tmp2);
9155 }
9156 __ subw(result, tmp1, tmp2);
9157
9158 __ bind(LENGTH_DIFF);
9159 __ ret(lr);
9160 return entry;
9161 }
9162
9163 enum string_compare_mode {
9164 LL,
9165 LU,
9166 UL,
9167 UU,
9168 };
9169
9170 // The following registers are declared in aarch64.ad
9171 // r0 = result
9172 // r1 = str1
9173 // r2 = cnt1
9174 // r3 = str2
9175 // r4 = cnt2
9176 // r10 = tmp1
9177 // r11 = tmp2
9178 // z0 = ztmp1
9179 // z1 = ztmp2
9180 // p0 = pgtmp1
9181 // p1 = pgtmp2
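  // A rough sketch (illustrative pseudocode only, with a hypothetical
  // any_lane_differs helper) of the predicated SVE loop generated below, where
  // vec_len is the number of characters per vector for the given mode:
  //
  //   pg = whilelt(0, cnt);                    // all-true while cnt >= vec_len
  //   for (idx = 0; idx < cnt - vec_len; idx += vec_len) {
  //     if (any_lane_differs(str1 + idx, str2 + idx, pg)) goto MISMATCH;
  //   }
  //   pg = whilelt(idx, cnt);                  // partial predicate for the tail
  //   if (any_lane_differs(str1 + idx, str2 + idx, pg)) goto MISMATCH;
  //   goto DONE;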
9182 address generate_compare_long_string_sve(string_compare_mode mode) {
9183 StubId stub_id;
9184 switch (mode) {
9185 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
9186 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
9187 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
9188 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
9189 default: ShouldNotReachHere();
9190 }
9191
9192 __ align(CodeEntryAlignment);
9193 address entry = __ pc();
9194 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9195 tmp1 = r10, tmp2 = r11;
9196
9197 Label LOOP, DONE, MISMATCH;
9198 Register vec_len = tmp1;
9199 Register idx = tmp2;
9200 // The minimum of the string lengths has been stored in cnt2.
9201 Register cnt = cnt2;
9202 FloatRegister ztmp1 = z0, ztmp2 = z1;
9203 PRegister pgtmp1 = p0, pgtmp2 = p1;
9204
9205 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
9206 switch (mode) { \
9207 case LL: \
9208 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
9209 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
9210 break; \
9211 case LU: \
9212 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
9213 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9214 break; \
9215 case UL: \
9216 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9217 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
9218 break; \
9219 case UU: \
9220 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9221 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9222 break; \
9223 default: \
9224 ShouldNotReachHere(); \
9225 }
9226
9227 StubCodeMark mark(this, stub_id);
9228
9229 __ mov(idx, 0);
9230 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9231
9232 if (mode == LL) {
9233 __ sve_cntb(vec_len);
9234 } else {
9235 __ sve_cnth(vec_len);
9236 }
9237
9238 __ sub(rscratch1, cnt, vec_len);
9239
9240 __ bind(LOOP);
9241
9242 // main loop
9243 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9244 __ add(idx, idx, vec_len);
9245 // Compare strings.
9246 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9247 __ br(__ NE, MISMATCH);
9248 __ cmp(idx, rscratch1);
9249 __ br(__ LT, LOOP);
9250
9251 // post loop, last iteration
9252 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9253
9254 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9255 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9256 __ br(__ EQ, DONE);
9257
9258 __ bind(MISMATCH);
9259
9260 // Crop the vector to find its location.
9261 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
9262 // Extract the first different characters of each string.
9263 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
9264 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
9265
9266 // Compute the difference of the first different characters.
9267 __ sub(result, rscratch1, rscratch2);
9268
9269 __ bind(DONE);
9270 __ ret(lr);
9271 #undef LOAD_PAIR
9272 return entry;
9273 }
9274
9275 void generate_compare_long_strings() {
9276 if (UseSVE == 0) {
9277 StubRoutines::aarch64::_compare_long_string_LL
9278 = generate_compare_long_string_same_encoding(true);
9279 StubRoutines::aarch64::_compare_long_string_UU
9280 = generate_compare_long_string_same_encoding(false);
9281 StubRoutines::aarch64::_compare_long_string_LU
9282 = generate_compare_long_string_different_encoding(true);
9283 StubRoutines::aarch64::_compare_long_string_UL
9284 = generate_compare_long_string_different_encoding(false);
9285 } else {
9286 StubRoutines::aarch64::_compare_long_string_LL
9287 = generate_compare_long_string_sve(LL);
9288 StubRoutines::aarch64::_compare_long_string_UU
9289 = generate_compare_long_string_sve(UU);
9290 StubRoutines::aarch64::_compare_long_string_LU
9291 = generate_compare_long_string_sve(LU);
9292 StubRoutines::aarch64::_compare_long_string_UL
9293 = generate_compare_long_string_sve(UL);
9294 }
9295 }
9296
9297 // R0 = result
9298 // R1 = str2
9299 // R2 = cnt1
9300 // R3 = str1
9301 // R4 = cnt2
9302 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
9303 //
9304   // This generic linear code uses a few additional ideas that make it faster:
9305   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
9306   // in order to skip the initial load (helps on systems with 1 ld pipeline)
9307   // 2) we can use a "fast" algorithm for finding a single character, searching for the
9308   // first symbol with fewer branches (1 branch per loaded register instead of a
9309   // branch per symbol); this is where constants like
9310   // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
9311   // 3) after loading and analyzing the 1st register of the source string, it can be
9312   // used to search for every occurrence of the 1st character, saving a few loads
9313   // compared with a "simpler-but-slower" implementation
9314   // 4) in order to avoid lots of push/pop operations, the code below heavily
9315   // re-uses/re-initializes/compresses register values, which makes the code
9316   // larger and a bit less readable; however, most of the extra operations are
9317   // issued during loads or branches, so the penalty is minimal
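  //
  // The "fast" single-character search in (2) is the classic SWAR zero-byte trick:
  // XOR a loaded register of the text with the broadcast first pattern character
  // (the multiply by 0x0101...01 / 0x0001...0001 below does the broadcast), so a
  // matching position becomes an all-zero byte/halfword, then detect zero lanes
  // branchlessly. An illustrative scalar sketch for the byte (Latin1) case:
  //
  //   // nonzero iff some byte of x equals c
  //   uint64_t has_byte(uint64_t x, uint8_t c) {
  //     uint64_t v = x ^ (0x0101010101010101ULL * c);
  //     return (v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL;
  //   }
  //
  // The code below uses the equivalent formulation with orr 0x7f7f...7f and bics,
  // and the corresponding 16-bit constants for the UTF-16 case.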
9318 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
9319 StubId stub_id;
9320 if (str1_isL) {
9321 if (str2_isL) {
9322 stub_id = StubId::stubgen_string_indexof_linear_ll_id;
9323 } else {
9324 stub_id = StubId::stubgen_string_indexof_linear_ul_id;
9325 }
9326 } else {
9327 if (str2_isL) {
9328 ShouldNotReachHere();
9329 } else {
9330 stub_id = StubId::stubgen_string_indexof_linear_uu_id;
9331 }
9332 }
9333 __ align(CodeEntryAlignment);
9334 StubCodeMark mark(this, stub_id);
9335 address entry = __ pc();
9336
9337 int str1_chr_size = str1_isL ? 1 : 2;
9338 int str2_chr_size = str2_isL ? 1 : 2;
9339 int str1_chr_shift = str1_isL ? 0 : 1;
9340 int str2_chr_shift = str2_isL ? 0 : 1;
9341 bool isL = str1_isL && str2_isL;
9342 // parameters
9343 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
9344 // temporary registers
9345 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
9346 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
9347 // redefinitions
9348 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
9349
9350 __ push(spilled_regs, sp);
9351 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
9352 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
9353 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
9354 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
9355 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
9356 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
9357 // Read whole register from str1. It is safe, because length >=8 here
9358 __ ldr(ch1, Address(str1));
9359 // Read whole register from str2. It is safe, because length >=8 here
9360 __ ldr(ch2, Address(str2));
9361 __ sub(cnt2, cnt2, cnt1);
9362 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
9363 if (str1_isL != str2_isL) {
9364 __ eor(v0, __ T16B, v0, v0);
9365 }
9366 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
9367 __ mul(first, first, tmp1);
9368 // check if we have less than 1 register to check
9369 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
9370 if (str1_isL != str2_isL) {
9371 __ fmovd(v1, ch1);
9372 }
9373 __ br(__ LE, L_SMALL);
9374 __ eor(ch2, first, ch2);
9375 if (str1_isL != str2_isL) {
9376 __ zip1(v1, __ T16B, v1, v0);
9377 }
9378 __ sub(tmp2, ch2, tmp1);
9379 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9380 __ bics(tmp2, tmp2, ch2);
9381 if (str1_isL != str2_isL) {
9382 __ fmovd(ch1, v1);
9383 }
9384 __ br(__ NE, L_HAS_ZERO);
9385 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9386 __ add(result, result, wordSize/str2_chr_size);
9387 __ add(str2, str2, wordSize);
9388 __ br(__ LT, L_POST_LOOP);
9389 __ BIND(L_LOOP);
9390 __ ldr(ch2, Address(str2));
9391 __ eor(ch2, first, ch2);
9392 __ sub(tmp2, ch2, tmp1);
9393 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9394 __ bics(tmp2, tmp2, ch2);
9395 __ br(__ NE, L_HAS_ZERO);
9396 __ BIND(L_LOOP_PROCEED);
9397 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9398 __ add(str2, str2, wordSize);
9399 __ add(result, result, wordSize/str2_chr_size);
9400 __ br(__ GE, L_LOOP);
9401 __ BIND(L_POST_LOOP);
9402 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
9403 __ br(__ LE, NOMATCH);
9404 __ ldr(ch2, Address(str2));
9405 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9406 __ eor(ch2, first, ch2);
9407 __ sub(tmp2, ch2, tmp1);
9408 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9409 __ mov(tmp4, -1); // all bits set
9410 __ b(L_SMALL_PROCEED);
9411 __ align(OptoLoopAlignment);
9412 __ BIND(L_SMALL);
9413 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9414 __ eor(ch2, first, ch2);
9415 if (str1_isL != str2_isL) {
9416 __ zip1(v1, __ T16B, v1, v0);
9417 }
9418 __ sub(tmp2, ch2, tmp1);
9419 __ mov(tmp4, -1); // all bits set
9420 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9421 if (str1_isL != str2_isL) {
9422 __ fmovd(ch1, v1); // move converted 4 symbols
9423 }
9424 __ BIND(L_SMALL_PROCEED);
9425 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
9426 __ bic(tmp2, tmp2, ch2);
9427 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
9428 __ rbit(tmp2, tmp2);
9429 __ br(__ EQ, NOMATCH);
9430 __ BIND(L_SMALL_HAS_ZERO_LOOP);
9431 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
9432 __ cmp(cnt1, u1(wordSize/str2_chr_size));
9433 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
9434 if (str2_isL) { // LL
9435 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9436 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9437 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9438 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9439 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9440 } else {
9441 __ mov(ch2, 0xE); // all bits in byte set except last one
9442 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9443 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9444 __ lslv(tmp2, tmp2, tmp4);
9445 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9446 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9447 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9448 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9449 }
9450 __ cmp(ch1, ch2);
9451 __ mov(tmp4, wordSize/str2_chr_size);
9452 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9453 __ BIND(L_SMALL_CMP_LOOP);
9454 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9455 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9456 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9457 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9458 __ add(tmp4, tmp4, 1);
9459 __ cmp(tmp4, cnt1);
9460 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
9461 __ cmp(first, ch2);
9462 __ br(__ EQ, L_SMALL_CMP_LOOP);
9463 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
9464 __ cbz(tmp2, NOMATCH); // no more matches. exit
9465 __ clz(tmp4, tmp2);
9466 __ add(result, result, 1); // advance index
9467 __ add(str2, str2, str2_chr_size); // advance pointer
9468 __ b(L_SMALL_HAS_ZERO_LOOP);
9469 __ align(OptoLoopAlignment);
9470 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
9471 __ cmp(first, ch2);
9472 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9473 __ b(DONE);
9474 __ align(OptoLoopAlignment);
9475 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
9476 if (str2_isL) { // LL
9477 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9478 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9479 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9480 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9481 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9482 } else {
9483 __ mov(ch2, 0xE); // all bits in byte set except last one
9484 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9485 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9486 __ lslv(tmp2, tmp2, tmp4);
9487 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9488 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9489 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9490 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9491 }
9492 __ cmp(ch1, ch2);
9493 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9494 __ b(DONE);
9495 __ align(OptoLoopAlignment);
9496 __ BIND(L_HAS_ZERO);
9497 __ rbit(tmp2, tmp2);
9498 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
9499 // Now, perform compression of the counters (cnt2 and cnt1) into one register.
9500 // This is fine because both counters are 32-bit and are not changed in this
9501 // loop; they are restored on exit, so cnt1 can be re-used inside the loop.
9502 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
9503 __ sub(result, result, 1);
9504 __ BIND(L_HAS_ZERO_LOOP);
9505 __ mov(cnt1, wordSize/str2_chr_size);
9506 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
9507 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
9508 if (str2_isL) {
9509 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9510 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9511 __ lslv(tmp2, tmp2, tmp4);
9512 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9513 __ add(tmp4, tmp4, 1);
9514 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9515 __ lsl(tmp2, tmp2, 1);
9516 __ mov(tmp4, wordSize/str2_chr_size);
9517 } else {
9518 __ mov(ch2, 0xE);
9519 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9520 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9521 __ lslv(tmp2, tmp2, tmp4);
9522 __ add(tmp4, tmp4, 1);
9523 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9524 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9525 __ lsl(tmp2, tmp2, 1);
9526 __ mov(tmp4, wordSize/str2_chr_size);
9527 __ sub(str2, str2, str2_chr_size);
9528 }
9529 __ cmp(ch1, ch2);
9530 __ mov(tmp4, wordSize/str2_chr_size);
9531 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9532 __ BIND(L_CMP_LOOP);
9533 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9534 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9535 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9536 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9537 __ add(tmp4, tmp4, 1);
9538 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
9539 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
9540 __ cmp(cnt1, ch2);
9541 __ br(__ EQ, L_CMP_LOOP);
9542 __ BIND(L_CMP_LOOP_NOMATCH);
9543 // the candidate did not match here
9544 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
9545 __ clz(tmp4, tmp2);
9546 __ add(str2, str2, str2_chr_size); // advance pointer
9547 __ b(L_HAS_ZERO_LOOP);
9548 __ align(OptoLoopAlignment);
9549 __ BIND(L_CMP_LOOP_LAST_CMP);
9550 __ cmp(cnt1, ch2);
9551 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9552 __ b(DONE);
9553 __ align(OptoLoopAlignment);
9554 __ BIND(L_CMP_LOOP_LAST_CMP2);
9555 if (str2_isL) {
9556 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9557 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9558 __ lslv(tmp2, tmp2, tmp4);
9559 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9560 __ add(tmp4, tmp4, 1);
9561 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9562 __ lsl(tmp2, tmp2, 1);
9563 } else {
9564 __ mov(ch2, 0xE);
9565 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9566 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9567 __ lslv(tmp2, tmp2, tmp4);
9568 __ add(tmp4, tmp4, 1);
9569 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9570 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9571 __ lsl(tmp2, tmp2, 1);
9572 __ sub(str2, str2, str2_chr_size);
9573 }
9574 __ cmp(ch1, ch2);
9575 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9576 __ b(DONE);
9577 __ align(OptoLoopAlignment);
9578 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
9579 // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until
9580 // the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
9581 // so result was increased by at most wordSize/str2_chr_size - 1 and the
9582 // respective high bits weren't changed. L_LOOP_PROCEED will increase
9583 // result by the number of analyzed characters, so we can just reset the lower
9584 // bits of result here: clear the 2 lower bits for UU/UL and 3 bits for LL.
9585 // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
9586 // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
9587 // index of the last analyzed substring inside the current octet, so str2
9588 // is at the respective start address; we need to advance it to the next octet.
9589 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
9590 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
9591 __ bfm(result, zr, 0, 2 - str2_chr_shift);
9592 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
9593 __ movw(cnt2, cnt2);
9594 __ b(L_LOOP_PROCEED);
9595 __ align(OptoLoopAlignment);
9596 __ BIND(NOMATCH);
9597 __ mov(result, -1);
9598 __ BIND(DONE);
9599 __ pop(spilled_regs, sp);
9600 __ ret(lr);
9601 return entry;
9602 }
9603
9604 void generate_string_indexof_stubs() {
9605 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
9606 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
9607 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
9608 }
9609
9610 void inflate_and_store_2_fp_registers(bool generatePrfm,
9611 FloatRegister src1, FloatRegister src2) {
9612 Register dst = r1;
9613 __ zip1(v1, __ T16B, src1, v0);
9614 __ zip2(v2, __ T16B, src1, v0);
9615 if (generatePrfm) {
9616 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
9617 }
9618 __ zip1(v3, __ T16B, src2, v0);
9619 __ zip2(v4, __ T16B, src2, v0);
9620 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
9621 }
9622
9623 // R0 = src
9624 // R1 = dst
9625 // R2 = len
9626 // R3 = len >> 3
9627 // V0 = 0
9628 // v1 = loaded 8 bytes
9629 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
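// The inflation itself is done with zip1/zip2 against the all-zero v0:
// interleaving each source byte with a zero byte turns the Latin-1 byte
// 0xNN into the little-endian 16-bit char 0x00NN in place.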
9630 address generate_large_byte_array_inflate() {
9631 __ align(CodeEntryAlignment);
9632 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
9633 StubCodeMark mark(this, stub_id);
9634 address entry = __ pc();
9635 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
9636 Register src = r0, dst = r1, len = r2, octetCounter = r3;
9637 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
9638
9639 // do one more 8-byte read so that the address is 16-byte aligned in most cases;
9640 // this also lets us use a single store instruction
9641 __ ldrd(v2, __ post(src, 8));
9642 __ sub(octetCounter, octetCounter, 2);
9643 __ zip1(v1, __ T16B, v1, v0);
9644 __ zip1(v2, __ T16B, v2, v0);
9645 __ st1(v1, v2, __ T16B, __ post(dst, 32));
9646 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9647 __ subs(rscratch1, octetCounter, large_loop_threshold);
9648 __ br(__ LE, LOOP_START);
9649 __ b(LOOP_PRFM_START);
9650 __ bind(LOOP_PRFM);
9651 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9652 __ bind(LOOP_PRFM_START);
9653 __ prfm(Address(src, SoftwarePrefetchHintDistance));
9654 __ sub(octetCounter, octetCounter, 8);
9655 __ subs(rscratch1, octetCounter, large_loop_threshold);
9656 inflate_and_store_2_fp_registers(true, v3, v4);
9657 inflate_and_store_2_fp_registers(true, v5, v6);
9658 __ br(__ GT, LOOP_PRFM);
9659 __ cmp(octetCounter, (u1)8);
9660 __ br(__ LT, DONE);
9661 __ bind(LOOP);
9662 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9663 __ bind(LOOP_START);
9664 __ sub(octetCounter, octetCounter, 8);
9665 __ cmp(octetCounter, (u1)8);
9666 inflate_and_store_2_fp_registers(false, v3, v4);
9667 inflate_and_store_2_fp_registers(false, v5, v6);
9668 __ br(__ GE, LOOP);
9669 __ bind(DONE);
9670 __ ret(lr);
9671 return entry;
9672 }
9673
9674 /**
9675 * Arguments:
9676 *
9677 * Input:
9678 * c_rarg0 - current state address
9679 * c_rarg1 - H key address
9680 * c_rarg2 - data address
9681 * c_rarg3 - number of blocks
9682 *
9683 * Output:
9684 * Updated state at c_rarg0
9685 */
9686 address generate_ghash_processBlocks() {
9687 // Bafflingly, GCM uses little-endian for the byte order, but
9688 // big-endian for the bit order. For example, the polynomial 1 is
9689 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
9690 //
9691 // So, we must either reverse the bytes in each word and do
9692 // everything big-endian or reverse the bits in each byte and do
9693 // it little-endian. On AArch64 it's more idiomatic to reverse
9694 // the bits in each byte (we have an instruction, RBIT, to do
9695 // that) and keep the data in little-endian bit order through the
9696 // calculation, bit-reversing the inputs and outputs.
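// For example, RBIT turns the byte 0x80 into 0x01, so once each data
// block has been bit-reversed, the GCM encoding of the polynomial 1
// (80 00 .. 00) reads as the ordinary little-endian integer 1.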
9697
9698 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
9699 StubCodeMark mark(this, stub_id);
9700 Label polynomial; // local data generated at end of stub
9701 __ align(CodeEntryAlignment);
9702 address start = __ pc();
9703
9704 Register state = c_rarg0;
9705 Register subkeyH = c_rarg1;
9706 Register data = c_rarg2;
9707 Register blocks = c_rarg3;
9708
9709 FloatRegister vzr = v30;
9710 __ eor(vzr, __ T16B, vzr, vzr); // zero register
9711
9712 __ adr(rscratch1, polynomial);
9713 __ ldrq(v24, rscratch1); // The field polynomial
9714
9715 __ ldrq(v0, Address(state));
9716 __ ldrq(v1, Address(subkeyH));
9717
9718 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
9719 __ rbit(v0, __ T16B, v0);
9720 __ rev64(v1, __ T16B, v1);
9721 __ rbit(v1, __ T16B, v1);
9722
9723 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
9724 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
9725
9726 {
9727 Label L_ghash_loop;
9728 __ bind(L_ghash_loop);
9729
9730 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
9731 // reversing each byte
9732 __ rbit(v2, __ T16B, v2);
9733 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
9734
9735 // Multiply state in v2 by subkey in v1
9736 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
9737 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
9738 /*temps*/v6, v3, /*reuse/clobber b*/v2);
9739 // Reduce v7:v5 by the field polynomial
9740 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
9741
9742 __ sub(blocks, blocks, 1);
9743 __ cbnz(blocks, L_ghash_loop);
9744 }
9745
9746 // The bit-reversed result is at this point in v0
9747 __ rev64(v0, __ T16B, v0);
9748 __ rbit(v0, __ T16B, v0);
9749
9750 __ st1(v0, __ T16B, state);
9751 __ ret(lr);
9752
9753 // bind label and generate local polynomial data
9754 __ align(wordSize * 2);
9755 __ bind(polynomial);
9756 __ emit_int64(0x87); // The low-order bits of the field
9757 // polynomial (i.e. p = z^7+z^2+z+1)
9758 // repeated in the low and high parts of a
9759 // 128-bit vector
9760 __ emit_int64(0x87);
9761
9762 return start;
9763 }
9764
9765 address generate_ghash_processBlocks_wide() {
9766 address small = generate_ghash_processBlocks();
9767
9768 StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
9769 StubCodeMark mark(this, stub_id);
9770 Label polynomial; // local data generated after stub
9771 __ align(CodeEntryAlignment);
9772 address start = __ pc();
9773
9774 Register state = c_rarg0;
9775 Register subkeyH = c_rarg1;
9776 Register data = c_rarg2;
9777 Register blocks = c_rarg3;
9778
9779 const int unroll = 4;
9780
9781 __ cmp(blocks, (unsigned char)(unroll * 2));
9782 __ br(__ LT, small);
9783
9784 if (unroll > 1) {
9785 // Save state before entering routine
9786 __ sub(sp, sp, 4 * 16);
9787 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
9788 __ sub(sp, sp, 4 * 16);
9789 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
9790 }
9791
9792 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
9793
9794 if (unroll > 1) {
9795 // And restore state
9796 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
9797 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
9798 }
9799
9800 __ cmp(blocks, (unsigned char)0);
9801 __ br(__ GT, small);
9802
9803 __ ret(lr);
9804
9805 // bind label and generate polynomial data
9806 __ align(wordSize * 2);
9807 __ bind(polynomial);
9808 __ emit_int64(0x87); // The low-order bits of the field
9809 // polynomial (i.e. p = z^7+z^2+z+1)
9810 // repeated in the low and high parts of a
9811 // 128-bit vector
9812 __ emit_int64(0x87);
9813
9814 return start;
9815
9816 }
9817
9818 void generate_base64_encode_simdround(Register src, Register dst,
9819 FloatRegister codec, u8 size) {
9820
9821 FloatRegister in0 = v4, in1 = v5, in2 = v6;
9822 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
9823 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
9824
9825 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9826
9827 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
9828
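// Split each 3-byte group (b0, b1, b2) into four 6-bit indices:
//   ind0 = b0 >> 2
//   ind1 = ((b0 & 0x3) << 4) | (b1 >> 4)
//   ind2 = ((b1 & 0xf) << 2) | (b2 >> 6)
//   ind3 = b2 & 0x3f
// using vector shifts and ORs, then map each index through the 64-byte
// codec table with tbl.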
9829 __ ushr(ind0, arrangement, in0, 2);
9830
9831 __ ushr(ind1, arrangement, in1, 2);
9832 __ shl(in0, arrangement, in0, 6);
9833 __ orr(ind1, arrangement, ind1, in0);
9834 __ ushr(ind1, arrangement, ind1, 2);
9835
9836 __ ushr(ind2, arrangement, in2, 4);
9837 __ shl(in1, arrangement, in1, 4);
9838 __ orr(ind2, arrangement, in1, ind2);
9839 __ ushr(ind2, arrangement, ind2, 2);
9840
9841 __ shl(ind3, arrangement, in2, 2);
9842 __ ushr(ind3, arrangement, ind3, 2);
9843
9844 __ tbl(out0, arrangement, codec, 4, ind0);
9845 __ tbl(out1, arrangement, codec, 4, ind1);
9846 __ tbl(out2, arrangement, codec, 4, ind2);
9847 __ tbl(out3, arrangement, codec, 4, ind3);
9848
9849 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
9850 }
9851
9852 /**
9853 * Arguments:
9854 *
9855 * Input:
9856 * c_rarg0 - src_start
9857 * c_rarg1 - src_offset
9858 * c_rarg2 - src_length
9859 * c_rarg3 - dest_start
9860 * c_rarg4 - dest_offset
9861 * c_rarg5 - isURL
9862 *
9863 */
9864 address generate_base64_encodeBlock() {
9865
9866 static const char toBase64[64] = {
9867 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9868 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9869 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9870 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9871 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
9872 };
9873
9874 static const char toBase64URL[64] = {
9875 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9876 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9877 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9878 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9879 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
9880 };
9881
9882 __ align(CodeEntryAlignment);
9883 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
9884 StubCodeMark mark(this, stub_id);
9885 address start = __ pc();
9886
9887 Register src = c_rarg0; // source array
9888 Register soff = c_rarg1; // source start offset
9889 Register send = c_rarg2; // source end offset
9890 Register dst = c_rarg3; // dest array
9891 Register doff = c_rarg4; // position for writing to dest array
9892 Register isURL = c_rarg5; // Base64 or URL character set
9893
9894 // c_rarg6 and c_rarg7 are free to use as temps
9895 Register codec = c_rarg6;
9896 Register length = c_rarg7;
9897
9898 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
9899
9900 __ add(src, src, soff);
9901 __ add(dst, dst, doff);
9902 __ sub(length, send, soff);
9903
9904 // load the codec base address
9905 __ lea(codec, ExternalAddress((address) toBase64));
9906 __ cbz(isURL, ProcessData);
9907 __ lea(codec, ExternalAddress((address) toBase64URL));
9908
9909 __ BIND(ProcessData);
9910
9911 // too short to form a SIMD loop; fall back to scalar processing
9912 __ cmp(length, (u1)24);
9913 __ br(Assembler::LT, Process3B);
9914
9915 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
9916
9917 __ BIND(Process48B);
9918 __ cmp(length, (u1)48);
9919 __ br(Assembler::LT, Process24B);
9920 generate_base64_encode_simdround(src, dst, v0, 16);
9921 __ sub(length, length, 48);
9922 __ b(Process48B);
9923
9924 __ BIND(Process24B);
9925 __ cmp(length, (u1)24);
9926 __ br(Assembler::LT, SIMDExit);
9927 generate_base64_encode_simdround(src, dst, v0, 8);
9928 __ sub(length, length, 24);
9929
9930 __ BIND(SIMDExit);
9931 __ cbz(length, Exit);
9932
9933 __ BIND(Process3B);
9934 // 3 src bytes, 24 bits
9935 __ ldrb(r10, __ post(src, 1));
9936 __ ldrb(r11, __ post(src, 1));
9937 __ ldrb(r12, __ post(src, 1));
9938 __ orrw(r11, r11, r10, Assembler::LSL, 8);
9939 __ orrw(r12, r12, r11, Assembler::LSL, 8);
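// r12 now holds (b0 << 16) | (b1 << 8) | b2; the four 6-bit codec
// indices are bit-fields [23:18], [17:12], [11:6] and [5:0] of r12.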
9940 // codec index
9941 __ ubfmw(r15, r12, 18, 23);
9942 __ ubfmw(r14, r12, 12, 17);
9943 __ ubfmw(r13, r12, 6, 11);
9944 __ andw(r12, r12, 63);
9945 // get the code based on the codec
9946 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
9947 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
9948 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
9949 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
9950 __ strb(r15, __ post(dst, 1));
9951 __ strb(r14, __ post(dst, 1));
9952 __ strb(r13, __ post(dst, 1));
9953 __ strb(r12, __ post(dst, 1));
9954 __ sub(length, length, 3);
9955 __ cbnz(length, Process3B);
9956
9957 __ BIND(Exit);
9958 __ ret(lr);
9959
9960 return start;
9961 }
9962
9963 void generate_base64_decode_simdround(Register src, Register dst,
9964 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
9965
9966 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
9967 FloatRegister out0 = v20, out1 = v21, out2 = v22;
9968
9969 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
9970 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
9971
9972 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
9973
9974 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9975
9976 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
9977
9978 // we need an unsigned saturating subtract to make sure all input values
9979 // in the range [0, 63] yield index 0 for the higher-half lookup
9980 __ uqsubv(decH0, __ T16B, in0, v27);
9981 __ uqsubv(decH1, __ T16B, in1, v27);
9982 __ uqsubv(decH2, __ T16B, in2, v27);
9983 __ uqsubv(decH3, __ T16B, in3, v27);
9984
9985 // lower half lookup
9986 __ tbl(decL0, arrangement, codecL, 4, in0);
9987 __ tbl(decL1, arrangement, codecL, 4, in1);
9988 __ tbl(decL2, arrangement, codecL, 4, in2);
9989 __ tbl(decL3, arrangement, codecL, 4, in3);
9990
9991 // higher half lookup
9992 __ tbx(decH0, arrangement, codecH, 4, decH0);
9993 __ tbx(decH1, arrangement, codecH, 4, decH1);
9994 __ tbx(decH2, arrangement, codecH, 4, decH2);
9995 __ tbx(decH3, arrangement, codecH, 4, decH3);
9996
9997 // combine lower and higher
9998 __ orr(decL0, arrangement, decL0, decH0);
9999 __ orr(decL1, arrangement, decL1, decH1);
10000 __ orr(decL2, arrangement, decL2, decH2);
10001 __ orr(decL3, arrangement, decL3, decH3);
10002
10003 // check for illegal inputs: any value larger than 63 (the maximum of 6 bits)
10004 __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10005 __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10006 __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10007 __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10008 __ orr(in0, arrangement, decH0, decH1);
10009 __ orr(in1, arrangement, decH2, decH3);
10010 __ orr(in2, arrangement, in0, in1);
10011 __ umaxv(in3, arrangement, in2);
10012 __ umov(rscratch2, in3, __ B, 0);
10013
10014 // get the data to output
10015 __ shl(out0, arrangement, decL0, 2);
10016 __ ushr(out1, arrangement, decL1, 4);
10017 __ orr(out0, arrangement, out0, out1);
10018 __ shl(out1, arrangement, decL1, 4);
10019 __ ushr(out2, arrangement, decL2, 2);
10020 __ orr(out1, arrangement, out1, out2);
10021 __ shl(out2, arrangement, decL2, 6);
10022 __ orr(out2, arrangement, out2, decL3);
10023
10024 __ cbz(rscratch2, NoIllegalData);
10025
10026 // handle illegal input
10027 __ umov(r10, in2, __ D, 0);
10028 if (size == 16) {
10029 __ cbnz(r10, ErrorInLowerHalf);
10030
10031 // illegal input is in higher half, store the lower half now.
10032 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10033
10034 __ umov(r10, in2, __ D, 1);
10035 __ umov(r11, out0, __ D, 1);
10036 __ umov(r12, out1, __ D, 1);
10037 __ umov(r13, out2, __ D, 1);
10038 __ b(StoreLegalData);
10039
10040 __ BIND(ErrorInLowerHalf);
10041 }
10042 __ umov(r11, out0, __ D, 0);
10043 __ umov(r12, out1, __ D, 0);
10044 __ umov(r13, out2, __ D, 0);
10045
10046 __ BIND(StoreLegalData);
10047 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10048 __ strb(r11, __ post(dst, 1));
10049 __ strb(r12, __ post(dst, 1));
10050 __ strb(r13, __ post(dst, 1));
10051 __ lsr(r10, r10, 8);
10052 __ lsr(r11, r11, 8);
10053 __ lsr(r12, r12, 8);
10054 __ lsr(r13, r13, 8);
10055 __ b(StoreLegalData);
10056
10057 __ BIND(NoIllegalData);
10058 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10059 }
10060
10061
10062 /**
10063 * Arguments:
10064 *
10065 * Input:
10066 * c_rarg0 - src_start
10067 * c_rarg1 - src_offset
10068 * c_rarg2 - src_length
10069 * c_rarg3 - dest_start
10070 * c_rarg4 - dest_offset
10071 * c_rarg5 - isURL
10072 * c_rarg6 - isMIME
10073 *
10074 */
10075 address generate_base64_decodeBlock() {
10076
10077 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10078 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10079 // titled "Base64 decoding".
10080
10081 // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
10082 // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That
10083 // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
10084 static const uint8_t fromBase64ForNoSIMD[256] = {
10085 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10086 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10087 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10088 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10089 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10090 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
10091 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10092 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10093 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10094 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10095 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10096 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10097 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10098 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10099 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10100 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10101 };
10102
10103 static const uint8_t fromBase64URLForNoSIMD[256] = {
10104 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10105 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10106 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10107 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10108 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10109 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
10110 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10111 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10112 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10113 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10114 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10115 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10116 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10117 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10118 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10119 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10120 };
10121
10122 // A legal value of a base64 code is in the range [0, 127]. We need two lookups
10123 // with tbl/tbx and combine them to get the decoded data. The 1st table vector
10124 // lookup uses tbl; out-of-range indices are set to 0 in the destination. The 2nd
10125 // table vector lookup uses tbx; out-of-range indices are left unchanged in the
10126 // destination. Input [64..126] is mapped to index [65, 127] in the second lookup.
10127 // The value at index 64 is set to 0, so that we know we have already obtained the
10128 // decoded data with the 1st lookup.
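// For example, decoding 'A' (0x41): index 0x41 is out of range for the
// first (tbl) lookup, which therefore yields 0; the saturating subtract
// gives 0x41 - 63 = 2, so the second (tbx) lookup reads entry 64 + 2 = 66
// of the table below, which is 0 -- the correct decoded value. The two
// partial results are then ORed together, and any combined value larger
// than 63 flags an illegal input byte.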
10129 static const uint8_t fromBase64ForSIMD[128] = {
10130 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10131 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10132 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10133 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10134 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10135 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10136 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10137 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10138 };
10139
10140 static const uint8_t fromBase64URLForSIMD[128] = {
10141 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10142 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10143 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10144 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10145 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10146 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10147 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10148 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10149 };
10150
10151 __ align(CodeEntryAlignment);
10152 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10153 StubCodeMark mark(this, stub_id);
10154 address start = __ pc();
10155
10156 Register src = c_rarg0; // source array
10157 Register soff = c_rarg1; // source start offset
10158 Register send = c_rarg2; // source end offset
10159 Register dst = c_rarg3; // dest array
10160 Register doff = c_rarg4; // position for writing to dest array
10161 Register isURL = c_rarg5; // Base64 or URL character set
10162 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation
10163
10164 Register length = send; // reuse send as length of source data to process
10165
10166 Register simd_codec = c_rarg6;
10167 Register nosimd_codec = c_rarg7;
10168
10169 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10170
10171 __ enter();
10172
10173 __ add(src, src, soff);
10174 __ add(dst, dst, doff);
10175
10176 __ mov(doff, dst);
10177
10178 __ sub(length, send, soff);
10179 __ bfm(length, zr, 0, 1);
10180
10181 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10182 __ cbz(isURL, ProcessData);
10183 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10184
10185 __ BIND(ProcessData);
10186 __ mov(rscratch1, length);
10187 __ cmp(length, (u1)144); // 144 = 80 + 64
10188 __ br(Assembler::LT, Process4B);
10189
10190 // In the MIME case, the line length cannot be more than 76
10191 // bytes (see RFC 2045). This is too short a block for SIMD
10192 // to be worthwhile, so we use non-SIMD here.
10193 __ movw(rscratch1, 79);
10194
10195 __ BIND(Process4B);
10196 __ ldrw(r14, __ post(src, 4));
10197 __ ubfxw(r10, r14, 0, 8);
10198 __ ubfxw(r11, r14, 8, 8);
10199 __ ubfxw(r12, r14, 16, 8);
10200 __ ubfxw(r13, r14, 24, 8);
10201 // look up the decoded values
10202 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10203 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10204 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10205 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10206 // error detection, 255u indicates an illegal input
10207 __ orrw(r14, r10, r11);
10208 __ orrw(r15, r12, r13);
10209 __ orrw(r14, r14, r15);
10210 __ tbnz(r14, 7, Exit);
10211 // recover the data
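// r10..r13 hold the four decoded 6-bit values d0..d3; the three output
// bytes are
//   o0 = (d0 << 2) | (d1 >> 4)
//   o1 = ((d1 & 0xf) << 4) | (d2 >> 2)
//   o2 = ((d2 & 0x3) << 6) | d3
// o0 and o1 are assembled into r14 (rev16w puts them into store order
// for the strh), and o2 into r13.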
10212 __ lslw(r14, r10, 10);
10213 __ bfiw(r14, r11, 4, 6);
10214 __ bfmw(r14, r12, 2, 5);
10215 __ rev16w(r14, r14);
10216 __ bfiw(r13, r12, 6, 2);
10217 __ strh(r14, __ post(dst, 2));
10218 __ strb(r13, __ post(dst, 1));
10219 // non-simd loop
10220 __ subsw(rscratch1, rscratch1, 4);
10221 __ br(Assembler::GT, Process4B);
10222
10223 // if exiting from PreProcess80B, rscratch1 == -1;
10224 // otherwise, rscratch1 == 0.
10225 __ cbzw(rscratch1, Exit);
10226 __ sub(length, length, 80);
10227
10228 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10229 __ cbz(isURL, SIMDEnter);
10230 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10231
10232 __ BIND(SIMDEnter);
10233 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10234 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10235 __ mov(rscratch1, 63);
10236 __ dup(v27, __ T16B, rscratch1);
10237
10238 __ BIND(Process64B);
10239 __ cmp(length, (u1)64);
10240 __ br(Assembler::LT, Process32B);
10241 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10242 __ sub(length, length, 64);
10243 __ b(Process64B);
10244
10245 __ BIND(Process32B);
10246 __ cmp(length, (u1)32);
10247 __ br(Assembler::LT, SIMDExit);
10248 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10249 __ sub(length, length, 32);
10250 __ b(Process32B);
10251
10252 __ BIND(SIMDExit);
10253 __ cbz(length, Exit);
10254 __ movw(rscratch1, length);
10255 __ b(Process4B);
10256
10257 __ BIND(Exit);
10258 __ sub(c_rarg0, dst, doff);
10259
10260 __ leave();
10261 __ ret(lr);
10262
10263 return start;
10264 }
10265
10266 // Support for spin waits.
10267 address generate_spin_wait() {
10268 __ align(CodeEntryAlignment);
10269 StubId stub_id = StubId::stubgen_spin_wait_id;
10270 StubCodeMark mark(this, stub_id);
10271 address start = __ pc();
10272
10273 __ spin_wait();
10274 __ ret(lr);
10275
10276 return start;
10277 }
10278
10279 void generate_lookup_secondary_supers_table_stub() {
10280 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10281 StubCodeMark mark(this, stub_id);
10282
10283 const Register
10284 r_super_klass = r0,
10285 r_array_base = r1,
10286 r_array_length = r2,
10287 r_array_index = r3,
10288 r_sub_klass = r4,
10289 r_bitmap = rscratch2,
10290 result = r5;
10291 const FloatRegister
10292 vtemp = v0;
10293
10294 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10295 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10296 Label L_success;
10297 __ enter();
10298 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10299 r_array_base, r_array_length, r_array_index,
10300 vtemp, result, slot,
10301 /*stub_is_near*/true);
10302 __ leave();
10303 __ ret(lr);
10304 }
10305 }
10306
10307 // Slow path implementation for UseSecondarySupersTable.
10308 address generate_lookup_secondary_supers_table_slow_path_stub() {
10309 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10310 StubCodeMark mark(this, stub_id);
10311
10312 address start = __ pc();
10313 const Register
10314 r_super_klass = r0, // argument
10315 r_array_base = r1, // argument
10316 temp1 = r2, // temp
10317 r_array_index = r3, // argument
10318 r_bitmap = rscratch2, // argument
10319 result = r5; // argument
10320
10321 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10322 __ ret(lr);
10323
10324 return start;
10325 }
10326
10327 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10328
10329 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10330 //
10331 // If LSE is in use, generate LSE versions of all the stubs. The
10332 // non-LSE versions are in atomic_aarch64.S.
10333
10334 // class AtomicStubMark records the entry point of a stub and the
10335 // stub pointer which will point to it. The stub pointer is set to
10336 // the entry point when ~AtomicStubMark() is called, which must be
10337 // after ICache::invalidate_range. This ensures safe publication of
10338 // the generated code.
10339 class AtomicStubMark {
10340 address _entry_point;
10341 aarch64_atomic_stub_t *_stub;
10342 MacroAssembler *_masm;
10343 public:
10344 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10345 _masm = masm;
10346 __ align(32);
10347 _entry_point = __ pc();
10348 _stub = stub;
10349 }
10350 ~AtomicStubMark() {
10351 *_stub = (aarch64_atomic_stub_t)_entry_point;
10352 }
10353 };
10354
10355 // NB: For memory_order_conservative we need a trailing membar after
10356 // LSE atomic operations but not a leading membar.
10357 //
10358 // We don't need a leading membar because a clause in the Arm ARM
10359 // says:
10360 //
10361 // Barrier-ordered-before
10362 //
10363 // Barrier instructions order prior Memory effects before subsequent
10364 // Memory effects generated by the same Observer. A read or a write
10365 // RW1 is Barrier-ordered-before a read or a write RW2 from the same
10366 // Observer if and only if RW1 appears in program order before RW2
10367 // and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10368 // instruction with both Acquire and Release semantics.
10369 //
10370 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10371 // and Release semantics, therefore we don't need a leading
10372 // barrier. However, there is no corresponding Barrier-ordered-after
10373 // relationship, therefore we need a trailing membar to prevent a
10374 // later store or load from being reordered with the store in an
10375 // atomic instruction.
10376 //
10377 // This was checked by using the herd7 consistency model simulator
10378 // (http://diy.inria.fr/) with this test case:
10379 //
10380 // AArch64 LseCas
10381 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10382 // P0 | P1;
10383 // LDR W4, [X2] | MOV W3, #0;
10384 // DMB LD | MOV W4, #1;
10385 // LDR W3, [X1] | CASAL W3, W4, [X1];
10386 // | DMB ISH;
10387 // | STR W4, [X2];
10388 // exists
10389 // (0:X3=0 /\ 0:X4=1)
10390 //
10391 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10392 // with the store to x in P1. Without the DMB in P1 this may happen.
10393 //
10394 // At the time of writing we don't know of any AArch64 hardware that
10395 // reorders stores in this way, but the Reference Manual permits it.
10396
10397 void gen_cas_entry(Assembler::operand_size size,
10398 atomic_memory_order order) {
10399 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10400 exchange_val = c_rarg2;
10401 bool acquire, release;
10402 switch (order) {
10403 case memory_order_relaxed:
10404 acquire = false;
10405 release = false;
10406 break;
10407 case memory_order_release:
10408 acquire = false;
10409 release = true;
10410 break;
10411 default:
10412 acquire = true;
10413 release = true;
10414 break;
10415 }
10416 __ mov(prev, compare_val);
10417 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10418 if (order == memory_order_conservative) {
10419 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10420 }
10421 if (size == Assembler::xword) {
10422 __ mov(r0, prev);
10423 } else {
10424 __ movw(r0, prev);
10425 }
10426 __ ret(lr);
10427 }
10428
10429 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10430 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10431 // If not relaxed, then default to conservative. Relaxed is the only
10432 // case we use often enough to be worth specializing.
10433 if (order == memory_order_relaxed) {
10434 __ ldadd(size, incr, prev, addr);
10435 } else {
10436 __ ldaddal(size, incr, prev, addr);
10437 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10438 }
10439 if (size == Assembler::xword) {
10440 __ mov(r0, prev);
10441 } else {
10442 __ movw(r0, prev);
10443 }
10444 __ ret(lr);
10445 }
10446
10447 void gen_swpal_entry(Assembler::operand_size size) {
10448 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10449 __ swpal(size, incr, prev, addr);
10450 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10451 if (size == Assembler::xword) {
10452 __ mov(r0, prev);
10453 } else {
10454 __ movw(r0, prev);
10455 }
10456 __ ret(lr);
10457 }
10458
10459 void generate_atomic_entry_points() {
10460 if (! UseLSE) {
10461 return;
10462 }
10463 __ align(CodeEntryAlignment);
10464 StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10465 StubCodeMark mark(this, stub_id);
10466 address first_entry = __ pc();
10467
10468 // ADD, memory_order_conservative
10469 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10470 gen_ldadd_entry(Assembler::word, memory_order_conservative);
10471 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10472 gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10473
10474 // ADD, memory_order_relaxed
10475 AtomicStubMark mark_fetch_add_4_relaxed
10476 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10477 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10478 AtomicStubMark mark_fetch_add_8_relaxed
10479 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10480 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10481
10482 // XCHG, memory_order_conservative
10483 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10484 gen_swpal_entry(Assembler::word);
10485 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10486 gen_swpal_entry(Assembler::xword);
10487
10488 // CAS, memory_order_conservative
10489 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10490 gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10491 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10492 gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10493 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10494 gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10495
10496 // CAS, memory_order_relaxed
10497 AtomicStubMark mark_cmpxchg_1_relaxed
10498 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10499 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10500 AtomicStubMark mark_cmpxchg_4_relaxed
10501 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10502 gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10503 AtomicStubMark mark_cmpxchg_8_relaxed
10504 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10505 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10506
10507 AtomicStubMark mark_cmpxchg_4_release
10508 (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10509 gen_cas_entry(MacroAssembler::word, memory_order_release);
10510 AtomicStubMark mark_cmpxchg_8_release
10511 (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10512 gen_cas_entry(MacroAssembler::xword, memory_order_release);
10513
10514 AtomicStubMark mark_cmpxchg_4_seq_cst
10515 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10516 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10517 AtomicStubMark mark_cmpxchg_8_seq_cst
10518 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10519 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10520
10521 ICache::invalidate_range(first_entry, __ pc() - first_entry);
10522 }
10523 #endif // LINUX
10524
10525 address generate_cont_thaw(Continuation::thaw_kind kind) {
10526 bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10527 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10528
10529 address start = __ pc();
10530
10531 if (return_barrier) {
10532 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10533 __ mov(sp, rscratch1);
10534 }
10535 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10536
10537 if (return_barrier) {
10538 // preserve possible return value from a method returning to the return barrier
10539 __ fmovd(rscratch1, v0);
10540 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10541 }
10542
10543 __ movw(c_rarg1, (return_barrier ? 1 : 0));
10544 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10545 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10546
10547 if (return_barrier) {
10548 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10549 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10550 __ fmovd(v0, rscratch1);
10551 }
10552 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10553
10554
10555 Label thaw_success;
10556 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10557 __ cbnz(rscratch2, thaw_success);
10558 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10559 __ br(rscratch1);
10560 __ bind(thaw_success);
10561
10562 // make room for the thawed frames
10563 __ sub(rscratch1, sp, rscratch2);
10564 __ andr(rscratch1, rscratch1, -16); // align
10565 __ mov(sp, rscratch1);
10566
10567 if (return_barrier) {
10568 // save original return value -- again
10569 __ fmovd(rscratch1, v0);
10570 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10571 }
10572
10573 // If we want, we can templatize thaw by kind, and have three different entries
10574 __ movw(c_rarg1, (uint32_t)kind);
10575
10576 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10577 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10578
10579 if (return_barrier) {
10580 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10581 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10582 __ fmovd(v0, rscratch1);
10583 } else {
10584 __ mov(r0, zr); // return 0 (success) from doYield
10585 }
10586
10587 // we're now on the yield frame (which is at an address above us because sp has been pushed down)
10588 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10589 __ mov(rfp, sp);
10590
10591 if (return_barrier_exception) {
10592 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10593 __ authenticate_return_address(c_rarg1);
10594 __ verify_oop(r0);
10595 // save return value containing the exception oop in callee-saved R19
10596 __ mov(r19, r0);
10597
10598 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10599
10600 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10601 // __ reinitialize_ptrue();
10602
10603 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10604
10605 __ mov(r1, r0); // the exception handler
10606 __ mov(r0, r19); // restore return value containing the exception oop
10607 __ verify_oop(r0);
10608
10609 __ leave();
10610 __ mov(r3, lr);
10611 __ br(r1); // the exception handler
10612 } else {
10613 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10614 __ leave();
10615 __ ret(lr);
10616 }
10617
10618 return start;
10619 }
10620
10621 address generate_cont_thaw() {
10622 if (!Continuations::enabled()) return nullptr;
10623
10624 StubId stub_id = StubId::stubgen_cont_thaw_id;
10625 StubCodeMark mark(this, stub_id);
10626 address start = __ pc();
10627 generate_cont_thaw(Continuation::thaw_top);
10628 return start;
10629 }
10630
10631 address generate_cont_returnBarrier() {
10632 if (!Continuations::enabled()) return nullptr;
10633
10634 // TODO: will probably need multiple return barriers depending on return type
10635 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10636 StubCodeMark mark(this, stub_id);
10637 address start = __ pc();
10638
10639 generate_cont_thaw(Continuation::thaw_return_barrier);
10640
10641 return start;
10642 }
10643
10644 address generate_cont_returnBarrier_exception() {
10645 if (!Continuations::enabled()) return nullptr;
10646
10647 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10648 StubCodeMark mark(this, stub_id);
10649 address start = __ pc();
10650
10651 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10652
10653 return start;
10654 }
10655
10656 address generate_cont_preempt_stub() {
10657 if (!Continuations::enabled()) return nullptr;
10658 StubId stub_id = StubId::stubgen_cont_preempt_id;
10659 StubCodeMark mark(this, stub_id);
10660 address start = __ pc();
10661
10662 __ reset_last_Java_frame(true);
10663
10664 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10665 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10666 __ mov(sp, rscratch2);
10667
10668 Label preemption_cancelled;
10669 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10670 __ cbnz(rscratch1, preemption_cancelled);
10671
10672 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10673 SharedRuntime::continuation_enter_cleanup(_masm);
10674 __ leave();
10675 __ ret(lr);
10676
10677 // We acquired the monitor after freezing the frames so call thaw to continue execution.
10678 __ bind(preemption_cancelled);
10679 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10680 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10681 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10682 __ ldr(rscratch1, Address(rscratch1));
10683 __ br(rscratch1);
10684
10685 return start;
10686 }
10687
10688 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10689 // are represented as long[5], with BITS_PER_LIMB = 26.
10690 // Pack five 26-bit limbs into three 64-bit registers.
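// With limbs l0..l4 (l0 least significant) this computes
//   dest0 = l0 | (l1 << 26) | (l2 << 52)          -- low 64 bits
//   dest1 = (l2 >> 12) | (l3 << 14) | (l4 << 40)  -- next 64 bits
//   dest2 = l4 >> 24                              -- top 2 bits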
10691 void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10692 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
10693 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
10694 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10695 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
10696
10697 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
10698 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
10699 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10700 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
10701
10702 if (dest2->is_valid()) {
10703 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
10704 } else {
10705 #ifdef ASSERT
10706 Label OK;
10707 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
10708 __ br(__ EQ, OK);
10709 __ stop("high bits of Poly1305 integer should be zero");
10710 __ should_not_reach_here();
10711 __ bind(OK);
10712 #endif
10713 }
10714 }
10715
10716 // As above, but return only a 128-bit integer, packed into two
10717 // 64-bit registers.
10718 void pack_26(Register dest0, Register dest1, Register src) {
10719 pack_26(dest0, dest1, noreg, src);
10720 }
10721
10722 // Multiply and multiply-accumulate unsigned 64-bit registers.
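// wide_mul sets prod_hi:prod_lo to the full 128-bit product n * m;
// wide_madd adds that 128-bit product into sum_hi:sum_lo.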
10723 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10724 __ mul(prod_lo, n, m);
10725 __ umulh(prod_hi, n, m);
10726 }
10727 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10728 wide_mul(rscratch1, rscratch2, n, m);
10729 __ adds(sum_lo, sum_lo, rscratch1);
10730 __ adc(sum_hi, sum_hi, rscratch2);
10731 }
10732
10733 // Poly1305, RFC 7539
10734
10735 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10736 // description of the tricks used to simplify and accelerate this
10737 // computation.
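//
// For each 16-byte block n of the message, the loop below evaluates
//   acc = ((acc + n + 2^128) * r) mod (2^130 - 5)
// where the 2^128 term is the padding bit that RFC 7539 appends to every
// full block (the "add(S_2, S_2, 1)" below).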
10738
10739 address generate_poly1305_processBlocks() {
10740 __ align(CodeEntryAlignment);
10741 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10742 StubCodeMark mark(this, stub_id);
10743 address start = __ pc();
10744 Label here;
10745 __ enter();
10746 RegSet callee_saved = RegSet::range(r19, r28);
10747 __ push(callee_saved, sp);
10748
10749 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10750
10751 // Arguments
10752 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10753
10754 // R_n is the 128-bit randomly-generated key, packed into two
10755 // registers. The caller passes this key to us as long[5], with
10756 // BITS_PER_LIMB = 26.
10757 const Register R_0 = *++regs, R_1 = *++regs;
10758 pack_26(R_0, R_1, r_start);
10759
10760 // RR_n is (R_n >> 2) * 5
10761 const Register RR_0 = *++regs, RR_1 = *++regs;
10762 __ lsr(RR_0, R_0, 2);
10763 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10764 __ lsr(RR_1, R_1, 2);
10765 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10766
10767 // U_n is the current checksum
10768 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10769 pack_26(U_0, U_1, U_2, acc_start);
10770
10771 static constexpr int BLOCK_LENGTH = 16;
10772 Label DONE, LOOP;
10773
10774 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10775 __ br(Assembler::LT, DONE); {
10776 __ bind(LOOP);
10777
10778 // S_n is to be the sum of U_n and the next block of data
10779 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10780 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10781 __ adds(S_0, U_0, S_0);
10782 __ adcs(S_1, U_1, S_1);
10783 __ adc(S_2, U_2, zr);
10784 __ add(S_2, S_2, 1);
10785
10786 const Register U_0HI = *++regs, U_1HI = *++regs;
10787
10788 // NB: this logic depends on some of the special properties of
10789 // Poly1305 keys. In particular, because we know that the top
10790 // four bits of R_0 and R_1 are zero, we can add together
10791 // partial products without any risk of needing to propagate a
10792 // carry out.
10793 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10794 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
10795 __ andr(U_2, R_0, 3);
10796 __ mul(U_2, S_2, U_2);
10797
10798 // Recycle registers S_0, S_1, S_2
10799 regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10800
10801 // Partial reduction mod 2**130 - 5
10802 __ adds(U_1, U_0HI, U_1);
10803 __ adc(U_2, U_1HI, U_2);
10804 // Sum now in U_2:U_1:U_0.
10805 // Dead: U_0HI, U_1HI.
10806 regs = (regs.remaining() + U_0HI + U_1HI).begin();
10807
10808 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10809
10810 // First, U_2:U_1:U_0 += (U_2 >> 2)
10811 __ lsr(rscratch1, U_2, 2);
10812 __ andr(U_2, U_2, (u8)3);
10813 __ adds(U_0, U_0, rscratch1);
10814 __ adcs(U_1, U_1, zr);
10815 __ adc(U_2, U_2, zr);
10816 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10817 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10818 __ adcs(U_1, U_1, zr);
10819 __ adc(U_2, U_2, zr);
10820
10821 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10822 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10823 __ br(~ Assembler::LT, LOOP);
10824 }
10825
10826 // Further reduce modulo 2^130 - 5
10827 __ lsr(rscratch1, U_2, 2);
10828 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10829 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10830 __ adcs(U_1, U_1, zr);
10831 __ andr(U_2, U_2, (u1)3);
10832 __ adc(U_2, U_2, zr);
10833
10834 // Unpack the sum into five 26-bit limbs and write to memory.
10835 __ ubfiz(rscratch1, U_0, 0, 26);
10836 __ ubfx(rscratch2, U_0, 26, 26);
10837 __ stp(rscratch1, rscratch2, Address(acc_start));
10838 __ ubfx(rscratch1, U_0, 52, 12);
10839 __ bfi(rscratch1, U_1, 12, 14);
10840 __ ubfx(rscratch2, U_1, 14, 26);
10841 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10842 __ ubfx(rscratch1, U_1, 40, 24);
10843 __ bfi(rscratch1, U_2, 24, 3);
10844 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10845
10846 __ bind(DONE);
10847 __ pop(callee_saved, sp);
10848 __ leave();
10849 __ ret(lr);
10850
10851 return start;
10852 }
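
  // In C, approximately (a sketch only; the helpers and the wide integer
  // types below are illustrative, not real code in this file):
  //
  //   // For each full 16-byte block the Poly1305 recurrence is
  //   //   acc = ((acc + block + 2^128) * r) mod (2^130 - 5),
  //   // with only a partial reduction inside the loop; the final
  //   // reduction and the repacking into 26-bit limbs happen afterwards,
  //   // as in the assembly above.
  //   while (length >= BLOCK_LENGTH) {
  //     u192 s = acc + load_le128(input) + ((u192)1 << 128);
  //     acc = partial_reduce(s * r);
  //     input += BLOCK_LENGTH;
  //     length -= BLOCK_LENGTH;
  //   }
  //   store_26_bit_limbs(acc_start, final_reduce(acc));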
10853
10854 // exception handler for upcall stubs
10855 address generate_upcall_stub_exception_handler() {
10856 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10857 StubCodeMark mark(this, stub_id);
10858 address start = __ pc();
10859
    // The native caller has no idea how to handle exceptions,
    // so we just crash here. It is up to the callee to catch exceptions.
10862 __ verify_oop(r0);
10863 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10864 __ blr(rscratch1);
10865 __ should_not_reach_here();
10866
10867 return start;
10868 }
10869
10870 // load Method* target of MethodHandle
10871 // j_rarg0 = jobject receiver
10872 // rmethod = result
10873 address generate_upcall_stub_load_target() {
10874 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10875 StubCodeMark mark(this, stub_id);
10876 address start = __ pc();
10877
10878 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10879 // Load target method from receiver
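    // rmethod = receiver.form.vmentry.method.vmtarget (the target Method*)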
10880 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10881 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10882 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10883 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10884 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10885 noreg, noreg);
10886 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10887
10888 __ ret(lr);
10889
10890 return start;
10891 }
10892
10893 #undef __
10894 #define __ masm->
10895
10896 class MontgomeryMultiplyGenerator : public MacroAssembler {
10897
10898 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10899 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10900
10901 RegSet _toSave;
10902 bool _squaring;
10903
10904 public:
10905 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10906 : MacroAssembler(as->code()), _squaring(squaring) {
10907
10908 // Register allocation
10909
10910 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10911 Pa_base = *regs; // Argument registers
10912 if (squaring)
10913 Pb_base = Pa_base;
10914 else
10915 Pb_base = *++regs;
10916 Pn_base = *++regs;
      Rlen = *++regs;
10918 inv = *++regs;
10919 Pm_base = *++regs;
10920
10921 // Working registers:
10922 Ra = *++regs; // The current digit of a, b, n, and m.
10923 Rb = *++regs;
10924 Rm = *++regs;
10925 Rn = *++regs;
10926
10927 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
10928 Pb = *++regs;
10929 Pm = *++regs;
10930 Pn = *++regs;
10931
10932 t0 = *++regs; // Three registers which form a
      t1 = *++regs;       // triple-precision accumulator.
10934 t2 = *++regs;
10935
10936 Ri = *++regs; // Inner and outer loop indexes.
10937 Rj = *++regs;
10938
10939 Rhi_ab = *++regs; // Product registers: low and high parts
10940 Rlo_ab = *++regs; // of a*b and m*n.
10941 Rhi_mn = *++regs;
10942 Rlo_mn = *++regs;
10943
10944 // r19 and up are callee-saved.
10945 _toSave = RegSet::range(r19, *regs) + Pm_base;
10946 }
10947
10948 private:
10949 void save_regs() {
10950 push(_toSave, sp);
10951 }
10952
10953 void restore_regs() {
10954 pop(_toSave, sp);
10955 }
10956
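    // Emit a loop that runs `block` exactly `count` times, unrolled by a
    // factor of two; the entry branches handle an odd count and count == 0.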
10957 template <typename T>
10958 void unroll_2(Register count, T block) {
10959 Label loop, end, odd;
10960 tbnz(count, 0, odd);
10961 cbz(count, end);
10962 align(16);
10963 bind(loop);
10964 (this->*block)();
10965 bind(odd);
10966 (this->*block)();
10967 subs(count, count, 2);
10968 br(Assembler::GT, loop);
10969 bind(end);
10970 }
10971
10972 template <typename T>
10973 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10974 Label loop, end, odd;
10975 tbnz(count, 0, odd);
10976 cbz(count, end);
10977 align(16);
10978 bind(loop);
10979 (this->*block)(d, s, tmp);
10980 bind(odd);
10981 (this->*block)(d, s, tmp);
10982 subs(count, count, 2);
10983 br(Assembler::GT, loop);
10984 bind(end);
10985 }
10986
10987 void pre1(RegisterOrConstant i) {
10988 block_comment("pre1");
10989 // Pa = Pa_base;
10990 // Pb = Pb_base + i;
10991 // Pm = Pm_base;
10992 // Pn = Pn_base + i;
10993 // Ra = *Pa;
10994 // Rb = *Pb;
10995 // Rm = *Pm;
10996 // Rn = *Pn;
10997 ldr(Ra, Address(Pa_base));
10998 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10999 ldr(Rm, Address(Pm_base));
11000 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11001 lea(Pa, Address(Pa_base));
11002 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11003 lea(Pm, Address(Pm_base));
11004 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11005
11006 // Zero the m*n result.
11007 mov(Rhi_mn, zr);
11008 mov(Rlo_mn, zr);
11009 }
11010
11011 // The core multiply-accumulate step of a Montgomery
11012 // multiplication. The idea is to schedule operations as a
11013 // pipeline so that instructions with long latencies (loads and
11014 // multiplies) have time to complete before their results are
    // used. This benefits in-order implementations of the architecture
    // the most, but out-of-order ones also benefit.
11017 void step() {
11018 block_comment("step");
11019 // MACC(Ra, Rb, t0, t1, t2);
11020 // Ra = *++Pa;
11021 // Rb = *--Pb;
11022 umulh(Rhi_ab, Ra, Rb);
11023 mul(Rlo_ab, Ra, Rb);
11024 ldr(Ra, pre(Pa, wordSize));
11025 ldr(Rb, pre(Pb, -wordSize));
11026 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11027 // previous iteration.
11028 // MACC(Rm, Rn, t0, t1, t2);
11029 // Rm = *++Pm;
11030 // Rn = *--Pn;
11031 umulh(Rhi_mn, Rm, Rn);
11032 mul(Rlo_mn, Rm, Rn);
11033 ldr(Rm, pre(Pm, wordSize));
11034 ldr(Rn, pre(Pn, -wordSize));
11035 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11036 }
11037
11038 void post1() {
11039 block_comment("post1");
11040
11041 // MACC(Ra, Rb, t0, t1, t2);
11042 // Ra = *++Pa;
11043 // Rb = *--Pb;
11044 umulh(Rhi_ab, Ra, Rb);
11045 mul(Rlo_ab, Ra, Rb);
11046 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11047 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11048
11049 // *Pm = Rm = t0 * inv;
11050 mul(Rm, t0, inv);
11051 str(Rm, Address(Pm));
11052
11053 // MACC(Rm, Rn, t0, t1, t2);
11054 // t0 = t1; t1 = t2; t2 = 0;
11055 umulh(Rhi_mn, Rm, Rn);
11056
11057 #ifndef PRODUCT
11058 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11059 {
11060 mul(Rlo_mn, Rm, Rn);
11061 add(Rlo_mn, t0, Rlo_mn);
11062 Label ok;
11063 cbz(Rlo_mn, ok); {
11064 stop("broken Montgomery multiply");
11065 } bind(ok);
11066 }
11067 #endif
11068 // We have very carefully set things up so that
11069 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11070 // the lower half of Rm * Rn because we know the result already:
11071 // it must be -t0. t0 + (-t0) must generate a carry iff
11072 // t0 != 0. So, rather than do a mul and an adds we just set
11073 // the carry flag iff t0 is nonzero.
11074 //
11075 // mul(Rlo_mn, Rm, Rn);
11076 // adds(zr, t0, Rlo_mn);
11077 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11078 adcs(t0, t1, Rhi_mn);
11079 adc(t1, t2, zr);
11080 mov(t2, zr);
11081 }
11082
11083 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11084 block_comment("pre2");
11085 // Pa = Pa_base + i-len;
11086 // Pb = Pb_base + len;
11087 // Pm = Pm_base + i-len;
11088 // Pn = Pn_base + len;
11089
11090 if (i.is_register()) {
11091 sub(Rj, i.as_register(), len);
11092 } else {
11093 mov(Rj, i.as_constant());
11094 sub(Rj, Rj, len);
11095 }
11096 // Rj == i-len
11097
11098 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11099 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11100 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11101 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11102
11103 // Ra = *++Pa;
11104 // Rb = *--Pb;
11105 // Rm = *++Pm;
11106 // Rn = *--Pn;
11107 ldr(Ra, pre(Pa, wordSize));
11108 ldr(Rb, pre(Pb, -wordSize));
11109 ldr(Rm, pre(Pm, wordSize));
11110 ldr(Rn, pre(Pn, -wordSize));
11111
11112 mov(Rhi_mn, zr);
11113 mov(Rlo_mn, zr);
11114 }
11115
11116 void post2(RegisterOrConstant i, RegisterOrConstant len) {
11117 block_comment("post2");
11118 if (i.is_constant()) {
11119 mov(Rj, i.as_constant()-len.as_constant());
11120 } else {
11121 sub(Rj, i.as_register(), len);
11122 }
11123
11124 adds(t0, t0, Rlo_mn); // The pending m*n, low part
11125
11126 // As soon as we know the least significant digit of our result,
11127 // store it.
11128 // Pm_base[i-len] = t0;
11129 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11130
11131 // t0 = t1; t1 = t2; t2 = 0;
11132 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11133 adc(t1, t2, zr);
11134 mov(t2, zr);
11135 }
11136
11137 // A carry in t0 after Montgomery multiplication means that we
11138 // should subtract multiples of n from our result in m. We'll
11139 // keep doing that until there is no carry.
11140 void normalize(RegisterOrConstant len) {
11141 block_comment("normalize");
11142 // while (t0)
11143 // t0 = sub(Pm_base, Pn_base, t0, len);
11144 Label loop, post, again;
11145 Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11146 cbz(t0, post); {
11147 bind(again); {
11148 mov(i, zr);
11149 mov(cnt, len);
11150 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11151 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11152 subs(zr, zr, zr); // set carry flag, i.e. no borrow
11153 align(16);
11154 bind(loop); {
11155 sbcs(Rm, Rm, Rn);
11156 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11157 add(i, i, 1);
11158 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11159 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11160 sub(cnt, cnt, 1);
11161 } cbnz(cnt, loop);
11162 sbc(t0, t0, zr);
11163 } cbnz(t0, again);
11164 } bind(post);
11165 }
11166
11167 // Move memory at s to d, reversing words.
11168 // Increments d to end of copied memory
11169 // Destroys tmp1, tmp2
11170 // Preserves len
11171 // Leaves s pointing to the address which was in d at start
11172 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11173 assert(tmp1->encoding() < r19->encoding(), "register corruption");
11174 assert(tmp2->encoding() < r19->encoding(), "register corruption");
11175
11176 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11177 mov(tmp1, len);
11178 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11179 sub(s, d, len, ext::uxtw, LogBytesPerWord);
11180 }
11181 // where
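    // reverse1 copies one 64-bit word from *--s to *d++, swapping its two
    // 32-bit halves; together with the reversed word order this reverses
    // the whole array of 32-bit digits.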
11182 void reverse1(Register d, Register s, Register tmp) {
11183 ldr(tmp, pre(s, -wordSize));
11184 ror(tmp, tmp, 32);
11185 str(tmp, post(d, wordSize));
11186 }
11187
11188 void step_squaring() {
      // step() plus an extra accumulate of the a*b product: when squaring,
      // each cross product is counted twice.
11190 step();
11191 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11192 }
11193
11194 void last_squaring(RegisterOrConstant i) {
11195 Label dont;
11196 // if ((i & 1) == 0) {
11197 tbnz(i.as_register(), 0, dont); {
11198 // MACC(Ra, Rb, t0, t1, t2);
11199 // Ra = *++Pa;
11200 // Rb = *--Pb;
11201 umulh(Rhi_ab, Ra, Rb);
11202 mul(Rlo_ab, Ra, Rb);
11203 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11204 } bind(dont);
11205 }
11206
11207 void extra_step_squaring() {
11208 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11209
11210 // MACC(Rm, Rn, t0, t1, t2);
11211 // Rm = *++Pm;
11212 // Rn = *--Pn;
11213 umulh(Rhi_mn, Rm, Rn);
11214 mul(Rlo_mn, Rm, Rn);
11215 ldr(Rm, pre(Pm, wordSize));
11216 ldr(Rn, pre(Pn, -wordSize));
11217 }
11218
11219 void post1_squaring() {
11220 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11221
11222 // *Pm = Rm = t0 * inv;
11223 mul(Rm, t0, inv);
11224 str(Rm, Address(Pm));
11225
11226 // MACC(Rm, Rn, t0, t1, t2);
11227 // t0 = t1; t1 = t2; t2 = 0;
11228 umulh(Rhi_mn, Rm, Rn);
11229
11230 #ifndef PRODUCT
11231 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11232 {
11233 mul(Rlo_mn, Rm, Rn);
11234 add(Rlo_mn, t0, Rlo_mn);
11235 Label ok;
11236 cbz(Rlo_mn, ok); {
11237 stop("broken Montgomery multiply");
11238 } bind(ok);
11239 }
11240 #endif
11241 // We have very carefully set things up so that
11242 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11243 // the lower half of Rm * Rn because we know the result already:
11244 // it must be -t0. t0 + (-t0) must generate a carry iff
11245 // t0 != 0. So, rather than do a mul and an adds we just set
11246 // the carry flag iff t0 is nonzero.
11247 //
11248 // mul(Rlo_mn, Rm, Rn);
11249 // adds(zr, t0, Rlo_mn);
11250 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11251 adcs(t0, t1, Rhi_mn);
11252 adc(t1, t2, zr);
11253 mov(t2, zr);
11254 }
11255
11256 void acc(Register Rhi, Register Rlo,
11257 Register t0, Register t1, Register t2) {
11258 adds(t0, t0, Rlo);
11259 adcs(t1, t1, Rhi);
11260 adc(t2, t2, zr);
11261 }
11262
11263 public:
11264 /**
11265 * Fast Montgomery multiplication. The derivation of the
11266 * algorithm is in A Cryptographic Library for the Motorola
11267 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11268 *
11269 * Arguments:
11270 *
11271 * Inputs for multiplication:
11272 * c_rarg0 - int array elements a
11273 * c_rarg1 - int array elements b
11274 * c_rarg2 - int array elements n (the modulus)
11275 * c_rarg3 - int length
11276 * c_rarg4 - int inv
11277 * c_rarg5 - int array elements m (the result)
11278 *
11279 * Inputs for squaring:
11280 * c_rarg0 - int array elements a
11281 * c_rarg1 - int array elements n (the modulus)
11282 * c_rarg2 - int length
11283 * c_rarg3 - int inv
11284 * c_rarg4 - int array elements m (the result)
11285 *
11286 */
11287 address generate_multiply() {
11288 Label argh, nothing;
11289 bind(argh);
11290 stop("MontgomeryMultiply total_allocation must be <= 8192");
11291
11292 align(CodeEntryAlignment);
11293 address entry = pc();
11294
11295 cbzw(Rlen, nothing);
11296
11297 enter();
11298
11299 // Make room.
11300 cmpw(Rlen, 512);
11301 br(Assembler::HI, argh);
11302 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11303 andr(sp, Ra, -2 * wordSize);
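      // sp now points at a 16-byte-aligned scratch area of 4 * Rlen ints,
      // large enough for the reversed copies of the inputs and the result.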
11304
11305 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11306
11307 {
11308 // Copy input args, reversing as we go. We use Ra as a
11309 // temporary variable.
11310 reverse(Ra, Pa_base, Rlen, t0, t1);
11311 if (!_squaring)
11312 reverse(Ra, Pb_base, Rlen, t0, t1);
11313 reverse(Ra, Pn_base, Rlen, t0, t1);
11314 }
11315
11316 // Push all call-saved registers and also Pm_base which we'll need
11317 // at the end.
11318 save_regs();
11319
11320 #ifndef PRODUCT
11321 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11322 {
11323 ldr(Rn, Address(Pn_base, 0));
11324 mul(Rlo_mn, Rn, inv);
11325 subs(zr, Rlo_mn, -1);
11326 Label ok;
11327 br(EQ, ok); {
11328 stop("broken inverse in Montgomery multiply");
11329 } bind(ok);
11330 }
11331 #endif
11332
11333 mov(Pm_base, Ra);
11334
11335 mov(t0, zr);
11336 mov(t1, zr);
11337 mov(t2, zr);
11338
11339 block_comment("for (int i = 0; i < len; i++) {");
11340 mov(Ri, zr); {
11341 Label loop, end;
11342 cmpw(Ri, Rlen);
11343 br(Assembler::GE, end);
11344
11345 bind(loop);
11346 pre1(Ri);
11347
11348 block_comment(" for (j = i; j; j--) {"); {
11349 movw(Rj, Ri);
11350 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11351 } block_comment(" } // j");
11352
11353 post1();
11354 addw(Ri, Ri, 1);
11355 cmpw(Ri, Rlen);
11356 br(Assembler::LT, loop);
11357 bind(end);
11358 block_comment("} // i");
11359 }
11360
11361 block_comment("for (int i = len; i < 2*len; i++) {");
11362 mov(Ri, Rlen); {
11363 Label loop, end;
11364 cmpw(Ri, Rlen, Assembler::LSL, 1);
11365 br(Assembler::GE, end);
11366
11367 bind(loop);
11368 pre2(Ri, Rlen);
11369
11370 block_comment(" for (j = len*2-i-1; j; j--) {"); {
11371 lslw(Rj, Rlen, 1);
11372 subw(Rj, Rj, Ri);
11373 subw(Rj, Rj, 1);
11374 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11375 } block_comment(" } // j");
11376
11377 post2(Ri, Rlen);
11378 addw(Ri, Ri, 1);
11379 cmpw(Ri, Rlen, Assembler::LSL, 1);
11380 br(Assembler::LT, loop);
11381 bind(end);
11382 }
11383 block_comment("} // i");
11384
11385 normalize(Rlen);
11386
11387 mov(Ra, Pm_base); // Save Pm_base in Ra
11388 restore_regs(); // Restore caller's Pm_base
11389
11390 // Copy our result into caller's Pm_base
11391 reverse(Pm_base, Ra, Rlen, t0, t1);
11392
11393 leave();
11394 bind(nothing);
11395 ret(lr);
11396
11397 return entry;
11398 }
11399 // In C, approximately:
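  // (In this sketch, MACC(A, B, t0, t1, t2) adds the 128-bit product A*B
  // into the triple-precision accumulator t2:t1:t0, i.e. the umulh/mul
  // plus acc() sequence used by step() above.)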
11400
11401 // void
11402 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11403 // julong Pn_base[], julong Pm_base[],
11404 // julong inv, int len) {
11405 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11406 // julong *Pa, *Pb, *Pn, *Pm;
11407 // julong Ra, Rb, Rn, Rm;
11408
11409 // int i;
11410
11411 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11412
11413 // for (i = 0; i < len; i++) {
11414 // int j;
11415
11416 // Pa = Pa_base;
11417 // Pb = Pb_base + i;
11418 // Pm = Pm_base;
11419 // Pn = Pn_base + i;
11420
11421 // Ra = *Pa;
11422 // Rb = *Pb;
11423 // Rm = *Pm;
11424 // Rn = *Pn;
11425
11426 // int iters = i;
11427 // for (j = 0; iters--; j++) {
11428 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11429 // MACC(Ra, Rb, t0, t1, t2);
11430 // Ra = *++Pa;
11431 // Rb = *--Pb;
11432 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11433 // MACC(Rm, Rn, t0, t1, t2);
11434 // Rm = *++Pm;
11435 // Rn = *--Pn;
11436 // }
11437
11438 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11439 // MACC(Ra, Rb, t0, t1, t2);
11440 // *Pm = Rm = t0 * inv;
11441 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11442 // MACC(Rm, Rn, t0, t1, t2);
11443
11444 // assert(t0 == 0, "broken Montgomery multiply");
11445
11446 // t0 = t1; t1 = t2; t2 = 0;
11447 // }
11448
11449 // for (i = len; i < 2*len; i++) {
11450 // int j;
11451
11452 // Pa = Pa_base + i-len;
11453 // Pb = Pb_base + len;
11454 // Pm = Pm_base + i-len;
11455 // Pn = Pn_base + len;
11456
11457 // Ra = *++Pa;
11458 // Rb = *--Pb;
11459 // Rm = *++Pm;
11460 // Rn = *--Pn;
11461
11462 // int iters = len*2-i-1;
11463 // for (j = i-len+1; iters--; j++) {
11464 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11465 // MACC(Ra, Rb, t0, t1, t2);
11466 // Ra = *++Pa;
11467 // Rb = *--Pb;
11468 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11469 // MACC(Rm, Rn, t0, t1, t2);
11470 // Rm = *++Pm;
11471 // Rn = *--Pn;
11472 // }
11473
11474 // Pm_base[i-len] = t0;
11475 // t0 = t1; t1 = t2; t2 = 0;
11476 // }
11477
11478 // while (t0)
11479 // t0 = sub(Pm_base, Pn_base, t0, len);
11480 // }
11481
11482 /**
11483 * Fast Montgomery squaring. This uses asymptotically 25% fewer
11484 * multiplies than Montgomery multiplication so it should be up to
11485 * 25% faster. However, its loop control is more complex and it
11486 * may actually run slower on some machines.
11487 *
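   * (When squaring, each cross product a[i]*a[j] with i != j is computed
   * once and accumulated twice, as MACC2 does in the sketch below; halving
   * the a*b multiplies while the m*n multiplies stay the same is where the
   * asymptotic 25% saving comes from.)
   *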
11488 * Arguments:
11489 *
11490 * Inputs:
11491 * c_rarg0 - int array elements a
11492 * c_rarg1 - int array elements n (the modulus)
11493 * c_rarg2 - int length
11494 * c_rarg3 - int inv
11495 * c_rarg4 - int array elements m (the result)
11496 *
11497 */
11498 address generate_square() {
11499 Label argh;
11500 bind(argh);
11501 stop("MontgomeryMultiply total_allocation must be <= 8192");
11502
11503 align(CodeEntryAlignment);
11504 address entry = pc();
11505
11506 enter();
11507
11508 // Make room.
11509 cmpw(Rlen, 512);
11510 br(Assembler::HI, argh);
11511 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11512 andr(sp, Ra, -2 * wordSize);
11513
11514 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11515
11516 {
11517 // Copy input args, reversing as we go. We use Ra as a
11518 // temporary variable.
11519 reverse(Ra, Pa_base, Rlen, t0, t1);
11520 reverse(Ra, Pn_base, Rlen, t0, t1);
11521 }
11522
11523 // Push all call-saved registers and also Pm_base which we'll need
11524 // at the end.
11525 save_regs();
11526
11527 mov(Pm_base, Ra);
11528
11529 mov(t0, zr);
11530 mov(t1, zr);
11531 mov(t2, zr);
11532
11533 block_comment("for (int i = 0; i < len; i++) {");
11534 mov(Ri, zr); {
11535 Label loop, end;
11536 bind(loop);
11537 cmp(Ri, Rlen);
11538 br(Assembler::GE, end);
11539
11540 pre1(Ri);
11541
11542 block_comment("for (j = (i+1)/2; j; j--) {"); {
11543 add(Rj, Ri, 1);
11544 lsr(Rj, Rj, 1);
11545 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11546 } block_comment(" } // j");
11547
11548 last_squaring(Ri);
11549
11550 block_comment(" for (j = i/2; j; j--) {"); {
11551 lsr(Rj, Ri, 1);
11552 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11553 } block_comment(" } // j");
11554
11555 post1_squaring();
11556 add(Ri, Ri, 1);
11557 cmp(Ri, Rlen);
11558 br(Assembler::LT, loop);
11559
11560 bind(end);
11561 block_comment("} // i");
11562 }
11563
11564 block_comment("for (int i = len; i < 2*len; i++) {");
11565 mov(Ri, Rlen); {
11566 Label loop, end;
11567 bind(loop);
11568 cmp(Ri, Rlen, Assembler::LSL, 1);
11569 br(Assembler::GE, end);
11570
11571 pre2(Ri, Rlen);
11572
11573 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
11574 lsl(Rj, Rlen, 1);
11575 sub(Rj, Rj, Ri);
11576 sub(Rj, Rj, 1);
11577 lsr(Rj, Rj, 1);
11578 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11579 } block_comment(" } // j");
11580
11581 last_squaring(Ri);
11582
11583 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
11584 lsl(Rj, Rlen, 1);
11585 sub(Rj, Rj, Ri);
11586 lsr(Rj, Rj, 1);
11587 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11588 } block_comment(" } // j");
11589
11590 post2(Ri, Rlen);
11591 add(Ri, Ri, 1);
11592 cmp(Ri, Rlen, Assembler::LSL, 1);
11593
11594 br(Assembler::LT, loop);
11595 bind(end);
11596 block_comment("} // i");
11597 }
11598
11599 normalize(Rlen);
11600
11601 mov(Ra, Pm_base); // Save Pm_base in Ra
11602 restore_regs(); // Restore caller's Pm_base
11603
11604 // Copy our result into caller's Pm_base
11605 reverse(Pm_base, Ra, Rlen, t0, t1);
11606
11607 leave();
11608 ret(lr);
11609
11610 return entry;
11611 }
11612 // In C, approximately:
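  // (As above, MACC adds the 128-bit product into t2:t1:t0; MACC2 adds it
  // twice, because each cross product appears twice when squaring.)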
11613
11614 // void
11615 // montgomery_square(julong Pa_base[], julong Pn_base[],
11616 // julong Pm_base[], julong inv, int len) {
11617 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11618 // julong *Pa, *Pb, *Pn, *Pm;
11619 // julong Ra, Rb, Rn, Rm;
11620
11621 // int i;
11622
11623 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11624
11625 // for (i = 0; i < len; i++) {
11626 // int j;
11627
11628 // Pa = Pa_base;
11629 // Pb = Pa_base + i;
11630 // Pm = Pm_base;
11631 // Pn = Pn_base + i;
11632
11633 // Ra = *Pa;
11634 // Rb = *Pb;
11635 // Rm = *Pm;
11636 // Rn = *Pn;
11637
11638 // int iters = (i+1)/2;
11639 // for (j = 0; iters--; j++) {
11640 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11641 // MACC2(Ra, Rb, t0, t1, t2);
11642 // Ra = *++Pa;
11643 // Rb = *--Pb;
11644 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11645 // MACC(Rm, Rn, t0, t1, t2);
11646 // Rm = *++Pm;
11647 // Rn = *--Pn;
11648 // }
11649 // if ((i & 1) == 0) {
11650 // assert(Ra == Pa_base[j], "must be");
11651 // MACC(Ra, Ra, t0, t1, t2);
11652 // }
11653 // iters = i/2;
11654 // assert(iters == i-j, "must be");
11655 // for (; iters--; j++) {
11656 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11657 // MACC(Rm, Rn, t0, t1, t2);
11658 // Rm = *++Pm;
11659 // Rn = *--Pn;
11660 // }
11661
11662 // *Pm = Rm = t0 * inv;
11663 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11664 // MACC(Rm, Rn, t0, t1, t2);
11665
11666 // assert(t0 == 0, "broken Montgomery multiply");
11667
11668 // t0 = t1; t1 = t2; t2 = 0;
11669 // }
11670
11671 // for (i = len; i < 2*len; i++) {
11672 // int start = i-len+1;
11673 // int end = start + (len - start)/2;
11674 // int j;
11675
11676 // Pa = Pa_base + i-len;
11677 // Pb = Pa_base + len;
11678 // Pm = Pm_base + i-len;
11679 // Pn = Pn_base + len;
11680
11681 // Ra = *++Pa;
11682 // Rb = *--Pb;
11683 // Rm = *++Pm;
11684 // Rn = *--Pn;
11685
11686 // int iters = (2*len-i-1)/2;
11687 // assert(iters == end-start, "must be");
11688 // for (j = start; iters--; j++) {
11689 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11690 // MACC2(Ra, Rb, t0, t1, t2);
11691 // Ra = *++Pa;
11692 // Rb = *--Pb;
11693 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11694 // MACC(Rm, Rn, t0, t1, t2);
11695 // Rm = *++Pm;
11696 // Rn = *--Pn;
11697 // }
11698 // if ((i & 1) == 0) {
11699 // assert(Ra == Pa_base[j], "must be");
11700 // MACC(Ra, Ra, t0, t1, t2);
11701 // }
11702 // iters = (2*len-i)/2;
11703 // assert(iters == len-j, "must be");
11704 // for (; iters--; j++) {
11705 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11706 // MACC(Rm, Rn, t0, t1, t2);
11707 // Rm = *++Pm;
11708 // Rn = *--Pn;
11709 // }
11710 // Pm_base[i-len] = t0;
11711 // t0 = t1; t1 = t2; t2 = 0;
11712 // }
11713
11714 // while (t0)
11715 // t0 = sub(Pm_base, Pn_base, t0, len);
11716 // }
11717 };
11718
11719 // Initialization
11720 void generate_preuniverse_stubs() {
11721 // preuniverse stubs are not needed for aarch64
11722 }
11723
11724 void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points
11726
    // entry points that exist on all platforms. Note: This is code
11728 // that could be shared among different platforms - however the
11729 // benefit seems to be smaller than the disadvantage of having a
11730 // much more complicated generator structure. See also comment in
11731 // stubRoutines.hpp.
11732
11733 StubRoutines::_forward_exception_entry = generate_forward_exception();
11734
11735 StubRoutines::_call_stub_entry =
11736 generate_call_stub(StubRoutines::_call_stub_return_address);
11737
    // This entry is referenced by megamorphic calls.
11739 StubRoutines::_catch_exception_entry = generate_catch_exception();
11740
11741 // Initialize table for copy memory (arraycopy) check.
11742 if (UnsafeMemoryAccess::_table == nullptr) {
11743 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11744 }
11745
11746 if (UseCRC32Intrinsics) {
11747 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11748 }
11749
11750 if (UseCRC32CIntrinsics) {
11751 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11752 }
11753
11754 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11755 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11756 }
11757
11758 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11759 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11760 }
11761
11762 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11763 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11764 StubRoutines::_hf2f = generate_float16ToFloat();
11765 StubRoutines::_f2hf = generate_floatToFloat16();
11766 }
11767 }
11768
11769 void generate_continuation_stubs() {
11770 // Continuation stubs:
11771 StubRoutines::_cont_thaw = generate_cont_thaw();
11772 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11773 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11774 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11775 }
11776
11777 void generate_final_stubs() {
11778 // support for verify_oop (must happen after universe_init)
11779 if (VerifyOops) {
11780 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
11781 }
11782
11783 // arraycopy stubs used by compilers
11784 generate_arraycopy_stubs();
11785
11786 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11787
11788 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11789
11790 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11791 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11792
11793 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11794
11795 generate_atomic_entry_points();
11796
11797 #endif // LINUX
11798
11799 #ifdef COMPILER2
11800 if (UseSecondarySupersTable) {
11801 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11802 if (! InlineSecondarySupersTest) {
11803 generate_lookup_secondary_supers_table_stub();
11804 }
11805 }
11806 #endif
11807
11808 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11809
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11811 }
11812
11813 void generate_compiler_stubs() {
11814 #if COMPILER2_OR_JVMCI
11815
11816 if (UseSVE == 0) {
11817 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11818 }
11819
11820 // array equals stub for large arrays.
11821 if (!UseSimpleArrayEquals) {
11822 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11823 }
11824
    // arrays_hashcode stubs for large arrays.
11826 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11827 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11828 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11829 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11830 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11831
11832 // byte_array_inflate stub for large arrays.
11833 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11834
11835 // countPositives stub for large arrays.
11836 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11837
11838 generate_compare_long_strings();
11839
11840 generate_string_indexof_stubs();
11841
11842 #ifdef COMPILER2
11843 if (UseMultiplyToLenIntrinsic) {
11844 StubRoutines::_multiplyToLen = generate_multiplyToLen();
11845 }
11846
11847 if (UseSquareToLenIntrinsic) {
11848 StubRoutines::_squareToLen = generate_squareToLen();
11849 }
11850
11851 if (UseMulAddIntrinsic) {
11852 StubRoutines::_mulAdd = generate_mulAdd();
11853 }
11854
11855 if (UseSIMDForBigIntegerShiftIntrinsics) {
11856 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11857 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
11858 }
11859
11860 if (UseMontgomeryMultiplyIntrinsic) {
11861 StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11862 StubCodeMark mark(this, stub_id);
11863 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11864 StubRoutines::_montgomeryMultiply = g.generate_multiply();
11865 }
11866
11867 if (UseMontgomerySquareIntrinsic) {
11868 StubId stub_id = StubId::stubgen_montgomerySquare_id;
11869 StubCodeMark mark(this, stub_id);
11870 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11871 // We use generate_multiply() rather than generate_square()
11872 // because it's faster for the sizes of modulus we care about.
11873 StubRoutines::_montgomerySquare = g.generate_multiply();
11874 }
11875
11876 #endif // COMPILER2
11877
11878 if (UseChaCha20Intrinsics) {
11879 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11880 }
11881
11882 if (UseKyberIntrinsics) {
11883 StubRoutines::_kyberNtt = generate_kyberNtt();
11884 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11885 StubRoutines::_kyberNttMult = generate_kyberNttMult();
11886 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11887 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11888 StubRoutines::_kyber12To16 = generate_kyber12To16();
11889 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11890 }
11891
11892 if (UseDilithiumIntrinsics) {
11893 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11894 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11895 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11896 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11897 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11898 }
11899
11900 if (UseBASE64Intrinsics) {
11901 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11902 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11903 }
11904
11905 // data cache line writeback
11906 StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11907 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11908
11909 if (UseAESIntrinsics) {
11910 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11911 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11912 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11913 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11914 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11915 }
11916 if (UseGHASHIntrinsics) {
11917 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11918 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11919 }
11920 if (UseAESIntrinsics && UseGHASHIntrinsics) {
11921 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11922 }
11923
11924 if (UseMD5Intrinsics) {
11925 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
11926 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
11927 }
11928 if (UseSHA1Intrinsics) {
11929 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
11930 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
11931 }
11932 if (UseSHA256Intrinsics) {
11933 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
11934 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
11935 }
11936 if (UseSHA512Intrinsics) {
11937 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
11938 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
11939 }
11940 if (UseSHA3Intrinsics) {
11941
11942 StubRoutines::_double_keccak = generate_double_keccak();
11943 if (UseSIMDForSHA3Intrinsic) {
11944 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
11945 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
11946 } else {
11947 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
11948 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
11949 }
11950 }
11951
11952 if (UsePoly1305Intrinsics) {
11953 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11954 }
11955
11956 // generate Adler32 intrinsics code
11957 if (UseAdler32Intrinsics) {
11958 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11959 }
11960
11961 #endif // COMPILER2_OR_JVMCI
11962 }
11963
11964 public:
11965 StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
11966 switch(blob_id) {
11967 case BlobId::stubgen_preuniverse_id:
11968 generate_preuniverse_stubs();
11969 break;
11970 case BlobId::stubgen_initial_id:
11971 generate_initial_stubs();
11972 break;
11973 case BlobId::stubgen_continuation_id:
11974 generate_continuation_stubs();
11975 break;
11976 case BlobId::stubgen_compiler_id:
11977 generate_compiler_stubs();
11978 break;
11979 case BlobId::stubgen_final_id:
11980 generate_final_stubs();
11981 break;
11982 default:
11983 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
11984 break;
11985 };
11986 }
11987 }; // end class declaration
11988
11989 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
11990 StubGenerator g(code, blob_id);
11991 }
11992
11993
11994 #if defined (LINUX)
11995
11996 // Define pointers to atomic stubs and initialize them to point to the
11997 // code in atomic_aarch64.S.
11998
11999 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \
12000 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12001 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
12002 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12003 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
12004
12005 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12006 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12007 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12008 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12009 DEFAULT_ATOMIC_OP(xchg, 4, )
12010 DEFAULT_ATOMIC_OP(xchg, 8, )
12011 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12012 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12013 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12014 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12015 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12016 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12017 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12018 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12019 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12020 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12021
12022 #undef DEFAULT_ATOMIC_OP
12023
12024 #endif // LINUX