1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "code/aotCodeCache.hpp"
31 #include "compiler/oopMap.hpp"
32 #include "gc/shared/barrierSet.hpp"
33 #include "gc/shared/barrierSetAssembler.hpp"
34 #include "gc/shared/gc_globals.hpp"
35 #include "gc/shared/tlab_globals.hpp"
36 #include "interpreter/interpreter.hpp"
37 #include "memory/universe.hpp"
38 #include "nativeInst_aarch64.hpp"
39 #include "oops/instanceOop.hpp"
40 #include "oops/method.hpp"
41 #include "oops/objArrayKlass.hpp"
42 #include "oops/oop.inline.hpp"
43 #include "prims/methodHandles.hpp"
44 #include "prims/upcallLinker.hpp"
45 #include "runtime/arguments.hpp"
46 #include "runtime/atomicAccess.hpp"
47 #include "runtime/continuation.hpp"
48 #include "runtime/continuationEntry.inline.hpp"
49 #include "runtime/frame.inline.hpp"
50 #include "runtime/handles.inline.hpp"
51 #include "runtime/javaThread.hpp"
52 #include "runtime/sharedRuntime.hpp"
53 #include "runtime/stubCodeGenerator.hpp"
54 #include "runtime/stubRoutines.hpp"
55 #include "utilities/align.hpp"
56 #include "utilities/checkedCast.hpp"
57 #include "utilities/debug.hpp"
58 #include "utilities/globalDefinitions.hpp"
59 #include "utilities/intpow.hpp"
60 #include "utilities/powerOfTwo.hpp"
61 #ifdef COMPILER2
62 #include "opto/runtime.hpp"
63 #endif
64 #if INCLUDE_ZGC
65 #include "gc/z/zThreadLocalData.hpp"
66 #endif
67
68 // Declaration and definition of StubGenerator (no .hpp file).
69 // For a more detailed description of the stub routine structure
70 // see the comment in stubRoutines.hpp
71
72 #undef __
73 #define __ _masm->
74
75 #ifdef PRODUCT
76 #define BLOCK_COMMENT(str) /* nothing */
77 #else
78 #define BLOCK_COMMENT(str) __ block_comment(str)
79 #endif
80
81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
82
83 // Stub Code definitions
84
85 class StubGenerator: public StubCodeGenerator {
86 private:
87
88 #ifdef PRODUCT
89 #define inc_counter_np(counter) ((void)0)
90 #else
91 void inc_counter_np_(uint& counter) {
92 __ incrementw(ExternalAddress((address)&counter));
93 }
94 #define inc_counter_np(counter) \
95 BLOCK_COMMENT("inc_counter " #counter); \
96 inc_counter_np_(counter);
97 #endif
98
99 // Call stubs are used to call Java from C
100 //
101 // Arguments:
102 // c_rarg0:   call wrapper address       address
103 // c_rarg1:   result                     address
104 // c_rarg2:   result type                BasicType
105 // c_rarg3:   method                     Method*
106 // c_rarg4:   (interpreter) entry point  address
107 // c_rarg5:   parameters                 intptr_t*
108 // c_rarg6:   parameter size (in words)  int
109 // c_rarg7:   thread                     Thread*
110 //
111 // There is no return from the stub itself as any Java result
112 // is written to result
113 //
114 // we save r30 (lr) as the return PC at the base of the frame and
115 // link r29 (fp) below it as the frame pointer, then install sp (r31)
116 // into fp.
117 //
118 // we save r0-r7, which accounts for all the c arguments.
119 //
120 // TODO: strictly do we need to save them all? they are treated as
121 // volatile by C so could we omit saving the ones we are going to
122 // place in global registers (thread? method?) or those we only use
123 // during setup of the Java call?
124 //
125 // we don't need to save r8 which C uses as the indirect result
126 // location register.
127 //
128 // we don't need to save r9-r15 which both C and Java treat as
129 // volatile
130 //
131 // we don't need to save r16-18 because Java does not use them
132 //
133 // we save r19-r28 which Java uses as scratch registers and C
134 // expects to be callee-save
135 //
136 // we save the bottom 64 bits of each value stored in v8-v15; it is
137 // the responsibility of the caller to preserve larger values.
138 //
139 // so the stub frame looks like this when we enter Java code
140 //
141 // [ return_from_Java ] <--- sp
142 // [ argument word n ]
143 // ...
144 // -29 [ argument word 1 ]
145 // -28 [ saved Floating-point Control Register ]
146 // -26 [ saved v15 ] <--- sp_after_call
147 // -25 [ saved v14 ]
148 // -24 [ saved v13 ]
149 // -23 [ saved v12 ]
150 // -22 [ saved v11 ]
151 // -21 [ saved v10 ]
152 // -20 [ saved v9 ]
153 // -19 [ saved v8 ]
154 // -18 [ saved r28 ]
155 // -17 [ saved r27 ]
156 // -16 [ saved r26 ]
157 // -15 [ saved r25 ]
158 // -14 [ saved r24 ]
159 // -13 [ saved r23 ]
160 // -12 [ saved r22 ]
161 // -11 [ saved r21 ]
162 // -10 [ saved r20 ]
163 // -9 [ saved r19 ]
164 // -8 [ call wrapper (r0) ]
165 // -7 [ result (r1) ]
166 // -6 [ result type (r2) ]
167 // -5 [ method (r3) ]
168 // -4 [ entry point (r4) ]
169 // -3 [ parameters (r5) ]
170 // -2 [ parameter size (r6) ]
171 // -1 [ thread (r7) ]
172 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
173 // 1 [ saved lr (r30) ]
174
175 // Call stub stack layout word offsets from fp
176 enum call_stub_layout {
177 sp_after_call_off = -28,
178
179 fpcr_off = sp_after_call_off,
180 d15_off = -26,
181 d13_off = -24,
182 d11_off = -22,
183 d9_off = -20,
184
185 r28_off = -18,
186 r26_off = -16,
187 r24_off = -14,
188 r22_off = -12,
189 r20_off = -10,
190 call_wrapper_off = -8,
191 result_off = -7,
192 result_type_off = -6,
193 method_off = -5,
194 entry_point_off = -4,
195 parameter_size_off = -2,
196 thread_off = -1,
197 fp_f = 0,
198 retaddr_off = 1,
199 };
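// For illustration: each enum value above is a word offset from rfp, so a
// slot's address is rfp + off * wordSize. E.g. the saved thread slot sits at
// rfp + (-1) * 8 = rfp - 8 and r20_save at rfp - 80. The offsets come in
// adjacent pairs (r20/r19, r22/r21, ...) so each stp/ldp below can save or
// restore two registers with a single instruction.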
200
201 address generate_call_stub(address& return_address) {
202 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
203 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
204 "adjust this code");
205
206 StubId stub_id = StubId::stubgen_call_stub_id;
207 StubCodeMark mark(this, stub_id);
208 address start = __ pc();
209
210 const Address sp_after_call (rfp, sp_after_call_off * wordSize);
211
212 const Address fpcr_save (rfp, fpcr_off * wordSize);
213 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
214 const Address result (rfp, result_off * wordSize);
215 const Address result_type (rfp, result_type_off * wordSize);
216 const Address method (rfp, method_off * wordSize);
217 const Address entry_point (rfp, entry_point_off * wordSize);
218 const Address parameter_size(rfp, parameter_size_off * wordSize);
219
220 const Address thread (rfp, thread_off * wordSize);
221
222 const Address d15_save (rfp, d15_off * wordSize);
223 const Address d13_save (rfp, d13_off * wordSize);
224 const Address d11_save (rfp, d11_off * wordSize);
225 const Address d9_save (rfp, d9_off * wordSize);
226
227 const Address r28_save (rfp, r28_off * wordSize);
228 const Address r26_save (rfp, r26_off * wordSize);
229 const Address r24_save (rfp, r24_off * wordSize);
230 const Address r22_save (rfp, r22_off * wordSize);
231 const Address r20_save (rfp, r20_off * wordSize);
232
233 // stub code
234
235 address aarch64_entry = __ pc();
236
237 // set up frame and move sp to end of save area
238 __ enter();
239 __ sub(sp, rfp, -sp_after_call_off * wordSize);
240
241 // save register parameters and Java scratch/global registers
242 // n.b. we save thread even though it gets installed in
243 // rthread because we want to sanity check rthread later
244 __ str(c_rarg7, thread);
245 __ strw(c_rarg6, parameter_size);
246 __ stp(c_rarg4, c_rarg5, entry_point);
247 __ stp(c_rarg2, c_rarg3, result_type);
248 __ stp(c_rarg0, c_rarg1, call_wrapper);
249
250 __ stp(r20, r19, r20_save);
251 __ stp(r22, r21, r22_save);
252 __ stp(r24, r23, r24_save);
253 __ stp(r26, r25, r26_save);
254 __ stp(r28, r27, r28_save);
255
256 __ stpd(v9, v8, d9_save);
257 __ stpd(v11, v10, d11_save);
258 __ stpd(v13, v12, d13_save);
259 __ stpd(v15, v14, d15_save);
260
261 __ get_fpcr(rscratch1);
262 __ str(rscratch1, fpcr_save);
263 // Set FPCR to the state we need. We do want Round to Nearest. We
264 // don't want non-IEEE rounding modes or floating-point traps.
265 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
266 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
267 __ set_fpcr(rscratch1);
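// Illustrative note on the two bfi instructions above: BFI Xd, Xn, #lsb,
// #width inserts the low #width bits of Xn into Xd[lsb+width-1:lsb] and
// leaves the remaining bits untouched. With zr as the source this is roughly
//   fpcr &= ~(0xfu  << 22);   // RMode[23:22], FZ[24], DN[25]
//   fpcr &= ~(0x1fu <<  8);   // IOE/DZE/OFE/UFE/IXE trap enables, bits 12:8
// before the cleaned value is written back with set_fpcr.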
268
269 // install Java thread in global register now we have saved
270 // whatever value it held
271 __ mov(rthread, c_rarg7);
272 // And method
273 __ mov(rmethod, c_rarg3);
274
275 // set up the heapbase register
276 __ reinit_heapbase();
277
278 #ifdef ASSERT
279 // make sure we have no pending exceptions
280 {
281 Label L;
282 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
283 __ cmp(rscratch1, (u1)NULL_WORD);
284 __ br(Assembler::EQ, L);
285 __ stop("StubRoutines::call_stub: entered with pending exception");
286 __ BIND(L);
287 }
288 #endif
289 // pass parameters if any
290 __ mov(esp, sp);
291 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
292 __ andr(sp, rscratch1, -2 * wordSize);
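// Worked example for the two instructions above: with c_rarg6 == 3
// parameter words, rscratch1 = sp - (3 << LogBytesPerWord) = sp - 24, and
// the andr rounds that down to the next 16-byte boundary (sp must stay
// 16-byte aligned on AArch64), so sp drops by 32 bytes in total.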
293
294 BLOCK_COMMENT("pass parameters if any");
295 Label parameters_done;
296 // parameter count is still in c_rarg6
297 // and parameter pointer identifying param 1 is in c_rarg5
298 __ cbzw(c_rarg6, parameters_done);
299
300 address loop = __ pc();
301 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
302 __ subsw(c_rarg6, c_rarg6, 1);
303 __ push(rscratch1);
304 __ br(Assembler::GT, loop);
305
306 __ BIND(parameters_done);
307
308 // call Java entry -- passing Method* and current sp
309 // rmethod: Method*
310 // r19_sender_sp: sender sp
311 BLOCK_COMMENT("call Java function");
312 __ mov(r19_sender_sp, sp);
313 __ blr(c_rarg4);
314
315 // we do this here because the notify will already have been done
316 // if we get to the next instruction via an exception
317 //
318 // n.b. adding this instruction here affects the calculation of
319 // whether or not a routine returns to the call stub (used when
320 // doing stack walks) since the normal test is to check the return
321 // pc against the address saved below. so we may need to allow for
322 // this extra instruction in the check.
323
324 // save current address for use by exception handling code
325
326 return_address = __ pc();
327
328 // store result depending on type (everything that is not
329 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
330 // n.b. this assumes Java returns an integral result in r0
331 // and a floating result in j_farg0
332 __ ldr(j_rarg2, result);
333 Label is_long, is_float, is_double, exit;
334 __ ldr(j_rarg1, result_type);
335 __ cmp(j_rarg1, (u1)T_OBJECT);
336 __ br(Assembler::EQ, is_long);
337 __ cmp(j_rarg1, (u1)T_LONG);
338 __ br(Assembler::EQ, is_long);
339 __ cmp(j_rarg1, (u1)T_FLOAT);
340 __ br(Assembler::EQ, is_float);
341 __ cmp(j_rarg1, (u1)T_DOUBLE);
342 __ br(Assembler::EQ, is_double);
343
344 // handle T_INT case
345 __ strw(r0, Address(j_rarg2));
346
347 __ BIND(exit);
348
349 // pop parameters
350 __ sub(esp, rfp, -sp_after_call_off * wordSize);
351
352 #ifdef ASSERT
353 // verify that threads correspond
354 {
355 Label L, S;
356 __ ldr(rscratch1, thread);
357 __ cmp(rthread, rscratch1);
358 __ br(Assembler::NE, S);
359 __ get_thread(rscratch1);
360 __ cmp(rthread, rscratch1);
361 __ br(Assembler::EQ, L);
362 __ BIND(S);
363 __ stop("StubRoutines::call_stub: threads must correspond");
364 __ BIND(L);
365 }
366 #endif
367
368 __ pop_cont_fastpath(rthread);
369
370 // restore callee-save registers
371 __ ldpd(v15, v14, d15_save);
372 __ ldpd(v13, v12, d13_save);
373 __ ldpd(v11, v10, d11_save);
374 __ ldpd(v9, v8, d9_save);
375
376 __ ldp(r28, r27, r28_save);
377 __ ldp(r26, r25, r26_save);
378 __ ldp(r24, r23, r24_save);
379 __ ldp(r22, r21, r22_save);
380 __ ldp(r20, r19, r20_save);
381
382 // restore fpcr
383 __ ldr(rscratch1, fpcr_save);
384 __ set_fpcr(rscratch1);
385
386 __ ldp(c_rarg0, c_rarg1, call_wrapper);
387 __ ldrw(c_rarg2, result_type);
388 __ ldr(c_rarg3, method);
389 __ ldp(c_rarg4, c_rarg5, entry_point);
390 __ ldp(c_rarg6, c_rarg7, parameter_size);
391
392 // leave frame and return to caller
393 __ leave();
394 __ ret(lr);
395
396 // handle return types different from T_INT
397
398 __ BIND(is_long);
399 __ str(r0, Address(j_rarg2, 0));
400 __ br(Assembler::AL, exit);
401
402 __ BIND(is_float);
403 __ strs(j_farg0, Address(j_rarg2, 0));
404 __ br(Assembler::AL, exit);
405
406 __ BIND(is_double);
407 __ strd(j_farg0, Address(j_rarg2, 0));
408 __ br(Assembler::AL, exit);
409
410 return start;
411 }
412
413 // Return point for a Java call if there's an exception thrown in
414 // Java code. The exception is caught and transformed into a
415 // pending exception stored in JavaThread that can be tested from
416 // within the VM.
417 //
418 // Note: Usually the parameters are removed by the callee. In case
419 // of an exception crossing an activation frame boundary, that is
420 // not the case if the callee is compiled code => need to set up the
421 // sp.
422 //
423 // r0: exception oop
424
425 address generate_catch_exception() {
426 StubId stub_id = StubId::stubgen_catch_exception_id;
427 StubCodeMark mark(this, stub_id);
428 address start = __ pc();
429
430 // same as in generate_call_stub():
431 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
432 const Address thread (rfp, thread_off * wordSize);
433
434 #ifdef ASSERT
435 // verify that threads correspond
436 {
437 Label L, S;
438 __ ldr(rscratch1, thread);
439 __ cmp(rthread, rscratch1);
440 __ br(Assembler::NE, S);
441 __ get_thread(rscratch1);
442 __ cmp(rthread, rscratch1);
443 __ br(Assembler::EQ, L);
444 __ bind(S);
445 __ stop("StubRoutines::catch_exception: threads must correspond");
446 __ bind(L);
447 }
448 #endif
449
450 // set pending exception
451 __ verify_oop(r0);
452
453 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
454 __ mov(rscratch1, (address)__FILE__);
455 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
456 __ movw(rscratch1, (int)__LINE__);
457 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
458
459 // complete return to VM
460 assert(StubRoutines::_call_stub_return_address != nullptr,
461 "_call_stub_return_address must have been generated before");
462 __ b(StubRoutines::_call_stub_return_address);
463
464 return start;
465 }
466
467 // Continuation point for runtime calls returning with a pending
468 // exception. The pending exception check happened in the runtime
469 // or native call stub. The pending exception in Thread is
470 // converted into a Java-level exception.
471 //
472 // Contract with Java-level exception handlers:
473 // r0: exception
474 // r3: throwing pc
475 //
476 // NOTE: At entry of this stub, exception-pc must be in LR !!
477
478 // NOTE: this is always used as a jump target within generated code
479 // so it just needs to be generated code with no prolog
480
481 address generate_forward_exception() {
482 StubId stub_id = StubId::stubgen_forward_exception_id;
483 StubCodeMark mark(this, stub_id);
484 address start = __ pc();
485
486 // Upon entry, LR points to the return address returning into
487 // Java (interpreted or compiled) code; i.e., the return address
488 // becomes the throwing pc.
489 //
490 // Arguments pushed before the runtime call are still on the stack
491 // but the exception handler will reset the stack pointer ->
492 // ignore them. A potential result in registers can be ignored as
493 // well.
494
495 #ifdef ASSERT
496 // make sure this code is only executed if there is a pending exception
497 {
498 Label L;
499 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
500 __ cbnz(rscratch1, L);
501 __ stop("StubRoutines::forward exception: no pending exception (1)");
502 __ bind(L);
503 }
504 #endif
505
506 // compute exception handler into r19
507
508 // call the VM to find the handler address associated with the
509 // caller address. pass thread in r0 and caller pc (ret address)
510 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
511 // the stack.
512 __ mov(c_rarg1, lr);
513 // lr will be trashed by the VM call so we move it to R19
514 // (callee-saved) because we also need to pass it to the handler
515 // returned by this call.
516 __ mov(r19, lr);
517 BLOCK_COMMENT("call exception_handler_for_return_address");
518 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
519 SharedRuntime::exception_handler_for_return_address),
520 rthread, c_rarg1);
521 // Reinitialize the ptrue predicate register, in case the external runtime
522 // call clobbers ptrue reg, as we may return to SVE compiled code.
523 __ reinitialize_ptrue();
524
525 // we should not really care that lr is no longer the callee
526 // address. we saved the value the handler needs in r19 so we can
527 // just copy it to r3. however, the C2 handler will push its own
528 // frame and then calls into the VM and the VM code asserts that
529 // the PC for the frame above the handler belongs to a compiled
530 // Java method. So, we restore lr here to satisfy that assert.
531 __ mov(lr, r19);
532 // setup r0 & r3 & clear pending exception
533 __ mov(r3, r19);
534 __ mov(r19, r0);
535 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
536 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
537
538 #ifdef ASSERT
539 // make sure exception is set
540 {
541 Label L;
542 __ cbnz(r0, L);
543 __ stop("StubRoutines::forward exception: no pending exception (2)");
544 __ bind(L);
545 }
546 #endif
547
548 // continue at exception handler
549 // r0: exception
550 // r3: throwing pc
551 // r19: exception handler
552 __ verify_oop(r0);
553 __ br(r19);
554
555 return start;
556 }
557
558 // Non-destructive plausibility checks for oops
559 //
560 // Arguments:
561 // r0: oop to verify
562 // rscratch1: error message
563 //
564 // Stack after saving c_rarg3:
565 // [tos + 0]: saved c_rarg3
566 // [tos + 1]: saved c_rarg2
567 // [tos + 2]: saved lr
568 // [tos + 3]: saved rscratch2
569 // [tos + 4]: saved r0
570 // [tos + 5]: saved rscratch1
571 address generate_verify_oop() {
572 StubId stub_id = StubId::stubgen_verify_oop_id;
573 StubCodeMark mark(this, stub_id);
574 address start = __ pc();
575
576 Label exit, error;
577
578 // save c_rarg2 and c_rarg3
579 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
580
581 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
582 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
583 __ ldr(c_rarg3, Address(c_rarg2));
584 __ add(c_rarg3, c_rarg3, 1);
585 __ str(c_rarg3, Address(c_rarg2));
586
587 // object is in r0
588 // make sure object is 'reasonable'
589 __ cbz(r0, exit); // if obj is null it is OK
590
591 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
592 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
593
594 // return if everything seems ok
595 __ bind(exit);
596
597 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
598 __ ret(lr);
599
600 // handle errors
601 __ bind(error);
602 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
603
604 __ push(RegSet::range(r0, r29), sp);
605 // debug(char* msg, int64_t pc, int64_t regs[])
606 __ mov(c_rarg0, rscratch1); // pass address of error message
607 __ mov(c_rarg1, lr); // pass return address
608 __ mov(c_rarg2, sp); // pass address of regs on stack
609 #ifndef PRODUCT
610 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
611 #endif
612 BLOCK_COMMENT("call MacroAssembler::debug");
613 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
614 __ blr(rscratch1);
615 __ hlt(0);
616
617 return start;
618 }
619
620 // Generate indices for iota vector.
621 address generate_iota_indices(StubId stub_id) {
622 __ align(CodeEntryAlignment);
623 StubCodeMark mark(this, stub_id);
624 address start = __ pc();
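// Each pair of 64-bit words emitted below forms one 128-bit constant whose
// lanes hold ascending indices (AArch64 is little-endian, so the low byte of
// a constant is lane 0). E.g. the byte table 0x0706050403020100 /
// 0x0F0E0D0C0B0A0908 is simply lanes 0..15 holding 0..15; the FP tables hold
// 0.0, 1.0, ... at the corresponding lane width.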
625 // B
626 __ emit_data64(0x0706050403020100, relocInfo::none);
627 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
628 // H
629 __ emit_data64(0x0003000200010000, relocInfo::none);
630 __ emit_data64(0x0007000600050004, relocInfo::none);
631 // S
632 __ emit_data64(0x0000000100000000, relocInfo::none);
633 __ emit_data64(0x0000000300000002, relocInfo::none);
634 // D
635 __ emit_data64(0x0000000000000000, relocInfo::none);
636 __ emit_data64(0x0000000000000001, relocInfo::none);
637 // S - FP
638 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
639 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
640 // D - FP
641 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
642 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
643 return start;
644 }
645
646 // The inner part of zero_words(). This is the bulk operation,
647 // zeroing words in blocks, possibly using DC ZVA to do it. The
648 // caller is responsible for zeroing the last few words.
649 //
650 // Inputs:
651 // r10: the HeapWord-aligned base address of an array to zero.
652 // r11: the count in HeapWords, r11 > 0.
653 //
654 // Returns r10 and r11, adjusted for the caller to clear.
655 // r10: the base address of the tail of words left to clear.
656 // r11: the number of words in the tail.
657 // r11 < MacroAssembler::zero_words_block_size.
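// A rough caller-side view of this contract (sketch only; the real caller is
// MacroAssembler::zero_words):
//
//   (r10, r11) = zero_blocks(r10, r11);  // bulk zero, possibly using DC ZVA
//   while (r11-- > 0) *r10++ = 0;        // caller clears the short tail itself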
658
659 address generate_zero_blocks() {
660 Label done;
661 Label base_aligned;
662
663 Register base = r10, cnt = r11;
664
665 __ align(CodeEntryAlignment);
666 StubId stub_id = StubId::stubgen_zero_blocks_id;
667 StubCodeMark mark(this, stub_id);
668 address start = __ pc();
669
670 if (UseBlockZeroing) {
671 int zva_length = VM_Version::zva_length();
672
673 // Ensure ZVA length can be divided by 16. This is required by
674 // the subsequent operations.
675 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
676
677 __ tbz(base, 3, base_aligned);
678 __ str(zr, Address(__ post(base, 8)));
679 __ sub(cnt, cnt, 1);
680 __ bind(base_aligned);
681
682 // Ensure count >= zva_length * 2 so that it still deserves a zva after
683 // alignment.
684 Label small;
685 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
686 __ subs(rscratch1, cnt, low_limit >> 3);
687 __ br(Assembler::LT, small);
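// e.g. with a 64-byte ZVA line, low_limit is at least 128 bytes; the subs
// above compares cnt (in words) against low_limit >> 3 (bytes -> words), so
// the DC ZVA path is only taken when at least that many words remain.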
688 __ zero_dcache_blocks(base, cnt);
689 __ bind(small);
690 }
691
692 {
693 // Number of stp instructions we'll unroll
694 const int unroll =
695 MacroAssembler::zero_words_block_size / 2;
696 // Clear the remaining blocks.
697 Label loop;
698 __ subs(cnt, cnt, unroll * 2);
699 __ br(Assembler::LT, done);
700 __ bind(loop);
701 for (int i = 0; i < unroll; i++)
702 __ stp(zr, zr, __ post(base, 16));
703 __ subs(cnt, cnt, unroll * 2);
704 __ br(Assembler::GE, loop);
705 __ bind(done);
706 __ add(cnt, cnt, unroll * 2);
707 }
708
709 __ ret(lr);
710
711 return start;
712 }
713
714
715 typedef enum {
716 copy_forwards = 1,
717 copy_backwards = -1
718 } copy_direction;
719
720 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
721 // for arraycopy stubs.
722 class ArrayCopyBarrierSetHelper : StackObj {
723 BarrierSetAssembler* _bs_asm;
724 MacroAssembler* _masm;
725 DecoratorSet _decorators;
726 BasicType _type;
727 Register _gct1;
728 Register _gct2;
729 Register _gct3;
730 FloatRegister _gcvt1;
731 FloatRegister _gcvt2;
732 FloatRegister _gcvt3;
733
734 public:
735 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
736 DecoratorSet decorators,
737 BasicType type,
738 Register gct1,
739 Register gct2,
740 Register gct3,
741 FloatRegister gcvt1,
742 FloatRegister gcvt2,
743 FloatRegister gcvt3)
744 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
745 _masm(masm),
746 _decorators(decorators),
747 _type(type),
748 _gct1(gct1),
749 _gct2(gct2),
750 _gct3(gct3),
751 _gcvt1(gcvt1),
752 _gcvt2(gcvt2),
753 _gcvt3(gcvt3) {
754 }
755
756 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
757 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
758 dst1, dst2, src,
759 _gct1, _gct2, _gcvt1);
760 }
761
762 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
763 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
764 dst, src1, src2,
765 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
766 }
767
768 void copy_load_at_16(Register dst1, Register dst2, Address src) {
769 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
770 dst1, dst2, src,
771 _gct1);
772 }
773
774 void copy_store_at_16(Address dst, Register src1, Register src2) {
775 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
776 dst, src1, src2,
777 _gct1, _gct2, _gct3);
778 }
779
780 void copy_load_at_8(Register dst, Address src) {
781 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
782 dst, noreg, src,
783 _gct1);
784 }
785
786 void copy_store_at_8(Address dst, Register src) {
787 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
788 dst, src, noreg,
789 _gct1, _gct2, _gct3);
790 }
791 };
792
793 // Bulk copy of blocks of 8 words.
794 //
795 // count is a count of words.
796 //
797 // Precondition: count >= 8
798 //
799 // Postconditions:
800 //
801 // The least significant bit of count contains the remaining count
802 // of words to copy. The rest of count is trash.
803 //
804 // s and d are adjusted to point to the remaining words to copy
805 //
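// A rough sketch of the aligned fast path generated below (illustrative
// pseudo-code only; the real code also has SIMD, prefetch and GC-barrier
// variants):
//
//   load 8 words into registers;
//   count -= 16;
//   while (count >= 0) { store the buffered 8; load the next 8; count -= 8; }
//   store the buffered 8;                 // drain
//   if (count & 4) copy a 4 word sub-block;
//   if (count & 2) copy a 2 word sub-block;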
806 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
807 BasicType type;
808 copy_direction direction;
809
810 switch (stub_id) {
811 case StubId::stubgen_copy_byte_f_id:
812 direction = copy_forwards;
813 type = T_BYTE;
814 break;
815 case StubId::stubgen_copy_byte_b_id:
816 direction = copy_backwards;
817 type = T_BYTE;
818 break;
819 case StubId::stubgen_copy_oop_f_id:
820 direction = copy_forwards;
821 type = T_OBJECT;
822 break;
823 case StubId::stubgen_copy_oop_b_id:
824 direction = copy_backwards;
825 type = T_OBJECT;
826 break;
827 case StubId::stubgen_copy_oop_uninit_f_id:
828 direction = copy_forwards;
829 type = T_OBJECT;
830 break;
831 case StubId::stubgen_copy_oop_uninit_b_id:
832 direction = copy_backwards;
833 type = T_OBJECT;
834 break;
835 default:
836 ShouldNotReachHere();
837 }
838
839 int unit = wordSize * direction;
840 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
841
842 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
843 t4 = r7, t5 = r11, t6 = r12, t7 = r13;
844 const Register stride = r14;
845 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
846 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
847 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
848
849 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
850 assert_different_registers(s, d, count, rscratch1, rscratch2);
851
852 Label again, drain;
853
854 __ align(CodeEntryAlignment);
855
856 StubCodeMark mark(this, stub_id);
857
858 address start = __ pc();
859
860 Label unaligned_copy_long;
861 if (AvoidUnalignedAccesses) {
862 __ tbnz(d, 3, unaligned_copy_long);
863 }
864
865 if (direction == copy_forwards) {
866 __ sub(s, s, bias);
867 __ sub(d, d, bias);
868 }
869
870 #ifdef ASSERT
871 // Make sure we are never given < 8 words
872 {
873 Label L;
874 __ cmp(count, (u1)8);
875 __ br(Assembler::GE, L);
876 __ stop("generate_copy_longs called with < 8 words");
877 __ bind(L);
878 }
879 #endif
880
881 // Fill 8 registers
882 if (UseSIMDForMemoryOps) {
883 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
884 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
885 } else {
886 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
887 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
888 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
889 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
890 }
891
892 __ subs(count, count, 16);
893 __ br(Assembler::LO, drain);
894
895 int prefetch = PrefetchCopyIntervalInBytes;
896 bool use_stride = false;
897 if (direction == copy_backwards) {
898 use_stride = prefetch > 256;
899 prefetch = -prefetch;
900 if (use_stride) __ mov(stride, prefetch);
901 }
902
903 __ bind(again);
904
905 if (PrefetchCopyIntervalInBytes > 0)
906 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
907
908 if (UseSIMDForMemoryOps) {
909 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
910 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
911 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
912 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
913 } else {
914 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
915 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
916 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
917 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
918 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
919 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
920 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
921 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
922 }
923
924 __ subs(count, count, 8);
925 __ br(Assembler::HS, again);
926
927 // Drain
928 __ bind(drain);
929 if (UseSIMDForMemoryOps) {
930 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
931 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
932 } else {
933 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
934 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
935 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
936 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
937 }
938
939 {
940 Label L1, L2;
941 __ tbz(count, exact_log2(4), L1);
942 if (UseSIMDForMemoryOps) {
943 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
944 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
945 } else {
946 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
947 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
948 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
949 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
950 }
951 __ bind(L1);
952
953 if (direction == copy_forwards) {
954 __ add(s, s, bias);
955 __ add(d, d, bias);
956 }
957
958 __ tbz(count, 1, L2);
959 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
960 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
961 __ bind(L2);
962 }
963
964 __ ret(lr);
965
966 if (AvoidUnalignedAccesses) {
967 Label drain, again;
968 // Register order for storing. Order is different for backward copy.
969
970 __ bind(unaligned_copy_long);
971
972 // source address is even aligned, target odd aligned
973 //
974 // when forward copying word pairs we read long pairs at offsets
975 // {0, 2, 4, 6} (in long words). when backwards copying we read
976 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
977 // address by -2 in the forwards case so we can compute the
978 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
979 // or -1.
980 //
981 // when forward copying we need to store 1 word, 3 pairs and
982 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
983 // zero offset, we adjust the destination by -1, which means we
984 // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
985 //
986 // When backwards copying we need to store 1 word, 3 pairs and
987 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
988 // offsets {1, 3, 5, 7, 8} * unit.
989
990 if (direction == copy_forwards) {
991 __ sub(s, s, 16);
992 __ sub(d, d, 8);
993 }
994
995 // Fill 8 registers
996 //
997 // for forwards copy s was offset by -16 from the original input
998 // value of s so the register contents are at these offsets
999 // relative to the 64 bit block addressed by that original input
1000 // and so on for each successive 64 byte block when s is updated
1001 //
1002 // t0 at offset 0, t1 at offset 8
1003 // t2 at offset 16, t3 at offset 24
1004 // t4 at offset 32, t5 at offset 40
1005 // t6 at offset 48, t7 at offset 56
1006
1007 // for backwards copy s was not offset so the register contents
1008 // are at these offsets into the preceding 64 byte block
1009 // relative to that original input and so on for each successive
1010 // preceding 64 byte block when s is updated. this explains the
1011 // slightly counter-intuitive looking pattern of register usage
1012 // in the stp instructions for backwards copy.
1013 //
1014 // t0 at offset -16, t1 at offset -8
1015 // t2 at offset -32, t3 at offset -24
1016 // t4 at offset -48, t5 at offset -40
1017 // t6 at offset -64, t7 at offset -56
1018
1019 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1020 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1021 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1022 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1023
1024 __ subs(count, count, 16);
1025 __ br(Assembler::LO, drain);
1026
1027 int prefetch = PrefetchCopyIntervalInBytes;
1028 bool use_stride = false;
1029 if (direction == copy_backwards) {
1030 use_stride = prefetch > 256;
1031 prefetch = -prefetch;
1032 if (use_stride) __ mov(stride, prefetch);
1033 }
1034
1035 __ bind(again);
1036
1037 if (PrefetchCopyIntervalInBytes > 0)
1038 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1039
1040 if (direction == copy_forwards) {
1041 // allowing for the offset of -8 the store instructions place
1042 // registers into the target 64 bit block at the following
1043 // offsets
1044 //
1045 // t0 at offset 0
1046 // t1 at offset 8, t2 at offset 16
1047 // t3 at offset 24, t4 at offset 32
1048 // t5 at offset 40, t6 at offset 48
1049 // t7 at offset 56
1050
1051 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1052 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1053 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1054 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1055 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1056 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1057 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1058 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1059 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1060 } else {
1061 // d was not offset when we started so the registers are
1062 // written into the 64 bit block preceding d with the following
1063 // offsets
1064 //
1065 // t1 at offset -8
1066 // t3 at offset -24, t0 at offset -16
1067 // t5 at offset -48, t2 at offset -32
1068 // t7 at offset -56, t4 at offset -48
1069 // t6 at offset -64
1070 //
1071 // note that this matches the offsets previously noted for the
1072 // loads
1073
1074 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1075 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1076 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1077 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1078 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1079 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1080 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1081 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1082 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1083 }
1084
1085 __ subs(count, count, 8);
1086 __ br(Assembler::HS, again);
1087
1088 // Drain
1089 //
1090 // this uses the same pattern of offsets and register arguments
1091 // as above
1092 __ bind(drain);
1093 if (direction == copy_forwards) {
1094 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1095 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1096 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1097 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1098 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1099 } else {
1100 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1101 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1102 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1103 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1104 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1105 }
1106 // now we need to copy any remaining part block which may
1107 // include a 4 word subblock and/or a 2 word subblock.
1108 // bits 2 and 1 in the count are the tell-tale for whether we
1109 // have each such subblock
1110 {
1111 Label L1, L2;
1112 __ tbz(count, exact_log2(4), L1);
1113 // this is the same as above but copying only 4 longs hence
1114 // with only one intervening stp between the str instructions
1115 // but note that the offsets and registers still follow the
1116 // same pattern
1117 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1118 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1119 if (direction == copy_forwards) {
1120 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1121 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1122 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1123 } else {
1124 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1125 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1126 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1127 }
1128 __ bind(L1);
1129
1130 __ tbz(count, 1, L2);
1131 // this is the same as above but copying only 2 longs hence
1132 // there is no intervening stp between the str instructions
1133 // but note that the offset and register patterns are still
1134 // the same
1135 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1136 if (direction == copy_forwards) {
1137 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1138 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1139 } else {
1140 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1141 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1142 }
1143 __ bind(L2);
1144
1145 // for forwards copy we need to re-adjust the offsets we
1146 // applied so that s and d follow the last words written
1147
1148 if (direction == copy_forwards) {
1149 __ add(s, s, 16);
1150 __ add(d, d, 8);
1151 }
1152
1153 }
1154
1155 __ ret(lr);
1156 }
1157
1158 return start;
1159 }
1160
1161 // Small copy: less than 16 bytes.
1162 //
1163 // NB: Ignores all of the bits of count which represent more than 15
1164 // bytes, so a caller doesn't have to mask them.
1165
1166 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1167 bool is_backwards = step < 0;
1168 size_t granularity = g_uabs(step);
1169 int direction = is_backwards ? -1 : 1;
1170
1171 Label Lword, Lint, Lshort, Lbyte;
1172
1173 assert(granularity
1174 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1175
1176 const Register t0 = r3;
1177 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1178 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1179
1180 // ??? I don't know if this bit-test-and-branch is the right thing
1181 // to do. It does a lot of jumping, resulting in several
1182 // mispredicted branches. It might make more sense to do this
1183 // with something like Duff's device with a single computed branch.
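// Roughly, with g = granularity in bytes, the tests below amount to
// (direction and pre/post-increment addressing omitted for clarity):
//
//   if (count & (8 / g))              copy one 8 byte word;
//   if (g <= 4 && (count & (4 / g)))  copy 4 bytes;
//   if (g <= 2 && (count & (2 / g)))  copy 2 bytes;
//   if (g == 1 && (count & 1))        copy 1 byte;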
1184
1185 __ tbz(count, 3 - exact_log2(granularity), Lword);
1186 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1187 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1188 __ bind(Lword);
1189
1190 if (granularity <= sizeof (jint)) {
1191 __ tbz(count, 2 - exact_log2(granularity), Lint);
1192 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1193 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1194 __ bind(Lint);
1195 }
1196
1197 if (granularity <= sizeof (jshort)) {
1198 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1199 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1200 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1201 __ bind(Lshort);
1202 }
1203
1204 if (granularity <= sizeof (jbyte)) {
1205 __ tbz(count, 0, Lbyte);
1206 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1207 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1208 __ bind(Lbyte);
1209 }
1210 }
1211
1212 // All-singing all-dancing memory copy.
1213 //
1214 // Copy count units of memory from s to d. The size of a unit is the
1215 // magnitude of step; its sign gives the direction of the copy.
1216 // If is_aligned is false, we align the source address.
1217 //
1218
1219 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1220 Register s, Register d, Register count, int step) {
1221 copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1222 bool is_backwards = step < 0;
1223 unsigned int granularity = g_uabs(step);
1224 const Register t0 = r3, t1 = r4;
1225
1226 // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't matter because we always
1227 // load all the data before writing anything.
1228 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1229 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1230 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1231 const Register send = r17, dend = r16;
1232 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1233 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1234 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1235
1236 if (PrefetchCopyIntervalInBytes > 0)
1237 __ prfm(Address(s, 0), PLDL1KEEP);
1238 __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1239 __ br(Assembler::HI, copy_big);
1240
1241 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1242 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1243
1244 __ cmp(count, u1(16/granularity));
1245 __ br(Assembler::LS, copy16);
1246
1247 __ cmp(count, u1(64/granularity));
1248 __ br(Assembler::HI, copy80);
1249
1250 __ cmp(count, u1(32/granularity));
1251 __ br(Assembler::LS, copy32);
1252
1253 // 33..64 bytes
1254 if (UseSIMDForMemoryOps) {
1255 bs.copy_load_at_32(v0, v1, Address(s, 0));
1256 bs.copy_load_at_32(v2, v3, Address(send, -32));
1257 bs.copy_store_at_32(Address(d, 0), v0, v1);
1258 bs.copy_store_at_32(Address(dend, -32), v2, v3);
1259 } else {
1260 bs.copy_load_at_16(t0, t1, Address(s, 0));
1261 bs.copy_load_at_16(t2, t3, Address(s, 16));
1262 bs.copy_load_at_16(t4, t5, Address(send, -32));
1263 bs.copy_load_at_16(t6, t7, Address(send, -16));
1264
1265 bs.copy_store_at_16(Address(d, 0), t0, t1);
1266 bs.copy_store_at_16(Address(d, 16), t2, t3);
1267 bs.copy_store_at_16(Address(dend, -32), t4, t5);
1268 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1269 }
1270 __ b(finish);
1271
1272 // 17..32 bytes
1273 __ bind(copy32);
1274 bs.copy_load_at_16(t0, t1, Address(s, 0));
1275 bs.copy_load_at_16(t6, t7, Address(send, -16));
1276
1277 bs.copy_store_at_16(Address(d, 0), t0, t1);
1278 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1279 __ b(finish);
1280
1281 // 65..80/96 bytes
1282 // (96 bytes if SIMD because we do 32 bytes per instruction)
1283 __ bind(copy80);
1284 if (UseSIMDForMemoryOps) {
1285 bs.copy_load_at_32(v0, v1, Address(s, 0));
1286 bs.copy_load_at_32(v2, v3, Address(s, 32));
1287 // Unaligned pointers can be an issue for copying.
1288 // The issue is more likely when the granularity of the data is
1289 // less than 4 bytes (sizeof(jint)). Pointers for arrays of jint are at least
1290 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1291 // The biggest performance drop has been seen for the range 65-80 bytes.
1292 // For such cases, using a pair of ldp/stp instead of the third pair of
1293 // ldpq/stpq fixes the performance issue.
1294 if (granularity < sizeof (jint)) {
1295 Label copy96;
1296 __ cmp(count, u1(80/granularity));
1297 __ br(Assembler::HI, copy96);
1298 bs.copy_load_at_16(t0, t1, Address(send, -16));
1299
1300 bs.copy_store_at_32(Address(d, 0), v0, v1);
1301 bs.copy_store_at_32(Address(d, 32), v2, v3);
1302
1303 bs.copy_store_at_16(Address(dend, -16), t0, t1);
1304 __ b(finish);
1305
1306 __ bind(copy96);
1307 }
1308 bs.copy_load_at_32(v4, v5, Address(send, -32));
1309
1310 bs.copy_store_at_32(Address(d, 0), v0, v1);
1311 bs.copy_store_at_32(Address(d, 32), v2, v3);
1312
1313 bs.copy_store_at_32(Address(dend, -32), v4, v5);
1314 } else {
1315 bs.copy_load_at_16(t0, t1, Address(s, 0));
1316 bs.copy_load_at_16(t2, t3, Address(s, 16));
1317 bs.copy_load_at_16(t4, t5, Address(s, 32));
1318 bs.copy_load_at_16(t6, t7, Address(s, 48));
1319 bs.copy_load_at_16(t8, t9, Address(send, -16));
1320
1321 bs.copy_store_at_16(Address(d, 0), t0, t1);
1322 bs.copy_store_at_16(Address(d, 16), t2, t3);
1323 bs.copy_store_at_16(Address(d, 32), t4, t5);
1324 bs.copy_store_at_16(Address(d, 48), t6, t7);
1325 bs.copy_store_at_16(Address(dend, -16), t8, t9);
1326 }
1327 __ b(finish);
1328
1329 // 0..16 bytes
1330 __ bind(copy16);
1331 __ cmp(count, u1(8/granularity));
1332 __ br(Assembler::LO, copy8);
1333
1334 // 8..16 bytes
1335 bs.copy_load_at_8(t0, Address(s, 0));
1336 bs.copy_load_at_8(t1, Address(send, -8));
1337 bs.copy_store_at_8(Address(d, 0), t0);
1338 bs.copy_store_at_8(Address(dend, -8), t1);
1339 __ b(finish);
1340
1341 if (granularity < 8) {
1342 // 4..7 bytes
1343 __ bind(copy8);
1344 __ tbz(count, 2 - exact_log2(granularity), copy4);
1345 __ ldrw(t0, Address(s, 0));
1346 __ ldrw(t1, Address(send, -4));
1347 __ strw(t0, Address(d, 0));
1348 __ strw(t1, Address(dend, -4));
1349 __ b(finish);
1350 if (granularity < 4) {
1351 // 0..3 bytes
1352 __ bind(copy4);
1353 __ cbz(count, finish); // get rid of 0 case
1354 if (granularity == 2) {
1355 __ ldrh(t0, Address(s, 0));
1356 __ strh(t0, Address(d, 0));
1357 } else { // granularity == 1
1358 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1359 // the first and last byte.
1360 // Handle the 3 byte case by loading and storing base + count/2
1361 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1362 // This does mean that in the 1 byte case we load/store the same
1363 // byte 3 times.
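// e.g. count == 3 copies s[0]->d[0], s[2]->d[2] (via send/dend - 1) and
// s[1]->d[1] (base + count/2); count == 1 copies s[0] to d[0] three
// times over, which is harmless.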
1364 __ lsr(count, count, 1);
1365 __ ldrb(t0, Address(s, 0));
1366 __ ldrb(t1, Address(send, -1));
1367 __ ldrb(t2, Address(s, count));
1368 __ strb(t0, Address(d, 0));
1369 __ strb(t1, Address(dend, -1));
1370 __ strb(t2, Address(d, count));
1371 }
1372 __ b(finish);
1373 }
1374 }
1375
1376 __ bind(copy_big);
1377 if (is_backwards) {
1378 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1379 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1380 }
1381
1382 // Now we've got the small case out of the way we can align the
1383 // source address on a 2-word boundary.
1384
1385 // Here we will materialize a count in r15, which is used by copy_memory_small
1386 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1387 // Up until here, we have used t9, which aliases r15, but from here on, that register
1388 // can not be used as a temp register, as it contains the count.
1389
1390 Label aligned;
1391
1392 if (is_aligned) {
1393 // We may have to adjust by 1 word to get s 2-word-aligned.
1394 __ tbz(s, exact_log2(wordSize), aligned);
1395 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1396 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1397 __ sub(count, count, wordSize/granularity);
1398 } else {
1399 if (is_backwards) {
1400 __ andr(r15, s, 2 * wordSize - 1);
1401 } else {
1402 __ neg(r15, s);
1403 __ andr(r15, r15, 2 * wordSize - 1);
1404 }
1405 // r15 is the byte adjustment needed to align s.
1406 __ cbz(r15, aligned);
1407 int shift = exact_log2(granularity);
1408 if (shift > 0) {
1409 __ lsr(r15, r15, shift);
1410 }
1411 __ sub(count, count, r15);
1412
1413 #if 0
1414 // ?? This code is only correct for a disjoint copy. It may or
1415 // may not make sense to use it in that case.
1416
1417 // Copy the first pair; s and d may not be aligned.
1418 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1419 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1420
1421 // Align s and d, adjust count
1422 if (is_backwards) {
1423 __ sub(s, s, r15);
1424 __ sub(d, d, r15);
1425 } else {
1426 __ add(s, s, r15);
1427 __ add(d, d, r15);
1428 }
1429 #else
1430 copy_memory_small(decorators, type, s, d, r15, step);
1431 #endif
1432 }
1433
1434 __ bind(aligned);
1435
1436 // s is now 2-word-aligned.
1437
1438 // We have a count of units and some trailing bytes. Adjust the
1439 // count and do a bulk copy of words. If the shift is zero
1440 // perform a move instead to benefit from zero latency moves.
1441 int shift = exact_log2(wordSize/granularity);
1442 if (shift > 0) {
1443 __ lsr(r15, count, shift);
1444 } else {
1445 __ mov(r15, count);
1446 }
1447 if (direction == copy_forwards) {
1448 if (type != T_OBJECT) {
1449 __ bl(StubRoutines::aarch64::copy_byte_f());
1450 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1451 __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
1452 } else {
1453 __ bl(StubRoutines::aarch64::copy_oop_f());
1454 }
1455 } else {
1456 if (type != T_OBJECT) {
1457 __ bl(StubRoutines::aarch64::copy_byte_b());
1458 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1459 __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
1460 } else {
1461 __ bl(StubRoutines::aarch64::copy_oop_b());
1462 }
1463 }
1464
1465 // And the tail.
1466 copy_memory_small(decorators, type, s, d, count, step);
1467
1468 if (granularity >= 8) __ bind(copy8);
1469 if (granularity >= 4) __ bind(copy4);
1470 __ bind(finish);
1471 }
1472
1473
1474 void clobber_registers() {
1475 #ifdef ASSERT
1476 RegSet clobbered
1477 = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1478 __ mov(rscratch1, (uint64_t)0xdeadbeef);
1479 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1480 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1481 __ mov(*it, rscratch1);
1482 }
1483 #endif
1484
1485 }
1486
1487 // Scan over array at a for count oops, verifying each one.
1488 // Preserves a and count, clobbers rscratch1 and rscratch2.
1489 void verify_oop_array (int size, Register a, Register count, Register temp) {
1490 Label loop, end;
1491 __ mov(rscratch1, a);
1492 __ mov(rscratch2, zr);
1493 __ bind(loop);
1494 __ cmp(rscratch2, count);
1495 __ br(Assembler::HS, end);
1496 if (size == wordSize) {
1497 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1498 __ verify_oop(temp);
1499 } else {
1500 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1501 __ decode_heap_oop(temp); // calls verify_oop
1502 }
1503 __ add(rscratch2, rscratch2, 1);
1504 __ b(loop);
1505 __ bind(end);
1506 }
1507
1508 // Arguments:
1509 // stub_id - is used to name the stub and identify all details of
1510 // how to perform the copy.
1511 //
1512 // nopush_entry - is assigned the stub's post push entry point
1513 // unless it is null
1514 //
1515 // Inputs:
1516 // c_rarg0 - source array address
1517 // c_rarg1 - destination array address
1518 // c_rarg2 - element count, treated as ssize_t, can be zero
1519 //
1520 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1521 // the hardware handle it. The two dwords within qwords that span
1522 // cache line boundaries will still be loaded and stored atomically.
1523 //
1524 // Side Effects: nopush_entry is set to the (post push) entry point
1525 // so it can be used by the corresponding conjoint
1526 // copy method
1527 //
1528 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
1529 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1530 RegSet saved_reg = RegSet::of(s, d, count);
1531 int size;
1532 bool aligned;
1533 bool is_oop;
1534 bool dest_uninitialized;
1535 switch (stub_id) {
1536 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1537 size = sizeof(jbyte);
1538 aligned = false;
1539 is_oop = false;
1540 dest_uninitialized = false;
1541 break;
1542 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1543 size = sizeof(jbyte);
1544 aligned = true;
1545 is_oop = false;
1546 dest_uninitialized = false;
1547 break;
1548 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1549 size = sizeof(jshort);
1550 aligned = false;
1551 is_oop = false;
1552 dest_uninitialized = false;
1553 break;
1554 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1555 size = sizeof(jshort);
1556 aligned = true;
1557 is_oop = false;
1558 dest_uninitialized = false;
1559 break;
1560 case StubId::stubgen_jint_disjoint_arraycopy_id:
1561 size = sizeof(jint);
1562 aligned = false;
1563 is_oop = false;
1564 dest_uninitialized = false;
1565 break;
1566 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1567 size = sizeof(jint);
1568 aligned = true;
1569 is_oop = false;
1570 dest_uninitialized = false;
1571 break;
1572 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1573 // since this is always aligned we can (should!) use the same
1574 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
1575 ShouldNotReachHere();
1576 break;
1577 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1578 size = sizeof(jlong);
1579 aligned = true;
1580 is_oop = false;
1581 dest_uninitialized = false;
1582 break;
1583 case StubId::stubgen_oop_disjoint_arraycopy_id:
1584 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1585 aligned = !UseCompressedOops;
1586 is_oop = true;
1587 dest_uninitialized = false;
1588 break;
1589 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1590 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1591 aligned = !UseCompressedOops;
1592 is_oop = true;
1593 dest_uninitialized = false;
1594 break;
1595 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1596 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1597 aligned = !UseCompressedOops;
1598 is_oop = true;
1599 dest_uninitialized = true;
1600 break;
1601 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1602 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1603 aligned = !UseCompressedOops;
1604 is_oop = true;
1605 dest_uninitialized = true;
1606 break;
1607 default:
1608 ShouldNotReachHere();
1609 break;
1610 }
1611
1612 __ align(CodeEntryAlignment);
1613 StubCodeMark mark(this, stub_id);
1614 address start = __ pc();
1615 __ enter();
1616
1617 if (nopush_entry != nullptr) {
1618 *nopush_entry = __ pc();
1619 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1620 BLOCK_COMMENT("Entry:");
1621 }
1622
1623 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1624 if (dest_uninitialized) {
1625 decorators |= IS_DEST_UNINITIALIZED;
1626 }
1627 if (aligned) {
1628 decorators |= ARRAYCOPY_ALIGNED;
1629 }
1630
1631 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1632 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1633
1634 if (is_oop) {
1635 // save regs before copy_memory
1636 __ push(RegSet::of(d, count), sp);
1637 }
1638 {
1639 // UnsafeMemoryAccess page error: continue after unsafe access
1640 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1641 UnsafeMemoryAccessMark umam(this, add_entry, true);
1642 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1643 }
1644
1645 if (is_oop) {
1646 __ pop(RegSet::of(d, count), sp);
1647 if (VerifyOops)
1648 verify_oop_array(size, d, count, r16);
1649 }
1650
1651 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1652
1653 __ leave();
1654 __ mov(r0, zr); // return 0
1655 __ ret(lr);
1656 return start;
1657 }
1658
1659 // Arguments:
1660 // stub_id - is used to name the stub and identify all details of
1661 // how to perform the copy.
1662 //
1663 // nooverlap_target - identifies the (post push) entry for the
1664 // corresponding disjoint copy routine which can be
1665 // jumped to if the ranges do not actually overlap
1666 //
1667 // nopush_entry - is assigned the stub's post push entry point
1668 // unless it is null
1669 //
1670 //
1671 // Inputs:
1672 // c_rarg0 - source array address
1673 // c_rarg1 - destination array address
1674 // c_rarg2 - element count, treated as ssize_t, can be zero
1675 //
1676 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1677 // the hardware handle it. The two dwords within qwords that span
1678 // cache line boundaries will still be loaded and stored atomically.
1679 //
1680 // Side Effects:
1681 // nopush_entry is set to the post-push entry point so it can be
1682 // used as a fallback target by the unsafe and generic copy stubs
1683 //
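// A rough C-like sketch of the dispatch this stub performs (illustrative
// only; names refer to the registers and arguments below):
//
//   if ((uintptr_t)(d - s) >= ((uintptr_t)count << exact_log2(size))) {
//     // a forward copy cannot overwrite unread source data,
//     // so tail-call the disjoint (forward) stub
//     goto nooverlap_target;
//   }
//   // otherwise fall through and copy from high to low addresses
//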
1684 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1685 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1686 RegSet saved_regs = RegSet::of(s, d, count);
1687 int size;
1688 bool aligned;
1689 bool is_oop;
1690 bool dest_uninitialized;
1691 switch (stub_id) {
1692 case StubId::stubgen_jbyte_arraycopy_id:
1693 size = sizeof(jbyte);
1694 aligned = false;
1695 is_oop = false;
1696 dest_uninitialized = false;
1697 break;
1698 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1699 size = sizeof(jbyte);
1700 aligned = true;
1701 is_oop = false;
1702 dest_uninitialized = false;
1703 break;
1704 case StubId::stubgen_jshort_arraycopy_id:
1705 size = sizeof(jshort);
1706 aligned = false;
1707 is_oop = false;
1708 dest_uninitialized = false;
1709 break;
1710 case StubId::stubgen_arrayof_jshort_arraycopy_id:
1711 size = sizeof(jshort);
1712 aligned = true;
1713 is_oop = false;
1714 dest_uninitialized = false;
1715 break;
1716 case StubId::stubgen_jint_arraycopy_id:
1717 size = sizeof(jint);
1718 aligned = false;
1719 is_oop = false;
1720 dest_uninitialized = false;
1721 break;
1722 case StubId::stubgen_arrayof_jint_arraycopy_id:
1723 size = sizeof(jint);
1724 aligned = true;
1725 is_oop = false;
1726 dest_uninitialized = false;
1727 break;
1728 case StubId::stubgen_jlong_arraycopy_id:
1729 // since this is always aligned we can (should!) use the same
1730 // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
1731 ShouldNotReachHere();
1732 break;
1733 case StubId::stubgen_arrayof_jlong_arraycopy_id:
1734 size = sizeof(jlong);
1735 aligned = true;
1736 is_oop = false;
1737 dest_uninitialized = false;
1738 break;
1739 case StubId::stubgen_oop_arraycopy_id:
1740 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1741 aligned = !UseCompressedOops;
1742 is_oop = true;
1743 dest_uninitialized = false;
1744 break;
1745 case StubId::stubgen_arrayof_oop_arraycopy_id:
1746 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1747 aligned = !UseCompressedOops;
1748 is_oop = true;
1749 dest_uninitialized = false;
1750 break;
1751 case StubId::stubgen_oop_arraycopy_uninit_id:
1752 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1753 aligned = !UseCompressedOops;
1754 is_oop = true;
1755 dest_uninitialized = true;
1756 break;
1757 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1758 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1759 aligned = !UseCompressedOops;
1760 is_oop = true;
1761 dest_uninitialized = true;
1762 break;
1763 default:
1764 ShouldNotReachHere();
1765 }
1766
1767 StubCodeMark mark(this, stub_id);
1768 address start = __ pc();
1769 __ enter();
1770
1771 if (nopush_entry != nullptr) {
1772 *nopush_entry = __ pc();
1773 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1774 BLOCK_COMMENT("Entry:");
1775 }
1776
1777 // use fwd copy when (d-s) above_equal (count*size)
1778 Label L_overlapping;
1779 __ sub(rscratch1, d, s);
1780 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1781 __ br(Assembler::LO, L_overlapping);
1782 __ b(RuntimeAddress(nooverlap_target));
1783 __ bind(L_overlapping);
1784
1785 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1786 if (dest_uninitialized) {
1787 decorators |= IS_DEST_UNINITIALIZED;
1788 }
1789 if (aligned) {
1790 decorators |= ARRAYCOPY_ALIGNED;
1791 }
1792
1793 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1794 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1795
1796 if (is_oop) {
1797 // save regs before copy_memory
1798 __ push(RegSet::of(d, count), sp);
1799 }
1800 {
1801 // UnsafeMemoryAccess page error: continue after unsafe access
1802 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1803 UnsafeMemoryAccessMark umam(this, add_entry, true);
1804 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1805 }
1806 if (is_oop) {
1807 __ pop(RegSet::of(d, count), sp);
1808 if (VerifyOops)
1809 verify_oop_array(size, d, count, r16);
1810 }
1811 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1812 __ leave();
1813 __ mov(r0, zr); // return 0
1814 __ ret(lr);
1815 return start;
1816 }
1817
1818 // Helper for generating a dynamic type check.
1819 // Smashes rscratch1, rscratch2.
1820 void generate_type_check(Register sub_klass,
1821 Register super_check_offset,
1822 Register super_klass,
1823 Register temp1,
1824 Register temp2,
1825 Register result,
1826 Label& L_success) {
1827 assert_different_registers(sub_klass, super_check_offset, super_klass);
1828
1829 BLOCK_COMMENT("type_check:");
1830
1831 Label L_miss;
1832
1833 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
1834 super_check_offset);
1835 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
1836
1837 // Fall through on failure!
1838 __ BIND(L_miss);
1839 }
1840
1841 //
1842 // Generate checkcasting array copy stub
1843 //
1844 // Input:
1845 // c_rarg0 - source array address
1846 // c_rarg1 - destination array address
1847 // c_rarg2 - element count, treated as ssize_t, can be zero
1848 // c_rarg3 - size_t ckoff (super_check_offset)
1849 // c_rarg4 - oop ckval (super_klass)
1850 //
1851 // Output:
1852 // r0 == 0 - success
1853 // r0 == -1^K - failure, where K is partial transfer count
1854 //
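// Worked example of the failure encoding (illustrative values): if 3 of
// 10 elements are stored before a type check fails, then K == 3 and the
// stub returns -1 ^ 3 == ~3 == -4; the caller recovers K as ~r0.
//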
1855 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
1856 bool dest_uninitialized;
1857 switch (stub_id) {
1858 case StubId::stubgen_checkcast_arraycopy_id:
1859 dest_uninitialized = false;
1860 break;
1861 case StubId::stubgen_checkcast_arraycopy_uninit_id:
1862 dest_uninitialized = true;
1863 break;
1864 default:
1865 ShouldNotReachHere();
1866 }
1867
1868 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1869
1870 // Input registers (after setup_arg_regs)
1871 const Register from = c_rarg0; // source array address
1872 const Register to = c_rarg1; // destination array address
1873 const Register count = c_rarg2; // elements count
1874 const Register ckoff = c_rarg3; // super_check_offset
1875 const Register ckval = c_rarg4; // super_klass
1876
1877 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1878
1879 // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1880 const Register copied_oop = r22; // actual oop copied
1881 const Register count_save = r21; // orig elements count
1882 const Register start_to = r20; // destination array start address
1883 const Register r19_klass = r19; // oop._klass
1884
1885 // Registers used as gc temps (r5, r6, r7 are save-on-call)
1886 const Register gct1 = r5, gct2 = r6, gct3 = r7;
1887
1888 //---------------------------------------------------------------
1889 // Assembler stub will be used for this call to arraycopy
1890 // if the two arrays are subtypes of Object[] but the
1891 // destination array type is not equal to or a supertype
1892 // of the source type. Each element must be separately
1893 // checked.
1894
1895 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1896 copied_oop, r19_klass, count_save);
1897
1898 __ align(CodeEntryAlignment);
1899 StubCodeMark mark(this, stub_id);
1900 address start = __ pc();
1901
1902 __ enter(); // required for proper stackwalking of RuntimeStub frame
1903
1904 #ifdef ASSERT
1905 // caller guarantees that the arrays really are different
1906 // otherwise, we would have to make conjoint checks
1907 { Label L;
1908 __ b(L); // conjoint check not yet implemented
1909 __ stop("checkcast_copy within a single array");
1910 __ bind(L);
1911 }
1912 #endif //ASSERT
1913
1914 // Caller of this entry point must set up the argument registers.
1915 if (nopush_entry != nullptr) {
1916 *nopush_entry = __ pc();
1917 BLOCK_COMMENT("Entry:");
1918 }
1919
1920 // Empty array: Nothing to do.
1921 __ cbz(count, L_done);
1922 __ push(RegSet::of(r19, r20, r21, r22), sp);
1923
1924 #ifdef ASSERT
1925 BLOCK_COMMENT("assert consistent ckoff/ckval");
1926 // The ckoff and ckval must be mutually consistent,
1927 // even though caller generates both.
1928 { Label L;
1929 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1930 __ ldrw(start_to, Address(ckval, sco_offset));
1931 __ cmpw(ckoff, start_to);
1932 __ br(Assembler::EQ, L);
1933 __ stop("super_check_offset inconsistent");
1934 __ bind(L);
1935 }
1936 #endif //ASSERT
1937
1938 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1939 bool is_oop = true;
1940 int element_size = UseCompressedOops ? 4 : 8;
1941 if (dest_uninitialized) {
1942 decorators |= IS_DEST_UNINITIALIZED;
1943 }
1944
1945 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1946 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1947
1948 // save the original count
1949 __ mov(count_save, count);
1950
1951 // Copy from low to high addresses
1952 __ mov(start_to, to); // Save destination array start address
1953 __ b(L_load_element);
1954
1955 // ======== begin loop ========
1956 // (Loop is rotated; its entry is L_load_element.)
1957 // Loop control:
1958 // for (; count != 0; count--) {
1959 // copied_oop = load_heap_oop(from++);
1960 // ... generate_type_check ...;
1961 // store_heap_oop(to++, copied_oop);
1962 // }
1963 __ align(OptoLoopAlignment);
1964
1965 __ BIND(L_store_element);
1966 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1967 __ post(to, element_size), copied_oop, noreg,
1968 gct1, gct2, gct3);
1969 __ sub(count, count, 1);
1970 __ cbz(count, L_do_card_marks);
1971
1972 // ======== loop entry is here ========
1973 __ BIND(L_load_element);
1974 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1975 copied_oop, noreg, __ post(from, element_size),
1976 gct1);
1977 __ cbz(copied_oop, L_store_element);
1978
1979 __ load_klass(r19_klass, copied_oop);// query the object klass
1980
1981 BLOCK_COMMENT("type_check:");
1982 generate_type_check(/*sub_klass*/r19_klass,
1983 /*super_check_offset*/ckoff,
1984 /*super_klass*/ckval,
1985 /*r_array_base*/gct1,
1986 /*temp2*/gct2,
1987 /*result*/r10, L_store_element);
1988
1989 // Fall through on failure!
1990
1991 // ======== end loop ========
1992
1993 // It was a real error; we must depend on the caller to finish the job.
1994 // Register count = remaining oops, count_save = total oops.
1995 // Emit GC store barriers for the oops we have copied and report
1996 // their number to the caller.
1997
1998 __ subs(count, count_save, count); // K = partially copied oop count
1999 __ eon(count, count, zr); // report (-1^K) to caller
2000 __ br(Assembler::EQ, L_done_pop);
2001
2002 __ BIND(L_do_card_marks);
2003 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
2004
2005 __ bind(L_done_pop);
2006 __ pop(RegSet::of(r19, r20, r21, r22), sp);
2007 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2008
2009 __ bind(L_done);
2010 __ mov(r0, count);
2011 __ leave();
2012 __ ret(lr);
2013
2014 return start;
2015 }
2016
2017 // Perform range checks on the proposed arraycopy.
2018 // Kills temp, but nothing else.
2019 // Also, clean the sign bits of src_pos and dst_pos.
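// Worked example (illustrative values): with src.length == 10,
// src_pos == 7 and length == 5, src_pos + length == 12 > 10, so we
// branch to L_failed. The movw(reg, reg) instructions below clear the
// upper 32 bits because writing a W register zero-extends on AArch64.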
2020 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2021 Register src_pos, // source position (c_rarg1)
2022 Register dst, // destination array oop (c_rarg2)
2023 Register dst_pos, // destination position (c_rarg3)
2024 Register length,
2025 Register temp,
2026 Label& L_failed) {
2027 BLOCK_COMMENT("arraycopy_range_checks:");
2028
2029 assert_different_registers(rscratch1, temp);
2030
2031 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2032 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2033 __ addw(temp, length, src_pos);
2034 __ cmpw(temp, rscratch1);
2035 __ br(Assembler::HI, L_failed);
2036
2037 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2038 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2039 __ addw(temp, length, dst_pos);
2040 __ cmpw(temp, rscratch1);
2041 __ br(Assembler::HI, L_failed);
2042
2043 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2044 __ movw(src_pos, src_pos);
2045 __ movw(dst_pos, dst_pos);
2046
2047 BLOCK_COMMENT("arraycopy_range_checks done");
2048 }
2049
2050 // These stubs get called from some dumb test routine.
2051 // I'll write them properly when they're called from
2052 // something that's actually doing something.
2053 static void fake_arraycopy_stub(address src, address dst, int count) {
2054 assert(count == 0, "huh?");
2055 }
2056
2057
2058 //
2059 // Generate 'unsafe' array copy stub
2060 // Though just as safe as the other stubs, it takes an unscaled
2061 // size_t argument instead of an element count.
2062 //
2063 // Input:
2064 // c_rarg0 - source array address
2065 // c_rarg1 - destination array address
2066 // c_rarg2 - byte count, treated as ssize_t, can be zero
2067 //
2068 // Examines the alignment of the operands and dispatches
2069 // to a long, int, short, or byte copy loop.
2070 //
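// Sketch of the dispatch below (C-like, illustrative only):
//
//   int bits = (s | d | count) & (BytesPerLong - 1);
//   if (bits == 0)                       { count >>= 3; goto long_copy;  }
//   if ((bits & (BytesPerInt - 1)) == 0) { count >>= 2; goto int_copy;   }
//   if ((bits & 1) == 0)                 { count >>= 1; goto short_copy; }
//   goto byte_copy;
//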
2071 address generate_unsafe_copy(address byte_copy_entry,
2072 address short_copy_entry,
2073 address int_copy_entry,
2074 address long_copy_entry) {
2075 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2076
2077 Label L_long_aligned, L_int_aligned, L_short_aligned;
2078 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2079
2080 __ align(CodeEntryAlignment);
2081 StubCodeMark mark(this, stub_id);
2082 address start = __ pc();
2083 __ enter(); // required for proper stackwalking of RuntimeStub frame
2084
2085 // bump this on entry, not on exit:
2086 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2087
2088 __ orr(rscratch1, s, d);
2089 __ orr(rscratch1, rscratch1, count);
2090
2091 __ andr(rscratch1, rscratch1, BytesPerLong-1);
2092 __ cbz(rscratch1, L_long_aligned);
2093 __ andr(rscratch1, rscratch1, BytesPerInt-1);
2094 __ cbz(rscratch1, L_int_aligned);
2095 __ tbz(rscratch1, 0, L_short_aligned);
2096 __ b(RuntimeAddress(byte_copy_entry));
2097
2098 __ BIND(L_short_aligned);
2099 __ lsr(count, count, LogBytesPerShort); // size => short_count
2100 __ b(RuntimeAddress(short_copy_entry));
2101 __ BIND(L_int_aligned);
2102 __ lsr(count, count, LogBytesPerInt); // size => int_count
2103 __ b(RuntimeAddress(int_copy_entry));
2104 __ BIND(L_long_aligned);
2105 __ lsr(count, count, LogBytesPerLong); // size => long_count
2106 __ b(RuntimeAddress(long_copy_entry));
2107
2108 return start;
2109 }
2110
2111 //
2112 // Generate generic array copy stubs
2113 //
2114 // Input:
2115 // c_rarg0 - src oop
2116 // c_rarg1 - src_pos (32-bits)
2117 // c_rarg2 - dst oop
2118 // c_rarg3 - dst_pos (32-bits)
2119 // c_rarg4 - element count (32-bits)
2120 //
2121 // Output:
2122 // r0 == 0 - success
2123 // r0 == -1^K - failure, where K is partial transfer count
2124 //
2125 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2126 address int_copy_entry, address oop_copy_entry,
2127 address long_copy_entry, address checkcast_copy_entry) {
2128 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2129
2130 Label L_failed, L_objArray;
2131 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2132
2133 // Input registers
2134 const Register src = c_rarg0; // source array oop
2135 const Register src_pos = c_rarg1; // source position
2136 const Register dst = c_rarg2; // destination array oop
2137 const Register dst_pos = c_rarg3; // destination position
2138 const Register length = c_rarg4;
2139
2140
2141 // Registers used as temps
2142 const Register dst_klass = c_rarg5;
2143
2144 __ align(CodeEntryAlignment);
2145
2146 StubCodeMark mark(this, stub_id);
2147
2148 address start = __ pc();
2149
2150 __ enter(); // required for proper stackwalking of RuntimeStub frame
2151
2152 // bump this on entry, not on exit:
2153 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2154
2155 //-----------------------------------------------------------------------
2156 // Assembler stub will be used for this call to arraycopy
2157 // if the following conditions are met:
2158 //
2159 // (1) src and dst must not be null.
2160 // (2) src_pos must not be negative.
2161 // (3) dst_pos must not be negative.
2162 // (4) length must not be negative.
2163 // (5) src klass and dst klass should be the same and not null.
2164 // (6) src and dst should be arrays.
2165 // (7) src_pos + length must not exceed length of src.
2166 // (8) dst_pos + length must not exceed length of dst.
2167 //
2168
2169 // if (src == nullptr) return -1;
2170 __ cbz(src, L_failed);
2171
2172 // if (src_pos < 0) return -1;
2173 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2174
2175 // if (dst == nullptr) return -1;
2176 __ cbz(dst, L_failed);
2177
2178 // if (dst_pos < 0) return -1;
2179 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2180
2181 // registers used as temp
2182 const Register scratch_length = r16; // elements count to copy
2183 const Register scratch_src_klass = r17; // array klass
2184 const Register lh = r15; // layout helper
2185
2186 // if (length < 0) return -1;
2187 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2188 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2189
2190 __ load_klass(scratch_src_klass, src);
2191 #ifdef ASSERT
2192 // assert(src->klass() != nullptr);
2193 {
2194 BLOCK_COMMENT("assert klasses not null {");
2195 Label L1, L2;
2196 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2197 __ bind(L1);
2198 __ stop("broken null klass");
2199 __ bind(L2);
2200 __ load_klass(rscratch1, dst);
2201 __ cbz(rscratch1, L1); // this would be broken also
2202 BLOCK_COMMENT("} assert klasses not null done");
2203 }
2204 #endif
2205
2206 // Load layout helper (32-bits)
2207 //
2208 //  |array_tag|     | header_size | element_type |     |log2_element_size|
2209 // 32        30    24            16              8     2                 0
2210 //
2211 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2212 //
2213
2214 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2215
2216 // Handle objArrays completely differently...
2217 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2218 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2219 __ movw(rscratch1, objArray_lh);
2220 __ eorw(rscratch2, lh, rscratch1);
2221 __ cbzw(rscratch2, L_objArray);
2222
2223 // if (src->klass() != dst->klass()) return -1;
2224 __ load_klass(rscratch2, dst);
2225 __ eor(rscratch2, rscratch2, scratch_src_klass);
2226 __ cbnz(rscratch2, L_failed);
2227
2228 // if (!src->is_Array()) return -1;
2229 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2230
2231 // At this point, it is known to be a typeArray (array_tag 0x3).
2232 #ifdef ASSERT
2233 {
2234 BLOCK_COMMENT("assert primitive array {");
2235 Label L;
2236 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2237 __ cmpw(lh, rscratch2);
2238 __ br(Assembler::GE, L);
2239 __ stop("must be a primitive array");
2240 __ bind(L);
2241 BLOCK_COMMENT("} assert primitive array done");
2242 }
2243 #endif
2244
2245 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2246 rscratch2, L_failed);
2247
2248 // TypeArrayKlass
2249 //
2250 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2251 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2252 //
2253
2254 const Register rscratch1_offset = rscratch1; // array offset
2255 const Register r15_elsize = lh; // element size
2256
2257 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2258 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2259 __ add(src, src, rscratch1_offset); // src array offset
2260 __ add(dst, dst, rscratch1_offset); // dst array offset
2261 BLOCK_COMMENT("choose copy loop based on element size");
2262
2263 // next registers should be set before the jump to corresponding stub
2264 const Register from = c_rarg0; // source array address
2265 const Register to = c_rarg1; // destination array address
2266 const Register count = c_rarg2; // elements count
2267
2268 // 'from', 'to' and 'count' must be assigned in this order: they alias
2269 // 'src', 'src_pos' and 'dst', so any other order clobbers needed inputs.
2270
2271 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2272
2273 // The possible values of elsize are 0-3, i.e. exact_log2(element
2274 // size in bytes). We do a simple bitwise binary search.
2275 __ BIND(L_copy_bytes);
2276 __ tbnz(r15_elsize, 1, L_copy_ints);
2277 __ tbnz(r15_elsize, 0, L_copy_shorts);
2278 __ lea(from, Address(src, src_pos));// src_addr
2279 __ lea(to, Address(dst, dst_pos));// dst_addr
2280 __ movw(count, scratch_length); // length
2281 __ b(RuntimeAddress(byte_copy_entry));
2282
2283 __ BIND(L_copy_shorts);
2284 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2285 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2286 __ movw(count, scratch_length); // length
2287 __ b(RuntimeAddress(short_copy_entry));
2288
2289 __ BIND(L_copy_ints);
2290 __ tbnz(r15_elsize, 0, L_copy_longs);
2291 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2292 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2293 __ movw(count, scratch_length); // length
2294 __ b(RuntimeAddress(int_copy_entry));
2295
2296 __ BIND(L_copy_longs);
2297 #ifdef ASSERT
2298 {
2299 BLOCK_COMMENT("assert long copy {");
2300 Label L;
2301 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2302 __ cmpw(r15_elsize, LogBytesPerLong);
2303 __ br(Assembler::EQ, L);
2304 __ stop("must be long copy, but elsize is wrong");
2305 __ bind(L);
2306 BLOCK_COMMENT("} assert long copy done");
2307 }
2308 #endif
2309 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2310 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2311 __ movw(count, scratch_length); // length
2312 __ b(RuntimeAddress(long_copy_entry));
2313
2314 // ObjArrayKlass
2315 __ BIND(L_objArray);
2316 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2317
2318 Label L_plain_copy, L_checkcast_copy;
2319 // test array classes for subtyping
2320 __ load_klass(r15, dst);
2321 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2322 __ br(Assembler::NE, L_checkcast_copy);
2323
2324 // Identically typed arrays can be copied without element-wise checks.
2325 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2326 rscratch2, L_failed);
2327
2328 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2329 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2330 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2331 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2332 __ movw(count, scratch_length); // length
2333 __ BIND(L_plain_copy);
2334 __ b(RuntimeAddress(oop_copy_entry));
2335
2336 __ BIND(L_checkcast_copy);
2337 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2338 {
2339 // Before looking at dst.length, make sure dst is also an objArray.
2340 __ ldrw(rscratch1, Address(r15, lh_offset));
2341 __ movw(rscratch2, objArray_lh);
2342 __ eorw(rscratch1, rscratch1, rscratch2);
2343 __ cbnzw(rscratch1, L_failed);
2344
2345 // It is safe to examine both src.length and dst.length.
2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2347 r15, L_failed);
2348
2349 __ load_klass(dst_klass, dst); // reload
2350
2351 // Marshal the base address arguments now, freeing registers.
2352 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2353 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2354 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2355 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2356 __ movw(count, length); // length (reloaded)
2357 Register sco_temp = c_rarg3; // this register is free now
2358 assert_different_registers(from, to, count, sco_temp,
2359 dst_klass, scratch_src_klass);
2360 // assert_clean_int(count, sco_temp);
2361
2362 // Generate the type check.
2363 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2364 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2365
2366 // Smashes rscratch1, rscratch2
2367 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2368 L_plain_copy);
2369
2370 // Fetch destination element klass from the ObjArrayKlass header.
2371 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2372 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2373 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2374
2375 // the checkcast_copy loop needs two extra arguments:
2376 assert(c_rarg3 == sco_temp, "#3 already in place");
2377 // Set up arguments for checkcast_copy_entry.
2378 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2379 __ b(RuntimeAddress(checkcast_copy_entry));
2380 }
2381
2382 __ BIND(L_failed);
2383 __ mov(r0, -1);
2384 __ leave(); // required for proper stackwalking of RuntimeStub frame
2385 __ ret(lr);
2386
2387 return start;
2388 }
2389
2390 //
2391 // Generate stub for array fill. If "aligned" is true, the
2392 // "to" address is assumed to be heapword aligned.
2393 //
2394 // Arguments for generated stub:
2395 // to: c_rarg0
2396 // value: c_rarg1
2397 // count: c_rarg2 treated as signed
2398 //
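// For the word-fill path below, the fill value is widened by repeated
// bit-field inserts so one 64-bit store writes 8 bytes at a time. A
// worked example for T_BYTE with value 0xAB (illustration only):
//
//   0xAB -> 0xABAB -> 0xABABABAB -> 0xABABABABABABABAB
//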
2399 address generate_fill(StubId stub_id) {
2400 BasicType t;
2401 bool aligned;
2402
2403 switch (stub_id) {
2404 case StubId::stubgen_jbyte_fill_id:
2405 t = T_BYTE;
2406 aligned = false;
2407 break;
2408 case StubId::stubgen_jshort_fill_id:
2409 t = T_SHORT;
2410 aligned = false;
2411 break;
2412 case StubId::stubgen_jint_fill_id:
2413 t = T_INT;
2414 aligned = false;
2415 break;
2416 case StubId::stubgen_arrayof_jbyte_fill_id:
2417 t = T_BYTE;
2418 aligned = true;
2419 break;
2420 case StubId::stubgen_arrayof_jshort_fill_id:
2421 t = T_SHORT;
2422 aligned = true;
2423 break;
2424 case StubId::stubgen_arrayof_jint_fill_id:
2425 t = T_INT;
2426 aligned = true;
2427 break;
2428 default:
2429 ShouldNotReachHere();
2430 };
2431
2432 __ align(CodeEntryAlignment);
2433 StubCodeMark mark(this, stub_id);
2434 address start = __ pc();
2435
2436 BLOCK_COMMENT("Entry:");
2437
2438 const Register to = c_rarg0; // destination array address
2439 const Register value = c_rarg1; // value
2440 const Register count = c_rarg2; // elements count
2441
2442 const Register bz_base = r10; // base for block_zero routine
2443 const Register cnt_words = r11; // temp register
2444
2445 __ enter();
2446
2447 Label L_fill_elements, L_exit1;
2448
2449 int shift = -1;
2450 switch (t) {
2451 case T_BYTE:
2452 shift = 0;
2453 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2454 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2455 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2456 __ br(Assembler::LO, L_fill_elements);
2457 break;
2458 case T_SHORT:
2459 shift = 1;
2460 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2461 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2462 __ br(Assembler::LO, L_fill_elements);
2463 break;
2464 case T_INT:
2465 shift = 2;
2466 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2467 __ br(Assembler::LO, L_fill_elements);
2468 break;
2469 default: ShouldNotReachHere();
2470 }
2471
2472 // Align source address at 8 bytes address boundary.
2473 Label L_skip_align1, L_skip_align2, L_skip_align4;
2474 if (!aligned) {
2475 switch (t) {
2476 case T_BYTE:
2477 // One byte misalignment happens only for byte arrays.
2478 __ tbz(to, 0, L_skip_align1);
2479 __ strb(value, Address(__ post(to, 1)));
2480 __ subw(count, count, 1);
2481 __ bind(L_skip_align1);
2482 // Fallthrough
2483 case T_SHORT:
2484 // Two bytes misalignment happens only for byte and short (char) arrays.
2485 __ tbz(to, 1, L_skip_align2);
2486 __ strh(value, Address(__ post(to, 2)));
2487 __ subw(count, count, 2 >> shift);
2488 __ bind(L_skip_align2);
2489 // Fallthrough
2490 case T_INT:
2491 // Align to 8 bytes, we know we are 4 byte aligned to start.
2492 __ tbz(to, 2, L_skip_align4);
2493 __ strw(value, Address(__ post(to, 4)));
2494 __ subw(count, count, 4 >> shift);
2495 __ bind(L_skip_align4);
2496 break;
2497 default: ShouldNotReachHere();
2498 }
2499 }
2500
2501 //
2502 // Fill large chunks
2503 //
2504 __ lsrw(cnt_words, count, 3 - shift); // number of words
2505 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2506 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2507 if (UseBlockZeroing) {
2508 Label non_block_zeroing, rest;
2509 // If the fill value is zero we can use the fast zero_words().
2510 __ cbnz(value, non_block_zeroing);
2511 __ mov(bz_base, to);
2512 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2513 address tpc = __ zero_words(bz_base, cnt_words);
2514 if (tpc == nullptr) {
2515 fatal("CodeCache is full at generate_fill");
2516 }
2517 __ b(rest);
2518 __ bind(non_block_zeroing);
2519 __ fill_words(to, cnt_words, value);
2520 __ bind(rest);
2521 } else {
2522 __ fill_words(to, cnt_words, value);
2523 }
2524
2525 // Remaining count is less than 8 bytes. Fill it by a single store.
2526 // Note that the total length is no less than 8 bytes.
2527 if (t == T_BYTE || t == T_SHORT) {
2528 Label L_exit1;
2529 __ cbzw(count, L_exit1);
2530 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2531 __ str(value, Address(to, -8)); // overwrite some elements
2532 __ bind(L_exit1);
2533 __ leave();
2534 __ ret(lr);
2535 }
2536
2537 // Handle copies less than 8 bytes.
2538 Label L_fill_2, L_fill_4, L_exit2;
2539 __ bind(L_fill_elements);
2540 switch (t) {
2541 case T_BYTE:
2542 __ tbz(count, 0, L_fill_2);
2543 __ strb(value, Address(__ post(to, 1)));
2544 __ bind(L_fill_2);
2545 __ tbz(count, 1, L_fill_4);
2546 __ strh(value, Address(__ post(to, 2)));
2547 __ bind(L_fill_4);
2548 __ tbz(count, 2, L_exit2);
2549 __ strw(value, Address(to));
2550 break;
2551 case T_SHORT:
2552 __ tbz(count, 0, L_fill_4);
2553 __ strh(value, Address(__ post(to, 2)));
2554 __ bind(L_fill_4);
2555 __ tbz(count, 1, L_exit2);
2556 __ strw(value, Address(to));
2557 break;
2558 case T_INT:
2559 __ cbzw(count, L_exit2);
2560 __ strw(value, Address(to));
2561 break;
2562 default: ShouldNotReachHere();
2563 }
2564 __ bind(L_exit2);
2565 __ leave();
2566 __ ret(lr);
2567 return start;
2568 }
2569
2570 address generate_unsafecopy_common_error_exit() {
2571 address start_pc = __ pc();
2572 __ leave();
2573 __ mov(r0, 0);
2574 __ ret(lr);
2575 return start_pc;
2576 }
2577
2578 //
2579 // Generate 'unsafe' set memory stub
2580 // Though just as safe as the other stubs, it takes an unscaled
2581 // size_t (# bytes) argument instead of an element count.
2582 //
2583 // This fill operation is atomicity preserving: as long as the
2584 // address supplied is sufficiently aligned, all writes of up to 64
2585 // bits in size are single-copy atomic.
2586 //
2587 // Input:
2588 // c_rarg0 - destination array address
2589 // c_rarg1 - byte count (size_t)
2590 // c_rarg2 - byte value
2591 //
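// Rough outline of the strategy below (assuming AvoidUnalignedAccesses):
// replicate the byte value across a 128-bit register, emit one possibly
// unaligned 16-byte store and round 'dest' up to a 16-byte boundary,
// fill 64 bytes per iteration with paired 16-byte stores, then finish
// the tail with 32/16/8/4/2/1-byte stores selected by the low bits of
// 'count'.
//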
2592 address generate_unsafe_setmemory() {
2593 __ align(CodeEntryAlignment);
2594 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
2595 address start = __ pc();
2596
2597 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
2598 Label tail;
2599
2600 UnsafeMemoryAccessMark umam(this, true, false);
2601
2602 __ enter(); // required for proper stackwalking of RuntimeStub frame
2603
2604 __ dup(v0, __ T16B, value);
2605
2606 if (AvoidUnalignedAccesses) {
2607 __ cmp(count, (u1)16);
2608 __ br(__ LO, tail);
2609
2610 __ mov(rscratch1, 16);
2611 __ andr(rscratch2, dest, 15);
2612 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
2613 __ strq(v0, Address(dest));
2614 __ sub(count, count, rscratch1);
2615 __ add(dest, dest, rscratch1);
2616 }
2617
2618 __ subs(count, count, (u1)64);
2619 __ br(__ LO, tail);
2620 {
2621 Label again;
2622 __ bind(again);
2623 __ stpq(v0, v0, Address(dest));
2624 __ stpq(v0, v0, Address(dest, 32));
2625
2626 __ subs(count, count, 64);
2627 __ add(dest, dest, 64);
2628 __ br(__ HS, again);
2629 }
2630
2631 __ bind(tail);
2632 // The count of bytes is off by 64, but we don't need to correct
2633 // it because we're only going to use the least-significant few
2634 // count bits from here on.
2635 // __ add(count, count, 64);
2636
2637 {
2638 Label dont;
2639 __ tbz(count, exact_log2(32), dont);
2640 __ stpq(v0, v0, __ post(dest, 32));
2641 __ bind(dont);
2642 }
2643 {
2644 Label dont;
2645 __ tbz(count, exact_log2(16), dont);
2646 __ strq(v0, __ post(dest, 16));
2647 __ bind(dont);
2648 }
2649 {
2650 Label dont;
2651 __ tbz(count, exact_log2(8), dont);
2652 __ strd(v0, __ post(dest, 8));
2653 __ bind(dont);
2654 }
2655
2656 Label finished;
2657 __ tst(count, 7);
2658 __ br(__ EQ, finished);
2659
2660 {
2661 Label dont;
2662 __ tbz(count, exact_log2(4), dont);
2663 __ strs(v0, __ post(dest, 4));
2664 __ bind(dont);
2665 }
2666 {
2667 Label dont;
2668 __ tbz(count, exact_log2(2), dont);
2669 __ bfi(value, value, 8, 8);
2670 __ strh(value, __ post(dest, 2));
2671 __ bind(dont);
2672 }
2673 {
2674 Label dont;
2675 __ tbz(count, exact_log2(1), dont);
2676 __ strb(value, Address(dest));
2677 __ bind(dont);
2678 }
2679
2680 __ bind(finished);
2681 __ leave();
2682 __ ret(lr);
2683
2684 return start;
2685 }
2686
2687 address generate_data_cache_writeback() {
2688 const Register line = c_rarg0; // address of line to write back
2689
2690 __ align(CodeEntryAlignment);
2691
2692 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
2693 StubCodeMark mark(this, stub_id);
2694
2695 address start = __ pc();
2696 __ enter();
2697 __ cache_wb(Address(line, 0));
2698 __ leave();
2699 __ ret(lr);
2700
2701 return start;
2702 }
2703
2704 address generate_data_cache_writeback_sync() {
2705 const Register is_pre = c_rarg0; // pre or post sync
2706
2707 __ align(CodeEntryAlignment);
2708
2709 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
2710 StubCodeMark mark(this, stub_id);
2711
2712 // pre wbsync is a no-op
2713 // post wbsync emits a memory barrier
2714
2715 Label skip;
2716 address start = __ pc();
2717 __ enter();
2718 __ cbnz(is_pre, skip);
2719 __ cache_wbsync(false);
2720 __ bind(skip);
2721 __ leave();
2722 __ ret(lr);
2723
2724 return start;
2725 }
2726
2727 void generate_arraycopy_stubs() {
2728 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
2729 // entry immediately following their stack push. This can be used
2730 // as a post-push branch target for compatible stubs when they
2731 // identify a special case that can be handled by the fallback
2732 // stub, e.g. a disjoint copy stub may be used as a special-case
2733 // fallback for its compatible conjoint copy stub.
2734 //
2735 // A nopush entry is always returned in the following local and
2736 // then published by assigning to the appropriate entry field in
2737 // class StubRoutines. The entry value is then passed to the
2738 // generator for the compatible stub. That means the entry must be
2739 // listed when saving to/restoring from the AOT cache, ensuring
2740 // that the inter-stub jumps are noted at AOT-cache save and
2741 // relocated at AOT cache load.
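// As a concrete illustration of this pattern, the jbyte stubs below are
// wired up roughly as follows:
//
//   _jbyte_disjoint_arraycopy        = generate_disjoint_copy(..., &nopush_entry);
//   _jbyte_disjoint_arraycopy_nopush = nopush_entry;                 // publish
//   _jbyte_arraycopy                 = generate_conjoint_copy(...,
//                                        _jbyte_disjoint_arraycopy_nopush,
//                                        &nopush_entry);             // consume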
2742 address nopush_entry;
2743
2744 // generate the common exit first so later stubs can rely on it if
2745 // they want an UnsafeMemoryAccess exit non-local to the stub
2746 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2747 // register the stub as the default exit with class UnsafeMemoryAccess
2748 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2749
2750 // generate and publish aarch64-specific bulk copy routines first
2751 // so we can call them from other copy stubs
2752 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2753 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2754
2755 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2756 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2757
2758 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2759 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2760
2761 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2762
2763 //*** jbyte
2764 // Always need aligned and unaligned versions
2765 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2766 // disjoint nopush entry is needed by conjoint copy
2767 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2768 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2769 // conjoint nopush entry is needed by generic/unsafe copy
2770 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2771 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2772 // disjoint arrayof nopush entry is needed by conjoint copy
2773 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2774 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2775
2776 //*** jshort
2777 // Always need aligned and unaligned versions
2778 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2779 // disjoint nopush entry is needed by conjoint copy
2780 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
2781 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
2782 // conjoint nopush entry is used by generic/unsafe copy
2783 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
2784 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
2785 // disjoint arrayof nopush entry is needed by conjoint copy
2786 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
2787 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
2788
2789 //*** jint
2790 // Aligned versions
2791 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
2792 // disjoint arrayof nopush entry is needed by conjoint copy
2793 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
2794 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
2795 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2796 // jint_arraycopy_nopush always points to the unaligned version
2797 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
2798 // disjoint nopush entry is needed by conjoint copy
2799 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
2800 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
2801 // conjoint nopush entry is needed by generic/unsafe copy
2802 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
2803
2804 //*** jlong
2805 // It is always aligned
2806 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
2807 // disjoint arrayof nopush entry is needed by conjoint copy
2808 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
2809 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
2810 // conjoint nopush entry is needed by generic/unsafe copy
2811 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
2812 // disjoint normal/nopush and conjoint normal entries are not
2813 // generated since the arrayof versions are the same
2814 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2815 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
2816 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
2817
2818 //*** oops
2819 {
2820 StubRoutines::_arrayof_oop_disjoint_arraycopy
2821 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
2822 // disjoint arrayof nopush entry is needed by conjoint copy
2823 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
2824 StubRoutines::_arrayof_oop_arraycopy
2825 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
2826 // conjoint arrayof nopush entry is needed by generic/unsafe copy
2827 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
2828 // Aligned versions without pre-barriers
2829 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2830 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
2831 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
2832 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
2833 // note that we don't need a returned nopush entry because the
2834 // generic/unsafe copy does not cater for uninit arrays.
2835 StubRoutines::_arrayof_oop_arraycopy_uninit
2836 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
2837 }
2838
2839 // for oop copies reuse arrayof entries for non-arrayof cases
2840 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2841 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
2842 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
2843 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2844 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
2845 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
2846
2847 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
2848 // checkcast nopush entry is needed by generic copy
2849 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
2850 // note that we don't need a returned nopush entry because the
2851 // generic copy does not cater for uninit arrays.
2852 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
2853
2854 // unsafe arraycopy may fallback on conjoint stubs
2855 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
2856 StubRoutines::_jshort_arraycopy_nopush,
2857 StubRoutines::_jint_arraycopy_nopush,
2858 StubRoutines::_jlong_arraycopy_nopush);
2859
2860 // generic arraycopy may fallback on conjoint stubs
2861 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
2862 StubRoutines::_jshort_arraycopy_nopush,
2863 StubRoutines::_jint_arraycopy_nopush,
2864 StubRoutines::_oop_arraycopy_nopush,
2865 StubRoutines::_jlong_arraycopy_nopush,
2866 StubRoutines::_checkcast_arraycopy_nopush);
2867
2868 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
2869 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
2870 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
2871 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
2872 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
2873 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
2874 }
2875
2876 void generate_math_stubs() { Unimplemented(); }
2877
2878 // Arguments:
2879 //
2880 // Inputs:
2881 // c_rarg0 - source byte array address
2882 // c_rarg1 - destination byte array address
2883 // c_rarg2 - sessionKe (key) in little endian int array
2884 //
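// Background note (general AES fact, not specific to this file): the key
// array holds the expanded key schedule, so keylen is 44, 52 or 60 ints
// for AES-128, AES-192 or AES-256 respectively (Nr + 1 round keys of 4
// ints each, with Nr = 10, 12 or 14 rounds).
//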
2885 address generate_aescrypt_encryptBlock() {
2886 __ align(CodeEntryAlignment);
2887 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2888 StubCodeMark mark(this, stub_id);
2889
2890 const Register from = c_rarg0; // source array address
2891 const Register to = c_rarg1; // destination array address
2892 const Register key = c_rarg2; // key array address
2893 const Register keylen = rscratch1;
2894
2895 address start = __ pc();
2896 __ enter();
2897
2898 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2899
2900 __ aesenc_loadkeys(key, keylen);
2901 __ aesecb_encrypt(from, to, keylen);
2902
2903 __ mov(r0, 0);
2904
2905 __ leave();
2906 __ ret(lr);
2907
2908 return start;
2909 }
2910
2911 // Arguments:
2912 //
2913 // Inputs:
2914 // c_rarg0 - source byte array address
2915 // c_rarg1 - destination byte array address
2916 // c_rarg2 - sessionKd (key) in little endian int array
2917 //
2918 address generate_aescrypt_decryptBlock() {
2919 assert(UseAES, "need AES cryptographic extension support");
2920 __ align(CodeEntryAlignment);
2921 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2922 StubCodeMark mark(this, stub_id);
2923 Label L_doLast;
2924
2925 const Register from = c_rarg0; // source array address
2926 const Register to = c_rarg1; // destination array address
2927 const Register key = c_rarg2; // key array address
2928 const Register keylen = rscratch1;
2929
2930 address start = __ pc();
2931 __ enter(); // required for proper stackwalking of RuntimeStub frame
2932
2933 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2934
2935 __ aesecb_decrypt(from, to, key, keylen);
2936
2937 __ mov(r0, 0);
2938
2939 __ leave();
2940 __ ret(lr);
2941
2942 return start;
2943 }
2944
2945 // Arguments:
2946 //
2947 // Inputs:
2948 // c_rarg0 - source byte array address
2949 // c_rarg1 - destination byte array address
2950 // c_rarg2 - sessionKe (key) in little endian int array
2951 // c_rarg3 - r vector byte array address
2952 // c_rarg4 - input length
2953 //
2954 // Output:
2955 // x0 - input length
2956 //
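// For reference, CBC encryption computes C[i] = E_K(P[i] ^ C[i-1]) with
// C[-1] = IV. In the loop below the running C[i-1] lives in v0: it is
// XORed with each plaintext block before the AES rounds, and the final
// ciphertext block is stored back through rvec for the next invocation.
//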
2957 address generate_cipherBlockChaining_encryptAESCrypt() {
2958 assert(UseAES, "need AES cryptographic extension support");
2959 __ align(CodeEntryAlignment);
2960 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
2961 StubCodeMark mark(this, stub_id);
2962
2963 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2964
2965 const Register from = c_rarg0; // source array address
2966 const Register to = c_rarg1; // destination array address
2967 const Register key = c_rarg2; // key array address
2968 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
2969 // and left with the results of the last encryption block
2970 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
2971 const Register keylen = rscratch1;
2972
2973 address start = __ pc();
2974
2975 __ enter();
2976
2977 __ movw(rscratch2, len_reg);
2978
2979 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2980
2981 __ ld1(v0, __ T16B, rvec);
2982
2983 __ cmpw(keylen, 52);
2984 __ br(Assembler::CC, L_loadkeys_44);
2985 __ br(Assembler::EQ, L_loadkeys_52);
2986
2987 __ ld1(v17, v18, __ T16B, __ post(key, 32));
2988 __ rev32(v17, __ T16B, v17);
2989 __ rev32(v18, __ T16B, v18);
2990 __ BIND(L_loadkeys_52);
2991 __ ld1(v19, v20, __ T16B, __ post(key, 32));
2992 __ rev32(v19, __ T16B, v19);
2993 __ rev32(v20, __ T16B, v20);
2994 __ BIND(L_loadkeys_44);
2995 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2996 __ rev32(v21, __ T16B, v21);
2997 __ rev32(v22, __ T16B, v22);
2998 __ rev32(v23, __ T16B, v23);
2999 __ rev32(v24, __ T16B, v24);
3000 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3001 __ rev32(v25, __ T16B, v25);
3002 __ rev32(v26, __ T16B, v26);
3003 __ rev32(v27, __ T16B, v27);
3004 __ rev32(v28, __ T16B, v28);
3005 __ ld1(v29, v30, v31, __ T16B, key);
3006 __ rev32(v29, __ T16B, v29);
3007 __ rev32(v30, __ T16B, v30);
3008 __ rev32(v31, __ T16B, v31);
3009
3010 __ BIND(L_aes_loop);
3011 __ ld1(v1, __ T16B, __ post(from, 16));
3012 __ eor(v0, __ T16B, v0, v1);
3013
3014 __ br(Assembler::CC, L_rounds_44);
3015 __ br(Assembler::EQ, L_rounds_52);
3016
3017 __ aese(v0, v17); __ aesmc(v0, v0);
3018 __ aese(v0, v18); __ aesmc(v0, v0);
3019 __ BIND(L_rounds_52);
3020 __ aese(v0, v19); __ aesmc(v0, v0);
3021 __ aese(v0, v20); __ aesmc(v0, v0);
3022 __ BIND(L_rounds_44);
3023 __ aese(v0, v21); __ aesmc(v0, v0);
3024 __ aese(v0, v22); __ aesmc(v0, v0);
3025 __ aese(v0, v23); __ aesmc(v0, v0);
3026 __ aese(v0, v24); __ aesmc(v0, v0);
3027 __ aese(v0, v25); __ aesmc(v0, v0);
3028 __ aese(v0, v26); __ aesmc(v0, v0);
3029 __ aese(v0, v27); __ aesmc(v0, v0);
3030 __ aese(v0, v28); __ aesmc(v0, v0);
3031 __ aese(v0, v29); __ aesmc(v0, v0);
3032 __ aese(v0, v30);
3033 __ eor(v0, __ T16B, v0, v31);
3034
3035 __ st1(v0, __ T16B, __ post(to, 16));
3036
3037 __ subw(len_reg, len_reg, 16);
3038 __ cbnzw(len_reg, L_aes_loop);
3039
3040 __ st1(v0, __ T16B, rvec);
3041
3042 __ mov(r0, rscratch2);
3043
3044 __ leave();
3045 __ ret(lr);
3046
3047 return start;
3048 }
3049
3050 // Arguments:
3051 //
3052 // Inputs:
3053 // c_rarg0 - source byte array address
3054 // c_rarg1 - destination byte array address
3055 // c_rarg2 - sessionKd (key) in little endian int array
3056 // c_rarg3 - r vector byte array address
3057 // c_rarg4 - input length
3058 //
3059 // Output:
3060 // r0 - input length
3061 //
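// For reference, CBC decryption computes P[i] = D_K(C[i]) ^ C[i-1]. The
// loop below therefore keeps an unmodified copy of each incoming
// ciphertext block (v1, then v2) so it can serve as C[i-1] for the next
// block and be written back through rvec at the end.
//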
3062 address generate_cipherBlockChaining_decryptAESCrypt() {
3063 assert(UseAES, "need AES cryptographic extension support");
3064 __ align(CodeEntryAlignment);
3065 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
3066 StubCodeMark mark(this, stub_id);
3067
3068 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3069
3070 const Register from = c_rarg0; // source array address
3071 const Register to = c_rarg1; // destination array address
3072 const Register key = c_rarg2; // key array address
3073 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3074 // and left with the results of the last encryption block
3075 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3076 const Register keylen = rscratch1;
3077
3078 address start = __ pc();
3079
3080 __ enter();
3081
3082 __ movw(rscratch2, len_reg);
3083
3084 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3085
3086 __ ld1(v2, __ T16B, rvec);
3087
3088 __ ld1(v31, __ T16B, __ post(key, 16));
3089 __ rev32(v31, __ T16B, v31);
3090
3091 __ cmpw(keylen, 52);
3092 __ br(Assembler::CC, L_loadkeys_44);
3093 __ br(Assembler::EQ, L_loadkeys_52);
3094
3095 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3096 __ rev32(v17, __ T16B, v17);
3097 __ rev32(v18, __ T16B, v18);
3098 __ BIND(L_loadkeys_52);
3099 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3100 __ rev32(v19, __ T16B, v19);
3101 __ rev32(v20, __ T16B, v20);
3102 __ BIND(L_loadkeys_44);
3103 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3104 __ rev32(v21, __ T16B, v21);
3105 __ rev32(v22, __ T16B, v22);
3106 __ rev32(v23, __ T16B, v23);
3107 __ rev32(v24, __ T16B, v24);
3108 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3109 __ rev32(v25, __ T16B, v25);
3110 __ rev32(v26, __ T16B, v26);
3111 __ rev32(v27, __ T16B, v27);
3112 __ rev32(v28, __ T16B, v28);
3113 __ ld1(v29, v30, __ T16B, key);
3114 __ rev32(v29, __ T16B, v29);
3115 __ rev32(v30, __ T16B, v30);
3116
3117 __ BIND(L_aes_loop);
3118 __ ld1(v0, __ T16B, __ post(from, 16));
3119 __ orr(v1, __ T16B, v0, v0);
3120
3121 __ br(Assembler::CC, L_rounds_44);
3122 __ br(Assembler::EQ, L_rounds_52);
3123
3124 __ aesd(v0, v17); __ aesimc(v0, v0);
3125 __ aesd(v0, v18); __ aesimc(v0, v0);
3126 __ BIND(L_rounds_52);
3127 __ aesd(v0, v19); __ aesimc(v0, v0);
3128 __ aesd(v0, v20); __ aesimc(v0, v0);
3129 __ BIND(L_rounds_44);
3130 __ aesd(v0, v21); __ aesimc(v0, v0);
3131 __ aesd(v0, v22); __ aesimc(v0, v0);
3132 __ aesd(v0, v23); __ aesimc(v0, v0);
3133 __ aesd(v0, v24); __ aesimc(v0, v0);
3134 __ aesd(v0, v25); __ aesimc(v0, v0);
3135 __ aesd(v0, v26); __ aesimc(v0, v0);
3136 __ aesd(v0, v27); __ aesimc(v0, v0);
3137 __ aesd(v0, v28); __ aesimc(v0, v0);
3138 __ aesd(v0, v29); __ aesimc(v0, v0);
3139 __ aesd(v0, v30);
3140 __ eor(v0, __ T16B, v0, v31);
3141 __ eor(v0, __ T16B, v0, v2);
3142
3143 __ st1(v0, __ T16B, __ post(to, 16));
3144 __ orr(v2, __ T16B, v1, v1);
3145
3146 __ subw(len_reg, len_reg, 16);
3147 __ cbnzw(len_reg, L_aes_loop);
3148
3149 __ st1(v2, __ T16B, rvec);
3150
3151 __ mov(r0, rscratch2);
3152
3153 __ leave();
3154 __ ret(lr);
3155
3156 return start;
3157 }
3158
3159 // Big-endian 128-bit + 64-bit -> 128-bit addition.
3160 // Inputs: 128-bits. in is preserved.
3161 // The least-significant 64-bit word is in the upper dword of each vector.
3162 // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
3163 // Output: result
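// Worked example (illustrative values): with MSD:LSD =
// 0x0000000000000001:0xffffffffffffffff and inc = 1, the addv below
// wraps the LSD lane to 0, the unsigned compare (cm HI) flags the
// overflow lane with all ones, ext moves that mask into the MSD lane,
// and subv subtracts -1 there, giving
// 0x0000000000000002:0x0000000000000000.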
3164 void be_add_128_64(FloatRegister result, FloatRegister in,
3165 FloatRegister inc, FloatRegister tmp) {
3166 assert_different_registers(result, tmp, inc);
3167
3168 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
3169 // input
3170 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
3171 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
3172 // MSD == 0 (must be!) to LSD
3173 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
3174 }
3175
3176 // CTR AES crypt.
3177 // Arguments:
3178 //
3179 // Inputs:
3180 // c_rarg0 - source byte array address
3181 // c_rarg1 - destination byte array address
3182 // c_rarg2 - sessionKe (key) in little endian int array
3183 // c_rarg3 - counter vector byte array address
3184 // c_rarg4 - input length
3185 // c_rarg5 - saved encryptedCounter start
3186 // c_rarg6 - saved used length
3187 //
3188 // Output:
3189 // r0 - input length
3190 //
3191 address generate_counterMode_AESCrypt() {
3192 const Register in = c_rarg0;
3193 const Register out = c_rarg1;
3194 const Register key = c_rarg2;
3195 const Register counter = c_rarg3;
3196 const Register saved_len = c_rarg4, len = r10;
3197 const Register saved_encrypted_ctr = c_rarg5;
3198 const Register used_ptr = c_rarg6, used = r12;
3199
3200 const Register offset = r7;
3201 const Register keylen = r11;
3202
3203 const unsigned char block_size = 16;
3204 const int bulk_width = 4;
3205 // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3206 // performance with larger data sizes, but it also means that the
3207 // fast path isn't used until you have at least 8 blocks, and up
3208 // to 127 bytes of data will be executed on the slow path. For
3209 // that reason, and also so as not to blow away too much icache, 4
3210 // blocks seems like a sensible compromise.
3211
3212 // Algorithm:
3213 //
3214 // if (len == 0) {
3215 // goto DONE;
3216 // }
3217 // int result = len;
3218 // do {
3219 // if (used >= blockSize) {
3220 // if (len >= bulk_width * blockSize) {
3221 // CTR_large_block();
3222 // if (len == 0)
3223 // goto DONE;
3224 // }
3225 // for (;;) {
3226 // 16ByteVector v0 = counter;
3227 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3228 // used = 0;
3229 // if (len < blockSize)
3230 // break; /* goto NEXT */
3231 // 16ByteVector v1 = load16Bytes(in, offset);
3232 // v1 = v1 ^ encryptedCounter;
3233 // store16Bytes(out, offset);
3234 // used = blockSize;
3235 // offset += blockSize;
3236 // len -= blockSize;
3237 // if (len == 0)
3238 // goto DONE;
3239 // }
3240 // }
3241 // NEXT:
3242 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3243 // len--;
3244 // } while (len != 0);
3245 // DONE:
3246 // return result;
3247 //
3248 // CTR_large_block()
3249 // Wide bulk encryption of whole blocks.
3250
3251 __ align(CodeEntryAlignment);
3252 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
3253 StubCodeMark mark(this, stub_id);
3254 const address start = __ pc();
3255 __ enter();
3256
3257 Label DONE, CTR_large_block, large_block_return;
3258 __ ldrw(used, Address(used_ptr));
3259 __ cbzw(saved_len, DONE);
3260
3261 __ mov(len, saved_len);
3262 __ mov(offset, 0);
3263
3264 // Compute #rounds for AES based on the length of the key array
3265 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3266
3267 __ aesenc_loadkeys(key, keylen);
3268
3269 {
3270 Label L_CTR_loop, NEXT;
3271
3272 __ bind(L_CTR_loop);
3273
3274 __ cmp(used, block_size);
3275 __ br(__ LO, NEXT);
3276
3277 // Maybe we have a lot of data
3278 __ subsw(rscratch1, len, bulk_width * block_size);
3279 __ br(__ HS, CTR_large_block);
3280 __ BIND(large_block_return);
3281 __ cbzw(len, DONE);
3282
3283 // Setup the counter
3284 __ movi(v4, __ T4S, 0);
3285 __ movi(v5, __ T4S, 1);
3286 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3287
3288 // 128-bit big-endian increment
3289 __ ld1(v0, __ T16B, counter);
3290 __ rev64(v16, __ T16B, v0);
3291 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3292 __ rev64(v16, __ T16B, v16);
3293 __ st1(v16, __ T16B, counter);
3294 // Previous counter value is in v0
3295 // v4 contains { 0, 1 }
3296
3297 {
3298 // We have fewer than bulk_width blocks of data left. Encrypt
3299 // them one by one until there is less than a full block
3300 // remaining, being careful to save both the encrypted counter
3301 // and the counter.
3302
3303 Label inner_loop;
3304 __ bind(inner_loop);
3305 // Counter to encrypt is in v0
3306 __ aesecb_encrypt(noreg, noreg, keylen);
3307 __ st1(v0, __ T16B, saved_encrypted_ctr);
3308
3309 // Do we have a remaining full block?
3310
3311 __ mov(used, 0);
3312 __ cmp(len, block_size);
3313 __ br(__ LO, NEXT);
3314
3315 // Yes, we have a full block
3316 __ ldrq(v1, Address(in, offset));
3317 __ eor(v1, __ T16B, v1, v0);
3318 __ strq(v1, Address(out, offset));
3319 __ mov(used, block_size);
3320 __ add(offset, offset, block_size);
3321
3322 __ subw(len, len, block_size);
3323 __ cbzw(len, DONE);
3324
3325 // Increment the counter, store it back
3326 __ orr(v0, __ T16B, v16, v16);
3327 __ rev64(v16, __ T16B, v16);
3328 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3329 __ rev64(v16, __ T16B, v16);
3330 __ st1(v16, __ T16B, counter); // Save the incremented counter back
3331
3332 __ b(inner_loop);
3333 }
3334
3335 __ BIND(NEXT);
3336
3337 // Encrypt a single byte, and loop.
3338 // We expect this to be a rare event.
3339 __ ldrb(rscratch1, Address(in, offset));
3340 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3341 __ eor(rscratch1, rscratch1, rscratch2);
3342 __ strb(rscratch1, Address(out, offset));
3343 __ add(offset, offset, 1);
3344 __ add(used, used, 1);
3345       __ subw(len, len, 1);
3346 __ cbnzw(len, L_CTR_loop);
3347 }
3348
3349 __ bind(DONE);
3350 __ strw(used, Address(used_ptr));
3351 __ mov(r0, saved_len);
3352
3353 __ leave(); // required for proper stackwalking of RuntimeStub frame
3354 __ ret(lr);
3355
3356 // Bulk encryption
3357
3358     __ BIND(CTR_large_block);
3359 assert(bulk_width == 4 || bulk_width == 8, "must be");
3360
3361 if (bulk_width == 8) {
3362 __ sub(sp, sp, 4 * 16);
3363 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3364 }
3365 __ sub(sp, sp, 4 * 16);
3366 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3367 RegSet saved_regs = (RegSet::of(in, out, offset)
3368 + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3369 __ push(saved_regs, sp);
3370 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption
3371 __ add(in, in, offset);
3372 __ add(out, out, offset);
3373
3374 // Keys should already be loaded into the correct registers
3375
3376 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3377 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3378
3379 // AES/CTR loop
3380 {
3381 Label L_CTR_loop;
3382 __ BIND(L_CTR_loop);
3383
3384 // Setup the counters
3385 __ movi(v8, __ T4S, 0);
3386 __ movi(v9, __ T4S, 1);
3387 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3388
3389 for (int i = 0; i < bulk_width; i++) {
3390 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3391 __ rev64(v0_ofs, __ T16B, v16);
3392 be_add_128_64(v16, v16, v8, /*tmp*/v9);
3393 }
3394
3395 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3396
3397 // Encrypt the counters
3398 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3399
3400 if (bulk_width == 8) {
3401 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3402 }
3403
3404 // XOR the encrypted counters with the inputs
3405 for (int i = 0; i < bulk_width; i++) {
3406 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3407 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3408 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3409 }
3410
3411 // Write the encrypted data
3412 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3413 if (bulk_width == 8) {
3414 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3415 }
3416
3417 __ subw(len, len, 16 * bulk_width);
3418 __ cbnzw(len, L_CTR_loop);
3419 }
3420
3421 // Save the counter back where it goes
3422 __ rev64(v16, __ T16B, v16);
3423 __ st1(v16, __ T16B, counter);
3424
3425 __ pop(saved_regs, sp);
3426
3427 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3428 if (bulk_width == 8) {
3429 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3430 }
3431
3432 __ andr(rscratch1, len, -16 * bulk_width);
3433 __ sub(len, len, rscratch1);
3434 __ add(offset, offset, rscratch1);
3435 __ mov(used, 16);
3436 __ strw(used, Address(used_ptr));
3437 __ b(large_block_return);
3438
3439 return start;
3440 }
3441
3442 // Vector AES Galois Counter Mode implementation. Parameters:
3443 //
3444 // in = c_rarg0
3445 // len = c_rarg1
3446 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3447 // out = c_rarg3
3448 // key = c_rarg4
3449 // state = c_rarg5 - GHASH.state
3450 // subkeyHtbl = c_rarg6 - powers of H
3451 // counter = c_rarg7 - 16 bytes of CTR
3452 // return - number of processed bytes
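  //
  // In outline (a summary of the code below, not additional emitted code):
  // the stub runs AES/CTR over the input in 8-block (128-byte) strides and
  // then runs GHASH over the resulting ciphertext ct, i.e. the usual GCM
  // split
  //   C = AES-CTR_K(counter, P);  state = GHASH_H(state, C)
  // Only whole 128-byte strides are handled here; the caller deals with any
  // remainder using the returned byte count.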
3453 address generate_galoisCounterMode_AESCrypt() {
3454 Label ghash_polynomial; // local data generated after code
3455
3456 __ align(CodeEntryAlignment);
3457 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
3458 StubCodeMark mark(this, stub_id);
3459 address start = __ pc();
3460 __ enter();
3461
3462 const Register in = c_rarg0;
3463 const Register len = c_rarg1;
3464 const Register ct = c_rarg2;
3465 const Register out = c_rarg3;
3466     // (the counter at c_rarg7 is read below and written back with the incremented counter at the end)
3467
3468 const Register key = c_rarg4;
3469 const Register state = c_rarg5;
3470
3471 const Register subkeyHtbl = c_rarg6;
3472
3473 const Register counter = c_rarg7;
3474
3475 const Register keylen = r10;
3476 // Save state before entering routine
3477 __ sub(sp, sp, 4 * 16);
3478 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3479 __ sub(sp, sp, 4 * 16);
3480 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3481
3482 // __ andr(len, len, -512);
3483 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
3484 __ str(len, __ pre(sp, -2 * wordSize));
3485
3486 Label DONE;
3487 __ cbz(len, DONE);
3488
3489 // Compute #rounds for AES based on the length of the key array
3490 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3491
3492 __ aesenc_loadkeys(key, keylen);
3493 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3494 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3495
3496 // AES/CTR loop
3497 {
3498 Label L_CTR_loop;
3499 __ BIND(L_CTR_loop);
3500
3501 // Setup the counters
3502 __ movi(v8, __ T4S, 0);
3503 __ movi(v9, __ T4S, 1);
3504 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3505
3506 assert(v0->encoding() < v8->encoding(), "");
3507 for (int i = v0->encoding(); i < v8->encoding(); i++) {
3508 FloatRegister f = as_FloatRegister(i);
3509 __ rev32(f, __ T16B, v16);
3510 __ addv(v16, __ T4S, v16, v8);
3511 }
3512
3513 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3514
3515 // Encrypt the counters
3516 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3517
3518 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3519
3520 // XOR the encrypted counters with the inputs
3521 for (int i = 0; i < 8; i++) {
3522 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3523 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3524 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3525 }
3526 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3527 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3528
3529 __ subw(len, len, 16 * 8);
3530 __ cbnzw(len, L_CTR_loop);
3531 }
3532
3533 __ rev32(v16, __ T16B, v16);
3534 __ st1(v16, __ T16B, counter);
3535
3536 __ ldr(len, Address(sp));
3537 __ lsr(len, len, exact_log2(16)); // We want the count of blocks
3538
3539 // GHASH/CTR loop
3540 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3541 len, /*unrolls*/4);
3542
3543 #ifdef ASSERT
3544 { Label L;
3545 __ cmp(len, (unsigned char)0);
3546 __ br(Assembler::EQ, L);
3547 __ stop("stubGenerator: abort");
3548 __ bind(L);
3549 }
3550 #endif
3551
3552 __ bind(DONE);
3553 // Return the number of bytes processed
3554 __ ldr(r0, __ post(sp, 2 * wordSize));
3555
3556 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3557 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3558
3559 __ leave(); // required for proper stackwalking of RuntimeStub frame
3560 __ ret(lr);
3561
3562 // bind label and generate polynomial data
3563 __ align(wordSize * 2);
3564 __ bind(ghash_polynomial);
3565 __ emit_int64(0x87); // The low-order bits of the field
3566 // polynomial (i.e. p = z^7+z^2+z+1)
3567 // repeated in the low and high parts of a
3568 // 128-bit vector
3569 __ emit_int64(0x87);
3570
3571 return start;
3572 }
3573
3574 class Cached64Bytes {
3575 private:
3576 MacroAssembler *_masm;
3577 Register _regs[8];
3578
3579 public:
3580 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3581       assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3582 auto it = rs.begin();
3583 for (auto &r: _regs) {
3584 r = *it;
3585 ++it;
3586 }
3587 }
3588
3589 void gen_loads(Register base) {
3590 for (int i = 0; i < 8; i += 2) {
3591 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3592 }
3593 }
3594
3595 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3596 void extract_u32(Register dest, int i) {
3597 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3598 }
3599 };
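
  // Illustrative use of Cached64Bytes (a sketch mirroring the MD5 stub below,
  // not additional emitted code): the 8 64-bit registers cache buf[0..63], so
  // 4-byte word i of the block lives in _regs[i / 2] at bit offset
  // 32 * (i % 2) (ldp loads are little-endian). For example:
  //
  //   Cached64Bytes cache(_masm, RegSet::of(r14, r15) + RegSet::range(r16, r22) - r18_tls);
  //   cache.gen_loads(buf);        // ldp pairs covering buf[0..63]
  //   cache.extract_u32(r10, 5);   // r10 = the 32-bit word at buf[20..23]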
3600
3601 // Utility routines for md5.
3602 // Clobbers r10 and r11.
3603 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3604 int k, int s, int t) {
3605 Register rscratch3 = r10;
3606 Register rscratch4 = r11;
3607
3608 __ eorw(rscratch3, r3, r4);
3609 __ movw(rscratch2, t);
3610 __ andw(rscratch3, rscratch3, r2);
3611 __ addw(rscratch4, r1, rscratch2);
3612 reg_cache.extract_u32(rscratch1, k);
3613 __ eorw(rscratch3, rscratch3, r4);
3614 __ addw(rscratch4, rscratch4, rscratch1);
3615 __ addw(rscratch3, rscratch3, rscratch4);
3616 __ rorw(rscratch2, rscratch3, 32 - s);
3617 __ addw(r1, rscratch2, r2);
3618 }
3619
3620 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3621 int k, int s, int t) {
3622 Register rscratch3 = r10;
3623 Register rscratch4 = r11;
3624
3625 reg_cache.extract_u32(rscratch1, k);
3626 __ movw(rscratch2, t);
3627 __ addw(rscratch4, r1, rscratch2);
3628 __ addw(rscratch4, rscratch4, rscratch1);
3629 __ bicw(rscratch2, r3, r4);
3630 __ andw(rscratch3, r2, r4);
3631 __ addw(rscratch2, rscratch2, rscratch4);
3632 __ addw(rscratch2, rscratch2, rscratch3);
3633 __ rorw(rscratch2, rscratch2, 32 - s);
3634 __ addw(r1, rscratch2, r2);
3635 }
3636
3637 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3638 int k, int s, int t) {
3639 Register rscratch3 = r10;
3640 Register rscratch4 = r11;
3641
3642 __ eorw(rscratch3, r3, r4);
3643 __ movw(rscratch2, t);
3644 __ addw(rscratch4, r1, rscratch2);
3645 reg_cache.extract_u32(rscratch1, k);
3646 __ eorw(rscratch3, rscratch3, r2);
3647 __ addw(rscratch4, rscratch4, rscratch1);
3648 __ addw(rscratch3, rscratch3, rscratch4);
3649 __ rorw(rscratch2, rscratch3, 32 - s);
3650 __ addw(r1, rscratch2, r2);
3651 }
3652
3653 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3654 int k, int s, int t) {
3655 Register rscratch3 = r10;
3656 Register rscratch4 = r11;
3657
3658 __ movw(rscratch3, t);
3659 __ ornw(rscratch2, r2, r4);
3660 __ addw(rscratch4, r1, rscratch3);
3661 reg_cache.extract_u32(rscratch1, k);
3662 __ eorw(rscratch3, rscratch2, r3);
3663 __ addw(rscratch4, rscratch4, rscratch1);
3664 __ addw(rscratch3, rscratch3, rscratch4);
3665 __ rorw(rscratch2, rscratch3, 32 - s);
3666 __ addw(r1, rscratch2, r2);
3667 }
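
  // For reference (RFC 1321), the auxiliary functions and the common round
  // operation computed by the four helpers above are:
  //
  //   F(b, c, d) = (b & c) | (~b & d)   // md5_FF computes ((c ^ d) & b) ^ d, which is equivalent
  //   G(b, c, d) = (b & d) | (c & ~d)   // md5_GG adds the two bitwise-disjoint terms instead of or-ing them
  //   H(b, c, d) = b ^ c ^ d
  //   I(b, c, d) = c ^ (b | ~d)
  //
  //   r1 = r2 + rotate_left(r1 + f(r2, r3, r4) + x[k] + t, s)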
3668
3669 // Arguments:
3670 //
3671 // Inputs:
3672 // c_rarg0 - byte[] source+offset
3673 // c_rarg1 - int[] SHA.state
3674 // c_rarg2 - int offset
3675 // c_rarg3 - int limit
3676 //
3677 address generate_md5_implCompress(StubId stub_id) {
3678 bool multi_block;
3679 switch (stub_id) {
3680 case StubId::stubgen_md5_implCompress_id:
3681 multi_block = false;
3682 break;
3683 case StubId::stubgen_md5_implCompressMB_id:
3684 multi_block = true;
3685 break;
3686 default:
3687 ShouldNotReachHere();
3688 }
3689 __ align(CodeEntryAlignment);
3690
3691 StubCodeMark mark(this, stub_id);
3692 address start = __ pc();
3693
3694 Register buf = c_rarg0;
3695 Register state = c_rarg1;
3696 Register ofs = c_rarg2;
3697 Register limit = c_rarg3;
3698 Register a = r4;
3699 Register b = r5;
3700 Register c = r6;
3701 Register d = r7;
3702 Register rscratch3 = r10;
3703 Register rscratch4 = r11;
3704
3705 Register state_regs[2] = { r12, r13 };
3706 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3707 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
3708
3709 __ push(saved_regs, sp);
3710
3711 __ ldp(state_regs[0], state_regs[1], Address(state));
3712 __ ubfx(a, state_regs[0], 0, 32);
3713 __ ubfx(b, state_regs[0], 32, 32);
3714 __ ubfx(c, state_regs[1], 0, 32);
3715 __ ubfx(d, state_regs[1], 32, 32);
3716
3717 Label md5_loop;
3718 __ BIND(md5_loop);
3719
3720 reg_cache.gen_loads(buf);
3721
3722 // Round 1
3723 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
3724 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
3725 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
3726 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
3727 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
3728 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
3729 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
3730 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
3731 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
3732 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
3733 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3734 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3735 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
3736 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3737 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3738 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3739
3740 // Round 2
3741 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
3742 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
3743 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3744 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
3745 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
3746 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
3747 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3748 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
3749 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
3750 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
3751 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
3752 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
3753 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
3754 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
3755 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
3756 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3757
3758 // Round 3
3759 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
3760 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
3761 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3762 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3763 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
3764 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
3765 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
3766 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3767 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
3768 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
3769 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
3770 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
3771 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
3772 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3773 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3774 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
3775
3776 // Round 4
3777 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
3778 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
3779 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3780 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
3781 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
3782 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
3783 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3784 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
3785 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
3786 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3787 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
3788 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3789 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
3790 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3791 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
3792 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
3793
3794 __ addw(a, state_regs[0], a);
3795 __ ubfx(rscratch2, state_regs[0], 32, 32);
3796 __ addw(b, rscratch2, b);
3797 __ addw(c, state_regs[1], c);
3798 __ ubfx(rscratch4, state_regs[1], 32, 32);
3799 __ addw(d, rscratch4, d);
3800
3801     __ orr(state_regs[0], a, b, Assembler::LSL, 32);  // pack a (low word) and b (high word)
3802     __ orr(state_regs[1], c, d, Assembler::LSL, 32);  // pack c (low word) and d (high word)
3803
3804 if (multi_block) {
3805 __ add(buf, buf, 64);
3806 __ add(ofs, ofs, 64);
3807 __ cmp(ofs, limit);
3808 __ br(Assembler::LE, md5_loop);
3809 __ mov(c_rarg0, ofs); // return ofs
3810 }
3811
3812 // write hash values back in the correct order
3813 __ stp(state_regs[0], state_regs[1], Address(state));
3814
3815 __ pop(saved_regs, sp);
3816
3817 __ ret(lr);
3818
3819 return start;
3820 }
3821
3822 // Arguments:
3823 //
3824 // Inputs:
3825 // c_rarg0 - byte[] source+offset
3826 // c_rarg1 - int[] SHA.state
3827 // c_rarg2 - int offset
3828 // c_rarg3 - int limit
3829 //
3830 address generate_sha1_implCompress(StubId stub_id) {
3831 bool multi_block;
3832 switch (stub_id) {
3833 case StubId::stubgen_sha1_implCompress_id:
3834 multi_block = false;
3835 break;
3836 case StubId::stubgen_sha1_implCompressMB_id:
3837 multi_block = true;
3838 break;
3839 default:
3840 ShouldNotReachHere();
3841 }
3842
3843 __ align(CodeEntryAlignment);
3844
3845 StubCodeMark mark(this, stub_id);
3846 address start = __ pc();
3847
3848 Register buf = c_rarg0;
3849 Register state = c_rarg1;
3850 Register ofs = c_rarg2;
3851 Register limit = c_rarg3;
3852
3853 Label keys;
3854 Label sha1_loop;
3855
3856 // load the keys into v0..v3
3857 __ adr(rscratch1, keys);
3858 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3859 // load 5 words state into v6, v7
3860 __ ldrq(v6, Address(state, 0));
3861 __ ldrs(v7, Address(state, 16));
3862
3863
3864 __ BIND(sha1_loop);
3865 // load 64 bytes of data into v16..v19
3866 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3867 __ rev32(v16, __ T16B, v16);
3868 __ rev32(v17, __ T16B, v17);
3869 __ rev32(v18, __ T16B, v18);
3870 __ rev32(v19, __ T16B, v19);
3871
3872 // do the sha1
3873 __ addv(v4, __ T4S, v16, v0);
3874 __ orr(v20, __ T16B, v6, v6);
3875
3876 FloatRegister d0 = v16;
3877 FloatRegister d1 = v17;
3878 FloatRegister d2 = v18;
3879 FloatRegister d3 = v19;
3880
3881 for (int round = 0; round < 20; round++) {
3882 FloatRegister tmp1 = (round & 1) ? v4 : v5;
3883 FloatRegister tmp2 = (round & 1) ? v21 : v22;
3884 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3885 FloatRegister tmp4 = (round & 1) ? v5 : v4;
3886 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3887
3888 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3889 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3890 __ sha1h(tmp2, __ T4S, v20);
3891 if (round < 5)
3892 __ sha1c(v20, __ T4S, tmp3, tmp4);
3893 else if (round < 10 || round >= 15)
3894 __ sha1p(v20, __ T4S, tmp3, tmp4);
3895 else
3896 __ sha1m(v20, __ T4S, tmp3, tmp4);
3897 if (round < 16) __ sha1su1(d0, __ T4S, d3);
3898
3899 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3900 }
3901
3902 __ addv(v7, __ T2S, v7, v21);
3903 __ addv(v6, __ T4S, v6, v20);
3904
3905 if (multi_block) {
3906 __ add(ofs, ofs, 64);
3907 __ cmp(ofs, limit);
3908 __ br(Assembler::LE, sha1_loop);
3909 __ mov(c_rarg0, ofs); // return ofs
3910 }
3911
3912 __ strq(v6, Address(state, 0));
3913 __ strs(v7, Address(state, 16));
3914
3915 __ ret(lr);
3916
3917 __ bind(keys);
3918 __ emit_int32(0x5a827999);
3919 __ emit_int32(0x6ed9eba1);
3920 __ emit_int32(0x8f1bbcdc);
3921 __ emit_int32(0xca62c1d6);
3922
3923 return start;
3924 }
3925
3926
3927 // Arguments:
3928 //
3929 // Inputs:
3930 // c_rarg0 - byte[] source+offset
3931 // c_rarg1 - int[] SHA.state
3932 // c_rarg2 - int offset
3933 // c_rarg3 - int limit
3934 //
3935 address generate_sha256_implCompress(StubId stub_id) {
3936 bool multi_block;
3937 switch (stub_id) {
3938 case StubId::stubgen_sha256_implCompress_id:
3939 multi_block = false;
3940 break;
3941 case StubId::stubgen_sha256_implCompressMB_id:
3942 multi_block = true;
3943 break;
3944 default:
3945 ShouldNotReachHere();
3946 }
3947
3948 static const uint32_t round_consts[64] = {
3949 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3950 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3951 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3952 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3953 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3954 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3955 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3956 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3957 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3958 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3959 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3960 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3961 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3962 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3963 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3964 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3965 };
3966
3967 __ align(CodeEntryAlignment);
3968
3969 StubCodeMark mark(this, stub_id);
3970 address start = __ pc();
3971
3972 Register buf = c_rarg0;
3973 Register state = c_rarg1;
3974 Register ofs = c_rarg2;
3975 Register limit = c_rarg3;
3976
3977 Label sha1_loop;
3978
3979 __ stpd(v8, v9, __ pre(sp, -32));
3980 __ stpd(v10, v11, Address(sp, 16));
3981
3982 // dga == v0
3983 // dgb == v1
3984 // dg0 == v2
3985 // dg1 == v3
3986 // dg2 == v4
3987 // t0 == v6
3988 // t1 == v7
3989
3990 // load 16 keys to v16..v31
3991 __ lea(rscratch1, ExternalAddress((address)round_consts));
3992 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3993 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3994 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3995 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3996
3997 // load 8 words (256 bits) state
3998 __ ldpq(v0, v1, state);
3999
4000 __ BIND(sha1_loop);
4001 // load 64 bytes of data into v8..v11
4002 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
4003 __ rev32(v8, __ T16B, v8);
4004 __ rev32(v9, __ T16B, v9);
4005 __ rev32(v10, __ T16B, v10);
4006 __ rev32(v11, __ T16B, v11);
4007
4008 __ addv(v6, __ T4S, v8, v16);
4009 __ orr(v2, __ T16B, v0, v0);
4010 __ orr(v3, __ T16B, v1, v1);
4011
4012 FloatRegister d0 = v8;
4013 FloatRegister d1 = v9;
4014 FloatRegister d2 = v10;
4015 FloatRegister d3 = v11;
4016
4017
4018 for (int round = 0; round < 16; round++) {
4019 FloatRegister tmp1 = (round & 1) ? v6 : v7;
4020 FloatRegister tmp2 = (round & 1) ? v7 : v6;
4021 FloatRegister tmp3 = (round & 1) ? v2 : v4;
4022 FloatRegister tmp4 = (round & 1) ? v4 : v2;
4023
4024 if (round < 12) __ sha256su0(d0, __ T4S, d1);
4025 __ orr(v4, __ T16B, v2, v2);
4026 if (round < 15)
4027 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
4028 __ sha256h(v2, __ T4S, v3, tmp2);
4029 __ sha256h2(v3, __ T4S, v4, tmp2);
4030 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
4031
4032 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4033 }
4034
4035 __ addv(v0, __ T4S, v0, v2);
4036 __ addv(v1, __ T4S, v1, v3);
4037
4038 if (multi_block) {
4039 __ add(ofs, ofs, 64);
4040 __ cmp(ofs, limit);
4041 __ br(Assembler::LE, sha1_loop);
4042 __ mov(c_rarg0, ofs); // return ofs
4043 }
4044
4045 __ ldpd(v10, v11, Address(sp, 16));
4046 __ ldpd(v8, v9, __ post(sp, 32));
4047
4048 __ stpq(v0, v1, state);
4049
4050 __ ret(lr);
4051
4052 return start;
4053 }
4054
4055 // Double rounds for sha512.
4056 void sha512_dround(int dr,
4057 FloatRegister vi0, FloatRegister vi1,
4058 FloatRegister vi2, FloatRegister vi3,
4059 FloatRegister vi4, FloatRegister vrc0,
4060 FloatRegister vrc1, FloatRegister vin0,
4061 FloatRegister vin1, FloatRegister vin2,
4062 FloatRegister vin3, FloatRegister vin4) {
4063 if (dr < 36) {
4064 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
4065 }
4066 __ addv(v5, __ T2D, vrc0, vin0);
4067 __ ext(v6, __ T16B, vi2, vi3, 8);
4068 __ ext(v5, __ T16B, v5, v5, 8);
4069 __ ext(v7, __ T16B, vi1, vi2, 8);
4070 __ addv(vi3, __ T2D, vi3, v5);
4071 if (dr < 32) {
4072 __ ext(v5, __ T16B, vin3, vin4, 8);
4073 __ sha512su0(vin0, __ T2D, vin1);
4074 }
4075 __ sha512h(vi3, __ T2D, v6, v7);
4076 if (dr < 32) {
4077 __ sha512su1(vin0, __ T2D, vin2, v5);
4078 }
4079 __ addv(vi4, __ T2D, vi1, vi3);
4080 __ sha512h2(vi3, __ T2D, vi1, vi0);
4081 }
4082
4083 // Arguments:
4084 //
4085 // Inputs:
4086 // c_rarg0 - byte[] source+offset
4087 // c_rarg1 - int[] SHA.state
4088 // c_rarg2 - int offset
4089 // c_rarg3 - int limit
4090 //
4091 address generate_sha512_implCompress(StubId stub_id) {
4092 bool multi_block;
4093 switch (stub_id) {
4094 case StubId::stubgen_sha512_implCompress_id:
4095 multi_block = false;
4096 break;
4097 case StubId::stubgen_sha512_implCompressMB_id:
4098 multi_block = true;
4099 break;
4100 default:
4101 ShouldNotReachHere();
4102 }
4103
4104 static const uint64_t round_consts[80] = {
4105 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
4106 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
4107 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
4108 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
4109 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
4110 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
4111 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
4112 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
4113 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
4114 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
4115 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
4116 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
4117 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
4118 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
4119 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
4120 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
4121 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
4122 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
4123 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
4124 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
4125 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
4126 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
4127 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
4128 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
4129 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
4130 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
4131 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
4132 };
4133
4134 __ align(CodeEntryAlignment);
4135
4136 StubCodeMark mark(this, stub_id);
4137 address start = __ pc();
4138
4139 Register buf = c_rarg0;
4140 Register state = c_rarg1;
4141 Register ofs = c_rarg2;
4142 Register limit = c_rarg3;
4143
4144 __ stpd(v8, v9, __ pre(sp, -64));
4145 __ stpd(v10, v11, Address(sp, 16));
4146 __ stpd(v12, v13, Address(sp, 32));
4147 __ stpd(v14, v15, Address(sp, 48));
4148
4149 Label sha512_loop;
4150
4151 // load state
4152 __ ld1(v8, v9, v10, v11, __ T2D, state);
4153
4154 // load first 4 round constants
4155 __ lea(rscratch1, ExternalAddress((address)round_consts));
4156 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
4157
4158 __ BIND(sha512_loop);
4159 // load 128B of data into v12..v19
4160 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
4161 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
4162 __ rev64(v12, __ T16B, v12);
4163 __ rev64(v13, __ T16B, v13);
4164 __ rev64(v14, __ T16B, v14);
4165 __ rev64(v15, __ T16B, v15);
4166 __ rev64(v16, __ T16B, v16);
4167 __ rev64(v17, __ T16B, v17);
4168 __ rev64(v18, __ T16B, v18);
4169 __ rev64(v19, __ T16B, v19);
4170
4171 __ mov(rscratch2, rscratch1);
4172
4173 __ mov(v0, __ T16B, v8);
4174 __ mov(v1, __ T16B, v9);
4175 __ mov(v2, __ T16B, v10);
4176 __ mov(v3, __ T16B, v11);
4177
4178 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4179 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4180 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4181 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4182 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4183 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4184 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4185 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4186 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4187 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4188 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4189 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4190 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4191 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4192 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4193 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4194 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4195 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4196 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4197 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4198 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4199 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4200 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4201 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4202 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4203 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4204 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4205 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4206 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4207 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4208 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4209 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4210 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
4211 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4212 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4213 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4214 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4215 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4216 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4217 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4218
4219 __ addv(v8, __ T2D, v8, v0);
4220 __ addv(v9, __ T2D, v9, v1);
4221 __ addv(v10, __ T2D, v10, v2);
4222 __ addv(v11, __ T2D, v11, v3);
4223
4224 if (multi_block) {
4225 __ add(ofs, ofs, 128);
4226 __ cmp(ofs, limit);
4227 __ br(Assembler::LE, sha512_loop);
4228 __ mov(c_rarg0, ofs); // return ofs
4229 }
4230
4231 __ st1(v8, v9, v10, v11, __ T2D, state);
4232
4233 __ ldpd(v14, v15, Address(sp, 48));
4234 __ ldpd(v12, v13, Address(sp, 32));
4235 __ ldpd(v10, v11, Address(sp, 16));
4236 __ ldpd(v8, v9, __ post(sp, 64));
4237
4238 __ ret(lr);
4239
4240 return start;
4241 }
4242
4243 // Execute one round of keccak of two computations in parallel.
4244 // One of the states should be loaded into the lower halves of
4245 // the vector registers v0-v24, the other should be loaded into
4246 // the upper halves of those registers. The ld1r instruction loads
4247 // the round constant into both halves of register v31.
4248 // Intermediate results c0...c5 and d0...d5 are computed
4249 // in registers v25...v30.
4250 // All vector instructions that are used operate on both register
4251 // halves in parallel.
4252   // If only a single computation is needed, one can load just the lower halves.
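  //
  // For reference, one round of Keccak-f[1600] over the 5x5 lane array A is
  // (the standard definition, indices taken mod 5 -- a sketch, not the
  // register schedule used below):
  //
  //   C[x]    = A[x,0] ^ A[x,1] ^ A[x,2] ^ A[x,3] ^ A[x,4]      // theta
  //   D[x]    = C[x-1] ^ rol(C[x+1], 1)
  //   A[x,y] ^= D[x]
  //   B[y,2x+3y] = rol(A[x,y], r[x,y])                          // rho + pi
  //   A[x,y]  = B[x,y] ^ (~B[x+1,y] & B[x+2,y])                 // chi
  //   A[0,0] ^= RC[i]                                           // iota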
4253 void keccak_round(Register rscratch1) {
4254 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4255     __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
4256     __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4257 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4258 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4259 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4260 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4261 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4262 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4263 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4264
4265 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4266 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4267 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4268 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4269 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4270
4271 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4272 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4273     __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4274 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4275 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4276 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4277 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4278 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4279 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4280 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4281 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4282 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4283 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4284 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4285 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4286 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4287 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4288 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4289 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4290 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4291 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4292 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4293 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4294 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4295 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4296
4297     __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22)
4298 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4299 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4300 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4301 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4302
4303 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4304
4305 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4306 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4307 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4308 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4309 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
4310
4311 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
4312 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
4313 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
4314 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
4315 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
4316
4317 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
4318 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
4319 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
4320 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7)
4321 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8')
4322
4323 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
4324 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
4325 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
4326 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3)
4327 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4')
4328
4329 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
4330 }
4331
4332 // Arguments:
4333 //
4334 // Inputs:
4335 // c_rarg0 - byte[] source+offset
4336 // c_rarg1 - byte[] SHA.state
4337 // c_rarg2 - int block_size
4338 // c_rarg3 - int offset
4339 // c_rarg4 - int limit
4340 //
4341 address generate_sha3_implCompress(StubId stub_id) {
4342 bool multi_block;
4343 switch (stub_id) {
4344 case StubId::stubgen_sha3_implCompress_id:
4345 multi_block = false;
4346 break;
4347 case StubId::stubgen_sha3_implCompressMB_id:
4348 multi_block = true;
4349 break;
4350 default:
4351 ShouldNotReachHere();
4352 }
4353
4354 static const uint64_t round_consts[24] = {
4355 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4356 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4357 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4358 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4359 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4360 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4361 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4362 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4363 };
4364
4365 __ align(CodeEntryAlignment);
4366
4367 StubCodeMark mark(this, stub_id);
4368 address start = __ pc();
4369
4370 Register buf = c_rarg0;
4371 Register state = c_rarg1;
4372 Register block_size = c_rarg2;
4373 Register ofs = c_rarg3;
4374 Register limit = c_rarg4;
4375
4376 Label sha3_loop, rounds24_loop;
4377 Label sha3_512_or_sha3_384, shake128;
4378
4379 __ stpd(v8, v9, __ pre(sp, -64));
4380 __ stpd(v10, v11, Address(sp, 16));
4381 __ stpd(v12, v13, Address(sp, 32));
4382 __ stpd(v14, v15, Address(sp, 48));
4383
4384 // load state
4385 __ add(rscratch1, state, 32);
4386 __ ld1(v0, v1, v2, v3, __ T1D, state);
4387 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4388 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4389 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4390 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4391 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4392 __ ld1(v24, __ T1D, rscratch1);
4393
4394 __ BIND(sha3_loop);
4395
4396 // 24 keccak rounds
4397 __ movw(rscratch2, 24);
4398
4399 // load round_constants base
4400 __ lea(rscratch1, ExternalAddress((address) round_consts));
4401
4402 // load input
4403 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4404 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4405 __ eor(v0, __ T8B, v0, v25);
4406 __ eor(v1, __ T8B, v1, v26);
4407 __ eor(v2, __ T8B, v2, v27);
4408 __ eor(v3, __ T8B, v3, v28);
4409 __ eor(v4, __ T8B, v4, v29);
4410 __ eor(v5, __ T8B, v5, v30);
4411 __ eor(v6, __ T8B, v6, v31);
4412
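    // The incoming block_size is the sponge rate in bytes:
    //   72 (SHA3-512), 104 (SHA3-384), 136 (SHA3-256 / SHAKE256),
    //   144 (SHA3-224), 168 (SHAKE128).
    // The bit tests below dispatch on that value to absorb the right number
    // of input words.
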
4413 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4414 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4415
4416 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4417 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4418 __ eor(v7, __ T8B, v7, v25);
4419 __ eor(v8, __ T8B, v8, v26);
4420 __ eor(v9, __ T8B, v9, v27);
4421 __ eor(v10, __ T8B, v10, v28);
4422 __ eor(v11, __ T8B, v11, v29);
4423 __ eor(v12, __ T8B, v12, v30);
4424 __ eor(v13, __ T8B, v13, v31);
4425
4426 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4427 __ eor(v14, __ T8B, v14, v25);
4428 __ eor(v15, __ T8B, v15, v26);
4429 __ eor(v16, __ T8B, v16, v27);
4430
4431 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4432 __ andw(c_rarg5, block_size, 48);
4433 __ cbzw(c_rarg5, rounds24_loop);
4434
4435 __ tbnz(block_size, 5, shake128);
4436 // block_size == 144, bit5 == 0, SHA3-224
4437 __ ldrd(v28, __ post(buf, 8));
4438 __ eor(v17, __ T8B, v17, v28);
4439 __ b(rounds24_loop);
4440
4441 __ BIND(shake128);
4442 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4443 __ eor(v17, __ T8B, v17, v28);
4444 __ eor(v18, __ T8B, v18, v29);
4445 __ eor(v19, __ T8B, v19, v30);
4446 __ eor(v20, __ T8B, v20, v31);
4447 __ b(rounds24_loop); // block_size == 168, SHAKE128
4448
4449 __ BIND(sha3_512_or_sha3_384);
4450 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4451 __ eor(v7, __ T8B, v7, v25);
4452 __ eor(v8, __ T8B, v8, v26);
4453 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4454
4455 // SHA3-384
4456 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4457 __ eor(v9, __ T8B, v9, v27);
4458 __ eor(v10, __ T8B, v10, v28);
4459 __ eor(v11, __ T8B, v11, v29);
4460 __ eor(v12, __ T8B, v12, v30);
4461
4462 __ BIND(rounds24_loop);
4463 __ subw(rscratch2, rscratch2, 1);
4464
4465 keccak_round(rscratch1);
4466
4467 __ cbnzw(rscratch2, rounds24_loop);
4468
4469 if (multi_block) {
4470 __ add(ofs, ofs, block_size);
4471 __ cmp(ofs, limit);
4472 __ br(Assembler::LE, sha3_loop);
4473 __ mov(c_rarg0, ofs); // return ofs
4474 }
4475
4476 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
4477 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4478 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4479 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4480 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4481 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4482 __ st1(v24, __ T1D, state);
4483
4484 // restore callee-saved registers
4485 __ ldpd(v14, v15, Address(sp, 48));
4486 __ ldpd(v12, v13, Address(sp, 32));
4487 __ ldpd(v10, v11, Address(sp, 16));
4488 __ ldpd(v8, v9, __ post(sp, 64));
4489
4490 __ ret(lr);
4491
4492 return start;
4493 }
4494
4495 // Inputs:
4496 // c_rarg0 - long[] state0
4497 // c_rarg1 - long[] state1
4498 address generate_double_keccak() {
4499 static const uint64_t round_consts[24] = {
4500 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4501 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4502 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4503 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4504 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4505 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4506 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4507 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4508 };
4509
4510 // Implements the double_keccak() method of the
4511     // sun.security.provider.SHA3Parallel class
4512 __ align(CodeEntryAlignment);
4513 StubCodeMark mark(this, "StubRoutines", "double_keccak");
4514 address start = __ pc();
4515 __ enter();
4516
4517 Register state0 = c_rarg0;
4518 Register state1 = c_rarg1;
4519
4520 Label rounds24_loop;
4521
4522 // save callee-saved registers
4523 __ stpd(v8, v9, __ pre(sp, -64));
4524 __ stpd(v10, v11, Address(sp, 16));
4525 __ stpd(v12, v13, Address(sp, 32));
4526 __ stpd(v14, v15, Address(sp, 48));
4527
4528 // load states
4529 __ add(rscratch1, state0, 32);
4530 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
4531 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
4532 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
4533 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
4534 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
4535 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
4536 __ ld1(v24, __ D, 0, rscratch1);
4537 __ add(rscratch1, state1, 32);
4538 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
4539 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
4540 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
4541 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
4542 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
4543 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
4544 __ ld1(v24, __ D, 1, rscratch1);
4545
4546 // 24 keccak rounds
4547 __ movw(rscratch2, 24);
4548
4549 // load round_constants base
4550 __ lea(rscratch1, ExternalAddress((address) round_consts));
4551
4552 __ BIND(rounds24_loop);
4553 __ subw(rscratch2, rscratch2, 1);
4554 keccak_round(rscratch1);
4555 __ cbnzw(rscratch2, rounds24_loop);
4556
4557 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
4558 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
4559 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
4560 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
4561 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
4562 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
4563 __ st1(v24, __ D, 0, state0);
4564 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
4565 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
4566 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
4567 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
4568 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
4569 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
4570 __ st1(v24, __ D, 1, state1);
4571
4572 // restore callee-saved vector registers
4573 __ ldpd(v14, v15, Address(sp, 48));
4574 __ ldpd(v12, v13, Address(sp, 32));
4575 __ ldpd(v10, v11, Address(sp, 16));
4576 __ ldpd(v8, v9, __ post(sp, 64));
4577
4578 __ leave(); // required for proper stackwalking of RuntimeStub frame
4579 __ mov(r0, zr); // return 0
4580 __ ret(lr);
4581
4582 return start;
4583 }
4584
4585 // ChaCha20 block function. This version parallelizes the 32-bit
4586 // state elements on each of 16 vectors, producing 4 blocks of
4587 // keystream at a time.
4588 //
4589 // state (int[16]) = c_rarg0
4590 // keystream (byte[256]) = c_rarg1
4591 // return - number of bytes of produced keystream (always 256)
4592 //
4593 // This implementation takes each 32-bit integer from the state
4594 // array and broadcasts it across all 4 32-bit lanes of a vector register
4595 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
4596 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
4597 // the quarter round schedule is implemented as outlined in RFC 7539 section
4598 // 2.3. However, instead of sequentially processing the 3 quarter round
4599 // operations represented by one QUARTERROUND function, we instead stack all
4600 // the adds, xors and left-rotations from the first 4 quarter rounds together
4601 // and then do the same for the second set of 4 quarter rounds. This removes
4602 // some latency that would otherwise be incurred by waiting for an add to
4603 // complete before performing an xor (which depends on the result of the
4604 // add), etc. An adjustment happens between the first and second groups of 4
4605 // quarter rounds, but this is done only in the inputs to the macro functions
4606 // that generate the assembly instructions - these adjustments themselves are
4607 // not part of the resulting assembly.
4608 // The 4 registers v0-v3 are used during the quarter round operations as
4609 // scratch registers. Once the 20 rounds are complete, these 4 scratch
4610 // registers become the vectors involved in adding the start state back onto
4611 // the post-QR working state. After the adds are complete, each of the 16
4612 // vectors write their first lane back to the keystream buffer, followed
4613 // by the second lane from all vectors and so on.
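  //
  // For reference, a single RFC 7539 quarter round on state words a, b, c, d
  // (each pass of the loop below performs four of these in parallel, first
  // across the columns and then across the diagonals):
  //
  //   a += b;  d ^= a;  d = rol(d, 16);
  //   c += d;  b ^= c;  b = rol(b, 12);
  //   a += b;  d ^= a;  d = rol(d, 8);
  //   c += d;  b ^= c;  b = rol(b, 7);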
4614 address generate_chacha20Block_blockpar() {
4615 Label L_twoRounds, L_cc20_const;
4616 __ align(CodeEntryAlignment);
4617 StubId stub_id = StubId::stubgen_chacha20Block_id;
4618 StubCodeMark mark(this, stub_id);
4619 address start = __ pc();
4620 __ enter();
4621
4622 int i, j;
4623 const Register state = c_rarg0;
4624 const Register keystream = c_rarg1;
4625 const Register loopCtr = r10;
4626 const Register tmpAddr = r11;
4627 const FloatRegister ctrAddOverlay = v28;
4628 const FloatRegister lrot8Tbl = v29;
4629
4630 // Organize SIMD registers in an array that facilitates
4631 // putting repetitive opcodes into loop structures. It is
4632 // important that each grouping of 4 registers is monotonically
4633 // increasing to support the requirements of multi-register
4634 // instructions (e.g. ld4r, st4, etc.)
4635 const FloatRegister workSt[16] = {
4636 v4, v5, v6, v7, v16, v17, v18, v19,
4637 v20, v21, v22, v23, v24, v25, v26, v27
4638 };
4639
4640 // Pull in constant data. The first 16 bytes are the add overlay
4641 // which is applied to the vector holding the counter (state[12]).
4642 // The second 16 bytes is the index register for the 8-bit left
4643 // rotation tbl instruction.
4644 __ adr(tmpAddr, L_cc20_const);
4645 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
4646
4647 // Load from memory and interlace across 16 SIMD registers,
4648 // With each word from memory being broadcast to all lanes of
4649 // each successive SIMD register.
4650 // Addr(0) -> All lanes in workSt[i]
4651 // Addr(4) -> All lanes workSt[i + 1], etc.
4652 __ mov(tmpAddr, state);
4653 for (i = 0; i < 16; i += 4) {
4654 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4655 __ post(tmpAddr, 16));
4656 }
4657 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4658
4659 // Before entering the loop, create 5 4-register arrays. These
4660 // will hold the 4 registers that represent the a/b/c/d fields
4661 // in the quarter round operation. For instance the "b" field
4662 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
4663 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
4664 // since it is part of a diagonal organization. The aSet and scratch
4665 // register sets are defined at declaration time because they do not change
4666 // organization at any point during the 20-round processing.
4667 FloatRegister aSet[4] = { v4, v5, v6, v7 };
4668 FloatRegister bSet[4];
4669 FloatRegister cSet[4];
4670 FloatRegister dSet[4];
4671 FloatRegister scratch[4] = { v0, v1, v2, v3 };
4672
4673 // Set up the 10 iteration loop and perform all 8 quarter round ops
4674 __ mov(loopCtr, 10);
4675 __ BIND(L_twoRounds);
4676
4677 // Set to columnar organization and do the following 4 quarter-rounds:
4678 // QUARTERROUND(0, 4, 8, 12)
4679 // QUARTERROUND(1, 5, 9, 13)
4680 // QUARTERROUND(2, 6, 10, 14)
4681 // QUARTERROUND(3, 7, 11, 15)
4682 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
4683 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
4684 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
4685
4686 __ cc20_qr_add4(aSet, bSet); // a += b
4687 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4688 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4689
4690 __ cc20_qr_add4(cSet, dSet); // c += d
4691 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4692 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4693
4694 __ cc20_qr_add4(aSet, bSet); // a += b
4695 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4696 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4697
4698 __ cc20_qr_add4(cSet, dSet); // c += d
4699 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4700     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
4701
4702 // Set to diagonal organization and do the next 4 quarter-rounds:
4703 // QUARTERROUND(0, 5, 10, 15)
4704 // QUARTERROUND(1, 6, 11, 12)
4705 // QUARTERROUND(2, 7, 8, 13)
4706 // QUARTERROUND(3, 4, 9, 14)
4707 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
4708 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
4709 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
4710
4711 __ cc20_qr_add4(aSet, bSet); // a += b
4712 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4713 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4714
4715 __ cc20_qr_add4(cSet, dSet); // c += d
4716 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4717 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4718
4719 __ cc20_qr_add4(aSet, bSet); // a += b
4720 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4721 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4722
4723 __ cc20_qr_add4(cSet, dSet); // c += d
4724 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4725     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
4726
4727 // Decrement and iterate
4728 __ sub(loopCtr, loopCtr, 1);
4729 __ cbnz(loopCtr, L_twoRounds);
4730
4731 __ mov(tmpAddr, state);
4732
4733 // Add the starting state back to the post-loop keystream
4734 // state. We read/interlace the state array from memory into
4735 // 4 registers similar to what we did in the beginning. Then
4736 // add the counter overlay onto workSt[12] at the end.
4737 for (i = 0; i < 16; i += 4) {
4738 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
4739 __ addv(workSt[i], __ T4S, workSt[i], v0);
4740 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
4741 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
4742 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
4743 }
4744 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4745
4746 // Write working state into the keystream buffer. This is accomplished
4747 // by taking the lane "i" from each of the four vectors and writing
4748 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
4749 // repeating with the next 4 vectors until all 16 vectors have been used.
4750 // Then move to the next lane and repeat the process until all lanes have
4751 // been written.
4752 for (i = 0; i < 4; i++) {
4753 for (j = 0; j < 16; j += 4) {
4754 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4755 __ post(keystream, 16));
4756 }
4757 }
4758
4759 __ mov(r0, 256); // Return length of output keystream
4760 __ leave();
4761 __ ret(lr);
4762
4763 // bind label and generate local constant data used by this stub
4764 // The constant data is broken into two 128-bit segments to be loaded
4765 // onto FloatRegisters. The first 128 bits are a counter add overlay
4766 // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4767 // The second 128-bit segment is a table constant used for 8-bit left rotations.
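// The rotation table is a byte permutation mapping each 32-bit lane's bytes
// [b0,b1,b2,b3] to [b3,b0,b1,b2], i.e. w -> (w << 8) | (w >> 24), which is a
// left rotation by 8 bits when used as a tbl index vector.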
4768 __ BIND(L_cc20_const);
4769 __ emit_int64(0x0000000100000000UL);
4770 __ emit_int64(0x0000000300000002UL);
4771 __ emit_int64(0x0605040702010003UL);
4772 __ emit_int64(0x0E0D0C0F0A09080BUL);
4773
4774 return start;
4775 }
4776
4777 // Helpers to schedule parallel operation bundles across vector
4778 // register sequences of size 2, 4 or 8.
4779
4780 // Implement various primitive computations across vector sequences
4781
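// For example, with the consecutive-register sequences used in the stubs
// below,
//   VSeq<4> va(0), vb(4), vc(8);
//   vs_addv(va, __ T8H, vb, vc);
// emits addv(v0, T8H, v4, v8) ... addv(v3, T8H, v7, v11), i.e. one
// independent instruction per sequence element, which the hardware can
// issue in parallel.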
4782 template<int N>
4783 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4784 const VSeq<N>& v1, const VSeq<N>& v2) {
4785 // output must not be constant
4786 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4787 // output cannot overwrite pending inputs
4788 assert(!vs_write_before_read(v, v1), "output overwrites input");
4789 assert(!vs_write_before_read(v, v2), "output overwrites input");
4790 for (int i = 0; i < N; i++) {
4791 __ addv(v[i], T, v1[i], v2[i]);
4792 }
4793 }
4794
4795 template<int N>
4796 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4797 const VSeq<N>& v1, const VSeq<N>& v2) {
4798 // output must not be constant
4799 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4800 // output cannot overwrite pending inputs
4801 assert(!vs_write_before_read(v, v1), "output overwrites input");
4802 assert(!vs_write_before_read(v, v2), "output overwrites input");
4803 for (int i = 0; i < N; i++) {
4804 __ subv(v[i], T, v1[i], v2[i]);
4805 }
4806 }
4807
4808 template<int N>
4809 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4810 const VSeq<N>& v1, const VSeq<N>& v2) {
4811 // output must not be constant
4812 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4813 // output cannot overwrite pending inputs
4814 assert(!vs_write_before_read(v, v1), "output overwrites input");
4815 assert(!vs_write_before_read(v, v2), "output overwrites input");
4816 for (int i = 0; i < N; i++) {
4817 __ mulv(v[i], T, v1[i], v2[i]);
4818 }
4819 }
4820
4821 template<int N>
4822 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
4823 // output must not be constant
4824 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4825 // output cannot overwrite pending inputs
4826 assert(!vs_write_before_read(v, v1), "output overwrites input");
4827 for (int i = 0; i < N; i++) {
4828 __ negr(v[i], T, v1[i]);
4829 }
4830 }
4831
4832 template<int N>
4833 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4834 const VSeq<N>& v1, int shift) {
4835 // output must not be constant
4836 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4837 // output cannot overwrite pending inputs
4838 assert(!vs_write_before_read(v, v1), "output overwrites input");
4839 for (int i = 0; i < N; i++) {
4840 __ sshr(v[i], T, v1[i], shift);
4841 }
4842 }
4843
4844 template<int N>
4845 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4846 // output must not be constant
4847 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4848 // output cannot overwrite pending inputs
4849 assert(!vs_write_before_read(v, v1), "output overwrites input");
4850 assert(!vs_write_before_read(v, v2), "output overwrites input");
4851 for (int i = 0; i < N; i++) {
4852 __ andr(v[i], __ T16B, v1[i], v2[i]);
4853 }
4854 }
4855
4856 template<int N>
4857 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4858 // output must not be constant
4859 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4860 // output cannot overwrite pending inputs
4861 assert(!vs_write_before_read(v, v1), "output overwrites input");
4862 assert(!vs_write_before_read(v, v2), "output overwrites input");
4863 for (int i = 0; i < N; i++) {
4864 __ orr(v[i], __ T16B, v1[i], v2[i]);
4865 }
4866 }
4867
4868 template<int N>
4869 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
4870 // output must not be constant
4871 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4872 // output cannot overwrite pending inputs
4873 assert(!vs_write_before_read(v, v1), "output overwrites input");
4874 for (int i = 0; i < N; i++) {
4875 __ notr(v[i], __ T16B, v1[i]);
4876 }
4877 }
4878
4879 template<int N>
4880 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
4881 // output must not be constant
4882 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4883 // output cannot overwrite pending inputs
4884 assert(!vs_write_before_read(v, v1), "output overwrites input");
4885 assert(!vs_write_before_read(v, v2), "output overwrites input");
4886 for (int i = 0; i < N; i++) {
4887 __ sqdmulh(v[i], T, v1[i], v2[i]);
4888 }
4889 }
4890
4891 template<int N>
4892 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
4893 // output must not be constant
4894 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4895 // output cannot overwrite pending inputs
4896 assert(!vs_write_before_read(v, v1), "output overwrites input");
4897 assert(!vs_write_before_read(v, v2), "output overwrites input");
4898 for (int i = 0; i < N; i++) {
4899 __ mlsv(v[i], T, v1[i], v2[i]);
4900 }
4901 }
4902
4903 // load N/2 successive pairs of quadword values from memory in order
4904 // into N successive vector registers of the sequence via the
4905 // address supplied in base.
4906 template<int N>
4907 void vs_ldpq(const VSeq<N>& v, Register base) {
4908 for (int i = 0; i < N; i += 2) {
4909 __ ldpq(v[i], v[i+1], Address(base, 32 * i));
4910 }
4911 }
4912
4913 // load N/2 successive pairs of quadword values from memory in order
4914 // into N vector registers of the sequence via the address supplied
4915 // in base using post-increment addressing
4916 template<int N>
4917 void vs_ldpq_post(const VSeq<N>& v, Register base) {
4918 static_assert((N & 1) == 0, "sequence length must be even");
4919 for (int i = 0; i < N; i += 2) {
4920 __ ldpq(v[i], v[i+1], __ post(base, 32));
4921 }
4922 }
4923
4924 // store N successive vector registers of the sequence into N/2
4925 // successive pairs of quadword memory locations via the address
4926 // supplied in base using post-increment addressing
4927 template<int N>
4928 void vs_stpq_post(const VSeq<N>& v, Register base) {
4929 static_assert((N & 1) == 0, "sequence length must be even");
4930 for (int i = 0; i < N; i += 2) {
4931 __ stpq(v[i], v[i+1], __ post(base, 32));
4932 }
4933 }
4934
4935 // load N/2 pairs of quadword values from memory de-interleaved into
4936 // N vector registers 2 at a time via the address supplied in base
4937 // using post-increment addressing.
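// n.b. ld2 de-interleaves the memory stream: even-numbered elements land in
// the first register of each pair and odd-numbered elements in the second.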
4938 template<int N>
4939 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4940 static_assert((N & 1) == 0, "sequence length must be even");
4941 for (int i = 0; i < N; i += 2) {
4942 __ ld2(v[i], v[i+1], T, __ post(base, 32));
4943 }
4944 }
4945
4946 // store N vector registers interleaved into N/2 pairs of quadword
4947 // memory locations via the address supplied in base using
4948 // post-increment addressing.
4949 template<int N>
4950 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4951 static_assert((N & 1) == 0, "sequence length must be even");
4952 for (int i = 0; i < N; i += 2) {
4953 __ st2(v[i], v[i+1], T, __ post(base, 32));
4954 }
4955 }
4956
4957 // load N quadword values from memory de-interleaved into N vector
4958 // registers 3 elements at a time via the address supplied in base.
4959 template<int N>
4960 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4961 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4962 for (int i = 0; i < N; i += 3) {
4963 __ ld3(v[i], v[i+1], v[i+2], T, base);
4964 }
4965 }
4966
4967 // load N quadword values from memory de-interleaved into N vector
4968 // registers 3 elements at a time via the address supplied in base
4969 // using post-increment addressing.
4970 template<int N>
4971 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4972 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4973 for (int i = 0; i < N; i += 3) {
4974 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
4975 }
4976 }
4977
4978 // load N/2 pairs of quadword values from memory into N vector
4979 // registers via the address supplied in base with each pair indexed
4980 // using the start offset plus the corresponding entry in the
4981 // offsets array
4982 template<int N>
4983 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
4984 for (int i = 0; i < N/2; i++) {
4985 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4986 }
4987 }
4988
4989 // store N vector registers into N/2 pairs of quadword memory
4990 // locations via the address supplied in base with each pair indexed
4991 // using the start offset plus the corresponding entry in the
4992 // offsets array
4993 template<int N>
4994 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
4995 for (int i = 0; i < N/2; i++) {
4996 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4997 }
4998 }
4999
5000 // load N single quadword values from memory into N vector registers
5001 // via the address supplied in base with each value indexed using
5002 // the start offset plus the corresponding entry in the offsets
5003 // array
5004 template<int N>
5005 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5006 int start, int (&offsets)[N]) {
5007 for (int i = 0; i < N; i++) {
5008 __ ldr(v[i], T, Address(base, start + offsets[i]));
5009 }
5010 }
5011
5012 // store N vector registers into N single quadword memory locations
5013 // via the address supplied in base with each value indexed using
5014 // the start offset plus the corresponding entry in the offsets
5015 // array
5016 template<int N>
5017 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5018 int start, int (&offsets)[N]) {
5019 for (int i = 0; i < N; i++) {
5020 __ str(v[i], T, Address(base, start + offsets[i]));
5021 }
5022 }
5023
5024 // load N/2 pairs of quadword values from memory de-interleaved into
5025 // N vector registers 2 at a time via the address supplied in base
5026 // with each pair indexed using the start offset plus the
5027 // corresponding entry in the offsets array
5028 template<int N>
5029 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5030 Register tmp, int start, int (&offsets)[N/2]) {
5031 for (int i = 0; i < N/2; i++) {
5032 __ add(tmp, base, start + offsets[i]);
5033 __ ld2(v[2*i], v[2*i+1], T, tmp);
5034 }
5035 }
5036
5037 // store N vector registers 2 at a time interleaved into N/2 pairs
5038 // of quadword memory locations via the address supplied in base
5039 // with each pair indexed using the start offset plus the
5040 // corresponding entry in the offsets array
5041 template<int N>
5042 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5043 Register tmp, int start, int (&offsets)[N/2]) {
5044 for (int i = 0; i < N/2; i++) {
5045 __ add(tmp, base, start + offsets[i]);
5046 __ st2(v[2*i], v[2*i+1], T, tmp);
5047 }
5048 }
5049
5050 // Helper routines for various flavours of Montgomery multiply
5051
5052 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
5053 // multiplications in parallel
5054 //
5055
5056 // See the montMul() method of the sun.security.provider.ML_DSA
5057 // class.
5058 //
5059 // Computes 4x4S results or 4x8H results
5060 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5061 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5062 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5063 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5064 // Outputs: va - 4x4S or 4x8H vector register sequences
5065 // vb, vc, vtmp and vq must all be disjoint
5066 // va must be disjoint from all other inputs/temps or must equal vc
5067 // va must have a non-zero delta i.e. it must not be a constant vseq.
5068 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
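// In effect this is the standard Montgomery reduction of the product b * c
// with R = 2^16 (8H) or 2^32 (4S):
//   aHigh = hi(2 * b * c),  m = lo(b * c) * q^-1 mod R,  n = hi(2 * m * q)
//   a = (aHigh - n) / 2  =  (b * c - m * q) / R  =  b * c * R^-1 mod q
// sqdmulh supplies the doubled high halves and shsubv folds in the final
// halving subtraction.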
5069 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5070 Assembler::SIMD_Arrangement T,
5071 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5072 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5073 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5074 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5075 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5076
5077 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5078 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5079
5080 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5081
5082 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5083 assert(vs_disjoint(va, vb), "va and vb overlap");
5084 assert(vs_disjoint(va, vq), "va and vq overlap");
5085 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5086 assert(!va.is_constant(), "output vector must identify 4 different registers");
5087
5088 // schedule 4 streams of instructions across the vector sequences
5089 for (int i = 0; i < 4; i++) {
5090 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5091 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5092 }
5093
5094 for (int i = 0; i < 4; i++) {
5095 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5096 }
5097
5098 for (int i = 0; i < 4; i++) {
5099 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5100 }
5101
5102 for (int i = 0; i < 4; i++) {
5103 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5104 }
5105 }
5106
5107 // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
5108 // multiplications in parallel
5109 //
5110
5111 // See the montMul() method of the sun.security.provider.ML_DSA
5112 // class.
5113 //
5114 // Computes 2x4S results or 2x8H results
5115 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5116 // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
5117 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5118 // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
5119 // Outputs: va - 2x4S or 2x8H vector register sequences
5120 // vb, vc, vtmp and vq must all be disjoint
5121 // va must be disjoint from all other inputs/temps or must equal vc
5122 // va must have a non-zero delta i.e. it must not be a constant vseq.
5123 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5124 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5125 Assembler::SIMD_Arrangement T,
5126 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5127 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5128 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5129 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5130 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5131
5132 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5133 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5134
5135 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5136
5137 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5138 assert(vs_disjoint(va, vb), "va and vb overlap");
5139 assert(vs_disjoint(va, vq), "va and vq overlap");
5140 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5141 assert(!va.is_constant(), "output vector must identify 2 different registers");
5142
5143 // schedule 2 streams of instructions across the vector sequences
5144 for (int i = 0; i < 2; i++) {
5145 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5146 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5147 }
5148
5149 for (int i = 0; i < 2; i++) {
5150 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5151 }
5152
5153 for (int i = 0; i < 2; i++) {
5154 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5155 }
5156
5157 for (int i = 0; i < 2; i++) {
5158 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5159 }
5160 }
5161
5162 // Perform 16 16-bit Montgomery multiplications in parallel.
5163 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5164 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5165 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5166 // It will assert that the register use is valid
5167 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5168 }
5169
5170 // Perform 32 16-bit Montgomery multiplications in parallel.
5171 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5172 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5173 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5174 // It will assert that the register use is valid
5175 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5176 }
5177
5178 // Perform 64 16-bit Montgomery multiplications in parallel.
5179 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
5180 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5181 // Schedule two successive 4x8H multiplies via the montmul helper
5182 // on the front and back halves of va, vb and vc. The helper will
5183 // assert that the register use has no overlap conflicts on each
5184 // individual call but we also need to ensure that the necessary
5185 // disjoint/equality constraints are met across both calls.
5186
5187 // vb, vc, vtmp and vq must be disjoint. va must either be
5188 // disjoint from all other registers or equal vc
5189
5190 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5191 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5192 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5193
5194 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5195 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5196
5197 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5198
5199 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5200 assert(vs_disjoint(va, vb), "va and vb overlap");
5201 assert(vs_disjoint(va, vq), "va and vq overlap");
5202 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5203
5204 // we multiply the front and back halves of each sequence 4 at a
5205 // time because
5206 //
5207 // 1) we are currently only able to get 4-way instruction
5208 // parallelism at best
5209 //
5210 // 2) we need registers for the constants in vq and temporary
5211 // scratch registers to hold intermediate results so vtmp can only
5212 // be a VSeq<4> which means we only have 4 scratch slots
5213
5214 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5215 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5216 }
5217
5218 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5219 const VSeq<4>& vc,
5220 const VSeq<4>& vtmp,
5221 const VSeq<2>& vq) {
5222 // compute a = montmul(a1, c)
5223 kyber_montmul32(vc, va1, vc, vtmp, vq);
5224 // output a1 = a0 - a
5225 vs_subv(va1, __ T8H, va0, vc);
5226 // and a0 = a0 + a
5227 vs_addv(va0, __ T8H, va0, vc);
5228 }
5229
5230 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5231 const VSeq<4>& vb,
5232 const VSeq<4>& vtmp1,
5233 const VSeq<4>& vtmp2,
5234 const VSeq<2>& vq) {
5235 // compute c = a0 - a1
5236 vs_subv(vtmp1, __ T8H, va0, va1);
5237 // output a0 = a0 + a1
5238 vs_addv(va0, __ T8H, va0, va1);
5239 // output a1 = b montmul c
5240 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5241 }
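// n.b. kyber_montmul32_sub_add implements the Cooley-Tukey style butterfly
// used by the forward NTT (t = montmul(a1, zeta); a1 = a0 - t; a0 = a0 + t)
// while kyber_sub_add_montmul32 implements the Gentleman-Sande style
// butterfly used by the inverse NTT (t = a0 - a1; a0 = a0 + a1;
// a1 = montmul(t, zeta)).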
5242
5243 void load64shorts(const VSeq<8>& v, Register shorts) {
5244 vs_ldpq_post(v, shorts);
5245 }
5246
5247 void load32shorts(const VSeq<4>& v, Register shorts) {
5248 vs_ldpq_post(v, shorts);
5249 }
5250
5251 void store64shorts(VSeq<8> v, Register tmpAddr) {
5252 vs_stpq_post(v, tmpAddr);
5253 }
5254
5255 // Kyber NTT function.
5256 // Implements
5257 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5258 //
5259 // coeffs (short[256]) = c_rarg0
5260 // ntt_zetas (short[256]) = c_rarg1
5261 address generate_kyberNtt() {
5262
5263 __ align(CodeEntryAlignment);
5264 StubId stub_id = StubId::stubgen_kyberNtt_id;
5265 StubCodeMark mark(this, stub_id);
5266 address start = __ pc();
5267 __ enter();
5268
5269 const Register coeffs = c_rarg0;
5270 const Register zetas = c_rarg1;
5271
5272 const Register kyberConsts = r10;
5273 const Register tmpAddr = r11;
5274
5275 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5276 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5277 VSeq<2> vq(30); // n.b. constants overlap vs3
5278
5279 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5280 // load the montmul constants
5281 vs_ldpq(vq, kyberConsts);
5282
5283 // Each level corresponds to an iteration of the outermost loop of the
5284 // Java method seilerNTT(int[] coeffs). There are some differences
5285 // from what is done in the seilerNTT() method, though:
5286 // 1. The computation uses 16-bit signed values; we do not convert them
5287 // to ints here.
5288 // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
5289 // this array for each level, which makes it easier to fill up the vector
5290 // registers.
5291 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5292 // multiplications (that choice avoids any overflow during the inverse
5293 // NTT computation); here we use R = 2^16 so that we can use 16-bit
5294 // arithmetic in the vector unit.
5295 //
5296 // On each level, we fill up the vector registers in such a way that the
5297 // array elements that need to be multiplied by the zetas go into one
5298 // set of vector registers while the corresponding ones that don't need to
5299 // be multiplied, go into another set.
5300 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5301 // registers interleaving the steps of 4 identical computations,
5302 // each done on 8 16-bit values per register.
5303
5304 // At levels 0-3 the coefficients multiplied by or added/subtracted
5305 // to the zetas occur in discrete blocks whose size is some multiple
5306 // of 32.
5307
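// At each level the zeta-multiplied half and the plain half of each block
// are combined with the butterfly
//   t = montmul(x_hi, zeta);  x_hi' = x_lo - t;  x_lo' = x_lo + t;
// realized below as a kyber_montmul64 followed by vs_subv/vs_addv.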
5308 // level 0
5309 __ add(tmpAddr, coeffs, 256);
5310 load64shorts(vs1, tmpAddr);
5311 load64shorts(vs2, zetas);
5312 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5313 __ add(tmpAddr, coeffs, 0);
5314 load64shorts(vs1, tmpAddr);
5315 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5316 vs_addv(vs1, __ T8H, vs1, vs2);
5317 __ add(tmpAddr, coeffs, 0);
5318 vs_stpq_post(vs1, tmpAddr);
5319 __ add(tmpAddr, coeffs, 256);
5320 vs_stpq_post(vs3, tmpAddr);
5321 // restore montmul constants
5322 vs_ldpq(vq, kyberConsts);
5323 load64shorts(vs1, tmpAddr);
5324 load64shorts(vs2, zetas);
5325 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5326 __ add(tmpAddr, coeffs, 128);
5327 load64shorts(vs1, tmpAddr);
5328 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5329 vs_addv(vs1, __ T8H, vs1, vs2);
5330 __ add(tmpAddr, coeffs, 128);
5331 store64shorts(vs1, tmpAddr);
5332 __ add(tmpAddr, coeffs, 384);
5333 store64shorts(vs3, tmpAddr);
5334
5335 // level 1
5336 // restore montmul constants
5337 vs_ldpq(vq, kyberConsts);
5338 __ add(tmpAddr, coeffs, 128);
5339 load64shorts(vs1, tmpAddr);
5340 load64shorts(vs2, zetas);
5341 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5342 __ add(tmpAddr, coeffs, 0);
5343 load64shorts(vs1, tmpAddr);
5344 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5345 vs_addv(vs1, __ T8H, vs1, vs2);
5346 __ add(tmpAddr, coeffs, 0);
5347 store64shorts(vs1, tmpAddr);
5348 store64shorts(vs3, tmpAddr);
5349 vs_ldpq(vq, kyberConsts);
5350 __ add(tmpAddr, coeffs, 384);
5351 load64shorts(vs1, tmpAddr);
5352 load64shorts(vs2, zetas);
5353 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5354 __ add(tmpAddr, coeffs, 256);
5355 load64shorts(vs1, tmpAddr);
5356 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5357 vs_addv(vs1, __ T8H, vs1, vs2);
5358 __ add(tmpAddr, coeffs, 256);
5359 store64shorts(vs1, tmpAddr);
5360 store64shorts(vs3, tmpAddr);
5361
5362 // level 2
5363 vs_ldpq(vq, kyberConsts);
5364 int offsets1[4] = { 0, 32, 128, 160 };
5365 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5366 load64shorts(vs2, zetas);
5367 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5368 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5370 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5371 vs_addv(vs1, __ T8H, vs1, vs2);
5372 __ add(tmpAddr, coeffs, 0);
5373 vs_stpq_post(vs_front(vs1), tmpAddr);
5374 vs_stpq_post(vs_front(vs3), tmpAddr);
5375 vs_stpq_post(vs_back(vs1), tmpAddr);
5376 vs_stpq_post(vs_back(vs3), tmpAddr);
5377 vs_ldpq(vq, kyberConsts);
5378 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5379 load64shorts(vs2, zetas);
5380 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5381 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5383 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5384 vs_addv(vs1, __ T8H, vs1, vs2);
5385 __ add(tmpAddr, coeffs, 256);
5386 vs_stpq_post(vs_front(vs1), tmpAddr);
5387 vs_stpq_post(vs_front(vs3), tmpAddr);
5388 vs_stpq_post(vs_back(vs1), tmpAddr);
5389 vs_stpq_post(vs_back(vs3), tmpAddr);
5390
5391 // level 3
5392 vs_ldpq(vq, kyberConsts);
5393 int offsets2[4] = { 0, 64, 128, 192 };
5394 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5395 load64shorts(vs2, zetas);
5396 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5397 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5398 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5399 vs_addv(vs1, __ T8H, vs1, vs2);
5400 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5401 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5402
5403 vs_ldpq(vq, kyberConsts);
5404 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5405 load64shorts(vs2, zetas);
5406 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5407 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5408 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5409 vs_addv(vs1, __ T8H, vs1, vs2);
5410 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5411 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5412
5413 // level 4
5414 // At level 4 coefficients occur in 8 discrete blocks of size 16
5415 // so they are loaded using an ldr at 8 distinct offsets.
5416
5417 vs_ldpq(vq, kyberConsts);
5418 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5419 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5420 load64shorts(vs2, zetas);
5421 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5422 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5423 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5424 vs_addv(vs1, __ T8H, vs1, vs2);
5425 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5426 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5427
5428 vs_ldpq(vq, kyberConsts);
5429 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5430 load64shorts(vs2, zetas);
5431 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5432 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5433 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5434 vs_addv(vs1, __ T8H, vs1, vs2);
5435 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5436 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5437
5438 // level 5
5439 // At level 5 related coefficients occur in discrete blocks of size 8, so
5440 // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
5441
5442 vs_ldpq(vq, kyberConsts);
5443 int offsets4[4] = { 0, 32, 64, 96 };
5444 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5445 load32shorts(vs_front(vs2), zetas);
5446 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5447 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5448 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5449 load32shorts(vs_front(vs2), zetas);
5450 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5451 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5452 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5453 load32shorts(vs_front(vs2), zetas);
5454 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5455 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5456
5457 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5458 load32shorts(vs_front(vs2), zetas);
5459 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5460 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5461
5462 // level 6
5463 // At level 6 related coefficients occur in discrete blocks of size 4, so
5464 // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
5465
5466 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5467 load32shorts(vs_front(vs2), zetas);
5468 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5469 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5470 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5472 load32shorts(vs_front(vs2), zetas);
5473 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5474 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5475
5476 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5477 load32shorts(vs_front(vs2), zetas);
5478 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5479 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5480
5481 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5482 load32shorts(vs_front(vs2), zetas);
5483 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5484 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5485
5486 __ leave(); // required for proper stackwalking of RuntimeStub frame
5487 __ mov(r0, zr); // return 0
5488 __ ret(lr);
5489
5490 return start;
5491 }
5492
5493 // Kyber Inverse NTT function
5494 // Implements
5495 // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
5496 //
5497 // coeffs (short[256]) = c_rarg0
5498 // ntt_zetas (short[256]) = c_rarg1
5499 address generate_kyberInverseNtt() {
5500
5501 __ align(CodeEntryAlignment);
5502 StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
5503 StubCodeMark mark(this, stub_id);
5504 address start = __ pc();
5505 __ enter();
5506
5507 const Register coeffs = c_rarg0;
5508 const Register zetas = c_rarg1;
5509
5510 const Register kyberConsts = r10;
5511 const Register tmpAddr = r11;
5512 const Register tmpAddr2 = c_rarg2;
5513
5514 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5515 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5516 VSeq<2> vq(30); // n.b. constants overlap vs3
5517
5518 __ lea(kyberConsts,
5519 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5520
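// The inverse transform applies the Gentleman-Sande butterfly at each level:
//   t = x_lo - x_hi;  x_lo' = x_lo + x_hi;  x_hi' = montmul(t, zeta);
// with Barrett reductions inserted after levels 2 and 5, where the
// accumulated sums could otherwise overflow the 16-bit range.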
5521 // level 0
5522 // At level 0 related coefficients occur in discrete blocks of size 4, so
5523 // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
5524
5525 vs_ldpq(vq, kyberConsts);
5526 int offsets4[4] = { 0, 32, 64, 96 };
5527 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5528 load32shorts(vs_front(vs2), zetas);
5529 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5530 vs_front(vs2), vs_back(vs2), vtmp, vq);
5531 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5532 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5533 load32shorts(vs_front(vs2), zetas);
5534 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5535 vs_front(vs2), vs_back(vs2), vtmp, vq);
5536 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5537 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5538 load32shorts(vs_front(vs2), zetas);
5539 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5540 vs_front(vs2), vs_back(vs2), vtmp, vq);
5541 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5542 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5543 load32shorts(vs_front(vs2), zetas);
5544 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5545 vs_front(vs2), vs_back(vs2), vtmp, vq);
5546 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5547
5548 // level 1
5549 // At level 1 related coefficients occur in discrete blocks of size 8, so
5550 // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
5551
5552 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5553 load32shorts(vs_front(vs2), zetas);
5554 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5555 vs_front(vs2), vs_back(vs2), vtmp, vq);
5556 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5557 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5558 load32shorts(vs_front(vs2), zetas);
5559 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5560 vs_front(vs2), vs_back(vs2), vtmp, vq);
5561 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5562
5563 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5564 load32shorts(vs_front(vs2), zetas);
5565 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5566 vs_front(vs2), vs_back(vs2), vtmp, vq);
5567 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5568 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5569 load32shorts(vs_front(vs2), zetas);
5570 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5571 vs_front(vs2), vs_back(vs2), vtmp, vq);
5572 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5573
5574 // level 2
5575 // At level 2 coefficients occur in 8 discrete blocks of size 16
5576 // so they are loaded using an ldr at 8 distinct offsets.
5577
5578 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5579 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5580 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
5581 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5582 vs_subv(vs1, __ T8H, vs1, vs2);
5583 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
5584 load64shorts(vs2, zetas);
5585 vs_ldpq(vq, kyberConsts);
5586 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5587 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
5588
5589 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5590 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5591 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5592 vs_subv(vs1, __ T8H, vs1, vs2);
5593 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
5594 load64shorts(vs2, zetas);
5595 vs_ldpq(vq, kyberConsts);
5596 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5597 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5598
5599 // Barrett reduction at indexes where overflow may happen
5600
5601 // load q and the multiplier for the Barrett reduction
5602 __ add(tmpAddr, kyberConsts, 16);
5603 vs_ldpq(vq, tmpAddr);
5604
5605 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
5606 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
5607 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
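// In each 16-bit lane this computes the Barrett step
//   t = (a * barrettMultiplier) >> 26   (sqdmulh yields (2 * a * m) >> 16,
//                                        then sshr shifts by a further 11)
//   a = a - t * kyber_q                 (mlsv)
// leaving a value congruent to the input mod q.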
5608 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5609 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5610 vs_sshr(vs2, __ T8H, vs2, 11);
5611 vs_mlsv(vs1, __ T8H, vs2, vq1);
5612 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5613 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5614 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5615 vs_sshr(vs2, __ T8H, vs2, 11);
5616 vs_mlsv(vs1, __ T8H, vs2, vq1);
5617 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5618
5619 // level 3
5620 // From level 3 upwards coefficients occur in discrete blocks whose size is
5621 // some multiple of 32 so can be loaded using ldpq and suitable indexes.
5622
5623 int offsets2[4] = { 0, 64, 128, 192 };
5624 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5625 vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
5626 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5627 vs_subv(vs1, __ T8H, vs1, vs2);
5628 vs_stpq_indexed(vs3, coeffs, 0, offsets2);
5629 load64shorts(vs2, zetas);
5630 vs_ldpq(vq, kyberConsts);
5631 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5632 vs_stpq_indexed(vs2, coeffs, 32, offsets2);
5633
5634 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5635 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5636 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5637 vs_subv(vs1, __ T8H, vs1, vs2);
5638 vs_stpq_indexed(vs3, coeffs, 256, offsets2);
5639 load64shorts(vs2, zetas);
5640 vs_ldpq(vq, kyberConsts);
5641 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5642 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5643
5644 // level 4
5645
5646 int offsets1[4] = { 0, 32, 128, 160 };
5647 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5648 vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
5649 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5650 vs_subv(vs1, __ T8H, vs1, vs2);
5651 vs_stpq_indexed(vs3, coeffs, 0, offsets1);
5652 load64shorts(vs2, zetas);
5653 vs_ldpq(vq, kyberConsts);
5654 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5655 vs_stpq_indexed(vs2, coeffs, 64, offsets1);
5656
5657 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5658 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5659 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5660 vs_subv(vs1, __ T8H, vs1, vs2);
5661 vs_stpq_indexed(vs3, coeffs, 256, offsets1);
5662 load64shorts(vs2, zetas);
5663 vs_ldpq(vq, kyberConsts);
5664 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5665 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5666
5667 // level 5
5668
5669 __ add(tmpAddr, coeffs, 0);
5670 load64shorts(vs1, tmpAddr);
5671 __ add(tmpAddr, coeffs, 128);
5672 load64shorts(vs2, tmpAddr);
5673 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5674 vs_subv(vs1, __ T8H, vs1, vs2);
5675 __ add(tmpAddr, coeffs, 0);
5676 store64shorts(vs3, tmpAddr);
5677 load64shorts(vs2, zetas);
5678 vs_ldpq(vq, kyberConsts);
5679 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5680 __ add(tmpAddr, coeffs, 128);
5681 store64shorts(vs2, tmpAddr);
5682
5683 load64shorts(vs1, tmpAddr);
5684 __ add(tmpAddr, coeffs, 384);
5685 load64shorts(vs2, tmpAddr);
5686 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5687 vs_subv(vs1, __ T8H, vs1, vs2);
5688 __ add(tmpAddr, coeffs, 256);
5689 store64shorts(vs3, tmpAddr);
5690 load64shorts(vs2, zetas);
5691 vs_ldpq(vq, kyberConsts);
5692 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5693 __ add(tmpAddr, coeffs, 384);
5694 store64shorts(vs2, tmpAddr);
5695
5696 // Barrett reduction at indexes where overflow may happen
5697
5698 // load q and the multiplier for the Barrett reduction
5699 __ add(tmpAddr, kyberConsts, 16);
5700 vs_ldpq(vq, tmpAddr);
5701
5702 int offsets0[2] = { 0, 256 };
5703 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5704 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5705 vs_sshr(vs2, __ T8H, vs2, 11);
5706 vs_mlsv(vs1, __ T8H, vs2, vq1);
5707 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5708
5709 // level 6
5710
5711 __ add(tmpAddr, coeffs, 0);
5712 load64shorts(vs1, tmpAddr);
5713 __ add(tmpAddr, coeffs, 256);
5714 load64shorts(vs2, tmpAddr);
5715 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5716 vs_subv(vs1, __ T8H, vs1, vs2);
5717 __ add(tmpAddr, coeffs, 0);
5718 store64shorts(vs3, tmpAddr);
5719 load64shorts(vs2, zetas);
5720 vs_ldpq(vq, kyberConsts);
5721 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5722 __ add(tmpAddr, coeffs, 256);
5723 store64shorts(vs2, tmpAddr);
5724
5725 __ add(tmpAddr, coeffs, 128);
5726 load64shorts(vs1, tmpAddr);
5727 __ add(tmpAddr, coeffs, 384);
5728 load64shorts(vs2, tmpAddr);
5729 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5730 vs_subv(vs1, __ T8H, vs1, vs2);
5731 __ add(tmpAddr, coeffs, 128);
5732 store64shorts(vs3, tmpAddr);
5733 load64shorts(vs2, zetas);
5734 vs_ldpq(vq, kyberConsts);
5735 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5736 __ add(tmpAddr, coeffs, 384);
5737 store64shorts(vs2, tmpAddr);
5738
5739 // multiply by 2^-n
5740
5741 // load toMont(2^-n mod q)
5742 __ add(tmpAddr, kyberConsts, 48);
5743 __ ldr(v29, __ Q, tmpAddr);
5744
5745 vs_ldpq(vq, kyberConsts);
5746 __ add(tmpAddr, coeffs, 0);
5747 load64shorts(vs1, tmpAddr);
5748 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5749 __ add(tmpAddr, coeffs, 0);
5750 store64shorts(vs2, tmpAddr);
5751
5752 // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
5753 load64shorts(vs1, tmpAddr);
5754 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5755 __ add(tmpAddr, coeffs, 128);
5756 store64shorts(vs2, tmpAddr);
5757
5758 // now tmpAddr contains coeffs + 256
5759 load64shorts(vs1, tmpAddr);
5760 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5761 __ add(tmpAddr, coeffs, 256);
5762 store64shorts(vs2, tmpAddr);
5763
5764 // now tmpAddr contains coeffs + 384
5765 load64shorts(vs1, tmpAddr);
5766 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5767 __ add(tmpAddr, coeffs, 384);
5768 store64shorts(vs2, tmpAddr);
5769
5770 __ leave(); // required for proper stackwalking of RuntimeStub frame
5771 __ mov(r0, zr); // return 0
5772 __ ret(lr);
5773
5774 return start;
5775 }
5776
5777 // Kyber multiply polynomials in the NTT domain.
5778 // Implements
5779 // static int implKyberNttMult(
5780 // short[] result, short[] ntta, short[] nttb, short[] zetas) {}
5781 //
5782 // result (short[256]) = c_rarg0
5783 // ntta (short[256]) = c_rarg1
5784 // nttb (short[256]) = c_rarg2
5785 // zetas (short[128]) = c_rarg3
5786 address generate_kyberNttMult() {
5787
5788 __ align(CodeEntryAlignment);
5789 StubId stub_id = StubId::stubgen_kyberNttMult_id;
5790 StubCodeMark mark(this, stub_id);
5791 address start = __ pc();
5792 __ enter();
5793
5794 const Register result = c_rarg0;
5795 const Register ntta = c_rarg1;
5796 const Register nttb = c_rarg2;
5797 const Register zetas = c_rarg3;
5798
5799 const Register kyberConsts = r10;
5800 const Register limit = r11;
5801
5802 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
5803 VSeq<4> vs3(16), vs4(20);
5804 VSeq<2> vq(30); // pair of constants for montmul: q, qinv
5805 VSeq<2> vz(28); // pair of zetas
5806 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
5807
5808 __ lea(kyberConsts,
5809 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5810
5811 Label kyberNttMult_loop;
5812
5813 __ add(limit, result, 512);
5814
5815 // load q and qinv
5816 vs_ldpq(vq, kyberConsts);
5817
5818 // load R^2 mod q (to convert back from Montgomery representation)
5819 __ add(kyberConsts, kyberConsts, 64);
5820 __ ldr(v27, __ Q, kyberConsts);
5821
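// Each iteration of the loop below multiplies 16 pairs of degree-1
// polynomials over Z_q[X]/(X^2 - zeta). For one pair, (a0 + a1*X) * (b0 + b1*X)
// has result coefficients
//   c0 = a0*b0 + a1*b1*zeta
//   c1 = a0*b1 + a1*b0
// with every product computed as a Montgomery multiplication. The final
// montmul by montRSquareModQ (R^2 mod q) cancels the extra R^-1 factors
// those products introduce.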
5822 __ BIND(kyberNttMult_loop);
5823
5824 // load 16 zetas
5825 vs_ldpq_post(vz, zetas);
5826
5827 // load 2 sets of 32 coefficients from the two input arrays
5828 // interleaved as shorts. i.e. pairs of shorts adjacent in memory
5829 // are striped across pairs of vector registers
5830 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
5831 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
5832 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
5833 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
5834
5835 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
5836 // i.e. montmul the first and second halves of vs1 in order and
5837 // then with one sequence reversed storing the two results in vs3
5838 //
5839 // vs3[0] <- montmul(a0, b0)
5840 // vs3[1] <- montmul(a1, b1)
5841 // vs3[2] <- montmul(a0, b1)
5842 // vs3[3] <- montmul(a1, b0)
5843 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
5844 kyber_montmul16(vs_back(vs3),
5845 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
5846
5847 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
5848 // i.e. montmul the first and second halves of vs4 in order and
5849 // then with one sequence reversed storing the two results in vs1
5850 //
5851 // vs1[0] <- montmul(a2, b2)
5852 // vs1[1] <- montmul(a3, b3)
5853 // vs1[2] <- montmul(a2, b3)
5854 // vs1[3] <- montmul(a3, b2)
5855 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
5856 kyber_montmul16(vs_back(vs1),
5857 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
5858
5859 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
5860 // We can schedule two montmuls at a time if we use a suitable vector
5861 // sequence <vs3[1], vs1[1]>.
5862 int delta = vs1[1]->encoding() - vs3[1]->encoding();
5863 VSeq<2> vs5(vs3[1], delta);
5864
5865 // vs3[1] <- montmul(montmul(a1, b1), z0)
5866 // vs1[1] <- montmul(montmul(a3, b3), z1)
5867 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
5868
5869 // add results in pairs storing in vs3
5870 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
5871 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
5872 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
5873
5874 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
5875 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
5876 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
5877
5878 // vs1 <- montmul(vs3, montRSquareModQ)
5879 kyber_montmul32(vs1, vs3, vc, vs2, vq);
5880
5881 // store back the two pairs of result vectors de-interleaved as 8H elements
5882 // i.e. storing each pair of shorts striped across a register pair adjacent
5883 // in memory
5884 vs_st2_post(vs1, __ T8H, result);
5885
5886 __ cmp(result, limit);
5887 __ br(Assembler::NE, kyberNttMult_loop);
5888
5889 __ leave(); // required for proper stackwalking of RuntimeStub frame
5890 __ mov(r0, zr); // return 0
5891 __ ret(lr);
5892
5893 return start;
5894 }
5895
5896 // Kyber add 2 polynomials.
5897 // Implements
5898 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
5899 //
5900 // result (short[256]) = c_rarg0
5901 // a (short[256]) = c_rarg1
5902 // b (short[256]) = c_rarg2
5903 address generate_kyberAddPoly_2() {
5904
5905 __ align(CodeEntryAlignment);
5906 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
5907 StubCodeMark mark(this, stub_id);
5908 address start = __ pc();
5909 __ enter();
5910
5911 const Register result = c_rarg0;
5912 const Register a = c_rarg1;
5913 const Register b = c_rarg2;
5914
5915 const Register kyberConsts = r11;
5916
5917 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
5918 // So, we can load, add and store the data in 3 groups of 11,
5919 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
5920 // registers. A further constraint is that the mapping needs
5921 // to skip callee saves. So, we allocate the register
5922 // sequences using two 8 sequences, two 2 sequences and two
5923 // single registers.
5924 VSeq<8> vs1_1(0);
5925 VSeq<2> vs1_2(16);
5926 FloatRegister vs1_3 = v28;
5927 VSeq<8> vs2_1(18);
5928 VSeq<2> vs2_2(26);
5929 FloatRegister vs2_3 = v29;
5930
5931 // two constant vector sequences
5932 VSeq<8> vc_1(31, 0);
5933 VSeq<2> vc_2(31, 0);
5934
5935 FloatRegister vc_3 = v31;
5936 __ lea(kyberConsts,
5937 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5938
5939 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
5940 for (int i = 0; i < 3; i++) {
5941 // load 80 or 88 values from a into vs1_1/2/3
5942 vs_ldpq_post(vs1_1, a);
5943 vs_ldpq_post(vs1_2, a);
5944 if (i < 2) {
5945 __ ldr(vs1_3, __ Q, __ post(a, 16));
5946 }
5947 // load 80 or 88 values from b into vs2_1/2/3
5948 vs_ldpq_post(vs2_1, b);
5949 vs_ldpq_post(vs2_2, b);
5950 if (i < 2) {
5951 __ ldr(vs2_3, __ Q, __ post(b, 16));
5952 }
5953 // sum 80 or 88 values across vs1 and vs2 into vs1
5954 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
5955 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
5956 if (i < 2) {
5957 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
5958 }
5959 // add constant to all 80 or 88 results
5960 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
5961 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
5962 if (i < 2) {
5963 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
5964 }
5965 // store 80 or 88 values
5966 vs_stpq_post(vs1_1, result);
5967 vs_stpq_post(vs1_2, result);
5968 if (i < 2) {
5969 __ str(vs1_3, __ Q, __ post(result, 16));
5970 }
5971 }
5972
5973 __ leave(); // required for proper stackwalking of RuntimeStub frame
5974 __ mov(r0, zr); // return 0
5975 __ ret(lr);
5976
5977 return start;
5978 }
5979
5980 // Kyber add 3 polynomials.
5981 // Implements
5982 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
5983 //
5984 // result (short[256]) = c_rarg0
5985 // a (short[256]) = c_rarg1
5986 // b (short[256]) = c_rarg2
5987 // c (short[256]) = c_rarg3
5988 address generate_kyberAddPoly_3() {
5989
5990 __ align(CodeEntryAlignment);
5991 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
5992 StubCodeMark mark(this, stub_id);
5993 address start = __ pc();
5994 __ enter();
5995
5996 const Register result = c_rarg0;
5997 const Register a = c_rarg1;
5998 const Register b = c_rarg2;
5999 const Register c = c_rarg3;
6000
6001 const Register kyberConsts = r11;
6002
6003 // As above we sum 256 sets of values in total i.e. 32 x 8H
6004 // quadwords. So, we can load, add and store the data in 3
6005 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6006 // of 10 or 11 registers. A further constraint is that the
6007 // mapping needs to skip callee saves. So, we allocate the
6008 // register sequences using two 8 sequences, two 2 sequences
6009 // and two single registers.
6010 VSeq<8> vs1_1(0);
6011 VSeq<2> vs1_2(16);
6012 FloatRegister vs1_3 = v28;
6013 VSeq<8> vs2_1(18);
6014 VSeq<2> vs2_2(26);
6015 FloatRegister vs2_3 = v29;
6016
6017 // two constant vector sequences
6018 VSeq<8> vc_1(31, 0);
6019 VSeq<2> vc_2(31, 0);
6020
6021 FloatRegister vc_3 = v31;
6022
6023 __ lea(kyberConsts,
6024 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6025
6026 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6027 for (int i = 0; i < 3; i++) {
6028 // load 80 or 88 values from a into vs1_1/2/3
6029 vs_ldpq_post(vs1_1, a);
6030 vs_ldpq_post(vs1_2, a);
6031 if (i < 2) {
6032 __ ldr(vs1_3, __ Q, __ post(a, 16));
6033 }
6034 // load 80 or 88 values from b into vs2_1/2/3
6035 vs_ldpq_post(vs2_1, b);
6036 vs_ldpq_post(vs2_2, b);
6037 if (i < 2) {
6038 __ ldr(vs2_3, __ Q, __ post(b, 16));
6039 }
6040 // sum 80 or 88 values across vs1 and vs2 into vs1
6041 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6042 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6043 if (i < 2) {
6044 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6045 }
6046 // load 80 or 88 values from c into vs2_1/2/3
6047 vs_ldpq_post(vs2_1, c);
6048 vs_ldpq_post(vs2_2, c);
6049 if (i < 2) {
6050 __ ldr(vs2_3, __ Q, __ post(c, 16));
6051 }
6052 // sum 80 or 88 values across vs1 and vs2 into vs1
6053 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6054 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6055 if (i < 2) {
6056 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6057 }
6058 // add constant to all 80 or 88 results
6059 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6060 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6061 if (i < 2) {
6062 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6063 }
6064 // store 80 or 88 values
6065 vs_stpq_post(vs1_1, result);
6066 vs_stpq_post(vs1_2, result);
6067 if (i < 2) {
6068 __ str(vs1_3, __ Q, __ post(result, 16));
6069 }
6070 }
6071
6072 __ leave(); // required for proper stackwalking of RuntimeStub frame
6073 __ mov(r0, zr); // return 0
6074 __ ret(lr);
6075
6076 return start;
6077 }
6078
6079 // Kyber parse XOF output to polynomial coefficient candidates
6080 // or decodePoly(12, ...).
6081 // Implements
6082 // static int implKyber12To16(
6083 // byte[] condensed, int index, short[] parsed, int parsedLength) {}
6084 //
6085 // we assume that parsed and condensed are allocated such that for
6086 // n = (parsedLength + 63) / 64
6087 // n blocks of 96 bytes of input can be processed, i.e.
6088 // index + n * 96 <= condensed.length and
6089 // n * 64 <= parsed.length
6090 //
6091 // condensed (byte[]) = c_rarg0
6092 // condensedIndex = c_rarg1
6093 // parsed (short[]) = c_rarg2
6094 // parsedLength = c_rarg3
6095 address generate_kyber12To16() {
6096 Label L_F00, L_loop;
6097
6098 __ align(CodeEntryAlignment);
6099 StubId stub_id = StubId::stubgen_kyber12To16_id;
6100 StubCodeMark mark(this, stub_id);
6101 address start = __ pc();
6102 __ enter();
6103
6104 const Register condensed = c_rarg0;
6105 const Register condensedOffs = c_rarg1;
6106 const Register parsed = c_rarg2;
6107 const Register parsedLength = c_rarg3;
6108
6109 const Register tmpAddr = r11;
6110
6111 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
6112 // quadwords so we need a 6 vector sequence for the inputs.
6113 // Parsing produces 64 shorts, employing two 8 vector
6114 // sequences to store and combine the intermediate data.
6115 VSeq<6> vin(24);
6116 VSeq<8> va(0), vb(16);
6117
6118 __ adr(tmpAddr, L_F00);
6119 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6120 __ add(condensed, condensed, condensedOffs);
6121
6122 __ BIND(L_loop);
6123 // load 96 (6 x 16B) byte values
6124 vs_ld3_post(vin, __ T16B, condensed);
6125
6126 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6127 // holds 48 (16x3) contiguous bytes from memory striped
6128 // horizontally across each of the 16 byte lanes. Equivalently,
6129 // that is 16 pairs of 12-bit integers. Likewise the back half
6130 // holds the next 48 bytes in the same arrangement.
6131
6132 // Each vector in the front half can also be viewed as a vertical
6133 // strip across the 16 pairs of 12 bit integers. Each byte in
6134 // vin[0] stores the low 8 bits of the first int in a pair. Each
6135 // byte in vin[1] stores the high 4 bits of the first int and the
6136 // low 4 bits of the second int. Each byte in vin[2] stores the
6137 // high 8 bits of the second int. Likewise the vectors in second
6138 // half.
6139
6140 // Converting the data to 16-bit shorts requires first of all
6141 // expanding each of the 6 x 16B vectors into 6 corresponding
6142 // pairs of 8H vectors. Mask, shift and add operations on the
6143 // resulting vector pairs can be used to combine 4 and 8 bit
6144 // parts of related 8H vector elements.
6145 //
6146 // The middle vectors (vin[2] and vin[5]) are actually expanded
6147 // twice, one copy manipulated to provide the lower 4 bits
6148 // belonging to the first short in a pair and another copy
6149 // manipulated to provide the higher 4 bits belonging to the
6150 // second short in a pair. This is why the vector sequences va
6151 // and vb used to hold the expanded 8H elements are of length 8.
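// In scalar terms, for each byte triple (b0, b1, b2) encoding two 12-bit
// values s0 and s1:
//   s0 = b0 | ((b1 & 0x0f) << 8)
//   s1 = (b1 >> 4) | (b2 << 4)
// which is what the mask/shift/add steps below compute 16 lanes at a time.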
6152
6153 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6154 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6155 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6156 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6157 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6158 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6159 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6160 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6161
6162 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6163 // and vb[4:5]
6164 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6165 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6166 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6167 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6168 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6169 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6170
6171 // shift lo byte of copy 1 of the middle stripe into the high byte
6172 __ shl(va[2], __ T8H, va[2], 8);
6173 __ shl(va[3], __ T8H, va[3], 8);
6174 __ shl(vb[2], __ T8H, vb[2], 8);
6175 __ shl(vb[3], __ T8H, vb[3], 8);
6176
6177 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6178 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6179 // are in bit positions [4..11].
6180 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6181 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6182 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6183 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
6184
6185 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
6186 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
6187 // copy2
6188 __ andr(va[2], __ T16B, va[2], v31);
6189 __ andr(va[3], __ T16B, va[3], v31);
6190 __ ushr(va[4], __ T8H, va[4], 4);
6191 __ ushr(va[5], __ T8H, va[5], 4);
6192 __ andr(vb[2], __ T16B, vb[2], v31);
6193 __ andr(vb[3], __ T16B, vb[3], v31);
6194 __ ushr(vb[4], __ T8H, vb[4], 4);
6195 __ ushr(vb[5], __ T8H, vb[5], 4);
6196
6197 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
6198 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
6199 // n.b. the ordering ensures: i) inputs are consumed before they
6200 // are overwritten ii) the order of 16-bit results across successive
6201 // pairs of vectors in va and then vb reflects the order of the
6202 // corresponding 12-bit inputs
6203 __ addv(va[0], __ T8H, va[0], va[2]);
6204 __ addv(va[2], __ T8H, va[1], va[3]);
6205 __ addv(va[1], __ T8H, va[4], va[6]);
6206 __ addv(va[3], __ T8H, va[5], va[7]);
6207 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6208 __ addv(vb[2], __ T8H, vb[1], vb[3]);
6209 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6210 __ addv(vb[3], __ T8H, vb[5], vb[7]);
6211
6212 // store 64 results interleaved as shorts
6213 vs_st2_post(vs_front(va), __ T8H, parsed);
6214 vs_st2_post(vs_front(vb), __ T8H, parsed);
6215
6216 __ sub(parsedLength, parsedLength, 64);
6217 __ cmp(parsedLength, (u1)0);
6218 __ br(Assembler::GT, L_loop);
6219
6220 __ leave(); // required for proper stackwalking of RuntimeStub frame
6221 __ mov(r0, zr); // return 0
6222 __ ret(lr);
6223
6224 // bind label and generate constant data used by this stub
6225 __ BIND(L_F00);
6226 __ emit_int64(0x0f000f000f000f00);
6227 __ emit_int64(0x0f000f000f000f00);
6228
6229 return start;
6230 }
6231
6232 // Kyber Barrett reduce function.
6233 // Implements
6234 // static int implKyberBarrettReduce(short[] coeffs) {}
6235 //
6236 // coeffs (short[256]) = c_rarg0
6237 address generate_kyberBarrettReduce() {
6238
6239 __ align(CodeEntryAlignment);
6240 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6241 StubCodeMark mark(this, stub_id);
6242 address start = __ pc();
6243 __ enter();
6244
6245 const Register coeffs = c_rarg0;
6246
6247 const Register kyberConsts = r10;
6248 const Register result = r11;
6249
    // As above we process a total of 256 values, i.e. 32 x 8H
    // quadwords. So, we can load, reduce and store the data in 3
    // groups of 11, 11 and 10 quadwords at a time, i.e. we need to
    // map sets of 10 or 11 registers. A further constraint is that
    // the mapping needs to skip callee-saved registers. So, we
    // allocate the register groups using two sequences of 8, two
    // sequences of 2 and two single registers.
6257 VSeq<8> vs1_1(0);
6258 VSeq<2> vs1_2(16);
6259 FloatRegister vs1_3 = v28;
6260 VSeq<8> vs2_1(18);
6261 VSeq<2> vs2_2(26);
6262 FloatRegister vs2_3 = v29;
6263
6264 // we also need a pair of corresponding constant sequences
6265
6266 VSeq<8> vc1_1(30, 0);
6267 VSeq<2> vc1_2(30, 0);
6268 FloatRegister vc1_3 = v30; // for kyber_q
6269
6270 VSeq<8> vc2_1(31, 0);
6271 VSeq<2> vc2_2(31, 0);
6272 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
6273
6274 __ add(result, coeffs, 0);
6275 __ lea(kyberConsts,
6276 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6277
6278 // load q and the multiplier for the Barrett reduction
6279 __ add(kyberConsts, kyberConsts, 16);
6280 __ ldpq(vc1_3, vc2_3, kyberConsts);
6281
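    // For reference, the reduction applied to each 16-bit coefficient c
    // is, in rough scalar terms (a sketch; the multiplier loaded above is
    // assumed to be round(2^26 / q)):
    //
    //   int16_t t = (int16_t)(((int32_t)c * multiplier) >> 26);
    //   c = c - t * q;
    //
    // The sqdmulh below computes (2 * c * multiplier) >> 16, so a further
    // arithmetic shift right by 11 yields the >> 26 quotient estimate.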
6282 for (int i = 0; i < 3; i++) {
6283 // load 80 or 88 coefficients
6284 vs_ldpq_post(vs1_1, coeffs);
6285 vs_ldpq_post(vs1_2, coeffs);
6286 if (i < 2) {
6287 __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
6288 }
6289
6290 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
6291 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
6292 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
6293 if (i < 2) {
6294 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
6295 }
6296
6297 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
6298 vs_sshr(vs2_1, __ T8H, vs2_1, 11);
6299 vs_sshr(vs2_2, __ T8H, vs2_2, 11);
6300 if (i < 2) {
6301 __ sshr(vs2_3, __ T8H, vs2_3, 11);
6302 }
6303
6304 // vs1 <- vs1 - vs2 * kyber_q
6305 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
6306 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
6307 if (i < 2) {
6308 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
6309 }
6310
6311 vs_stpq_post(vs1_1, result);
6312 vs_stpq_post(vs1_2, result);
6313 if (i < 2) {
6314 __ str(vs1_3, __ Q, __ post(result, 16));
6315 }
6316 }
6317
6318 __ leave(); // required for proper stackwalking of RuntimeStub frame
6319 __ mov(r0, zr); // return 0
6320 __ ret(lr);
6321
6322 return start;
6323 }
6324
6325
6326 // Dilithium-specific montmul helper routines that generate parallel
6327 // code for, respectively, a single 4x4s vector sequence montmul or
6328 // two such multiplies in a row.
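  //
  // For reference, the per-lane operation these helpers schedule is a
  // 32-bit Montgomery multiply, a * b * 2^-32 mod q. A scalar sketch
  // (names hypothetical; q and qinv are loaded from the constants area
  // and qinv is assumed to be the inverse of q mod 2^32):
  //
  //   int32_t montmul(int32_t a, int32_t b) {
  //     int64_t p = (int64_t)a * b;
  //     int32_t m = (int32_t)p * qinv;                 // p * q^-1 mod 2^32
  //     return (int32_t)((p - (int64_t)m * q) >> 32);  // Montgomery reduce
  //   }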
6329
6330 // Perform 16 32-bit Montgomery multiplications in parallel
6331 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
6332 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6333 // Use the helper routine to schedule a 4x4S Montgomery multiply.
6334 // It will assert that the register use is valid
6335 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
6336 }
6337
6338 // Perform 2x16 32-bit Montgomery multiplications in parallel
6339 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
6340 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6341 // Schedule two successive 4x4S multiplies via the montmul helper
6342 // on the front and back halves of va, vb and vc. The helper will
6343 // assert that the register use has no overlap conflicts on each
6344 // individual call but we also need to ensure that the necessary
6345 // disjoint/equality constraints are met across both calls.
6346
6347 // vb, vc, vtmp and vq must be disjoint. va must either be
6348 // disjoint from all other registers or equal vc
6349
6350 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6351 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6352 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6353
6354 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6355 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6356
6357 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6358
6359 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6360 assert(vs_disjoint(va, vb), "va and vb overlap");
6361 assert(vs_disjoint(va, vq), "va and vq overlap");
6362 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6363
6364 // We multiply the front and back halves of each sequence 4 at a
6365 // time because
6366 //
6367 // 1) we are currently only able to get 4-way instruction
6368 // parallelism at best
6369 //
6370 // 2) we need registers for the constants in vq and temporary
6371 // scratch registers to hold intermediate results so vtmp can only
6372 // be a VSeq<4> which means we only have 4 scratch slots.
6373
6374 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6375 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6376 }
6377
6378 // Perform combined montmul then add/sub on 4x4S vectors.
6379 void dilithium_montmul16_sub_add(
6380 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6381 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6382 // compute a = montmul(a1, c)
6383 dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
6385 vs_subv(va1, __ T4S, va0, vc);
6386 // and a0 = a0 + a
6387 vs_addv(va0, __ T4S, va0, vc);
6388 }
6389
  // Perform combined add/sub then montmul on 4x4S vectors.
6391 void dilithium_sub_add_montmul16(
6392 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6393 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6394 // compute c = a0 - a1
6395 vs_subv(vtmp1, __ T4S, va0, va1);
6396 // output a0 = a0 + a1
6397 vs_addv(va0, __ T4S, va0, va1);
6398 // output a1 = b montmul c
6399 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6400 }
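
  // Taken together, the two helpers above implement the usual NTT
  // butterflies on vectors of coefficients. In scalar terms (a sketch,
  // with zeta standing for the current root-of-unity constant):
  //
  //   montmul then sub/add:  t = montmul(a1, zeta); a1 = a0 - t; a0 = a0 + t;
  //   sub/add then montmul:  t = a0 - a1; a0 = a0 + a1; a1 = montmul(t, zeta);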
6401
6402 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6403 // in the Java implementation come in sequences of at least 8, so we
6404 // can use ldpq to collect the corresponding data into pairs of vector
6405 // registers.
  // We collect the coefficients corresponding to the 'j+l' indexes into
  // vector registers v0-v7 and the zetas into vector registers v16-v23,
  // then do the (Montgomery) multiplications by the zetas in parallel
  // into v16-v23. Next we load the coefficients corresponding to the 'j'
  // indexes into v0-v7, do the additions into v24-v31 and the
  // subtractions into v0-v7, and finally save the results back to the
  // coeffs array.
6412 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6413 const Register coeffs, const Register zetas) {
6414 int c1 = 0;
6415 int c2 = 512;
6416 int startIncr;
6417 // don't use callee save registers v8 - v15
6418 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6419 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6420 VSeq<2> vq(30); // n.b. constants overlap vs3
6421 int offsets[4] = { 0, 32, 64, 96 };
6422
6423 for (int level = 0; level < 5; level++) {
6424 int c1Start = c1;
6425 int c2Start = c2;
6426 if (level == 3) {
6427 offsets[1] = 32;
6428 offsets[2] = 128;
6429 offsets[3] = 160;
6430 } else if (level == 4) {
6431 offsets[1] = 64;
6432 offsets[2] = 128;
6433 offsets[3] = 192;
6434 }
6435
      // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
6437 // time at 4 different offsets and multiply them in order by the
6438 // next set of input values. So we employ indexed load and store
6439 // pair instructions with arrangement 4S.
6440 for (int i = 0; i < 4; i++) {
6441 // reload q and qinv
6442 vs_ldpq(vq, dilithiumConsts); // qInv, q
6443 // load 8x4S coefficients via second start pos == c2
6444 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
6445 // load next 8x4S inputs == b
6446 vs_ldpq_post(vs2, zetas);
6447 // compute a == c2 * b mod MONT_Q
6448 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6449 // load 8x4s coefficients via first start pos == c1
6450 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6451 // compute a1 = c1 + a
6452 vs_addv(vs3, __ T4S, vs1, vs2);
6453 // compute a2 = c1 - a
6454 vs_subv(vs1, __ T4S, vs1, vs2);
6455 // output a1 and a2
6456 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6457 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
6458
6459 int k = 4 * level + i;
6460
6461 if (k > 7) {
6462 startIncr = 256;
6463 } else if (k == 5) {
6464 startIncr = 384;
6465 } else {
6466 startIncr = 128;
6467 }
6468
6469 c1Start += startIncr;
6470 c2Start += startIncr;
6471 }
6472
6473 c2 /= 2;
6474 }
6475 }
6476
6477 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
6478 // Implements the method
6479 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
  // of the sun.security.provider.ML_DSA class.
6481 //
6482 // coeffs (int[256]) = c_rarg0
6483 // zetas (int[256]) = c_rarg1
6484 address generate_dilithiumAlmostNtt() {
6485
6486 __ align(CodeEntryAlignment);
6487 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
6488 StubCodeMark mark(this, stub_id);
6489 address start = __ pc();
6490 __ enter();
6491
6492 const Register coeffs = c_rarg0;
6493 const Register zetas = c_rarg1;
6494
6495 const Register tmpAddr = r9;
6496 const Register dilithiumConsts = r10;
6497 const Register result = r11;
6498 // don't use callee save registers v8 - v15
6499 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6500 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6501 VSeq<2> vq(30); // n.b. constants overlap vs3
6502 int offsets[4] = { 0, 32, 64, 96};
6503 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6504 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6505 __ add(result, coeffs, 0);
6506 __ lea(dilithiumConsts,
6507 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6508
6509 // Each level represents one iteration of the outer for loop of the Java version.
6510
6511 // level 0-4
6512 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
6513
6514 // level 5
6515
6516 // At level 5 the coefficients we need to combine with the zetas
6517 // are grouped in memory in blocks of size 4. So, for both sets of
6518 // coefficients we load 4 adjacent values at 8 different offsets
6519 // using an indexed ldr with register variant Q and multiply them
6520 // in sequence order by the next set of inputs. Likewise we store
    // the results using an indexed str with register variant Q.
6522 for (int i = 0; i < 1024; i += 256) {
6523 // reload constants q, qinv each iteration as they get clobbered later
6524 vs_ldpq(vq, dilithiumConsts); // qInv, q
6525 // load 32 (8x4S) coefficients via first offsets = c1
6526 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6527 // load next 32 (8x4S) inputs = b
6528 vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
6530 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6531 // load 32 (8x4S) coefficients via second offsets = c2
6532 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
6533 // add/sub with result of multiply
      vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2); // a2 = c2 - a
6536 // write back new coefficients using same offsets
6537 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
6538 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
6539 }
6540
6541 // level 6
6542 // At level 6 the coefficients we need to combine with the zetas
6543 // are grouped in memory in pairs, the first two being montmul
6544 // inputs and the second add/sub inputs. We can still implement
6545 // the montmul+sub+add using 4-way parallelism but only if we
6546 // combine the coefficients with the zetas 16 at a time. We load 8
6547 // adjacent values at 4 different offsets using an ld2 load with
6548 // arrangement 2D. That interleaves the lower and upper halves of
6549 // each pair of quadwords into successive vector registers. We
6550 // then need to montmul the 4 even elements of the coefficients
6551 // register sequence by the zetas in order and then add/sub the 4
6552 // odd elements of the coefficients register sequence. We use an
6553 // equivalent st2 operation to store the results back into memory
6554 // de-interleaved.
6555 for (int i = 0; i < 1024; i += 128) {
6556 // reload constants q, qinv each iteration as they get clobbered later
6557 vs_ldpq(vq, dilithiumConsts); // qInv, q
6558 // load interleaved 16 (4x2D) coefficients via offsets
6559 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6560 // load next 16 (4x4S) inputs
6561 vs_ldpq_post(vs_front(vs2), zetas);
6562 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6563 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6564 vs_front(vs2), vtmp, vq);
6565 // store interleaved 16 (4x2D) coefficients via offsets
6566 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6567 }
6568
6569 // level 7
6570 // At level 7 the coefficients we need to combine with the zetas
    // occur singly with montmul inputs alternating with add/sub
6572 // inputs. Once again we can use 4-way parallelism to combine 16
6573 // zetas at a time. However, we have to load 8 adjacent values at
6574 // 4 different offsets using an ld2 load with arrangement 4S. That
    // interleaves the odd words of each pair into one
6576 // coefficients vector register and the even words of the pair
6577 // into the next register. We then need to montmul the 4 even
6578 // elements of the coefficients register sequence by the zetas in
6579 // order and then add/sub the 4 odd elements of the coefficients
6580 // register sequence. We use an equivalent st2 operation to store
6581 // the results back into memory de-interleaved.
6582
6583 for (int i = 0; i < 1024; i += 128) {
6584 // reload constants q, qinv each iteration as they get clobbered later
6585 vs_ldpq(vq, dilithiumConsts); // qInv, q
6586 // load interleaved 16 (4x4S) coefficients via offsets
6587 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6588 // load next 16 (4x4S) inputs
6589 vs_ldpq_post(vs_front(vs2), zetas);
6590 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6591 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6592 vs_front(vs2), vtmp, vq);
6593 // store interleaved 16 (4x4S) coefficients via offsets
6594 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6595 }
6596 __ leave(); // required for proper stackwalking of RuntimeStub frame
6597 __ mov(r0, zr); // return 0
6598 __ ret(lr);
6599
6600 return start;
6601 }
6602
6603 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6604 // in the Java implementation come in sequences of at least 8, so we
6605 // can use ldpq to collect the corresponding data into pairs of vector
6606 // registers
  // We collect the coefficients that correspond to the 'j's into vs1 and
  // the coefficients that correspond to the 'j+l's into vs2, then do the
  // additions into vs3 and the subtractions into vs1. We save the result
  // of the additions, load the zetas into vs2, do the (Montgomery)
  // multiplications by the zetas in parallel into vs2 and finally save
  // the results back to the coeffs array.
6613 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
6614 const Register coeffs, const Register zetas) {
6615 int c1 = 0;
6616 int c2 = 32;
6617 int startIncr;
6618 int offsets[4];
6619 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6620 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6621 VSeq<2> vq(30); // n.b. constants overlap vs3
6622
6623 offsets[0] = 0;
6624
6625 for (int level = 3; level < 8; level++) {
6626 int c1Start = c1;
6627 int c2Start = c2;
6628 if (level == 3) {
6629 offsets[1] = 64;
6630 offsets[2] = 128;
6631 offsets[3] = 192;
6632 } else if (level == 4) {
6633 offsets[1] = 32;
6634 offsets[2] = 128;
6635 offsets[3] = 160;
6636 } else {
6637 offsets[1] = 32;
6638 offsets[2] = 64;
6639 offsets[3] = 96;
6640 }
6641
6642 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
6643 // time at 4 different offsets and multiply them in order by the
6644 // next set of input values. So we employ indexed load and store
6645 // pair instructions with arrangement 4S.
6646 for (int i = 0; i < 4; i++) {
6647 // load v1 32 (8x4S) coefficients relative to first start index
6648 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6649 // load v2 32 (8x4S) coefficients relative to second start index
6650 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
        // a0 = v1 + v2 -- n.b. clobbers vq which overlaps vs3
6652 vs_addv(vs3, __ T4S, vs1, vs2);
6653 // a1 = v1 - v2
6654 vs_subv(vs1, __ T4S, vs1, vs2);
        // save a0 relative to first start index
6656 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6657 // load constants q, qinv each iteration as they get clobbered above
6658 vs_ldpq(vq, dilithiumConsts); // qInv, q
6659 // load b next 32 (8x4S) inputs
6660 vs_ldpq_post(vs2, zetas);
6661 // a = a1 montmul b
6662 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6663 // save a relative to second start index
6664 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
6665
6666 int k = 4 * level + i;
6667
6668 if (k < 24) {
6669 startIncr = 256;
6670 } else if (k == 25) {
6671 startIncr = 384;
6672 } else {
6673 startIncr = 128;
6674 }
6675
6676 c1Start += startIncr;
6677 c2Start += startIncr;
6678 }
6679
6680 c2 *= 2;
6681 }
6682 }
6683
6684 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
6685 // Implements the method
6686 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
6687 // the sun.security.provider.ML_DSA class.
6688 //
6689 // coeffs (int[256]) = c_rarg0
6690 // zetas (int[256]) = c_rarg1
6691 address generate_dilithiumAlmostInverseNtt() {
6692
6693 __ align(CodeEntryAlignment);
6694 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
6695 StubCodeMark mark(this, stub_id);
6696 address start = __ pc();
6697 __ enter();
6698
6699 const Register coeffs = c_rarg0;
6700 const Register zetas = c_rarg1;
6701
6702 const Register tmpAddr = r9;
6703 const Register dilithiumConsts = r10;
6704 const Register result = r11;
6705 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6706 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6707 VSeq<2> vq(30); // n.b. constants overlap vs3
6708 int offsets[4] = { 0, 32, 64, 96 };
6709 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6710 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6711
6712 __ add(result, coeffs, 0);
6713 __ lea(dilithiumConsts,
6714 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6715
6716 // Each level represents one iteration of the outer for loop of the Java version
6717
6718 // level 0
6719 // At level 0 we need to interleave adjacent quartets of
6720 // coefficients before we multiply and add/sub by the next 16
6721 // zetas just as we did for level 7 in the multiply code. So we
6722 // load and store the values using an ld2/st2 with arrangement 4S.
6723 for (int i = 0; i < 1024; i += 128) {
6724 // load constants q, qinv
6725 // n.b. this can be moved out of the loop as they do not get
6726 // clobbered by first two loops
6727 vs_ldpq(vq, dilithiumConsts); // qInv, q
6728 // a0/a1 load interleaved 32 (8x4S) coefficients
6729 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6730 // b load next 32 (8x4S) inputs
6731 vs_ldpq_post(vs_front(vs2), zetas);
6732 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6733 // n.b. second half of vs2 provides temporary register storage
6734 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6735 vs_front(vs2), vs_back(vs2), vtmp, vq);
6736 // a0/a1 store interleaved 32 (8x4S) coefficients
6737 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6738 }
6739
6740 // level 1
6741 // At level 1 we need to interleave pairs of adjacent pairs of
6742 // coefficients before we multiply by the next 16 zetas just as we
6743 // did for level 6 in the multiply code. So we load and store the
    // values using an ld2/st2 with arrangement 2D.
6745 for (int i = 0; i < 1024; i += 128) {
6746 // a0/a1 load interleaved 32 (8x2D) coefficients
6747 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6748 // b load next 16 (4x4S) inputs
6749 vs_ldpq_post(vs_front(vs2), zetas);
6750 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6751 // n.b. second half of vs2 provides temporary register storage
6752 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6753 vs_front(vs2), vs_back(vs2), vtmp, vq);
6754 // a0/a1 store interleaved 32 (8x2D) coefficients
6755 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6756 }
6757
6758 // level 2
6759 // At level 2 coefficients come in blocks of 4. So, we load 4
6760 // adjacent coefficients at 8 distinct offsets for both the first
6761 // and second coefficient sequences, using an ldr with register
6762 // variant Q then combine them with next set of 32 zetas. Likewise
6763 // we store the results using an str with register variant Q.
6764 for (int i = 0; i < 1024; i += 256) {
6765 // c0 load 32 (8x4S) coefficients via first offsets
6766 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6767 // c1 load 32 (8x4S) coefficients via second offsets
      vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
6769 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
6770 vs_addv(vs3, __ T4S, vs1, vs2);
6771 // c = c0 - c1
6772 vs_subv(vs1, __ T4S, vs1, vs2);
6773 // store a0 32 (8x4S) coefficients via first offsets
6774 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
6775 // b load 32 (8x4S) next inputs
6776 vs_ldpq_post(vs2, zetas);
6777 // reload constants q, qinv -- they were clobbered earlier
6778 vs_ldpq(vq, dilithiumConsts); // qInv, q
6779 // compute a1 = b montmul c
6780 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6781 // store a1 32 (8x4S) coefficients via second offsets
6782 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
6783 }
6784
6785 // level 3-7
6786 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
6787
6788 __ leave(); // required for proper stackwalking of RuntimeStub frame
6789 __ mov(r0, zr); // return 0
6790 __ ret(lr);
6791
6792 return start;
6793 }
6794
6795 // Dilithium multiply polynomials in the NTT domain.
6796 // Straightforward implementation of the method
6797 // static int implDilithiumNttMult(
  //     int[] result, int[] ntta, int[] nttb) {} of
6799 // the sun.security.provider.ML_DSA class.
6800 //
6801 // result (int[256]) = c_rarg0
6802 // poly1 (int[256]) = c_rarg1
6803 // poly2 (int[256]) = c_rarg2
6804 address generate_dilithiumNttMult() {
6805
6806 __ align(CodeEntryAlignment);
6807 StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
6808 StubCodeMark mark(this, stub_id);
6809 address start = __ pc();
6810 __ enter();
6811
6812 Label L_loop;
6813
6814 const Register result = c_rarg0;
6815 const Register poly1 = c_rarg1;
6816 const Register poly2 = c_rarg2;
6817
6818 const Register dilithiumConsts = r10;
6819 const Register len = r11;
6820
6821 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6822 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6823 VSeq<2> vq(30); // n.b. constants overlap vs3
6824 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
6825
6826 __ lea(dilithiumConsts,
6827 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6828
6829 // load constants q, qinv
6830 vs_ldpq(vq, dilithiumConsts); // qInv, q
6831 // load constant rSquare into v29
6832 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
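    // n.b. each montmul leaves a factor of 2^-32 in its result, so the
    // second montmul by rSquare (assumed to hold (2^32)^2 mod q) cancels
    // both factors and the value stored back is the plain product of the
    // two input coefficients mod q.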
6833
6834 __ mov(len, zr);
6835 __ add(len, len, 1024);
6836
6837 __ BIND(L_loop);
6838
6839 // b load 32 (8x4S) next inputs from poly1
6840 vs_ldpq_post(vs1, poly1);
6841 // c load 32 (8x4S) next inputs from poly2
6842 vs_ldpq_post(vs2, poly2);
6843 // compute a = b montmul c
6844 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6845 // compute a = rsquare montmul a
6846 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
6847 // save a 32 (8x4S) results
6848 vs_stpq_post(vs2, result);
6849
6850 __ sub(len, len, 128);
6851 __ cmp(len, (u1)128);
6852 __ br(Assembler::GE, L_loop);
6853
6854 __ leave(); // required for proper stackwalking of RuntimeStub frame
6855 __ mov(r0, zr); // return 0
6856 __ ret(lr);
6857
6858 return start;
6859 }
6860
  // Dilithium Montgomery multiply an array by a constant.
6862 // A straightforward implementation of the method
6863 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  // of the sun.security.provider.ML_DSA class
6865 //
6866 // coeffs (int[256]) = c_rarg0
6867 // constant (int) = c_rarg1
6868 address generate_dilithiumMontMulByConstant() {
6869
6870 __ align(CodeEntryAlignment);
6871 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
6872 StubCodeMark mark(this, stub_id);
6873 address start = __ pc();
6874 __ enter();
6875
6876 Label L_loop;
6877
6878 const Register coeffs = c_rarg0;
6879 const Register constant = c_rarg1;
6880
6881 const Register dilithiumConsts = r10;
6882 const Register result = r11;
6883 const Register len = r12;
6884
6885 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6886 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6887 VSeq<2> vq(30); // n.b. constants overlap vs3
6888 VSeq<8> vconst(29, 0); // for montmul by constant
6889
6890 // results track inputs
6891 __ add(result, coeffs, 0);
6892 __ lea(dilithiumConsts,
6893 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6894
    // load constants q, qinv -- they are not clobbered by the loop below
6896 vs_ldpq(vq, dilithiumConsts); // qInv, q
6897 // copy caller supplied constant across vconst
6898 __ dup(vconst[0], __ T4S, constant);
6899 __ mov(len, zr);
6900 __ add(len, len, 1024);
6901
6902 __ BIND(L_loop);
6903
6904 // load next 32 inputs
6905 vs_ldpq_post(vs2, coeffs);
6906 // mont mul by constant
6907 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
6908 // write next 32 results
6909 vs_stpq_post(vs2, result);
6910
6911 __ sub(len, len, 128);
6912 __ cmp(len, (u1)128);
6913 __ br(Assembler::GE, L_loop);
6914
6915 __ leave(); // required for proper stackwalking of RuntimeStub frame
6916 __ mov(r0, zr); // return 0
6917 __ ret(lr);
6918
6919 return start;
6920 }
6921
6922 // Dilithium decompose poly.
6923 // Implements the method
  // static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
  //     int[] highPart, int twoGamma2, int multiplier) {}
6925 // of the sun.security.provider.ML_DSA class
6926 //
6927 // input (int[256]) = c_rarg0
6928 // lowPart (int[256]) = c_rarg1
6929 // highPart (int[256]) = c_rarg2
6930 // twoGamma2 (int) = c_rarg3
6931 // multiplier (int) = c_rarg4
6932 address generate_dilithiumDecomposePoly() {
6933
6934 __ align(CodeEntryAlignment);
6935 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
6936 StubCodeMark mark(this, stub_id);
6937 address start = __ pc();
6938 Label L_loop;
6939
6940 const Register input = c_rarg0;
6941 const Register lowPart = c_rarg1;
6942 const Register highPart = c_rarg2;
6943 const Register twoGamma2 = c_rarg3;
6944 const Register multiplier = c_rarg4;
6945
6946 const Register len = r9;
6947 const Register dilithiumConsts = r10;
6948 const Register tmp = r11;
6949
6950 // 6 independent sets of 4x4s values
6951 VSeq<4> vs1(0), vs2(4), vs3(8);
6952 VSeq<4> vs4(12), vs5(16), vtmp(20);
6953
6954 // 7 constants for cross-multiplying
6955 VSeq<4> one(25, 0);
6956 VSeq<4> qminus1(26, 0);
6957 VSeq<4> g2(27, 0);
6958 VSeq<4> twog2(28, 0);
6959 VSeq<4> mult(29, 0);
6960 VSeq<4> q(30, 0);
6961 VSeq<4> qadd(31, 0);
6962
6963 __ enter();
6964
6965 __ lea(dilithiumConsts,
6966 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6967
6968 // save callee-saved registers
6969 __ stpd(v8, v9, __ pre(sp, -64));
6970 __ stpd(v10, v11, Address(sp, 16));
6971 __ stpd(v12, v13, Address(sp, 32));
6972 __ stpd(v14, v15, Address(sp, 48));
6973
6974 // populate constant registers
6975 __ mov(tmp, zr);
6976 __ add(tmp, tmp, 1);
6977 __ dup(one[0], __ T4S, tmp); // 1
6978 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
6979 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
6980 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
6981 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
6982 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
6983 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
6984
6985 __ mov(len, zr);
6986 __ add(len, len, 1024);
6987
6988 __ BIND(L_loop);
6989
6990 // load next 4x4S inputs interleaved: rplus --> vs1
6991 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
6992
6993 // rplus = rplus - ((rplus + qadd) >> 23) * q
6994 vs_addv(vtmp, __ T4S, vs1, qadd);
6995 vs_sshr(vtmp, __ T4S, vtmp, 23);
6996 vs_mulv(vtmp, __ T4S, vtmp, q);
6997 vs_subv(vs1, __ T4S, vs1, vtmp);
6998
6999 // rplus = rplus + ((rplus >> 31) & dilithium_q);
7000 vs_sshr(vtmp, __ T4S, vs1, 31);
7001 vs_andr(vtmp, vtmp, q);
7002 vs_addv(vs1, __ T4S, vs1, vtmp);
7003
7004 // quotient --> vs2
7005 // int quotient = (rplus * multiplier) >> 22;
7006 vs_mulv(vtmp, __ T4S, vs1, mult);
7007 vs_sshr(vs2, __ T4S, vtmp, 22);
7008
7009 // r0 --> vs3
7010 // int r0 = rplus - quotient * twoGamma2;
7011 vs_mulv(vtmp, __ T4S, vs2, twog2);
7012 vs_subv(vs3, __ T4S, vs1, vtmp);
7013
7014 // mask --> vs4
7015 // int mask = (twoGamma2 - r0) >> 22;
7016 vs_subv(vtmp, __ T4S, twog2, vs3);
7017 vs_sshr(vs4, __ T4S, vtmp, 22);
7018
7019 // r0 -= (mask & twoGamma2);
7020 vs_andr(vtmp, vs4, twog2);
7021 vs_subv(vs3, __ T4S, vs3, vtmp);
7022
7023 // quotient += (mask & 1);
7024 vs_andr(vtmp, vs4, one);
7025 vs_addv(vs2, __ T4S, vs2, vtmp);
7026
7027 // mask = (twoGamma2 / 2 - r0) >> 31;
7028 vs_subv(vtmp, __ T4S, g2, vs3);
7029 vs_sshr(vs4, __ T4S, vtmp, 31);
7030
7031 // r0 -= (mask & twoGamma2);
7032 vs_andr(vtmp, vs4, twog2);
7033 vs_subv(vs3, __ T4S, vs3, vtmp);
7034
7035 // quotient += (mask & 1);
7036 vs_andr(vtmp, vs4, one);
7037 vs_addv(vs2, __ T4S, vs2, vtmp);
7038
7039 // r1 --> vs5
7040 // int r1 = rplus - r0 - (dilithium_q - 1);
7041 vs_subv(vtmp, __ T4S, vs1, vs3);
7042 vs_subv(vs5, __ T4S, vtmp, qminus1);
7043
7044 // r1 --> vs1 (overwriting rplus)
7045 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
7046 vs_negr(vtmp, __ T4S, vs5);
7047 vs_orr(vtmp, vs5, vtmp);
7048 vs_sshr(vs1, __ T4S, vtmp, 31);
7049
7050 // r0 += ~r1;
7051 vs_notr(vtmp, vs1);
7052 vs_addv(vs3, __ T4S, vs3, vtmp);
7053
7054 // r1 = r1 & quotient;
7055 vs_andr(vs1, vs2, vs1);
7056
    // store results interleaved
7058 // lowPart[m] = r0;
7059 // highPart[m] = r1;
7060 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
7061 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
7062
7063 __ sub(len, len, 64);
7064 __ cmp(len, (u1)64);
7065 __ br(Assembler::GE, L_loop);
7066
7067 // restore callee-saved vector registers
7068 __ ldpd(v14, v15, Address(sp, 48));
7069 __ ldpd(v12, v13, Address(sp, 32));
7070 __ ldpd(v10, v11, Address(sp, 16));
7071 __ ldpd(v8, v9, __ post(sp, 64));
7072
7073 __ leave(); // required for proper stackwalking of RuntimeStub frame
7074 __ mov(r0, zr); // return 0
7075 __ ret(lr);
7076
7077 return start;
7078 }
7079
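  // Keccak chi step for one 5-lane row: each lane is updated as
  //   a[i] ^= ~a[i+1] & a[i+2]   (indices mod 5)
  // The bic() results are staged in temporaries so that every product is
  // formed from the original row values before the lanes are overwritten.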
7080 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
7081 Register tmp0, Register tmp1, Register tmp2) {
7082 __ bic(tmp0, a2, a1); // for a0
7083 __ bic(tmp1, a3, a2); // for a1
7084 __ bic(tmp2, a4, a3); // for a2
7085 __ eor(a2, a2, tmp2);
7086 __ bic(tmp2, a0, a4); // for a3
7087 __ eor(a3, a3, tmp2);
7088 __ bic(tmp2, a1, a0); // for a4
7089 __ eor(a0, a0, tmp0);
7090 __ eor(a1, a1, tmp1);
7091 __ eor(a4, a4, tmp2);
7092 }
7093
7094 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
7095 Register a0, Register a1, Register a2, Register a3, Register a4,
7096 Register a5, Register a6, Register a7, Register a8, Register a9,
7097 Register a10, Register a11, Register a12, Register a13, Register a14,
7098 Register a15, Register a16, Register a17, Register a18, Register a19,
7099 Register a20, Register a21, Register a22, Register a23, Register a24,
7100 Register tmp0, Register tmp1, Register tmp2) {
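    // One round of Keccak-f[1600] on the 25 lanes held in a0..a24:
    //   theta  - column parities c0..c4 and d values, xor-ed into every lane
    //   rho/pi - the rotate-and-permute chain of rol() instructions below
    //   chi    - the five bcax5() row updates
    //   iota   - xor of the next round constant into lane a0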
7101 __ eor3(tmp1, a4, a9, a14);
7102 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
7103 __ eor3(tmp2, a1, a6, a11);
7104 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
7105 __ rax1(tmp2, tmp0, tmp1); // d0
7106 {
7107
7108 Register tmp3, tmp4;
7109 if (can_use_fp && can_use_r18) {
7110 tmp3 = rfp;
7111 tmp4 = r18_tls;
7112 } else {
7113 tmp3 = a4;
7114 tmp4 = a9;
7115 __ stp(tmp3, tmp4, __ pre(sp, -16));
7116 }
7117
7118 __ eor3(tmp3, a0, a5, a10);
7119 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
7120 __ eor(a0, a0, tmp2);
7121 __ eor(a5, a5, tmp2);
7122 __ eor(a10, a10, tmp2);
7123 __ eor(a15, a15, tmp2);
7124 __ eor(a20, a20, tmp2); // d0(tmp2)
7125 __ eor3(tmp3, a2, a7, a12);
7126 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
7127 __ rax1(tmp3, tmp4, tmp2); // d1
7128 __ eor(a1, a1, tmp3);
7129 __ eor(a6, a6, tmp3);
7130 __ eor(a11, a11, tmp3);
7131 __ eor(a16, a16, tmp3);
7132 __ eor(a21, a21, tmp3); // d1(tmp3)
7133 __ rax1(tmp3, tmp2, tmp0); // d3
7134 __ eor3(tmp2, a3, a8, a13);
7135 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
7136 __ eor(a3, a3, tmp3);
7137 __ eor(a8, a8, tmp3);
7138 __ eor(a13, a13, tmp3);
7139 __ eor(a18, a18, tmp3);
7140 __ eor(a23, a23, tmp3);
7141 __ rax1(tmp2, tmp1, tmp0); // d2
7142 __ eor(a2, a2, tmp2);
7143 __ eor(a7, a7, tmp2);
7144 __ eor(a12, a12, tmp2);
7145 __ rax1(tmp0, tmp0, tmp4); // d4
7146 if (!can_use_fp || !can_use_r18) {
7147 __ ldp(tmp3, tmp4, __ post(sp, 16));
7148 }
7149 __ eor(a17, a17, tmp2);
7150 __ eor(a22, a22, tmp2);
7151 __ eor(a4, a4, tmp0);
7152 __ eor(a9, a9, tmp0);
7153 __ eor(a14, a14, tmp0);
7154 __ eor(a19, a19, tmp0);
7155 __ eor(a24, a24, tmp0);
7156 }
7157
7158 __ rol(tmp0, a10, 3);
7159 __ rol(a10, a1, 1);
7160 __ rol(a1, a6, 44);
7161 __ rol(a6, a9, 20);
7162 __ rol(a9, a22, 61);
7163 __ rol(a22, a14, 39);
7164 __ rol(a14, a20, 18);
7165 __ rol(a20, a2, 62);
7166 __ rol(a2, a12, 43);
7167 __ rol(a12, a13, 25);
    __ rol(a13, a19, 8);
7169 __ rol(a19, a23, 56);
7170 __ rol(a23, a15, 41);
7171 __ rol(a15, a4, 27);
7172 __ rol(a4, a24, 14);
7173 __ rol(a24, a21, 2);
7174 __ rol(a21, a8, 55);
7175 __ rol(a8, a16, 45);
7176 __ rol(a16, a5, 36);
7177 __ rol(a5, a3, 28);
7178 __ rol(a3, a18, 21);
7179 __ rol(a18, a17, 15);
7180 __ rol(a17, a11, 10);
7181 __ rol(a11, a7, 6);
7182 __ mov(a7, tmp0);
7183
7184 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
7185 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
7186 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
7187 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
7188 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
7189
7190 __ ldr(tmp1, __ post(rc, 8));
7191 __ eor(a0, a0, tmp1);
7192
7193 }
7194
7195 // Arguments:
7196 //
7197 // Inputs:
7198 // c_rarg0 - byte[] source+offset
7199 // c_rarg1 - byte[] SHA.state
7200 // c_rarg2 - int block_size
7201 // c_rarg3 - int offset
7202 // c_rarg4 - int limit
7203 //
7204 address generate_sha3_implCompress_gpr(StubId stub_id) {
7205 bool multi_block;
7206 switch (stub_id) {
7207 case StubId::stubgen_sha3_implCompress_id:
7208 multi_block = false;
7209 break;
7210 case StubId::stubgen_sha3_implCompressMB_id:
7211 multi_block = true;
7212 break;
7213 default:
7214 ShouldNotReachHere();
7215 }
7216
7217 static const uint64_t round_consts[24] = {
7218 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
7219 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
7220 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
7221 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
7222 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
7223 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
7224 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
7225 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
7226 };
7227
7228 __ align(CodeEntryAlignment);
7229 StubCodeMark mark(this, stub_id);
7230 address start = __ pc();
7231
7232 Register buf = c_rarg0;
7233 Register state = c_rarg1;
7234 Register block_size = c_rarg2;
7235 Register ofs = c_rarg3;
7236 Register limit = c_rarg4;
7237
    // use r3..r17, r19..r28 to keep a0..a24.
7239 // a0..a24 are respective locals from SHA3.java
7240 Register a0 = r25,
7241 a1 = r26,
7242 a2 = r27,
7243 a3 = r3,
7244 a4 = r4,
7245 a5 = r5,
7246 a6 = r6,
7247 a7 = r7,
7248 a8 = rscratch1, // r8
7249 a9 = rscratch2, // r9
7250 a10 = r10,
7251 a11 = r11,
7252 a12 = r12,
7253 a13 = r13,
7254 a14 = r14,
7255 a15 = r15,
7256 a16 = r16,
7257 a17 = r17,
7258 a18 = r28,
7259 a19 = r19,
7260 a20 = r20,
7261 a21 = r21,
7262 a22 = r22,
7263 a23 = r23,
7264 a24 = r24;
7265
7266 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
7267
7268 Label sha3_loop, rounds24_preloop, loop_body;
7269 Label sha3_512_or_sha3_384, shake128;
7270
7271 bool can_use_r18 = false;
7272 #ifndef R18_RESERVED
7273 can_use_r18 = true;
7274 #endif
7275 bool can_use_fp = !PreserveFramePointer;
7276
7277 __ enter();
7278
7279 // save almost all yet unsaved gpr registers on stack
7280 __ str(block_size, __ pre(sp, -128));
7281 if (multi_block) {
7282 __ stpw(ofs, limit, Address(sp, 8));
7283 }
7284 // 8 bytes at sp+16 will be used to keep buf
7285 __ stp(r19, r20, Address(sp, 32));
7286 __ stp(r21, r22, Address(sp, 48));
7287 __ stp(r23, r24, Address(sp, 64));
7288 __ stp(r25, r26, Address(sp, 80));
7289 __ stp(r27, r28, Address(sp, 96));
7290 if (can_use_r18 && can_use_fp) {
7291 __ stp(r18_tls, state, Address(sp, 112));
7292 } else {
7293 __ str(state, Address(sp, 112));
7294 }
7295
    // begin sha3 calculations: loading a0..a24 from state array
7297 __ ldp(a0, a1, state);
7298 __ ldp(a2, a3, Address(state, 16));
7299 __ ldp(a4, a5, Address(state, 32));
7300 __ ldp(a6, a7, Address(state, 48));
7301 __ ldp(a8, a9, Address(state, 64));
7302 __ ldp(a10, a11, Address(state, 80));
7303 __ ldp(a12, a13, Address(state, 96));
7304 __ ldp(a14, a15, Address(state, 112));
7305 __ ldp(a16, a17, Address(state, 128));
7306 __ ldp(a18, a19, Address(state, 144));
7307 __ ldp(a20, a21, Address(state, 160));
7308 __ ldp(a22, a23, Address(state, 176));
7309 __ ldr(a24, Address(state, 192));
7310
7311 __ BIND(sha3_loop);
7312
7313 // load input
7314 __ ldp(tmp3, tmp2, __ post(buf, 16));
7315 __ eor(a0, a0, tmp3);
7316 __ eor(a1, a1, tmp2);
7317 __ ldp(tmp3, tmp2, __ post(buf, 16));
7318 __ eor(a2, a2, tmp3);
7319 __ eor(a3, a3, tmp2);
7320 __ ldp(tmp3, tmp2, __ post(buf, 16));
7321 __ eor(a4, a4, tmp3);
7322 __ eor(a5, a5, tmp2);
7323 __ ldr(tmp3, __ post(buf, 8));
7324 __ eor(a6, a6, tmp3);
7325
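    // The remaining absorption depends on the rate, i.e. block_size in bytes:
    //   72 = SHA3-512, 104 = SHA3-384, 136 = SHA3-256/SHAKE256,
    //   144 = SHA3-224, 168 = SHAKE128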
7326 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
7327 __ tbz(block_size, 7, sha3_512_or_sha3_384);
7328
7329 __ ldp(tmp3, tmp2, __ post(buf, 16));
7330 __ eor(a7, a7, tmp3);
7331 __ eor(a8, a8, tmp2);
7332 __ ldp(tmp3, tmp2, __ post(buf, 16));
7333 __ eor(a9, a9, tmp3);
7334 __ eor(a10, a10, tmp2);
7335 __ ldp(tmp3, tmp2, __ post(buf, 16));
7336 __ eor(a11, a11, tmp3);
7337 __ eor(a12, a12, tmp2);
7338 __ ldp(tmp3, tmp2, __ post(buf, 16));
7339 __ eor(a13, a13, tmp3);
7340 __ eor(a14, a14, tmp2);
7341 __ ldp(tmp3, tmp2, __ post(buf, 16));
7342 __ eor(a15, a15, tmp3);
7343 __ eor(a16, a16, tmp2);
7344
7345 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
7346 __ andw(tmp2, block_size, 48);
7347 __ cbzw(tmp2, rounds24_preloop);
7348 __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
7350 __ ldr(tmp3, __ post(buf, 8));
7351 __ eor(a17, a17, tmp3);
7352 __ b(rounds24_preloop);
7353
7354 __ BIND(shake128);
7355 __ ldp(tmp3, tmp2, __ post(buf, 16));
7356 __ eor(a17, a17, tmp3);
7357 __ eor(a18, a18, tmp2);
7358 __ ldp(tmp3, tmp2, __ post(buf, 16));
7359 __ eor(a19, a19, tmp3);
7360 __ eor(a20, a20, tmp2);
7361 __ b(rounds24_preloop); // block_size == 168, SHAKE128
7362
7363 __ BIND(sha3_512_or_sha3_384);
7364 __ ldp(tmp3, tmp2, __ post(buf, 16));
7365 __ eor(a7, a7, tmp3);
7366 __ eor(a8, a8, tmp2);
7367 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
7368
7369 // SHA3-384
7370 __ ldp(tmp3, tmp2, __ post(buf, 16));
7371 __ eor(a9, a9, tmp3);
7372 __ eor(a10, a10, tmp2);
7373 __ ldp(tmp3, tmp2, __ post(buf, 16));
7374 __ eor(a11, a11, tmp3);
7375 __ eor(a12, a12, tmp2);
7376
7377 __ BIND(rounds24_preloop);
7378 __ fmovs(v0, 24.0); // float loop counter,
7379 __ fmovs(v1, 1.0); // exact representation
7380
7381 __ str(buf, Address(sp, 16));
7382 __ lea(tmp3, ExternalAddress((address) round_consts));
7383
7384 __ BIND(loop_body);
7385 keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
7386 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
7387 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
7388 tmp0, tmp1, tmp2);
7389 __ fsubs(v0, v0, v1);
7390 __ fcmps(v0, 0.0);
7391 __ br(__ NE, loop_body);
7392
7393 if (multi_block) {
7394 __ ldrw(block_size, sp); // block_size
7395 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
7396 __ addw(tmp2, tmp2, block_size);
7397 __ cmpw(tmp2, tmp1);
7398 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
7399 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
7400 __ br(Assembler::LE, sha3_loop);
7401 __ movw(c_rarg0, tmp2); // return offset
7402 }
7403 if (can_use_fp && can_use_r18) {
7404 __ ldp(r18_tls, state, Address(sp, 112));
7405 } else {
7406 __ ldr(state, Address(sp, 112));
7407 }
7408 // save calculated sha3 state
7409 __ stp(a0, a1, Address(state));
7410 __ stp(a2, a3, Address(state, 16));
7411 __ stp(a4, a5, Address(state, 32));
7412 __ stp(a6, a7, Address(state, 48));
7413 __ stp(a8, a9, Address(state, 64));
7414 __ stp(a10, a11, Address(state, 80));
7415 __ stp(a12, a13, Address(state, 96));
7416 __ stp(a14, a15, Address(state, 112));
7417 __ stp(a16, a17, Address(state, 128));
7418 __ stp(a18, a19, Address(state, 144));
7419 __ stp(a20, a21, Address(state, 160));
7420 __ stp(a22, a23, Address(state, 176));
7421 __ str(a24, Address(state, 192));
7422
7423 // restore required registers from stack
7424 __ ldp(r19, r20, Address(sp, 32));
7425 __ ldp(r21, r22, Address(sp, 48));
7426 __ ldp(r23, r24, Address(sp, 64));
7427 __ ldp(r25, r26, Address(sp, 80));
7428 __ ldp(r27, r28, Address(sp, 96));
7429 if (can_use_fp && can_use_r18) {
7430 __ add(rfp, sp, 128); // leave() will copy rfp to sp below
7431 } // else no need to recalculate rfp, since it wasn't changed
7432
7433 __ leave();
7434
7435 __ ret(lr);
7436
7437 return start;
7438 }
7439
7440 /**
7441 * Arguments:
7442 *
7443 * Inputs:
7444 * c_rarg0 - int crc
7445 * c_rarg1 - byte* buf
7446 * c_rarg2 - int length
7447 *
7448 * Output:
   * r0 - int crc result
7450 */
7451 address generate_updateBytesCRC32() {
7452 assert(UseCRC32Intrinsics, "what are we doing here?");
7453
7454 __ align(CodeEntryAlignment);
7455 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7456 StubCodeMark mark(this, stub_id);
7457
7458 address start = __ pc();
7459
7460 const Register crc = c_rarg0; // crc
7461 const Register buf = c_rarg1; // source java byte array address
7462 const Register len = c_rarg2; // length
7463 const Register table0 = c_rarg3; // crc_table address
7464 const Register table1 = c_rarg4;
7465 const Register table2 = c_rarg5;
7466 const Register table3 = c_rarg6;
7467 const Register tmp3 = c_rarg7;
7468
7469 BLOCK_COMMENT("Entry:");
7470 __ enter(); // required for proper stackwalking of RuntimeStub frame
7471
7472 __ kernel_crc32(crc, buf, len,
7473 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7474
7475 __ leave(); // required for proper stackwalking of RuntimeStub frame
7476 __ ret(lr);
7477
7478 return start;
7479 }
7480
7481 /**
7482 * Arguments:
7483 *
7484 * Inputs:
7485 * c_rarg0 - int crc
7486 * c_rarg1 - byte* buf
7487 * c_rarg2 - int length
7488 * c_rarg3 - int* table
7489 *
7490 * Output:
7491 * r0 - int crc result
7492 */
7493 address generate_updateBytesCRC32C() {
7494 assert(UseCRC32CIntrinsics, "what are we doing here?");
7495
7496 __ align(CodeEntryAlignment);
7497 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
7498 StubCodeMark mark(this, stub_id);
7499
7500 address start = __ pc();
7501
7502 const Register crc = c_rarg0; // crc
7503 const Register buf = c_rarg1; // source java byte array address
7504 const Register len = c_rarg2; // length
7505 const Register table0 = c_rarg3; // crc_table address
7506 const Register table1 = c_rarg4;
7507 const Register table2 = c_rarg5;
7508 const Register table3 = c_rarg6;
7509 const Register tmp3 = c_rarg7;
7510
7511 BLOCK_COMMENT("Entry:");
7512 __ enter(); // required for proper stackwalking of RuntimeStub frame
7513
7514 __ kernel_crc32c(crc, buf, len,
7515 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7516
7517 __ leave(); // required for proper stackwalking of RuntimeStub frame
7518 __ ret(lr);
7519
7520 return start;
7521 }
7522
  /**
7524 * Arguments:
7525 *
7526 * Inputs:
7527 * c_rarg0 - int adler
7528 * c_rarg1 - byte* buff
7529 * c_rarg2 - int len
7530 *
7531 * Output:
7532 * c_rarg0 - int adler result
7533 */
7534 address generate_updateBytesAdler32() {
7535 __ align(CodeEntryAlignment);
7536 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
7537 StubCodeMark mark(this, stub_id);
7538 address start = __ pc();
7539
7540 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
7541
7542 // Aliases
7543 Register adler = c_rarg0;
7544 Register s1 = c_rarg0;
7545 Register s2 = c_rarg3;
7546 Register buff = c_rarg1;
7547 Register len = c_rarg2;
7548 Register nmax = r4;
7549 Register base = r5;
7550 Register count = r6;
7551 Register temp0 = rscratch1;
7552 Register temp1 = rscratch2;
7553 FloatRegister vbytes = v0;
7554 FloatRegister vs1acc = v1;
7555 FloatRegister vs2acc = v2;
7556 FloatRegister vtable = v3;
7557
7558 // Max number of bytes we can process before having to take the mod
7559 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
7560 uint64_t BASE = 0xfff1;
7561 uint64_t NMAX = 0x15B0;
7562
7563 __ mov(base, BASE);
7564 __ mov(nmax, NMAX);
7565
7566 // Load accumulation coefficients for the upper 16 bits
7567 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
7568 __ ld1(vtable, __ T16B, Address(temp0));
7569
7570 // s1 is initialized to the lower 16 bits of adler
7571 // s2 is initialized to the upper 16 bits of adler
7572 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
7573 __ uxth(s1, adler); // s1 = (adler & 0xffff)
7574
7575 // The pipelined loop needs at least 16 elements for 1 iteration
    // It checks this itself, but for short inputs it is more efficient to skip straight to the cleanup loop
7577 __ cmp(len, (u1)16);
7578 __ br(Assembler::HS, L_nmax);
7579 __ cbz(len, L_combine);
7580
7581 __ bind(L_simple_by1_loop);
7582 __ ldrb(temp0, Address(__ post(buff, 1)));
7583 __ add(s1, s1, temp0);
7584 __ add(s2, s2, s1);
7585 __ subs(len, len, 1);
7586 __ br(Assembler::HI, L_simple_by1_loop);
7587
7588 // s1 = s1 % BASE
7589 __ subs(temp0, s1, base);
7590 __ csel(s1, temp0, s1, Assembler::HS);
7591
7592 // s2 = s2 % BASE
7593 __ lsr(temp0, s2, 16);
7594 __ lsl(temp1, temp0, 4);
7595 __ sub(temp1, temp1, temp0);
7596 __ add(s2, temp1, s2, ext::uxth);
7597
7598 __ subs(temp0, s2, base);
7599 __ csel(s2, temp0, s2, Assembler::HS);
7600
7601 __ b(L_combine);
7602
7603 __ bind(L_nmax);
7604 __ subs(len, len, nmax);
7605 __ sub(count, nmax, 16);
7606 __ br(Assembler::LO, L_by16);
7607
7608 __ bind(L_nmax_loop);
7609
7610 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7611 vbytes, vs1acc, vs2acc, vtable);
7612
7613 __ subs(count, count, 16);
7614 __ br(Assembler::HS, L_nmax_loop);
7615
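    // The reductions below exploit 2^16 mod 65521 == 15: folding the high
    // half down twice via x = (x >> 16) * 15 + (x & 0xffff) leaves a value
    // below 2 * BASE, after which a single conditional subtract of BASE
    // completes the x % BASE computation.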
7616 // s1 = s1 % BASE
7617 __ lsr(temp0, s1, 16);
7618 __ lsl(temp1, temp0, 4);
7619 __ sub(temp1, temp1, temp0);
7620 __ add(temp1, temp1, s1, ext::uxth);
7621
7622 __ lsr(temp0, temp1, 16);
7623 __ lsl(s1, temp0, 4);
7624 __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
7626
7627 __ subs(temp0, s1, base);
7628 __ csel(s1, temp0, s1, Assembler::HS);
7629
7630 // s2 = s2 % BASE
7631 __ lsr(temp0, s2, 16);
7632 __ lsl(temp1, temp0, 4);
7633 __ sub(temp1, temp1, temp0);
7634 __ add(temp1, temp1, s2, ext::uxth);
7635
7636 __ lsr(temp0, temp1, 16);
7637 __ lsl(s2, temp0, 4);
7638 __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
7640
7641 __ subs(temp0, s2, base);
7642 __ csel(s2, temp0, s2, Assembler::HS);
7643
7644 __ subs(len, len, nmax);
7645 __ sub(count, nmax, 16);
7646 __ br(Assembler::HS, L_nmax_loop);
7647
7648 __ bind(L_by16);
7649 __ adds(len, len, count);
7650 __ br(Assembler::LO, L_by1);
7651
7652 __ bind(L_by16_loop);
7653
7654 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7655 vbytes, vs1acc, vs2acc, vtable);
7656
7657 __ subs(len, len, 16);
7658 __ br(Assembler::HS, L_by16_loop);
7659
7660 __ bind(L_by1);
7661 __ adds(len, len, 15);
7662 __ br(Assembler::LO, L_do_mod);
7663
7664 __ bind(L_by1_loop);
7665 __ ldrb(temp0, Address(__ post(buff, 1)));
7666 __ add(s1, temp0, s1);
7667 __ add(s2, s2, s1);
7668 __ subs(len, len, 1);
7669 __ br(Assembler::HS, L_by1_loop);
7670
7671 __ bind(L_do_mod);
7672 // s1 = s1 % BASE
7673 __ lsr(temp0, s1, 16);
7674 __ lsl(temp1, temp0, 4);
7675 __ sub(temp1, temp1, temp0);
7676 __ add(temp1, temp1, s1, ext::uxth);
7677
7678 __ lsr(temp0, temp1, 16);
7679 __ lsl(s1, temp0, 4);
7680 __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
7682
7683 __ subs(temp0, s1, base);
7684 __ csel(s1, temp0, s1, Assembler::HS);
7685
7686 // s2 = s2 % BASE
7687 __ lsr(temp0, s2, 16);
7688 __ lsl(temp1, temp0, 4);
7689 __ sub(temp1, temp1, temp0);
7690 __ add(temp1, temp1, s2, ext::uxth);
7691
7692 __ lsr(temp0, temp1, 16);
7693 __ lsl(s2, temp0, 4);
7694 __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
7696
7697 __ subs(temp0, s2, base);
7698 __ csel(s2, temp0, s2, Assembler::HS);
7699
7700 // Combine lower bits and higher bits
7701 __ bind(L_combine);
7702 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
7703
7704 __ ret(lr);
7705
7706 return start;
7707 }
7708
7709 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
7710 Register temp0, Register temp1, FloatRegister vbytes,
7711 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
7712 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
7713 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
7714 // In non-vectorized code, we update s1 and s2 as:
7715 // s1 <- s1 + b1
7716 // s2 <- s2 + s1
7717 // s1 <- s1 + b2
    // s2 <- s2 + s1
7719 // ...
7720 // s1 <- s1 + b16
7721 // s2 <- s2 + s1
7722 // Putting above assignments together, we have:
7723 // s1_new = s1 + b1 + b2 + ... + b16
7724 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
7725 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
7726 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
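    //
    // Equivalently, in scalar terms (a sketch of what the vector code
    // below computes for one 16-byte chunk):
    //
    //   uint32_t sum = 0, dot = 0;
    //   for (int i = 0; i < 16; i++) { sum += b[i]; dot += (16 - i) * b[i]; }
    //   s2 += 16 * s1 + dot;
    //   s1 += sum;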
7727 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
7728
7729 // s2 = s2 + s1 * 16
7730 __ add(s2, s2, s1, Assembler::LSL, 4);
7731
7732 // vs1acc = b1 + b2 + b3 + ... + b16
7733 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
7734 __ umullv(vs2acc, __ T8B, vtable, vbytes);
7735 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
7736 __ uaddlv(vs1acc, __ T16B, vbytes);
7737 __ uaddlv(vs2acc, __ T8H, vs2acc);
7738
7739 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
7740 __ fmovd(temp0, vs1acc);
7741 __ fmovd(temp1, vs2acc);
7742 __ add(s1, s1, temp0);
7743 __ add(s2, s2, temp1);
7744 }
7745
7746 /**
7747 * Arguments:
7748 *
7749 * Input:
7750 * c_rarg0 - x address
7751 * c_rarg1 - x length
7752 * c_rarg2 - y address
7753 * c_rarg3 - y length
7754 * c_rarg4 - z address
7755 */
7756 address generate_multiplyToLen() {
7757 __ align(CodeEntryAlignment);
7758 StubId stub_id = StubId::stubgen_multiplyToLen_id;
7759 StubCodeMark mark(this, stub_id);
7760
7761 address start = __ pc();
7762
7763 if (AOTCodeCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) {
7764 return start;
7765 }
7766 const Register x = r0;
7767 const Register xlen = r1;
7768 const Register y = r2;
7769 const Register ylen = r3;
7770 const Register z = r4;
7771
7772 const Register tmp0 = r5;
7773 const Register tmp1 = r10;
7774 const Register tmp2 = r11;
7775 const Register tmp3 = r12;
7776 const Register tmp4 = r13;
7777 const Register tmp5 = r14;
7778 const Register tmp6 = r15;
7779 const Register tmp7 = r16;
7780
7781 BLOCK_COMMENT("Entry:");
7782 __ enter(); // required for proper stackwalking of RuntimeStub frame
7783 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7784 __ leave(); // required for proper stackwalking of RuntimeStub frame
7785 __ ret(lr);
7786
7787 AOTCodeCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start);
7788 return start;
7789 }
7790
7791 address generate_squareToLen() {
7792 // squareToLen algorithm for sizes 1..127 described in java code works
7793 // faster than multiply_to_len on some CPUs and slower on others, but
7794 // multiply_to_len shows a bit better overall results
7795 __ align(CodeEntryAlignment);
7796 StubId stub_id = StubId::stubgen_squareToLen_id;
7797 StubCodeMark mark(this, stub_id);
7798 address start = __ pc();
7799
7800 if (AOTCodeCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) {
7801 return start;
7802 }
7803 const Register x = r0;
7804 const Register xlen = r1;
7805 const Register z = r2;
7806 const Register y = r4; // == x
7807 const Register ylen = r5; // == xlen
7808
7809 const Register tmp0 = r3;
7810 const Register tmp1 = r10;
7811 const Register tmp2 = r11;
7812 const Register tmp3 = r12;
7813 const Register tmp4 = r13;
7814 const Register tmp5 = r14;
7815 const Register tmp6 = r15;
7816 const Register tmp7 = r16;
7817
7818 RegSet spilled_regs = RegSet::of(y, ylen);
7819 BLOCK_COMMENT("Entry:");
7820 __ enter();
7821 __ push(spilled_regs, sp);
7822 __ mov(y, x);
7823 __ mov(ylen, xlen);
7824 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7825 __ pop(spilled_regs, sp);
7826 __ leave();
7827 __ ret(lr);
7828
7829 AOTCodeCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start);
7830 return start;
7831 }
7832
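  // For reference, mul_add implements (roughly) the BigInteger.implMulAdd
  // primitive: multiply the in[] limbs by the 32-bit scalar k and add the
  // product into out[], ending at the limb addressed via 'offset', returning
  // the final carry. A scalar sketch (illustrative only; the exact offset
  // handling lives in MacroAssembler::mul_add):
  //
  //   uint64_t carry = 0;
  //   for (int j = len - 1; j >= 0; j--) {
  //     uint64_t p = (uint64_t)in[j] * k + out[offset] + carry;
  //     out[offset--] = (uint32_t)p;
  //     carry = p >> 32;
  //   }
  //   return (uint32_t)carry;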
7833 address generate_mulAdd() {
7834 __ align(CodeEntryAlignment);
7835 StubId stub_id = StubId::stubgen_mulAdd_id;
7836 StubCodeMark mark(this, stub_id);
7837
7838 address start = __ pc();
7839
7840 if (AOTCodeCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) {
7841 return start;
7842 }
7843 const Register out = r0;
7844 const Register in = r1;
7845 const Register offset = r2;
7846 const Register len = r3;
7847 const Register k = r4;
7848
7849 BLOCK_COMMENT("Entry:");
7850 __ enter();
7851 __ mul_add(out, in, offset, len, k);
7852 __ leave();
7853 __ ret(lr);
7854
7855 AOTCodeCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start);
7856 return start;
7857 }
7858
7859 // Arguments:
7860 //
7861 // Input:
7862 // c_rarg0 - newArr address
7863 // c_rarg1 - oldArr address
7864 // c_rarg2 - newIdx
7865 // c_rarg3 - shiftCount
7866 // c_rarg4 - numIter
7867 //
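  // Per-word semantics, as a scalar sketch of what the SIMD and tail code below
  // compute (illustrative only; shiftCount is assumed to be in 1..31, and
  // newArr/oldArr are treated as arrays of unsigned 32-bit words):
  //
  //   for (int i = numIter - 1; i >= 0; i--) {
  //     newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)
  //                        | (oldArr[i] << (32 - shiftCount));
  //   }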
7868 address generate_bigIntegerRightShift() {
7869 __ align(CodeEntryAlignment);
7870 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
7871 StubCodeMark mark(this, stub_id);
7872 address start = __ pc();
7873
7874 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
7875
7876 Register newArr = c_rarg0;
7877 Register oldArr = c_rarg1;
7878 Register newIdx = c_rarg2;
7879 Register shiftCount = c_rarg3;
7880 Register numIter = c_rarg4;
7881 Register idx = numIter;
7882
7883 Register newArrCur = rscratch1;
7884 Register shiftRevCount = rscratch2;
7885 Register oldArrCur = r13;
7886 Register oldArrNext = r14;
7887
7888 FloatRegister oldElem0 = v0;
7889 FloatRegister oldElem1 = v1;
7890 FloatRegister newElem = v2;
7891 FloatRegister shiftVCount = v3;
7892 FloatRegister shiftVRevCount = v4;
7893
7894 __ cbz(idx, Exit);
7895
7896 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
7897
7898 // left shift count
7899 __ movw(shiftRevCount, 32);
7900 __ subw(shiftRevCount, shiftRevCount, shiftCount);
7901
7902     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar tail
7903 __ cmp(numIter, (u1)4);
7904 __ br(Assembler::LT, ShiftThree);
7905
7906 __ dup(shiftVCount, __ T4S, shiftCount);
7907 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
7908 __ negr(shiftVCount, __ T4S, shiftVCount);
7909
7910 __ BIND(ShiftSIMDLoop);
7911
7912 // Calculate the load addresses
7913 __ sub(idx, idx, 4);
7914 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
7915 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
7916 __ add(oldArrCur, oldArrNext, 4);
7917
7918 // Load 4 words and process
7919 __ ld1(oldElem0, __ T4S, Address(oldArrCur));
7920 __ ld1(oldElem1, __ T4S, Address(oldArrNext));
7921 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
7922 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
7923 __ orr(newElem, __ T16B, oldElem0, oldElem1);
7924 __ st1(newElem, __ T4S, Address(newArrCur));
7925
7926 __ cmp(idx, (u1)4);
7927 __ br(Assembler::LT, ShiftTwoLoop);
7928 __ b(ShiftSIMDLoop);
7929
7930 __ BIND(ShiftTwoLoop);
7931 __ cbz(idx, Exit);
7932 __ cmp(idx, (u1)1);
7933 __ br(Assembler::EQ, ShiftOne);
7934
7935 // Calculate the load addresses
7936 __ sub(idx, idx, 2);
7937 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
7938 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
7939 __ add(oldArrCur, oldArrNext, 4);
7940
7941 // Load 2 words and process
7942 __ ld1(oldElem0, __ T2S, Address(oldArrCur));
7943 __ ld1(oldElem1, __ T2S, Address(oldArrNext));
7944 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
7945 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
7946 __ orr(newElem, __ T8B, oldElem0, oldElem1);
7947 __ st1(newElem, __ T2S, Address(newArrCur));
7948 __ b(ShiftTwoLoop);
7949
7950 __ BIND(ShiftThree);
7951 __ tbz(idx, 1, ShiftOne);
7952 __ tbz(idx, 0, ShiftTwo);
7953 __ ldrw(r10, Address(oldArr, 12));
7954 __ ldrw(r11, Address(oldArr, 8));
7955 __ lsrvw(r10, r10, shiftCount);
7956 __ lslvw(r11, r11, shiftRevCount);
7957 __ orrw(r12, r10, r11);
7958 __ strw(r12, Address(newArr, 8));
7959
7960 __ BIND(ShiftTwo);
7961 __ ldrw(r10, Address(oldArr, 8));
7962 __ ldrw(r11, Address(oldArr, 4));
7963 __ lsrvw(r10, r10, shiftCount);
7964 __ lslvw(r11, r11, shiftRevCount);
7965 __ orrw(r12, r10, r11);
7966 __ strw(r12, Address(newArr, 4));
7967
7968 __ BIND(ShiftOne);
7969 __ ldrw(r10, Address(oldArr, 4));
7970 __ ldrw(r11, Address(oldArr));
7971 __ lsrvw(r10, r10, shiftCount);
7972 __ lslvw(r11, r11, shiftRevCount);
7973 __ orrw(r12, r10, r11);
7974 __ strw(r12, Address(newArr));
7975
7976 __ BIND(Exit);
7977 __ ret(lr);
7978
7979 return start;
7980 }
7981
7982 // Arguments:
7983 //
7984 // Input:
7985 // c_rarg0 - newArr address
7986 // c_rarg1 - oldArr address
7987 // c_rarg2 - newIdx
7988 // c_rarg3 - shiftCount
7989 // c_rarg4 - numIter
7990 //
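  // Per-word semantics, as a scalar sketch of what the SIMD and tail code below
  // compute (illustrative only; shiftCount is assumed to be in 1..31, and
  // newArr/oldArr are treated as arrays of unsigned 32-bit words):
  //
  //   for (int i = 0; i < numIter; i++) {
  //     newArr[newIdx + i] = (oldArr[i] << shiftCount)
  //                        | (oldArr[i + 1] >> (32 - shiftCount));
  //   }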
7991 address generate_bigIntegerLeftShift() {
7992 __ align(CodeEntryAlignment);
7993 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
7994 StubCodeMark mark(this, stub_id);
7995 address start = __ pc();
7996
7997 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
7998
7999 Register newArr = c_rarg0;
8000 Register oldArr = c_rarg1;
8001 Register newIdx = c_rarg2;
8002 Register shiftCount = c_rarg3;
8003 Register numIter = c_rarg4;
8004
8005 Register shiftRevCount = rscratch1;
8006 Register oldArrNext = rscratch2;
8007
8008 FloatRegister oldElem0 = v0;
8009 FloatRegister oldElem1 = v1;
8010 FloatRegister newElem = v2;
8011 FloatRegister shiftVCount = v3;
8012 FloatRegister shiftVRevCount = v4;
8013
8014 __ cbz(numIter, Exit);
8015
8016 __ add(oldArrNext, oldArr, 4);
8017 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8018
8019 // right shift count
8020 __ movw(shiftRevCount, 32);
8021 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8022
8023     // numIter is too small to allow a 4-word SIMD loop; fall back to the scalar tail
8024 __ cmp(numIter, (u1)4);
8025 __ br(Assembler::LT, ShiftThree);
8026
8027 __ dup(shiftVCount, __ T4S, shiftCount);
8028 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8029 __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
8030
8031 __ BIND(ShiftSIMDLoop);
8032
8033 // load 4 words and process
8034 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
8035 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
8036 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8037 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8038 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8039 __ st1(newElem, __ T4S, __ post(newArr, 16));
8040 __ sub(numIter, numIter, 4);
8041
8042 __ cmp(numIter, (u1)4);
8043 __ br(Assembler::LT, ShiftTwoLoop);
8044 __ b(ShiftSIMDLoop);
8045
8046 __ BIND(ShiftTwoLoop);
8047 __ cbz(numIter, Exit);
8048 __ cmp(numIter, (u1)1);
8049 __ br(Assembler::EQ, ShiftOne);
8050
8051 // load 2 words and process
8052 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
8053 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
8054 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8055 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8056 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8057 __ st1(newElem, __ T2S, __ post(newArr, 8));
8058 __ sub(numIter, numIter, 2);
8059 __ b(ShiftTwoLoop);
8060
8061 __ BIND(ShiftThree);
8062 __ ldrw(r10, __ post(oldArr, 4));
8063 __ ldrw(r11, __ post(oldArrNext, 4));
8064 __ lslvw(r10, r10, shiftCount);
8065 __ lsrvw(r11, r11, shiftRevCount);
8066 __ orrw(r12, r10, r11);
8067 __ strw(r12, __ post(newArr, 4));
8068 __ tbz(numIter, 1, Exit);
8069 __ tbz(numIter, 0, ShiftOne);
8070
8071 __ BIND(ShiftTwo);
8072 __ ldrw(r10, __ post(oldArr, 4));
8073 __ ldrw(r11, __ post(oldArrNext, 4));
8074 __ lslvw(r10, r10, shiftCount);
8075 __ lsrvw(r11, r11, shiftRevCount);
8076 __ orrw(r12, r10, r11);
8077 __ strw(r12, __ post(newArr, 4));
8078
8079 __ BIND(ShiftOne);
8080 __ ldrw(r10, Address(oldArr));
8081 __ ldrw(r11, Address(oldArrNext));
8082 __ lslvw(r10, r10, shiftCount);
8083 __ lsrvw(r11, r11, shiftRevCount);
8084 __ orrw(r12, r10, r11);
8085 __ strw(r12, Address(newArr));
8086
8087 __ BIND(Exit);
8088 __ ret(lr);
8089
8090 return start;
8091 }
8092
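  // count_positives(ary1, len) conceptually scans a byte array for a value with
  // the sign bit set. A scalar sketch of the contract the vectorized code below
  // is understood to honour (on a hit it may return any count up to and
  // including the index of the first negative byte, but never a larger one):
  //
  //   size_t count_positives(const int8_t* ary, size_t len) {
  //     for (size_t i = 0; i < len; i++) {
  //       if (ary[i] < 0) return i;     // the stub may return a smaller "safe" count
  //     }
  //     return len;                     // no negative bytes
  //   }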
8093 address generate_count_positives(address &count_positives_long) {
8094 const u1 large_loop_size = 64;
8095 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
8096 int dcache_line = VM_Version::dcache_line_size();
8097
8098 Register ary1 = r1, len = r2, result = r0;
8099
8100 __ align(CodeEntryAlignment);
8101
8102 StubId stub_id = StubId::stubgen_count_positives_id;
8103 StubCodeMark mark(this, stub_id);
8104
8105 address entry = __ pc();
8106
8107 __ enter();
8108 // precondition: a copy of len is already in result
8109 // __ mov(result, len);
8110
8111 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
8112 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
8113
8114 __ cmp(len, (u1)15);
8115 __ br(Assembler::GT, LEN_OVER_15);
8116     // The only case where execution falls into this code is when the pointer is near
8117     // the end of a memory page and we have to avoid reading past the page boundary.
8118 __ add(ary1, ary1, len);
8119 __ subs(len, len, 8);
8120 __ br(Assembler::GT, LEN_OVER_8);
8121 __ ldr(rscratch2, Address(ary1, -8));
8122 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
8123 __ lsrv(rscratch2, rscratch2, rscratch1);
8124 __ tst(rscratch2, UPPER_BIT_MASK);
8125 __ csel(result, zr, result, Assembler::NE);
8126 __ leave();
8127 __ ret(lr);
8128 __ bind(LEN_OVER_8);
8129 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
8130     __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
8131 __ tst(rscratch2, UPPER_BIT_MASK);
8132 __ br(Assembler::NE, RET_NO_POP);
8133 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
8134 __ lsrv(rscratch1, rscratch1, rscratch2);
8135 __ tst(rscratch1, UPPER_BIT_MASK);
8136 __ bind(RET_NO_POP);
8137 __ csel(result, zr, result, Assembler::NE);
8138 __ leave();
8139 __ ret(lr);
8140
8141 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
8142 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
8143
8144 count_positives_long = __ pc(); // 2nd entry point
8145
8146 __ enter();
8147
8148 __ bind(LEN_OVER_15);
8149 __ push(spilled_regs, sp);
8150 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
8151 __ cbz(rscratch2, ALIGNED);
8152 __ ldp(tmp6, tmp1, Address(ary1));
8153 __ mov(tmp5, 16);
8154     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
8155 __ add(ary1, ary1, rscratch1);
8156 __ orr(tmp6, tmp6, tmp1);
8157 __ tst(tmp6, UPPER_BIT_MASK);
8158 __ br(Assembler::NE, RET_ADJUST);
8159 __ sub(len, len, rscratch1);
8160
8161 __ bind(ALIGNED);
8162 __ cmp(len, large_loop_size);
8163 __ br(Assembler::LT, CHECK_16);
8164     // Perform a 16-byte load in the pre-loop as an early return to handle the case
8165     // where an initially aligned large array has negative values in its starting bytes,
8166     // since otherwise LARGE_LOOP would do 4 reads instead of 1 (in the worst case),
8167     // which is slower. Cases with negative bytes further ahead are barely affected;
8168     // in fact they get faster due to the early loads and the fewer instructions and
8169     // branches in LARGE_LOOP.
8170 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
8171 __ sub(len, len, 16);
8172 __ orr(tmp6, tmp6, tmp1);
8173 __ tst(tmp6, UPPER_BIT_MASK);
8174 __ br(Assembler::NE, RET_ADJUST_16);
8175 __ cmp(len, large_loop_size);
8176 __ br(Assembler::LT, CHECK_16);
8177
8178 if (SoftwarePrefetchHintDistance >= 0
8179 && SoftwarePrefetchHintDistance >= dcache_line) {
8180 // initial prefetch
8181 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
8182 }
8183 __ bind(LARGE_LOOP);
8184 if (SoftwarePrefetchHintDistance >= 0) {
8185 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
8186 }
8187     // Issue the load instructions first, since this can save a few CPU/memory cycles.
8188     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one for each ldp),
8189     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which uses
8190     // fewer instructions and fewer branches. The downside is that this approach disables
8191     // the early return, so all 64 bytes are loaded and checked every time.
8192 __ ldp(tmp2, tmp3, Address(ary1));
8193 __ ldp(tmp4, tmp5, Address(ary1, 16));
8194 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
8195 __ ldp(tmp6, tmp1, Address(ary1, 48));
8196 __ add(ary1, ary1, large_loop_size);
8197 __ sub(len, len, large_loop_size);
8198 __ orr(tmp2, tmp2, tmp3);
8199 __ orr(tmp4, tmp4, tmp5);
8200 __ orr(rscratch1, rscratch1, rscratch2);
8201 __ orr(tmp6, tmp6, tmp1);
8202 __ orr(tmp2, tmp2, tmp4);
8203 __ orr(rscratch1, rscratch1, tmp6);
8204 __ orr(tmp2, tmp2, rscratch1);
8205 __ tst(tmp2, UPPER_BIT_MASK);
8206 __ br(Assembler::NE, RET_ADJUST_LONG);
8207 __ cmp(len, large_loop_size);
8208 __ br(Assembler::GE, LARGE_LOOP);
8209
8210 __ bind(CHECK_16); // small 16-byte load pre-loop
8211 __ cmp(len, (u1)16);
8212 __ br(Assembler::LT, POST_LOOP16);
8213
8214 __ bind(LOOP16); // small 16-byte load loop
8215 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
8216 __ sub(len, len, 16);
8217 __ orr(tmp2, tmp2, tmp3);
8218 __ tst(tmp2, UPPER_BIT_MASK);
8219 __ br(Assembler::NE, RET_ADJUST_16);
8220 __ cmp(len, (u1)16);
8221 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
8222
8223 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
8224 __ cmp(len, (u1)8);
8225 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
8226 __ ldr(tmp3, Address(__ post(ary1, 8)));
8227 __ tst(tmp3, UPPER_BIT_MASK);
8228 __ br(Assembler::NE, RET_ADJUST);
8229 __ sub(len, len, 8);
8230
8231 __ bind(POST_LOOP16_LOAD_TAIL);
8232 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
8233 __ ldr(tmp1, Address(ary1));
8234 __ mov(tmp2, 64);
8235 __ sub(tmp4, tmp2, len, __ LSL, 3);
8236 __ lslv(tmp1, tmp1, tmp4);
8237 __ tst(tmp1, UPPER_BIT_MASK);
8238 __ br(Assembler::NE, RET_ADJUST);
8239 // Fallthrough
8240
8241 __ bind(RET_LEN);
8242 __ pop(spilled_regs, sp);
8243 __ leave();
8244 __ ret(lr);
8245
8246     // The difference (result - len) is the count of bytes guaranteed to be
8247     // positive.
8248
8249 __ bind(RET_ADJUST_LONG);
8250 __ add(len, len, (u1)(large_loop_size - 16));
8251 __ bind(RET_ADJUST_16);
8252 __ add(len, len, 16);
8253 __ bind(RET_ADJUST);
8254 __ pop(spilled_regs, sp);
8255 __ leave();
8256 __ sub(result, result, len);
8257 __ ret(lr);
8258
8259 return entry;
8260 }
8261
8262 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
8263 bool usePrefetch, Label &NOT_EQUAL) {
8264 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8265 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8266 tmp7 = r12, tmp8 = r13;
8267 Label LOOP;
8268
8269 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8270 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8271 __ bind(LOOP);
8272 if (usePrefetch) {
8273 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8274 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8275 }
8276 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8277 __ eor(tmp1, tmp1, tmp2);
8278 __ eor(tmp3, tmp3, tmp4);
8279 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8280 __ orr(tmp1, tmp1, tmp3);
8281 __ cbnz(tmp1, NOT_EQUAL);
8282 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8283 __ eor(tmp5, tmp5, tmp6);
8284 __ eor(tmp7, tmp7, tmp8);
8285 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8286 __ orr(tmp5, tmp5, tmp7);
8287 __ cbnz(tmp5, NOT_EQUAL);
8288 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8289 __ eor(tmp1, tmp1, tmp2);
8290 __ eor(tmp3, tmp3, tmp4);
8291 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8292 __ orr(tmp1, tmp1, tmp3);
8293 __ cbnz(tmp1, NOT_EQUAL);
8294 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8295 __ eor(tmp5, tmp5, tmp6);
8296 __ sub(cnt1, cnt1, 8 * wordSize);
8297 __ eor(tmp7, tmp7, tmp8);
8298 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8299     // tmp6 is not used here. MacroAssembler::subs is used (rather than
8300     // cmp) because subs allows an arbitrary immediate operand.
8301 __ subs(tmp6, cnt1, loopThreshold);
8302 __ orr(tmp5, tmp5, tmp7);
8303 __ cbnz(tmp5, NOT_EQUAL);
8304 __ br(__ GE, LOOP);
8305 // post-loop
8306 __ eor(tmp1, tmp1, tmp2);
8307 __ eor(tmp3, tmp3, tmp4);
8308 __ orr(tmp1, tmp1, tmp3);
8309 __ sub(cnt1, cnt1, 2 * wordSize);
8310 __ cbnz(tmp1, NOT_EQUAL);
8311 }
8312
8313 void generate_large_array_equals_loop_simd(int loopThreshold,
8314 bool usePrefetch, Label &NOT_EQUAL) {
8315 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8316 tmp2 = rscratch2;
8317 Label LOOP;
8318
8319 __ bind(LOOP);
8320 if (usePrefetch) {
8321 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8322 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8323 }
8324 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
8325 __ sub(cnt1, cnt1, 8 * wordSize);
8326 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
8327 __ subs(tmp1, cnt1, loopThreshold);
8328 __ eor(v0, __ T16B, v0, v4);
8329 __ eor(v1, __ T16B, v1, v5);
8330 __ eor(v2, __ T16B, v2, v6);
8331 __ eor(v3, __ T16B, v3, v7);
8332 __ orr(v0, __ T16B, v0, v1);
8333 __ orr(v1, __ T16B, v2, v3);
8334 __ orr(v0, __ T16B, v0, v1);
8335 __ umov(tmp1, v0, __ D, 0);
8336 __ umov(tmp2, v0, __ D, 1);
8337 __ orr(tmp1, tmp1, tmp2);
8338 __ cbnz(tmp1, NOT_EQUAL);
8339 __ br(__ GE, LOOP);
8340 }
8341
8342 // a1 = r1 - array1 address
8343 // a2 = r2 - array2 address
8344 // result = r0 - return value. Already contains "false"
8345 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
8346 // r3-r5 are reserved temporary registers
8347 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
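  // Conceptually the stub performs a word-wise equality check over the remaining
  // bytes, XOR-ing 8-byte pairs and OR-accumulating the differences. A scalar
  // sketch (illustrative only; the real code bails out early on a mismatch):
  //
  //   bool words_equal(const uint64_t* a1, const uint64_t* a2, size_t nwords) {
  //     uint64_t diff = 0;
  //     for (size_t i = 0; i < nwords; i++) {
  //       diff |= a1[i] ^ a2[i];        // any differing bit makes diff non-zero
  //     }
  //     return diff == 0;
  //   }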
8348 address generate_large_array_equals() {
8349 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8350 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8351 tmp7 = r12, tmp8 = r13;
8352 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
8353 SMALL_LOOP, POST_LOOP;
8354 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
8355     // loop threshold: stay in the prefetching loop only while at least 32 of the prefetched bytes will be used
8356 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
8357 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
8358 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
8359 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
8360 tmp5, tmp6, tmp7, tmp8);
8361
8362 __ align(CodeEntryAlignment);
8363
8364 StubId stub_id = StubId::stubgen_large_array_equals_id;
8365 StubCodeMark mark(this, stub_id);
8366
8367 address entry = __ pc();
8368 __ enter();
8369 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
8370 // also advance pointers to use post-increment instead of pre-increment
8371 __ add(a1, a1, wordSize);
8372 __ add(a2, a2, wordSize);
8373 if (AvoidUnalignedAccesses) {
8374       // Both implementations (SIMD and non-SIMD) use relatively large load
8375       // instructions (ld1/ldp), which carry a big penalty (up to 2x execution time)
8376       // on some CPUs when the address is not at least 16-byte aligned.
8377       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
8378       // load, if needed, to make at least the first address 16-byte aligned.
8379 Label ALIGNED16;
8380 __ tbz(a1, 3, ALIGNED16);
8381 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8382 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8383 __ sub(cnt1, cnt1, wordSize);
8384 __ eor(tmp1, tmp1, tmp2);
8385 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
8386 __ bind(ALIGNED16);
8387 }
8388 if (UseSIMDForArrayEquals) {
8389 if (SoftwarePrefetchHintDistance >= 0) {
8390 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8391 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8392 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
8393 /* prfm = */ true, NOT_EQUAL);
8394 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8395 __ br(__ LT, TAIL);
8396 }
8397 __ bind(NO_PREFETCH_LARGE_LOOP);
8398 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
8399 /* prfm = */ false, NOT_EQUAL);
8400 } else {
8401 __ push(spilled_regs, sp);
8402 if (SoftwarePrefetchHintDistance >= 0) {
8403 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8404 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8405 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
8406 /* prfm = */ true, NOT_EQUAL);
8407 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8408 __ br(__ LT, TAIL);
8409 }
8410 __ bind(NO_PREFETCH_LARGE_LOOP);
8411 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
8412 /* prfm = */ false, NOT_EQUAL);
8413 }
8414 __ bind(TAIL);
8415 __ cbz(cnt1, EQUAL);
8416 __ subs(cnt1, cnt1, wordSize);
8417 __ br(__ LE, POST_LOOP);
8418 __ bind(SMALL_LOOP);
8419 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8420 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8421 __ subs(cnt1, cnt1, wordSize);
8422 __ eor(tmp1, tmp1, tmp2);
8423 __ cbnz(tmp1, NOT_EQUAL);
8424 __ br(__ GT, SMALL_LOOP);
8425 __ bind(POST_LOOP);
8426 __ ldr(tmp1, Address(a1, cnt1));
8427 __ ldr(tmp2, Address(a2, cnt1));
8428 __ eor(tmp1, tmp1, tmp2);
8429 __ cbnz(tmp1, NOT_EQUAL);
8430 __ bind(EQUAL);
8431 __ mov(result, true);
8432 __ bind(NOT_EQUAL);
8433 if (!UseSIMDForArrayEquals) {
8434 __ pop(spilled_regs, sp);
8435 }
8436 __ bind(NOT_EQUAL_NO_POP);
8437 __ leave();
8438 __ ret(lr);
8439 return entry;
8440 }
8441
8442 // result = r0 - return value. Contains initial hashcode value on entry.
8443 // ary = r1 - array address
8444 // cnt = r2 - elements count
8445 // Clobbers: v0-v13, rscratch1, rscratch2
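  // Scalar reference for what the vectorized code below computes, i.e. the usual
  // Arrays.hashCode recurrence (illustrative only):
  //
  //   int h = result;                   // incoming hashcode in r0
  //   for (int i = 0; i < cnt; i++) {
  //     h = 31 * h + (int)ary[i];       // element widened according to eltype
  //   }
  //   return h;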
8446 address generate_large_arrays_hashcode(BasicType eltype) {
8447 const Register result = r0, ary = r1, cnt = r2;
8448 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
8449 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
8450 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
8451 const FloatRegister vpowm = v13;
8452
8453 ARRAYS_HASHCODE_REGISTERS;
8454
8455 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
8456
8457 unsigned int vf; // vectorization factor
8458 bool multiply_by_halves;
8459 Assembler::SIMD_Arrangement load_arrangement;
8460 switch (eltype) {
8461 case T_BOOLEAN:
8462 case T_BYTE:
8463 load_arrangement = Assembler::T8B;
8464 multiply_by_halves = true;
8465 vf = 8;
8466 break;
8467 case T_CHAR:
8468 case T_SHORT:
8469 load_arrangement = Assembler::T8H;
8470 multiply_by_halves = true;
8471 vf = 8;
8472 break;
8473 case T_INT:
8474 load_arrangement = Assembler::T4S;
8475 multiply_by_halves = false;
8476 vf = 4;
8477 break;
8478 default:
8479 ShouldNotReachHere();
8480 }
8481
8482 // Unroll factor
8483 const unsigned uf = 4;
8484
8485 // Effective vectorization factor
8486 const unsigned evf = vf * uf;
8487
8488 __ align(CodeEntryAlignment);
8489
8490 StubId stub_id;
8491 switch (eltype) {
8492 case T_BOOLEAN:
8493 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
8494 break;
8495 case T_BYTE:
8496 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
8497 break;
8498 case T_CHAR:
8499 stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
8500 break;
8501 case T_SHORT:
8502 stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
8503 break;
8504 case T_INT:
8505 stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
8506 break;
8507 default:
8508 stub_id = StubId::NO_STUBID;
8509 ShouldNotReachHere();
8510 };
8511
8512 StubCodeMark mark(this, stub_id);
8513
8514 address entry = __ pc();
8515 __ enter();
8516
8517     // Put the 0th-3rd powers of 31 together into a single SIMD register. The register is used
8518     // in the SMALL and LARGE loops' epilogues. The initialization is hoisted here because the
8519     // register's value does not change throughout either loop.
8520 __ movw(rscratch1, intpow(31U, 3));
8521 __ mov(vpow, Assembler::S, 0, rscratch1);
8522 __ movw(rscratch1, intpow(31U, 2));
8523 __ mov(vpow, Assembler::S, 1, rscratch1);
8524 __ movw(rscratch1, intpow(31U, 1));
8525 __ mov(vpow, Assembler::S, 2, rscratch1);
8526 __ movw(rscratch1, intpow(31U, 0));
8527 __ mov(vpow, Assembler::S, 3, rscratch1);
8528
8529 __ mov(vmul0, Assembler::T16B, 0);
8530 __ mov(vmul0, Assembler::S, 3, result);
8531
8532 __ andr(rscratch2, cnt, (uf - 1) * vf);
8533 __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
8534
8535 __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
8536 __ mov(vpowm, Assembler::S, 0, rscratch1);
8537
8538 // SMALL LOOP
8539 __ bind(SMALL_LOOP);
8540
8541 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
8542 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8543 __ subsw(rscratch2, rscratch2, vf);
8544
8545 if (load_arrangement == Assembler::T8B) {
8546 // Extend 8B to 8H to be able to use vector multiply
8547 // instructions
8548 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8549 if (is_signed_subword_type(eltype)) {
8550 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8551 } else {
8552 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8553 }
8554 }
8555
8556 switch (load_arrangement) {
8557 case Assembler::T4S:
8558 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8559 break;
8560 case Assembler::T8B:
8561 case Assembler::T8H:
8562 assert(is_subword_type(eltype), "subword type expected");
8563 if (is_signed_subword_type(eltype)) {
8564 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8565 } else {
8566 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8567 }
8568 break;
8569 default:
8570 __ should_not_reach_here();
8571 }
8572
8573 // Process the upper half of a vector
8574 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8575 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8576 if (is_signed_subword_type(eltype)) {
8577 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8578 } else {
8579 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8580 }
8581 }
8582
8583 __ br(Assembler::HI, SMALL_LOOP);
8584
8585     // SMALL LOOP'S EPILOGUE
8586 __ lsr(rscratch2, cnt, exact_log2(evf));
8587 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
8588
8589 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8590 __ addv(vmul0, Assembler::T4S, vmul0);
8591 __ umov(result, vmul0, Assembler::S, 0);
8592
8593 // TAIL
8594 __ bind(TAIL);
8595
8596     // The andr computes cnt % vf. The scaled subtract makes the branch target skip past
8597     // vf - 1 - (cnt % vf) load + madd pairs, i.e. only cnt % vf load + madd pairs execute.
8598 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
8599 __ andr(rscratch2, cnt, vf - 1);
8600 __ bind(TAIL_SHORTCUT);
8601 __ adr(rscratch1, BR_BASE);
8602 // For Cortex-A53 offset is 4 because 2 nops are generated.
8603 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
8604 __ movw(rscratch2, 0x1f);
8605 __ br(rscratch1);
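    // Worked example (assuming vf == 8 and no Cortex-A53 padding): if cnt % vf == 3,
    // the sub above moves the target 3 * 8 bytes (i.e. 3 load + madd pairs) before
    // BR_BASE, so exactly the last 3 of the vf - 1 unrolled pairs are executed.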
8606
8607 for (size_t i = 0; i < vf - 1; ++i) {
8608 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
8609 eltype);
8610 __ maddw(result, result, rscratch2, rscratch1);
8611 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
8612 // Generate 2nd nop to have 4 instructions per iteration.
8613 if (VM_Version::supports_a53mac()) {
8614 __ nop();
8615 }
8616 }
8617 __ bind(BR_BASE);
8618
8619 __ leave();
8620 __ ret(lr);
8621
8622 // LARGE LOOP
8623 __ bind(LARGE_LOOP_PREHEADER);
8624
8625 __ lsr(rscratch2, cnt, exact_log2(evf));
8626
8627 if (multiply_by_halves) {
8628 // 31^4 - multiplier between lower and upper parts of a register
8629 __ movw(rscratch1, intpow(31U, vf / 2));
8630 __ mov(vpowm, Assembler::S, 1, rscratch1);
8631       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
8632 __ movw(rscratch1, intpow(31U, evf - vf / 2));
8633 __ mov(vpowm, Assembler::S, 0, rscratch1);
8634 } else {
8635 // 31^16
8636 __ movw(rscratch1, intpow(31U, evf));
8637 __ mov(vpowm, Assembler::S, 0, rscratch1);
8638 }
8639
8640 __ mov(vmul3, Assembler::T16B, 0);
8641 __ mov(vmul2, Assembler::T16B, 0);
8642 __ mov(vmul1, Assembler::T16B, 0);
8643
8644 __ bind(LARGE_LOOP);
8645
8646 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
8647 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
8648 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
8649 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8650
8651 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
8652 Address(__ post(ary, evf * type2aelembytes(eltype))));
8653
8654 if (load_arrangement == Assembler::T8B) {
8655 // Extend 8B to 8H to be able to use vector multiply
8656 // instructions
8657 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8658 if (is_signed_subword_type(eltype)) {
8659 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8660 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8661 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8662 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8663 } else {
8664 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8665 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8666 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8667 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8668 }
8669 }
8670
8671 switch (load_arrangement) {
8672 case Assembler::T4S:
8673 __ addv(vmul3, load_arrangement, vmul3, vdata3);
8674 __ addv(vmul2, load_arrangement, vmul2, vdata2);
8675 __ addv(vmul1, load_arrangement, vmul1, vdata1);
8676 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8677 break;
8678 case Assembler::T8B:
8679 case Assembler::T8H:
8680 assert(is_subword_type(eltype), "subword type expected");
8681 if (is_signed_subword_type(eltype)) {
8682 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8683 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8684 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8685 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8686 } else {
8687 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8688 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8689 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8690 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8691 }
8692 break;
8693 default:
8694 __ should_not_reach_here();
8695 }
8696
8697 // Process the upper half of a vector
8698 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8699 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
8700 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
8701 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
8702 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
8703 if (is_signed_subword_type(eltype)) {
8704 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8705 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8706 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8707 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8708 } else {
8709 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8710 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8711 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8712 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8713 }
8714 }
8715
8716 __ subsw(rscratch2, rscratch2, 1);
8717 __ br(Assembler::HI, LARGE_LOOP);
8718
8719 __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
8720 __ addv(vmul3, Assembler::T4S, vmul3);
8721 __ umov(result, vmul3, Assembler::S, 0);
8722
8723 __ mov(rscratch2, intpow(31U, vf));
8724
8725 __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
8726 __ addv(vmul2, Assembler::T4S, vmul2);
8727 __ umov(rscratch1, vmul2, Assembler::S, 0);
8728 __ maddw(result, result, rscratch2, rscratch1);
8729
8730 __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
8731 __ addv(vmul1, Assembler::T4S, vmul1);
8732 __ umov(rscratch1, vmul1, Assembler::S, 0);
8733 __ maddw(result, result, rscratch2, rscratch1);
8734
8735 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8736 __ addv(vmul0, Assembler::T4S, vmul0);
8737 __ umov(rscratch1, vmul0, Assembler::S, 0);
8738 __ maddw(result, result, rscratch2, rscratch1);
8739
8740 __ andr(rscratch2, cnt, vf - 1);
8741 __ cbnz(rscratch2, TAIL_SHORTCUT);
8742
8743 __ leave();
8744 __ ret(lr);
8745
8746 return entry;
8747 }
8748
8749 address generate_dsin_dcos(bool isCos) {
8750 __ align(CodeEntryAlignment);
8751 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
8752 StubCodeMark mark(this, stub_id);
8753 address start = __ pc();
8754 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
8755 (address)StubRoutines::aarch64::_two_over_pi,
8756 (address)StubRoutines::aarch64::_pio2,
8757 (address)StubRoutines::aarch64::_dsin_coef,
8758 (address)StubRoutines::aarch64::_dcos_coef);
8759 return start;
8760 }
8761
8762   // code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
8763 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
8764 Label &DIFF2) {
8765 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
8766 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
8767
8768 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
8769 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8770 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
8771 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
8772
8773 __ fmovd(tmpL, vtmp3);
8774 __ eor(rscratch2, tmp3, tmpL);
8775 __ cbnz(rscratch2, DIFF2);
8776
8777 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8778 __ umov(tmpL, vtmp3, __ D, 1);
8779 __ eor(rscratch2, tmpU, tmpL);
8780 __ cbnz(rscratch2, DIFF1);
8781
8782 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
8783 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8784 __ fmovd(tmpL, vtmp);
8785 __ eor(rscratch2, tmp3, tmpL);
8786 __ cbnz(rscratch2, DIFF2);
8787
8788 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8789 __ umov(tmpL, vtmp, __ D, 1);
8790 __ eor(rscratch2, tmpU, tmpL);
8791 __ cbnz(rscratch2, DIFF1);
8792 }
8793
8794 // r0 = result
8795 // r1 = str1
8796 // r2 = cnt1
8797 // r3 = str2
8798 // r4 = cnt2
8799 // r10 = tmp1
8800 // r11 = tmp2
8801 address generate_compare_long_string_different_encoding(bool isLU) {
8802 __ align(CodeEntryAlignment);
8803 StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
8804 StubCodeMark mark(this, stub_id);
8805 address entry = __ pc();
8806 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
8807 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
8808 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
8809 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
8810 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
8811 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
8812 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
8813
8814 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
8815
8816 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
8817     // cnt2 == number of characters left to compare
8818     // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
8819 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8820 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
8821 __ add(str2, str2, isLU ? wordSize : wordSize/2);
8822 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
8823 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
8824 __ eor(rscratch2, tmp1, tmp2);
8825 __ mov(rscratch1, tmp2);
8826 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
8827 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
8828 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
8829 __ push(spilled_regs, sp);
8830 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
8831 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
8832
8833 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8834
8835 if (SoftwarePrefetchHintDistance >= 0) {
8836 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8837 __ br(__ LT, NO_PREFETCH);
8838 __ bind(LARGE_LOOP_PREFETCH);
8839 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
8840 __ mov(tmp4, 2);
8841 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8842 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
8843 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8844 __ subs(tmp4, tmp4, 1);
8845 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
8846 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8847 __ mov(tmp4, 2);
8848 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
8849 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8850 __ subs(tmp4, tmp4, 1);
8851 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
8852 __ sub(cnt2, cnt2, 64);
8853 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8854 __ br(__ GE, LARGE_LOOP_PREFETCH);
8855 }
8856 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
8857 __ bind(NO_PREFETCH);
8858 __ subs(cnt2, cnt2, 16);
8859 __ br(__ LT, TAIL);
8860 __ align(OptoLoopAlignment);
8861 __ bind(SMALL_LOOP); // smaller loop
8862 __ subs(cnt2, cnt2, 16);
8863 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8864 __ br(__ GE, SMALL_LOOP);
8865 __ cmn(cnt2, (u1)16);
8866 __ br(__ EQ, LOAD_LAST);
8867 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
8868 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
8869 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
8870 __ ldr(tmp3, Address(cnt1, -8));
8871 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
8872 __ b(LOAD_LAST);
8873 __ bind(DIFF2);
8874 __ mov(tmpU, tmp3);
8875 __ bind(DIFF1);
8876 __ pop(spilled_regs, sp);
8877 __ b(CALCULATE_DIFFERENCE);
8878 __ bind(LOAD_LAST);
8879     // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU,
8880     // so there is no need to load them again.
8881 __ mov(tmpU, tmp3);
8882 __ pop(spilled_regs, sp);
8883
8884 // tmp2 points to the address of the last 4 Latin1 characters right now
8885 __ ldrs(vtmp, Address(tmp2));
8886 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8887 __ fmovd(tmpL, vtmp);
8888
8889 __ eor(rscratch2, tmpU, tmpL);
8890 __ cbz(rscratch2, DONE);
8891
8892 // Find the first different characters in the longwords and
8893 // compute their difference.
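    // Roughly, the code below computes (tmp1 and rscratch1 hold the two 8-byte
    // groups being compared, rscratch2 their XOR):
    //   k      = clz(rev(rscratch2)) & ~15;   // bit offset of the first differing char
    //   result = (uint16_t)(tmp1 >> k) - (uint16_t)(rscratch1 >> k);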
8894 __ bind(CALCULATE_DIFFERENCE);
8895 __ rev(rscratch2, rscratch2);
8896 __ clz(rscratch2, rscratch2);
8897 __ andr(rscratch2, rscratch2, -16);
8898 __ lsrv(tmp1, tmp1, rscratch2);
8899 __ uxthw(tmp1, tmp1);
8900 __ lsrv(rscratch1, rscratch1, rscratch2);
8901 __ uxthw(rscratch1, rscratch1);
8902 __ subw(result, tmp1, rscratch1);
8903 __ bind(DONE);
8904 __ ret(lr);
8905 return entry;
8906 }
8907
8908 // r0 = input (float16)
8909 // v0 = result (float)
8910 // v1 = temporary float register
8911 address generate_float16ToFloat() {
8912 __ align(CodeEntryAlignment);
8913 StubId stub_id = StubId::stubgen_hf2f_id;
8914 StubCodeMark mark(this, stub_id);
8915 address entry = __ pc();
8916 BLOCK_COMMENT("Entry:");
8917 __ flt16_to_flt(v0, r0, v1);
8918 __ ret(lr);
8919 return entry;
8920 }
8921
8922 // v0 = input (float)
8923 // r0 = result (float16)
8924 // v1 = temporary float register
8925 address generate_floatToFloat16() {
8926 __ align(CodeEntryAlignment);
8927 StubId stub_id = StubId::stubgen_f2hf_id;
8928 StubCodeMark mark(this, stub_id);
8929 address entry = __ pc();
8930 BLOCK_COMMENT("Entry:");
8931 __ flt_to_flt16(r0, v0, v1);
8932 __ ret(lr);
8933 return entry;
8934 }
8935
8936 address generate_method_entry_barrier() {
8937 __ align(CodeEntryAlignment);
8938 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
8939 StubCodeMark mark(this, stub_id);
8940
8941 Label deoptimize_label;
8942
8943 address start = __ pc();
8944
8945 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
8946
8947 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
8948 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8949 // We can get here despite the nmethod being good, if we have not
8950 // yet applied our cross modification fence (or data fence).
8951 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
8952 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
8953 __ ldrw(rscratch2, rscratch2);
8954 __ strw(rscratch2, thread_epoch_addr);
8955 __ isb();
8956 __ membar(__ LoadLoad);
8957 }
8958
8959 __ set_last_Java_frame(sp, rfp, lr, rscratch1);
8960
8961 __ enter();
8962 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
8963
8964 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
8965
8966 __ push_call_clobbered_registers();
8967
8968 __ mov(c_rarg0, rscratch2);
8969 __ call_VM_leaf
8970 (CAST_FROM_FN_PTR
8971 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
8972
8973 __ reset_last_Java_frame(true);
8974
8975 __ mov(rscratch1, r0);
8976
8977 __ pop_call_clobbered_registers();
8978
8979 __ cbnz(rscratch1, deoptimize_label);
8980
8981 __ leave();
8982 __ ret(lr);
8983
8984 __ BIND(deoptimize_label);
8985
8986 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
8987 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
8988
8989 __ mov(sp, rscratch1);
8990 __ br(rscratch2);
8991
8992 return start;
8993 }
8994
8995 // r0 = result
8996 // r1 = str1
8997 // r2 = cnt1
8998 // r3 = str2
8999 // r4 = cnt2
9000 // r10 = tmp1
9001 // r11 = tmp2
9002 address generate_compare_long_string_same_encoding(bool isLL) {
9003 __ align(CodeEntryAlignment);
9004 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
9005 StubCodeMark mark(this, stub_id);
9006 address entry = __ pc();
9007 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9008 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
9009
9010 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
9011
9012     // exit from the large loop when fewer than 64 bytes are left to read or we are about
9013     // to prefetch memory beyond the array bound
9014 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
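    // For example, with SoftwarePrefetchHintDistance == 192 this threshold is
    // 192 characters for Latin1 (isLL) and 96 for UTF-16, i.e. 192 bytes of
    // remaining data in either case.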
9015
9016     // 8 bytes were already pre-loaded before jumping to the stub, so compare them directly
9017 __ eor(rscratch2, tmp1, tmp2);
9018 __ cbnz(rscratch2, CAL_DIFFERENCE);
9019
9020 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
9021 // update pointers, because of previous read
9022 __ add(str1, str1, wordSize);
9023 __ add(str2, str2, wordSize);
9024 if (SoftwarePrefetchHintDistance >= 0) {
9025 __ align(OptoLoopAlignment);
9026 __ bind(LARGE_LOOP_PREFETCH);
9027 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
9028 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
9029
9030 for (int i = 0; i < 4; i++) {
9031 __ ldp(tmp1, tmp1h, Address(str1, i * 16));
9032 __ ldp(tmp2, tmp2h, Address(str2, i * 16));
9033 __ cmp(tmp1, tmp2);
9034 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9035 __ br(Assembler::NE, DIFF);
9036 }
9037 __ sub(cnt2, cnt2, isLL ? 64 : 32);
9038 __ add(str1, str1, 64);
9039 __ add(str2, str2, 64);
9040 __ subs(rscratch2, cnt2, largeLoopExitCondition);
9041 __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
9042 __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
9043 }
9044
9045 __ subs(rscratch1, cnt2, isLL ? 16 : 8);
9046 __ br(Assembler::LE, LESS16);
9047 __ align(OptoLoopAlignment);
9048 __ bind(LOOP_COMPARE16);
9049 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9050 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9051 __ cmp(tmp1, tmp2);
9052 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9053 __ br(Assembler::NE, DIFF);
9054 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9055 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9056 __ br(Assembler::LT, LESS16);
9057
9058 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9059 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9060 __ cmp(tmp1, tmp2);
9061 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9062 __ br(Assembler::NE, DIFF);
9063 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9064 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9065 __ br(Assembler::GE, LOOP_COMPARE16);
9066 __ cbz(cnt2, LENGTH_DIFF);
9067
9068 __ bind(LESS16);
9069     // compare the next 8 bytes if enough characters remain
9070 __ subs(cnt2, cnt2, isLL ? 8 : 4);
9071 __ br(Assembler::LE, LESS8);
9072 __ ldr(tmp1, Address(__ post(str1, 8)));
9073 __ ldr(tmp2, Address(__ post(str2, 8)));
9074 __ eor(rscratch2, tmp1, tmp2);
9075 __ cbnz(rscratch2, CAL_DIFFERENCE);
9076 __ sub(cnt2, cnt2, isLL ? 8 : 4);
9077
9078 __ bind(LESS8); // directly load last 8 bytes
9079 if (!isLL) {
9080 __ add(cnt2, cnt2, cnt2);
9081 }
9082 __ ldr(tmp1, Address(str1, cnt2));
9083 __ ldr(tmp2, Address(str2, cnt2));
9084 __ eor(rscratch2, tmp1, tmp2);
9085 __ cbz(rscratch2, LENGTH_DIFF);
9086 __ b(CAL_DIFFERENCE);
9087
9088 __ bind(DIFF);
9089 __ cmp(tmp1, tmp2);
9090 __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
9091 __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
9092 // reuse rscratch2 register for the result of eor instruction
9093 __ eor(rscratch2, tmp1, tmp2);
9094
9095 __ bind(CAL_DIFFERENCE);
9096 __ rev(rscratch2, rscratch2);
9097 __ clz(rscratch2, rscratch2);
9098 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
9099 __ lsrv(tmp1, tmp1, rscratch2);
9100 __ lsrv(tmp2, tmp2, rscratch2);
9101 if (isLL) {
9102 __ uxtbw(tmp1, tmp1);
9103 __ uxtbw(tmp2, tmp2);
9104 } else {
9105 __ uxthw(tmp1, tmp1);
9106 __ uxthw(tmp2, tmp2);
9107 }
9108 __ subw(result, tmp1, tmp2);
9109
9110 __ bind(LENGTH_DIFF);
9111 __ ret(lr);
9112 return entry;
9113 }
9114
9115 enum string_compare_mode {
9116 LL,
9117 LU,
9118 UL,
9119 UU,
9120 };
9121
9122 // The following registers are declared in aarch64.ad
9123 // r0 = result
9124 // r1 = str1
9125 // r2 = cnt1
9126 // r3 = str2
9127 // r4 = cnt2
9128 // r10 = tmp1
9129 // r11 = tmp2
9130 // z0 = ztmp1
9131 // z1 = ztmp2
9132 // p0 = pgtmp1
9133 // p1 = pgtmp2
9134 address generate_compare_long_string_sve(string_compare_mode mode) {
9135 StubId stub_id;
9136 switch (mode) {
9137 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
9138 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
9139 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
9140 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
9141 default: ShouldNotReachHere();
9142 }
9143
9144 __ align(CodeEntryAlignment);
9145 address entry = __ pc();
9146 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9147 tmp1 = r10, tmp2 = r11;
9148
9149 Label LOOP, DONE, MISMATCH;
9150 Register vec_len = tmp1;
9151 Register idx = tmp2;
9152 // The minimum of the string lengths has been stored in cnt2.
9153 Register cnt = cnt2;
9154 FloatRegister ztmp1 = z0, ztmp2 = z1;
9155 PRegister pgtmp1 = p0, pgtmp2 = p1;
9156
9157 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
9158 switch (mode) { \
9159 case LL: \
9160 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
9161 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
9162 break; \
9163 case LU: \
9164 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
9165 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9166 break; \
9167 case UL: \
9168 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9169 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
9170 break; \
9171 case UU: \
9172 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9173 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9174 break; \
9175 default: \
9176 ShouldNotReachHere(); \
9177 }
9178
9179 StubCodeMark mark(this, stub_id);
9180
9181 __ mov(idx, 0);
9182 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9183
9184 if (mode == LL) {
9185 __ sve_cntb(vec_len);
9186 } else {
9187 __ sve_cnth(vec_len);
9188 }
9189
9190 __ sub(rscratch1, cnt, vec_len);
9191
9192 __ bind(LOOP);
9193
9194 // main loop
9195 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9196 __ add(idx, idx, vec_len);
9197 // Compare strings.
9198 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9199 __ br(__ NE, MISMATCH);
9200 __ cmp(idx, rscratch1);
9201 __ br(__ LT, LOOP);
9202
9203 // post loop, last iteration
9204 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9205
9206 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9207 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9208 __ br(__ EQ, DONE);
9209
9210 __ bind(MISMATCH);
9211
9212 // Crop the vector to find its location.
9213 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
9214 // Extract the first different characters of each string.
9215 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
9216 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
9217
9218 // Compute the difference of the first different characters.
9219 __ sub(result, rscratch1, rscratch2);
9220
9221 __ bind(DONE);
9222 __ ret(lr);
9223 #undef LOAD_PAIR
9224 return entry;
9225 }
9226
9227 void generate_compare_long_strings() {
9228 if (UseSVE == 0) {
9229 StubRoutines::aarch64::_compare_long_string_LL
9230 = generate_compare_long_string_same_encoding(true);
9231 StubRoutines::aarch64::_compare_long_string_UU
9232 = generate_compare_long_string_same_encoding(false);
9233 StubRoutines::aarch64::_compare_long_string_LU
9234 = generate_compare_long_string_different_encoding(true);
9235 StubRoutines::aarch64::_compare_long_string_UL
9236 = generate_compare_long_string_different_encoding(false);
9237 } else {
9238 StubRoutines::aarch64::_compare_long_string_LL
9239 = generate_compare_long_string_sve(LL);
9240 StubRoutines::aarch64::_compare_long_string_UU
9241 = generate_compare_long_string_sve(UU);
9242 StubRoutines::aarch64::_compare_long_string_LU
9243 = generate_compare_long_string_sve(LU);
9244 StubRoutines::aarch64::_compare_long_string_UL
9245 = generate_compare_long_string_sve(UL);
9246 }
9247 }
9248
9249 // R0 = result
9250 // R1 = str2
9251 // R2 = cnt1
9252 // R3 = str1
9253 // R4 = cnt2
9254 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
9255 //
9256   // This generic linear code uses a few additional ideas that make it faster:
9257   // 1) we can safely keep at least the 1st register of the pattern (since its length >= 8)
9258   //    in order to skip the initial load (helps on systems with a single ld pipeline)
9259   // 2) we can use a "fast" SWAR algorithm for finding a single character, so the search
9260   //    for the first symbol needs fewer branches (1 branch per loaded register instead
9261   //    of one branch per symbol); this is where constants like 0x0101...01,
9262   //    0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from (see the sketch below)
9263   // 3) after loading and analyzing the 1st register of the source string, it can be
9264   //    reused to search for every occurrence of the 1st character, saving a few loads
9265   //    compared with a simpler-but-slower implementation
9266   // 4) in order to avoid lots of push/pop operations, the code below heavily reuses,
9267   //    re-initializes and compresses register values, which makes the code larger and
9268   //    a bit less readable; however, most of the extra operations are issued during
9269   //    loads or branches, so the penalty is minimal
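  //
  // A scalar sketch of the SWAR first-character search used in 2), for the
  // Latin1 case (illustrative only; the UTF-16 case uses the 0x0001... and
  // 0x7fff... constants and 16-bit lanes instead, and the generated code
  // expresses the same test with orr/bics):
  //
  //   uint64_t x     = loaded_8_bytes ^ (first_char * 0x0101010101010101ull);
  //   uint64_t match = (x - 0x0101010101010101ull) & ~x & 0x8080808080808080ull;
  //   // every 0x80 bit set in 'match' marks a byte position where the first
  //   // pattern character occurs in the loaded word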
9270 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
9271 StubId stub_id;
9272 if (str1_isL) {
9273 if (str2_isL) {
9274 stub_id = StubId::stubgen_string_indexof_linear_ll_id;
9275 } else {
9276 stub_id = StubId::stubgen_string_indexof_linear_ul_id;
9277 }
9278 } else {
9279 if (str2_isL) {
9280 ShouldNotReachHere();
9281 } else {
9282 stub_id = StubId::stubgen_string_indexof_linear_uu_id;
9283 }
9284 }
9285 __ align(CodeEntryAlignment);
9286 StubCodeMark mark(this, stub_id);
9287 address entry = __ pc();
9288
9289 int str1_chr_size = str1_isL ? 1 : 2;
9290 int str2_chr_size = str2_isL ? 1 : 2;
9291 int str1_chr_shift = str1_isL ? 0 : 1;
9292 int str2_chr_shift = str2_isL ? 0 : 1;
9293 bool isL = str1_isL && str2_isL;
9294 // parameters
9295 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
9296 // temporary registers
9297 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
9298 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
9299 // redefinitions
9300 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
9301
9302 __ push(spilled_regs, sp);
9303 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
9304 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
9305 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
9306 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
9307 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
9308 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
9309 // Read whole register from str1. It is safe, because length >=8 here
9310 __ ldr(ch1, Address(str1));
9311 // Read whole register from str2. It is safe, because length >=8 here
9312 __ ldr(ch2, Address(str2));
9313 __ sub(cnt2, cnt2, cnt1);
9314 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
9315 if (str1_isL != str2_isL) {
9316 __ eor(v0, __ T16B, v0, v0);
9317 }
9318 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
9319 __ mul(first, first, tmp1);
9320 // check if we have less than 1 register to check
9321 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
9322 if (str1_isL != str2_isL) {
9323 __ fmovd(v1, ch1);
9324 }
9325 __ br(__ LE, L_SMALL);
9326 __ eor(ch2, first, ch2);
9327 if (str1_isL != str2_isL) {
9328 __ zip1(v1, __ T16B, v1, v0);
9329 }
9330 __ sub(tmp2, ch2, tmp1);
9331 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9332 __ bics(tmp2, tmp2, ch2);
9333 if (str1_isL != str2_isL) {
9334 __ fmovd(ch1, v1);
9335 }
9336 __ br(__ NE, L_HAS_ZERO);
9337 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9338 __ add(result, result, wordSize/str2_chr_size);
9339 __ add(str2, str2, wordSize);
9340 __ br(__ LT, L_POST_LOOP);
9341 __ BIND(L_LOOP);
9342 __ ldr(ch2, Address(str2));
9343 __ eor(ch2, first, ch2);
9344 __ sub(tmp2, ch2, tmp1);
9345 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9346 __ bics(tmp2, tmp2, ch2);
9347 __ br(__ NE, L_HAS_ZERO);
9348 __ BIND(L_LOOP_PROCEED);
9349 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9350 __ add(str2, str2, wordSize);
9351 __ add(result, result, wordSize/str2_chr_size);
9352 __ br(__ GE, L_LOOP);
9353 __ BIND(L_POST_LOOP);
9354 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
9355 __ br(__ LE, NOMATCH);
9356 __ ldr(ch2, Address(str2));
9357 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9358 __ eor(ch2, first, ch2);
9359 __ sub(tmp2, ch2, tmp1);
9360 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9361 __ mov(tmp4, -1); // all bits set
9362 __ b(L_SMALL_PROCEED);
9363 __ align(OptoLoopAlignment);
9364 __ BIND(L_SMALL);
9365 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9366 __ eor(ch2, first, ch2);
9367 if (str1_isL != str2_isL) {
9368 __ zip1(v1, __ T16B, v1, v0);
9369 }
9370 __ sub(tmp2, ch2, tmp1);
9371 __ mov(tmp4, -1); // all bits set
9372 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9373 if (str1_isL != str2_isL) {
9374 __ fmovd(ch1, v1); // move converted 4 symbols
9375 }
9376 __ BIND(L_SMALL_PROCEED);
9377 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
9378 __ bic(tmp2, tmp2, ch2);
9379 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
9380 __ rbit(tmp2, tmp2);
9381 __ br(__ EQ, NOMATCH);
9382 __ BIND(L_SMALL_HAS_ZERO_LOOP);
9383     __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
9384 __ cmp(cnt1, u1(wordSize/str2_chr_size));
9385 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
9386 if (str2_isL) { // LL
9387 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9388 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9389 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9390 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9391 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9392 } else {
9393 __ mov(ch2, 0xE); // all bits in byte set except last one
9394 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9395 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9396 __ lslv(tmp2, tmp2, tmp4);
9397 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9398 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9399 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9400 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9401 }
9402 __ cmp(ch1, ch2);
9403 __ mov(tmp4, wordSize/str2_chr_size);
9404 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9405 __ BIND(L_SMALL_CMP_LOOP);
9406 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9407 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9408 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9409 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9410 __ add(tmp4, tmp4, 1);
9411 __ cmp(tmp4, cnt1);
9412 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
9413 __ cmp(first, ch2);
9414 __ br(__ EQ, L_SMALL_CMP_LOOP);
9415 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
9416 __ cbz(tmp2, NOMATCH); // no more matches. exit
9417 __ clz(tmp4, tmp2);
9418 __ add(result, result, 1); // advance index
9419 __ add(str2, str2, str2_chr_size); // advance pointer
9420 __ b(L_SMALL_HAS_ZERO_LOOP);
9421 __ align(OptoLoopAlignment);
9422 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
9423 __ cmp(first, ch2);
9424 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9425 __ b(DONE);
9426 __ align(OptoLoopAlignment);
9427 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
9428 if (str2_isL) { // LL
9429 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9430 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9431 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9432 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9433 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9434 } else {
9435 __ mov(ch2, 0xE); // all bits in byte set except last one
9436 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9437 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9438 __ lslv(tmp2, tmp2, tmp4);
9439 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9440 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9441 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9442 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9443 }
9444 __ cmp(ch1, ch2);
9445 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9446 __ b(DONE);
9447 __ align(OptoLoopAlignment);
9448 __ BIND(L_HAS_ZERO);
9449 __ rbit(tmp2, tmp2);
9450 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
9451 // Now, perform compression of the counters (cnt2 and cnt1) into one register.
9452 // It's fine because both counters are 32-bit and are not changed in this
9453 // loop. Just restore them on exit. So, cnt1 can be re-used in this loop.
9454 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
9455 __ sub(result, result, 1);
9456 __ BIND(L_HAS_ZERO_LOOP);
9457 __ mov(cnt1, wordSize/str2_chr_size);
9458 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
9459 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
9460 if (str2_isL) {
9461 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9462 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9463 __ lslv(tmp2, tmp2, tmp4);
9464 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9465 __ add(tmp4, tmp4, 1);
9466 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9467 __ lsl(tmp2, tmp2, 1);
9468 __ mov(tmp4, wordSize/str2_chr_size);
9469 } else {
9470 __ mov(ch2, 0xE);
9471 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9472 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9473 __ lslv(tmp2, tmp2, tmp4);
9474 __ add(tmp4, tmp4, 1);
9475 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9476 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9477 __ lsl(tmp2, tmp2, 1);
9478 __ mov(tmp4, wordSize/str2_chr_size);
9479 __ sub(str2, str2, str2_chr_size);
9480 }
9481 __ cmp(ch1, ch2);
9482 __ mov(tmp4, wordSize/str2_chr_size);
9483 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9484 __ BIND(L_CMP_LOOP);
9485 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9486 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9487 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9488 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9489 __ add(tmp4, tmp4, 1);
9490 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
9491 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
9492 __ cmp(cnt1, ch2);
9493 __ br(__ EQ, L_CMP_LOOP);
9494 __ BIND(L_CMP_LOOP_NOMATCH);
9495 // here we have a mismatch
9496 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
9497 __ clz(tmp4, tmp2);
9498 __ add(str2, str2, str2_chr_size); // advance pointer
9499 __ b(L_HAS_ZERO_LOOP);
9500 __ align(OptoLoopAlignment);
9501 __ BIND(L_CMP_LOOP_LAST_CMP);
9502 __ cmp(cnt1, ch2);
9503 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9504 __ b(DONE);
9505 __ align(OptoLoopAlignment);
9506 __ BIND(L_CMP_LOOP_LAST_CMP2);
9507 if (str2_isL) {
9508 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9509 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9510 __ lslv(tmp2, tmp2, tmp4);
9511 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9512 __ add(tmp4, tmp4, 1);
9513 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9514 __ lsl(tmp2, tmp2, 1);
9515 } else {
9516 __ mov(ch2, 0xE);
9517 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9518 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9519 __ lslv(tmp2, tmp2, tmp4);
9520 __ add(tmp4, tmp4, 1);
9521 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9522 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9523 __ lsl(tmp2, tmp2, 1);
9524 __ sub(str2, str2, str2_chr_size);
9525 }
9526 __ cmp(ch1, ch2);
9527 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9528 __ b(DONE);
9529 __ align(OptoLoopAlignment);
9530 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
9531 // 1) Restore the "result" index. The index was a multiple of wordSize/str2_chr_size
9532 // until the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
9533 // so result was increased by at most wordSize/str2_chr_size - 1 and the
9534 // respective high bits weren't changed. L_LOOP_PROCEED will increase
9535 // result by the number of analyzed characters, so we can just reset the lower
9536 // bits of result here. Clear the 2 lower bits for UU/UL and 3 bits for LL.
9537 // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
9538 // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
9539 // index of the last analyzed substring inside the current octet, so str2 is at
9540 // the respective start address. We need to advance it to the next octet.
9541 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
9542 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
9543 __ bfm(result, zr, 0, 2 - str2_chr_shift);
9544 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
9545 __ movw(cnt2, cnt2);
9546 __ b(L_LOOP_PROCEED);
9547 __ align(OptoLoopAlignment);
9548 __ BIND(NOMATCH);
9549 __ mov(result, -1);
9550 __ BIND(DONE);
9551 __ pop(spilled_regs, sp);
9552 __ ret(lr);
9553 return entry;
9554 }
9555
9556 void generate_string_indexof_stubs() {
9557 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
9558 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
9559 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
9560 }
9561
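  // Zero-extends (inflates) the 32 Latin-1 bytes held in src1:src2 into 32 UTF-16
  // chars (64 bytes) and stores them at r1, post-incrementing r1 by 64. Assumes v0
  // holds zero; optionally issues a software prefetch for the destination stream.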
9562 void inflate_and_store_2_fp_registers(bool generatePrfm,
9563 FloatRegister src1, FloatRegister src2) {
9564 Register dst = r1;
9565 __ zip1(v1, __ T16B, src1, v0);
9566 __ zip2(v2, __ T16B, src1, v0);
9567 if (generatePrfm) {
9568 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
9569 }
9570 __ zip1(v3, __ T16B, src2, v0);
9571 __ zip2(v4, __ T16B, src2, v0);
9572 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
9573 }
9574
9575 // R0 = src
9576 // R1 = dst
9577 // R2 = len
9578 // R3 = len >> 3
9579 // V0 = 0
9580 // v1 = loaded 8 bytes
9581 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
9582 address generate_large_byte_array_inflate() {
9583 __ align(CodeEntryAlignment);
9584 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
9585 StubCodeMark mark(this, stub_id);
9586 address entry = __ pc();
9587 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
9588 Register src = r0, dst = r1, len = r2, octetCounter = r3;
9589 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
9590
9591 // do one more 8-byte read to have address 16-byte aligned in most cases
9592 // also use single store instruction
9593 __ ldrd(v2, __ post(src, 8));
9594 __ sub(octetCounter, octetCounter, 2);
9595 __ zip1(v1, __ T16B, v1, v0);
9596 __ zip1(v2, __ T16B, v2, v0);
9597 __ st1(v1, v2, __ T16B, __ post(dst, 32));
9598 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9599 __ subs(rscratch1, octetCounter, large_loop_threshold);
9600 __ br(__ LE, LOOP_START);
9601 __ b(LOOP_PRFM_START);
9602 __ bind(LOOP_PRFM);
9603 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9604 __ bind(LOOP_PRFM_START);
9605 __ prfm(Address(src, SoftwarePrefetchHintDistance));
9606 __ sub(octetCounter, octetCounter, 8);
9607 __ subs(rscratch1, octetCounter, large_loop_threshold);
9608 inflate_and_store_2_fp_registers(true, v3, v4);
9609 inflate_and_store_2_fp_registers(true, v5, v6);
9610 __ br(__ GT, LOOP_PRFM);
9611 __ cmp(octetCounter, (u1)8);
9612 __ br(__ LT, DONE);
9613 __ bind(LOOP);
9614 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9615 __ bind(LOOP_START);
9616 __ sub(octetCounter, octetCounter, 8);
9617 __ cmp(octetCounter, (u1)8);
9618 inflate_and_store_2_fp_registers(false, v3, v4);
9619 inflate_and_store_2_fp_registers(false, v5, v6);
9620 __ br(__ GE, LOOP);
9621 __ bind(DONE);
9622 __ ret(lr);
9623 return entry;
9624 }
9625
9626 /**
9627 * Arguments:
9628 *
9629 * Input:
9630 * c_rarg0 - current state address
9631 * c_rarg1 - H key address
9632 * c_rarg2 - data address
9633 * c_rarg3 - number of blocks
9634 *
9635 * Output:
9636 * Updated state at c_rarg0
9637 */
9638 address generate_ghash_processBlocks() {
9639 // Bafflingly, GCM uses little-endian for the byte order, but
9640 // big-endian for the bit order. For example, the polynomial 1 is
9641 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
9642 //
9643 // So, we must either reverse the bytes in each word and do
9644 // everything big-endian or reverse the bits in each byte and do
9645 // it little-endian. On AArch64 it's more idiomatic to reverse
9646 // the bits in each byte (we have an instruction, RBIT, to do
9647 // that) and keep the data in little-endian bit order through the
9648 // calculation, bit-reversing the inputs and outputs.
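    //
    // In effect, for each 16-byte block D the loop below computes
    //   state <- (state ^ D) * H   in GF(2^128),
    // reduced modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1, whose
    // low-order bits (0x87) are emitted at the local `polynomial` label.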
9649
9650 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
9651 StubCodeMark mark(this, stub_id);
9652 Label polynomial; // local data generated at end of stub
9653 __ align(CodeEntryAlignment);
9654 address start = __ pc();
9655
9656 Register state = c_rarg0;
9657 Register subkeyH = c_rarg1;
9658 Register data = c_rarg2;
9659 Register blocks = c_rarg3;
9660
9661 FloatRegister vzr = v30;
9662 __ eor(vzr, __ T16B, vzr, vzr); // zero register
9663
9664 __ adr(rscratch1, polynomial);
9665 __ ldrq(v24, rscratch1); // The field polynomial
9666
9667 __ ldrq(v0, Address(state));
9668 __ ldrq(v1, Address(subkeyH));
9669
9670 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
9671 __ rbit(v0, __ T16B, v0);
9672 __ rev64(v1, __ T16B, v1);
9673 __ rbit(v1, __ T16B, v1);
9674
9675 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
9676 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
9677
9678 {
9679 Label L_ghash_loop;
9680 __ bind(L_ghash_loop);
9681
9682 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
9683 // reversing each byte
9684 __ rbit(v2, __ T16B, v2);
9685 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
9686
9687 // Multiply state in v2 by subkey in v1
9688 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
9689 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
9690 /*temps*/v6, v3, /*reuse/clobber b*/v2);
9691 // Reduce v7:v5 by the field polynomial
9692 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
9693
9694 __ sub(blocks, blocks, 1);
9695 __ cbnz(blocks, L_ghash_loop);
9696 }
9697
9698 // The bit-reversed result is at this point in v0
9699 __ rev64(v0, __ T16B, v0);
9700 __ rbit(v0, __ T16B, v0);
9701
9702 __ st1(v0, __ T16B, state);
9703 __ ret(lr);
9704
9705 // bind label and generate local polynomial data
9706 __ align(wordSize * 2);
9707 __ bind(polynomial);
9708 __ emit_int64(0x87); // The low-order bits of the field
9709 // polynomial (i.e. p = z^7+z^2+z+1)
9710 // repeated in the low and high parts of a
9711 // 128-bit vector
9712 __ emit_int64(0x87);
9713
9714 return start;
9715 }
9716
9717 address generate_ghash_processBlocks_wide() {
9718 address small = generate_ghash_processBlocks();
9719
9720 StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
9721 StubCodeMark mark(this, stub_id);
9722 Label polynomial; // local data generated after stub
9723 __ align(CodeEntryAlignment);
9724 address start = __ pc();
9725
9726 Register state = c_rarg0;
9727 Register subkeyH = c_rarg1;
9728 Register data = c_rarg2;
9729 Register blocks = c_rarg3;
9730
9731 const int unroll = 4;
9732
9733 __ cmp(blocks, (unsigned char)(unroll * 2));
9734 __ br(__ LT, small);
9735
9736 if (unroll > 1) {
9737 // Save state before entering routine
9738 __ sub(sp, sp, 4 * 16);
9739 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
9740 __ sub(sp, sp, 4 * 16);
9741 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
9742 }
9743
9744 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
9745
9746 if (unroll > 1) {
9747 // And restore state
9748 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
9749 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
9750 }
9751
9752 __ cmp(blocks, (unsigned char)0);
9753 __ br(__ GT, small);
9754
9755 __ ret(lr);
9756
9757 // bind label and generate polynomial data
9758 __ align(wordSize * 2);
9759 __ bind(polynomial);
9760 __ emit_int64(0x87); // The low-order bits of the field
9761 // polynomial (i.e. p = z^7+z^2+z+1)
9762 // repeated in the low and high parts of a
9763 // 128-bit vector
9764 __ emit_int64(0x87);
9765
9766 return start;
9767
9768 }
9769
9770 void generate_base64_encode_simdround(Register src, Register dst,
9771 FloatRegister codec, u8 size) {
9772
9773 FloatRegister in0 = v4, in1 = v5, in2 = v6;
9774 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
9775 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
9776
9777 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9778
9779 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
9780
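    // The shift/or sequence below splits each group of three input bytes
    // (in0, in1, in2) into four 6-bit indices, roughly:
    //   ind0 = in0 >> 2;
    //   ind1 = ((in0 & 0x3) << 4) | (in1 >> 4);
    //   ind2 = ((in1 & 0xf) << 2) | (in2 >> 6);
    //   ind3 = in2 & 0x3f;
    // which are then mapped through the 64-byte codec table with tbl.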
9781 __ ushr(ind0, arrangement, in0, 2);
9782
9783 __ ushr(ind1, arrangement, in1, 2);
9784 __ shl(in0, arrangement, in0, 6);
9785 __ orr(ind1, arrangement, ind1, in0);
9786 __ ushr(ind1, arrangement, ind1, 2);
9787
9788 __ ushr(ind2, arrangement, in2, 4);
9789 __ shl(in1, arrangement, in1, 4);
9790 __ orr(ind2, arrangement, in1, ind2);
9791 __ ushr(ind2, arrangement, ind2, 2);
9792
9793 __ shl(ind3, arrangement, in2, 2);
9794 __ ushr(ind3, arrangement, ind3, 2);
9795
9796 __ tbl(out0, arrangement, codec, 4, ind0);
9797 __ tbl(out1, arrangement, codec, 4, ind1);
9798 __ tbl(out2, arrangement, codec, 4, ind2);
9799 __ tbl(out3, arrangement, codec, 4, ind3);
9800
9801 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
9802 }
9803
9804 /**
9805 * Arguments:
9806 *
9807 * Input:
9808 * c_rarg0 - src_start
9809 * c_rarg1 - src_offset
9810 * c_rarg2 - src_length
9811 * c_rarg3 - dest_start
9812 * c_rarg4 - dest_offset
9813 * c_rarg5 - isURL
9814 *
9815 */
9816 address generate_base64_encodeBlock() {
9817
9818 static const char toBase64[64] = {
9819 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9820 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9821 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9822 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9823 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
9824 };
9825
9826 static const char toBase64URL[64] = {
9827 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9828 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9829 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9830 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9831 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
9832 };
9833
9834 __ align(CodeEntryAlignment);
9835 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
9836 StubCodeMark mark(this, stub_id);
9837 address start = __ pc();
9838
9839 Register src = c_rarg0; // source array
9840 Register soff = c_rarg1; // source start offset
9841 Register send = c_rarg2; // source end offset
9842 Register dst = c_rarg3; // dest array
9843 Register doff = c_rarg4; // position for writing to dest array
9844 Register isURL = c_rarg5; // Base64 or URL character set
9845
9846 // c_rarg6 and c_rarg7 are free to use as temps
9847 Register codec = c_rarg6;
9848 Register length = c_rarg7;
9849
9850 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
9851
9852 __ add(src, src, soff);
9853 __ add(dst, dst, doff);
9854 __ sub(length, send, soff);
9855
9856 // load the codec base address
9857 __ lea(codec, ExternalAddress((address) toBase64));
9858 __ cbz(isURL, ProcessData);
9859 __ lea(codec, ExternalAddress((address) toBase64URL));
9860
9861 __ BIND(ProcessData);
9862
9863 // too short to form a SIMD loop; fall back to the scalar 3-byte loop
9864 __ cmp(length, (u1)24);
9865 __ br(Assembler::LT, Process3B);
9866
9867 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
9868
9869 __ BIND(Process48B);
9870 __ cmp(length, (u1)48);
9871 __ br(Assembler::LT, Process24B);
9872 generate_base64_encode_simdround(src, dst, v0, 16);
9873 __ sub(length, length, 48);
9874 __ b(Process48B);
9875
9876 __ BIND(Process24B);
9877 __ cmp(length, (u1)24);
9878 __ br(Assembler::LT, SIMDExit);
9879 generate_base64_encode_simdround(src, dst, v0, 8);
9880 __ sub(length, length, 24);
9881
9882 __ BIND(SIMDExit);
9883 __ cbz(length, Exit);
9884
9885 __ BIND(Process3B);
9886 // 3 src bytes, 24 bits
9887 __ ldrb(r10, __ post(src, 1));
9888 __ ldrb(r11, __ post(src, 1));
9889 __ ldrb(r12, __ post(src, 1));
9890 __ orrw(r11, r11, r10, Assembler::LSL, 8);
9891 __ orrw(r12, r12, r11, Assembler::LSL, 8);
9892 // codec index
9893 __ ubfmw(r15, r12, 18, 23);
9894 __ ubfmw(r14, r12, 12, 17);
9895 __ ubfmw(r13, r12, 6, 11);
9896 __ andw(r12, r12, 63);
9897 // get the code based on the codec
9898 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
9899 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
9900 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
9901 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
9902 __ strb(r15, __ post(dst, 1));
9903 __ strb(r14, __ post(dst, 1));
9904 __ strb(r13, __ post(dst, 1));
9905 __ strb(r12, __ post(dst, 1));
9906 __ sub(length, length, 3);
9907 __ cbnz(length, Process3B);
9908
9909 __ BIND(Exit);
9910 __ ret(lr);
9911
9912 return start;
9913 }
9914
9915 void generate_base64_decode_simdround(Register src, Register dst,
9916 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
9917
9918 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
9919 FloatRegister out0 = v20, out1 = v21, out2 = v22;
9920
9921 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
9922 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
9923
9924 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
9925
9926 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9927
9928 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
9929
9930 // we need an unsigned saturating subtract to make sure all input values
9931 // in range [0, 63] produce a zero index for the higher-half lookup
9932 __ uqsubv(decH0, __ T16B, in0, v27);
9933 __ uqsubv(decH1, __ T16B, in1, v27);
9934 __ uqsubv(decH2, __ T16B, in2, v27);
9935 __ uqsubv(decH3, __ T16B, in3, v27);
9936
9937 // lower half lookup
9938 __ tbl(decL0, arrangement, codecL, 4, in0);
9939 __ tbl(decL1, arrangement, codecL, 4, in1);
9940 __ tbl(decL2, arrangement, codecL, 4, in2);
9941 __ tbl(decL3, arrangement, codecL, 4, in3);
9942
9943 // higher half lookup
9944 __ tbx(decH0, arrangement, codecH, 4, decH0);
9945 __ tbx(decH1, arrangement, codecH, 4, decH1);
9946 __ tbx(decH2, arrangement, codecH, 4, decH2);
9947 __ tbx(decH3, arrangement, codecH, 4, decH3);
9948
9949 // combine lower and higher
9950 __ orr(decL0, arrangement, decL0, decH0);
9951 __ orr(decL1, arrangement, decL1, decH1);
9952 __ orr(decL2, arrangement, decL2, decH2);
9953 __ orr(decL3, arrangement, decL3, decH3);
9954
9955 // check illegal inputs, value larger than 63 (maximum of 6 bits)
9956 __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
9957 __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
9958 __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
9959 __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
9960 __ orr(in0, arrangement, decH0, decH1);
9961 __ orr(in1, arrangement, decH2, decH3);
9962 __ orr(in2, arrangement, in0, in1);
9963 __ umaxv(in3, arrangement, in2);
9964 __ umov(rscratch2, in3, __ B, 0);
9965
9966 // get the data to output
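    // (per byte lane this computes: out0 = (decL0 << 2) | (decL1 >> 4);
    //  out1 = (decL1 << 4) | (decL2 >> 2); out2 = (decL2 << 6) | decL3)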
9967 __ shl(out0, arrangement, decL0, 2);
9968 __ ushr(out1, arrangement, decL1, 4);
9969 __ orr(out0, arrangement, out0, out1);
9970 __ shl(out1, arrangement, decL1, 4);
9971 __ ushr(out2, arrangement, decL2, 2);
9972 __ orr(out1, arrangement, out1, out2);
9973 __ shl(out2, arrangement, decL2, 6);
9974 __ orr(out2, arrangement, out2, decL3);
9975
9976 __ cbz(rscratch2, NoIllegalData);
9977
9978 // handle illegal input
9979 __ umov(r10, in2, __ D, 0);
9980 if (size == 16) {
9981 __ cbnz(r10, ErrorInLowerHalf);
9982
9983 // illegal input is in higher half, store the lower half now.
9984 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
9985
9986 __ umov(r10, in2, __ D, 1);
9987 __ umov(r11, out0, __ D, 1);
9988 __ umov(r12, out1, __ D, 1);
9989 __ umov(r13, out2, __ D, 1);
9990 __ b(StoreLegalData);
9991
9992 __ BIND(ErrorInLowerHalf);
9993 }
9994 __ umov(r11, out0, __ D, 0);
9995 __ umov(r12, out1, __ D, 0);
9996 __ umov(r13, out2, __ D, 0);
9997
9998 __ BIND(StoreLegalData);
9999 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10000 __ strb(r11, __ post(dst, 1));
10001 __ strb(r12, __ post(dst, 1));
10002 __ strb(r13, __ post(dst, 1));
10003 __ lsr(r10, r10, 8);
10004 __ lsr(r11, r11, 8);
10005 __ lsr(r12, r12, 8);
10006 __ lsr(r13, r13, 8);
10007 __ b(StoreLegalData);
10008
10009 __ BIND(NoIllegalData);
10010 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10011 }
10012
10013
10014 /**
10015 * Arguments:
10016 *
10017 * Input:
10018 * c_rarg0 - src_start
10019 * c_rarg1 - src_offset
10020 * c_rarg2 - src_length
10021 * c_rarg3 - dest_start
10022 * c_rarg4 - dest_offset
10023 * c_rarg5 - isURL
10024 * c_rarg6 - isMIME
10025 *
10026 */
10027 address generate_base64_decodeBlock() {
10028
10029 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10030 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10031 // titled "Base64 decoding".
10032
10033 // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
10034 // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That
10035 // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
10036 static const uint8_t fromBase64ForNoSIMD[256] = {
10037 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10038 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10039 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10040 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10041 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10042 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
10043 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10044 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10045 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10046 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10047 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10048 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10049 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10050 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10051 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10052 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10053 };
10054
10055 static const uint8_t fromBase64URLForNoSIMD[256] = {
10056 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10057 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10058 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10059 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10060 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10061 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
10062 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10063 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10064 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10065 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10066 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10067 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10068 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10069 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10070 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10071 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10072 };
10073
10074 // A legal value of a base64 code is in range [0, 127]. We need two lookups
10075 // with tbl/tbx and combine them to get the decoded data. The 1st table vector
10076 // lookup uses tbl; out-of-range indices are set to 0 in the destination. The 2nd
10077 // table vector lookup uses tbx; out-of-range indices are left unchanged in the
10078 // destination. Input [64..126] is mapped to index [65, 127] in the second lookup.
10079 // The value at index 64 is set to 0, so that we know that we already got the
10080 // decoded data from the 1st lookup.
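    // A rough scalar sketch of the two-table decode (illustrative only): for each
    // input byte c,
    //   lo  = (c <= 63) ? tableLo[c] : 0;          // tbl lookup
    //   idx = usat_sub(c, 63);                     // 0 for c <= 63
    //   hi  = (idx <= 63) ? tableHi[idx] : idx;    // tbx: out-of-range leaves idx
    //   decoded = lo | hi;                         // any value > 63 flags illegal input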
10081 static const uint8_t fromBase64ForSIMD[128] = {
10082 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10083 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10084 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10085 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10086 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10087 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10088 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10089 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10090 };
10091
10092 static const uint8_t fromBase64URLForSIMD[128] = {
10093 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10094 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10095 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10096 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10097 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10098 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10099 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10100 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10101 };
10102
10103 __ align(CodeEntryAlignment);
10104 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10105 StubCodeMark mark(this, stub_id);
10106 address start = __ pc();
10107
10108 Register src = c_rarg0; // source array
10109 Register soff = c_rarg1; // source start offset
10110 Register send = c_rarg2; // source end offset
10111 Register dst = c_rarg3; // dest array
10112 Register doff = c_rarg4; // position for writing to dest array
10113 Register isURL = c_rarg5; // Base64 or URL character set
10114 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation
10115
10116 Register length = send; // reuse send as length of source data to process
10117
10118 Register simd_codec = c_rarg6;
10119 Register nosimd_codec = c_rarg7;
10120
10121 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10122
10123 __ enter();
10124
10125 __ add(src, src, soff);
10126 __ add(dst, dst, doff);
10127
10128 __ mov(doff, dst);
10129
10130 __ sub(length, send, soff);
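    // clear the two low bits of length (bfm with zr): the decoder consumes the
    // source four encoded characters at a time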
10131 __ bfm(length, zr, 0, 1);
10132
10133 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10134 __ cbz(isURL, ProcessData);
10135 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10136
10137 __ BIND(ProcessData);
10138 __ mov(rscratch1, length);
10139 __ cmp(length, (u1)144); // 144 = 80 + 64
10140 __ br(Assembler::LT, Process4B);
10141
10142 // In the MIME case, the line length cannot be more than 76
10143 // bytes (see RFC 2045). This is too short a block for SIMD
10144 // to be worthwhile, so we use non-SIMD here.
10145 __ movw(rscratch1, 79);
10146
10147 __ BIND(Process4B);
10148 __ ldrw(r14, __ post(src, 4));
10149 __ ubfxw(r10, r14, 0, 8);
10150 __ ubfxw(r11, r14, 8, 8);
10151 __ ubfxw(r12, r14, 16, 8);
10152 __ ubfxw(r13, r14, 24, 8);
10153 // get the decoded values from the codec table
10154 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10155 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10156 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10157 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10158 // error detection, 255u indicates an illegal input
10159 __ orrw(r14, r10, r11);
10160 __ orrw(r15, r12, r13);
10161 __ orrw(r14, r14, r15);
10162 __ tbnz(r14, 7, Exit);
10163 // recover the data
10164 __ lslw(r14, r10, 10);
10165 __ bfiw(r14, r11, 4, 6);
10166 __ bfmw(r14, r12, 2, 5);
10167 __ rev16w(r14, r14);
10168 __ bfiw(r13, r12, 6, 2);
10169 __ strh(r14, __ post(dst, 2));
10170 __ strb(r13, __ post(dst, 1));
10171 // non-simd loop
10172 __ subsw(rscratch1, rscratch1, 4);
10173 __ br(Assembler::GT, Process4B);
10174
10175 // if exiting from PreProcess80B, rscratch1 == -1;
10176 // otherwise, rscratch1 == 0.
10177 __ cbzw(rscratch1, Exit);
10178 __ sub(length, length, 80);
10179
10180 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10181 __ cbz(isURL, SIMDEnter);
10182 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10183
10184 __ BIND(SIMDEnter);
10185 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10186 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10187 __ mov(rscratch1, 63);
10188 __ dup(v27, __ T16B, rscratch1);
10189
10190 __ BIND(Process64B);
10191 __ cmp(length, (u1)64);
10192 __ br(Assembler::LT, Process32B);
10193 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10194 __ sub(length, length, 64);
10195 __ b(Process64B);
10196
10197 __ BIND(Process32B);
10198 __ cmp(length, (u1)32);
10199 __ br(Assembler::LT, SIMDExit);
10200 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10201 __ sub(length, length, 32);
10202 __ b(Process32B);
10203
10204 __ BIND(SIMDExit);
10205 __ cbz(length, Exit);
10206 __ movw(rscratch1, length);
10207 __ b(Process4B);
10208
10209 __ BIND(Exit);
10210 __ sub(c_rarg0, dst, doff);
10211
10212 __ leave();
10213 __ ret(lr);
10214
10215 return start;
10216 }
10217
10218 // Support for spin waits.
10219 address generate_spin_wait() {
10220 __ align(CodeEntryAlignment);
10221 StubId stub_id = StubId::stubgen_spin_wait_id;
10222 StubCodeMark mark(this, stub_id);
10223 address start = __ pc();
10224
10225 __ spin_wait();
10226 __ ret(lr);
10227
10228 return start;
10229 }
10230
10231 void generate_lookup_secondary_supers_table_stub() {
10232 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10233 StubCodeMark mark(this, stub_id);
10234
10235 const Register
10236 r_super_klass = r0,
10237 r_array_base = r1,
10238 r_array_length = r2,
10239 r_array_index = r3,
10240 r_sub_klass = r4,
10241 r_bitmap = rscratch2,
10242 result = r5;
10243 const FloatRegister
10244 vtemp = v0;
10245
10246 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10247 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10248 Label L_success;
10249 __ enter();
10250 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10251 r_array_base, r_array_length, r_array_index,
10252 vtemp, result, slot,
10253 /*stub_is_near*/true);
10254 __ leave();
10255 __ ret(lr);
10256 }
10257 }
10258
10259 // Slow path implementation for UseSecondarySupersTable.
10260 address generate_lookup_secondary_supers_table_slow_path_stub() {
10261 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10262 StubCodeMark mark(this, stub_id);
10263
10264 address start = __ pc();
10265 const Register
10266 r_super_klass = r0, // argument
10267 r_array_base = r1, // argument
10268 temp1 = r2, // temp
10269 r_array_index = r3, // argument
10270 r_bitmap = rscratch2, // argument
10271 result = r5; // argument
10272
10273 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10274 __ ret(lr);
10275
10276 return start;
10277 }
10278
10279 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10280
10281 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10282 //
10283 // If LSE is in use, generate LSE versions of all the stubs. The
10284 // non-LSE versions are in atomic_aarch64.S.
10285
10286 // class AtomicStubMark records the entry point of a stub and the
10287 // stub pointer which will point to it. The stub pointer is set to
10288 // the entry point when ~AtomicStubMark() is called, which must be
10289 // after ICache::invalidate_range. This ensures safe publication of
10290 // the generated code.
10291 class AtomicStubMark {
10292 address _entry_point;
10293 aarch64_atomic_stub_t *_stub;
10294 MacroAssembler *_masm;
10295 public:
10296 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10297 _masm = masm;
10298 __ align(32);
10299 _entry_point = __ pc();
10300 _stub = stub;
10301 }
10302 ~AtomicStubMark() {
10303 *_stub = (aarch64_atomic_stub_t)_entry_point;
10304 }
10305 };
10306
10307 // NB: For memory_order_conservative we need a trailing membar after
10308 // LSE atomic operations but not a leading membar.
10309 //
10310 // We don't need a leading membar because a clause in the Arm ARM
10311 // says:
10312 //
10313 // Barrier-ordered-before
10314 //
10315 // Barrier instructions order prior Memory effects before subsequent
10316 // Memory effects generated by the same Observer. A read or a write
10317 // RW1 is Barrier-ordered-before a read or a write RW2 from the same
10318 // Observer if and only if RW1 appears in program order before RW2
10319 // and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10320 // instruction with both Acquire and Release semantics.
10321 //
10322 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10323 // and Release semantics, therefore we don't need a leading
10324 // barrier. However, there is no corresponding Barrier-ordered-after
10325 // relationship, therefore we need a trailing membar to prevent a
10326 // later store or load from being reordered with the store in an
10327 // atomic instruction.
10328 //
10329 // This was checked by using the herd7 consistency model simulator
10330 // (http://diy.inria.fr/) with this test case:
10331 //
10332 // AArch64 LseCas
10333 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10334 // P0 | P1;
10335 // LDR W4, [X2] | MOV W3, #0;
10336 // DMB LD | MOV W4, #1;
10337 // LDR W3, [X1] | CASAL W3, W4, [X1];
10338 // | DMB ISH;
10339 // | STR W4, [X2];
10340 // exists
10341 // (0:X3=0 /\ 0:X4=1)
10342 //
10343 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10344 // with the store to x in P1. Without the DMB in P1 this may happen.
10345 //
10346 // At the time of writing we don't know of any AArch64 hardware that
10347 // reorders stores in this way, but the Reference Manual permits it.
10348
10349 void gen_cas_entry(Assembler::operand_size size,
10350 atomic_memory_order order) {
10351 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10352 exchange_val = c_rarg2;
10353 bool acquire, release;
10354 switch (order) {
10355 case memory_order_relaxed:
10356 acquire = false;
10357 release = false;
10358 break;
10359 case memory_order_release:
10360 acquire = false;
10361 release = true;
10362 break;
10363 default:
10364 acquire = true;
10365 release = true;
10366 break;
10367 }
10368 __ mov(prev, compare_val);
10369 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10370 if (order == memory_order_conservative) {
10371 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10372 }
10373 if (size == Assembler::xword) {
10374 __ mov(r0, prev);
10375 } else {
10376 __ movw(r0, prev);
10377 }
10378 __ ret(lr);
10379 }
10380
10381 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10382 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10383 // If not relaxed, then default to conservative. Relaxed is the only
10384 // case we use enough to be worth specializing.
10385 if (order == memory_order_relaxed) {
10386 __ ldadd(size, incr, prev, addr);
10387 } else {
10388 __ ldaddal(size, incr, prev, addr);
10389 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10390 }
10391 if (size == Assembler::xword) {
10392 __ mov(r0, prev);
10393 } else {
10394 __ movw(r0, prev);
10395 }
10396 __ ret(lr);
10397 }
10398
10399 void gen_swpal_entry(Assembler::operand_size size) {
10400 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10401 __ swpal(size, incr, prev, addr);
10402 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10403 if (size == Assembler::xword) {
10404 __ mov(r0, prev);
10405 } else {
10406 __ movw(r0, prev);
10407 }
10408 __ ret(lr);
10409 }
10410
10411 void generate_atomic_entry_points() {
10412 if (! UseLSE) {
10413 return;
10414 }
10415 __ align(CodeEntryAlignment);
10416 StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10417 StubCodeMark mark(this, stub_id);
10418 address first_entry = __ pc();
10419
10420 // ADD, memory_order_conservative
10421 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10422 gen_ldadd_entry(Assembler::word, memory_order_conservative);
10423 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10424 gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10425
10426 // ADD, memory_order_relaxed
10427 AtomicStubMark mark_fetch_add_4_relaxed
10428 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10429 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10430 AtomicStubMark mark_fetch_add_8_relaxed
10431 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10432 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10433
10434 // XCHG, memory_order_conservative
10435 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10436 gen_swpal_entry(Assembler::word);
10437 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10438 gen_swpal_entry(Assembler::xword);
10439
10440 // CAS, memory_order_conservative
10441 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10442 gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10443 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10444 gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10445 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10446 gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10447
10448 // CAS, memory_order_relaxed
10449 AtomicStubMark mark_cmpxchg_1_relaxed
10450 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10451 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10452 AtomicStubMark mark_cmpxchg_4_relaxed
10453 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10454 gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10455 AtomicStubMark mark_cmpxchg_8_relaxed
10456 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10457 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10458
10459 AtomicStubMark mark_cmpxchg_4_release
10460 (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10461 gen_cas_entry(MacroAssembler::word, memory_order_release);
10462 AtomicStubMark mark_cmpxchg_8_release
10463 (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10464 gen_cas_entry(MacroAssembler::xword, memory_order_release);
10465
10466 AtomicStubMark mark_cmpxchg_4_seq_cst
10467 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10468 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10469 AtomicStubMark mark_cmpxchg_8_seq_cst
10470 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10471 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10472
10473 ICache::invalidate_range(first_entry, __ pc() - first_entry);
10474 }
10475 #endif // LINUX
10476
10477 address generate_cont_thaw(Continuation::thaw_kind kind) {
10478 bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10479 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10480
10481 address start = __ pc();
10482
10483 if (return_barrier) {
10484 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10485 __ mov(sp, rscratch1);
10486 }
10487 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10488
10489 if (return_barrier) {
10490 // preserve possible return value from a method returning to the return barrier
10491 __ fmovd(rscratch1, v0);
10492 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10493 }
10494
10495 __ movw(c_rarg1, (return_barrier ? 1 : 0));
10496 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10497 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10498
10499 if (return_barrier) {
10500 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10501 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10502 __ fmovd(v0, rscratch1);
10503 }
10504 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10505
10506
10507 Label thaw_success;
10508 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10509 __ cbnz(rscratch2, thaw_success);
10510 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10511 __ br(rscratch1);
10512 __ bind(thaw_success);
10513
10514 // make room for the thawed frames
10515 __ sub(rscratch1, sp, rscratch2);
10516 __ andr(rscratch1, rscratch1, -16); // align
10517 __ mov(sp, rscratch1);
10518
10519 if (return_barrier) {
10520 // save original return value -- again
10521 __ fmovd(rscratch1, v0);
10522 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10523 }
10524
10525 // If we want, we can templatize thaw by kind, and have three different entries
10526 __ movw(c_rarg1, (uint32_t)kind);
10527
10528 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10529 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10530
10531 if (return_barrier) {
10532 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10533 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10534 __ fmovd(v0, rscratch1);
10535 } else {
10536 __ mov(r0, zr); // return 0 (success) from doYield
10537 }
10538
10539 // we're now on the yield frame (which is at an address above us because sp has been pushed down)
10540 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10541 __ mov(rfp, sp);
10542
10543 if (return_barrier_exception) {
10544 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10545 __ authenticate_return_address(c_rarg1);
10546 __ verify_oop(r0);
10547 // save return value containing the exception oop in callee-saved R19
10548 __ mov(r19, r0);
10549
10550 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10551
10552 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10553 // __ reinitialize_ptrue();
10554
10555 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10556
10557 __ mov(r1, r0); // the exception handler
10558 __ mov(r0, r19); // restore return value containing the exception oop
10559 __ verify_oop(r0);
10560
10561 __ leave();
10562 __ mov(r3, lr);
10563 __ br(r1); // the exception handler
10564 } else {
10565 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10566 __ leave();
10567 __ ret(lr);
10568 }
10569
10570 return start;
10571 }
10572
10573 address generate_cont_thaw() {
10574 if (!Continuations::enabled()) return nullptr;
10575
10576 StubId stub_id = StubId::stubgen_cont_thaw_id;
10577 StubCodeMark mark(this, stub_id);
10578 address start = __ pc();
10579 generate_cont_thaw(Continuation::thaw_top);
10580 return start;
10581 }
10582
10583 address generate_cont_returnBarrier() {
10584 if (!Continuations::enabled()) return nullptr;
10585
10586 // TODO: will probably need multiple return barriers depending on return type
10587 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10588 StubCodeMark mark(this, stub_id);
10589 address start = __ pc();
10590
10591 generate_cont_thaw(Continuation::thaw_return_barrier);
10592
10593 return start;
10594 }
10595
10596 address generate_cont_returnBarrier_exception() {
10597 if (!Continuations::enabled()) return nullptr;
10598
10599 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10600 StubCodeMark mark(this, stub_id);
10601 address start = __ pc();
10602
10603 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10604
10605 return start;
10606 }
10607
10608 address generate_cont_preempt_stub() {
10609 if (!Continuations::enabled()) return nullptr;
10610 StubId stub_id = StubId::stubgen_cont_preempt_id;
10611 StubCodeMark mark(this, stub_id);
10612 address start = __ pc();
10613
10614 __ reset_last_Java_frame(true);
10615
10616 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10617 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10618 __ mov(sp, rscratch2);
10619
10620 Label preemption_cancelled;
10621 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10622 __ cbnz(rscratch1, preemption_cancelled);
10623
10624 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10625 SharedRuntime::continuation_enter_cleanup(_masm);
10626 __ leave();
10627 __ ret(lr);
10628
10629 // We acquired the monitor after freezing the frames so call thaw to continue execution.
10630 __ bind(preemption_cancelled);
10631 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10632 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10633 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10634 __ ldr(rscratch1, Address(rscratch1));
10635 __ br(rscratch1);
10636
10637 return start;
10638 }
10639
10640 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10641 // are represented as long[5], with BITS_PER_LIMB = 26.
10642 // Pack five 26-bit limbs into three 64-bit registers.
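  // In effect (limb[0..4] are the five 26-bit values loaded from src):
  //   dest0 = limb0 | limb1 << 26 | limb2 << 52;      // low 12 bits of limb2
  //   dest1 = limb2 >> 12 | limb3 << 14 | limb4 << 40;
  //   dest2 = limb4 >> 24;                            // remaining (at most 2) bits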
10643 void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10644 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
10645 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
10646 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10647 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
10648
10649 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
10650 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
10651 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10652 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
10653
10654 if (dest2->is_valid()) {
10655 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
10656 } else {
10657 #ifdef ASSERT
10658 Label OK;
10659 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
10660 __ br(__ EQ, OK);
10661 __ stop("high bits of Poly1305 integer should be zero");
10662 __ should_not_reach_here();
10663 __ bind(OK);
10664 #endif
10665 }
10666 }
10667
10668 // As above, but return only a 128-bit integer, packed into two
10669 // 64-bit registers.
10670 void pack_26(Register dest0, Register dest1, Register src) {
10671 pack_26(dest0, dest1, noreg, src);
10672 }
10673
10674 // Multiply and multiply-accumulate unsigned 64-bit registers.
10675 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10676 __ mul(prod_lo, n, m);
10677 __ umulh(prod_hi, n, m);
10678 }
10679 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10680 wide_mul(rscratch1, rscratch2, n, m);
10681 __ adds(sum_lo, sum_lo, rscratch1);
10682 __ adc(sum_hi, sum_hi, rscratch2);
10683 }
10684
10685 // Poly1305, RFC 7539
10686
10687 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10688 // description of the tricks used to simplify and accelerate this
10689 // computation.
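  // In outline, for each 16-byte little-endian block B the loop below computes
  //   acc = ((acc + B + 2^128) * r) mod (2^130 - 5)
  // where the "+ 2^128" shows up as the add of 1 to S_2; the final reduction and
  // the re-split into 26-bit limbs happen after the loop.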
10690
10691 address generate_poly1305_processBlocks() {
10692 __ align(CodeEntryAlignment);
10693 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10694 StubCodeMark mark(this, stub_id);
10695 address start = __ pc();
10696 Label here;
10697 __ enter();
10698 RegSet callee_saved = RegSet::range(r19, r28);
10699 __ push(callee_saved, sp);
10700
10701 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10702
10703 // Arguments
10704 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10705
10706 // R_n is the 128-bit randomly-generated key, packed into two
10707 // registers. The caller passes this key to us as long[5], with
10708 // BITS_PER_LIMB = 26.
10709 const Register R_0 = *++regs, R_1 = *++regs;
10710 pack_26(R_0, R_1, r_start);
10711
10712 // RR_n is (R_n >> 2) * 5
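    // (2^128 is congruent to 5/4 modulo 2^130 - 5, and the clamped key has the
    //  low two bits of each half clear, so (R_n >> 2) * 5 folds the overflowing
    //  partial products back into the low words)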
10713 const Register RR_0 = *++regs, RR_1 = *++regs;
10714 __ lsr(RR_0, R_0, 2);
10715 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10716 __ lsr(RR_1, R_1, 2);
10717 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10718
10719 // U_n is the current checksum
10720 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10721 pack_26(U_0, U_1, U_2, acc_start);
10722
10723 static constexpr int BLOCK_LENGTH = 16;
10724 Label DONE, LOOP;
10725
10726 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10727 __ br(Assembler::LT, DONE); {
10728 __ bind(LOOP);
10729
10730 // S_n is to be the sum of U_n and the next block of data
10731 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10732 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10733 __ adds(S_0, U_0, S_0);
10734 __ adcs(S_1, U_1, S_1);
10735 __ adc(S_2, U_2, zr);
10736 __ add(S_2, S_2, 1);
10737
10738 const Register U_0HI = *++regs, U_1HI = *++regs;
10739
10740 // NB: this logic depends on some of the special properties of
10741 // Poly1305 keys. In particular, because we know that the top
10742 // four bits of R_0 and R_1 are zero, we can add together
10743 // partial products without any risk of needing to propagate a
10744 // carry out.
10745 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10746 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
10747 __ andr(U_2, R_0, 3);
10748 __ mul(U_2, S_2, U_2);
10749
10750 // Recycle registers S_0, S_1, S_2
10751 regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10752
10753 // Partial reduction mod 2**130 - 5
10754 __ adds(U_1, U_0HI, U_1);
10755 __ adc(U_2, U_1HI, U_2);
10756 // Sum now in U_2:U_1:U_0.
10757 // Dead: U_0HI, U_1HI.
10758 regs = (regs.remaining() + U_0HI + U_1HI).begin();
10759
10760 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10761
10762 // First, U_2:U_1:U_0 += (U_2 >> 2)
10763 __ lsr(rscratch1, U_2, 2);
10764 __ andr(U_2, U_2, (u8)3);
10765 __ adds(U_0, U_0, rscratch1);
10766 __ adcs(U_1, U_1, zr);
10767 __ adc(U_2, U_2, zr);
10768 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10769 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10770 __ adcs(U_1, U_1, zr);
10771 __ adc(U_2, U_2, zr);
10772
10773 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10774 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10775 __ br(~ Assembler::LT, LOOP);
10776 }
10777
10778 // Further reduce modulo 2^130 - 5
10779 __ lsr(rscratch1, U_2, 2);
10780 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10781 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10782 __ adcs(U_1, U_1, zr);
10783 __ andr(U_2, U_2, (u1)3);
10784 __ adc(U_2, U_2, zr);
10785
10786 // Unpack the sum into five 26-bit limbs and write to memory.
10787 __ ubfiz(rscratch1, U_0, 0, 26);
10788 __ ubfx(rscratch2, U_0, 26, 26);
10789 __ stp(rscratch1, rscratch2, Address(acc_start));
10790 __ ubfx(rscratch1, U_0, 52, 12);
10791 __ bfi(rscratch1, U_1, 12, 14);
10792 __ ubfx(rscratch2, U_1, 14, 26);
10793 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10794 __ ubfx(rscratch1, U_1, 40, 24);
10795 __ bfi(rscratch1, U_2, 24, 3);
10796 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10797
10798 __ bind(DONE);
10799 __ pop(callee_saved, sp);
10800 __ leave();
10801 __ ret(lr);
10802
10803 return start;
10804 }
10805
10806 // exception handler for upcall stubs
10807 address generate_upcall_stub_exception_handler() {
10808 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10809 StubCodeMark mark(this, stub_id);
10810 address start = __ pc();
10811
10812 // Native caller has no idea how to handle exceptions,
10813 // so we just crash here. Up to callee to catch exceptions.
10814 __ verify_oop(r0);
10815 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10816 __ blr(rscratch1);
10817 __ should_not_reach_here();
10818
10819 return start;
10820 }
10821
10822 // load Method* target of MethodHandle
10823 // j_rarg0 = jobject receiver
10824 // rmethod = result
10825 address generate_upcall_stub_load_target() {
10826 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10827 StubCodeMark mark(this, stub_id);
10828 address start = __ pc();
10829
10830 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10831 // Load target method from receiver
10832 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10833 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10834 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10835 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10836 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10837 noreg, noreg);
10838 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10839
10840 __ ret(lr);
10841
10842 return start;
10843 }
10844
10845 #undef __
10846 #define __ masm->
10847
10848 class MontgomeryMultiplyGenerator : public MacroAssembler {
10849
10850 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10851 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10852
10853 RegSet _toSave;
10854 bool _squaring;
10855
10856 public:
10857 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10858 : MacroAssembler(as->code()), _squaring(squaring) {
10859
10860 // Register allocation
10861
10862 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10863 Pa_base = *regs; // Argument registers
10864 if (squaring)
10865 Pb_base = Pa_base;
10866 else
10867 Pb_base = *++regs;
10868 Pn_base = *++regs;
10869     Rlen = *++regs;
10870 inv = *++regs;
10871 Pm_base = *++regs;
10872
10873 // Working registers:
10874 Ra = *++regs; // The current digit of a, b, n, and m.
10875 Rb = *++regs;
10876 Rm = *++regs;
10877 Rn = *++regs;
10878
10879 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
10880 Pb = *++regs;
10881 Pm = *++regs;
10882 Pn = *++regs;
10883
10884 t0 = *++regs; // Three registers which form a
10885     t1 = *++regs;         // triple-precision accumulator.
10886 t2 = *++regs;
10887
10888 Ri = *++regs; // Inner and outer loop indexes.
10889 Rj = *++regs;
10890
10891 Rhi_ab = *++regs; // Product registers: low and high parts
10892 Rlo_ab = *++regs; // of a*b and m*n.
10893 Rhi_mn = *++regs;
10894 Rlo_mn = *++regs;
10895
10896 // r19 and up are callee-saved.
10897 _toSave = RegSet::range(r19, *regs) + Pm_base;
10898 }
10899
10900 private:
10901 void save_regs() {
10902 push(_toSave, sp);
10903 }
10904
10905 void restore_regs() {
10906 pop(_toSave, sp);
10907 }
10908
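  // The two unroll_2 overloads below emit the loop body twice per
  // iteration, subtracting 2 from count each time around.  If count is
  // odd we branch into the middle of the unrolled pair, so the body
  // still executes exactly count times.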
10909 template <typename T>
10910 void unroll_2(Register count, T block) {
10911 Label loop, end, odd;
10912 tbnz(count, 0, odd);
10913 cbz(count, end);
10914 align(16);
10915 bind(loop);
10916 (this->*block)();
10917 bind(odd);
10918 (this->*block)();
10919 subs(count, count, 2);
10920 br(Assembler::GT, loop);
10921 bind(end);
10922 }
10923
10924 template <typename T>
10925 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10926 Label loop, end, odd;
10927 tbnz(count, 0, odd);
10928 cbz(count, end);
10929 align(16);
10930 bind(loop);
10931 (this->*block)(d, s, tmp);
10932 bind(odd);
10933 (this->*block)(d, s, tmp);
10934 subs(count, count, 2);
10935 br(Assembler::GT, loop);
10936 bind(end);
10937 }
10938
10939 void pre1(RegisterOrConstant i) {
10940 block_comment("pre1");
10941 // Pa = Pa_base;
10942 // Pb = Pb_base + i;
10943 // Pm = Pm_base;
10944 // Pn = Pn_base + i;
10945 // Ra = *Pa;
10946 // Rb = *Pb;
10947 // Rm = *Pm;
10948 // Rn = *Pn;
10949 ldr(Ra, Address(Pa_base));
10950 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10951 ldr(Rm, Address(Pm_base));
10952 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10953 lea(Pa, Address(Pa_base));
10954 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10955 lea(Pm, Address(Pm_base));
10956 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10957
10958 // Zero the m*n result.
10959 mov(Rhi_mn, zr);
10960 mov(Rlo_mn, zr);
10961 }
10962
10963 // The core multiply-accumulate step of a Montgomery
10964 // multiplication. The idea is to schedule operations as a
10965 // pipeline so that instructions with long latencies (loads and
10966 // multiplies) have time to complete before their results are
10967   // used. This mostly benefits in-order implementations of the
10968   // architecture, but out-of-order ones also benefit.
10969 void step() {
10970 block_comment("step");
10971 // MACC(Ra, Rb, t0, t1, t2);
10972 // Ra = *++Pa;
10973 // Rb = *--Pb;
10974 umulh(Rhi_ab, Ra, Rb);
10975 mul(Rlo_ab, Ra, Rb);
10976 ldr(Ra, pre(Pa, wordSize));
10977 ldr(Rb, pre(Pb, -wordSize));
10978 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
10979 // previous iteration.
10980 // MACC(Rm, Rn, t0, t1, t2);
10981 // Rm = *++Pm;
10982 // Rn = *--Pn;
10983 umulh(Rhi_mn, Rm, Rn);
10984 mul(Rlo_mn, Rm, Rn);
10985 ldr(Rm, pre(Pm, wordSize));
10986 ldr(Rn, pre(Pn, -wordSize));
10987 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10988 }
10989
10990 void post1() {
10991 block_comment("post1");
10992
10993 // MACC(Ra, Rb, t0, t1, t2);
10994 // Ra = *++Pa;
10995 // Rb = *--Pb;
10996 umulh(Rhi_ab, Ra, Rb);
10997 mul(Rlo_ab, Ra, Rb);
10998 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
10999 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11000
11001 // *Pm = Rm = t0 * inv;
11002 mul(Rm, t0, inv);
11003 str(Rm, Address(Pm));
11004
11005 // MACC(Rm, Rn, t0, t1, t2);
11006 // t0 = t1; t1 = t2; t2 = 0;
11007 umulh(Rhi_mn, Rm, Rn);
11008
11009 #ifndef PRODUCT
11010 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11011 {
11012 mul(Rlo_mn, Rm, Rn);
11013 add(Rlo_mn, t0, Rlo_mn);
11014 Label ok;
11015 cbz(Rlo_mn, ok); {
11016 stop("broken Montgomery multiply");
11017 } bind(ok);
11018 }
11019 #endif
11020 // We have very carefully set things up so that
11021 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11022 // the lower half of Rm * Rn because we know the result already:
11023 // it must be -t0. t0 + (-t0) must generate a carry iff
11024 // t0 != 0. So, rather than do a mul and an adds we just set
11025 // the carry flag iff t0 is nonzero.
11026 //
11027 // mul(Rlo_mn, Rm, Rn);
11028 // adds(zr, t0, Rlo_mn);
11029 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11030 adcs(t0, t1, Rhi_mn);
11031 adc(t1, t2, zr);
11032 mov(t2, zr);
11033 }
11034
11035 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11036 block_comment("pre2");
11037 // Pa = Pa_base + i-len;
11038 // Pb = Pb_base + len;
11039 // Pm = Pm_base + i-len;
11040 // Pn = Pn_base + len;
11041
11042 if (i.is_register()) {
11043 sub(Rj, i.as_register(), len);
11044 } else {
11045 mov(Rj, i.as_constant());
11046 sub(Rj, Rj, len);
11047 }
11048 // Rj == i-len
11049
11050 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11051 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11052 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11053 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11054
11055 // Ra = *++Pa;
11056 // Rb = *--Pb;
11057 // Rm = *++Pm;
11058 // Rn = *--Pn;
11059 ldr(Ra, pre(Pa, wordSize));
11060 ldr(Rb, pre(Pb, -wordSize));
11061 ldr(Rm, pre(Pm, wordSize));
11062 ldr(Rn, pre(Pn, -wordSize));
11063
11064 mov(Rhi_mn, zr);
11065 mov(Rlo_mn, zr);
11066 }
11067
11068 void post2(RegisterOrConstant i, RegisterOrConstant len) {
11069 block_comment("post2");
11070 if (i.is_constant()) {
11071 mov(Rj, i.as_constant()-len.as_constant());
11072 } else {
11073 sub(Rj, i.as_register(), len);
11074 }
11075
11076 adds(t0, t0, Rlo_mn); // The pending m*n, low part
11077
11078 // As soon as we know the least significant digit of our result,
11079 // store it.
11080 // Pm_base[i-len] = t0;
11081 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11082
11083 // t0 = t1; t1 = t2; t2 = 0;
11084 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11085 adc(t1, t2, zr);
11086 mov(t2, zr);
11087 }
11088
11089 // A carry in t0 after Montgomery multiplication means that we
11090 // should subtract multiples of n from our result in m. We'll
11091 // keep doing that until there is no carry.
11092 void normalize(RegisterOrConstant len) {
11093 block_comment("normalize");
11094 // while (t0)
11095 // t0 = sub(Pm_base, Pn_base, t0, len);
11096 Label loop, post, again;
11097 Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11098 cbz(t0, post); {
11099 bind(again); {
11100 mov(i, zr);
11101 mov(cnt, len);
11102 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11103 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11104 subs(zr, zr, zr); // set carry flag, i.e. no borrow
11105 align(16);
11106 bind(loop); {
11107 sbcs(Rm, Rm, Rn);
11108 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11109 add(i, i, 1);
11110 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11111 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11112 sub(cnt, cnt, 1);
11113 } cbnz(cnt, loop);
11114 sbc(t0, t0, zr);
11115 } cbnz(t0, again);
11116 } bind(post);
11117 }
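  // In C, approximately (a sketch of the sub() referred to in the comment
  // above; it is not defined in this file):
  //
  //   julong sub(julong Pm_base[], julong Pn_base[], julong t0, int len) {
  //     julong borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       julong b = (Pm_base[i] < Pn_base[i]) ||
  //                  (borrow && Pm_base[i] == Pn_base[i]);
  //       Pm_base[i] = Pm_base[i] - Pn_base[i] - borrow;
  //       borrow = b;
  //     }
  //     return t0 - borrow;
  //   }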
11118
11119   // Move memory at s to d, reversing words.
11120   // Increments d to the end of the copied memory.
11121   // Destroys tmp1 and tmp2.
11122   // Preserves len.
11123   // Leaves s pointing to the address that was in d at the start.
11124 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11125 assert(tmp1->encoding() < r19->encoding(), "register corruption");
11126 assert(tmp2->encoding() < r19->encoding(), "register corruption");
11127
11128 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11129 mov(tmp1, len);
11130 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11131 sub(s, d, len, ext::uxtw, LogBytesPerWord);
11132 }
11133 // where
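  // reverse1 copies one 64-bit word from s (pre-decremented) to d
  // (post-incremented), rotating it by 32 bits so that the two 32-bit
  // digits inside the word keep their significance order after the
  // word-level reversal.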
11134 void reverse1(Register d, Register s, Register tmp) {
11135 ldr(tmp, pre(s, -wordSize));
11136 ror(tmp, tmp, 32);
11137 str(tmp, post(d, wordSize));
11138 }
11139
11140 void step_squaring() {
11141 // An extra ACC
11142 step();
11143 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11144 }
11145
11146 void last_squaring(RegisterOrConstant i) {
11147 Label dont;
11148 // if ((i & 1) == 0) {
11149 tbnz(i.as_register(), 0, dont); {
11150 // MACC(Ra, Rb, t0, t1, t2);
11151 // Ra = *++Pa;
11152 // Rb = *--Pb;
11153 umulh(Rhi_ab, Ra, Rb);
11154 mul(Rlo_ab, Ra, Rb);
11155 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11156 } bind(dont);
11157 }
11158
11159 void extra_step_squaring() {
11160 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11161
11162 // MACC(Rm, Rn, t0, t1, t2);
11163 // Rm = *++Pm;
11164 // Rn = *--Pn;
11165 umulh(Rhi_mn, Rm, Rn);
11166 mul(Rlo_mn, Rm, Rn);
11167 ldr(Rm, pre(Pm, wordSize));
11168 ldr(Rn, pre(Pn, -wordSize));
11169 }
11170
11171 void post1_squaring() {
11172 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11173
11174 // *Pm = Rm = t0 * inv;
11175 mul(Rm, t0, inv);
11176 str(Rm, Address(Pm));
11177
11178 // MACC(Rm, Rn, t0, t1, t2);
11179 // t0 = t1; t1 = t2; t2 = 0;
11180 umulh(Rhi_mn, Rm, Rn);
11181
11182 #ifndef PRODUCT
11183 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11184 {
11185 mul(Rlo_mn, Rm, Rn);
11186 add(Rlo_mn, t0, Rlo_mn);
11187 Label ok;
11188 cbz(Rlo_mn, ok); {
11189 stop("broken Montgomery multiply");
11190 } bind(ok);
11191 }
11192 #endif
11193 // We have very carefully set things up so that
11194 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11195 // the lower half of Rm * Rn because we know the result already:
11196 // it must be -t0. t0 + (-t0) must generate a carry iff
11197 // t0 != 0. So, rather than do a mul and an adds we just set
11198 // the carry flag iff t0 is nonzero.
11199 //
11200 // mul(Rlo_mn, Rm, Rn);
11201 // adds(zr, t0, Rlo_mn);
11202 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11203 adcs(t0, t1, Rhi_mn);
11204 adc(t1, t2, zr);
11205 mov(t2, zr);
11206 }
11207
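  // Add the 128-bit product Rhi:Rlo into the triple-precision accumulator
  // t2:t1:t0.  This is the accumulate half of the MACC() operation used in
  // the C sketches below.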
11208 void acc(Register Rhi, Register Rlo,
11209 Register t0, Register t1, Register t2) {
11210 adds(t0, t0, Rlo);
11211 adcs(t1, t1, Rhi);
11212 adc(t2, t2, zr);
11213 }
11214
11215 public:
11216 /**
11217 * Fast Montgomery multiplication. The derivation of the
11218 * algorithm is in A Cryptographic Library for the Motorola
11219 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11220 *
11221 * Arguments:
11222 *
11223 * Inputs for multiplication:
11224 * c_rarg0 - int array elements a
11225 * c_rarg1 - int array elements b
11226 * c_rarg2 - int array elements n (the modulus)
11227 * c_rarg3 - int length
11228 * c_rarg4 - int inv
11229 * c_rarg5 - int array elements m (the result)
11230 *
11231 * Inputs for squaring:
11232 * c_rarg0 - int array elements a
11233 * c_rarg1 - int array elements n (the modulus)
11234 * c_rarg2 - int length
11235 * c_rarg3 - int inv
11236 * c_rarg4 - int array elements m (the result)
11237 *
11238 */
11239 address generate_multiply() {
11240 Label argh, nothing;
11241 bind(argh);
11242 stop("MontgomeryMultiply total_allocation must be <= 8192");
11243
11244 align(CodeEntryAlignment);
11245 address entry = pc();
11246
11247 cbzw(Rlen, nothing);
11248
11249 enter();
11250
11251 // Make room.
11252 cmpw(Rlen, 512);
11253 br(Assembler::HI, argh);
11254 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11255 andr(sp, Ra, -2 * wordSize);
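    // We reserve Rlen * 4 * sizeof(jint) bytes of stack (enough for the
    // reversed copies of the inputs and the result, each Rlen ints long)
    // and align sp down to 16 bytes; with Rlen <= 512 that is at most
    // 8192 bytes, matching the check above.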
11256
11257 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11258
11259 {
11260 // Copy input args, reversing as we go. We use Ra as a
11261 // temporary variable.
11262 reverse(Ra, Pa_base, Rlen, t0, t1);
11263 if (!_squaring)
11264 reverse(Ra, Pb_base, Rlen, t0, t1);
11265 reverse(Ra, Pn_base, Rlen, t0, t1);
11266 }
11267
11268 // Push all call-saved registers and also Pm_base which we'll need
11269 // at the end.
11270 save_regs();
11271
11272 #ifndef PRODUCT
11273 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11274 {
11275 ldr(Rn, Address(Pn_base, 0));
11276 mul(Rlo_mn, Rn, inv);
11277 subs(zr, Rlo_mn, -1);
11278 Label ok;
11279 br(EQ, ok); {
11280 stop("broken inverse in Montgomery multiply");
11281 } bind(ok);
11282 }
11283 #endif
11284
11285 mov(Pm_base, Ra);
11286
11287 mov(t0, zr);
11288 mov(t1, zr);
11289 mov(t2, zr);
11290
11291 block_comment("for (int i = 0; i < len; i++) {");
11292 mov(Ri, zr); {
11293 Label loop, end;
11294 cmpw(Ri, Rlen);
11295 br(Assembler::GE, end);
11296
11297 bind(loop);
11298 pre1(Ri);
11299
11300 block_comment(" for (j = i; j; j--) {"); {
11301 movw(Rj, Ri);
11302 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11303 } block_comment(" } // j");
11304
11305 post1();
11306 addw(Ri, Ri, 1);
11307 cmpw(Ri, Rlen);
11308 br(Assembler::LT, loop);
11309 bind(end);
11310 block_comment("} // i");
11311 }
11312
11313 block_comment("for (int i = len; i < 2*len; i++) {");
11314 mov(Ri, Rlen); {
11315 Label loop, end;
11316 cmpw(Ri, Rlen, Assembler::LSL, 1);
11317 br(Assembler::GE, end);
11318
11319 bind(loop);
11320 pre2(Ri, Rlen);
11321
11322 block_comment(" for (j = len*2-i-1; j; j--) {"); {
11323 lslw(Rj, Rlen, 1);
11324 subw(Rj, Rj, Ri);
11325 subw(Rj, Rj, 1);
11326 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11327 } block_comment(" } // j");
11328
11329 post2(Ri, Rlen);
11330 addw(Ri, Ri, 1);
11331 cmpw(Ri, Rlen, Assembler::LSL, 1);
11332 br(Assembler::LT, loop);
11333 bind(end);
11334 }
11335 block_comment("} // i");
11336
11337 normalize(Rlen);
11338
11339 mov(Ra, Pm_base); // Save Pm_base in Ra
11340 restore_regs(); // Restore caller's Pm_base
11341
11342 // Copy our result into caller's Pm_base
11343 reverse(Pm_base, Ra, Rlen, t0, t1);
11344
11345 leave();
11346 bind(nothing);
11347 ret(lr);
11348
11349 return entry;
11350 }
11351 // In C, approximately:
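  // (Here MACC(a, b, t0, t1, t2) is shorthand, not defined in this file, for
  // adding the 128-bit product a * b into the triple-precision accumulator
  // t2:t1:t0; MACC2 in the squaring sketch below adds that product twice.)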
11352
11353 // void
11354 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11355 // julong Pn_base[], julong Pm_base[],
11356 // julong inv, int len) {
11357 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11358 // julong *Pa, *Pb, *Pn, *Pm;
11359 // julong Ra, Rb, Rn, Rm;
11360
11361 // int i;
11362
11363 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11364
11365 // for (i = 0; i < len; i++) {
11366 // int j;
11367
11368 // Pa = Pa_base;
11369 // Pb = Pb_base + i;
11370 // Pm = Pm_base;
11371 // Pn = Pn_base + i;
11372
11373 // Ra = *Pa;
11374 // Rb = *Pb;
11375 // Rm = *Pm;
11376 // Rn = *Pn;
11377
11378 // int iters = i;
11379 // for (j = 0; iters--; j++) {
11380 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11381 // MACC(Ra, Rb, t0, t1, t2);
11382 // Ra = *++Pa;
11383 // Rb = *--Pb;
11384 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11385 // MACC(Rm, Rn, t0, t1, t2);
11386 // Rm = *++Pm;
11387 // Rn = *--Pn;
11388 // }
11389
11390 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11391 // MACC(Ra, Rb, t0, t1, t2);
11392 // *Pm = Rm = t0 * inv;
11393 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11394 // MACC(Rm, Rn, t0, t1, t2);
11395
11396 // assert(t0 == 0, "broken Montgomery multiply");
11397
11398 // t0 = t1; t1 = t2; t2 = 0;
11399 // }
11400
11401 // for (i = len; i < 2*len; i++) {
11402 // int j;
11403
11404 // Pa = Pa_base + i-len;
11405 // Pb = Pb_base + len;
11406 // Pm = Pm_base + i-len;
11407 // Pn = Pn_base + len;
11408
11409 // Ra = *++Pa;
11410 // Rb = *--Pb;
11411 // Rm = *++Pm;
11412 // Rn = *--Pn;
11413
11414 // int iters = len*2-i-1;
11415 // for (j = i-len+1; iters--; j++) {
11416 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11417 // MACC(Ra, Rb, t0, t1, t2);
11418 // Ra = *++Pa;
11419 // Rb = *--Pb;
11420 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11421 // MACC(Rm, Rn, t0, t1, t2);
11422 // Rm = *++Pm;
11423 // Rn = *--Pn;
11424 // }
11425
11426 // Pm_base[i-len] = t0;
11427 // t0 = t1; t1 = t2; t2 = 0;
11428 // }
11429
11430 // while (t0)
11431 // t0 = sub(Pm_base, Pn_base, t0, len);
11432 // }
11433
11434 /**
11435 * Fast Montgomery squaring. This uses asymptotically 25% fewer
11436 * multiplies than Montgomery multiplication so it should be up to
11437 * 25% faster. However, its loop control is more complex and it
11438 * may actually run slower on some machines.
11439 *
11440 * Arguments:
11441 *
11442 * Inputs:
11443 * c_rarg0 - int array elements a
11444 * c_rarg1 - int array elements n (the modulus)
11445 * c_rarg2 - int length
11446 * c_rarg3 - int inv
11447 * c_rarg4 - int array elements m (the result)
11448 *
11449 */
11450 address generate_square() {
11451 Label argh;
11452 bind(argh);
11453 stop("MontgomeryMultiply total_allocation must be <= 8192");
11454
11455 align(CodeEntryAlignment);
11456 address entry = pc();
11457
11458 enter();
11459
11460 // Make room.
11461 cmpw(Rlen, 512);
11462 br(Assembler::HI, argh);
11463 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11464 andr(sp, Ra, -2 * wordSize);
11465
11466 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11467
11468 {
11469 // Copy input args, reversing as we go. We use Ra as a
11470 // temporary variable.
11471 reverse(Ra, Pa_base, Rlen, t0, t1);
11472 reverse(Ra, Pn_base, Rlen, t0, t1);
11473 }
11474
11475 // Push all call-saved registers and also Pm_base which we'll need
11476 // at the end.
11477 save_regs();
11478
11479 mov(Pm_base, Ra);
11480
11481 mov(t0, zr);
11482 mov(t1, zr);
11483 mov(t2, zr);
11484
11485 block_comment("for (int i = 0; i < len; i++) {");
11486 mov(Ri, zr); {
11487 Label loop, end;
11488 bind(loop);
11489 cmp(Ri, Rlen);
11490 br(Assembler::GE, end);
11491
11492 pre1(Ri);
11493
11494 block_comment("for (j = (i+1)/2; j; j--) {"); {
11495 add(Rj, Ri, 1);
11496 lsr(Rj, Rj, 1);
11497 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11498 } block_comment(" } // j");
11499
11500 last_squaring(Ri);
11501
11502 block_comment(" for (j = i/2; j; j--) {"); {
11503 lsr(Rj, Ri, 1);
11504 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11505 } block_comment(" } // j");
11506
11507 post1_squaring();
11508 add(Ri, Ri, 1);
11509 cmp(Ri, Rlen);
11510 br(Assembler::LT, loop);
11511
11512 bind(end);
11513 block_comment("} // i");
11514 }
11515
11516 block_comment("for (int i = len; i < 2*len; i++) {");
11517 mov(Ri, Rlen); {
11518 Label loop, end;
11519 bind(loop);
11520 cmp(Ri, Rlen, Assembler::LSL, 1);
11521 br(Assembler::GE, end);
11522
11523 pre2(Ri, Rlen);
11524
11525 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
11526 lsl(Rj, Rlen, 1);
11527 sub(Rj, Rj, Ri);
11528 sub(Rj, Rj, 1);
11529 lsr(Rj, Rj, 1);
11530 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11531 } block_comment(" } // j");
11532
11533 last_squaring(Ri);
11534
11535 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
11536 lsl(Rj, Rlen, 1);
11537 sub(Rj, Rj, Ri);
11538 lsr(Rj, Rj, 1);
11539 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11540 } block_comment(" } // j");
11541
11542 post2(Ri, Rlen);
11543 add(Ri, Ri, 1);
11544 cmp(Ri, Rlen, Assembler::LSL, 1);
11545
11546 br(Assembler::LT, loop);
11547 bind(end);
11548 block_comment("} // i");
11549 }
11550
11551 normalize(Rlen);
11552
11553 mov(Ra, Pm_base); // Save Pm_base in Ra
11554 restore_regs(); // Restore caller's Pm_base
11555
11556 // Copy our result into caller's Pm_base
11557 reverse(Pm_base, Ra, Rlen, t0, t1);
11558
11559 leave();
11560 ret(lr);
11561
11562 return entry;
11563 }
11564 // In C, approximately:
11565
11566 // void
11567 // montgomery_square(julong Pa_base[], julong Pn_base[],
11568 // julong Pm_base[], julong inv, int len) {
11569 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11570 // julong *Pa, *Pb, *Pn, *Pm;
11571 // julong Ra, Rb, Rn, Rm;
11572
11573 // int i;
11574
11575 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11576
11577 // for (i = 0; i < len; i++) {
11578 // int j;
11579
11580 // Pa = Pa_base;
11581 // Pb = Pa_base + i;
11582 // Pm = Pm_base;
11583 // Pn = Pn_base + i;
11584
11585 // Ra = *Pa;
11586 // Rb = *Pb;
11587 // Rm = *Pm;
11588 // Rn = *Pn;
11589
11590 // int iters = (i+1)/2;
11591 // for (j = 0; iters--; j++) {
11592 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11593 // MACC2(Ra, Rb, t0, t1, t2);
11594 // Ra = *++Pa;
11595 // Rb = *--Pb;
11596 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11597 // MACC(Rm, Rn, t0, t1, t2);
11598 // Rm = *++Pm;
11599 // Rn = *--Pn;
11600 // }
11601 // if ((i & 1) == 0) {
11602 // assert(Ra == Pa_base[j], "must be");
11603 // MACC(Ra, Ra, t0, t1, t2);
11604 // }
11605 // iters = i/2;
11606 // assert(iters == i-j, "must be");
11607 // for (; iters--; j++) {
11608 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11609 // MACC(Rm, Rn, t0, t1, t2);
11610 // Rm = *++Pm;
11611 // Rn = *--Pn;
11612 // }
11613
11614 // *Pm = Rm = t0 * inv;
11615 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11616 // MACC(Rm, Rn, t0, t1, t2);
11617
11618 // assert(t0 == 0, "broken Montgomery multiply");
11619
11620 // t0 = t1; t1 = t2; t2 = 0;
11621 // }
11622
11623 // for (i = len; i < 2*len; i++) {
11624 // int start = i-len+1;
11625 // int end = start + (len - start)/2;
11626 // int j;
11627
11628 // Pa = Pa_base + i-len;
11629 // Pb = Pa_base + len;
11630 // Pm = Pm_base + i-len;
11631 // Pn = Pn_base + len;
11632
11633 // Ra = *++Pa;
11634 // Rb = *--Pb;
11635 // Rm = *++Pm;
11636 // Rn = *--Pn;
11637
11638 // int iters = (2*len-i-1)/2;
11639 // assert(iters == end-start, "must be");
11640 // for (j = start; iters--; j++) {
11641 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11642 // MACC2(Ra, Rb, t0, t1, t2);
11643 // Ra = *++Pa;
11644 // Rb = *--Pb;
11645 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11646 // MACC(Rm, Rn, t0, t1, t2);
11647 // Rm = *++Pm;
11648 // Rn = *--Pn;
11649 // }
11650 // if ((i & 1) == 0) {
11651 // assert(Ra == Pa_base[j], "must be");
11652 // MACC(Ra, Ra, t0, t1, t2);
11653 // }
11654 // iters = (2*len-i)/2;
11655 // assert(iters == len-j, "must be");
11656 // for (; iters--; j++) {
11657 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11658 // MACC(Rm, Rn, t0, t1, t2);
11659 // Rm = *++Pm;
11660 // Rn = *--Pn;
11661 // }
11662 // Pm_base[i-len] = t0;
11663 // t0 = t1; t1 = t2; t2 = 0;
11664 // }
11665
11666 // while (t0)
11667 // t0 = sub(Pm_base, Pn_base, t0, len);
11668 // }
11669 };
11670
11671 // Initialization
11672 void generate_preuniverse_stubs() {
11673 // preuniverse stubs are not needed for aarch64
11674 }
11675
11676 void generate_initial_stubs() {
11677     // Generates the initial stubs and initializes their entry points.
11678
11679     // These are entry points that exist on all platforms. Note: this is
11680     // code that could be shared among different platforms - however, the
11681     // benefit seems to be smaller than the disadvantage of having a
11682     // much more complicated generator structure. See also the comment in
11683     // stubRoutines.hpp.
11684
11685 StubRoutines::_forward_exception_entry = generate_forward_exception();
11686
11687 StubRoutines::_call_stub_entry =
11688 generate_call_stub(StubRoutines::_call_stub_return_address);
11689
11690 // is referenced by megamorphic call
11691 StubRoutines::_catch_exception_entry = generate_catch_exception();
11692
11693 // Initialize table for copy memory (arraycopy) check.
11694 if (UnsafeMemoryAccess::_table == nullptr) {
11695 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11696 }
11697
11698 if (UseCRC32Intrinsics) {
11699 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11700 }
11701
11702 if (UseCRC32CIntrinsics) {
11703 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11704 }
11705
11706 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11707 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11708 }
11709
11710 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11711 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11712 }
11713
11714 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11715 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11716 StubRoutines::_hf2f = generate_float16ToFloat();
11717 StubRoutines::_f2hf = generate_floatToFloat16();
11718 }
11719 }
11720
11721 void generate_continuation_stubs() {
11722 // Continuation stubs:
11723 StubRoutines::_cont_thaw = generate_cont_thaw();
11724 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11725 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11726 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11727 }
11728
11729 void generate_final_stubs() {
11730 // support for verify_oop (must happen after universe_init)
11731 if (VerifyOops) {
11732 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
11733 }
11734
11735 // arraycopy stubs used by compilers
11736 generate_arraycopy_stubs();
11737
11738 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11739
11740 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11741
11742 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11743 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11744
11745 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11746
11747 generate_atomic_entry_points();
11748
11749 #endif // LINUX && !__ARM_FEATURE_ATOMICS
11750
11751 #ifdef COMPILER2
11752 if (UseSecondarySupersTable) {
11753 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11754 if (! InlineSecondarySupersTest) {
11755 generate_lookup_secondary_supers_table_stub();
11756 }
11757 }
11758 #endif
11759
11760 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11761
11762     StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11763 }
11764
11765 void generate_compiler_stubs() {
11766 #if COMPILER2_OR_JVMCI
11767
11768 if (UseSVE == 0) {
11769 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11770 }
11771
11772 // array equals stub for large arrays.
11773 if (!UseSimpleArrayEquals) {
11774 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11775 }
11776
11777     // arrays_hashcode stub for large arrays.
11778 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11779 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11780 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11781 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11782 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11783
11784 // byte_array_inflate stub for large arrays.
11785 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11786
11787 // countPositives stub for large arrays.
11788 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11789
11790 generate_compare_long_strings();
11791
11792 generate_string_indexof_stubs();
11793
11794 #ifdef COMPILER2
11795 if (UseMultiplyToLenIntrinsic) {
11796 StubRoutines::_multiplyToLen = generate_multiplyToLen();
11797 }
11798
11799 if (UseSquareToLenIntrinsic) {
11800 StubRoutines::_squareToLen = generate_squareToLen();
11801 }
11802
11803 if (UseMulAddIntrinsic) {
11804 StubRoutines::_mulAdd = generate_mulAdd();
11805 }
11806
11807 if (UseSIMDForBigIntegerShiftIntrinsics) {
11808 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11809 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
11810 }
11811
11812 if (UseMontgomeryMultiplyIntrinsic) {
11813 StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11814 StubCodeMark mark(this, stub_id);
11815 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11816 StubRoutines::_montgomeryMultiply = g.generate_multiply();
11817 }
11818
11819 if (UseMontgomerySquareIntrinsic) {
11820 StubId stub_id = StubId::stubgen_montgomerySquare_id;
11821 StubCodeMark mark(this, stub_id);
11822 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11823 // We use generate_multiply() rather than generate_square()
11824 // because it's faster for the sizes of modulus we care about.
11825 StubRoutines::_montgomerySquare = g.generate_multiply();
11826 }
11827
11828 #endif // COMPILER2
11829
11830 if (UseChaCha20Intrinsics) {
11831 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11832 }
11833
11834 if (UseKyberIntrinsics) {
11835 StubRoutines::_kyberNtt = generate_kyberNtt();
11836 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11837 StubRoutines::_kyberNttMult = generate_kyberNttMult();
11838 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11839 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11840 StubRoutines::_kyber12To16 = generate_kyber12To16();
11841 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11842 }
11843
11844 if (UseDilithiumIntrinsics) {
11845 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11846 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11847 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11848 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11849 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11850 }
11851
11852 if (UseBASE64Intrinsics) {
11853 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11854 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11855 }
11856
11857 // data cache line writeback
11858 StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11859 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11860
11861 if (UseAESIntrinsics) {
11862 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11863 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11864 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11865 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11866 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11867 }
11868 if (UseGHASHIntrinsics) {
11869 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11870 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11871 }
11872 if (UseAESIntrinsics && UseGHASHIntrinsics) {
11873 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11874 }
11875
11876 if (UseMD5Intrinsics) {
11877 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
11878 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
11879 }
11880 if (UseSHA1Intrinsics) {
11881 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
11882 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
11883 }
11884 if (UseSHA256Intrinsics) {
11885 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
11886 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
11887 }
11888 if (UseSHA512Intrinsics) {
11889 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
11890 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
11891 }
11892 if (UseSHA3Intrinsics) {
11893
11894 StubRoutines::_double_keccak = generate_double_keccak();
11895 if (UseSIMDForSHA3Intrinsic) {
11896 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
11897 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
11898 } else {
11899 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
11900 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
11901 }
11902 }
11903
11904 if (UsePoly1305Intrinsics) {
11905 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11906 }
11907
11908 // generate Adler32 intrinsics code
11909 if (UseAdler32Intrinsics) {
11910 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11911 }
11912
11913 #endif // COMPILER2_OR_JVMCI
11914 }
11915
11916 public:
11917 StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
11918 switch(blob_id) {
11919 case BlobId::stubgen_preuniverse_id:
11920 generate_preuniverse_stubs();
11921 break;
11922 case BlobId::stubgen_initial_id:
11923 generate_initial_stubs();
11924 break;
11925 case BlobId::stubgen_continuation_id:
11926 generate_continuation_stubs();
11927 break;
11928 case BlobId::stubgen_compiler_id:
11929 generate_compiler_stubs();
11930 break;
11931 case BlobId::stubgen_final_id:
11932 generate_final_stubs();
11933 break;
11934 default:
11935 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
11936 break;
11937 };
11938 }
11939 }; // end class declaration
11940
11941 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
11942 StubGenerator g(code, blob_id);
11943 }
11944
11945
11946 #if defined (LINUX)
11947
11948 // Define pointers to atomic stubs and initialize them to point to the
11949 // code in atomic_aarch64.S.
11950
11951 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \
11952 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
11953 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
11954 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
11955 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
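// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) declares
// aarch64_atomic_fetch_add_4_default_impl (the default code in
// atomic_aarch64.S) and defines the pointer aarch64_atomic_fetch_add_4_impl,
// initially pointing at that default; the stub generator may later repoint
// it at generated code.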
11956
11957 DEFAULT_ATOMIC_OP(fetch_add, 4, )
11958 DEFAULT_ATOMIC_OP(fetch_add, 8, )
11959 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
11960 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
11961 DEFAULT_ATOMIC_OP(xchg, 4, )
11962 DEFAULT_ATOMIC_OP(xchg, 8, )
11963 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
11964 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
11965 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
11966 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
11967 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
11968 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
11969 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
11970 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
11971 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
11972 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
11973
11974 #undef DEFAULT_ATOMIC_OP
11975
11976 #endif // LINUX