1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "compiler/oopMap.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/interpreter.hpp"
36 #include "memory/universe.hpp"
37 #include "nativeInst_aarch64.hpp"
38 #include "oops/instanceOop.hpp"
39 #include "oops/method.hpp"
40 #include "oops/objArrayKlass.hpp"
41 #include "oops/oop.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "prims/upcallLinker.hpp"
44 #include "runtime/arguments.hpp"
45 #include "runtime/atomicAccess.hpp"
46 #include "runtime/continuation.hpp"
47 #include "runtime/continuationEntry.inline.hpp"
48 #include "runtime/frame.inline.hpp"
49 #include "runtime/handles.inline.hpp"
50 #include "runtime/javaThread.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubCodeGenerator.hpp"
53 #include "runtime/stubRoutines.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/checkedCast.hpp"
56 #include "utilities/debug.hpp"
57 #include "utilities/globalDefinitions.hpp"
58 #include "utilities/intpow.hpp"
59 #include "utilities/powerOfTwo.hpp"
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_ZGC
64 #include "gc/z/zThreadLocalData.hpp"
65 #endif
66
67 // Declaration and definition of StubGenerator (no .hpp file).
68 // For a more detailed description of the stub routine structure
69 // see the comment in stubRoutines.hpp
70
71 #undef __
72 #define __ _masm->
73
74 #ifdef PRODUCT
75 #define BLOCK_COMMENT(str) /* nothing */
76 #else
77 #define BLOCK_COMMENT(str) __ block_comment(str)
78 #endif
79
80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
81
82 // Stub Code definitions
83
84 class StubGenerator: public StubCodeGenerator {
85 private:
86
87 #ifdef PRODUCT
88 #define inc_counter_np(counter) ((void)0)
89 #else
90 void inc_counter_np_(uint& counter) {
91 __ incrementw(ExternalAddress((address)&counter));
92 }
93 #define inc_counter_np(counter) \
94 BLOCK_COMMENT("inc_counter " #counter); \
95 inc_counter_np_(counter);
96 #endif
97
98 // Call stubs are used to call Java from C
99 //
100 // Arguments:
  //    c_rarg0:   call wrapper address          address
  //    c_rarg1:   result                        address
  //    c_rarg2:   result type                   BasicType
  //    c_rarg3:   method                        Method*
  //    c_rarg4:   (interpreter) entry point     address
  //    c_rarg5:   parameters                    intptr_t*
  //    c_rarg6:   parameter size (in words)     int
  //    c_rarg7:   thread                        Thread*
109 //
110 // There is no return from the stub itself as any Java result
111 // is written to result
112 //
113 // we save r30 (lr) as the return PC at the base of the frame and
114 // link r29 (fp) below it as the frame pointer installing sp (r31)
115 // into fp.
116 //
117 // we save r0-r7, which accounts for all the c arguments.
118 //
119 // TODO: strictly do we need to save them all? they are treated as
120 // volatile by C so could we omit saving the ones we are going to
121 // place in global registers (thread? method?) or those we only use
122 // during setup of the Java call?
123 //
124 // we don't need to save r8 which C uses as an indirect result location
125 // return register.
126 //
127 // we don't need to save r9-r15 which both C and Java treat as
128 // volatile
129 //
130 // we don't need to save r16-18 because Java does not use them
131 //
132 // we save r19-r28 which Java uses as scratch registers and C
133 // expects to be callee-save
134 //
135 // we save the bottom 64 bits of each value stored in v8-v15; it is
136 // the responsibility of the caller to preserve larger values.
137 //
138 // so the stub frame looks like this when we enter Java code
139 //
140 // [ return_from_Java ] <--- sp
141 // [ argument word n ]
142 // ...
143 // -29 [ argument word 1 ]
  // -28 [ saved Floating-point Control Register ] <--- sp_after_call
  // -26 [ saved v15 ]
146 // -25 [ saved v14 ]
147 // -24 [ saved v13 ]
148 // -23 [ saved v12 ]
149 // -22 [ saved v11 ]
150 // -21 [ saved v10 ]
151 // -20 [ saved v9 ]
152 // -19 [ saved v8 ]
153 // -18 [ saved r28 ]
154 // -17 [ saved r27 ]
155 // -16 [ saved r26 ]
156 // -15 [ saved r25 ]
157 // -14 [ saved r24 ]
158 // -13 [ saved r23 ]
159 // -12 [ saved r22 ]
160 // -11 [ saved r21 ]
161 // -10 [ saved r20 ]
162 // -9 [ saved r19 ]
163 // -8 [ call wrapper (r0) ]
164 // -7 [ result (r1) ]
165 // -6 [ result type (r2) ]
166 // -5 [ method (r3) ]
167 // -4 [ entry point (r4) ]
168 // -3 [ parameters (r5) ]
169 // -2 [ parameter size (r6) ]
170 // -1 [ thread (r7) ]
171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
172 // 1 [ saved lr (r30) ]
173
174 // Call stub stack layout word offsets from fp
175 enum call_stub_layout {
176 sp_after_call_off = -28,
177
178 fpcr_off = sp_after_call_off,
179 d15_off = -26,
180 d13_off = -24,
181 d11_off = -22,
182 d9_off = -20,
183
184 r28_off = -18,
185 r26_off = -16,
186 r24_off = -14,
187 r22_off = -12,
188 r20_off = -10,
189 call_wrapper_off = -8,
190 result_off = -7,
191 result_type_off = -6,
192 method_off = -5,
193 entry_point_off = -4,
194 parameter_size_off = -2,
195 thread_off = -1,
196 fp_f = 0,
197 retaddr_off = 1,
198 };
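  // For example, with wordSize == 8 the saved thread slot (thread_off == -1)
  // sits at rfp - 8, and the frame setup in generate_call_stub drops sp to
  // rfp - 28 * 8 == rfp - 224, i.e. the slot named by sp_after_call_off.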
199
200 address generate_call_stub(address& return_address) {
201 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
202 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
203 "adjust this code");
204
205 StubId stub_id = StubId::stubgen_call_stub_id;
206 StubCodeMark mark(this, stub_id);
207 address start = __ pc();
208
209 const Address sp_after_call (rfp, sp_after_call_off * wordSize);
210
211 const Address fpcr_save (rfp, fpcr_off * wordSize);
212 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
213 const Address result (rfp, result_off * wordSize);
214 const Address result_type (rfp, result_type_off * wordSize);
215 const Address method (rfp, method_off * wordSize);
216 const Address entry_point (rfp, entry_point_off * wordSize);
217 const Address parameter_size(rfp, parameter_size_off * wordSize);
218
219 const Address thread (rfp, thread_off * wordSize);
220
221 const Address d15_save (rfp, d15_off * wordSize);
222 const Address d13_save (rfp, d13_off * wordSize);
223 const Address d11_save (rfp, d11_off * wordSize);
224 const Address d9_save (rfp, d9_off * wordSize);
225
226 const Address r28_save (rfp, r28_off * wordSize);
227 const Address r26_save (rfp, r26_off * wordSize);
228 const Address r24_save (rfp, r24_off * wordSize);
229 const Address r22_save (rfp, r22_off * wordSize);
230 const Address r20_save (rfp, r20_off * wordSize);
231
232 // stub code
233
234 address aarch64_entry = __ pc();
235
236 // set up frame and move sp to end of save area
237 __ enter();
238 __ sub(sp, rfp, -sp_after_call_off * wordSize);
239
240 // save register parameters and Java scratch/global registers
241 // n.b. we save thread even though it gets installed in
242 // rthread because we want to sanity check rthread later
243 __ str(c_rarg7, thread);
244 __ strw(c_rarg6, parameter_size);
245 __ stp(c_rarg4, c_rarg5, entry_point);
246 __ stp(c_rarg2, c_rarg3, result_type);
247 __ stp(c_rarg0, c_rarg1, call_wrapper);
248
249 __ stp(r20, r19, r20_save);
250 __ stp(r22, r21, r22_save);
251 __ stp(r24, r23, r24_save);
252 __ stp(r26, r25, r26_save);
253 __ stp(r28, r27, r28_save);
254
255 __ stpd(v9, v8, d9_save);
256 __ stpd(v11, v10, d11_save);
257 __ stpd(v13, v12, d13_save);
258 __ stpd(v15, v14, d15_save);
259
260 __ get_fpcr(rscratch1);
261 __ str(rscratch1, fpcr_save);
262 // Set FPCR to the state we need. We do want Round to Nearest. We
263 // don't want non-IEEE rounding modes or floating-point traps.
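    // n.b. in the standard AArch64 FPCR layout the first bfi below clears
    // bits 22-25 (RMode, FZ, DN) and the second clears bits 8-12, the
    // IOE/DZE/OFE/UFE/IXE trap-enable bits.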
264 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
265 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
266 __ set_fpcr(rscratch1);
267
268 // install Java thread in global register now we have saved
269 // whatever value it held
270 __ mov(rthread, c_rarg7);
271 // And method
272 __ mov(rmethod, c_rarg3);
273
274 // set up the heapbase register
275 __ reinit_heapbase();
276
277 #ifdef ASSERT
278 // make sure we have no pending exceptions
279 {
280 Label L;
281 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
282 __ cmp(rscratch1, (u1)NULL_WORD);
283 __ br(Assembler::EQ, L);
284 __ stop("StubRoutines::call_stub: entered with pending exception");
285 __ BIND(L);
286 }
287 #endif
288 // pass parameters if any
289 __ mov(esp, sp);
290 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
291 __ andr(sp, rscratch1, -2 * wordSize);
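    // The two instructions above reserve c_rarg6 words of parameter space
    // below sp and round the result down to a 16-byte boundary, keeping sp
    // 16-byte aligned as AArch64 requires.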
292
293 BLOCK_COMMENT("pass parameters if any");
294 Label parameters_done;
295 // parameter count is still in c_rarg6
296 // and parameter pointer identifying param 1 is in c_rarg5
297 __ cbzw(c_rarg6, parameters_done);
298
299 address loop = __ pc();
300 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
301 __ subsw(c_rarg6, c_rarg6, 1);
302 __ push(rscratch1);
303 __ br(Assembler::GT, loop);
304
305 __ BIND(parameters_done);
306
    // call Java entry -- passing Method* and current sp
308 // rmethod: Method*
309 // r19_sender_sp: sender sp
310 BLOCK_COMMENT("call Java function");
311 __ mov(r19_sender_sp, sp);
312 __ blr(c_rarg4);
313
314 // we do this here because the notify will already have been done
315 // if we get to the next instruction via an exception
316 //
317 // n.b. adding this instruction here affects the calculation of
318 // whether or not a routine returns to the call stub (used when
319 // doing stack walks) since the normal test is to check the return
320 // pc against the address saved below. so we may need to allow for
321 // this extra instruction in the check.
322
323 // save current address for use by exception handling code
324
325 return_address = __ pc();
326
327 // store result depending on type (everything that is not
328 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
329 // n.b. this assumes Java returns an integral result in r0
330 // and a floating result in j_farg0
331 __ ldr(j_rarg2, result);
332 Label is_long, is_float, is_double, exit;
333 __ ldr(j_rarg1, result_type);
334 __ cmp(j_rarg1, (u1)T_OBJECT);
335 __ br(Assembler::EQ, is_long);
336 __ cmp(j_rarg1, (u1)T_LONG);
337 __ br(Assembler::EQ, is_long);
338 __ cmp(j_rarg1, (u1)T_FLOAT);
339 __ br(Assembler::EQ, is_float);
340 __ cmp(j_rarg1, (u1)T_DOUBLE);
341 __ br(Assembler::EQ, is_double);
342
343 // handle T_INT case
344 __ strw(r0, Address(j_rarg2));
345
346 __ BIND(exit);
347
348 // pop parameters
349 __ sub(esp, rfp, -sp_after_call_off * wordSize);
350
351 #ifdef ASSERT
352 // verify that threads correspond
353 {
354 Label L, S;
355 __ ldr(rscratch1, thread);
356 __ cmp(rthread, rscratch1);
357 __ br(Assembler::NE, S);
358 __ get_thread(rscratch1);
359 __ cmp(rthread, rscratch1);
360 __ br(Assembler::EQ, L);
361 __ BIND(S);
362 __ stop("StubRoutines::call_stub: threads must correspond");
363 __ BIND(L);
364 }
365 #endif
366
367 __ pop_cont_fastpath(rthread);
368
369 // restore callee-save registers
370 __ ldpd(v15, v14, d15_save);
371 __ ldpd(v13, v12, d13_save);
372 __ ldpd(v11, v10, d11_save);
373 __ ldpd(v9, v8, d9_save);
374
375 __ ldp(r28, r27, r28_save);
376 __ ldp(r26, r25, r26_save);
377 __ ldp(r24, r23, r24_save);
378 __ ldp(r22, r21, r22_save);
379 __ ldp(r20, r19, r20_save);
380
381 // restore fpcr
382 __ ldr(rscratch1, fpcr_save);
383 __ set_fpcr(rscratch1);
384
385 __ ldp(c_rarg0, c_rarg1, call_wrapper);
386 __ ldrw(c_rarg2, result_type);
387 __ ldr(c_rarg3, method);
388 __ ldp(c_rarg4, c_rarg5, entry_point);
389 __ ldp(c_rarg6, c_rarg7, parameter_size);
390
391 // leave frame and return to caller
392 __ leave();
393 __ ret(lr);
394
395 // handle return types different from T_INT
396
397 __ BIND(is_long);
398 __ str(r0, Address(j_rarg2, 0));
399 __ br(Assembler::AL, exit);
400
401 __ BIND(is_float);
402 __ strs(j_farg0, Address(j_rarg2, 0));
403 __ br(Assembler::AL, exit);
404
405 __ BIND(is_double);
406 __ strd(j_farg0, Address(j_rarg2, 0));
407 __ br(Assembler::AL, exit);
408
409 return start;
410 }
411
412 // Return point for a Java call if there's an exception thrown in
413 // Java code. The exception is caught and transformed into a
414 // pending exception stored in JavaThread that can be tested from
415 // within the VM.
416 //
417 // Note: Usually the parameters are removed by the callee. In case
418 // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up
  // the stack pointer.
421 //
422 // r0: exception oop
423
424 address generate_catch_exception() {
425 StubId stub_id = StubId::stubgen_catch_exception_id;
426 StubCodeMark mark(this, stub_id);
427 address start = __ pc();
428
429 // same as in generate_call_stub():
430 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
431 const Address thread (rfp, thread_off * wordSize);
432
433 #ifdef ASSERT
434 // verify that threads correspond
435 {
436 Label L, S;
437 __ ldr(rscratch1, thread);
438 __ cmp(rthread, rscratch1);
439 __ br(Assembler::NE, S);
440 __ get_thread(rscratch1);
441 __ cmp(rthread, rscratch1);
442 __ br(Assembler::EQ, L);
443 __ bind(S);
444 __ stop("StubRoutines::catch_exception: threads must correspond");
445 __ bind(L);
446 }
447 #endif
448
449 // set pending exception
450 __ verify_oop(r0);
451
452 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
453 __ mov(rscratch1, (address)__FILE__);
454 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
455 __ movw(rscratch1, (int)__LINE__);
456 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
457
458 // complete return to VM
459 assert(StubRoutines::_call_stub_return_address != nullptr,
460 "_call_stub_return_address must have been generated before");
461 __ b(StubRoutines::_call_stub_return_address);
462
463 return start;
464 }
465
466 // Continuation point for runtime calls returning with a pending
467 // exception. The pending exception check happened in the runtime
468 // or native call stub. The pending exception in Thread is
469 // converted into a Java-level exception.
470 //
471 // Contract with Java-level exception handlers:
472 // r0: exception
473 // r3: throwing pc
474 //
475 // NOTE: At entry of this stub, exception-pc must be in LR !!
476
477 // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no frame prolog
479
480 address generate_forward_exception() {
481 StubId stub_id = StubId::stubgen_forward_exception_id;
482 StubCodeMark mark(this, stub_id);
483 address start = __ pc();
484
485 // Upon entry, LR points to the return address returning into
486 // Java (interpreted or compiled) code; i.e., the return address
487 // becomes the throwing pc.
488 //
489 // Arguments pushed before the runtime call are still on the stack
490 // but the exception handler will reset the stack pointer ->
491 // ignore them. A potential result in registers can be ignored as
492 // well.
493
494 #ifdef ASSERT
495 // make sure this code is only executed if there is a pending exception
496 {
497 Label L;
498 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
499 __ cbnz(rscratch1, L);
500 __ stop("StubRoutines::forward exception: no pending exception (1)");
501 __ bind(L);
502 }
503 #endif
504
505 // compute exception handler into r19
506
507 // call the VM to find the handler address associated with the
508 // caller address. pass thread in r0 and caller pc (ret address)
509 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
510 // the stack.
511 __ mov(c_rarg1, lr);
512 // lr will be trashed by the VM call so we move it to R19
513 // (callee-saved) because we also need to pass it to the handler
514 // returned by this call.
515 __ mov(r19, lr);
516 BLOCK_COMMENT("call exception_handler_for_return_address");
517 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
518 SharedRuntime::exception_handler_for_return_address),
519 rthread, c_rarg1);
520 // Reinitialize the ptrue predicate register, in case the external runtime
521 // call clobbers ptrue reg, as we may return to SVE compiled code.
522 __ reinitialize_ptrue();
523
524 // we should not really care that lr is no longer the callee
525 // address. we saved the value the handler needs in r19 so we can
526 // just copy it to r3. however, the C2 handler will push its own
527 // frame and then calls into the VM and the VM code asserts that
528 // the PC for the frame above the handler belongs to a compiled
529 // Java method. So, we restore lr here to satisfy that assert.
530 __ mov(lr, r19);
531 // setup r0 & r3 & clear pending exception
532 __ mov(r3, r19);
533 __ mov(r19, r0);
534 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
535 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
536
537 #ifdef ASSERT
538 // make sure exception is set
539 {
540 Label L;
541 __ cbnz(r0, L);
542 __ stop("StubRoutines::forward exception: no pending exception (2)");
543 __ bind(L);
544 }
545 #endif
546
547 // continue at exception handler
548 // r0: exception
549 // r3: throwing pc
550 // r19: exception handler
551 __ verify_oop(r0);
552 __ br(r19);
553
554 return start;
555 }
556
557 // Non-destructive plausibility checks for oops
558 //
559 // Arguments:
560 // r0: oop to verify
561 // rscratch1: error message
562 //
563 // Stack after saving c_rarg3:
564 // [tos + 0]: saved c_rarg3
565 // [tos + 1]: saved c_rarg2
566 // [tos + 2]: saved lr
567 // [tos + 3]: saved rscratch2
568 // [tos + 4]: saved r0
569 // [tos + 5]: saved rscratch1
570 address generate_verify_oop() {
571 StubId stub_id = StubId::stubgen_verify_oop_id;
572 StubCodeMark mark(this, stub_id);
573 address start = __ pc();
574
575 Label exit, error;
576
577 // save c_rarg2 and c_rarg3
578 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
579
580 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
581 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
582 __ ldr(c_rarg3, Address(c_rarg2));
583 __ add(c_rarg3, c_rarg3, 1);
584 __ str(c_rarg3, Address(c_rarg2));
585
586 // object is in r0
587 // make sure object is 'reasonable'
588 __ cbz(r0, exit); // if obj is null it is OK
589
590 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
591 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
592
593 // return if everything seems ok
594 __ bind(exit);
595
596 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
597 __ ret(lr);
598
599 // handle errors
600 __ bind(error);
601 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
602
603 __ push(RegSet::range(r0, r29), sp);
604 // debug(char* msg, int64_t pc, int64_t regs[])
605 __ mov(c_rarg0, rscratch1); // pass address of error message
606 __ mov(c_rarg1, lr); // pass return address
607 __ mov(c_rarg2, sp); // pass address of regs on stack
608 #ifndef PRODUCT
609 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
610 #endif
611 BLOCK_COMMENT("call MacroAssembler::debug");
612 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
613 __ blr(rscratch1);
614 __ hlt(0);
615
616 return start;
617 }
618
619 // Generate indices for iota vector.
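  // Each emit_data64 below lays down eight bytes little-endian: the two B
  // words give byte lanes 0..15 in ascending order, the H/S/D rows give
  // 16-, 32- and 64-bit lane indices, and the trailing rows hold the same
  // leading indices as float (0.0f..3.0f) and double (0.0, 1.0) constants.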
620 address generate_iota_indices(StubId stub_id) {
621 __ align(CodeEntryAlignment);
622 StubCodeMark mark(this, stub_id);
623 address start = __ pc();
624 // B
625 __ emit_data64(0x0706050403020100, relocInfo::none);
626 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
627 // H
628 __ emit_data64(0x0003000200010000, relocInfo::none);
629 __ emit_data64(0x0007000600050004, relocInfo::none);
630 // S
631 __ emit_data64(0x0000000100000000, relocInfo::none);
632 __ emit_data64(0x0000000300000002, relocInfo::none);
633 // D
634 __ emit_data64(0x0000000000000000, relocInfo::none);
635 __ emit_data64(0x0000000000000001, relocInfo::none);
636 // S - FP
637 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
638 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
639 // D - FP
640 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
641 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
642 return start;
643 }
644
645 // The inner part of zero_words(). This is the bulk operation,
646 // zeroing words in blocks, possibly using DC ZVA to do it. The
647 // caller is responsible for zeroing the last few words.
648 //
649 // Inputs:
650 // r10: the HeapWord-aligned base address of an array to zero.
651 // r11: the count in HeapWords, r11 > 0.
652 //
653 // Returns r10 and r11, adjusted for the caller to clear.
654 // r10: the base address of the tail of words left to clear.
655 // r11: the number of words in the tail.
656 // r11 < MacroAssembler::zero_words_block_size.
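  // For example, with a 16-byte-aligned base and r11 == 29 -- too few words
  // for the DC ZVA path at the default BlockZeroingLowLimit -- the unrolled
  // loop below clears 24 words and the stub returns with r10 advanced past
  // the cleared words and r11 == 5.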
657
658 address generate_zero_blocks() {
659 Label done;
660 Label base_aligned;
661
662 Register base = r10, cnt = r11;
663
664 __ align(CodeEntryAlignment);
665 StubId stub_id = StubId::stubgen_zero_blocks_id;
666 StubCodeMark mark(this, stub_id);
667 address start = __ pc();
668
669 if (UseBlockZeroing) {
670 int zva_length = VM_Version::zva_length();
671
672 // Ensure ZVA length can be divided by 16. This is required by
673 // the subsequent operations.
674 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
675
676 __ tbz(base, 3, base_aligned);
677 __ str(zr, Address(__ post(base, 8)));
678 __ sub(cnt, cnt, 1);
679 __ bind(base_aligned);
680
681 // Ensure count >= zva_length * 2 so that it still deserves a zva after
682 // alignment.
683 Label small;
684 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
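      // e.g. with a 64-byte ZVA block and the default 256-byte
      // BlockZeroingLowLimit this gives low_limit == 256, so DC ZVA is only
      // attempted when at least 32 words remain after alignment.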
685 __ subs(rscratch1, cnt, low_limit >> 3);
686 __ br(Assembler::LT, small);
687 __ zero_dcache_blocks(base, cnt);
688 __ bind(small);
689 }
690
691 {
692 // Number of stp instructions we'll unroll
693 const int unroll =
694 MacroAssembler::zero_words_block_size / 2;
695 // Clear the remaining blocks.
696 Label loop;
697 __ subs(cnt, cnt, unroll * 2);
698 __ br(Assembler::LT, done);
699 __ bind(loop);
700 for (int i = 0; i < unroll; i++)
701 __ stp(zr, zr, __ post(base, 16));
702 __ subs(cnt, cnt, unroll * 2);
703 __ br(Assembler::GE, loop);
704 __ bind(done);
705 __ add(cnt, cnt, unroll * 2);
706 }
707
708 __ ret(lr);
709
710 return start;
711 }
712
713
714 typedef enum {
715 copy_forwards = 1,
716 copy_backwards = -1
717 } copy_direction;
718
719 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
720 // for arraycopy stubs.
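  // For primitive element types these helpers boil down to plain ldp/stp
  // (or ldpq/stpq) sequences; collectors whose barriers must see the copied
  // values (ZGC, for example) override copy_load_at / copy_store_at in
  // their BarrierSetAssembler, which is why the spare gc temp registers are
  // threaded through here.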
721 class ArrayCopyBarrierSetHelper : StackObj {
722 BarrierSetAssembler* _bs_asm;
723 MacroAssembler* _masm;
724 DecoratorSet _decorators;
725 BasicType _type;
726 Register _gct1;
727 Register _gct2;
728 Register _gct3;
729 FloatRegister _gcvt1;
730 FloatRegister _gcvt2;
731 FloatRegister _gcvt3;
732
733 public:
734 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
735 DecoratorSet decorators,
736 BasicType type,
737 Register gct1,
738 Register gct2,
739 Register gct3,
740 FloatRegister gcvt1,
741 FloatRegister gcvt2,
742 FloatRegister gcvt3)
743 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
744 _masm(masm),
745 _decorators(decorators),
746 _type(type),
747 _gct1(gct1),
748 _gct2(gct2),
749 _gct3(gct3),
750 _gcvt1(gcvt1),
751 _gcvt2(gcvt2),
752 _gcvt3(gcvt3) {
753 }
754
755 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
756 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
757 dst1, dst2, src,
758 _gct1, _gct2, _gcvt1);
759 }
760
761 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
762 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
763 dst, src1, src2,
764 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
765 }
766
767 void copy_load_at_16(Register dst1, Register dst2, Address src) {
768 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
769 dst1, dst2, src,
770 _gct1);
771 }
772
773 void copy_store_at_16(Address dst, Register src1, Register src2) {
774 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
775 dst, src1, src2,
776 _gct1, _gct2, _gct3);
777 }
778
779 void copy_load_at_8(Register dst, Address src) {
780 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
781 dst, noreg, src,
782 _gct1);
783 }
784
785 void copy_store_at_8(Address dst, Register src) {
786 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
787 dst, src, noreg,
788 _gct1, _gct2, _gct3);
789 }
790 };
791
792 // Bulk copy of blocks of 8 words.
793 //
794 // count is a count of words.
795 //
796 // Precondition: count >= 8
797 //
798 // Postconditions:
799 //
800 // The least significant bit of count contains the remaining count
801 // of words to copy. The rest of count is trash.
802 //
803 // s and d are adjusted to point to the remaining words to copy
804 //
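  // For example, a call with count == 29 copies 24 words via the main
  // 8-word loop and drain, 4 more via the tail test on bit 2 of count, and
  // returns with bit 0 == 1 to tell the caller one word remains.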
805 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
806 BasicType type;
807 copy_direction direction;
808
809 switch (stub_id) {
810 case StubId::stubgen_copy_byte_f_id:
811 direction = copy_forwards;
812 type = T_BYTE;
813 break;
814 case StubId::stubgen_copy_byte_b_id:
815 direction = copy_backwards;
816 type = T_BYTE;
817 break;
818 case StubId::stubgen_copy_oop_f_id:
819 direction = copy_forwards;
820 type = T_OBJECT;
821 break;
822 case StubId::stubgen_copy_oop_b_id:
823 direction = copy_backwards;
824 type = T_OBJECT;
825 break;
826 case StubId::stubgen_copy_oop_uninit_f_id:
827 direction = copy_forwards;
828 type = T_OBJECT;
829 break;
830 case StubId::stubgen_copy_oop_uninit_b_id:
831 direction = copy_backwards;
832 type = T_OBJECT;
833 break;
834 default:
835 ShouldNotReachHere();
836 }
837
838 int unit = wordSize * direction;
839 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
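    // unit is +8 or -8 bytes depending on direction; bias is the 16 (or 32
    // when SIMD pairs are used) byte adjustment applied to s and d before a
    // forwards copy so that the offset and pre-indexed addressing below
    // walks whole 64-byte blocks.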
840
841 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
842 t4 = r7, t5 = r11, t6 = r12, t7 = r13;
843 const Register stride = r14;
844 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
845 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
846 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
847
848 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
849 assert_different_registers(s, d, count, rscratch1, rscratch2);
850
851 Label again, drain;
852
853 __ align(CodeEntryAlignment);
854
855 StubCodeMark mark(this, stub_id);
856
857 address start = __ pc();
858
859 Label unaligned_copy_long;
860 if (AvoidUnalignedAccesses) {
861 __ tbnz(d, 3, unaligned_copy_long);
862 }
863
864 if (direction == copy_forwards) {
865 __ sub(s, s, bias);
866 __ sub(d, d, bias);
867 }
868
869 #ifdef ASSERT
870 // Make sure we are never given < 8 words
871 {
872 Label L;
873 __ cmp(count, (u1)8);
874 __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
876 __ bind(L);
877 }
878 #endif
879
880 // Fill 8 registers
881 if (UseSIMDForMemoryOps) {
882 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
883 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
884 } else {
885 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
886 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
887 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
888 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
889 }
890
891 __ subs(count, count, 16);
892 __ br(Assembler::LO, drain);
893
894 int prefetch = PrefetchCopyIntervalInBytes;
895 bool use_stride = false;
896 if (direction == copy_backwards) {
897 use_stride = prefetch > 256;
898 prefetch = -prefetch;
899 if (use_stride) __ mov(stride, prefetch);
900 }
901
902 __ bind(again);
903
904 if (PrefetchCopyIntervalInBytes > 0)
905 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
906
907 if (UseSIMDForMemoryOps) {
908 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
909 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
910 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
911 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
912 } else {
913 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
914 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
915 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
916 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
917 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
918 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
919 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
920 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
921 }
922
923 __ subs(count, count, 8);
924 __ br(Assembler::HS, again);
925
926 // Drain
927 __ bind(drain);
928 if (UseSIMDForMemoryOps) {
929 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
930 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
931 } else {
932 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
933 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
934 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
935 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
936 }
937
938 {
939 Label L1, L2;
940 __ tbz(count, exact_log2(4), L1);
941 if (UseSIMDForMemoryOps) {
942 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
943 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
944 } else {
945 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
946 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
947 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
948 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
949 }
950 __ bind(L1);
951
952 if (direction == copy_forwards) {
953 __ add(s, s, bias);
954 __ add(d, d, bias);
955 }
956
957 __ tbz(count, 1, L2);
958 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
959 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
960 __ bind(L2);
961 }
962
963 __ ret(lr);
964
965 if (AvoidUnalignedAccesses) {
966 Label drain, again;
967 // Register order for storing. Order is different for backward copy.
968
969 __ bind(unaligned_copy_long);
970
971 // source address is even aligned, target odd aligned
972 //
973 // when forward copying word pairs we read long pairs at offsets
974 // {0, 2, 4, 6} (in long words). when backwards copying we read
975 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
976 // address by -2 in the forwards case so we can compute the
977 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
978 // or -1.
979 //
980 // when forward copying we need to store 1 word, 3 pairs and
981 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
983 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
984 //
      // when backwards copying we need to store 1 word, 3 pairs and
986 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
987 // offsets {1, 3, 5, 7, 8} * unit.
988
989 if (direction == copy_forwards) {
990 __ sub(s, s, 16);
991 __ sub(d, d, 8);
992 }
993
994 // Fill 8 registers
995 //
996 // for forwards copy s was offset by -16 from the original input
997 // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
999 // and so on for each successive 64 byte block when s is updated
1000 //
1001 // t0 at offset 0, t1 at offset 8
1002 // t2 at offset 16, t3 at offset 24
1003 // t4 at offset 32, t5 at offset 40
1004 // t6 at offset 48, t7 at offset 56
1005
1006 // for backwards copy s was not offset so the register contents
1007 // are at these offsets into the preceding 64 byte block
1008 // relative to that original input and so on for each successive
1009 // preceding 64 byte block when s is updated. this explains the
1010 // slightly counter-intuitive looking pattern of register usage
1011 // in the stp instructions for backwards copy.
1012 //
1013 // t0 at offset -16, t1 at offset -8
1014 // t2 at offset -32, t3 at offset -24
1015 // t4 at offset -48, t5 at offset -40
1016 // t6 at offset -64, t7 at offset -56
1017
1018 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1019 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1020 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1021 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1022
1023 __ subs(count, count, 16);
1024 __ br(Assembler::LO, drain);
1025
1026 int prefetch = PrefetchCopyIntervalInBytes;
1027 bool use_stride = false;
1028 if (direction == copy_backwards) {
1029 use_stride = prefetch > 256;
1030 prefetch = -prefetch;
1031 if (use_stride) __ mov(stride, prefetch);
1032 }
1033
1034 __ bind(again);
1035
1036 if (PrefetchCopyIntervalInBytes > 0)
1037 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1038
1039 if (direction == copy_forwards) {
1040 // allowing for the offset of -8 the store instructions place
        // registers into the target 64 byte block at the following
1042 // offsets
1043 //
1044 // t0 at offset 0
1045 // t1 at offset 8, t2 at offset 16
1046 // t3 at offset 24, t4 at offset 32
1047 // t5 at offset 40, t6 at offset 48
1048 // t7 at offset 56
1049
1050 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1051 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1052 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1053 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1054 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1055 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1056 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1057 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1058 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1059 } else {
1060 // d was not offset when we started so the registers are
        // written into the 64 byte block preceding d with the following
1062 // offsets
1063 //
1064 // t1 at offset -8
1065 // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
1067 // t7 at offset -56, t4 at offset -48
1068 // t6 at offset -64
1069 //
1070 // note that this matches the offsets previously noted for the
1071 // loads
1072
1073 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1074 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1075 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1076 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1077 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1078 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1079 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1080 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1081 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1082 }
1083
1084 __ subs(count, count, 8);
1085 __ br(Assembler::HS, again);
1086
1087 // Drain
1088 //
1089 // this uses the same pattern of offsets and register arguments
1090 // as above
1091 __ bind(drain);
1092 if (direction == copy_forwards) {
1093 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1094 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1095 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1096 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1097 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1098 } else {
1099 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1100 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1101 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1102 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1103 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1104 }
1105 // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
1107 // bits 2 and 1 in the count are the tell-tale for whether we
1108 // have each such subblock
1109 {
1110 Label L1, L2;
1111 __ tbz(count, exact_log2(4), L1);
1112 // this is the same as above but copying only 4 longs hence
1113 // with only one intervening stp between the str instructions
1114 // but note that the offsets and registers still follow the
1115 // same pattern
1116 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1117 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1118 if (direction == copy_forwards) {
1119 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1120 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1121 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1122 } else {
1123 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1124 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1125 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1126 }
1127 __ bind(L1);
1128
1129 __ tbz(count, 1, L2);
1130 // this is the same as above but copying only 2 longs hence
1131 // there is no intervening stp between the str instructions
1132 // but note that the offset and register patterns are still
1133 // the same
1134 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1135 if (direction == copy_forwards) {
1136 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1137 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1138 } else {
1139 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1140 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1141 }
1142 __ bind(L2);
1143
1144 // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written
1146
1147 if (direction == copy_forwards) {
1148 __ add(s, s, 16);
1149 __ add(d, d, 8);
1150 }
1151
1152 }
1153
1154 __ ret(lr);
1155 }
1156
1157 return start;
1158 }
1159
1160 // Small copy: less than 16 bytes.
1161 //
1162 // NB: Ignores all of the bits of count which represent more than 15
1163 // bytes, so a caller doesn't have to mask them.
1164
1165 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1166 bool is_backwards = step < 0;
1167 size_t granularity = g_uabs(step);
1168 int direction = is_backwards ? -1 : 1;
1169
1170 Label Lword, Lint, Lshort, Lbyte;
1171
1172 assert(granularity
1173 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1174
1175 const Register t0 = r3;
1176 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1177 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1178
1179 // ??? I don't know if this bit-test-and-branch is the right thing
1180 // to do. It does a lot of jumping, resulting in several
1181 // mispredicted branches. It might make more sense to do this
1182 // with something like Duff's device with a single computed branch.
1183
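    // For example, a byte copy (granularity == 1) with count == 13 (0b1101)
    // takes the word test (8 bytes), then the int test (4 bytes), skips the
    // short test and finishes with the single byte test.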
1184 __ tbz(count, 3 - exact_log2(granularity), Lword);
1185 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1186 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1187 __ bind(Lword);
1188
1189 if (granularity <= sizeof (jint)) {
1190 __ tbz(count, 2 - exact_log2(granularity), Lint);
1191 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1192 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1193 __ bind(Lint);
1194 }
1195
1196 if (granularity <= sizeof (jshort)) {
1197 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1198 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1199 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1200 __ bind(Lshort);
1201 }
1202
1203 if (granularity <= sizeof (jbyte)) {
1204 __ tbz(count, 0, Lbyte);
1205 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1206 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1207 __ bind(Lbyte);
1208 }
1209 }
1210
1211 // All-singing all-dancing memory copy.
1212 //
1213 // Copy count units of memory from s to d. The size of a unit is
1214 // step, which can be positive or negative depending on the direction
1215 // of copy. If is_aligned is false, we align the source address.
1216 //
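  // For example, a conjoint jshort copy that must run backwards passes
  // step == -2, so granularity == 2, is_backwards is true and the bulk path
  // walks down from the high addresses.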
1217
1218 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1219 Register s, Register d, Register count, int step) {
1220 copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1221 bool is_backwards = step < 0;
1222 unsigned int granularity = g_uabs(step);
1223 const Register t0 = r3, t1 = r4;
1224
1225 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
1226 // load all the data before writing anything
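    // For a byte copy the dispatch below works out to: 0..16 bytes ->
    // copy16, 17..32 -> copy32, 33..64 -> the straight-line code here,
    // 65..80 (96 with SIMD) -> copy80, anything larger -> copy_big.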
1227 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1228 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1229 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1230 const Register send = r17, dend = r16;
1231 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1232 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1233 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1234
1235 if (PrefetchCopyIntervalInBytes > 0)
1236 __ prfm(Address(s, 0), PLDL1KEEP);
1237 __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1238 __ br(Assembler::HI, copy_big);
1239
1240 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1241 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1242
1243 __ cmp(count, u1(16/granularity));
1244 __ br(Assembler::LS, copy16);
1245
1246 __ cmp(count, u1(64/granularity));
1247 __ br(Assembler::HI, copy80);
1248
1249 __ cmp(count, u1(32/granularity));
1250 __ br(Assembler::LS, copy32);
1251
1252 // 33..64 bytes
1253 if (UseSIMDForMemoryOps) {
1254 bs.copy_load_at_32(v0, v1, Address(s, 0));
1255 bs.copy_load_at_32(v2, v3, Address(send, -32));
1256 bs.copy_store_at_32(Address(d, 0), v0, v1);
1257 bs.copy_store_at_32(Address(dend, -32), v2, v3);
1258 } else {
1259 bs.copy_load_at_16(t0, t1, Address(s, 0));
1260 bs.copy_load_at_16(t2, t3, Address(s, 16));
1261 bs.copy_load_at_16(t4, t5, Address(send, -32));
1262 bs.copy_load_at_16(t6, t7, Address(send, -16));
1263
1264 bs.copy_store_at_16(Address(d, 0), t0, t1);
1265 bs.copy_store_at_16(Address(d, 16), t2, t3);
1266 bs.copy_store_at_16(Address(dend, -32), t4, t5);
1267 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1268 }
1269 __ b(finish);
1270
1271 // 17..32 bytes
1272 __ bind(copy32);
1273 bs.copy_load_at_16(t0, t1, Address(s, 0));
1274 bs.copy_load_at_16(t6, t7, Address(send, -16));
1275
1276 bs.copy_store_at_16(Address(d, 0), t0, t1);
1277 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1278 __ b(finish);
1279
1280 // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1282 __ bind(copy80);
1283 if (UseSIMDForMemoryOps) {
1284 bs.copy_load_at_32(v0, v1, Address(s, 0));
1285 bs.copy_load_at_32(v2, v3, Address(s, 32));
1286 // Unaligned pointers can be an issue for copying.
1287 // The issue has more chances to happen when granularity of data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at least
1289 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1290 // The most performance drop has been seen for the range 65-80 bytes.
1291 // For such cases using the pair of ldp/stp instead of the third pair of
1292 // ldpq/stpq fixes the performance issue.
1293 if (granularity < sizeof (jint)) {
1294 Label copy96;
1295 __ cmp(count, u1(80/granularity));
1296 __ br(Assembler::HI, copy96);
1297 bs.copy_load_at_16(t0, t1, Address(send, -16));
1298
1299 bs.copy_store_at_32(Address(d, 0), v0, v1);
1300 bs.copy_store_at_32(Address(d, 32), v2, v3);
1301
1302 bs.copy_store_at_16(Address(dend, -16), t0, t1);
1303 __ b(finish);
1304
1305 __ bind(copy96);
1306 }
1307 bs.copy_load_at_32(v4, v5, Address(send, -32));
1308
1309 bs.copy_store_at_32(Address(d, 0), v0, v1);
1310 bs.copy_store_at_32(Address(d, 32), v2, v3);
1311
1312 bs.copy_store_at_32(Address(dend, -32), v4, v5);
1313 } else {
1314 bs.copy_load_at_16(t0, t1, Address(s, 0));
1315 bs.copy_load_at_16(t2, t3, Address(s, 16));
1316 bs.copy_load_at_16(t4, t5, Address(s, 32));
1317 bs.copy_load_at_16(t6, t7, Address(s, 48));
1318 bs.copy_load_at_16(t8, t9, Address(send, -16));
1319
1320 bs.copy_store_at_16(Address(d, 0), t0, t1);
1321 bs.copy_store_at_16(Address(d, 16), t2, t3);
1322 bs.copy_store_at_16(Address(d, 32), t4, t5);
1323 bs.copy_store_at_16(Address(d, 48), t6, t7);
1324 bs.copy_store_at_16(Address(dend, -16), t8, t9);
1325 }
1326 __ b(finish);
1327
1328 // 0..16 bytes
1329 __ bind(copy16);
1330 __ cmp(count, u1(8/granularity));
1331 __ br(Assembler::LO, copy8);
1332
1333 // 8..16 bytes
1334 bs.copy_load_at_8(t0, Address(s, 0));
1335 bs.copy_load_at_8(t1, Address(send, -8));
1336 bs.copy_store_at_8(Address(d, 0), t0);
1337 bs.copy_store_at_8(Address(dend, -8), t1);
1338 __ b(finish);
1339
1340 if (granularity < 8) {
1341 // 4..7 bytes
1342 __ bind(copy8);
1343 __ tbz(count, 2 - exact_log2(granularity), copy4);
1344 __ ldrw(t0, Address(s, 0));
1345 __ ldrw(t1, Address(send, -4));
1346 __ strw(t0, Address(d, 0));
1347 __ strw(t1, Address(dend, -4));
1348 __ b(finish);
1349 if (granularity < 4) {
1350 // 0..3 bytes
1351 __ bind(copy4);
1352 __ cbz(count, finish); // get rid of 0 case
1353 if (granularity == 2) {
1354 __ ldrh(t0, Address(s, 0));
1355 __ strh(t0, Address(d, 0));
1356 } else { // granularity == 1
1357 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1358 // the first and last byte.
1359 // Handle the 3 byte case by loading and storing base + count/2
1360 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
1362 // byte 3 times.
1363 __ lsr(count, count, 1);
1364 __ ldrb(t0, Address(s, 0));
1365 __ ldrb(t1, Address(send, -1));
1366 __ ldrb(t2, Address(s, count));
1367 __ strb(t0, Address(d, 0));
1368 __ strb(t1, Address(dend, -1));
1369 __ strb(t2, Address(d, count));
1370 }
1371 __ b(finish);
1372 }
1373 }
1374
1375 __ bind(copy_big);
1376 if (is_backwards) {
1377 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1378 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1379 }
1380
1381 // Now we've got the small case out of the way we can align the
1382 // source address on a 2-word boundary.
1383
1384 // Here we will materialize a count in r15, which is used by copy_memory_small
1385 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1386 // Up until here, we have used t9, which aliases r15, but from here on, that register
1387 // can not be used as a temp register, as it contains the count.
1388
1389 Label aligned;
1390
1391 if (is_aligned) {
1392 // We may have to adjust by 1 word to get s 2-word-aligned.
1393 __ tbz(s, exact_log2(wordSize), aligned);
1394 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1395 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1396 __ sub(count, count, wordSize/granularity);
1397 } else {
1398 if (is_backwards) {
1399 __ andr(r15, s, 2 * wordSize - 1);
1400 } else {
1401 __ neg(r15, s);
1402 __ andr(r15, r15, 2 * wordSize - 1);
1403 }
1404 // r15 is the byte adjustment needed to align s.
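      // e.g. a forwards byte copy whose source ends in ...1001 gives
      // r15 == 7: seven bytes are copied by copy_memory_small below, count
      // drops by 7 and s becomes 16-byte aligned.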
1405 __ cbz(r15, aligned);
1406 int shift = exact_log2(granularity);
1407 if (shift > 0) {
1408 __ lsr(r15, r15, shift);
1409 }
1410 __ sub(count, count, r15);
1411
1412 #if 0
1413 // ?? This code is only correct for a disjoint copy. It may or
1414 // may not make sense to use it in that case.
1415
1416 // Copy the first pair; s and d may not be aligned.
1417 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1418 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1419
1420 // Align s and d, adjust count
1421 if (is_backwards) {
1422 __ sub(s, s, r15);
1423 __ sub(d, d, r15);
1424 } else {
1425 __ add(s, s, r15);
1426 __ add(d, d, r15);
1427 }
1428 #else
1429 copy_memory_small(decorators, type, s, d, r15, step);
1430 #endif
1431 }
1432
1433 __ bind(aligned);
1434
1435 // s is now 2-word-aligned.
1436
1437 // We have a count of units and some trailing bytes. Adjust the
1438 // count and do a bulk copy of words. If the shift is zero
1439 // perform a move instead to benefit from zero latency moves.
1440 int shift = exact_log2(wordSize/granularity);
1441 if (shift > 0) {
1442 __ lsr(r15, count, shift);
1443 } else {
1444 __ mov(r15, count);
1445 }
1446 if (direction == copy_forwards) {
1447 if (type != T_OBJECT) {
1448 __ bl(StubRoutines::aarch64::copy_byte_f());
1449 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1450 __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
1451 } else {
1452 __ bl(StubRoutines::aarch64::copy_oop_f());
1453 }
1454 } else {
1455 if (type != T_OBJECT) {
1456 __ bl(StubRoutines::aarch64::copy_byte_b());
1457 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1458 __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
1459 } else {
1460 __ bl(StubRoutines::aarch64::copy_oop_b());
1461 }
1462 }
1463
1464 // And the tail.
1465 copy_memory_small(decorators, type, s, d, count, step);
1466
1467 if (granularity >= 8) __ bind(copy8);
1468 if (granularity >= 4) __ bind(copy4);
1469 __ bind(finish);
1470 }
1471
1472
1473 void clobber_registers() {
1474 #ifdef ASSERT
1475 RegSet clobbered
1476 = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1477 __ mov(rscratch1, (uint64_t)0xdeadbeef);
1478 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
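    // rscratch1 now holds the pattern 0xdeadbeefdeadbeef, which is then
    // copied into every other call-clobbered register.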
1479 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1480 __ mov(*it, rscratch1);
1481 }
1482 #endif
1483
1484 }
1485
1486 // Scan over array at a for count oops, verifying each one.
1487 // Preserves a and count, clobbers rscratch1 and rscratch2.
1488 void verify_oop_array (int size, Register a, Register count, Register temp) {
1489 Label loop, end;
1490 __ mov(rscratch1, a);
1491 __ mov(rscratch2, zr);
1492 __ bind(loop);
1493 __ cmp(rscratch2, count);
1494 __ br(Assembler::HS, end);
1495 if (size == wordSize) {
1496 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1497 __ verify_oop(temp);
1498 } else {
1499 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1500 __ decode_heap_oop(temp); // calls verify_oop
1501 }
1502 __ add(rscratch2, rscratch2, 1);
1503 __ b(loop);
1504 __ bind(end);
1505 }
1506
1507 // Arguments:
1508 // stub_id - is used to name the stub and identify all details of
1509 // how to perform the copy.
1510 //
  // nopush_entry - is assigned to the stub's post push entry point unless
1512 // it is null
1513 //
1514 // Inputs:
1515 // c_rarg0 - source array address
1516 // c_rarg1 - destination array address
1517 // c_rarg2 - element count, treated as ssize_t, can be zero
1518 //
1519 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1520 // the hardware handle it. The two dwords within qwords that span
1521 // cache line boundaries will still be loaded and stored atomically.
1522 //
1523 // Side Effects: nopush_entry is set to the (post push) entry point
1524 // so it can be used by the corresponding conjoint
1525 // copy method
1526 //
1527 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
1528 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1529 RegSet saved_reg = RegSet::of(s, d, count);
1530 int size;
1531 bool aligned;
1532 bool is_oop;
1533 bool dest_uninitialized;
1534 switch (stub_id) {
1535 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1536 size = sizeof(jbyte);
1537 aligned = false;
1538 is_oop = false;
1539 dest_uninitialized = false;
1540 break;
1541 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1542 size = sizeof(jbyte);
1543 aligned = true;
1544 is_oop = false;
1545 dest_uninitialized = false;
1546 break;
1547 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1548 size = sizeof(jshort);
1549 aligned = false;
1550 is_oop = false;
1551 dest_uninitialized = false;
1552 break;
1553 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1554 size = sizeof(jshort);
1555 aligned = true;
1556 is_oop = false;
1557 dest_uninitialized = false;
1558 break;
1559 case StubId::stubgen_jint_disjoint_arraycopy_id:
1560 size = sizeof(jint);
1561 aligned = false;
1562 is_oop = false;
1563 dest_uninitialized = false;
1564 break;
1565 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1566 size = sizeof(jint);
1567 aligned = true;
1568 is_oop = false;
1569 dest_uninitialized = false;
1570 break;
1571 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1572 // since this is always aligned we can (should!) use the same
1573 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
1574 ShouldNotReachHere();
1575 break;
1576 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1577 size = sizeof(jlong);
1578 aligned = true;
1579 is_oop = false;
1580 dest_uninitialized = false;
1581 break;
1582 case StubId::stubgen_oop_disjoint_arraycopy_id:
1583 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1584 aligned = !UseCompressedOops;
1585 is_oop = true;
1586 dest_uninitialized = false;
1587 break;
1588 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1589 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1590 aligned = !UseCompressedOops;
1591 is_oop = true;
1592 dest_uninitialized = false;
1593 break;
1594 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1595 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1596 aligned = !UseCompressedOops;
1597 is_oop = true;
1598 dest_uninitialized = true;
1599 break;
1600 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1601 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1602 aligned = !UseCompressedOops;
1603 is_oop = true;
1604 dest_uninitialized = true;
1605 break;
1606 default:
1607 ShouldNotReachHere();
1608 break;
1609 }
1610
1611 __ align(CodeEntryAlignment);
1612 StubCodeMark mark(this, stub_id);
1613 address start = __ pc();
1614 __ enter();
1615
1616 if (nopush_entry != nullptr) {
1617 *nopush_entry = __ pc();
1618 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1619 BLOCK_COMMENT("Entry:");
1620 }
1621
1622 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1623 if (dest_uninitialized) {
1624 decorators |= IS_DEST_UNINITIALIZED;
1625 }
1626 if (aligned) {
1627 decorators |= ARRAYCOPY_ALIGNED;
1628 }
1629
1630 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1631 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1632
1633 if (is_oop) {
1634 // save regs before copy_memory
1635 __ push(RegSet::of(d, count), sp);
1636 }
1637 {
1638 // UnsafeMemoryAccess page error: continue after unsafe access
1639 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1640 UnsafeMemoryAccessMark umam(this, add_entry, true);
1641 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1642 }
1643
1644 if (is_oop) {
1645 __ pop(RegSet::of(d, count), sp);
1646 if (VerifyOops)
1647 verify_oop_array(size, d, count, r16);
1648 }
1649
1650 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1651
1652 __ leave();
1653 __ mov(r0, zr); // return 0
1654 __ ret(lr);
1655 return start;
1656 }
1657
1658 // Arguments:
1659 // stub_id - is used to name the stub and identify all details of
1660 // how to perform the copy.
1661 //
  // nooverlap_target - identifies the (post push) entry for the
1663 // corresponding disjoint copy routine which can be
1664 // jumped to if the ranges do not actually overlap
1665 //
  // nopush_entry - is assigned to the stub's post push entry point unless
1667 // it is null
1668 //
1669 //
1670 // Inputs:
1671 // c_rarg0 - source array address
1672 // c_rarg1 - destination array address
1673 // c_rarg2 - element count, treated as ssize_t, can be zero
1674 //
1675 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1676 // the hardware handle it. The two dwords within qwords that span
1677 // cache line boundaries will still be loaded and stored atomically.
1678 //
1679 // Side Effects:
1680 // nopush_entry is set to the no-overlap entry point so it can be
1681 // used by some other conjoint copy method
1682 //
1683 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1684 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1685 RegSet saved_regs = RegSet::of(s, d, count);
1686 int size;
1687 bool aligned;
1688 bool is_oop;
1689 bool dest_uninitialized;
1690 switch (stub_id) {
1691 case StubId::stubgen_jbyte_arraycopy_id:
1692 size = sizeof(jbyte);
1693 aligned = false;
1694 is_oop = false;
1695 dest_uninitialized = false;
1696 break;
1697 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1698 size = sizeof(jbyte);
1699 aligned = true;
1700 is_oop = false;
1701 dest_uninitialized = false;
1702 break;
1703 case StubId::stubgen_jshort_arraycopy_id:
1704 size = sizeof(jshort);
1705 aligned = false;
1706 is_oop = false;
1707 dest_uninitialized = false;
1708 break;
1709 case StubId::stubgen_arrayof_jshort_arraycopy_id:
1710 size = sizeof(jshort);
1711 aligned = true;
1712 is_oop = false;
1713 dest_uninitialized = false;
1714 break;
1715 case StubId::stubgen_jint_arraycopy_id:
1716 size = sizeof(jint);
1717 aligned = false;
1718 is_oop = false;
1719 dest_uninitialized = false;
1720 break;
1721 case StubId::stubgen_arrayof_jint_arraycopy_id:
1722 size = sizeof(jint);
1723 aligned = true;
1724 is_oop = false;
1725 dest_uninitialized = false;
1726 break;
1727 case StubId::stubgen_jlong_arraycopy_id:
      // since this is always aligned we can (should!) use the same
      // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
1730 ShouldNotReachHere();
1731 break;
1732 case StubId::stubgen_arrayof_jlong_arraycopy_id:
1733 size = sizeof(jlong);
1734 aligned = true;
1735 is_oop = false;
1736 dest_uninitialized = false;
1737 break;
1738 case StubId::stubgen_oop_arraycopy_id:
1739 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1740 aligned = !UseCompressedOops;
1741 is_oop = true;
1742 dest_uninitialized = false;
1743 break;
1744 case StubId::stubgen_arrayof_oop_arraycopy_id:
1745 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1746 aligned = !UseCompressedOops;
1747 is_oop = true;
1748 dest_uninitialized = false;
1749 break;
1750 case StubId::stubgen_oop_arraycopy_uninit_id:
1751 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1752 aligned = !UseCompressedOops;
1753 is_oop = true;
1754 dest_uninitialized = true;
1755 break;
1756 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1757 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1758 aligned = !UseCompressedOops;
1759 is_oop = true;
1760 dest_uninitialized = true;
1761 break;
1762 default:
1763 ShouldNotReachHere();
1764 }
1765
1766 StubCodeMark mark(this, stub_id);
1767 address start = __ pc();
1768 __ enter();
1769
1770 if (nopush_entry != nullptr) {
1771 *nopush_entry = __ pc();
1772 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1773 BLOCK_COMMENT("Entry:");
1774 }
1775
1776 // use fwd copy when (d-s) above_equal (count*size)
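    // A rough C-style sketch of the test below (illustrative only, not
    // generated code; s, d, count and size mirror the registers and the
    // element size selected above):
    //
    //   if ((uintptr_t)(d - s) >= ((uintptr_t)count << exact_log2(size))) {
    //     return disjoint_copy(s, d, count);   // forward copy is safe
    //   }
    //   // otherwise the ranges overlap and we must copy backwards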
1777 Label L_overlapping;
1778 __ sub(rscratch1, d, s);
1779 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1780 __ br(Assembler::LO, L_overlapping);
1781 __ b(RuntimeAddress(nooverlap_target));
1782 __ bind(L_overlapping);
1783
1784 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1785 if (dest_uninitialized) {
1786 decorators |= IS_DEST_UNINITIALIZED;
1787 }
1788 if (aligned) {
1789 decorators |= ARRAYCOPY_ALIGNED;
1790 }
1791
1792 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1793 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1794
1795 if (is_oop) {
1796 // save regs before copy_memory
1797 __ push(RegSet::of(d, count), sp);
1798 }
1799 {
1800 // UnsafeMemoryAccess page error: continue after unsafe access
1801 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1802 UnsafeMemoryAccessMark umam(this, add_entry, true);
1803 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1804 }
1805 if (is_oop) {
1806 __ pop(RegSet::of(d, count), sp);
1807 if (VerifyOops)
1808 verify_oop_array(size, d, count, r16);
1809 }
1810 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1811 __ leave();
1812 __ mov(r0, zr); // return 0
1813 __ ret(lr);
1814 return start;
1815 }
1816
1817 // Helper for generating a dynamic type check.
1818 // Smashes rscratch1, rscratch2.
1819 void generate_type_check(Register sub_klass,
1820 Register super_check_offset,
1821 Register super_klass,
1822 Register temp1,
1823 Register temp2,
1824 Register result,
1825 Label& L_success) {
1826 assert_different_registers(sub_klass, super_check_offset, super_klass);
1827
1828 BLOCK_COMMENT("type_check:");
1829
1830 Label L_miss;
1831
1832 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
1833 super_check_offset);
1834 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
1835
1836 // Fall through on failure!
1837 __ BIND(L_miss);
1838 }
1839
1840 //
1841 // Generate checkcasting array copy stub
1842 //
1843 // Input:
1844 // c_rarg0 - source array address
1845 // c_rarg1 - destination array address
1846 // c_rarg2 - element count, treated as ssize_t, can be zero
1847 // c_rarg3 - size_t ckoff (super_check_offset)
1848 // c_rarg4 - oop ckval (super_klass)
1849 //
1850 // Output:
1851 // r0 == 0 - success
1852 // r0 == -1^K - failure, where K is partial transfer count
1853 //
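  // The failure encoding can be decoded by the caller roughly as follows
  // (illustrative sketch only; 'result' is the value returned in r0):
  //
  //   if (result == 0) {
  //     // all elements were copied
  //   } else {
  //     int copied = ~result;   // == -1 ^ result, elements already copied
  //     // the caller handles the remaining elements on a slower path
  //   }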
1854 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
1855 bool dest_uninitialized;
1856 switch (stub_id) {
1857 case StubId::stubgen_checkcast_arraycopy_id:
1858 dest_uninitialized = false;
1859 break;
1860 case StubId::stubgen_checkcast_arraycopy_uninit_id:
1861 dest_uninitialized = true;
1862 break;
1863 default:
1864 ShouldNotReachHere();
1865 }
1866
1867 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1868
1869 // Input registers (after setup_arg_regs)
1870 const Register from = c_rarg0; // source array address
1871 const Register to = c_rarg1; // destination array address
    const Register count      = c_rarg2;   // elements count
1873 const Register ckoff = c_rarg3; // super_check_offset
1874 const Register ckval = c_rarg4; // super_klass
1875
1876 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1877
1878 // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1879 const Register copied_oop = r22; // actual oop copied
    const Register count_save = r21;       // original elements count
1881 const Register start_to = r20; // destination array start address
1882 const Register r19_klass = r19; // oop._klass
1883
1884 // Registers used as gc temps (r5, r6, r7 are save-on-call)
1885 const Register gct1 = r5, gct2 = r6, gct3 = r7;
1886
1887 //---------------------------------------------------------------
1888 // Assembler stub will be used for this call to arraycopy
1889 // if the two arrays are subtypes of Object[] but the
1890 // destination array type is not equal to or a supertype
1891 // of the source type. Each element must be separately
1892 // checked.
1893
1894 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1895 copied_oop, r19_klass, count_save);
1896
1897 __ align(CodeEntryAlignment);
1898 StubCodeMark mark(this, stub_id);
1899 address start = __ pc();
1900
1901 __ enter(); // required for proper stackwalking of RuntimeStub frame
1902
1903 #ifdef ASSERT
1904 // caller guarantees that the arrays really are different
1905 // otherwise, we would have to make conjoint checks
1906 { Label L;
1907 __ b(L); // conjoint check not yet implemented
1908 __ stop("checkcast_copy within a single array");
1909 __ bind(L);
1910 }
1911 #endif //ASSERT
1912
1913 // Caller of this entry point must set up the argument registers.
1914 if (nopush_entry != nullptr) {
1915 *nopush_entry = __ pc();
1916 BLOCK_COMMENT("Entry:");
1917 }
1918
1919 // Empty array: Nothing to do.
1920 __ cbz(count, L_done);
1921 __ push(RegSet::of(r19, r20, r21, r22), sp);
1922
1923 #ifdef ASSERT
1924 BLOCK_COMMENT("assert consistent ckoff/ckval");
1925 // The ckoff and ckval must be mutually consistent,
1926 // even though caller generates both.
1927 { Label L;
1928 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1929 __ ldrw(start_to, Address(ckval, sco_offset));
1930 __ cmpw(ckoff, start_to);
1931 __ br(Assembler::EQ, L);
1932 __ stop("super_check_offset inconsistent");
1933 __ bind(L);
1934 }
1935 #endif //ASSERT
1936
1937 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1938 bool is_oop = true;
1939 int element_size = UseCompressedOops ? 4 : 8;
1940 if (dest_uninitialized) {
1941 decorators |= IS_DEST_UNINITIALIZED;
1942 }
1943
1944 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1945 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1946
1947 // save the original count
1948 __ mov(count_save, count);
1949
1950 // Copy from low to high addresses
1951 __ mov(start_to, to); // Save destination array start address
1952 __ b(L_load_element);
1953
1954 // ======== begin loop ========
1955 // (Loop is rotated; its entry is L_load_element.)
1956 // Loop control:
1957 // for (; count != 0; count--) {
1958 // copied_oop = load_heap_oop(from++);
1959 // ... generate_type_check ...;
1960 // store_heap_oop(to++, copied_oop);
1961 // }
1962 __ align(OptoLoopAlignment);
1963
1964 __ BIND(L_store_element);
1965 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1966 __ post(to, element_size), copied_oop, noreg,
1967 gct1, gct2, gct3);
1968 __ sub(count, count, 1);
1969 __ cbz(count, L_do_card_marks);
1970
1971 // ======== loop entry is here ========
1972 __ BIND(L_load_element);
1973 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1974 copied_oop, noreg, __ post(from, element_size),
1975 gct1);
1976 __ cbz(copied_oop, L_store_element);
1977
1978 __ load_klass(r19_klass, copied_oop);// query the object klass
1979
1980 BLOCK_COMMENT("type_check:");
1981 generate_type_check(/*sub_klass*/r19_klass,
1982 /*super_check_offset*/ckoff,
1983 /*super_klass*/ckval,
1984 /*r_array_base*/gct1,
1985 /*temp2*/gct2,
1986 /*result*/r10, L_store_element);
1987
1988 // Fall through on failure!
1989
1990 // ======== end loop ========
1991
1992 // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
1994 // Emit GC store barriers for the oops we have copied and report
1995 // their number to the caller.
1996
1997 __ subs(count, count_save, count); // K = partially copied oop count
1998 __ eon(count, count, zr); // report (-1^K) to caller
1999 __ br(Assembler::EQ, L_done_pop);
2000
2001 __ BIND(L_do_card_marks);
2002 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
2003
2004 __ bind(L_done_pop);
2005 __ pop(RegSet::of(r19, r20, r21, r22), sp);
2006 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2007
2008 __ bind(L_done);
2009 __ mov(r0, count);
2010 __ leave();
2011 __ ret(lr);
2012
2013 return start;
2014 }
2015
2016 // Perform range checks on the proposed arraycopy.
2017 // Kills temp, but nothing else.
2018 // Also, clean the sign bits of src_pos and dst_pos.
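  // Roughly equivalent C logic (illustrative sketch only; the w-form
  // instructions below perform the comparisons in 32-bit unsigned
  // arithmetic):
  //
  //   if ((uint32_t)(src_pos + length) > (uint32_t)src->length()) goto L_failed;
  //   if ((uint32_t)(dst_pos + length) > (uint32_t)dst->length()) goto L_failed;
  //   src_pos = (uint32_t)src_pos;   // zero-extend, clearing the high 32 bits
  //   dst_pos = (uint32_t)dst_pos;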
2019 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2020 Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
2022 Register dst_pos, // destination position (c_rarg3)
2023 Register length,
2024 Register temp,
2025 Label& L_failed) {
2026 BLOCK_COMMENT("arraycopy_range_checks:");
2027
2028 assert_different_registers(rscratch1, temp);
2029
2030 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2031 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2032 __ addw(temp, length, src_pos);
2033 __ cmpw(temp, rscratch1);
2034 __ br(Assembler::HI, L_failed);
2035
2036 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2037 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2038 __ addw(temp, length, dst_pos);
2039 __ cmpw(temp, rscratch1);
2040 __ br(Assembler::HI, L_failed);
2041
2042 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2043 __ movw(src_pos, src_pos);
2044 __ movw(dst_pos, dst_pos);
2045
2046 BLOCK_COMMENT("arraycopy_range_checks done");
2047 }
2048
2049 // These stubs get called from some dumb test routine.
2050 // I'll write them properly when they're called from
2051 // something that's actually doing something.
2052 static void fake_arraycopy_stub(address src, address dst, int count) {
2053 assert(count == 0, "huh?");
2054 }
2055
2056
2057 //
2058 // Generate 'unsafe' array copy stub
2059 // Though just as safe as the other stubs, it takes an unscaled
2060 // size_t argument instead of an element count.
2061 //
2062 // Input:
2063 // c_rarg0 - source array address
2064 // c_rarg1 - destination array address
2065 // c_rarg2 - byte count, treated as ssize_t, can be zero
2066 //
2067 // Examines the alignment of the operands and dispatches
2068 // to a long, int, short, or byte copy loop.
2069 //
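  // The alignment dispatch below is roughly (illustrative sketch only):
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if ((bits & (BytesPerLong - 1)) == 0) goto long_copy;   // all 8-byte aligned
  //   if ((bits & (BytesPerInt - 1)) == 0)  goto int_copy;    // all 4-byte aligned
  //   if ((bits & 1) == 0)                  goto short_copy;  // all 2-byte aligned
  //   goto byte_copy;
  //
  // with the byte count scaled down to an element count before each jump.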
2070 address generate_unsafe_copy(address byte_copy_entry,
2071 address short_copy_entry,
2072 address int_copy_entry,
2073 address long_copy_entry) {
2074 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2075
2076 Label L_long_aligned, L_int_aligned, L_short_aligned;
2077 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2078
2079 __ align(CodeEntryAlignment);
2080 StubCodeMark mark(this, stub_id);
2081 address start = __ pc();
2082 __ enter(); // required for proper stackwalking of RuntimeStub frame
2083
2084 // bump this on entry, not on exit:
2085 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2086
2087 __ orr(rscratch1, s, d);
2088 __ orr(rscratch1, rscratch1, count);
2089
2090 __ andr(rscratch1, rscratch1, BytesPerLong-1);
2091 __ cbz(rscratch1, L_long_aligned);
2092 __ andr(rscratch1, rscratch1, BytesPerInt-1);
2093 __ cbz(rscratch1, L_int_aligned);
2094 __ tbz(rscratch1, 0, L_short_aligned);
2095 __ b(RuntimeAddress(byte_copy_entry));
2096
2097 __ BIND(L_short_aligned);
2098 __ lsr(count, count, LogBytesPerShort); // size => short_count
2099 __ b(RuntimeAddress(short_copy_entry));
2100 __ BIND(L_int_aligned);
2101 __ lsr(count, count, LogBytesPerInt); // size => int_count
2102 __ b(RuntimeAddress(int_copy_entry));
2103 __ BIND(L_long_aligned);
2104 __ lsr(count, count, LogBytesPerLong); // size => long_count
2105 __ b(RuntimeAddress(long_copy_entry));
2106
2107 return start;
2108 }
2109
2110 //
2111 // Generate generic array copy stubs
2112 //
2113 // Input:
2114 // c_rarg0 - src oop
2115 // c_rarg1 - src_pos (32-bits)
2116 // c_rarg2 - dst oop
2117 // c_rarg3 - dst_pos (32-bits)
2118 // c_rarg4 - element count (32-bits)
2119 //
2120 // Output:
2121 // r0 == 0 - success
2122 // r0 == -1^K - failure, where K is partial transfer count
2123 //
2124 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2125 address int_copy_entry, address oop_copy_entry,
2126 address long_copy_entry, address checkcast_copy_entry) {
2127 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2128
2129 Label L_failed, L_objArray;
2130 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2131
2132 // Input registers
2133 const Register src = c_rarg0; // source array oop
2134 const Register src_pos = c_rarg1; // source position
2135 const Register dst = c_rarg2; // destination array oop
2136 const Register dst_pos = c_rarg3; // destination position
2137 const Register length = c_rarg4;
2138
2139
2140 // Registers used as temps
2141 const Register dst_klass = c_rarg5;
2142
2143 __ align(CodeEntryAlignment);
2144
2145 StubCodeMark mark(this, stub_id);
2146
2147 address start = __ pc();
2148
2149 __ enter(); // required for proper stackwalking of RuntimeStub frame
2150
2151 // bump this on entry, not on exit:
2152 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2153
2154 //-----------------------------------------------------------------------
2155 // Assembler stub will be used for this call to arraycopy
2156 // if the following conditions are met:
2157 //
2158 // (1) src and dst must not be null.
2159 // (2) src_pos must not be negative.
2160 // (3) dst_pos must not be negative.
2161 // (4) length must not be negative.
2162 // (5) src klass and dst klass should be the same and not null.
2163 // (6) src and dst should be arrays.
2164 // (7) src_pos + length must not exceed length of src.
2165 // (8) dst_pos + length must not exceed length of dst.
2166 //
2167
2168 // if (src == nullptr) return -1;
2169 __ cbz(src, L_failed);
2170
2171 // if (src_pos < 0) return -1;
2172 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2173
2174 // if (dst == nullptr) return -1;
2175 __ cbz(dst, L_failed);
2176
2177 // if (dst_pos < 0) return -1;
2178 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2179
2180 // registers used as temp
2181 const Register scratch_length = r16; // elements count to copy
2182 const Register scratch_src_klass = r17; // array klass
2183 const Register lh = r15; // layout helper
2184
2185 // if (length < 0) return -1;
2186 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2187 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2188
2189 __ load_klass(scratch_src_klass, src);
2190 #ifdef ASSERT
2191 // assert(src->klass() != nullptr);
2192 {
2193 BLOCK_COMMENT("assert klasses not null {");
2194 Label L1, L2;
2195 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2196 __ bind(L1);
2197 __ stop("broken null klass");
2198 __ bind(L2);
2199 __ load_klass(rscratch1, dst);
2200 __ cbz(rscratch1, L1); // this would be broken also
2201 BLOCK_COMMENT("} assert klasses not null done");
2202 }
2203 #endif
2204
2205 // Load layout helper (32-bits)
2206 //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
2209 //
2210 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2211 //
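    // The fields used below can be decoded roughly as follows
    // (illustrative sketch only, using the Klass::_lh_* constants that
    // appear in the code):
    //
    //   int tag         = lh >> Klass::_lh_array_tag_shift;       // 0x3 typeArray, 0x2 objArray
    //   int hdr_size    = (lh >> Klass::_lh_header_size_shift)
    //                     & Klass::_lh_header_size_mask;          // element base offset in bytes
    //   int log2_elsize = lh & Klass::_lh_log2_element_size_mask; // 0..3 for primitive arrays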
2212
2213 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2214
2215 // Handle objArrays completely differently...
2216 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2217 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2218 __ movw(rscratch1, objArray_lh);
2219 __ eorw(rscratch2, lh, rscratch1);
2220 __ cbzw(rscratch2, L_objArray);
2221
2222 // if (src->klass() != dst->klass()) return -1;
2223 __ load_klass(rscratch2, dst);
2224 __ eor(rscratch2, rscratch2, scratch_src_klass);
2225 __ cbnz(rscratch2, L_failed);
2226
2227 // if (!src->is_Array()) return -1;
2228 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2229
2230 // At this point, it is known to be a typeArray (array_tag 0x3).
2231 #ifdef ASSERT
2232 {
2233 BLOCK_COMMENT("assert primitive array {");
2234 Label L;
2235 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2236 __ cmpw(lh, rscratch2);
2237 __ br(Assembler::GE, L);
2238 __ stop("must be a primitive array");
2239 __ bind(L);
2240 BLOCK_COMMENT("} assert primitive array done");
2241 }
2242 #endif
2243
2244 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2245 rscratch2, L_failed);
2246
2247 // TypeArrayKlass
2248 //
2249 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2250 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2251 //
2252
2253 const Register rscratch1_offset = rscratch1; // array offset
2254 const Register r15_elsize = lh; // element size
2255
2256 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2257 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2258 __ add(src, src, rscratch1_offset); // src array offset
2259 __ add(dst, dst, rscratch1_offset); // dst array offset
2260 BLOCK_COMMENT("choose copy loop based on element size");
2261
2262 // next registers should be set before the jump to corresponding stub
2263 const Register from = c_rarg0; // source array address
2264 const Register to = c_rarg1; // destination array address
2265 const Register count = c_rarg2; // elements count
2266
2267 // 'from', 'to', 'count' registers should be set in such order
2268 // since they are the same as 'src', 'src_pos', 'dst'.
2269
2270 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2271
2272 // The possible values of elsize are 0-3, i.e. exact_log2(element
2273 // size in bytes). We do a simple bitwise binary search.
2274 __ BIND(L_copy_bytes);
2275 __ tbnz(r15_elsize, 1, L_copy_ints);
2276 __ tbnz(r15_elsize, 0, L_copy_shorts);
2277 __ lea(from, Address(src, src_pos));// src_addr
2278 __ lea(to, Address(dst, dst_pos));// dst_addr
2279 __ movw(count, scratch_length); // length
2280 __ b(RuntimeAddress(byte_copy_entry));
2281
2282 __ BIND(L_copy_shorts);
2283 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2284 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2285 __ movw(count, scratch_length); // length
2286 __ b(RuntimeAddress(short_copy_entry));
2287
2288 __ BIND(L_copy_ints);
2289 __ tbnz(r15_elsize, 0, L_copy_longs);
2290 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2291 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2292 __ movw(count, scratch_length); // length
2293 __ b(RuntimeAddress(int_copy_entry));
2294
2295 __ BIND(L_copy_longs);
2296 #ifdef ASSERT
2297 {
2298 BLOCK_COMMENT("assert long copy {");
2299 Label L;
2300 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2301 __ cmpw(r15_elsize, LogBytesPerLong);
2302 __ br(Assembler::EQ, L);
2303 __ stop("must be long copy, but elsize is wrong");
2304 __ bind(L);
2305 BLOCK_COMMENT("} assert long copy done");
2306 }
2307 #endif
2308 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2309 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2310 __ movw(count, scratch_length); // length
2311 __ b(RuntimeAddress(long_copy_entry));
2312
2313 // ObjArrayKlass
2314 __ BIND(L_objArray);
2315 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2316
2317 Label L_plain_copy, L_checkcast_copy;
2318 // test array classes for subtyping
2319 __ load_klass(r15, dst);
2320 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2321 __ br(Assembler::NE, L_checkcast_copy);
2322
2323 // Identically typed arrays can be copied without element-wise checks.
2324 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2325 rscratch2, L_failed);
2326
2327 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2328 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2329 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2330 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2331 __ movw(count, scratch_length); // length
2332 __ BIND(L_plain_copy);
2333 __ b(RuntimeAddress(oop_copy_entry));
2334
2335 __ BIND(L_checkcast_copy);
2336 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2337 {
2338 // Before looking at dst.length, make sure dst is also an objArray.
2339 __ ldrw(rscratch1, Address(r15, lh_offset));
2340 __ movw(rscratch2, objArray_lh);
2341 __ eorw(rscratch1, rscratch1, rscratch2);
2342 __ cbnzw(rscratch1, L_failed);
2343
2344 // It is safe to examine both src.length and dst.length.
2345 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2346 r15, L_failed);
2347
2348 __ load_klass(dst_klass, dst); // reload
2349
2350 // Marshal the base address arguments now, freeing registers.
2351 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2352 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2353 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2354 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2355 __ movw(count, length); // length (reloaded)
2356 Register sco_temp = c_rarg3; // this register is free now
2357 assert_different_registers(from, to, count, sco_temp,
2358 dst_klass, scratch_src_klass);
2359 // assert_clean_int(count, sco_temp);
2360
2361 // Generate the type check.
2362 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2363 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2364
2365 // Smashes rscratch1, rscratch2
2366 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2367 L_plain_copy);
2368
2369 // Fetch destination element klass from the ObjArrayKlass header.
2370 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2371 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2372 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2373
2374 // the checkcast_copy loop needs two extra arguments:
2375 assert(c_rarg3 == sco_temp, "#3 already in place");
2376 // Set up arguments for checkcast_copy_entry.
2377 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2378 __ b(RuntimeAddress(checkcast_copy_entry));
2379 }
2380
2381 __ BIND(L_failed);
2382 __ mov(r0, -1);
2383 __ leave(); // required for proper stackwalking of RuntimeStub frame
2384 __ ret(lr);
2385
2386 return start;
2387 }
2388
2389 //
2390 // Generate stub for array fill. If "aligned" is true, the
2391 // "to" address is assumed to be heapword aligned.
2392 //
2393 // Arguments for generated stub:
2394 // to: c_rarg0
2395 // value: c_rarg1
2396 // count: c_rarg2 treated as signed
2397 //
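  // Overall strategy (illustrative sketch only):
  //   1. widen 'value' so the pattern repeats across a 64-bit word,
  //      which is what the bfi sequences below achieve, e.g. for bytes:
  //
  //        uint64_t v = value & 0xff;
  //        v |= v << 8;  v |= v << 16;  v |= v << 32;
  //
  //   2. for very short arrays (< 8 bytes) fill element by element;
  //   3. otherwise align 'to' to 8 bytes, fill whole 64-bit words
  //      (using zero_words() when the value is zero and UseBlockZeroing
  //      is enabled), and finish the sub-8-byte tail with one
  //      overlapping 64-bit store.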
2398 address generate_fill(StubId stub_id) {
2399 BasicType t;
2400 bool aligned;
2401
2402 switch (stub_id) {
2403 case StubId::stubgen_jbyte_fill_id:
2404 t = T_BYTE;
2405 aligned = false;
2406 break;
2407 case StubId::stubgen_jshort_fill_id:
2408 t = T_SHORT;
2409 aligned = false;
2410 break;
2411 case StubId::stubgen_jint_fill_id:
2412 t = T_INT;
2413 aligned = false;
2414 break;
2415 case StubId::stubgen_arrayof_jbyte_fill_id:
2416 t = T_BYTE;
2417 aligned = true;
2418 break;
2419 case StubId::stubgen_arrayof_jshort_fill_id:
2420 t = T_SHORT;
2421 aligned = true;
2422 break;
2423 case StubId::stubgen_arrayof_jint_fill_id:
2424 t = T_INT;
2425 aligned = true;
2426 break;
2427 default:
2428 ShouldNotReachHere();
2429 };
2430
2431 __ align(CodeEntryAlignment);
2432 StubCodeMark mark(this, stub_id);
2433 address start = __ pc();
2434
2435 BLOCK_COMMENT("Entry:");
2436
2437 const Register to = c_rarg0; // source array address
2438 const Register value = c_rarg1; // value
2439 const Register count = c_rarg2; // elements count
2440
2441 const Register bz_base = r10; // base for block_zero routine
2442 const Register cnt_words = r11; // temp register
2443
2444 __ enter();
2445
2446 Label L_fill_elements, L_exit1;
2447
2448 int shift = -1;
2449 switch (t) {
2450 case T_BYTE:
2451 shift = 0;
2452 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2453 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2454 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2455 __ br(Assembler::LO, L_fill_elements);
2456 break;
2457 case T_SHORT:
2458 shift = 1;
2459 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2460 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2461 __ br(Assembler::LO, L_fill_elements);
2462 break;
2463 case T_INT:
2464 shift = 2;
2465 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2466 __ br(Assembler::LO, L_fill_elements);
2467 break;
2468 default: ShouldNotReachHere();
2469 }
2470
2471 // Align source address at 8 bytes address boundary.
2472 Label L_skip_align1, L_skip_align2, L_skip_align4;
2473 if (!aligned) {
2474 switch (t) {
2475 case T_BYTE:
2476 // One byte misalignment happens only for byte arrays.
2477 __ tbz(to, 0, L_skip_align1);
2478 __ strb(value, Address(__ post(to, 1)));
2479 __ subw(count, count, 1);
2480 __ bind(L_skip_align1);
2481 // Fallthrough
2482 case T_SHORT:
2483 // Two bytes misalignment happens only for byte and short (char) arrays.
2484 __ tbz(to, 1, L_skip_align2);
2485 __ strh(value, Address(__ post(to, 2)));
2486 __ subw(count, count, 2 >> shift);
2487 __ bind(L_skip_align2);
2488 // Fallthrough
2489 case T_INT:
2490 // Align to 8 bytes, we know we are 4 byte aligned to start.
2491 __ tbz(to, 2, L_skip_align4);
2492 __ strw(value, Address(__ post(to, 4)));
2493 __ subw(count, count, 4 >> shift);
2494 __ bind(L_skip_align4);
2495 break;
2496 default: ShouldNotReachHere();
2497 }
2498 }
2499
2500 //
2501 // Fill large chunks
2502 //
2503 __ lsrw(cnt_words, count, 3 - shift); // number of words
2504 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2505 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2506 if (UseBlockZeroing) {
2507 Label non_block_zeroing, rest;
2508 // If the fill value is zero we can use the fast zero_words().
2509 __ cbnz(value, non_block_zeroing);
2510 __ mov(bz_base, to);
2511 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2512 address tpc = __ zero_words(bz_base, cnt_words);
2513 if (tpc == nullptr) {
2514 fatal("CodeCache is full at generate_fill");
2515 }
2516 __ b(rest);
2517 __ bind(non_block_zeroing);
2518 __ fill_words(to, cnt_words, value);
2519 __ bind(rest);
2520 } else {
2521 __ fill_words(to, cnt_words, value);
2522 }
2523
2524 // Remaining count is less than 8 bytes. Fill it by a single store.
2525 // Note that the total length is no less than 8 bytes.
2526 if (t == T_BYTE || t == T_SHORT) {
2527 Label L_exit1;
2528 __ cbzw(count, L_exit1);
2529 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2530 __ str(value, Address(to, -8)); // overwrite some elements
2531 __ bind(L_exit1);
2532 __ leave();
2533 __ ret(lr);
2534 }
2535
    // Handle fills of less than 8 bytes.
2537 Label L_fill_2, L_fill_4, L_exit2;
2538 __ bind(L_fill_elements);
2539 switch (t) {
2540 case T_BYTE:
2541 __ tbz(count, 0, L_fill_2);
2542 __ strb(value, Address(__ post(to, 1)));
2543 __ bind(L_fill_2);
2544 __ tbz(count, 1, L_fill_4);
2545 __ strh(value, Address(__ post(to, 2)));
2546 __ bind(L_fill_4);
2547 __ tbz(count, 2, L_exit2);
2548 __ strw(value, Address(to));
2549 break;
2550 case T_SHORT:
2551 __ tbz(count, 0, L_fill_4);
2552 __ strh(value, Address(__ post(to, 2)));
2553 __ bind(L_fill_4);
2554 __ tbz(count, 1, L_exit2);
2555 __ strw(value, Address(to));
2556 break;
2557 case T_INT:
2558 __ cbzw(count, L_exit2);
2559 __ strw(value, Address(to));
2560 break;
2561 default: ShouldNotReachHere();
2562 }
2563 __ bind(L_exit2);
2564 __ leave();
2565 __ ret(lr);
2566 return start;
2567 }
2568
2569 address generate_unsafecopy_common_error_exit() {
2570 address start_pc = __ pc();
2571 __ leave();
2572 __ mov(r0, 0);
2573 __ ret(lr);
2574 return start_pc;
2575 }
2576
2577 //
2578 // Generate 'unsafe' set memory stub
2579 // Though just as safe as the other stubs, it takes an unscaled
2580 // size_t (# bytes) argument instead of an element count.
2581 //
2582 // This fill operation is atomicity preserving: as long as the
2583 // address supplied is sufficiently aligned, all writes of up to 64
2584 // bits in size are single-copy atomic.
2585 //
2586 // Input:
2587 // c_rarg0 - destination array address
2588 // c_rarg1 - byte count (size_t)
2589 // c_rarg2 - byte value
2590 //
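  // After the main 64-byte loop below, the remaining byte count (mod 64)
  // is handled one power of two at a time by testing individual bits of
  // 'count', roughly (illustrative sketch only):
  //
  //   if (count & 32) store 32 bytes;
  //   if (count & 16) store 16 bytes;
  //   if (count & 8)  store 8 bytes;
  //   if (count & 4)  store 4 bytes;
  //   if (count & 2)  store 2 bytes;
  //   if (count & 1)  store 1 byte;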
2591 address generate_unsafe_setmemory() {
2592 __ align(CodeEntryAlignment);
2593 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
2594 address start = __ pc();
2595
2596 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
2597 Label tail;
2598
2599 UnsafeMemoryAccessMark umam(this, true, false);
2600
2601 __ enter(); // required for proper stackwalking of RuntimeStub frame
2602
2603 __ dup(v0, __ T16B, value);
2604
2605 if (AvoidUnalignedAccesses) {
2606 __ cmp(count, (u1)16);
2607 __ br(__ LO, tail);
2608
2609 __ mov(rscratch1, 16);
2610 __ andr(rscratch2, dest, 15);
2611 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
2612 __ strq(v0, Address(dest));
2613 __ sub(count, count, rscratch1);
2614 __ add(dest, dest, rscratch1);
2615 }
2616
2617 __ subs(count, count, (u1)64);
2618 __ br(__ LO, tail);
2619 {
2620 Label again;
2621 __ bind(again);
2622 __ stpq(v0, v0, Address(dest));
2623 __ stpq(v0, v0, Address(dest, 32));
2624
2625 __ subs(count, count, 64);
2626 __ add(dest, dest, 64);
2627 __ br(__ HS, again);
2628 }
2629
2630 __ bind(tail);
2631 // The count of bytes is off by 64, but we don't need to correct
2632 // it because we're only going to use the least-significant few
2633 // count bits from here on.
2634 // __ add(count, count, 64);
2635
2636 {
2637 Label dont;
2638 __ tbz(count, exact_log2(32), dont);
2639 __ stpq(v0, v0, __ post(dest, 32));
2640 __ bind(dont);
2641 }
2642 {
2643 Label dont;
2644 __ tbz(count, exact_log2(16), dont);
2645 __ strq(v0, __ post(dest, 16));
2646 __ bind(dont);
2647 }
2648 {
2649 Label dont;
2650 __ tbz(count, exact_log2(8), dont);
2651 __ strd(v0, __ post(dest, 8));
2652 __ bind(dont);
2653 }
2654
2655 Label finished;
2656 __ tst(count, 7);
2657 __ br(__ EQ, finished);
2658
2659 {
2660 Label dont;
2661 __ tbz(count, exact_log2(4), dont);
2662 __ strs(v0, __ post(dest, 4));
2663 __ bind(dont);
2664 }
2665 {
2666 Label dont;
2667 __ tbz(count, exact_log2(2), dont);
2668 __ bfi(value, value, 8, 8);
2669 __ strh(value, __ post(dest, 2));
2670 __ bind(dont);
2671 }
2672 {
2673 Label dont;
2674 __ tbz(count, exact_log2(1), dont);
2675 __ strb(value, Address(dest));
2676 __ bind(dont);
2677 }
2678
2679 __ bind(finished);
2680 __ leave();
2681 __ ret(lr);
2682
2683 return start;
2684 }
2685
2686 address generate_data_cache_writeback() {
2687 const Register line = c_rarg0; // address of line to write back
2688
2689 __ align(CodeEntryAlignment);
2690
2691 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
2692 StubCodeMark mark(this, stub_id);
2693
2694 address start = __ pc();
2695 __ enter();
2696 __ cache_wb(Address(line, 0));
2697 __ leave();
2698 __ ret(lr);
2699
2700 return start;
2701 }
2702
2703 address generate_data_cache_writeback_sync() {
2704 const Register is_pre = c_rarg0; // pre or post sync
2705
2706 __ align(CodeEntryAlignment);
2707
2708 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
2709 StubCodeMark mark(this, stub_id);
2710
2711 // pre wbsync is a no-op
2712 // post wbsync translates to an sfence
2713
2714 Label skip;
2715 address start = __ pc();
2716 __ enter();
2717 __ cbnz(is_pre, skip);
2718 __ cache_wbsync(false);
2719 __ bind(skip);
2720 __ leave();
2721 __ ret(lr);
2722
2723 return start;
2724 }
2725
2726 void generate_arraycopy_stubs() {
2727 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
2728 // entry immediately following their stack push. This can be used
2729 // as a post-push branch target for compatible stubs when they
2730 // identify a special case that can be handled by the fallback
    // stub e.g. a disjoint copy stub may be used as a special case
2732 // fallback for its compatible conjoint copy stub.
2733 //
    // A nopush entry is always returned in the following local and
2735 // then published by assigning to the appropriate entry field in
2736 // class StubRoutines. The entry value is then passed to the
2737 // generator for the compatible stub. That means the entry must be
2738 // listed when saving to/restoring from the AOT cache, ensuring
2739 // that the inter-stub jumps are noted at AOT-cache save and
2740 // relocated at AOT cache load.
2741 address nopush_entry;
2742
2743 // generate the common exit first so later stubs can rely on it if
2744 // they want an UnsafeMemoryAccess exit non-local to the stub
2745 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2746 // register the stub as the default exit with class UnsafeMemoryAccess
2747 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2748
    // generate and publish aarch64-specific bulk copy routines first
2750 // so we can call them from other copy stubs
2751 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2752 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2753
2754 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2755 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2756
2757 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2758 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2759
2760 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2761
2762 //*** jbyte
2763 // Always need aligned and unaligned versions
2764 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2765 // disjoint nopush entry is needed by conjoint copy
2766 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2767 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2768 // conjoint nopush entry is needed by generic/unsafe copy
2769 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2770 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2771 // disjoint arrayof nopush entry is needed by conjoint copy
2772 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2773 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2774
2775 //*** jshort
2776 // Always need aligned and unaligned versions
2777 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2778 // disjoint nopush entry is needed by conjoint copy
2779 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
2780 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
2781 // conjoint nopush entry is used by generic/unsafe copy
2782 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
2783 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
2784 // disjoint arrayof nopush entry is needed by conjoint copy
2785 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
2786 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
2787
2788 //*** jint
2789 // Aligned versions
2790 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
2791 // disjoint arrayof nopush entry is needed by conjoint copy
2792 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
2793 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
2794 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2795 // jint_arraycopy_nopush always points to the unaligned version
2796 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
2797 // disjoint nopush entry is needed by conjoint copy
2798 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
2799 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
2800 // conjoint nopush entry is needed by generic/unsafe copy
2801 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
2802
2803 //*** jlong
2804 // It is always aligned
2805 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
2806 // disjoint arrayof nopush entry is needed by conjoint copy
2807 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
2808 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
2809 // conjoint nopush entry is needed by generic/unsafe copy
2810 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
2811 // disjoint normal/nopush and conjoint normal entries are not
2812 // generated since the arrayof versions are the same
2813 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2814 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
2815 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
2816
2817 //*** oops
2818 {
2819 StubRoutines::_arrayof_oop_disjoint_arraycopy
2820 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
2821 // disjoint arrayof nopush entry is needed by conjoint copy
2822 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
2823 StubRoutines::_arrayof_oop_arraycopy
2824 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
2825 // conjoint arrayof nopush entry is needed by generic/unsafe copy
2826 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
2827 // Aligned versions without pre-barriers
2828 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2829 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
2830 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
2831 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
2832 // note that we don't need a returned nopush entry because the
2833 // generic/unsafe copy does not cater for uninit arrays.
2834 StubRoutines::_arrayof_oop_arraycopy_uninit
2835 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
2836 }
2837
2838 // for oop copies reuse arrayof entries for non-arrayof cases
2839 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2840 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
2841 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
2842 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2843 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
2844 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
2845
2846 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
2847 // checkcast nopush entry is needed by generic copy
2848 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
2849 // note that we don't need a returned nopush entry because the
2850 // generic copy does not cater for uninit arrays.
2851 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
2852
2853 // unsafe arraycopy may fallback on conjoint stubs
2854 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
2855 StubRoutines::_jshort_arraycopy_nopush,
2856 StubRoutines::_jint_arraycopy_nopush,
2857 StubRoutines::_jlong_arraycopy_nopush);
2858
2859 // generic arraycopy may fallback on conjoint stubs
2860 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
2861 StubRoutines::_jshort_arraycopy_nopush,
2862 StubRoutines::_jint_arraycopy_nopush,
2863 StubRoutines::_oop_arraycopy_nopush,
2864 StubRoutines::_jlong_arraycopy_nopush,
2865 StubRoutines::_checkcast_arraycopy_nopush);
2866
2867 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
2868 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
2869 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
2870 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
2871 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
2872 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
2873 }
2874
2875 void generate_math_stubs() { Unimplemented(); }
2876
2877 // Arguments:
2878 //
2879 // Inputs:
2880 // c_rarg0 - source byte array address
2881 // c_rarg1 - destination byte array address
2882 // c_rarg2 - sessionKe (key) in little endian int array
2883 //
2884 address generate_aescrypt_encryptBlock() {
2885 __ align(CodeEntryAlignment);
2886 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2887 StubCodeMark mark(this, stub_id);
2888
2889 const Register from = c_rarg0; // source array address
2890 const Register to = c_rarg1; // destination array address
2891 const Register key = c_rarg2; // key array address
2892 const Register keylen = rscratch1;
2893
2894 address start = __ pc();
2895 __ enter();
2896
2897 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2898
2899 __ aesenc_loadkeys(key, keylen);
2900 __ aesecb_encrypt(from, to, keylen);
2901
2902 __ mov(r0, 0);
2903
2904 __ leave();
2905 __ ret(lr);
2906
2907 return start;
2908 }
2909
2910 // Arguments:
2911 //
2912 // Inputs:
2913 // c_rarg0 - source byte array address
2914 // c_rarg1 - destination byte array address
2915 // c_rarg2 - sessionKd (key) in little endian int array
2916 //
2917 address generate_aescrypt_decryptBlock() {
2918 assert(UseAES, "need AES cryptographic extension support");
2919 __ align(CodeEntryAlignment);
2920 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2921 StubCodeMark mark(this, stub_id);
2922 Label L_doLast;
2923
2924 const Register from = c_rarg0; // source array address
2925 const Register to = c_rarg1; // destination array address
2926 const Register key = c_rarg2; // key array address
2927 const Register keylen = rscratch1;
2928
2929 address start = __ pc();
2930 __ enter(); // required for proper stackwalking of RuntimeStub frame
2931
2932 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2933
2934 __ aesecb_decrypt(from, to, key, keylen);
2935
2936 __ mov(r0, 0);
2937
2938 __ leave();
2939 __ ret(lr);
2940
2941 return start;
2942 }
2943
2944 // Arguments:
2945 //
2946 // Inputs:
2947 // c_rarg0 - source byte array address
2948 // c_rarg1 - destination byte array address
2949 // c_rarg2 - sessionKe (key) in little endian int array
2950 // c_rarg3 - r vector byte array address
2951 // c_rarg4 - input length
2952 //
2953 // Output:
2954 // x0 - input length
2955 //
2956 address generate_cipherBlockChaining_encryptAESCrypt() {
2957 assert(UseAES, "need AES cryptographic extension support");
2958 __ align(CodeEntryAlignment);
2959 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
2960 StubCodeMark mark(this, stub_id);
2961
2962 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2963
2964 const Register from = c_rarg0; // source array address
2965 const Register to = c_rarg1; // destination array address
2966 const Register key = c_rarg2; // key array address
2967 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
2968 // and left with the results of the last encryption block
2969 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
2970 const Register keylen = rscratch1;
2971
2972 address start = __ pc();
2973
2974 __ enter();
2975
2976 __ movw(rscratch2, len_reg);
2977
2978 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2979
2980 __ ld1(v0, __ T16B, rvec);
2981
2982 __ cmpw(keylen, 52);
2983 __ br(Assembler::CC, L_loadkeys_44);
2984 __ br(Assembler::EQ, L_loadkeys_52);
2985
2986 __ ld1(v17, v18, __ T16B, __ post(key, 32));
2987 __ rev32(v17, __ T16B, v17);
2988 __ rev32(v18, __ T16B, v18);
2989 __ BIND(L_loadkeys_52);
2990 __ ld1(v19, v20, __ T16B, __ post(key, 32));
2991 __ rev32(v19, __ T16B, v19);
2992 __ rev32(v20, __ T16B, v20);
2993 __ BIND(L_loadkeys_44);
2994 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2995 __ rev32(v21, __ T16B, v21);
2996 __ rev32(v22, __ T16B, v22);
2997 __ rev32(v23, __ T16B, v23);
2998 __ rev32(v24, __ T16B, v24);
2999 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3000 __ rev32(v25, __ T16B, v25);
3001 __ rev32(v26, __ T16B, v26);
3002 __ rev32(v27, __ T16B, v27);
3003 __ rev32(v28, __ T16B, v28);
3004 __ ld1(v29, v30, v31, __ T16B, key);
3005 __ rev32(v29, __ T16B, v29);
3006 __ rev32(v30, __ T16B, v30);
3007 __ rev32(v31, __ T16B, v31);
3008
3009 __ BIND(L_aes_loop);
3010 __ ld1(v1, __ T16B, __ post(from, 16));
3011 __ eor(v0, __ T16B, v0, v1);
3012
3013 __ br(Assembler::CC, L_rounds_44);
3014 __ br(Assembler::EQ, L_rounds_52);
3015
3016 __ aese(v0, v17); __ aesmc(v0, v0);
3017 __ aese(v0, v18); __ aesmc(v0, v0);
3018 __ BIND(L_rounds_52);
3019 __ aese(v0, v19); __ aesmc(v0, v0);
3020 __ aese(v0, v20); __ aesmc(v0, v0);
3021 __ BIND(L_rounds_44);
3022 __ aese(v0, v21); __ aesmc(v0, v0);
3023 __ aese(v0, v22); __ aesmc(v0, v0);
3024 __ aese(v0, v23); __ aesmc(v0, v0);
3025 __ aese(v0, v24); __ aesmc(v0, v0);
3026 __ aese(v0, v25); __ aesmc(v0, v0);
3027 __ aese(v0, v26); __ aesmc(v0, v0);
3028 __ aese(v0, v27); __ aesmc(v0, v0);
3029 __ aese(v0, v28); __ aesmc(v0, v0);
3030 __ aese(v0, v29); __ aesmc(v0, v0);
3031 __ aese(v0, v30);
3032 __ eor(v0, __ T16B, v0, v31);
3033
3034 __ st1(v0, __ T16B, __ post(to, 16));
3035
3036 __ subw(len_reg, len_reg, 16);
3037 __ cbnzw(len_reg, L_aes_loop);
3038
3039 __ st1(v0, __ T16B, rvec);
3040
3041 __ mov(r0, rscratch2);
3042
3043 __ leave();
3044 __ ret(lr);
3045
3046 return start;
3047 }
3048
3049 // Arguments:
3050 //
3051 // Inputs:
3052 // c_rarg0 - source byte array address
3053 // c_rarg1 - destination byte array address
3054 // c_rarg2 - sessionKd (key) in little endian int array
3055 // c_rarg3 - r vector byte array address
3056 // c_rarg4 - input length
3057 //
3058 // Output:
3059 // r0 - input length
3060 //
3061 address generate_cipherBlockChaining_decryptAESCrypt() {
3062 assert(UseAES, "need AES cryptographic extension support");
3063 __ align(CodeEntryAlignment);
3064 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
3065 StubCodeMark mark(this, stub_id);
3066
3067 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3068
3069 const Register from = c_rarg0; // source array address
3070 const Register to = c_rarg1; // destination array address
3071 const Register key = c_rarg2; // key array address
3072 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3073 // and left with the results of the last encryption block
3074 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3075 const Register keylen = rscratch1;
3076
3077 address start = __ pc();
3078
3079 __ enter();
3080
3081 __ movw(rscratch2, len_reg);
3082
3083 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3084
3085 __ ld1(v2, __ T16B, rvec);
3086
3087 __ ld1(v31, __ T16B, __ post(key, 16));
3088 __ rev32(v31, __ T16B, v31);
3089
3090 __ cmpw(keylen, 52);
3091 __ br(Assembler::CC, L_loadkeys_44);
3092 __ br(Assembler::EQ, L_loadkeys_52);
3093
3094 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3095 __ rev32(v17, __ T16B, v17);
3096 __ rev32(v18, __ T16B, v18);
3097 __ BIND(L_loadkeys_52);
3098 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3099 __ rev32(v19, __ T16B, v19);
3100 __ rev32(v20, __ T16B, v20);
3101 __ BIND(L_loadkeys_44);
3102 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3103 __ rev32(v21, __ T16B, v21);
3104 __ rev32(v22, __ T16B, v22);
3105 __ rev32(v23, __ T16B, v23);
3106 __ rev32(v24, __ T16B, v24);
3107 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3108 __ rev32(v25, __ T16B, v25);
3109 __ rev32(v26, __ T16B, v26);
3110 __ rev32(v27, __ T16B, v27);
3111 __ rev32(v28, __ T16B, v28);
3112 __ ld1(v29, v30, __ T16B, key);
3113 __ rev32(v29, __ T16B, v29);
3114 __ rev32(v30, __ T16B, v30);
3115
3116 __ BIND(L_aes_loop);
3117 __ ld1(v0, __ T16B, __ post(from, 16));
3118 __ orr(v1, __ T16B, v0, v0);
3119
3120 __ br(Assembler::CC, L_rounds_44);
3121 __ br(Assembler::EQ, L_rounds_52);
3122
3123 __ aesd(v0, v17); __ aesimc(v0, v0);
3124 __ aesd(v0, v18); __ aesimc(v0, v0);
3125 __ BIND(L_rounds_52);
3126 __ aesd(v0, v19); __ aesimc(v0, v0);
3127 __ aesd(v0, v20); __ aesimc(v0, v0);
3128 __ BIND(L_rounds_44);
3129 __ aesd(v0, v21); __ aesimc(v0, v0);
3130 __ aesd(v0, v22); __ aesimc(v0, v0);
3131 __ aesd(v0, v23); __ aesimc(v0, v0);
3132 __ aesd(v0, v24); __ aesimc(v0, v0);
3133 __ aesd(v0, v25); __ aesimc(v0, v0);
3134 __ aesd(v0, v26); __ aesimc(v0, v0);
3135 __ aesd(v0, v27); __ aesimc(v0, v0);
3136 __ aesd(v0, v28); __ aesimc(v0, v0);
3137 __ aesd(v0, v29); __ aesimc(v0, v0);
3138 __ aesd(v0, v30);
3139 __ eor(v0, __ T16B, v0, v31);
3140 __ eor(v0, __ T16B, v0, v2);
3141
3142 __ st1(v0, __ T16B, __ post(to, 16));
3143 __ orr(v2, __ T16B, v1, v1);
3144
3145 __ subw(len_reg, len_reg, 16);
3146 __ cbnzw(len_reg, L_aes_loop);
3147
3148 __ st1(v2, __ T16B, rvec);
3149
3150 __ mov(r0, rscratch2);
3151
3152 __ leave();
3153 __ ret(lr);
3154
3155 return start;
3156 }
3157
3158 // Big-endian 128-bit + 64-bit -> 128-bit addition.
3159 // Inputs: 128-bits. in is preserved.
3160 // The least-significant 64-bit word is in the upper dword of each vector.
3161 // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
3162 // Output: result
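  // Treating the counter as a 128-bit big-endian integer split into two
  // 64-bit halves, the operation is roughly (illustrative sketch only):
  //
  //   uint64_t lo = in.lo + inc;           // add to the least-significant half
  //   uint64_t hi = in.hi + (lo < inc);    // propagate the carry on overflow
  //   result = { hi, lo };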
3163 void be_add_128_64(FloatRegister result, FloatRegister in,
3164 FloatRegister inc, FloatRegister tmp) {
3165 assert_different_registers(result, tmp, inc);
3166
3167 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
3168 // input
3169 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
3170 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
3171 // MSD == 0 (must be!) to LSD
3172 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
3173 }
3174
3175 // CTR AES crypt.
3176 // Arguments:
3177 //
3178 // Inputs:
3179 // c_rarg0 - source byte array address
3180 // c_rarg1 - destination byte array address
3181 // c_rarg2 - sessionKe (key) in little endian int array
3182 // c_rarg3 - counter vector byte array address
3183 // c_rarg4 - input length
3184 // c_rarg5 - saved encryptedCounter start
3185 // c_rarg6 - saved used length
3186 //
3187 // Output:
3188 // r0 - input length
3189 //
3190 address generate_counterMode_AESCrypt() {
3191 const Register in = c_rarg0;
3192 const Register out = c_rarg1;
3193 const Register key = c_rarg2;
3194 const Register counter = c_rarg3;
3195 const Register saved_len = c_rarg4, len = r10;
3196 const Register saved_encrypted_ctr = c_rarg5;
3197 const Register used_ptr = c_rarg6, used = r12;
3198
3199 const Register offset = r7;
3200 const Register keylen = r11;
3201
3202 const unsigned char block_size = 16;
3203 const int bulk_width = 4;
3204 // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3205 // performance with larger data sizes, but it also means that the
3206 // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be processed on the slow path. For
3208 // that reason, and also so as not to blow away too much icache, 4
3209 // blocks seems like a sensible compromise.
3210
3211 // Algorithm:
3212 //
3213 // if (len == 0) {
3214 // goto DONE;
3215 // }
3216 // int result = len;
3217 // do {
3218 // if (used >= blockSize) {
3219 // if (len >= bulk_width * blockSize) {
3220 // CTR_large_block();
3221 // if (len == 0)
3222 // goto DONE;
3223 // }
3224 // for (;;) {
3225 // 16ByteVector v0 = counter;
3226 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3227 // used = 0;
3228 // if (len < blockSize)
3229 // break; /* goto NEXT */
3230 // 16ByteVector v1 = load16Bytes(in, offset);
3231 // v1 = v1 ^ encryptedCounter;
3232 // store16Bytes(v1, out, offset);
3233 // used = blockSize;
3234 // offset += blockSize;
3235 // len -= blockSize;
3236 // if (len == 0)
3237 // goto DONE;
3238 // }
3239 // }
3240 // NEXT:
3241 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3242 // len--;
3243 // } while (len != 0);
3244 // DONE:
3245 // return result;
3246 //
3247 // CTR_large_block()
3248 // Wide bulk encryption of whole blocks.
3249
3250 __ align(CodeEntryAlignment);
3251 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
3252 StubCodeMark mark(this, stub_id);
3253 const address start = __ pc();
3254 __ enter();
3255
3256 Label DONE, CTR_large_block, large_block_return;
3257 __ ldrw(used, Address(used_ptr));
3258 __ cbzw(saved_len, DONE);
3259
3260 __ mov(len, saved_len);
3261 __ mov(offset, 0);
3262
3263 // Compute #rounds for AES based on the length of the key array
3264 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3265
3266 __ aesenc_loadkeys(key, keylen);
3267
3268 {
3269 Label L_CTR_loop, NEXT;
3270
3271 __ bind(L_CTR_loop);
3272
3273 __ cmp(used, block_size);
3274 __ br(__ LO, NEXT);
3275
3276 // Maybe we have a lot of data
3277 __ subsw(rscratch1, len, bulk_width * block_size);
3278 __ br(__ HS, CTR_large_block);
3279 __ BIND(large_block_return);
3280 __ cbzw(len, DONE);
3281
3282 // Setup the counter
3283 __ movi(v4, __ T4S, 0);
3284 __ movi(v5, __ T4S, 1);
3285 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3286
3287 // 128-bit big-endian increment
3288 __ ld1(v0, __ T16B, counter);
3289 __ rev64(v16, __ T16B, v0);
3290 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3291 __ rev64(v16, __ T16B, v16);
3292 __ st1(v16, __ T16B, counter);
3293 // Previous counter value is in v0
3294 // v4 contains { 0, 1 }
3295
3296 {
3297 // We have fewer than bulk_width blocks of data left. Encrypt
3298 // them one by one until there is less than a full block
3299 // remaining, being careful to save both the encrypted counter
3300 // and the counter.
3301
3302 Label inner_loop;
3303 __ bind(inner_loop);
3304 // Counter to encrypt is in v0
3305 __ aesecb_encrypt(noreg, noreg, keylen);
3306 __ st1(v0, __ T16B, saved_encrypted_ctr);
3307
3308 // Do we have a remaining full block?
3309
3310 __ mov(used, 0);
3311 __ cmp(len, block_size);
3312 __ br(__ LO, NEXT);
3313
3314 // Yes, we have a full block
3315 __ ldrq(v1, Address(in, offset));
3316 __ eor(v1, __ T16B, v1, v0);
3317 __ strq(v1, Address(out, offset));
3318 __ mov(used, block_size);
3319 __ add(offset, offset, block_size);
3320
3321 __ subw(len, len, block_size);
3322 __ cbzw(len, DONE);
3323
3324 // Increment the counter, store it back
3325 __ orr(v0, __ T16B, v16, v16);
3326 __ rev64(v16, __ T16B, v16);
3327 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3328 __ rev64(v16, __ T16B, v16);
3329 __ st1(v16, __ T16B, counter); // Save the incremented counter back
3330
3331 __ b(inner_loop);
3332 }
3333
3334 __ BIND(NEXT);
3335
3336 // Encrypt a single byte, and loop.
3337 // We expect this to be a rare event.
3338 __ ldrb(rscratch1, Address(in, offset));
3339 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3340 __ eor(rscratch1, rscratch1, rscratch2);
3341 __ strb(rscratch1, Address(out, offset));
3342 __ add(offset, offset, 1);
3343 __ add(used, used, 1);
3344 __ subw(len, len, 1);
3345 __ cbnzw(len, L_CTR_loop);
3346 }
3347
3348 __ bind(DONE);
3349 __ strw(used, Address(used_ptr));
3350 __ mov(r0, saved_len);
3351
3352 __ leave(); // required for proper stackwalking of RuntimeStub frame
3353 __ ret(lr);
3354
3355 // Bulk encryption
3356
3357 __ BIND (CTR_large_block);
3358 assert(bulk_width == 4 || bulk_width == 8, "must be");
3359
3360 if (bulk_width == 8) {
3361 __ sub(sp, sp, 4 * 16);
3362 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3363 }
3364 __ sub(sp, sp, 4 * 16);
3365 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3366 RegSet saved_regs = (RegSet::of(in, out, offset)
3367 + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3368 __ push(saved_regs, sp);
3369 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption
3370 __ add(in, in, offset);
3371 __ add(out, out, offset);
3372
3373 // Keys should already be loaded into the correct registers
3374
3375 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3376 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3377
3378 // AES/CTR loop
3379 {
3380 Label L_CTR_loop;
3381 __ BIND(L_CTR_loop);
3382
3383 // Setup the counters
3384 __ movi(v8, __ T4S, 0);
3385 __ movi(v9, __ T4S, 1);
3386 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3387
3388 for (int i = 0; i < bulk_width; i++) {
3389 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3390 __ rev64(v0_ofs, __ T16B, v16);
3391 be_add_128_64(v16, v16, v8, /*tmp*/v9);
3392 }
3393
3394 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3395
3396 // Encrypt the counters
3397 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3398
3399 if (bulk_width == 8) {
3400 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3401 }
3402
3403 // XOR the encrypted counters with the inputs
3404 for (int i = 0; i < bulk_width; i++) {
3405 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3406 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3407 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3408 }
3409
3410 // Write the encrypted data
3411 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3412 if (bulk_width == 8) {
3413 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3414 }
3415
3416 __ subw(len, len, 16 * bulk_width);
3417 __ cbnzw(len, L_CTR_loop);
3418 }
3419
3420 // Save the counter back where it goes
3421 __ rev64(v16, __ T16B, v16);
3422 __ st1(v16, __ T16B, counter);
3423
3424 __ pop(saved_regs, sp);
3425
3426 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3427 if (bulk_width == 8) {
3428 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3429 }
3430
3431 __ andr(rscratch1, len, -16 * bulk_width);
3432 __ sub(len, len, rscratch1);
3433 __ add(offset, offset, rscratch1);
3434 __ mov(used, 16);
3435 __ strw(used, Address(used_ptr));
3436 __ b(large_block_return);
3437
3438 return start;
3439 }
3440
3441 // Vector AES Galois Counter Mode implementation. Parameters:
3442 //
3443 // in = c_rarg0
3444 // len = c_rarg1
3445 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3446 // out = c_rarg3
3447 // key = c_rarg4
3448 // state = c_rarg5 - GHASH.state
3449 // subkeyHtbl = c_rarg6 - powers of H
3450 // counter = c_rarg7 - 16 bytes of CTR
3451 // return - number of processed bytes
3452 address generate_galoisCounterMode_AESCrypt() {
3453 Label ghash_polynomial; // local data generated after code
3454
3455 __ align(CodeEntryAlignment);
3456 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
3457 StubCodeMark mark(this, stub_id);
3458 address start = __ pc();
3459 __ enter();
3460
3461 const Register in = c_rarg0;
3462 const Register len = c_rarg1;
3463 const Register ct = c_rarg2;
3464 const Register out = c_rarg3;
3465 // and updated with the incremented counter in the end
3466
3467 const Register key = c_rarg4;
3468 const Register state = c_rarg5;
3469
3470 const Register subkeyHtbl = c_rarg6;
3471
3472 const Register counter = c_rarg7;
3473
3474 const Register keylen = r10;
3475 // Save state before entering routine
3476 __ sub(sp, sp, 4 * 16);
3477 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3478 __ sub(sp, sp, 4 * 16);
3479 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3480
3482 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
3483 __ str(len, __ pre(sp, -2 * wordSize));
3484
3485 Label DONE;
3486 __ cbz(len, DONE);
3487
3488 // Compute #rounds for AES based on the length of the key array
3489 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3490
3491 __ aesenc_loadkeys(key, keylen);
3492 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3493 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3494
3495 // AES/CTR loop
3496 {
3497 Label L_CTR_loop;
3498 __ BIND(L_CTR_loop);
3499
3500 // Setup the counters
3501 __ movi(v8, __ T4S, 0);
3502 __ movi(v9, __ T4S, 1);
3503 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3504
3505 assert(v0->encoding() < v8->encoding(), "");
3506 for (int i = v0->encoding(); i < v8->encoding(); i++) {
3507 FloatRegister f = as_FloatRegister(i);
3508 __ rev32(f, __ T16B, v16);
3509 __ addv(v16, __ T4S, v16, v8);
3510 }
3511
3512 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3513
3514 // Encrypt the counters
3515 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3516
3517 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3518
3519 // XOR the encrypted counters with the inputs
3520 for (int i = 0; i < 8; i++) {
3521 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3522 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3523 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3524 }
3525 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3526 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3527
3528 __ subw(len, len, 16 * 8);
3529 __ cbnzw(len, L_CTR_loop);
3530 }
3531
3532 __ rev32(v16, __ T16B, v16);
3533 __ st1(v16, __ T16B, counter);
3534
3535 __ ldr(len, Address(sp));
3536 __ lsr(len, len, exact_log2(16)); // We want the count of blocks
3537
3538 // GHASH/CTR loop
3539 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3540 len, /*unrolls*/4);
3541
3542 #ifdef ASSERT
3543 { Label L;
3544 __ cmp(len, (unsigned char)0);
3545 __ br(Assembler::EQ, L);
3546 __ stop("stubGenerator: abort");
3547 __ bind(L);
3548 }
3549 #endif
3550
3551 __ bind(DONE);
3552 // Return the number of bytes processed
3553 __ ldr(r0, __ post(sp, 2 * wordSize));
3554
3555 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3556 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3557
3558 __ leave(); // required for proper stackwalking of RuntimeStub frame
3559 __ ret(lr);
3560
3561 // bind label and generate polynomial data
3562 __ align(wordSize * 2);
3563 __ bind(ghash_polynomial);
3564 __ emit_int64(0x87); // The low-order bits of the field
3565 // polynomial (i.e. p = z^7+z^2+z+1)
3566 // repeated in the low and high parts of a
3567 // 128-bit vector
3568 __ emit_int64(0x87);
3569
3570 return start;
3571 }
3572
3573 class Cached64Bytes {
3574 private:
3575 MacroAssembler *_masm;
3576 Register _regs[8];
3577
3578 public:
3579 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3580 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3581 auto it = rs.begin();
3582 for (auto &r: _regs) {
3583 r = *it;
3584 ++it;
3585 }
3586 }
3587
3588 void gen_loads(Register base) {
3589 for (int i = 0; i < 8; i += 2) {
3590 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3591 }
3592 }
3593
3594 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3595 void extract_u32(Register dest, int i) {
3596 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3597 }
3598 };
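  // Illustrative example: after gen_loads(), word i of the 64 cached bytes
  // lives in register _regs[i / 2], in its low half when i is even and its
  // high half when i is odd. So extract_u32(dest, 5) emits
  //
  //   ubfx dest, _regs[2], #32, #32
  //
  // i.e. it picks the upper 32 bits of the third cached register.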
3599
3600 // Utility routines for md5.
3601 // Clobbers r10 and r11.
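  // For reference, the four MD5 auxiliary functions from RFC 1321 that the
  // helpers below implement (the instruction scheduling uses equivalent
  // foldings, e.g. F is computed as ((c ^ d) & b) ^ d):
  //
  //   F(b, c, d) = (b & c) | (~b & d)
  //   G(b, c, d) = (b & d) | (c & ~d)
  //   H(b, c, d) = b ^ c ^ d
  //   I(b, c, d) = c ^ (b | ~d)
  //
  // Each step computes a = b + rol(a + Fn(b, c, d) + x[k] + t, s), with x[k]
  // taken from the cached 64-byte block.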
3602 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3603 int k, int s, int t) {
3604 Register rscratch3 = r10;
3605 Register rscratch4 = r11;
3606
3607 __ eorw(rscratch3, r3, r4);
3608 __ movw(rscratch2, t);
3609 __ andw(rscratch3, rscratch3, r2);
3610 __ addw(rscratch4, r1, rscratch2);
3611 reg_cache.extract_u32(rscratch1, k);
3612 __ eorw(rscratch3, rscratch3, r4);
3613 __ addw(rscratch4, rscratch4, rscratch1);
3614 __ addw(rscratch3, rscratch3, rscratch4);
3615 __ rorw(rscratch2, rscratch3, 32 - s);
3616 __ addw(r1, rscratch2, r2);
3617 }
3618
3619 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3620 int k, int s, int t) {
3621 Register rscratch3 = r10;
3622 Register rscratch4 = r11;
3623
3624 reg_cache.extract_u32(rscratch1, k);
3625 __ movw(rscratch2, t);
3626 __ addw(rscratch4, r1, rscratch2);
3627 __ addw(rscratch4, rscratch4, rscratch1);
3628 __ bicw(rscratch2, r3, r4);
3629 __ andw(rscratch3, r2, r4);
3630 __ addw(rscratch2, rscratch2, rscratch4);
3631 __ addw(rscratch2, rscratch2, rscratch3);
3632 __ rorw(rscratch2, rscratch2, 32 - s);
3633 __ addw(r1, rscratch2, r2);
3634 }
3635
3636 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3637 int k, int s, int t) {
3638 Register rscratch3 = r10;
3639 Register rscratch4 = r11;
3640
3641 __ eorw(rscratch3, r3, r4);
3642 __ movw(rscratch2, t);
3643 __ addw(rscratch4, r1, rscratch2);
3644 reg_cache.extract_u32(rscratch1, k);
3645 __ eorw(rscratch3, rscratch3, r2);
3646 __ addw(rscratch4, rscratch4, rscratch1);
3647 __ addw(rscratch3, rscratch3, rscratch4);
3648 __ rorw(rscratch2, rscratch3, 32 - s);
3649 __ addw(r1, rscratch2, r2);
3650 }
3651
3652 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3653 int k, int s, int t) {
3654 Register rscratch3 = r10;
3655 Register rscratch4 = r11;
3656
3657 __ movw(rscratch3, t);
3658 __ ornw(rscratch2, r2, r4);
3659 __ addw(rscratch4, r1, rscratch3);
3660 reg_cache.extract_u32(rscratch1, k);
3661 __ eorw(rscratch3, rscratch2, r3);
3662 __ addw(rscratch4, rscratch4, rscratch1);
3663 __ addw(rscratch3, rscratch3, rscratch4);
3664 __ rorw(rscratch2, rscratch3, 32 - s);
3665 __ addw(r1, rscratch2, r2);
3666 }
3667
3668 // Arguments:
3669 //
3670 // Inputs:
3671 // c_rarg0 - byte[] source+offset
3672 // c_rarg1 - int[] SHA.state
3673 // c_rarg2 - int offset
3674 // c_rarg3 - int limit
3675 //
3676 address generate_md5_implCompress(StubId stub_id) {
3677 bool multi_block;
3678 switch (stub_id) {
3679 case StubId::stubgen_md5_implCompress_id:
3680 multi_block = false;
3681 break;
3682 case StubId::stubgen_md5_implCompressMB_id:
3683 multi_block = true;
3684 break;
3685 default:
3686 ShouldNotReachHere();
3687 }
3688 __ align(CodeEntryAlignment);
3689
3690 StubCodeMark mark(this, stub_id);
3691 address start = __ pc();
3692
3693 Register buf = c_rarg0;
3694 Register state = c_rarg1;
3695 Register ofs = c_rarg2;
3696 Register limit = c_rarg3;
3697 Register a = r4;
3698 Register b = r5;
3699 Register c = r6;
3700 Register d = r7;
3701 Register rscratch3 = r10;
3702 Register rscratch4 = r11;
3703
3704 Register state_regs[2] = { r12, r13 };
3705 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3706 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
3707
3708 __ push(saved_regs, sp);
3709
3710 __ ldp(state_regs[0], state_regs[1], Address(state));
3711 __ ubfx(a, state_regs[0], 0, 32);
3712 __ ubfx(b, state_regs[0], 32, 32);
3713 __ ubfx(c, state_regs[1], 0, 32);
3714 __ ubfx(d, state_regs[1], 32, 32);
3715
3716 Label md5_loop;
3717 __ BIND(md5_loop);
3718
3719 reg_cache.gen_loads(buf);
3720
3721 // Round 1
3722 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
3723 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
3724 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
3725 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
3726 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
3727 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
3728 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
3729 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
3730 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
3731 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
3732 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3733 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3734 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
3735 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3736 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3737 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3738
3739 // Round 2
3740 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
3741 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
3742 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3743 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
3744 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
3745 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
3746 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3747 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
3748 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
3749 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
3750 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
3751 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
3752 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
3753 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
3754 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
3755 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3756
3757 // Round 3
3758 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
3759 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
3760 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3761 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3762 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
3763 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
3764 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
3765 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3766 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
3767 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
3768 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
3769 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
3770 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
3771 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3772 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3773 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
3774
3775 // Round 4
3776 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
3777 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
3778 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3779 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
3780 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
3781 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
3782 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3783 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
3784 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
3785 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3786 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
3787 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3788 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
3789 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3790 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
3791 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
3792
3793 __ addw(a, state_regs[0], a);
3794 __ ubfx(rscratch2, state_regs[0], 32, 32);
3795 __ addw(b, rscratch2, b);
3796 __ addw(c, state_regs[1], c);
3797 __ ubfx(rscratch4, state_regs[1], 32, 32);
3798 __ addw(d, rscratch4, d);
3799
3800 __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3801 __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3802
3803 if (multi_block) {
3804 __ add(buf, buf, 64);
3805 __ add(ofs, ofs, 64);
3806 __ cmp(ofs, limit);
3807 __ br(Assembler::LE, md5_loop);
3808 __ mov(c_rarg0, ofs); // return ofs
3809 }
3810
3811 // write hash values back in the correct order
3812 __ stp(state_regs[0], state_regs[1], Address(state));
3813
3814 __ pop(saved_regs, sp);
3815
3816 __ ret(lr);
3817
3818 return start;
3819 }
3820
3821 // Arguments:
3822 //
3823 // Inputs:
3824 // c_rarg0 - byte[] source+offset
3825 // c_rarg1 - int[] SHA.state
3826 // c_rarg2 - int offset
3827 // c_rarg3 - int limit
3828 //
3829 address generate_sha1_implCompress(StubId stub_id) {
3830 bool multi_block;
3831 switch (stub_id) {
3832 case StubId::stubgen_sha1_implCompress_id:
3833 multi_block = false;
3834 break;
3835 case StubId::stubgen_sha1_implCompressMB_id:
3836 multi_block = true;
3837 break;
3838 default:
3839 ShouldNotReachHere();
3840 }
3841
3842 __ align(CodeEntryAlignment);
3843
3844 StubCodeMark mark(this, stub_id);
3845 address start = __ pc();
3846
3847 Register buf = c_rarg0;
3848 Register state = c_rarg1;
3849 Register ofs = c_rarg2;
3850 Register limit = c_rarg3;
3851
3852 Label keys;
3853 Label sha1_loop;
3854
3855 // load the keys into v0..v3
3856 __ adr(rscratch1, keys);
3857 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3858 // load the 5-word state into v6, v7
3859 __ ldrq(v6, Address(state, 0));
3860 __ ldrs(v7, Address(state, 16));
3861
3862
3863 __ BIND(sha1_loop);
3864 // load 64 bytes of data into v16..v19
3865 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3866 __ rev32(v16, __ T16B, v16);
3867 __ rev32(v17, __ T16B, v17);
3868 __ rev32(v18, __ T16B, v18);
3869 __ rev32(v19, __ T16B, v19);
3870
3871 // do the sha1
3872 __ addv(v4, __ T4S, v16, v0);
3873 __ orr(v20, __ T16B, v6, v6);
3874
3875 FloatRegister d0 = v16;
3876 FloatRegister d1 = v17;
3877 FloatRegister d2 = v18;
3878 FloatRegister d3 = v19;
3879
3880 for (int round = 0; round < 20; round++) {
3881 FloatRegister tmp1 = (round & 1) ? v4 : v5;
3882 FloatRegister tmp2 = (round & 1) ? v21 : v22;
3883 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3884 FloatRegister tmp4 = (round & 1) ? v5 : v4;
3885 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3886
3887 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3888 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3889 __ sha1h(tmp2, __ T4S, v20);
3890 if (round < 5)
3891 __ sha1c(v20, __ T4S, tmp3, tmp4);
3892 else if (round < 10 || round >= 15)
3893 __ sha1p(v20, __ T4S, tmp3, tmp4);
3894 else
3895 __ sha1m(v20, __ T4S, tmp3, tmp4);
3896 if (round < 16) __ sha1su1(d0, __ T4S, d3);
3897
3898 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3899 }
3900
3901 __ addv(v7, __ T2S, v7, v21);
3902 __ addv(v6, __ T4S, v6, v20);
3903
3904 if (multi_block) {
3905 __ add(ofs, ofs, 64);
3906 __ cmp(ofs, limit);
3907 __ br(Assembler::LE, sha1_loop);
3908 __ mov(c_rarg0, ofs); // return ofs
3909 }
3910
3911 __ strq(v6, Address(state, 0));
3912 __ strs(v7, Address(state, 16));
3913
3914 __ ret(lr);
3915
3916 __ bind(keys);
3917 __ emit_int32(0x5a827999);
3918 __ emit_int32(0x6ed9eba1);
3919 __ emit_int32(0x8f1bbcdc);
3920 __ emit_int32(0xca62c1d6);
3921
3922 return start;
3923 }
3924
3925
3926 // Arguments:
3927 //
3928 // Inputs:
3929 // c_rarg0 - byte[] source+offset
3930 // c_rarg1 - int[] SHA.state
3931 // c_rarg2 - int offset
3932 // c_rarg3 - int limit
3933 //
3934 address generate_sha256_implCompress(StubId stub_id) {
3935 bool multi_block;
3936 switch (stub_id) {
3937 case StubId::stubgen_sha256_implCompress_id:
3938 multi_block = false;
3939 break;
3940 case StubId::stubgen_sha256_implCompressMB_id:
3941 multi_block = true;
3942 break;
3943 default:
3944 ShouldNotReachHere();
3945 }
3946
3947 static const uint32_t round_consts[64] = {
3948 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3949 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3950 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3951 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3952 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3953 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3954 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3955 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3956 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3957 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3958 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3959 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3960 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3961 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3962 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3963 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3964 };
3965
3966 __ align(CodeEntryAlignment);
3967
3968 StubCodeMark mark(this, stub_id);
3969 address start = __ pc();
3970
3971 Register buf = c_rarg0;
3972 Register state = c_rarg1;
3973 Register ofs = c_rarg2;
3974 Register limit = c_rarg3;
3975
3976 Label sha1_loop;
3977
3978 __ stpd(v8, v9, __ pre(sp, -32));
3979 __ stpd(v10, v11, Address(sp, 16));
3980
3981 // dga == v0
3982 // dgb == v1
3983 // dg0 == v2
3984 // dg1 == v3
3985 // dg2 == v4
3986 // t0 == v6
3987 // t1 == v7
3988
3989 // load 16 keys to v16..v31
3990 __ lea(rscratch1, ExternalAddress((address)round_consts));
3991 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3992 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3993 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3994 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3995
3996 // load 8 words (256 bits) state
3997 __ ldpq(v0, v1, state);
3998
3999 __ BIND(sha1_loop);
4000 // load 64 bytes of data into v8..v11
4001 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
4002 __ rev32(v8, __ T16B, v8);
4003 __ rev32(v9, __ T16B, v9);
4004 __ rev32(v10, __ T16B, v10);
4005 __ rev32(v11, __ T16B, v11);
4006
4007 __ addv(v6, __ T4S, v8, v16);
4008 __ orr(v2, __ T16B, v0, v0);
4009 __ orr(v3, __ T16B, v1, v1);
4010
4011 FloatRegister d0 = v8;
4012 FloatRegister d1 = v9;
4013 FloatRegister d2 = v10;
4014 FloatRegister d3 = v11;
4015
4016
4017 for (int round = 0; round < 16; round++) {
4018 FloatRegister tmp1 = (round & 1) ? v6 : v7;
4019 FloatRegister tmp2 = (round & 1) ? v7 : v6;
4020 FloatRegister tmp3 = (round & 1) ? v2 : v4;
4021 FloatRegister tmp4 = (round & 1) ? v4 : v2;
4022
4023 if (round < 12) __ sha256su0(d0, __ T4S, d1);
4024 __ orr(v4, __ T16B, v2, v2);
4025 if (round < 15)
4026 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
4027 __ sha256h(v2, __ T4S, v3, tmp2);
4028 __ sha256h2(v3, __ T4S, v4, tmp2);
4029 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
4030
4031 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4032 }
4033
4034 __ addv(v0, __ T4S, v0, v2);
4035 __ addv(v1, __ T4S, v1, v3);
4036
4037 if (multi_block) {
4038 __ add(ofs, ofs, 64);
4039 __ cmp(ofs, limit);
4040 __ br(Assembler::LE, sha1_loop);
4041 __ mov(c_rarg0, ofs); // return ofs
4042 }
4043
4044 __ ldpd(v10, v11, Address(sp, 16));
4045 __ ldpd(v8, v9, __ post(sp, 32));
4046
4047 __ stpq(v0, v1, state);
4048
4049 __ ret(lr);
4050
4051 return start;
4052 }
4053
4054 // Double rounds for sha512: each call performs two SHA-512 rounds; while dr < 36 it also pre-loads a pair of round constants for a later call, and while dr < 32 it extends the message schedule (sha512su0/sha512su1).
4055 void sha512_dround(int dr,
4056 FloatRegister vi0, FloatRegister vi1,
4057 FloatRegister vi2, FloatRegister vi3,
4058 FloatRegister vi4, FloatRegister vrc0,
4059 FloatRegister vrc1, FloatRegister vin0,
4060 FloatRegister vin1, FloatRegister vin2,
4061 FloatRegister vin3, FloatRegister vin4) {
4062 if (dr < 36) {
4063 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
4064 }
4065 __ addv(v5, __ T2D, vrc0, vin0);
4066 __ ext(v6, __ T16B, vi2, vi3, 8);
4067 __ ext(v5, __ T16B, v5, v5, 8);
4068 __ ext(v7, __ T16B, vi1, vi2, 8);
4069 __ addv(vi3, __ T2D, vi3, v5);
4070 if (dr < 32) {
4071 __ ext(v5, __ T16B, vin3, vin4, 8);
4072 __ sha512su0(vin0, __ T2D, vin1);
4073 }
4074 __ sha512h(vi3, __ T2D, v6, v7);
4075 if (dr < 32) {
4076 __ sha512su1(vin0, __ T2D, vin2, v5);
4077 }
4078 __ addv(vi4, __ T2D, vi1, vi3);
4079 __ sha512h2(vi3, __ T2D, vi1, vi0);
4080 }
4081
4082 // Arguments:
4083 //
4084 // Inputs:
4085 // c_rarg0 - byte[] source+offset
4086 // c_rarg1 - int[] SHA.state
4087 // c_rarg2 - int offset
4088 // c_rarg3 - int limit
4089 //
4090 address generate_sha512_implCompress(StubId stub_id) {
4091 bool multi_block;
4092 switch (stub_id) {
4093 case StubId::stubgen_sha512_implCompress_id:
4094 multi_block = false;
4095 break;
4096 case StubId::stubgen_sha512_implCompressMB_id:
4097 multi_block = true;
4098 break;
4099 default:
4100 ShouldNotReachHere();
4101 }
4102
4103 static const uint64_t round_consts[80] = {
4104 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
4105 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
4106 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
4107 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
4108 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
4109 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
4110 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
4111 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
4112 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
4113 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
4114 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
4115 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
4116 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
4117 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
4118 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
4119 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
4120 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
4121 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
4122 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
4123 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
4124 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
4125 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
4126 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
4127 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
4128 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
4129 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
4130 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
4131 };
4132
4133 __ align(CodeEntryAlignment);
4134
4135 StubCodeMark mark(this, stub_id);
4136 address start = __ pc();
4137
4138 Register buf = c_rarg0;
4139 Register state = c_rarg1;
4140 Register ofs = c_rarg2;
4141 Register limit = c_rarg3;
4142
4143 __ stpd(v8, v9, __ pre(sp, -64));
4144 __ stpd(v10, v11, Address(sp, 16));
4145 __ stpd(v12, v13, Address(sp, 32));
4146 __ stpd(v14, v15, Address(sp, 48));
4147
4148 Label sha512_loop;
4149
4150 // load state
4151 __ ld1(v8, v9, v10, v11, __ T2D, state);
4152
4153 // load first 4 round constants
4154 __ lea(rscratch1, ExternalAddress((address)round_consts));
4155 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
4156
4157 __ BIND(sha512_loop);
4158 // load 128B of data into v12..v19
4159 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
4160 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
4161 __ rev64(v12, __ T16B, v12);
4162 __ rev64(v13, __ T16B, v13);
4163 __ rev64(v14, __ T16B, v14);
4164 __ rev64(v15, __ T16B, v15);
4165 __ rev64(v16, __ T16B, v16);
4166 __ rev64(v17, __ T16B, v17);
4167 __ rev64(v18, __ T16B, v18);
4168 __ rev64(v19, __ T16B, v19);
4169
4170 __ mov(rscratch2, rscratch1);
4171
4172 __ mov(v0, __ T16B, v8);
4173 __ mov(v1, __ T16B, v9);
4174 __ mov(v2, __ T16B, v10);
4175 __ mov(v3, __ T16B, v11);
4176
4177 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4178 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4179 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4180 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4181 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4182 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4183 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4184 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4185 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4186 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4187 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4188 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4189 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4190 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4191 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4192 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4193 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4194 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4195 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4196 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4197 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4198 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4199 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4200 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4201 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4202 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4203 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4204 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4205 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4206 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4207 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4208 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4209 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
4210 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4211 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4212 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4213 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4214 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4215 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4216 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4217
4218 __ addv(v8, __ T2D, v8, v0);
4219 __ addv(v9, __ T2D, v9, v1);
4220 __ addv(v10, __ T2D, v10, v2);
4221 __ addv(v11, __ T2D, v11, v3);
4222
4223 if (multi_block) {
4224 __ add(ofs, ofs, 128);
4225 __ cmp(ofs, limit);
4226 __ br(Assembler::LE, sha512_loop);
4227 __ mov(c_rarg0, ofs); // return ofs
4228 }
4229
4230 __ st1(v8, v9, v10, v11, __ T2D, state);
4231
4232 __ ldpd(v14, v15, Address(sp, 48));
4233 __ ldpd(v12, v13, Address(sp, 32));
4234 __ ldpd(v10, v11, Address(sp, 16));
4235 __ ldpd(v8, v9, __ post(sp, 64));
4236
4237 __ ret(lr);
4238
4239 return start;
4240 }
4241
4242 // Execute one round of keccak of two computations in parallel.
4243 // One of the states should be loaded into the lower halves of
4244 // the vector registers v0-v24, the other should be loaded into
4245 // the upper halves of those registers. The ld1r instruction loads
4246 // the round constant into both halves of register v31.
4247 // Intermediate results c0...c5 and d0...d5 are computed
4248 // in registers v25...v30.
4249 // All vector instructions that are used operate on both register
4250 // halves in parallel.
4251 // If only a single computation is needed, one can only load the lower halves.
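  // For reference, a scalar sketch of one Keccak-f[1600] round over the 25
  // lanes a[0..24] (illustrative pseudocode; the vector code below computes
  // the same thing, once per 64-bit register half):
  //
  //   for (x = 0; x < 5; x++)                                     // theta
  //     c[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20];
  //   for (x = 0; x < 5; x++)
  //     d[x] = c[(x+4)%5] ^ rol(c[(x+1)%5], 1);
  //   for (i = 0; i < 25; i++)                                    // rho + pi
  //     b[pi(i)] = rol(a[i] ^ d[i % 5], rho(i));
  //   for (y = 0; y < 5; y++)                                     // chi
  //     for (x = 0; x < 5; x++)
  //       a[5*y + x] = b[5*y + x] ^ (~b[5*y + (x+1)%5] & b[5*y + (x+2)%5]);
  //   a[0] ^= round_constant;                                     // iota
  //
  // Here pi() and rho() denote the standard Keccak lane permutation and
  // rotation offsets. eor3 implements the three-way xors of theta, rax1 the
  // "xor with a lane rotated by 1", xar the "xor then rotate" of rho/pi, and
  // bcax the "xor with (andnot)" of chi.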
4252 void keccak_round(Register rscratch1) {
4253 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4254 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
4255 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4256 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4257 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4258 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4259 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4260 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4261 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4262 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4263
4264 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4265 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4266 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4267 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4268 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4269
4270 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4271 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4272 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4273 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4274 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4275 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4276 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4277 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4278 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4279 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4280 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4281 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4282 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4283 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4284 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4285 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4286 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4287 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4288 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4289 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4290 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4291 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4292 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4293 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4294 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4295
4296 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22')
4297 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4298 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4299 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4300 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4301
4302 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4303
4304 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4305 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4306 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4307 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4308 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
4309
4310 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
4311 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
4312 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
4313 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
4314 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
4315
4316 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
4317 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
4318 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
4319 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7)
4320 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8')
4321
4322 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
4323 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
4324 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
4325 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3)
4326 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4')
4327
4328 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
4329 }
4330
4331 // Arguments:
4332 //
4333 // Inputs:
4334 // c_rarg0 - byte[] source+offset
4335 // c_rarg1 - byte[] SHA.state
4336 // c_rarg2 - int block_size
4337 // c_rarg3 - int offset
4338 // c_rarg4 - int limit
4339 //
4340 address generate_sha3_implCompress(StubId stub_id) {
4341 bool multi_block;
4342 switch (stub_id) {
4343 case StubId::stubgen_sha3_implCompress_id:
4344 multi_block = false;
4345 break;
4346 case StubId::stubgen_sha3_implCompressMB_id:
4347 multi_block = true;
4348 break;
4349 default:
4350 ShouldNotReachHere();
4351 }
4352
4353 static const uint64_t round_consts[24] = {
4354 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4355 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4356 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4357 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4358 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4359 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4360 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4361 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4362 };
4363
4364 __ align(CodeEntryAlignment);
4365
4366 StubCodeMark mark(this, stub_id);
4367 address start = __ pc();
4368
4369 Register buf = c_rarg0;
4370 Register state = c_rarg1;
4371 Register block_size = c_rarg2;
4372 Register ofs = c_rarg3;
4373 Register limit = c_rarg4;
4374
4375 Label sha3_loop, rounds24_loop;
4376 Label sha3_512_or_sha3_384, shake128;
4377
4378 __ stpd(v8, v9, __ pre(sp, -64));
4379 __ stpd(v10, v11, Address(sp, 16));
4380 __ stpd(v12, v13, Address(sp, 32));
4381 __ stpd(v14, v15, Address(sp, 48));
4382
4383 // load state
4384 __ add(rscratch1, state, 32);
4385 __ ld1(v0, v1, v2, v3, __ T1D, state);
4386 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4387 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4388 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4389 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4390 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4391 __ ld1(v24, __ T1D, rscratch1);
4392
4393 __ BIND(sha3_loop);
4394
4395 // 24 keccak rounds
4396 __ movw(rscratch2, 24);
4397
4398 // load round_constants base
4399 __ lea(rscratch1, ExternalAddress((address) round_consts));
4400
4401 // load input
4402 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4403 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4404 __ eor(v0, __ T8B, v0, v25);
4405 __ eor(v1, __ T8B, v1, v26);
4406 __ eor(v2, __ T8B, v2, v27);
4407 __ eor(v3, __ T8B, v3, v28);
4408 __ eor(v4, __ T8B, v4, v29);
4409 __ eor(v5, __ T8B, v5, v30);
4410 __ eor(v6, __ T8B, v6, v31);
4411
4412 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4413 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4414
4415 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4416 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4417 __ eor(v7, __ T8B, v7, v25);
4418 __ eor(v8, __ T8B, v8, v26);
4419 __ eor(v9, __ T8B, v9, v27);
4420 __ eor(v10, __ T8B, v10, v28);
4421 __ eor(v11, __ T8B, v11, v29);
4422 __ eor(v12, __ T8B, v12, v30);
4423 __ eor(v13, __ T8B, v13, v31);
4424
4425 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4426 __ eor(v14, __ T8B, v14, v25);
4427 __ eor(v15, __ T8B, v15, v26);
4428 __ eor(v16, __ T8B, v16, v27);
4429
4430 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4431 __ andw(c_rarg5, block_size, 48);
4432 __ cbzw(c_rarg5, rounds24_loop);
4433
4434 __ tbnz(block_size, 5, shake128);
4435 // block_size == 144, bit5 == 0, SHA3-224
4436 __ ldrd(v28, __ post(buf, 8));
4437 __ eor(v17, __ T8B, v17, v28);
4438 __ b(rounds24_loop);
4439
4440 __ BIND(shake128);
4441 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4442 __ eor(v17, __ T8B, v17, v28);
4443 __ eor(v18, __ T8B, v18, v29);
4444 __ eor(v19, __ T8B, v19, v30);
4445 __ eor(v20, __ T8B, v20, v31);
4446 __ b(rounds24_loop); // block_size == 168, SHAKE128
4447
4448 __ BIND(sha3_512_or_sha3_384);
4449 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4450 __ eor(v7, __ T8B, v7, v25);
4451 __ eor(v8, __ T8B, v8, v26);
4452 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4453
4454 // SHA3-384
4455 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4456 __ eor(v9, __ T8B, v9, v27);
4457 __ eor(v10, __ T8B, v10, v28);
4458 __ eor(v11, __ T8B, v11, v29);
4459 __ eor(v12, __ T8B, v12, v30);
4460
4461 __ BIND(rounds24_loop);
4462 __ subw(rscratch2, rscratch2, 1);
4463
4464 keccak_round(rscratch1);
4465
4466 __ cbnzw(rscratch2, rounds24_loop);
4467
4468 if (multi_block) {
4469 __ add(ofs, ofs, block_size);
4470 __ cmp(ofs, limit);
4471 __ br(Assembler::LE, sha3_loop);
4472 __ mov(c_rarg0, ofs); // return ofs
4473 }
4474
4475 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
4476 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4477 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4478 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4479 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4480 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4481 __ st1(v24, __ T1D, state);
4482
4483 // restore callee-saved registers
4484 __ ldpd(v14, v15, Address(sp, 48));
4485 __ ldpd(v12, v13, Address(sp, 32));
4486 __ ldpd(v10, v11, Address(sp, 16));
4487 __ ldpd(v8, v9, __ post(sp, 64));
4488
4489 __ ret(lr);
4490
4491 return start;
4492 }
4493
4494 // Inputs:
4495 // c_rarg0 - long[] state0
4496 // c_rarg1 - long[] state1
4497 address generate_double_keccak() {
4498 static const uint64_t round_consts[24] = {
4499 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4500 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4501 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4502 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4503 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4504 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4505 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4506 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4507 };
4508
4509 // Implements the double_keccak() method of the
4510 // sun.security.provider.SHA3Parallel class
4511 __ align(CodeEntryAlignment);
4512 StubCodeMark mark(this, "StubRoutines", "double_keccak");
4513 address start = __ pc();
4514 __ enter();
4515
4516 Register state0 = c_rarg0;
4517 Register state1 = c_rarg1;
4518
4519 Label rounds24_loop;
4520
4521 // save callee-saved registers
4522 __ stpd(v8, v9, __ pre(sp, -64));
4523 __ stpd(v10, v11, Address(sp, 16));
4524 __ stpd(v12, v13, Address(sp, 32));
4525 __ stpd(v14, v15, Address(sp, 48));
4526
4527 // load states
4528 __ add(rscratch1, state0, 32);
4529 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
4530 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
4531 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
4532 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
4533 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
4534 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
4535 __ ld1(v24, __ D, 0, rscratch1);
4536 __ add(rscratch1, state1, 32);
4537 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
4538 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
4539 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
4540 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
4541 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
4542 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
4543 __ ld1(v24, __ D, 1, rscratch1);
4544
4545 // 24 keccak rounds
4546 __ movw(rscratch2, 24);
4547
4548 // load round_constants base
4549 __ lea(rscratch1, ExternalAddress((address) round_consts));
4550
4551 __ BIND(rounds24_loop);
4552 __ subw(rscratch2, rscratch2, 1);
4553 keccak_round(rscratch1);
4554 __ cbnzw(rscratch2, rounds24_loop);
4555
4556 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
4557 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
4558 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
4559 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
4560 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
4561 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
4562 __ st1(v24, __ D, 0, state0);
4563 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
4564 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
4565 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
4566 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
4567 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
4568 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
4569 __ st1(v24, __ D, 1, state1);
4570
4571 // restore callee-saved vector registers
4572 __ ldpd(v14, v15, Address(sp, 48));
4573 __ ldpd(v12, v13, Address(sp, 32));
4574 __ ldpd(v10, v11, Address(sp, 16));
4575 __ ldpd(v8, v9, __ post(sp, 64));
4576
4577 __ leave(); // required for proper stackwalking of RuntimeStub frame
4578 __ mov(r0, zr); // return 0
4579 __ ret(lr);
4580
4581 return start;
4582 }
4583
4584 // ChaCha20 block function. This version parallelizes the 32-bit
4585 // state elements on each of 16 vectors, producing 4 blocks of
4586 // keystream at a time.
4587 //
4588 // state (int[16]) = c_rarg0
4589 // keystream (byte[256]) = c_rarg1
4590 // return - number of bytes of produced keystream (always 256)
4591 //
4592 // This implementation takes each 32-bit integer from the state
4593 // array and broadcasts it across all 4 32-bit lanes of a vector register
4594 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
4595 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
4596 // the quarter round schedule is implemented as outlined in RFC 7539 section
4597 // 2.3. However, instead of sequentially processing the 3 quarter round
4598 // operations represented by one QUARTERROUND function, we instead stack all
4599 // the adds, xors and left-rotations from the first 4 quarter rounds together
4600 // and then do the same for the second set of 4 quarter rounds. This removes
4601 // some latency that would otherwise be incurred by waiting for an add to
4602 // complete before performing an xor (which depends on the result of the
4603 // add), etc. An adjustment happens between the first and second groups of 4
4604 // quarter rounds, but this is done only in the inputs to the macro functions
4605 // that generate the assembly instructions - these adjustments themselves are
4606 // not part of the resulting assembly.
4607 // The 4 registers v0-v3 are used during the quarter round operations as
4608 // scratch registers. Once the 20 rounds are complete, these 4 scratch
4609 // registers become the vectors involved in adding the start state back onto
4610 // the post-QR working state. After the adds are complete, each of the 16
4611 // vectors write their first lane back to the keystream buffer, followed
4612 // by the second lane from all vectors and so on.
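  // For reference, one scalar ChaCha20 quarter round (RFC 7539); the code
  // below evaluates this on four columns at once and then on four
  // diagonals, with every state word broadcast across one vector register:
  //
  //   QUARTERROUND(a, b, c, d):
  //     a += b;  d ^= a;  d = rol(d, 16);
  //     c += d;  b ^= c;  b = rol(b, 12);
  //     a += b;  d ^= a;  d = rol(d, 8);
  //     c += d;  b ^= c;  b = rol(b, 7);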
4613 address generate_chacha20Block_blockpar() {
4614 Label L_twoRounds, L_cc20_const;
4615 __ align(CodeEntryAlignment);
4616 StubId stub_id = StubId::stubgen_chacha20Block_id;
4617 StubCodeMark mark(this, stub_id);
4618 address start = __ pc();
4619 __ enter();
4620
4621 int i, j;
4622 const Register state = c_rarg0;
4623 const Register keystream = c_rarg1;
4624 const Register loopCtr = r10;
4625 const Register tmpAddr = r11;
4626 const FloatRegister ctrAddOverlay = v28;
4627 const FloatRegister lrot8Tbl = v29;
4628
4629 // Organize SIMD registers in an array that facilitates
4630 // putting repetitive opcodes into loop structures. It is
4631 // important that each grouping of 4 registers is monotonically
4632 // increasing to support the requirements of multi-register
4633 // instructions (e.g. ld4r, st4, etc.)
4634 const FloatRegister workSt[16] = {
4635 v4, v5, v6, v7, v16, v17, v18, v19,
4636 v20, v21, v22, v23, v24, v25, v26, v27
4637 };
4638
4639 // Pull in constant data. The first 16 bytes are the add overlay
4640 // which is applied to the vector holding the counter (state[12]).
4641 // The second 16 bytes is the index register for the 8-bit left
4642 // rotation tbl instruction.
4643 __ adr(tmpAddr, L_cc20_const);
4644 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
4645
4646 // Load from memory and interlace across 16 SIMD registers,
4647 // with each word from memory being broadcast to all lanes of
4648 // each successive SIMD register.
4649 // Addr(0) -> All lanes in workSt[i]
4650 // Addr(4) -> All lanes in workSt[i + 1], etc.
4651 __ mov(tmpAddr, state);
4652 for (i = 0; i < 16; i += 4) {
4653 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4654 __ post(tmpAddr, 16));
4655 }
4656 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4657
4658 // Before entering the loop, create 5 4-register arrays. These
4659 // will hold the 4 registers that represent the a/b/c/d fields
4660 // in the quarter round operation. For instance the "b" field
4661 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
4662 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
4663 // since it is part of a diagonal organization. The aSet and scratch
4664 // register sets are defined at declaration time because they do not change
4665 // organization at any point during the 20-round processing.
4666 FloatRegister aSet[4] = { v4, v5, v6, v7 };
4667 FloatRegister bSet[4];
4668 FloatRegister cSet[4];
4669 FloatRegister dSet[4];
4670 FloatRegister scratch[4] = { v0, v1, v2, v3 };
4671
4672 // Set up the 10 iteration loop and perform all 8 quarter round ops
4673 __ mov(loopCtr, 10);
4674 __ BIND(L_twoRounds);
4675
4676 // Set to columnar organization and do the following 4 quarter-rounds:
4677 // QUARTERROUND(0, 4, 8, 12)
4678 // QUARTERROUND(1, 5, 9, 13)
4679 // QUARTERROUND(2, 6, 10, 14)
4680 // QUARTERROUND(3, 7, 11, 15)
4681 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
4682 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
4683 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
4684
4685 __ cc20_qr_add4(aSet, bSet); // a += b
4686 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4687 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4688
4689 __ cc20_qr_add4(cSet, dSet); // c += d
4690 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4691 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4692
4693 __ cc20_qr_add4(aSet, bSet); // a += b
4694 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4695 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4696
4697 __ cc20_qr_add4(cSet, dSet); // c += d
4698 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4699 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
4700
4701 // Set to diagonal organization and do the next 4 quarter-rounds:
4702 // QUARTERROUND(0, 5, 10, 15)
4703 // QUARTERROUND(1, 6, 11, 12)
4704 // QUARTERROUND(2, 7, 8, 13)
4705 // QUARTERROUND(3, 4, 9, 14)
4706 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
4707 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
4708 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
4709
4710 __ cc20_qr_add4(aSet, bSet); // a += b
4711 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4712 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4713
4714 __ cc20_qr_add4(cSet, dSet); // c += d
4715 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4716 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4717
4718 __ cc20_qr_add4(aSet, bSet); // a += b
4719 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4720 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4721
4722 __ cc20_qr_add4(cSet, dSet); // c += d
4723 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4724 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
4725
4726 // Decrement and iterate
4727 __ sub(loopCtr, loopCtr, 1);
4728 __ cbnz(loopCtr, L_twoRounds);
4729
4730 __ mov(tmpAddr, state);
4731
4732 // Add the starting state back to the post-loop keystream
4733 // state. We read/interlace the state array from memory into
4734 // 4 registers similar to what we did in the beginning. Then
4735 // add the counter overlay onto workSt[12] at the end.
4736 for (i = 0; i < 16; i += 4) {
4737 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
4738 __ addv(workSt[i], __ T4S, workSt[i], v0);
4739 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
4740 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
4741 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
4742 }
4743 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4744
4745 // Write working state into the keystream buffer. This is accomplished
4746 // by taking the lane "i" from each of the four vectors and writing
4747 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
4748 // repeating with the next 4 vectors until all 16 vectors have been used.
4749 // Then move to the next lane and repeat the process until all lanes have
4750 // been written.
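// n.b. with this ordering lane i of the 16 state vectors supplies the
// i-th 64-byte block of keystream, so the four blocks that were computed
// in parallel are emitted one after another.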
4751 for (i = 0; i < 4; i++) {
4752 for (j = 0; j < 16; j += 4) {
4753 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4754 __ post(keystream, 16));
4755 }
4756 }
4757
4758 __ mov(r0, 256); // Return length of output keystream
4759 __ leave();
4760 __ ret(lr);
4761
4762 // bind label and generate local constant data used by this stub
4763 // The constant data is broken into two 128-bit segments to be loaded
4764 // onto FloatRegisters. The first 128 bits are a counter add overlay
4765 // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
// The second 128 bits are a table constant used for 8-bit left rotations.
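// n.b. interpreted as a tbl permutation, the rotation table maps
// destination bytes {0, 1, 2, 3} of each 32-bit lane to source bytes
// {3, 0, 1, 2}, i.e. it left-rotates a little-endian 32-bit word by
// 8 bits without needing a shift/orr pair.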
4767 __ BIND(L_cc20_const);
4768 __ emit_int64(0x0000000100000000UL);
4769 __ emit_int64(0x0000000300000002UL);
4770 __ emit_int64(0x0605040702010003UL);
4771 __ emit_int64(0x0E0D0C0F0A09080BUL);
4772
4773 return start;
4774 }
4775
4776 // Helpers to schedule parallel operation bundles across vector
4777 // register sequences of size 2, 4 or 8.
4778
4779 // Implement various primitive computations across vector sequences
4780
4781 template<int N>
4782 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4783 const VSeq<N>& v1, const VSeq<N>& v2) {
4784 // output must not be constant
4785 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4786 // output cannot overwrite pending inputs
4787 assert(!vs_write_before_read(v, v1), "output overwrites input");
4788 assert(!vs_write_before_read(v, v2), "output overwrites input");
4789 for (int i = 0; i < N; i++) {
4790 __ addv(v[i], T, v1[i], v2[i]);
4791 }
4792 }
4793
4794 template<int N>
4795 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4796 const VSeq<N>& v1, const VSeq<N>& v2) {
4797 // output must not be constant
4798 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4799 // output cannot overwrite pending inputs
4800 assert(!vs_write_before_read(v, v1), "output overwrites input");
4801 assert(!vs_write_before_read(v, v2), "output overwrites input");
4802 for (int i = 0; i < N; i++) {
4803 __ subv(v[i], T, v1[i], v2[i]);
4804 }
4805 }
4806
4807 template<int N>
4808 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4809 const VSeq<N>& v1, const VSeq<N>& v2) {
4810 // output must not be constant
4811 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4812 // output cannot overwrite pending inputs
4813 assert(!vs_write_before_read(v, v1), "output overwrites input");
4814 assert(!vs_write_before_read(v, v2), "output overwrites input");
4815 for (int i = 0; i < N; i++) {
4816 __ mulv(v[i], T, v1[i], v2[i]);
4817 }
4818 }
4819
4820 template<int N>
4821 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
4822 // output must not be constant
4823 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4824 // output cannot overwrite pending inputs
4825 assert(!vs_write_before_read(v, v1), "output overwrites input");
4826 for (int i = 0; i < N; i++) {
4827 __ negr(v[i], T, v1[i]);
4828 }
4829 }
4830
4831 template<int N>
4832 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4833 const VSeq<N>& v1, int shift) {
4834 // output must not be constant
4835 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4836 // output cannot overwrite pending inputs
4837 assert(!vs_write_before_read(v, v1), "output overwrites input");
4838 for (int i = 0; i < N; i++) {
4839 __ sshr(v[i], T, v1[i], shift);
4840 }
4841 }
4842
4843 template<int N>
4844 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4845 // output must not be constant
4846 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4847 // output cannot overwrite pending inputs
4848 assert(!vs_write_before_read(v, v1), "output overwrites input");
4849 assert(!vs_write_before_read(v, v2), "output overwrites input");
4850 for (int i = 0; i < N; i++) {
4851 __ andr(v[i], __ T16B, v1[i], v2[i]);
4852 }
4853 }
4854
4855 template<int N>
4856 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4857 // output must not be constant
4858 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4859 // output cannot overwrite pending inputs
4860 assert(!vs_write_before_read(v, v1), "output overwrites input");
4861 assert(!vs_write_before_read(v, v2), "output overwrites input");
4862 for (int i = 0; i < N; i++) {
4863 __ orr(v[i], __ T16B, v1[i], v2[i]);
4864 }
4865 }
4866
4867 template<int N>
4868 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
4869 // output must not be constant
4870 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4871 // output cannot overwrite pending inputs
4872 assert(!vs_write_before_read(v, v1), "output overwrites input");
4873 for (int i = 0; i < N; i++) {
4874 __ notr(v[i], __ T16B, v1[i]);
4875 }
4876 }
4877
4878 template<int N>
4879 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
4880 // output must not be constant
4881 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4882 // output cannot overwrite pending inputs
4883 assert(!vs_write_before_read(v, v1), "output overwrites input");
4884 assert(!vs_write_before_read(v, v2), "output overwrites input");
4885 for (int i = 0; i < N; i++) {
4886 __ sqdmulh(v[i], T, v1[i], v2[i]);
4887 }
4888 }
4889
4890 template<int N>
4891 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
4892 // output must not be constant
4893 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4894 // output cannot overwrite pending inputs
4895 assert(!vs_write_before_read(v, v1), "output overwrites input");
4896 assert(!vs_write_before_read(v, v2), "output overwrites input");
4897 for (int i = 0; i < N; i++) {
4898 __ mlsv(v[i], T, v1[i], v2[i]);
4899 }
4900 }
4901
4902 // load N/2 successive pairs of quadword values from memory in order
4903 // into N successive vector registers of the sequence via the
4904 // address supplied in base.
4905 template<int N>
4906 void vs_ldpq(const VSeq<N>& v, Register base) {
4907 for (int i = 0; i < N; i += 2) {
4908 __ ldpq(v[i], v[i+1], Address(base, 32 * i));
4909 }
4910 }
4911
4912 // load N/2 successive pairs of quadword values from memory in order
4913 // into N vector registers of the sequence via the address supplied
4914 // in base using post-increment addressing
4915 template<int N>
4916 void vs_ldpq_post(const VSeq<N>& v, Register base) {
static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
4918 for (int i = 0; i < N; i += 2) {
4919 __ ldpq(v[i], v[i+1], __ post(base, 32));
4920 }
4921 }
4922
4923 // store N successive vector registers of the sequence into N/2
4924 // successive pairs of quadword memory locations via the address
4925 // supplied in base using post-increment addressing
4926 template<int N>
4927 void vs_stpq_post(const VSeq<N>& v, Register base) {
static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
4929 for (int i = 0; i < N; i += 2) {
4930 __ stpq(v[i], v[i+1], __ post(base, 32));
4931 }
4932 }
4933
4934 // load N/2 pairs of quadword values from memory de-interleaved into
4935 // N vector registers 2 at a time via the address supplied in base
4936 // using post-increment addressing.
4937 template<int N>
4938 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
4940 for (int i = 0; i < N; i += 2) {
4941 __ ld2(v[i], v[i+1], T, __ post(base, 32));
4942 }
4943 }
4944
4945 // store N vector registers interleaved into N/2 pairs of quadword
4946 // memory locations via the address supplied in base using
4947 // post-increment addressing.
4948 template<int N>
4949 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
4951 for (int i = 0; i < N; i += 2) {
4952 __ st2(v[i], v[i+1], T, __ post(base, 32));
4953 }
4954 }
4955
4956 // load N quadword values from memory de-interleaved into N vector
4957 // registers 3 elements at a time via the address supplied in base.
4958 template<int N>
4959 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4960 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4961 for (int i = 0; i < N; i += 3) {
4962 __ ld3(v[i], v[i+1], v[i+2], T, base);
4963 }
4964 }
4965
4966 // load N quadword values from memory de-interleaved into N vector
4967 // registers 3 elements at a time via the address supplied in base
4968 // using post-increment addressing.
4969 template<int N>
4970 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4971 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4972 for (int i = 0; i < N; i += 3) {
4973 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
4974 }
4975 }
4976
4977 // load N/2 pairs of quadword values from memory into N vector
4978 // registers via the address supplied in base with each pair indexed
// using the start offset plus the corresponding entry in the
4980 // offsets array
4981 template<int N>
4982 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
4983 for (int i = 0; i < N/2; i++) {
4984 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4985 }
4986 }
4987
4988 // store N vector registers into N/2 pairs of quadword memory
4989 // locations via the address supplied in base with each pair indexed
// using the start offset plus the corresponding entry in the
4991 // offsets array
4992 template<int N>
4993 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
4994 for (int i = 0; i < N/2; i++) {
4995 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4996 }
4997 }
4998
4999 // load N single quadword values from memory into N vector registers
5000 // via the address supplied in base with each value indexed using
// the start offset plus the corresponding entry in the offsets
5002 // array
5003 template<int N>
5004 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5005 int start, int (&offsets)[N]) {
5006 for (int i = 0; i < N; i++) {
5007 __ ldr(v[i], T, Address(base, start + offsets[i]));
5008 }
5009 }
5010
5011 // store N vector registers into N single quadword memory locations
5012 // via the address supplied in base with each value indexed using
// the start offset plus the corresponding entry in the offsets
5014 // array
5015 template<int N>
5016 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5017 int start, int (&offsets)[N]) {
5018 for (int i = 0; i < N; i++) {
5019 __ str(v[i], T, Address(base, start + offsets[i]));
5020 }
5021 }
5022
5023 // load N/2 pairs of quadword values from memory de-interleaved into
5024 // N vector registers 2 at a time via the address supplied in base
// with each pair indexed using the start offset plus the
5026 // corresponding entry in the offsets array
5027 template<int N>
5028 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5029 Register tmp, int start, int (&offsets)[N/2]) {
5030 for (int i = 0; i < N/2; i++) {
5031 __ add(tmp, base, start + offsets[i]);
5032 __ ld2(v[2*i], v[2*i+1], T, tmp);
5033 }
5034 }
5035
5036 // store N vector registers 2 at a time interleaved into N/2 pairs
5037 // of quadword memory locations via the address supplied in base
// with each pair indexed using the start offset plus the
5039 // corresponding entry in the offsets array
5040 template<int N>
5041 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5042 Register tmp, int start, int (&offsets)[N/2]) {
5043 for (int i = 0; i < N/2; i++) {
5044 __ add(tmp, base, start + offsets[i]);
5045 __ st2(v[2*i], v[2*i+1], T, tmp);
5046 }
5047 }
5048
5049 // Helper routines for various flavours of Montgomery multiply
5050
5051 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
5052 // multiplications in parallel
5053 //
5054
5055 // See the montMul() method of the sun.security.provider.ML_DSA
5056 // class.
5057 //
// Computes 4x4S results or 4x8H results
// a = b * c * 2^-MONT_R_BITS mod MONT_Q
5060 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5061 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5062 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5063 // Outputs: va - 4x4S or 4x8H vector register sequences
5064 // vb, vc, vtmp and vq must all be disjoint
5065 // va must be disjoint from all other inputs/temps or must equal vc
5066 // va must have a non-zero delta i.e. it must not be a constant vseq.
5067 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
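// A sketch of the arithmetic performed per lane, with R = 2^MONT_R_BITS
// and assuming q_inv denotes q^-1 mod R:
//   aHigh = (2 * b * c) >> MONT_R_BITS      (sqdmulh)
//   aLow  = (b * c) mod R                   (mulv)
//   m     = (aLow * q_inv) mod R            (mulv)
//   n     = (2 * m * q) >> MONT_R_BITS      (sqdmulh)
//   a     = (aHigh - n) / 2                 (shsubv)
// Since b * c and m * q agree mod R the low halves cancel exactly, so
// a == (b * c - m * q) / R == b * c * R^-1 (mod q), the usual signed
// Montgomery result, with |a| < q whenever |b * c| < q * R / 2.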
5068 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5069 Assembler::SIMD_Arrangement T,
5070 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5071 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5072 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5073 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5074 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5075
5076 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5077 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5078
5079 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5080
5081 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5082 assert(vs_disjoint(va, vb), "va and vb overlap");
5083 assert(vs_disjoint(va, vq), "va and vq overlap");
5084 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5085 assert(!va.is_constant(), "output vector must identify 4 different registers");
5086
5087 // schedule 4 streams of instructions across the vector sequences
5088 for (int i = 0; i < 4; i++) {
5089 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5090 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5091 }
5092
5093 for (int i = 0; i < 4; i++) {
5094 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5095 }
5096
5097 for (int i = 0; i < 4; i++) {
5098 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5099 }
5100
5101 for (int i = 0; i < 4; i++) {
5102 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5103 }
5104 }
5105
// Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
5107 // multiplications in parallel
5108 //
5109
5110 // See the montMul() method of the sun.security.provider.ML_DSA
5111 // class.
5112 //
// Computes 2x4S results or 2x8H results
// a = b * c * 2^-MONT_R_BITS mod MONT_Q
// Inputs: vb, vc - 2x4S or 2x8H vector register sequences
// vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
// Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
// Outputs: va - 2x4S or 2x8H vector register sequences
5119 // vb, vc, vtmp and vq must all be disjoint
5120 // va must be disjoint from all other inputs/temps or must equal vc
5121 // va must have a non-zero delta i.e. it must not be a constant vseq.
5122 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5123 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5124 Assembler::SIMD_Arrangement T,
5125 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5126 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5127 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5128 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5129 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5130
5131 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5132 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5133
5134 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5135
5136 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5137 assert(vs_disjoint(va, vb), "va and vb overlap");
5138 assert(vs_disjoint(va, vq), "va and vq overlap");
5139 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5140 assert(!va.is_constant(), "output vector must identify 2 different registers");
5141
5142 // schedule 2 streams of instructions across the vector sequences
5143 for (int i = 0; i < 2; i++) {
5144 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5145 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5146 }
5147
5148 for (int i = 0; i < 2; i++) {
5149 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5150 }
5151
5152 for (int i = 0; i < 2; i++) {
5153 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5154 }
5155
5156 for (int i = 0; i < 2; i++) {
5157 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5158 }
5159 }
5160
5161 // Perform 16 16-bit Montgomery multiplications in parallel.
5162 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5163 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5164 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5165 // It will assert that the register use is valid
5166 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5167 }
5168
5169 // Perform 32 16-bit Montgomery multiplications in parallel.
5170 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5171 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5172 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5173 // It will assert that the register use is valid
5174 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5175 }
5176
5177 // Perform 64 16-bit Montgomery multiplications in parallel.
5178 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
5179 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5180 // Schedule two successive 4x8H multiplies via the montmul helper
5181 // on the front and back halves of va, vb and vc. The helper will
5182 // assert that the register use has no overlap conflicts on each
5183 // individual call but we also need to ensure that the necessary
5184 // disjoint/equality constraints are met across both calls.
5185
5186 // vb, vc, vtmp and vq must be disjoint. va must either be
5187 // disjoint from all other registers or equal vc
5188
5189 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5190 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5191 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5192
5193 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5194 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5195
5196 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5197
5198 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5199 assert(vs_disjoint(va, vb), "va and vb overlap");
5200 assert(vs_disjoint(va, vq), "va and vq overlap");
5201 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5202
5203 // we multiply the front and back halves of each sequence 4 at a
5204 // time because
5205 //
5206 // 1) we are currently only able to get 4-way instruction
5207 // parallelism at best
5208 //
5209 // 2) we need registers for the constants in vq and temporary
5210 // scratch registers to hold intermediate results so vtmp can only
5211 // be a VSeq<4> which means we only have 4 scratch slots
5212
5213 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5214 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5215 }
5216
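// In effect this is the Cooley-Tukey butterfly used by the forward NTT:
// given inputs a0, a1 and multipliers c (the zetas at the call sites
// below) it computes a0' = a0 + montmul(a1, c) and
// a1' = a0 - montmul(a1, c), overwriting vc with the product.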
5217 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5218 const VSeq<4>& vc,
5219 const VSeq<4>& vtmp,
5220 const VSeq<2>& vq) {
5221 // compute a = montmul(a1, c)
5222 kyber_montmul32(vc, va1, vc, vtmp, vq);
// output a1 = a0 - a
5224 vs_subv(va1, __ T8H, va0, vc);
5225 // and a0 = a0 + a
5226 vs_addv(va0, __ T8H, va0, vc);
5227 }
5228
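// In effect this is the Gentleman-Sande butterfly used by the inverse
// NTT: it computes a0' = a0 + a1 and a1' = montmul(a0 - a1, b), using
// vtmp1 to hold the difference so both inputs are read before being
// overwritten.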
5229 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5230 const VSeq<4>& vb,
5231 const VSeq<4>& vtmp1,
5232 const VSeq<4>& vtmp2,
5233 const VSeq<2>& vq) {
5234 // compute c = a0 - a1
5235 vs_subv(vtmp1, __ T8H, va0, va1);
5236 // output a0 = a0 + a1
5237 vs_addv(va0, __ T8H, va0, va1);
5238 // output a1 = b montmul c
5239 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5240 }
5241
5242 void load64shorts(const VSeq<8>& v, Register shorts) {
5243 vs_ldpq_post(v, shorts);
5244 }
5245
5246 void load32shorts(const VSeq<4>& v, Register shorts) {
5247 vs_ldpq_post(v, shorts);
5248 }
5249
5250 void store64shorts(VSeq<8> v, Register tmpAddr) {
5251 vs_stpq_post(v, tmpAddr);
5252 }
5253
5254 // Kyber NTT function.
5255 // Implements
5256 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5257 //
5258 // coeffs (short[256]) = c_rarg0
5259 // ntt_zetas (short[256]) = c_rarg1
5260 address generate_kyberNtt() {
5261
5262 __ align(CodeEntryAlignment);
5263 StubId stub_id = StubId::stubgen_kyberNtt_id;
5264 StubCodeMark mark(this, stub_id);
5265 address start = __ pc();
5266 __ enter();
5267
5268 const Register coeffs = c_rarg0;
5269 const Register zetas = c_rarg1;
5270
5271 const Register kyberConsts = r10;
5272 const Register tmpAddr = r11;
5273
5274 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5275 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5276 VSeq<2> vq(30); // n.b. constants overlap vs3
5277
5278 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5279 // load the montmul constants
5280 vs_ldpq(vq, kyberConsts);
5281
5282 // Each level corresponds to an iteration of the outermost loop of the
5283 // Java method seilerNTT(int[] coeffs). There are some differences
5284 // from what is done in the seilerNTT() method, though:
// 1. The computation uses 16-bit signed values; we do not convert them
// to ints here.
// 2. The zetas are delivered in a bigger array: 128 zetas are stored in
// this array for each level, which makes it easier to fill up the vector
// registers.
// 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
// multiplications (so that there should not be any overflow during the
// inverse NTT computation); here we use R = 2^16 so that we can use
// 16-bit arithmetic in the vector unit.
5294 //
5295 // On each level, we fill up the vector registers in such a way that the
5296 // array elements that need to be multiplied by the zetas go into one
5297 // set of vector registers while the corresponding ones that don't need to
5298 // be multiplied, go into another set.
5299 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5300 // registers interleaving the steps of 4 identical computations,
5301 // each done on 8 16-bit values per register.
5302
5303 // At levels 0-3 the coefficients multiplied by or added/subtracted
5304 // to the zetas occur in discrete blocks whose size is some multiple
5305 // of 32.
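// For example, at level 0 the butterfly pairs are
// (coeffs[j], coeffs[j + 128]) for j = 0..127: the first montmul block
// below multiplies coeffs[128..191] (byte offset 256) by the first 64
// entries of the expanded zetas array and the products are then added
// to and subtracted from coeffs[0..63].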
5306
5307 // level 0
5308 __ add(tmpAddr, coeffs, 256);
5309 load64shorts(vs1, tmpAddr);
5310 load64shorts(vs2, zetas);
5311 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5312 __ add(tmpAddr, coeffs, 0);
5313 load64shorts(vs1, tmpAddr);
5314 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5315 vs_addv(vs1, __ T8H, vs1, vs2);
5316 __ add(tmpAddr, coeffs, 0);
5317 vs_stpq_post(vs1, tmpAddr);
5318 __ add(tmpAddr, coeffs, 256);
5319 vs_stpq_post(vs3, tmpAddr);
5320 // restore montmul constants
5321 vs_ldpq(vq, kyberConsts);
5322 load64shorts(vs1, tmpAddr);
5323 load64shorts(vs2, zetas);
5324 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5325 __ add(tmpAddr, coeffs, 128);
5326 load64shorts(vs1, tmpAddr);
5327 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5328 vs_addv(vs1, __ T8H, vs1, vs2);
5329 __ add(tmpAddr, coeffs, 128);
5330 store64shorts(vs1, tmpAddr);
5331 __ add(tmpAddr, coeffs, 384);
5332 store64shorts(vs3, tmpAddr);
5333
5334 // level 1
5335 // restore montmul constants
5336 vs_ldpq(vq, kyberConsts);
5337 __ add(tmpAddr, coeffs, 128);
5338 load64shorts(vs1, tmpAddr);
5339 load64shorts(vs2, zetas);
5340 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5341 __ add(tmpAddr, coeffs, 0);
5342 load64shorts(vs1, tmpAddr);
5343 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5344 vs_addv(vs1, __ T8H, vs1, vs2);
5345 __ add(tmpAddr, coeffs, 0);
5346 store64shorts(vs1, tmpAddr);
5347 store64shorts(vs3, tmpAddr);
5348 vs_ldpq(vq, kyberConsts);
5349 __ add(tmpAddr, coeffs, 384);
5350 load64shorts(vs1, tmpAddr);
5351 load64shorts(vs2, zetas);
5352 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5353 __ add(tmpAddr, coeffs, 256);
5354 load64shorts(vs1, tmpAddr);
5355 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5356 vs_addv(vs1, __ T8H, vs1, vs2);
5357 __ add(tmpAddr, coeffs, 256);
5358 store64shorts(vs1, tmpAddr);
5359 store64shorts(vs3, tmpAddr);
5360
5361 // level 2
5362 vs_ldpq(vq, kyberConsts);
5363 int offsets1[4] = { 0, 32, 128, 160 };
5364 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5365 load64shorts(vs2, zetas);
5366 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5367 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5368 // kyber_subv_addv64();
5369 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5370 vs_addv(vs1, __ T8H, vs1, vs2);
5371 __ add(tmpAddr, coeffs, 0);
5372 vs_stpq_post(vs_front(vs1), tmpAddr);
5373 vs_stpq_post(vs_front(vs3), tmpAddr);
5374 vs_stpq_post(vs_back(vs1), tmpAddr);
5375 vs_stpq_post(vs_back(vs3), tmpAddr);
5376 vs_ldpq(vq, kyberConsts);
5377 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5378 load64shorts(vs2, zetas);
5379 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5380 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5381 // kyber_subv_addv64();
5382 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5383 vs_addv(vs1, __ T8H, vs1, vs2);
5384 __ add(tmpAddr, coeffs, 256);
5385 vs_stpq_post(vs_front(vs1), tmpAddr);
5386 vs_stpq_post(vs_front(vs3), tmpAddr);
5387 vs_stpq_post(vs_back(vs1), tmpAddr);
5388 vs_stpq_post(vs_back(vs3), tmpAddr);
5389
5390 // level 3
5391 vs_ldpq(vq, kyberConsts);
5392 int offsets2[4] = { 0, 64, 128, 192 };
5393 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5394 load64shorts(vs2, zetas);
5395 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5396 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5397 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5398 vs_addv(vs1, __ T8H, vs1, vs2);
5399 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5400 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5401
5402 vs_ldpq(vq, kyberConsts);
5403 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5404 load64shorts(vs2, zetas);
5405 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5406 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5407 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5408 vs_addv(vs1, __ T8H, vs1, vs2);
5409 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5410 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5411
5412 // level 4
5413 // At level 4 coefficients occur in 8 discrete blocks of size 16
// so they are loaded using an ldr at 8 distinct offsets.
5415
5416 vs_ldpq(vq, kyberConsts);
5417 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5418 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5419 load64shorts(vs2, zetas);
5420 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5421 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5422 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5423 vs_addv(vs1, __ T8H, vs1, vs2);
5424 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5425 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5426
5427 vs_ldpq(vq, kyberConsts);
5428 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5429 load64shorts(vs2, zetas);
5430 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5431 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5432 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5433 vs_addv(vs1, __ T8H, vs1, vs2);
5434 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5435 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5436
5437 // level 5
// At level 5 related coefficients occur in discrete blocks of size 8, so they
5439 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
5440
5441 vs_ldpq(vq, kyberConsts);
5442 int offsets4[4] = { 0, 32, 64, 96 };
5443 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5444 load32shorts(vs_front(vs2), zetas);
5445 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5446 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5447 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5448 load32shorts(vs_front(vs2), zetas);
5449 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5450 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5451 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5452 load32shorts(vs_front(vs2), zetas);
5453 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5454 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5455
5456 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5457 load32shorts(vs_front(vs2), zetas);
5458 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5459 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5460
5461 // level 6
// At level 6 related coefficients occur in discrete blocks of size 4, so they
5463 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
5464
5465 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5466 load32shorts(vs_front(vs2), zetas);
5467 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5468 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5469 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5470 // __ ldpq(v18, v19, __ post(zetas, 32));
5471 load32shorts(vs_front(vs2), zetas);
5472 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5473 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5474
5475 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5476 load32shorts(vs_front(vs2), zetas);
5477 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5478 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5479
5480 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5481 load32shorts(vs_front(vs2), zetas);
5482 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5483 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5484
5485 __ leave(); // required for proper stackwalking of RuntimeStub frame
5486 __ mov(r0, zr); // return 0
5487 __ ret(lr);
5488
5489 return start;
5490 }
5491
5492 // Kyber Inverse NTT function
5493 // Implements
5494 // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
5495 //
5496 // coeffs (short[256]) = c_rarg0
5497 // ntt_zetas (short[256]) = c_rarg1
5498 address generate_kyberInverseNtt() {
5499
5500 __ align(CodeEntryAlignment);
5501 StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
5502 StubCodeMark mark(this, stub_id);
5503 address start = __ pc();
5504 __ enter();
5505
5506 const Register coeffs = c_rarg0;
5507 const Register zetas = c_rarg1;
5508
5509 const Register kyberConsts = r10;
5510 const Register tmpAddr = r11;
5511 const Register tmpAddr2 = c_rarg2;
5512
5513 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5514 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5515 VSeq<2> vq(30); // n.b. constants overlap vs3
5516
5517 __ lea(kyberConsts,
5518 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5519
5520 // level 0
// At level 0 related coefficients occur in discrete blocks of size 4, so they
5522 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
5523
5524 vs_ldpq(vq, kyberConsts);
5525 int offsets4[4] = { 0, 32, 64, 96 };
5526 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5527 load32shorts(vs_front(vs2), zetas);
5528 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5529 vs_front(vs2), vs_back(vs2), vtmp, vq);
5530 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5531 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5532 load32shorts(vs_front(vs2), zetas);
5533 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5534 vs_front(vs2), vs_back(vs2), vtmp, vq);
5535 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5536 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5537 load32shorts(vs_front(vs2), zetas);
5538 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5539 vs_front(vs2), vs_back(vs2), vtmp, vq);
5540 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5541 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5542 load32shorts(vs_front(vs2), zetas);
5543 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5544 vs_front(vs2), vs_back(vs2), vtmp, vq);
5545 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5546
5547 // level 1
// At level 1 related coefficients occur in discrete blocks of size 8, so they
5549 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
5550
5551 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5552 load32shorts(vs_front(vs2), zetas);
5553 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5554 vs_front(vs2), vs_back(vs2), vtmp, vq);
5555 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5556 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5557 load32shorts(vs_front(vs2), zetas);
5558 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5559 vs_front(vs2), vs_back(vs2), vtmp, vq);
5560 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5561
5562 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5563 load32shorts(vs_front(vs2), zetas);
5564 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5565 vs_front(vs2), vs_back(vs2), vtmp, vq);
5566 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5567 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5568 load32shorts(vs_front(vs2), zetas);
5569 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5570 vs_front(vs2), vs_back(vs2), vtmp, vq);
5571 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5572
5573 // level 2
5574 // At level 2 coefficients occur in 8 discrete blocks of size 16
// so they are loaded using an ldr at 8 distinct offsets.
5576
5577 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5578 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5579 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
5580 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5581 vs_subv(vs1, __ T8H, vs1, vs2);
5582 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
5583 load64shorts(vs2, zetas);
5584 vs_ldpq(vq, kyberConsts);
5585 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5586 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
5587
5588 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5589 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5590 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5591 vs_subv(vs1, __ T8H, vs1, vs2);
5592 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
5593 load64shorts(vs2, zetas);
5594 vs_ldpq(vq, kyberConsts);
5595 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5596 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5597
5598 // Barrett reduction at indexes where overflow may happen
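// A sketch of the reduction applied below, per 16-bit lane, assuming the
// multiplier loaded below is the usual Kyber Barrett constant (roughly
// 2^26 / q):
//   t = (2 * a * multiplier) >> 27     (sqdmulh then sshr #11), t ~= a / q
//   a = a - t * q                      (mlsv)
// which leaves a congruent to the input mod q but with reduced magnitude.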
5599
5600 // load q and the multiplier for the Barrett reduction
5601 __ add(tmpAddr, kyberConsts, 16);
5602 vs_ldpq(vq, tmpAddr);
5603
5604 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
5605 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
5606 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
5607 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5608 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5609 vs_sshr(vs2, __ T8H, vs2, 11);
5610 vs_mlsv(vs1, __ T8H, vs2, vq1);
5611 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5612 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5613 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5614 vs_sshr(vs2, __ T8H, vs2, 11);
5615 vs_mlsv(vs1, __ T8H, vs2, vq1);
5616 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5617
5618 // level 3
5619 // From level 3 upwards coefficients occur in discrete blocks whose size is
// some multiple of 32, so they can be loaded using ldpq and suitable indexes.
5621
5622 int offsets2[4] = { 0, 64, 128, 192 };
5623 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5624 vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
5625 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5626 vs_subv(vs1, __ T8H, vs1, vs2);
5627 vs_stpq_indexed(vs3, coeffs, 0, offsets2);
5628 load64shorts(vs2, zetas);
5629 vs_ldpq(vq, kyberConsts);
5630 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5631 vs_stpq_indexed(vs2, coeffs, 32, offsets2);
5632
5633 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5634 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5635 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5636 vs_subv(vs1, __ T8H, vs1, vs2);
5637 vs_stpq_indexed(vs3, coeffs, 256, offsets2);
5638 load64shorts(vs2, zetas);
5639 vs_ldpq(vq, kyberConsts);
5640 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5641 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5642
5643 // level 4
5644
5645 int offsets1[4] = { 0, 32, 128, 160 };
5646 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5647 vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
5648 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5649 vs_subv(vs1, __ T8H, vs1, vs2);
5650 vs_stpq_indexed(vs3, coeffs, 0, offsets1);
5651 load64shorts(vs2, zetas);
5652 vs_ldpq(vq, kyberConsts);
5653 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5654 vs_stpq_indexed(vs2, coeffs, 64, offsets1);
5655
5656 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5657 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5658 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5659 vs_subv(vs1, __ T8H, vs1, vs2);
5660 vs_stpq_indexed(vs3, coeffs, 256, offsets1);
5661 load64shorts(vs2, zetas);
5662 vs_ldpq(vq, kyberConsts);
5663 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5664 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5665
5666 // level 5
5667
5668 __ add(tmpAddr, coeffs, 0);
5669 load64shorts(vs1, tmpAddr);
5670 __ add(tmpAddr, coeffs, 128);
5671 load64shorts(vs2, tmpAddr);
5672 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5673 vs_subv(vs1, __ T8H, vs1, vs2);
5674 __ add(tmpAddr, coeffs, 0);
5675 store64shorts(vs3, tmpAddr);
5676 load64shorts(vs2, zetas);
5677 vs_ldpq(vq, kyberConsts);
5678 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5679 __ add(tmpAddr, coeffs, 128);
5680 store64shorts(vs2, tmpAddr);
5681
5682 load64shorts(vs1, tmpAddr);
5683 __ add(tmpAddr, coeffs, 384);
5684 load64shorts(vs2, tmpAddr);
5685 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5686 vs_subv(vs1, __ T8H, vs1, vs2);
5687 __ add(tmpAddr, coeffs, 256);
5688 store64shorts(vs3, tmpAddr);
5689 load64shorts(vs2, zetas);
5690 vs_ldpq(vq, kyberConsts);
5691 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5692 __ add(tmpAddr, coeffs, 384);
5693 store64shorts(vs2, tmpAddr);
5694
5695 // Barrett reduction at indexes where overflow may happen
5696
5697 // load q and the multiplier for the Barrett reduction
5698 __ add(tmpAddr, kyberConsts, 16);
5699 vs_ldpq(vq, tmpAddr);
5700
5701 int offsets0[2] = { 0, 256 };
5702 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5703 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5704 vs_sshr(vs2, __ T8H, vs2, 11);
5705 vs_mlsv(vs1, __ T8H, vs2, vq1);
5706 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5707
5708 // level 6
5709
5710 __ add(tmpAddr, coeffs, 0);
5711 load64shorts(vs1, tmpAddr);
5712 __ add(tmpAddr, coeffs, 256);
5713 load64shorts(vs2, tmpAddr);
5714 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5715 vs_subv(vs1, __ T8H, vs1, vs2);
5716 __ add(tmpAddr, coeffs, 0);
5717 store64shorts(vs3, tmpAddr);
5718 load64shorts(vs2, zetas);
5719 vs_ldpq(vq, kyberConsts);
5720 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5721 __ add(tmpAddr, coeffs, 256);
5722 store64shorts(vs2, tmpAddr);
5723
5724 __ add(tmpAddr, coeffs, 128);
5725 load64shorts(vs1, tmpAddr);
5726 __ add(tmpAddr, coeffs, 384);
5727 load64shorts(vs2, tmpAddr);
5728 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5729 vs_subv(vs1, __ T8H, vs1, vs2);
5730 __ add(tmpAddr, coeffs, 128);
5731 store64shorts(vs3, tmpAddr);
5732 load64shorts(vs2, zetas);
5733 vs_ldpq(vq, kyberConsts);
5734 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5735 __ add(tmpAddr, coeffs, 384);
5736 store64shorts(vs2, tmpAddr);
5737
5738 // multiply by 2^-n
5739
5740 // load toMont(2^-n mod q)
5741 __ add(tmpAddr, kyberConsts, 48);
5742 __ ldr(v29, __ Q, tmpAddr);
5743
5744 vs_ldpq(vq, kyberConsts);
5745 __ add(tmpAddr, coeffs, 0);
5746 load64shorts(vs1, tmpAddr);
5747 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5748 __ add(tmpAddr, coeffs, 0);
5749 store64shorts(vs2, tmpAddr);
5750
// now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
5752 load64shorts(vs1, tmpAddr);
5753 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5754 __ add(tmpAddr, coeffs, 128);
5755 store64shorts(vs2, tmpAddr);
5756
5757 // now tmpAddr contains coeffs + 256
5758 load64shorts(vs1, tmpAddr);
5759 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5760 __ add(tmpAddr, coeffs, 256);
5761 store64shorts(vs2, tmpAddr);
5762
5763 // now tmpAddr contains coeffs + 384
5764 load64shorts(vs1, tmpAddr);
5765 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5766 __ add(tmpAddr, coeffs, 384);
5767 store64shorts(vs2, tmpAddr);
5768
5769 __ leave(); // required for proper stackwalking of RuntimeStub frame
5770 __ mov(r0, zr); // return 0
5771 __ ret(lr);
5772
5773 return start;
5774 }
5775
5776 // Kyber multiply polynomials in the NTT domain.
5777 // Implements
5778 // static int implKyberNttMult(
5779 // short[] result, short[] ntta, short[] nttb, short[] zetas) {}
5780 //
5781 // result (short[256]) = c_rarg0
5782 // ntta (short[256]) = c_rarg1
5783 // nttb (short[256]) = c_rarg2
5784 // zetas (short[128]) = c_rarg3
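//
// The multiplication is performed pairwise in the NTT domain: modulo
// (X^2 - zeta) the product of two linear factors is
//   (a0 + a1*X) * (b0 + b1*X) = (a0*b0 + a1*b1*zeta) + (a0*b1 + a1*b0)*X
// which is what the cross-product montmuls and additions in the loop
// below compute, plus a final montmul by montRSquareModQ to fix up the
// accumulated Montgomery factors (see the comment where v27 is loaded).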
5785 address generate_kyberNttMult() {
5786
5787 __ align(CodeEntryAlignment);
5788 StubId stub_id = StubId::stubgen_kyberNttMult_id;
5789 StubCodeMark mark(this, stub_id);
5790 address start = __ pc();
5791 __ enter();
5792
5793 const Register result = c_rarg0;
5794 const Register ntta = c_rarg1;
5795 const Register nttb = c_rarg2;
5796 const Register zetas = c_rarg3;
5797
5798 const Register kyberConsts = r10;
5799 const Register limit = r11;
5800
5801 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
5802 VSeq<4> vs3(16), vs4(20);
5803 VSeq<2> vq(30); // pair of constants for montmul: q, qinv
5804 VSeq<2> vz(28); // pair of zetas
5805 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
5806
5807 __ lea(kyberConsts,
5808 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5809
5810 Label kyberNttMult_loop;
5811
5812 __ add(limit, result, 512);
5813
5814 // load q and qinv
5815 vs_ldpq(vq, kyberConsts);
5816
5817 // load R^2 mod q (to convert back from Montgomery representation)
5818 __ add(kyberConsts, kyberConsts, 64);
5819 __ ldr(v27, __ Q, kyberConsts);
5820
5821 __ BIND(kyberNttMult_loop);
5822
5823 // load 16 zetas
5824 vs_ldpq_post(vz, zetas);
5825
5826 // load 2 sets of 32 coefficients from the two input arrays
5827 // interleaved as shorts. i.e. pairs of shorts adjacent in memory
5828 // are striped across pairs of vector registers
5829 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
5830 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
5831 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
5832 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
5833
5834 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
5835 // i.e. montmul the first and second halves of vs1 in order and
5836 // then with one sequence reversed storing the two results in vs3
5837 //
5838 // vs3[0] <- montmul(a0, b0)
5839 // vs3[1] <- montmul(a1, b1)
5840 // vs3[2] <- montmul(a0, b1)
5841 // vs3[3] <- montmul(a1, b0)
5842 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
5843 kyber_montmul16(vs_back(vs3),
5844 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
5845
5846 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
5847 // i.e. montmul the first and second halves of vs4 in order and
5848 // then with one sequence reversed storing the two results in vs1
5849 //
5850 // vs1[0] <- montmul(a2, b2)
5851 // vs1[1] <- montmul(a3, b3)
5852 // vs1[2] <- montmul(a2, b3)
5853 // vs1[3] <- montmul(a3, b2)
5854 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
5855 kyber_montmul16(vs_back(vs1),
5856 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
5857
5858 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
5859 // We can schedule two montmuls at a time if we use a suitable vector
5860 // sequence <vs3[1], vs1[1]>.
5861 int delta = vs1[1]->encoding() - vs3[1]->encoding();
5862 VSeq<2> vs5(vs3[1], delta);
5863
5864 // vs3[1] <- montmul(montmul(a1, b1), z0)
5865 // vs1[1] <- montmul(montmul(a3, b3), z1)
5866 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
5867
5868 // add results in pairs storing in vs3
5869 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
5870 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
5871 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
5872
5873 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
5874 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
5875 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
5876
5877 // vs1 <- montmul(vs3, montRSquareModQ)
5878 kyber_montmul32(vs1, vs3, vc, vs2, vq);
5879
// store back the two pairs of result vectors interleaved as 8H elements
// i.e. each pair of shorts striped across a register pair is written back
// to adjacent locations in memory
5883 vs_st2_post(vs1, __ T8H, result);
5884
5885 __ cmp(result, limit);
5886 __ br(Assembler::NE, kyberNttMult_loop);
5887
5888 __ leave(); // required for proper stackwalking of RuntimeStub frame
5889 __ mov(r0, zr); // return 0
5890 __ ret(lr);
5891
5892 return start;
5893 }
5894
5895 // Kyber add 2 polynomials.
5896 // Implements
5897 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
5898 //
5899 // result (short[256]) = c_rarg0
5900 // a (short[256]) = c_rarg1
5901 // b (short[256]) = c_rarg2
5902 address generate_kyberAddPoly_2() {
5903
5904 __ align(CodeEntryAlignment);
5905 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
5906 StubCodeMark mark(this, stub_id);
5907 address start = __ pc();
5908 __ enter();
5909
5910 const Register result = c_rarg0;
5911 const Register a = c_rarg1;
5912 const Register b = c_rarg2;
5913
5914 const Register kyberConsts = r11;
5915
5916 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
5917 // So, we can load, add and store the data in 3 groups of 11,
5918 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
5919 // registers. A further constraint is that the mapping needs
5920 // to skip callee saves. So, we allocate the register
5921 // sequences using two 8 sequences, two 2 sequences and two
5922 // single registers.
5923 VSeq<8> vs1_1(0);
5924 VSeq<2> vs1_2(16);
5925 FloatRegister vs1_3 = v28;
5926 VSeq<8> vs2_1(18);
5927 VSeq<2> vs2_2(26);
5928 FloatRegister vs2_3 = v29;
5929
5930 // two constant vector sequences
5931 VSeq<8> vc_1(31, 0);
5932 VSeq<2> vc_2(31, 0);
5933
5934 FloatRegister vc_3 = v31;
5935 __ lea(kyberConsts,
5936 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5937
5938 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
5939 for (int i = 0; i < 3; i++) {
5940 // load 80 or 88 values from a into vs1_1/2/3
5941 vs_ldpq_post(vs1_1, a);
5942 vs_ldpq_post(vs1_2, a);
5943 if (i < 2) {
5944 __ ldr(vs1_3, __ Q, __ post(a, 16));
5945 }
5946 // load 80 or 88 values from b into vs2_1/2/3
5947 vs_ldpq_post(vs2_1, b);
5948 vs_ldpq_post(vs2_2, b);
5949 if (i < 2) {
5950 __ ldr(vs2_3, __ Q, __ post(b, 16));
5951 }
5952 // sum 80 or 88 values across vs1 and vs2 into vs1
5953 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
5954 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
5955 if (i < 2) {
5956 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
5957 }
5958 // add constant to all 80 or 88 results
5959 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
5960 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
5961 if (i < 2) {
5962 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
5963 }
5964 // store 80 or 88 values
5965 vs_stpq_post(vs1_1, result);
5966 vs_stpq_post(vs1_2, result);
5967 if (i < 2) {
5968 __ str(vs1_3, __ Q, __ post(result, 16));
5969 }
5970 }
5971
5972 __ leave(); // required for proper stackwalking of RuntimeStub frame
5973 __ mov(r0, zr); // return 0
5974 __ ret(lr);
5975
5976 return start;
5977 }
5978
5979 // Kyber add 3 polynomials.
5980 // Implements
5981 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
5982 //
5983 // result (short[256]) = c_rarg0
5984 // a (short[256]) = c_rarg1
5985 // b (short[256]) = c_rarg2
5986 // c (short[256]) = c_rarg3
5987 address generate_kyberAddPoly_3() {
5988
5989 __ align(CodeEntryAlignment);
5990 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
5991 StubCodeMark mark(this, stub_id);
5992 address start = __ pc();
5993 __ enter();
5994
5995 const Register result = c_rarg0;
5996 const Register a = c_rarg1;
5997 const Register b = c_rarg2;
5998 const Register c = c_rarg3;
5999
6000 const Register kyberConsts = r11;
6001
6002 // As above we sum 256 sets of values in total i.e. 32 x 8H
6003 // quadwords. So, we can load, add and store the data in 3
6004 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6005 // of 10 or 11 registers. A further constraint is that the
6006 // mapping needs to skip callee saves. So, we allocate the
6007 // register sequences using two 8 sequences, two 2 sequences
6008 // and two single registers.
6009 VSeq<8> vs1_1(0);
6010 VSeq<2> vs1_2(16);
6011 FloatRegister vs1_3 = v28;
6012 VSeq<8> vs2_1(18);
6013 VSeq<2> vs2_2(26);
6014 FloatRegister vs2_3 = v29;
6015
6016 // two constant vector sequences
6017 VSeq<8> vc_1(31, 0);
6018 VSeq<2> vc_2(31, 0);
6019
6020 FloatRegister vc_3 = v31;
6021
6022 __ lea(kyberConsts,
6023 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6024
6025 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6026 for (int i = 0; i < 3; i++) {
6027 // load 80 or 88 values from a into vs1_1/2/3
6028 vs_ldpq_post(vs1_1, a);
6029 vs_ldpq_post(vs1_2, a);
6030 if (i < 2) {
6031 __ ldr(vs1_3, __ Q, __ post(a, 16));
6032 }
6033 // load 80 or 88 values from b into vs2_1/2/3
6034 vs_ldpq_post(vs2_1, b);
6035 vs_ldpq_post(vs2_2, b);
6036 if (i < 2) {
6037 __ ldr(vs2_3, __ Q, __ post(b, 16));
6038 }
6039 // sum 80 or 88 values across vs1 and vs2 into vs1
6040 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6041 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6042 if (i < 2) {
6043 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6044 }
6045 // load 80 or 88 values from c into vs2_1/2/3
6046 vs_ldpq_post(vs2_1, c);
6047 vs_ldpq_post(vs2_2, c);
6048 if (i < 2) {
6049 __ ldr(vs2_3, __ Q, __ post(c, 16));
6050 }
6051 // sum 80 or 88 values across vs1 and vs2 into vs1
6052 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6053 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6054 if (i < 2) {
6055 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6056 }
6057 // add constant to all 80 or 88 results
6058 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6059 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6060 if (i < 2) {
6061 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6062 }
6063 // store 80 or 88 values
6064 vs_stpq_post(vs1_1, result);
6065 vs_stpq_post(vs1_2, result);
6066 if (i < 2) {
6067 __ str(vs1_3, __ Q, __ post(result, 16));
6068 }
6069 }
6070
6071 __ leave(); // required for proper stackwalking of RuntimeStub frame
6072 __ mov(r0, zr); // return 0
6073 __ ret(lr);
6074
6075 return start;
6076 }
6077
6078 // Kyber parse XOF output to polynomial coefficient candidates
6079 // or decodePoly(12, ...).
6080 // Implements
6081 // static int implKyber12To16(
6082 // byte[] condensed, int index, short[] parsed, int parsedLength) {}
6083 //
6084 // we assume that parsed and condensed are allocated such that for
6085 // n = (parsedLength + 63) / 64
6086 // n blocks of 96 bytes of input can be processed, i.e.
6087 // index + n * 96 <= condensed.length and
6088 // n * 64 <= parsed.length
6089 //
6090 // condensed (byte[]) = c_rarg0
6091 // condensedIndex = c_rarg1
6092 // parsed (short[]) = c_rarg2
6093 // parsedLength = c_rarg3
6094 address generate_kyber12To16() {
6095 Label L_F00, L_loop;
6096
6097 __ align(CodeEntryAlignment);
6098 StubId stub_id = StubId::stubgen_kyber12To16_id;
6099 StubCodeMark mark(this, stub_id);
6100 address start = __ pc();
6101 __ enter();
6102
6103 const Register condensed = c_rarg0;
6104 const Register condensedOffs = c_rarg1;
6105 const Register parsed = c_rarg2;
6106 const Register parsedLength = c_rarg3;
6107
6108 const Register tmpAddr = r11;
6109
6110 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
6111 // quadwords so we need a 6 vector sequence for the inputs.
6112 // Parsing produces 64 shorts, employing two 8 vector
6113 // sequences to store and combine the intermediate data.
6114 VSeq<6> vin(24);
6115 VSeq<8> va(0), vb(16);
6116
6117 __ adr(tmpAddr, L_F00);
6118 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6119 __ add(condensed, condensed, condensedOffs);
6120
6121 __ BIND(L_loop);
6122 // load 96 (6 x 16B) byte values
6123 vs_ld3_post(vin, __ T16B, condensed);
6124
6125 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6126 // holds 48 (16x3) contiguous bytes from memory striped
6127 // horizontally across each of the 16 byte lanes. Equivalently,
6128 // that is 16 pairs of 12-bit integers. Likewise the back half
6129 // holds the next 48 bytes in the same arrangement.
6130
6131 // Each vector in the front half can also be viewed as a vertical
6132 // strip across the 16 pairs of 12 bit integers. Each byte in
6133 // vin[0] stores the low 8 bits of the first int in a pair. Each
6134 // byte in vin[1] stores the high 4 bits of the first int and the
6135 // low 4 bits of the second int. Each byte in vin[2] stores the
// high 8 bits of the second int. Likewise for the vectors in the
// second half.
6138
6139 // Converting the data to 16-bit shorts requires first of all
6140 // expanding each of the 6 x 16B vectors into 6 corresponding
6141 // pairs of 8H vectors. Mask, shift and add operations on the
6142 // resulting vector pairs can be used to combine 4 and 8 bit
6143 // parts of related 8H vector elements.
6144 //
// The middle vectors (vin[1] and vin[4]) are actually expanded
// twice, one copy manipulated to provide the high 4 bits belonging
// to the first short in a pair and another copy manipulated to
// provide the low 4 bits belonging to the second short in a pair.
// This is why the vector sequences va and vb used to hold the
// expanded 8H elements are of length 8.
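//
// As a worked example for one byte lane, if the three stripe bytes are
// b0 = 0xAB, b1 = 0xCD and b2 = 0xEF then the two 12-bit values
// recovered below are
//   first  = b0 | ((b1 & 0x0F) << 8) = 0xDAB
//   second = (b1 >> 4) | (b2 << 4)   = 0xEFC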
6151
6152 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6153 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6154 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6155 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6156 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6157 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6158 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6159 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6160
6161 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6162 // and vb[4:5]
6163 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6164 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6165 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6166 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6167 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6168 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6169
6170 // shift lo byte of copy 1 of the middle stripe into the high byte
6171 __ shl(va[2], __ T8H, va[2], 8);
6172 __ shl(va[3], __ T8H, va[3], 8);
6173 __ shl(vb[2], __ T8H, vb[2], 8);
6174 __ shl(vb[3], __ T8H, vb[3], 8);
6175
6176 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6177 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6178 // are in bit positions [4..11].
6179 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6180 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6181 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6182 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
6183
6184 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
6185 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
6186 // copy2
6187 __ andr(va[2], __ T16B, va[2], v31);
6188 __ andr(va[3], __ T16B, va[3], v31);
6189 __ ushr(va[4], __ T8H, va[4], 4);
6190 __ ushr(va[5], __ T8H, va[5], 4);
6191 __ andr(vb[2], __ T16B, vb[2], v31);
6192 __ andr(vb[3], __ T16B, vb[3], v31);
6193 __ ushr(vb[4], __ T8H, vb[4], 4);
6194 __ ushr(vb[5], __ T8H, vb[5], 4);
6195
6196 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
6197 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
6198 // n.b. the ordering ensures: i) inputs are consumed before they
6199 // are overwritten ii) the order of 16-bit results across successive
6200 // pairs of vectors in va and then vb reflects the order of the
6201 // corresponding 12-bit inputs
6202 __ addv(va[0], __ T8H, va[0], va[2]);
6203 __ addv(va[2], __ T8H, va[1], va[3]);
6204 __ addv(va[1], __ T8H, va[4], va[6]);
6205 __ addv(va[3], __ T8H, va[5], va[7]);
6206 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6207 __ addv(vb[2], __ T8H, vb[1], vb[3]);
6208 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6209 __ addv(vb[3], __ T8H, vb[5], vb[7]);
6210
6211 // store 64 results interleaved as shorts
6212 vs_st2_post(vs_front(va), __ T8H, parsed);
6213 vs_st2_post(vs_front(vb), __ T8H, parsed);
6214
6215 __ sub(parsedLength, parsedLength, 64);
6216 __ cmp(parsedLength, (u1)0);
6217 __ br(Assembler::GT, L_loop);
6218
6219 __ leave(); // required for proper stackwalking of RuntimeStub frame
6220 __ mov(r0, zr); // return 0
6221 __ ret(lr);
6222
6223 // bind label and generate constant data used by this stub
6224 __ BIND(L_F00);
6225 __ emit_int64(0x0f000f000f000f00);
6226 __ emit_int64(0x0f000f000f000f00);
6227
6228 return start;
6229 }
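
  // Scalar sketch of the decode performed by the stub above (illustrative
  // only -- not called by any stub; names are local to this sketch). Each
  // group of 3 condensed bytes expands into two 12-bit values stored as
  // shorts, matching the mask/shift/add sequence used in the vector loop.
  static void kyber12To16_ref(const uint8_t* condensed, int16_t* parsed, int count) {
    for (int i = 0; i < count; i += 2) {
      uint8_t b0 = *condensed++;
      uint8_t b1 = *condensed++;
      uint8_t b2 = *condensed++;
      parsed[i]     = (int16_t)(b0 | ((b1 & 0x0f) << 8)); // lo 8 bits | hi 4 bits
      parsed[i + 1] = (int16_t)((b1 >> 4) | (b2 << 4));   // lo 4 bits | hi 8 bits
    }
  }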
6230
6231 // Kyber Barrett reduce function.
6232 // Implements
6233 // static int implKyberBarrettReduce(short[] coeffs) {}
6234 //
6235 // coeffs (short[256]) = c_rarg0
6236 address generate_kyberBarrettReduce() {
6237
6238 __ align(CodeEntryAlignment);
6239 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6240 StubCodeMark mark(this, stub_id);
6241 address start = __ pc();
6242 __ enter();
6243
6244 const Register coeffs = c_rarg0;
6245
6246 const Register kyberConsts = r10;
6247 const Register result = r11;
6248
6249 // As above we process 256 sets of values in total i.e. 32 x
    // 8H quadwords. So, we can load, reduce and store the data in 3
6251 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6252 // of 10 or 11 registers. A further constraint is that the
6253 // mapping needs to skip callee saves. So, we allocate the
6254 // register sequences using two 8 sequences, two 2 sequences
6255 // and two single registers.
6256 VSeq<8> vs1_1(0);
6257 VSeq<2> vs1_2(16);
6258 FloatRegister vs1_3 = v28;
6259 VSeq<8> vs2_1(18);
6260 VSeq<2> vs2_2(26);
6261 FloatRegister vs2_3 = v29;
6262
6263 // we also need a pair of corresponding constant sequences
6264
6265 VSeq<8> vc1_1(30, 0);
6266 VSeq<2> vc1_2(30, 0);
6267 FloatRegister vc1_3 = v30; // for kyber_q
6268
6269 VSeq<8> vc2_1(31, 0);
6270 VSeq<2> vc2_2(31, 0);
6271 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
6272
6273 __ add(result, coeffs, 0);
6274 __ lea(kyberConsts,
6275 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6276
6277 // load q and the multiplier for the Barrett reduction
6278 __ add(kyberConsts, kyberConsts, 16);
6279 __ ldpq(vc1_3, vc2_3, kyberConsts);
6280
6281 for (int i = 0; i < 3; i++) {
6282 // load 80 or 88 coefficients
6283 vs_ldpq_post(vs1_1, coeffs);
6284 vs_ldpq_post(vs1_2, coeffs);
6285 if (i < 2) {
6286 __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
6287 }
6288
6289 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
6290 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
6291 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
6292 if (i < 2) {
6293 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
6294 }
6295
6296 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
6297 vs_sshr(vs2_1, __ T8H, vs2_1, 11);
6298 vs_sshr(vs2_2, __ T8H, vs2_2, 11);
6299 if (i < 2) {
6300 __ sshr(vs2_3, __ T8H, vs2_3, 11);
6301 }
6302
6303 // vs1 <- vs1 - vs2 * kyber_q
6304 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
6305 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
6306 if (i < 2) {
6307 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
6308 }
6309
6310 vs_stpq_post(vs1_1, result);
6311 vs_stpq_post(vs1_2, result);
6312 if (i < 2) {
6313 __ str(vs1_3, __ Q, __ post(result, 16));
6314 }
6315 }
6316
6317 __ leave(); // required for proper stackwalking of RuntimeStub frame
6318 __ mov(r0, zr); // return 0
6319 __ ret(lr);
6320
6321 return start;
6322 }
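
  // Scalar sketch of the per-coefficient Barrett reduction generated above
  // (illustrative only -- not called by any stub). q and barrettMultiplier
  // stand for the two constants loaded from _kyberConsts; the vector code
  // reaches the same ">> 26" via sqdmulh (doubling multiply, >> 16)
  // followed by sshr #11.
  static int16_t kyberBarrettReduce_ref(int16_t c, int16_t q, int16_t barrettMultiplier) {
    int32_t t = ((int32_t)c * barrettMultiplier) >> 26;
    return (int16_t)(c - t * q);
  }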
6323
6324
6325 // Dilithium-specific montmul helper routines that generate parallel
6326 // code for, respectively, a single 4x4s vector sequence montmul or
6327 // two such multiplies in a row.
6328
6329 // Perform 16 32-bit Montgomery multiplications in parallel
6330 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
6331 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6332 // Use the helper routine to schedule a 4x4S Montgomery multiply.
6333 // It will assert that the register use is valid
6334 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
6335 }
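
  // Scalar sketch of the 32-bit Montgomery multiplication that these
  // helpers vectorize (illustrative only -- not called by any stub).
  // q and qInv stand for the Dilithium constants loaded from
  // _dilithiumConsts, with R = 2^32 and qInv assumed to satisfy
  // qInv * q == 1 (mod 2^32); the result is congruent to a * b * R^-1 mod q.
  static int32_t dilithium_montmul_ref(int32_t a, int32_t b, int32_t q, int32_t qInv) {
    int64_t t = (int64_t)a * b;
    int32_t m = (int32_t)((uint32_t)t * (uint32_t)qInv); // t * q^-1 mod 2^32
    return (int32_t)((t - (int64_t)m * q) >> 32);        // exact division by R
  }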
6336
6337 // Perform 2x16 32-bit Montgomery multiplications in parallel
6338 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
6339 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6340 // Schedule two successive 4x4S multiplies via the montmul helper
6341 // on the front and back halves of va, vb and vc. The helper will
6342 // assert that the register use has no overlap conflicts on each
6343 // individual call but we also need to ensure that the necessary
6344 // disjoint/equality constraints are met across both calls.
6345
6346 // vb, vc, vtmp and vq must be disjoint. va must either be
6347 // disjoint from all other registers or equal vc
6348
6349 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6350 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6351 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6352
6353 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6354 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6355
6356 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6357
6358 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6359 assert(vs_disjoint(va, vb), "va and vb overlap");
6360 assert(vs_disjoint(va, vq), "va and vq overlap");
6361 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6362
6363 // We multiply the front and back halves of each sequence 4 at a
6364 // time because
6365 //
6366 // 1) we are currently only able to get 4-way instruction
6367 // parallelism at best
6368 //
6369 // 2) we need registers for the constants in vq and temporary
6370 // scratch registers to hold intermediate results so vtmp can only
6371 // be a VSeq<4> which means we only have 4 scratch slots.
6372
6373 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6374 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6375 }
6376
6377 // Perform combined montmul then add/sub on 4x4S vectors.
6378 void dilithium_montmul16_sub_add(
6379 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6380 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6381 // compute a = montmul(a1, c)
6382 dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
6384 vs_subv(va1, __ T4S, va0, vc);
6385 // and a0 = a0 + a
6386 vs_addv(va0, __ T4S, va0, vc);
6387 }
6388
  // Perform combined add/sub then montmul on 4x4S vectors.
6390 void dilithium_sub_add_montmul16(
6391 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6392 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6393 // compute c = a0 - a1
6394 vs_subv(vtmp1, __ T4S, va0, va1);
6395 // output a0 = a0 + a1
6396 vs_addv(va0, __ T4S, va0, va1);
6397 // output a1 = b montmul c
6398 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6399 }
6400
6401 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6402 // in the Java implementation come in sequences of at least 8, so we
6403 // can use ldpq to collect the corresponding data into pairs of vector
6404 // registers.
6405 // We collect the coefficients corresponding to the 'j+l' indexes into
6406 // the vector registers v0-v7, the zetas into the vector registers v16-v23
6407 // then we do the (Montgomery) multiplications by the zetas in parallel
6408 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6409 // v0-v7, then do the additions into v24-v31 and the subtractions into
6410 // v0-v7 and finally save the results back to the coeffs array.
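  // For reference, the per-coefficient (Cooley-Tukey) butterfly applied at
  // each of these levels is, in Java-like scalar form (a sketch; names are
  // illustrative, montmul is the Montgomery multiplication sketched above):
  //   int t = montmul(zetas[k], coeffs[j + l]);
  //   coeffs[j + l] = coeffs[j] - t;
  //   coeffs[j]     = coeffs[j] + t;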
6411 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6412 const Register coeffs, const Register zetas) {
6413 int c1 = 0;
6414 int c2 = 512;
6415 int startIncr;
6416 // don't use callee save registers v8 - v15
6417 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6418 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6419 VSeq<2> vq(30); // n.b. constants overlap vs3
6420 int offsets[4] = { 0, 32, 64, 96 };
6421
6422 for (int level = 0; level < 5; level++) {
6423 int c1Start = c1;
6424 int c2Start = c2;
6425 if (level == 3) {
6426 offsets[1] = 32;
6427 offsets[2] = 128;
6428 offsets[3] = 160;
6429 } else if (level == 4) {
6430 offsets[1] = 64;
6431 offsets[2] = 128;
6432 offsets[3] = 192;
6433 }
6434
      // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
6436 // time at 4 different offsets and multiply them in order by the
6437 // next set of input values. So we employ indexed load and store
6438 // pair instructions with arrangement 4S.
6439 for (int i = 0; i < 4; i++) {
6440 // reload q and qinv
6441 vs_ldpq(vq, dilithiumConsts); // qInv, q
6442 // load 8x4S coefficients via second start pos == c2
6443 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
6444 // load next 8x4S inputs == b
6445 vs_ldpq_post(vs2, zetas);
6446 // compute a == c2 * b mod MONT_Q
6447 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6448 // load 8x4s coefficients via first start pos == c1
6449 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6450 // compute a1 = c1 + a
6451 vs_addv(vs3, __ T4S, vs1, vs2);
6452 // compute a2 = c1 - a
6453 vs_subv(vs1, __ T4S, vs1, vs2);
6454 // output a1 and a2
6455 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6456 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
6457
6458 int k = 4 * level + i;
6459
6460 if (k > 7) {
6461 startIncr = 256;
6462 } else if (k == 5) {
6463 startIncr = 384;
6464 } else {
6465 startIncr = 128;
6466 }
6467
6468 c1Start += startIncr;
6469 c2Start += startIncr;
6470 }
6471
6472 c2 /= 2;
6473 }
6474 }
6475
6476 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
6477 // Implements the method
  //   static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
  // of the sun.security.provider.ML_DSA class.
6480 //
6481 // coeffs (int[256]) = c_rarg0
6482 // zetas (int[256]) = c_rarg1
6483 address generate_dilithiumAlmostNtt() {
6484
6485 __ align(CodeEntryAlignment);
6486 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
6487 StubCodeMark mark(this, stub_id);
6488 address start = __ pc();
6489 __ enter();
6490
6491 const Register coeffs = c_rarg0;
6492 const Register zetas = c_rarg1;
6493
6494 const Register tmpAddr = r9;
6495 const Register dilithiumConsts = r10;
6496 const Register result = r11;
6497 // don't use callee save registers v8 - v15
6498 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6499 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6500 VSeq<2> vq(30); // n.b. constants overlap vs3
6501 int offsets[4] = { 0, 32, 64, 96};
6502 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6503 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6504 __ add(result, coeffs, 0);
6505 __ lea(dilithiumConsts,
6506 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6507
6508 // Each level represents one iteration of the outer for loop of the Java version.
6509
6510 // level 0-4
6511 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
6512
6513 // level 5
6514
6515 // At level 5 the coefficients we need to combine with the zetas
6516 // are grouped in memory in blocks of size 4. So, for both sets of
6517 // coefficients we load 4 adjacent values at 8 different offsets
6518 // using an indexed ldr with register variant Q and multiply them
6519 // in sequence order by the next set of inputs. Likewise we store
    // the results using an indexed str with register variant Q.
6521 for (int i = 0; i < 1024; i += 256) {
6522 // reload constants q, qinv each iteration as they get clobbered later
6523 vs_ldpq(vq, dilithiumConsts); // qInv, q
6524 // load 32 (8x4S) coefficients via first offsets = c1
6525 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6526 // load next 32 (8x4S) inputs = b
6527 vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
6529 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6530 // load 32 (8x4S) coefficients via second offsets = c2
6531 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
6532 // add/sub with result of multiply
      vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2); // a2 = c2 - a
6535 // write back new coefficients using same offsets
6536 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
6537 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
6538 }
6539
6540 // level 6
6541 // At level 6 the coefficients we need to combine with the zetas
    // are grouped in memory in pairs of int pairs, the first two ints of
    // each group being the add/sub inputs and the second two the montmul
    // inputs. We can still implement
6544 // the montmul+sub+add using 4-way parallelism but only if we
6545 // combine the coefficients with the zetas 16 at a time. We load 8
6546 // adjacent values at 4 different offsets using an ld2 load with
6547 // arrangement 2D. That interleaves the lower and upper halves of
6548 // each pair of quadwords into successive vector registers. We
6549 // then need to montmul the 4 even elements of the coefficients
6550 // register sequence by the zetas in order and then add/sub the 4
6551 // odd elements of the coefficients register sequence. We use an
6552 // equivalent st2 operation to store the results back into memory
6553 // de-interleaved.
6554 for (int i = 0; i < 1024; i += 128) {
6555 // reload constants q, qinv each iteration as they get clobbered later
6556 vs_ldpq(vq, dilithiumConsts); // qInv, q
6557 // load interleaved 16 (4x2D) coefficients via offsets
6558 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6559 // load next 16 (4x4S) inputs
6560 vs_ldpq_post(vs_front(vs2), zetas);
6561 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6562 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6563 vs_front(vs2), vtmp, vq);
6564 // store interleaved 16 (4x2D) coefficients via offsets
6565 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6566 }
6567
6568 // level 7
6569 // At level 7 the coefficients we need to combine with the zetas
    // occur singly, with montmul inputs alternating with add/sub
6571 // inputs. Once again we can use 4-way parallelism to combine 16
6572 // zetas at a time. However, we have to load 8 adjacent values at
6573 // 4 different offsets using an ld2 load with arrangement 4S. That
    // interleaves the odd words of each pair into one
6575 // coefficients vector register and the even words of the pair
6576 // into the next register. We then need to montmul the 4 even
6577 // elements of the coefficients register sequence by the zetas in
6578 // order and then add/sub the 4 odd elements of the coefficients
6579 // register sequence. We use an equivalent st2 operation to store
6580 // the results back into memory de-interleaved.
6581
6582 for (int i = 0; i < 1024; i += 128) {
6583 // reload constants q, qinv each iteration as they get clobbered later
6584 vs_ldpq(vq, dilithiumConsts); // qInv, q
6585 // load interleaved 16 (4x4S) coefficients via offsets
6586 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6587 // load next 16 (4x4S) inputs
6588 vs_ldpq_post(vs_front(vs2), zetas);
6589 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6590 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6591 vs_front(vs2), vtmp, vq);
6592 // store interleaved 16 (4x4S) coefficients via offsets
6593 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6594 }
6595 __ leave(); // required for proper stackwalking of RuntimeStub frame
6596 __ mov(r0, zr); // return 0
6597 __ ret(lr);
6598
6599 return start;
6600 }
6601
6602 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6603 // in the Java implementation come in sequences of at least 8, so we
6604 // can use ldpq to collect the corresponding data into pairs of vector
6605 // registers
6606 // We collect the coefficients that correspond to the 'j's into vs1
  // the coefficients that correspond to the 'j+l's into vs2 then
6608 // do the additions into vs3 and the subtractions into vs1 then
6609 // save the result of the additions, load the zetas into vs2
6610 // do the (Montgomery) multiplications by zeta in parallel into vs2
6611 // finally save the results back to the coeffs array
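  // For reference, the per-coefficient (Gentleman-Sande) butterfly applied
  // at each of these levels is, in Java-like scalar form (a sketch; names
  // are illustrative):
  //   int t = coeffs[j];
  //   coeffs[j]     = t + coeffs[j + l];
  //   coeffs[j + l] = montmul(zetas[k], t - coeffs[j + l]);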
6612 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
6613 const Register coeffs, const Register zetas) {
6614 int c1 = 0;
6615 int c2 = 32;
6616 int startIncr;
6617 int offsets[4];
6618 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6619 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6620 VSeq<2> vq(30); // n.b. constants overlap vs3
6621
6622 offsets[0] = 0;
6623
6624 for (int level = 3; level < 8; level++) {
6625 int c1Start = c1;
6626 int c2Start = c2;
6627 if (level == 3) {
6628 offsets[1] = 64;
6629 offsets[2] = 128;
6630 offsets[3] = 192;
6631 } else if (level == 4) {
6632 offsets[1] = 32;
6633 offsets[2] = 128;
6634 offsets[3] = 160;
6635 } else {
6636 offsets[1] = 32;
6637 offsets[2] = 64;
6638 offsets[3] = 96;
6639 }
6640
6641 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
6642 // time at 4 different offsets and multiply them in order by the
6643 // next set of input values. So we employ indexed load and store
6644 // pair instructions with arrangement 4S.
6645 for (int i = 0; i < 4; i++) {
6646 // load v1 32 (8x4S) coefficients relative to first start index
6647 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6648 // load v2 32 (8x4S) coefficients relative to second start index
6649 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
        // a0 = v1 + v2 -- n.b. clobbers vq
6651 vs_addv(vs3, __ T4S, vs1, vs2);
6652 // a1 = v1 - v2
6653 vs_subv(vs1, __ T4S, vs1, vs2);
        // save a0 relative to first start index
6655 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6656 // load constants q, qinv each iteration as they get clobbered above
6657 vs_ldpq(vq, dilithiumConsts); // qInv, q
6658 // load b next 32 (8x4S) inputs
6659 vs_ldpq_post(vs2, zetas);
6660 // a = a1 montmul b
6661 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6662 // save a relative to second start index
6663 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
6664
6665 int k = 4 * level + i;
6666
6667 if (k < 24) {
6668 startIncr = 256;
6669 } else if (k == 25) {
6670 startIncr = 384;
6671 } else {
6672 startIncr = 128;
6673 }
6674
6675 c1Start += startIncr;
6676 c2Start += startIncr;
6677 }
6678
6679 c2 *= 2;
6680 }
6681 }
6682
6683 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
6684 // Implements the method
6685 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
6686 // the sun.security.provider.ML_DSA class.
6687 //
6688 // coeffs (int[256]) = c_rarg0
6689 // zetas (int[256]) = c_rarg1
6690 address generate_dilithiumAlmostInverseNtt() {
6691
6692 __ align(CodeEntryAlignment);
6693 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
6694 StubCodeMark mark(this, stub_id);
6695 address start = __ pc();
6696 __ enter();
6697
6698 const Register coeffs = c_rarg0;
6699 const Register zetas = c_rarg1;
6700
6701 const Register tmpAddr = r9;
6702 const Register dilithiumConsts = r10;
6703 const Register result = r11;
6704 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6705 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6706 VSeq<2> vq(30); // n.b. constants overlap vs3
6707 int offsets[4] = { 0, 32, 64, 96 };
6708 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6709 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6710
6711 __ add(result, coeffs, 0);
6712 __ lea(dilithiumConsts,
6713 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6714
6715 // Each level represents one iteration of the outer for loop of the Java version
6716
6717 // level 0
6718 // At level 0 we need to interleave adjacent quartets of
6719 // coefficients before we multiply and add/sub by the next 16
6720 // zetas just as we did for level 7 in the multiply code. So we
6721 // load and store the values using an ld2/st2 with arrangement 4S.
6722 for (int i = 0; i < 1024; i += 128) {
6723 // load constants q, qinv
      // n.b. this load could be hoisted out of the loop, as the constants
      // do not get clobbered by the first two loops
6726 vs_ldpq(vq, dilithiumConsts); // qInv, q
6727 // a0/a1 load interleaved 32 (8x4S) coefficients
6728 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6729 // b load next 32 (8x4S) inputs
6730 vs_ldpq_post(vs_front(vs2), zetas);
6731 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6732 // n.b. second half of vs2 provides temporary register storage
6733 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6734 vs_front(vs2), vs_back(vs2), vtmp, vq);
6735 // a0/a1 store interleaved 32 (8x4S) coefficients
6736 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6737 }
6738
6739 // level 1
6740 // At level 1 we need to interleave pairs of adjacent pairs of
6741 // coefficients before we multiply by the next 16 zetas just as we
6742 // did for level 6 in the multiply code. So we load and store the
6743 // values an ld2/st2 with arrangement 2D.
6744 for (int i = 0; i < 1024; i += 128) {
6745 // a0/a1 load interleaved 32 (8x2D) coefficients
6746 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6747 // b load next 16 (4x4S) inputs
6748 vs_ldpq_post(vs_front(vs2), zetas);
6749 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6750 // n.b. second half of vs2 provides temporary register storage
6751 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6752 vs_front(vs2), vs_back(vs2), vtmp, vq);
6753 // a0/a1 store interleaved 32 (8x2D) coefficients
6754 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6755 }
6756
6757 // level 2
6758 // At level 2 coefficients come in blocks of 4. So, we load 4
6759 // adjacent coefficients at 8 distinct offsets for both the first
6760 // and second coefficient sequences, using an ldr with register
6761 // variant Q then combine them with next set of 32 zetas. Likewise
6762 // we store the results using an str with register variant Q.
6763 for (int i = 0; i < 1024; i += 256) {
6764 // c0 load 32 (8x4S) coefficients via first offsets
6765 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6766 // c1 load 32 (8x4S) coefficients via second offsets
      vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
6768 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
6769 vs_addv(vs3, __ T4S, vs1, vs2);
6770 // c = c0 - c1
6771 vs_subv(vs1, __ T4S, vs1, vs2);
6772 // store a0 32 (8x4S) coefficients via first offsets
6773 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
6774 // b load 32 (8x4S) next inputs
6775 vs_ldpq_post(vs2, zetas);
6776 // reload constants q, qinv -- they were clobbered earlier
6777 vs_ldpq(vq, dilithiumConsts); // qInv, q
6778 // compute a1 = b montmul c
6779 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6780 // store a1 32 (8x4S) coefficients via second offsets
6781 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
6782 }
6783
6784 // level 3-7
6785 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
6786
6787 __ leave(); // required for proper stackwalking of RuntimeStub frame
6788 __ mov(r0, zr); // return 0
6789 __ ret(lr);
6790
6791 return start;
6792 }
6793
6794 // Dilithium multiply polynomials in the NTT domain.
6795 // Straightforward implementation of the method
6796 // static int implDilithiumNttMult(
  //   int[] result, int[] ntta, int[] nttb) {} of
6798 // the sun.security.provider.ML_DSA class.
6799 //
6800 // result (int[256]) = c_rarg0
6801 // poly1 (int[256]) = c_rarg1
6802 // poly2 (int[256]) = c_rarg2
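  //
  // Per coefficient this computes, in Java-like scalar form (a sketch;
  // rSquare is assumed to be the Montgomery constant R^2 mod q loaded
  // from _dilithiumConsts, so the net effect is a plain product mod q):
  //   result[i] = montmul(rSquare, montmul(poly1[i], poly2[i]));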
6803 address generate_dilithiumNttMult() {
6804
6805 __ align(CodeEntryAlignment);
6806 StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
6807 StubCodeMark mark(this, stub_id);
6808 address start = __ pc();
6809 __ enter();
6810
6811 Label L_loop;
6812
6813 const Register result = c_rarg0;
6814 const Register poly1 = c_rarg1;
6815 const Register poly2 = c_rarg2;
6816
6817 const Register dilithiumConsts = r10;
6818 const Register len = r11;
6819
6820 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6821 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6822 VSeq<2> vq(30); // n.b. constants overlap vs3
6823 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
6824
6825 __ lea(dilithiumConsts,
6826 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6827
6828 // load constants q, qinv
6829 vs_ldpq(vq, dilithiumConsts); // qInv, q
6830 // load constant rSquare into v29
6831 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
6832
6833 __ mov(len, zr);
6834 __ add(len, len, 1024);
6835
6836 __ BIND(L_loop);
6837
6838 // b load 32 (8x4S) next inputs from poly1
6839 vs_ldpq_post(vs1, poly1);
6840 // c load 32 (8x4S) next inputs from poly2
6841 vs_ldpq_post(vs2, poly2);
6842 // compute a = b montmul c
6843 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6844 // compute a = rsquare montmul a
6845 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
6846 // save a 32 (8x4S) results
6847 vs_stpq_post(vs2, result);
6848
6849 __ sub(len, len, 128);
6850 __ cmp(len, (u1)128);
6851 __ br(Assembler::GE, L_loop);
6852
6853 __ leave(); // required for proper stackwalking of RuntimeStub frame
6854 __ mov(r0, zr); // return 0
6855 __ ret(lr);
6856
6857 return start;
6858 }
6859
  // Dilithium Montgomery multiply an array by a constant.
6861 // A straightforward implementation of the method
6862 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  // of the sun.security.provider.ML_DSA class
6864 //
6865 // coeffs (int[256]) = c_rarg0
6866 // constant (int) = c_rarg1
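  //
  // Per coefficient this computes, in Java-like scalar form (a sketch):
  //   coeffs[i] = montmul(coeffs[i], constant);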
6867 address generate_dilithiumMontMulByConstant() {
6868
6869 __ align(CodeEntryAlignment);
6870 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
6871 StubCodeMark mark(this, stub_id);
6872 address start = __ pc();
6873 __ enter();
6874
6875 Label L_loop;
6876
6877 const Register coeffs = c_rarg0;
6878 const Register constant = c_rarg1;
6879
6880 const Register dilithiumConsts = r10;
6881 const Register result = r11;
6882 const Register len = r12;
6883
6884 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6885 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6886 VSeq<2> vq(30); // n.b. constants overlap vs3
6887 VSeq<8> vconst(29, 0); // for montmul by constant
6888
6889 // results track inputs
6890 __ add(result, coeffs, 0);
6891 __ lea(dilithiumConsts,
6892 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6893
6894 // load constants q, qinv -- they do not get clobbered by first two loops
6895 vs_ldpq(vq, dilithiumConsts); // qInv, q
6896 // copy caller supplied constant across vconst
6897 __ dup(vconst[0], __ T4S, constant);
6898 __ mov(len, zr);
6899 __ add(len, len, 1024);
6900
6901 __ BIND(L_loop);
6902
6903 // load next 32 inputs
6904 vs_ldpq_post(vs2, coeffs);
6905 // mont mul by constant
6906 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
6907 // write next 32 results
6908 vs_stpq_post(vs2, result);
6909
6910 __ sub(len, len, 128);
6911 __ cmp(len, (u1)128);
6912 __ br(Assembler::GE, L_loop);
6913
6914 __ leave(); // required for proper stackwalking of RuntimeStub frame
6915 __ mov(r0, zr); // return 0
6916 __ ret(lr);
6917
6918 return start;
6919 }
6920
6921 // Dilithium decompose poly.
6922 // Implements the method
  //   static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart,
  //                                         int twoGamma2, int multiplier) {}
6924 // of the sun.security.provider.ML_DSA class
6925 //
6926 // input (int[256]) = c_rarg0
6927 // lowPart (int[256]) = c_rarg1
6928 // highPart (int[256]) = c_rarg2
6929 // twoGamma2 (int) = c_rarg3
6930 // multiplier (int) = c_rarg4
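  //
  // For each coefficient this produces the (highPart, lowPart) split used
  // by ML-DSA, i.e. (a sketch of the intent) rplus = input[m] mod q is
  // decomposed so that rplus == highPart[m] * twoGamma2 + lowPart[m] (mod q)
  // with -gamma2 < lowPart[m] <= gamma2, except that when
  // rplus - lowPart[m] == q - 1 the high part is forced to 0 and the low
  // part is decremented by 1.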
6931 address generate_dilithiumDecomposePoly() {
6932
6933 __ align(CodeEntryAlignment);
6934 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
6935 StubCodeMark mark(this, stub_id);
6936 address start = __ pc();
6937 Label L_loop;
6938
6939 const Register input = c_rarg0;
6940 const Register lowPart = c_rarg1;
6941 const Register highPart = c_rarg2;
6942 const Register twoGamma2 = c_rarg3;
6943 const Register multiplier = c_rarg4;
6944
6945 const Register len = r9;
6946 const Register dilithiumConsts = r10;
6947 const Register tmp = r11;
6948
6949 // 6 independent sets of 4x4s values
6950 VSeq<4> vs1(0), vs2(4), vs3(8);
6951 VSeq<4> vs4(12), vs5(16), vtmp(20);
6952
6953 // 7 constants for cross-multiplying
6954 VSeq<4> one(25, 0);
6955 VSeq<4> qminus1(26, 0);
6956 VSeq<4> g2(27, 0);
6957 VSeq<4> twog2(28, 0);
6958 VSeq<4> mult(29, 0);
6959 VSeq<4> q(30, 0);
6960 VSeq<4> qadd(31, 0);
6961
6962 __ enter();
6963
6964 __ lea(dilithiumConsts,
6965 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6966
6967 // save callee-saved registers
6968 __ stpd(v8, v9, __ pre(sp, -64));
6969 __ stpd(v10, v11, Address(sp, 16));
6970 __ stpd(v12, v13, Address(sp, 32));
6971 __ stpd(v14, v15, Address(sp, 48));
6972
6973 // populate constant registers
6974 __ mov(tmp, zr);
6975 __ add(tmp, tmp, 1);
6976 __ dup(one[0], __ T4S, tmp); // 1
6977 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
6978 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
6979 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
6980 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
6981 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
6982 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
6983
6984 __ mov(len, zr);
6985 __ add(len, len, 1024);
6986
6987 __ BIND(L_loop);
6988
6989 // load next 4x4S inputs interleaved: rplus --> vs1
6990 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
6991
6992 // rplus = rplus - ((rplus + qadd) >> 23) * q
6993 vs_addv(vtmp, __ T4S, vs1, qadd);
6994 vs_sshr(vtmp, __ T4S, vtmp, 23);
6995 vs_mulv(vtmp, __ T4S, vtmp, q);
6996 vs_subv(vs1, __ T4S, vs1, vtmp);
6997
6998 // rplus = rplus + ((rplus >> 31) & dilithium_q);
6999 vs_sshr(vtmp, __ T4S, vs1, 31);
7000 vs_andr(vtmp, vtmp, q);
7001 vs_addv(vs1, __ T4S, vs1, vtmp);
7002
7003 // quotient --> vs2
7004 // int quotient = (rplus * multiplier) >> 22;
7005 vs_mulv(vtmp, __ T4S, vs1, mult);
7006 vs_sshr(vs2, __ T4S, vtmp, 22);
7007
7008 // r0 --> vs3
7009 // int r0 = rplus - quotient * twoGamma2;
7010 vs_mulv(vtmp, __ T4S, vs2, twog2);
7011 vs_subv(vs3, __ T4S, vs1, vtmp);
7012
7013 // mask --> vs4
7014 // int mask = (twoGamma2 - r0) >> 22;
7015 vs_subv(vtmp, __ T4S, twog2, vs3);
7016 vs_sshr(vs4, __ T4S, vtmp, 22);
7017
7018 // r0 -= (mask & twoGamma2);
7019 vs_andr(vtmp, vs4, twog2);
7020 vs_subv(vs3, __ T4S, vs3, vtmp);
7021
7022 // quotient += (mask & 1);
7023 vs_andr(vtmp, vs4, one);
7024 vs_addv(vs2, __ T4S, vs2, vtmp);
7025
7026 // mask = (twoGamma2 / 2 - r0) >> 31;
7027 vs_subv(vtmp, __ T4S, g2, vs3);
7028 vs_sshr(vs4, __ T4S, vtmp, 31);
7029
7030 // r0 -= (mask & twoGamma2);
7031 vs_andr(vtmp, vs4, twog2);
7032 vs_subv(vs3, __ T4S, vs3, vtmp);
7033
7034 // quotient += (mask & 1);
7035 vs_andr(vtmp, vs4, one);
7036 vs_addv(vs2, __ T4S, vs2, vtmp);
7037
7038 // r1 --> vs5
7039 // int r1 = rplus - r0 - (dilithium_q - 1);
7040 vs_subv(vtmp, __ T4S, vs1, vs3);
7041 vs_subv(vs5, __ T4S, vtmp, qminus1);
7042
7043 // r1 --> vs1 (overwriting rplus)
7044 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
7045 vs_negr(vtmp, __ T4S, vs5);
7046 vs_orr(vtmp, vs5, vtmp);
7047 vs_sshr(vs1, __ T4S, vtmp, 31);
7048
7049 // r0 += ~r1;
7050 vs_notr(vtmp, vs1);
7051 vs_addv(vs3, __ T4S, vs3, vtmp);
7052
7053 // r1 = r1 & quotient;
7054 vs_andr(vs1, vs2, vs1);
7055
    // store results interleaved
7057 // lowPart[m] = r0;
7058 // highPart[m] = r1;
7059 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
7060 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
7061
7062 __ sub(len, len, 64);
7063 __ cmp(len, (u1)64);
7064 __ br(Assembler::GE, L_loop);
7065
7066 // restore callee-saved vector registers
7067 __ ldpd(v14, v15, Address(sp, 48));
7068 __ ldpd(v12, v13, Address(sp, 32));
7069 __ ldpd(v10, v11, Address(sp, 16));
7070 __ ldpd(v8, v9, __ post(sp, 64));
7071
7072 __ leave(); // required for proper stackwalking of RuntimeStub frame
7073 __ mov(r0, zr); // return 0
7074 __ ret(lr);
7075
7076 return start;
7077 }
7078
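  // Keccak chi step on one 5-lane row, computed in place over a0..a4
  // (bic(x, y) == x & ~y supplies the and-not):
  //   a[i] ^= ~a[(i + 1) % 5] & a[(i + 2) % 5]   for i = 0..4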
7079 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
7080 Register tmp0, Register tmp1, Register tmp2) {
7081 __ bic(tmp0, a2, a1); // for a0
7082 __ bic(tmp1, a3, a2); // for a1
7083 __ bic(tmp2, a4, a3); // for a2
7084 __ eor(a2, a2, tmp2);
7085 __ bic(tmp2, a0, a4); // for a3
7086 __ eor(a3, a3, tmp2);
7087 __ bic(tmp2, a1, a0); // for a4
7088 __ eor(a0, a0, tmp0);
7089 __ eor(a1, a1, tmp1);
7090 __ eor(a4, a4, tmp2);
7091 }
7092
7093 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
7094 Register a0, Register a1, Register a2, Register a3, Register a4,
7095 Register a5, Register a6, Register a7, Register a8, Register a9,
7096 Register a10, Register a11, Register a12, Register a13, Register a14,
7097 Register a15, Register a16, Register a17, Register a18, Register a19,
7098 Register a20, Register a21, Register a22, Register a23, Register a24,
7099 Register tmp0, Register tmp1, Register tmp2) {
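    // theta: compute the five column parities c0..c4, derive
    // d[x] = c[x - 1] ^ rol(c[x + 1], 1) and xor d[x] into every lane of
    // column x (eor3 folds the three-way xors, rax1 the rotate-and-xor).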
7100 __ eor3(tmp1, a4, a9, a14);
7101 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
7102 __ eor3(tmp2, a1, a6, a11);
7103 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
7104 __ rax1(tmp2, tmp0, tmp1); // d0
7105 {
7106
7107 Register tmp3, tmp4;
7108 if (can_use_fp && can_use_r18) {
7109 tmp3 = rfp;
7110 tmp4 = r18_tls;
7111 } else {
7112 tmp3 = a4;
7113 tmp4 = a9;
7114 __ stp(tmp3, tmp4, __ pre(sp, -16));
7115 }
7116
7117 __ eor3(tmp3, a0, a5, a10);
7118 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
7119 __ eor(a0, a0, tmp2);
7120 __ eor(a5, a5, tmp2);
7121 __ eor(a10, a10, tmp2);
7122 __ eor(a15, a15, tmp2);
7123 __ eor(a20, a20, tmp2); // d0(tmp2)
7124 __ eor3(tmp3, a2, a7, a12);
7125 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
7126 __ rax1(tmp3, tmp4, tmp2); // d1
7127 __ eor(a1, a1, tmp3);
7128 __ eor(a6, a6, tmp3);
7129 __ eor(a11, a11, tmp3);
7130 __ eor(a16, a16, tmp3);
7131 __ eor(a21, a21, tmp3); // d1(tmp3)
7132 __ rax1(tmp3, tmp2, tmp0); // d3
7133 __ eor3(tmp2, a3, a8, a13);
7134 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
7135 __ eor(a3, a3, tmp3);
7136 __ eor(a8, a8, tmp3);
7137 __ eor(a13, a13, tmp3);
7138 __ eor(a18, a18, tmp3);
7139 __ eor(a23, a23, tmp3);
7140 __ rax1(tmp2, tmp1, tmp0); // d2
7141 __ eor(a2, a2, tmp2);
7142 __ eor(a7, a7, tmp2);
7143 __ eor(a12, a12, tmp2);
7144 __ rax1(tmp0, tmp0, tmp4); // d4
7145 if (!can_use_fp || !can_use_r18) {
7146 __ ldp(tmp3, tmp4, __ post(sp, 16));
7147 }
7148 __ eor(a17, a17, tmp2);
7149 __ eor(a22, a22, tmp2);
7150 __ eor(a4, a4, tmp0);
7151 __ eor(a9, a9, tmp0);
7152 __ eor(a14, a14, tmp0);
7153 __ eor(a19, a19, tmp0);
7154 __ eor(a24, a24, tmp0);
7155 }
7156
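    // rho + pi: rotate each lane by its rho offset while moving it to its
    // pi destination; the rotated a10 is parked in tmp0 and placed into a7
    // at the end to close the permutation cycle.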
7157 __ rol(tmp0, a10, 3);
7158 __ rol(a10, a1, 1);
7159 __ rol(a1, a6, 44);
7160 __ rol(a6, a9, 20);
7161 __ rol(a9, a22, 61);
7162 __ rol(a22, a14, 39);
7163 __ rol(a14, a20, 18);
7164 __ rol(a20, a2, 62);
7165 __ rol(a2, a12, 43);
7166 __ rol(a12, a13, 25);
7167 __ rol(a13, a19, 8) ;
7168 __ rol(a19, a23, 56);
7169 __ rol(a23, a15, 41);
7170 __ rol(a15, a4, 27);
7171 __ rol(a4, a24, 14);
7172 __ rol(a24, a21, 2);
7173 __ rol(a21, a8, 55);
7174 __ rol(a8, a16, 45);
7175 __ rol(a16, a5, 36);
7176 __ rol(a5, a3, 28);
7177 __ rol(a3, a18, 21);
7178 __ rol(a18, a17, 15);
7179 __ rol(a17, a11, 10);
7180 __ rol(a11, a7, 6);
7181 __ mov(a7, tmp0);
7182
7183 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
7184 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
7185 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
7186 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
7187 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
7188
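    // iota: xor the next round constant into lane (0, 0)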
7189 __ ldr(tmp1, __ post(rc, 8));
7190 __ eor(a0, a0, tmp1);
7191
7192 }
7193
7194 // Arguments:
7195 //
7196 // Inputs:
7197 // c_rarg0 - byte[] source+offset
7198 // c_rarg1 - byte[] SHA.state
7199 // c_rarg2 - int block_size
7200 // c_rarg3 - int offset
7201 // c_rarg4 - int limit
7202 //
7203 address generate_sha3_implCompress_gpr(StubId stub_id) {
7204 bool multi_block;
7205 switch (stub_id) {
7206 case StubId::stubgen_sha3_implCompress_id:
7207 multi_block = false;
7208 break;
7209 case StubId::stubgen_sha3_implCompressMB_id:
7210 multi_block = true;
7211 break;
7212 default:
7213 ShouldNotReachHere();
7214 }
7215
7216 static const uint64_t round_consts[24] = {
7217 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
7218 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
7219 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
7220 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
7221 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
7222 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
7223 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
7224 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
7225 };
7226
7227 __ align(CodeEntryAlignment);
7228 StubCodeMark mark(this, stub_id);
7229 address start = __ pc();
7230
7231 Register buf = c_rarg0;
7232 Register state = c_rarg1;
7233 Register block_size = c_rarg2;
7234 Register ofs = c_rarg3;
7235 Register limit = c_rarg4;
7236
    // use r3..r17, r19..r28 to keep a0..a24.
7238 // a0..a24 are respective locals from SHA3.java
7239 Register a0 = r25,
7240 a1 = r26,
7241 a2 = r27,
7242 a3 = r3,
7243 a4 = r4,
7244 a5 = r5,
7245 a6 = r6,
7246 a7 = r7,
7247 a8 = rscratch1, // r8
7248 a9 = rscratch2, // r9
7249 a10 = r10,
7250 a11 = r11,
7251 a12 = r12,
7252 a13 = r13,
7253 a14 = r14,
7254 a15 = r15,
7255 a16 = r16,
7256 a17 = r17,
7257 a18 = r28,
7258 a19 = r19,
7259 a20 = r20,
7260 a21 = r21,
7261 a22 = r22,
7262 a23 = r23,
7263 a24 = r24;
7264
7265 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
7266
7267 Label sha3_loop, rounds24_preloop, loop_body;
7268 Label sha3_512_or_sha3_384, shake128;
7269
7270 bool can_use_r18 = false;
7271 #ifndef R18_RESERVED
7272 can_use_r18 = true;
7273 #endif
7274 bool can_use_fp = !PreserveFramePointer;
7275
7276 __ enter();
7277
7278 // save almost all yet unsaved gpr registers on stack
7279 __ str(block_size, __ pre(sp, -128));
7280 if (multi_block) {
7281 __ stpw(ofs, limit, Address(sp, 8));
7282 }
7283 // 8 bytes at sp+16 will be used to keep buf
7284 __ stp(r19, r20, Address(sp, 32));
7285 __ stp(r21, r22, Address(sp, 48));
7286 __ stp(r23, r24, Address(sp, 64));
7287 __ stp(r25, r26, Address(sp, 80));
7288 __ stp(r27, r28, Address(sp, 96));
7289 if (can_use_r18 && can_use_fp) {
7290 __ stp(r18_tls, state, Address(sp, 112));
7291 } else {
7292 __ str(state, Address(sp, 112));
7293 }
7294
    // begin sha3 calculations: loading a0..a24 from state array
7296 __ ldp(a0, a1, state);
7297 __ ldp(a2, a3, Address(state, 16));
7298 __ ldp(a4, a5, Address(state, 32));
7299 __ ldp(a6, a7, Address(state, 48));
7300 __ ldp(a8, a9, Address(state, 64));
7301 __ ldp(a10, a11, Address(state, 80));
7302 __ ldp(a12, a13, Address(state, 96));
7303 __ ldp(a14, a15, Address(state, 112));
7304 __ ldp(a16, a17, Address(state, 128));
7305 __ ldp(a18, a19, Address(state, 144));
7306 __ ldp(a20, a21, Address(state, 160));
7307 __ ldp(a22, a23, Address(state, 176));
7308 __ ldr(a24, Address(state, 192));
7309
7310 __ BIND(sha3_loop);
7311
7312 // load input
7313 __ ldp(tmp3, tmp2, __ post(buf, 16));
7314 __ eor(a0, a0, tmp3);
7315 __ eor(a1, a1, tmp2);
7316 __ ldp(tmp3, tmp2, __ post(buf, 16));
7317 __ eor(a2, a2, tmp3);
7318 __ eor(a3, a3, tmp2);
7319 __ ldp(tmp3, tmp2, __ post(buf, 16));
7320 __ eor(a4, a4, tmp3);
7321 __ eor(a5, a5, tmp2);
7322 __ ldr(tmp3, __ post(buf, 8));
7323 __ eor(a6, a6, tmp3);
7324
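    // Absorb the rest of the block according to the rate (block_size in
    // bytes): 72 = SHA3-512, 104 = SHA3-384, 136 = SHA3-256/SHAKE256,
    // 144 = SHA3-224, 168 = SHAKE128.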
7325 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
7326 __ tbz(block_size, 7, sha3_512_or_sha3_384);
7327
7328 __ ldp(tmp3, tmp2, __ post(buf, 16));
7329 __ eor(a7, a7, tmp3);
7330 __ eor(a8, a8, tmp2);
7331 __ ldp(tmp3, tmp2, __ post(buf, 16));
7332 __ eor(a9, a9, tmp3);
7333 __ eor(a10, a10, tmp2);
7334 __ ldp(tmp3, tmp2, __ post(buf, 16));
7335 __ eor(a11, a11, tmp3);
7336 __ eor(a12, a12, tmp2);
7337 __ ldp(tmp3, tmp2, __ post(buf, 16));
7338 __ eor(a13, a13, tmp3);
7339 __ eor(a14, a14, tmp2);
7340 __ ldp(tmp3, tmp2, __ post(buf, 16));
7341 __ eor(a15, a15, tmp3);
7342 __ eor(a16, a16, tmp2);
7343
7344 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
7345 __ andw(tmp2, block_size, 48);
7346 __ cbzw(tmp2, rounds24_preloop);
7347 __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
7349 __ ldr(tmp3, __ post(buf, 8));
7350 __ eor(a17, a17, tmp3);
7351 __ b(rounds24_preloop);
7352
7353 __ BIND(shake128);
7354 __ ldp(tmp3, tmp2, __ post(buf, 16));
7355 __ eor(a17, a17, tmp3);
7356 __ eor(a18, a18, tmp2);
7357 __ ldp(tmp3, tmp2, __ post(buf, 16));
7358 __ eor(a19, a19, tmp3);
7359 __ eor(a20, a20, tmp2);
7360 __ b(rounds24_preloop); // block_size == 168, SHAKE128
7361
7362 __ BIND(sha3_512_or_sha3_384);
7363 __ ldp(tmp3, tmp2, __ post(buf, 16));
7364 __ eor(a7, a7, tmp3);
7365 __ eor(a8, a8, tmp2);
7366 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
7367
7368 // SHA3-384
7369 __ ldp(tmp3, tmp2, __ post(buf, 16));
7370 __ eor(a9, a9, tmp3);
7371 __ eor(a10, a10, tmp2);
7372 __ ldp(tmp3, tmp2, __ post(buf, 16));
7373 __ eor(a11, a11, tmp3);
7374 __ eor(a12, a12, tmp2);
7375
7376 __ BIND(rounds24_preloop);
7377 __ fmovs(v0, 24.0); // float loop counter,
7378 __ fmovs(v1, 1.0); // exact representation
7379
7380 __ str(buf, Address(sp, 16));
7381 __ lea(tmp3, ExternalAddress((address) round_consts));
7382
7383 __ BIND(loop_body);
7384 keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
7385 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
7386 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
7387 tmp0, tmp1, tmp2);
7388 __ fsubs(v0, v0, v1);
7389 __ fcmps(v0, 0.0);
7390 __ br(__ NE, loop_body);
7391
7392 if (multi_block) {
7393 __ ldrw(block_size, sp); // block_size
7394 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
7395 __ addw(tmp2, tmp2, block_size);
7396 __ cmpw(tmp2, tmp1);
7397 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
7398 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
7399 __ br(Assembler::LE, sha3_loop);
7400 __ movw(c_rarg0, tmp2); // return offset
7401 }
7402 if (can_use_fp && can_use_r18) {
7403 __ ldp(r18_tls, state, Address(sp, 112));
7404 } else {
7405 __ ldr(state, Address(sp, 112));
7406 }
7407 // save calculated sha3 state
7408 __ stp(a0, a1, Address(state));
7409 __ stp(a2, a3, Address(state, 16));
7410 __ stp(a4, a5, Address(state, 32));
7411 __ stp(a6, a7, Address(state, 48));
7412 __ stp(a8, a9, Address(state, 64));
7413 __ stp(a10, a11, Address(state, 80));
7414 __ stp(a12, a13, Address(state, 96));
7415 __ stp(a14, a15, Address(state, 112));
7416 __ stp(a16, a17, Address(state, 128));
7417 __ stp(a18, a19, Address(state, 144));
7418 __ stp(a20, a21, Address(state, 160));
7419 __ stp(a22, a23, Address(state, 176));
7420 __ str(a24, Address(state, 192));
7421
7422 // restore required registers from stack
7423 __ ldp(r19, r20, Address(sp, 32));
7424 __ ldp(r21, r22, Address(sp, 48));
7425 __ ldp(r23, r24, Address(sp, 64));
7426 __ ldp(r25, r26, Address(sp, 80));
7427 __ ldp(r27, r28, Address(sp, 96));
7428 if (can_use_fp && can_use_r18) {
7429 __ add(rfp, sp, 128); // leave() will copy rfp to sp below
7430 } // else no need to recalculate rfp, since it wasn't changed
7431
7432 __ leave();
7433
7434 __ ret(lr);
7435
7436 return start;
7437 }
7438
7439 /**
7440 * Arguments:
7441 *
7442 * Inputs:
7443 * c_rarg0 - int crc
7444 * c_rarg1 - byte* buf
7445 * c_rarg2 - int length
7446 *
7447 * Output:
   *       r0 - int crc result
7449 */
7450 address generate_updateBytesCRC32() {
7451 assert(UseCRC32Intrinsics, "what are we doing here?");
7452
7453 __ align(CodeEntryAlignment);
7454 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7455 StubCodeMark mark(this, stub_id);
7456
7457 address start = __ pc();
7458
7459 const Register crc = c_rarg0; // crc
7460 const Register buf = c_rarg1; // source java byte array address
7461 const Register len = c_rarg2; // length
7462 const Register table0 = c_rarg3; // crc_table address
7463 const Register table1 = c_rarg4;
7464 const Register table2 = c_rarg5;
7465 const Register table3 = c_rarg6;
7466 const Register tmp3 = c_rarg7;
7467
7468 BLOCK_COMMENT("Entry:");
7469 __ enter(); // required for proper stackwalking of RuntimeStub frame
7470
7471 __ kernel_crc32(crc, buf, len,
7472 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7473
7474 __ leave(); // required for proper stackwalking of RuntimeStub frame
7475 __ ret(lr);
7476
7477 return start;
7478 }
7479
7480 /**
7481 * Arguments:
7482 *
7483 * Inputs:
7484 * c_rarg0 - int crc
7485 * c_rarg1 - byte* buf
7486 * c_rarg2 - int length
7487 * c_rarg3 - int* table
7488 *
7489 * Output:
7490 * r0 - int crc result
7491 */
7492 address generate_updateBytesCRC32C() {
7493 assert(UseCRC32CIntrinsics, "what are we doing here?");
7494
7495 __ align(CodeEntryAlignment);
7496 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
7497 StubCodeMark mark(this, stub_id);
7498
7499 address start = __ pc();
7500
7501 const Register crc = c_rarg0; // crc
7502 const Register buf = c_rarg1; // source java byte array address
7503 const Register len = c_rarg2; // length
7504 const Register table0 = c_rarg3; // crc_table address
7505 const Register table1 = c_rarg4;
7506 const Register table2 = c_rarg5;
7507 const Register table3 = c_rarg6;
7508 const Register tmp3 = c_rarg7;
7509
7510 BLOCK_COMMENT("Entry:");
7511 __ enter(); // required for proper stackwalking of RuntimeStub frame
7512
7513 __ kernel_crc32c(crc, buf, len,
7514 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7515
7516 __ leave(); // required for proper stackwalking of RuntimeStub frame
7517 __ ret(lr);
7518
7519 return start;
7520 }
7521
7522 /***
7523 * Arguments:
7524 *
7525 * Inputs:
7526 * c_rarg0 - int adler
7527 * c_rarg1 - byte* buff
7528 * c_rarg2 - int len
7529 *
7530 * Output:
7531 * c_rarg0 - int adler result
7532 */
7533 address generate_updateBytesAdler32() {
7534 __ align(CodeEntryAlignment);
7535 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
7536 StubCodeMark mark(this, stub_id);
7537 address start = __ pc();
7538
7539 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
7540
7541 // Aliases
7542 Register adler = c_rarg0;
7543 Register s1 = c_rarg0;
7544 Register s2 = c_rarg3;
7545 Register buff = c_rarg1;
7546 Register len = c_rarg2;
7547 Register nmax = r4;
7548 Register base = r5;
7549 Register count = r6;
7550 Register temp0 = rscratch1;
7551 Register temp1 = rscratch2;
7552 FloatRegister vbytes = v0;
7553 FloatRegister vs1acc = v1;
7554 FloatRegister vs2acc = v2;
7555 FloatRegister vtable = v3;
7556
7557 // Max number of bytes we can process before having to take the mod
7558 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
7559 uint64_t BASE = 0xfff1;
7560 uint64_t NMAX = 0x15B0;
7561
7562 __ mov(base, BASE);
7563 __ mov(nmax, NMAX);
7564
7565 // Load accumulation coefficients for the upper 16 bits
7566 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
7567 __ ld1(vtable, __ T16B, Address(temp0));
7568
7569 // s1 is initialized to the lower 16 bits of adler
7570 // s2 is initialized to the upper 16 bits of adler
7571 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
7572 __ uxth(s1, adler); // s1 = (adler & 0xffff)
7573
    // The pipelined loop needs at least 16 elements for one iteration.
    // It checks this itself, but it is cheaper to branch straight to the
    // cleanup loop when fewer than 16 bytes remain.
7576 __ cmp(len, (u1)16);
7577 __ br(Assembler::HS, L_nmax);
7578 __ cbz(len, L_combine);
7579
7580 __ bind(L_simple_by1_loop);
7581 __ ldrb(temp0, Address(__ post(buff, 1)));
7582 __ add(s1, s1, temp0);
7583 __ add(s2, s2, s1);
7584 __ subs(len, len, 1);
7585 __ br(Assembler::HI, L_simple_by1_loop);
7586
7587 // s1 = s1 % BASE
7588 __ subs(temp0, s1, base);
7589 __ csel(s1, temp0, s1, Assembler::HS);
7590
7591 // s2 = s2 % BASE
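    // (reduction trick: 2^16 mod 65521 == 15, so
    //  s2 = (s2 >> 16) * 15 + (s2 & 0xffff), then one conditional subtract)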
7592 __ lsr(temp0, s2, 16);
7593 __ lsl(temp1, temp0, 4);
7594 __ sub(temp1, temp1, temp0);
7595 __ add(s2, temp1, s2, ext::uxth);
7596
7597 __ subs(temp0, s2, base);
7598 __ csel(s2, temp0, s2, Assembler::HS);
7599
7600 __ b(L_combine);
7601
7602 __ bind(L_nmax);
7603 __ subs(len, len, nmax);
7604 __ sub(count, nmax, 16);
7605 __ br(Assembler::LO, L_by16);
7606
7607 __ bind(L_nmax_loop);
7608
7609 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7610 vbytes, vs1acc, vs2acc, vtable);
7611
7612 __ subs(count, count, 16);
7613 __ br(Assembler::HS, L_nmax_loop);
7614
7615 // s1 = s1 % BASE
7616 __ lsr(temp0, s1, 16);
7617 __ lsl(temp1, temp0, 4);
7618 __ sub(temp1, temp1, temp0);
7619 __ add(temp1, temp1, s1, ext::uxth);
7620
7621 __ lsr(temp0, temp1, 16);
7622 __ lsl(s1, temp0, 4);
7623 __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
7625
7626 __ subs(temp0, s1, base);
7627 __ csel(s1, temp0, s1, Assembler::HS);
7628
7629 // s2 = s2 % BASE
7630 __ lsr(temp0, s2, 16);
7631 __ lsl(temp1, temp0, 4);
7632 __ sub(temp1, temp1, temp0);
7633 __ add(temp1, temp1, s2, ext::uxth);
7634
7635 __ lsr(temp0, temp1, 16);
7636 __ lsl(s2, temp0, 4);
7637 __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
7639
7640 __ subs(temp0, s2, base);
7641 __ csel(s2, temp0, s2, Assembler::HS);
7642
7643 __ subs(len, len, nmax);
7644 __ sub(count, nmax, 16);
7645 __ br(Assembler::HS, L_nmax_loop);
7646
7647 __ bind(L_by16);
7648 __ adds(len, len, count);
7649 __ br(Assembler::LO, L_by1);
7650
7651 __ bind(L_by16_loop);
7652
7653 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7654 vbytes, vs1acc, vs2acc, vtable);
7655
7656 __ subs(len, len, 16);
7657 __ br(Assembler::HS, L_by16_loop);
7658
7659 __ bind(L_by1);
7660 __ adds(len, len, 15);
7661 __ br(Assembler::LO, L_do_mod);
7662
7663 __ bind(L_by1_loop);
7664 __ ldrb(temp0, Address(__ post(buff, 1)));
7665 __ add(s1, temp0, s1);
7666 __ add(s2, s2, s1);
7667 __ subs(len, len, 1);
7668 __ br(Assembler::HS, L_by1_loop);
7669
7670 __ bind(L_do_mod);
7671 // s1 = s1 % BASE
7672 __ lsr(temp0, s1, 16);
7673 __ lsl(temp1, temp0, 4);
7674 __ sub(temp1, temp1, temp0);
7675 __ add(temp1, temp1, s1, ext::uxth);
7676
7677 __ lsr(temp0, temp1, 16);
7678 __ lsl(s1, temp0, 4);
7679 __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
7681
7682 __ subs(temp0, s1, base);
7683 __ csel(s1, temp0, s1, Assembler::HS);
7684
7685 // s2 = s2 % BASE
7686 __ lsr(temp0, s2, 16);
7687 __ lsl(temp1, temp0, 4);
7688 __ sub(temp1, temp1, temp0);
7689 __ add(temp1, temp1, s2, ext::uxth);
7690
7691 __ lsr(temp0, temp1, 16);
7692 __ lsl(s2, temp0, 4);
7693 __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
7695
7696 __ subs(temp0, s2, base);
7697 __ csel(s2, temp0, s2, Assembler::HS);
7698
7699 // Combine lower bits and higher bits
7700 __ bind(L_combine);
7701 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
7702
7703 __ ret(lr);
7704
7705 return start;
7706 }
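
  // Scalar reference for the update performed by the stub above
  // (illustrative only -- not called by any stub): the classic Adler-32
  // recurrence with BASE = 65521, which the stub vectorizes and to which
  // it applies deferred modular reduction.
  static uint32_t updateBytesAdler32_ref(uint32_t adler, const uint8_t* buff, int len) {
    const uint32_t BASE = 65521;
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = (adler >> 16) & 0xffff;
    for (int i = 0; i < len; i++) {
      s1 = (s1 + buff[i]) % BASE;
      s2 = (s2 + s1) % BASE;
    }
    return (s2 << 16) | s1;
  }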
7707
7708 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
7709 Register temp0, Register temp1, FloatRegister vbytes,
7710 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
7711 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
7712 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
7713 // In non-vectorized code, we update s1 and s2 as:
7714 // s1 <- s1 + b1
7715 // s2 <- s2 + s1
7716 // s1 <- s1 + b2
    //  s2 <- s2 + s1
7718 // ...
7719 // s1 <- s1 + b16
7720 // s2 <- s2 + s1
7721 // Putting above assignments together, we have:
7722 // s1_new = s1 + b1 + b2 + ... + b16
7723 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
7724 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
7725 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
7726 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
7727
7728 // s2 = s2 + s1 * 16
7729 __ add(s2, s2, s1, Assembler::LSL, 4);
7730
7731 // vs1acc = b1 + b2 + b3 + ... + b16
7732 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
7733 __ umullv(vs2acc, __ T8B, vtable, vbytes);
7734 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
7735 __ uaddlv(vs1acc, __ T16B, vbytes);
7736 __ uaddlv(vs2acc, __ T8H, vs2acc);
7737
7738 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
7739 __ fmovd(temp0, vs1acc);
7740 __ fmovd(temp1, vs2acc);
7741 __ add(s1, s1, temp0);
7742 __ add(s2, s2, temp1);
7743 }
7744
7745 /**
7746 * Arguments:
7747 *
7748 * Input:
7749 * c_rarg0 - x address
7750 * c_rarg1 - x length
7751 * c_rarg2 - y address
7752 * c_rarg3 - y length
7753 * c_rarg4 - z address
7754 */
7755 address generate_multiplyToLen() {
7756 __ align(CodeEntryAlignment);
7757 StubId stub_id = StubId::stubgen_multiplyToLen_id;
7758 StubCodeMark mark(this, stub_id);
7759
7760 address start = __ pc();
7761 const Register x = r0;
7762 const Register xlen = r1;
7763 const Register y = r2;
7764 const Register ylen = r3;
7765 const Register z = r4;
7766
7767 const Register tmp0 = r5;
7768 const Register tmp1 = r10;
7769 const Register tmp2 = r11;
7770 const Register tmp3 = r12;
7771 const Register tmp4 = r13;
7772 const Register tmp5 = r14;
7773 const Register tmp6 = r15;
7774 const Register tmp7 = r16;
7775
7776 BLOCK_COMMENT("Entry:");
7777 __ enter(); // required for proper stackwalking of RuntimeStub frame
7778 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7779 __ leave(); // required for proper stackwalking of RuntimeStub frame
7780 __ ret(lr);
7781
7782 return start;
7783 }
7784
7785 address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127, described in the Java code,
    // works faster than multiply_to_len on some CPUs and slower on others,
    // but multiply_to_len shows slightly better results overall.
7789 __ align(CodeEntryAlignment);
7790 StubId stub_id = StubId::stubgen_squareToLen_id;
7791 StubCodeMark mark(this, stub_id);
7792 address start = __ pc();
7793
7794 const Register x = r0;
7795 const Register xlen = r1;
7796 const Register z = r2;
7797 const Register y = r4; // == x
7798 const Register ylen = r5; // == xlen
7799
7800 const Register tmp0 = r3;
7801 const Register tmp1 = r10;
7802 const Register tmp2 = r11;
7803 const Register tmp3 = r12;
7804 const Register tmp4 = r13;
7805 const Register tmp5 = r14;
7806 const Register tmp6 = r15;
7807 const Register tmp7 = r16;
7808
7809 RegSet spilled_regs = RegSet::of(y, ylen);
7810 BLOCK_COMMENT("Entry:");
7811 __ enter();
7812 __ push(spilled_regs, sp);
7813 __ mov(y, x);
7814 __ mov(ylen, xlen);
7815 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7816 __ pop(spilled_regs, sp);
7817 __ leave();
7818 __ ret(lr);
7819 return start;
7820 }
7821
7822 address generate_mulAdd() {
7823 __ align(CodeEntryAlignment);
7824 StubId stub_id = StubId::stubgen_mulAdd_id;
7825 StubCodeMark mark(this, stub_id);
7826
7827 address start = __ pc();
7828
7829 const Register out = r0;
7830 const Register in = r1;
7831 const Register offset = r2;
7832 const Register len = r3;
7833 const Register k = r4;
7834
7835 BLOCK_COMMENT("Entry:");
7836 __ enter();
7837 __ mul_add(out, in, offset, len, k);
7838 __ leave();
7839 __ ret(lr);
7840
7841 return start;
7842 }
7843
7844 // Arguments:
7845 //
7846 // Input:
7847 // c_rarg0 - newArr address
7848 // c_rarg1 - oldArr address
7849 // c_rarg2 - newIdx
7850 // c_rarg3 - shiftCount
7851 // c_rarg4 - numIter
7852 //
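// A hedged Java-like sketch of the operation (illustrative only; assumed to
// mirror the BigInteger.shiftRightImplWorker loop this stub accelerates):
//
//   for (int i = numIter - 1; i >= 0; i--) {
//     newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
//                        | (oldArr[i] << (32 - shiftCount));
//   }
//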
7853 address generate_bigIntegerRightShift() {
7854 __ align(CodeEntryAlignment);
7855 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
7856 StubCodeMark mark(this, stub_id);
7857 address start = __ pc();
7858
7859 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
7860
7861 Register newArr = c_rarg0;
7862 Register oldArr = c_rarg1;
7863 Register newIdx = c_rarg2;
7864 Register shiftCount = c_rarg3;
7865 Register numIter = c_rarg4;
7866 Register idx = numIter;
7867
7868 Register newArrCur = rscratch1;
7869 Register shiftRevCount = rscratch2;
7870 Register oldArrCur = r13;
7871 Register oldArrNext = r14;
7872
7873 FloatRegister oldElem0 = v0;
7874 FloatRegister oldElem1 = v1;
7875 FloatRegister newElem = v2;
7876 FloatRegister shiftVCount = v3;
7877 FloatRegister shiftVRevCount = v4;
7878
7879 __ cbz(idx, Exit);
7880
7881 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
7882
7883 // left shift count
7884 __ movw(shiftRevCount, 32);
7885 __ subw(shiftRevCount, shiftRevCount, shiftCount);
7886
7887 // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
7888 __ cmp(numIter, (u1)4);
7889 __ br(Assembler::LT, ShiftThree);
7890
7891 __ dup(shiftVCount, __ T4S, shiftCount);
7892 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
7893 __ negr(shiftVCount, __ T4S, shiftVCount);
7894
7895 __ BIND(ShiftSIMDLoop);
7896
7897 // Calculate the load addresses
7898 __ sub(idx, idx, 4);
7899 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
7900 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
7901 __ add(oldArrCur, oldArrNext, 4);
7902
7903 // Load 4 words and process
7904 __ ld1(oldElem0, __ T4S, Address(oldArrCur));
7905 __ ld1(oldElem1, __ T4S, Address(oldArrNext));
7906 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
7907 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
7908 __ orr(newElem, __ T16B, oldElem0, oldElem1);
7909 __ st1(newElem, __ T4S, Address(newArrCur));
7910
7911 __ cmp(idx, (u1)4);
7912 __ br(Assembler::LT, ShiftTwoLoop);
7913 __ b(ShiftSIMDLoop);
7914
7915 __ BIND(ShiftTwoLoop);
7916 __ cbz(idx, Exit);
7917 __ cmp(idx, (u1)1);
7918 __ br(Assembler::EQ, ShiftOne);
7919
7920 // Calculate the load addresses
7921 __ sub(idx, idx, 2);
7922 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
7923 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
7924 __ add(oldArrCur, oldArrNext, 4);
7925
7926 // Load 2 words and process
7927 __ ld1(oldElem0, __ T2S, Address(oldArrCur));
7928 __ ld1(oldElem1, __ T2S, Address(oldArrNext));
7929 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
7930 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
7931 __ orr(newElem, __ T8B, oldElem0, oldElem1);
7932 __ st1(newElem, __ T2S, Address(newArrCur));
7933 __ b(ShiftTwoLoop);
7934
7935 __ BIND(ShiftThree);
7936 __ tbz(idx, 1, ShiftOne);
7937 __ tbz(idx, 0, ShiftTwo);
7938 __ ldrw(r10, Address(oldArr, 12));
7939 __ ldrw(r11, Address(oldArr, 8));
7940 __ lsrvw(r10, r10, shiftCount);
7941 __ lslvw(r11, r11, shiftRevCount);
7942 __ orrw(r12, r10, r11);
7943 __ strw(r12, Address(newArr, 8));
7944
7945 __ BIND(ShiftTwo);
7946 __ ldrw(r10, Address(oldArr, 8));
7947 __ ldrw(r11, Address(oldArr, 4));
7948 __ lsrvw(r10, r10, shiftCount);
7949 __ lslvw(r11, r11, shiftRevCount);
7950 __ orrw(r12, r10, r11);
7951 __ strw(r12, Address(newArr, 4));
7952
7953 __ BIND(ShiftOne);
7954 __ ldrw(r10, Address(oldArr, 4));
7955 __ ldrw(r11, Address(oldArr));
7956 __ lsrvw(r10, r10, shiftCount);
7957 __ lslvw(r11, r11, shiftRevCount);
7958 __ orrw(r12, r10, r11);
7959 __ strw(r12, Address(newArr));
7960
7961 __ BIND(Exit);
7962 __ ret(lr);
7963
7964 return start;
7965 }
7966
7967 // Arguments:
7968 //
7969 // Input:
7970 // c_rarg0 - newArr address
7971 // c_rarg1 - oldArr address
7972 // c_rarg2 - newIdx
7973 // c_rarg3 - shiftCount
7974 // c_rarg4 - numIter
7975 //
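// A hedged Java-like sketch of the operation (illustrative only; assumed to
// mirror the BigInteger.shiftLeftImplWorker loop this stub accelerates):
//
//   for (int i = 0; i < numIter; i++) {
//     newArr[newIdx + i] = (oldArr[i] << shiftCount)
//                        | (oldArr[i + 1] >>> (32 - shiftCount));
//   }
//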
7976 address generate_bigIntegerLeftShift() {
7977 __ align(CodeEntryAlignment);
7978 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
7979 StubCodeMark mark(this, stub_id);
7980 address start = __ pc();
7981
7982 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
7983
7984 Register newArr = c_rarg0;
7985 Register oldArr = c_rarg1;
7986 Register newIdx = c_rarg2;
7987 Register shiftCount = c_rarg3;
7988 Register numIter = c_rarg4;
7989
7990 Register shiftRevCount = rscratch1;
7991 Register oldArrNext = rscratch2;
7992
7993 FloatRegister oldElem0 = v0;
7994 FloatRegister oldElem1 = v1;
7995 FloatRegister newElem = v2;
7996 FloatRegister shiftVCount = v3;
7997 FloatRegister shiftVRevCount = v4;
7998
7999 __ cbz(numIter, Exit);
8000
8001 __ add(oldArrNext, oldArr, 4);
8002 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8003
8004 // right shift count
8005 __ movw(shiftRevCount, 32);
8006 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8007
8008 // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
8009 __ cmp(numIter, (u1)4);
8010 __ br(Assembler::LT, ShiftThree);
8011
8012 __ dup(shiftVCount, __ T4S, shiftCount);
8013 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8014 __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
8015
8016 __ BIND(ShiftSIMDLoop);
8017
8018 // load 4 words and process
8019 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
8020 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
8021 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8022 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8023 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8024 __ st1(newElem, __ T4S, __ post(newArr, 16));
8025 __ sub(numIter, numIter, 4);
8026
8027 __ cmp(numIter, (u1)4);
8028 __ br(Assembler::LT, ShiftTwoLoop);
8029 __ b(ShiftSIMDLoop);
8030
8031 __ BIND(ShiftTwoLoop);
8032 __ cbz(numIter, Exit);
8033 __ cmp(numIter, (u1)1);
8034 __ br(Assembler::EQ, ShiftOne);
8035
8036 // load 2 words and process
8037 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
8038 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
8039 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8040 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8041 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8042 __ st1(newElem, __ T2S, __ post(newArr, 8));
8043 __ sub(numIter, numIter, 2);
8044 __ b(ShiftTwoLoop);
8045
8046 __ BIND(ShiftThree);
8047 __ ldrw(r10, __ post(oldArr, 4));
8048 __ ldrw(r11, __ post(oldArrNext, 4));
8049 __ lslvw(r10, r10, shiftCount);
8050 __ lsrvw(r11, r11, shiftRevCount);
8051 __ orrw(r12, r10, r11);
8052 __ strw(r12, __ post(newArr, 4));
8053 __ tbz(numIter, 1, Exit);
8054 __ tbz(numIter, 0, ShiftOne);
8055
8056 __ BIND(ShiftTwo);
8057 __ ldrw(r10, __ post(oldArr, 4));
8058 __ ldrw(r11, __ post(oldArrNext, 4));
8059 __ lslvw(r10, r10, shiftCount);
8060 __ lsrvw(r11, r11, shiftRevCount);
8061 __ orrw(r12, r10, r11);
8062 __ strw(r12, __ post(newArr, 4));
8063
8064 __ BIND(ShiftOne);
8065 __ ldrw(r10, Address(oldArr));
8066 __ ldrw(r11, Address(oldArrNext));
8067 __ lslvw(r10, r10, shiftCount);
8068 __ lsrvw(r11, r11, shiftRevCount);
8069 __ orrw(r12, r10, r11);
8070 __ strw(r12, Address(newArr));
8071
8072 __ BIND(Exit);
8073 __ ret(lr);
8074
8075 return start;
8076 }
8077
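// count_positives: returns in r0 (result) a count n such that the first n
// bytes of ary1 (r1), of length len (r2), all have their sign bit clear,
// i.e. are non-negative. A hedged scalar sketch of the ideal answer
// (illustrative only; 'n' is just a local name here):
//
//   size_t n = 0;
//   while (n < len && (ary1[n] & 0x80) == 0) n++;
//   return n;
//
// The stub may return a smaller count on the adjustment paths below; callers
// only rely on the first 'result' bytes being non-negative.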
8078 address generate_count_positives(address &count_positives_long) {
8079 const u1 large_loop_size = 64;
8080 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
8081 int dcache_line = VM_Version::dcache_line_size();
8082
8083 Register ary1 = r1, len = r2, result = r0;
8084
8085 __ align(CodeEntryAlignment);
8086
8087 StubId stub_id = StubId::stubgen_count_positives_id;
8088 StubCodeMark mark(this, stub_id);
8089
8090 address entry = __ pc();
8091
8092 __ enter();
8093 // precondition: a copy of len is already in result
8094 // __ mov(result, len);
8095
8096 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
8097 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
8098
8099 __ cmp(len, (u1)15);
8100 __ br(Assembler::GT, LEN_OVER_15);
8101 // The only case when execution falls into this code is when the pointer is near
8102 // the end of a memory page and we have to avoid reading the next page.
8103 __ add(ary1, ary1, len);
8104 __ subs(len, len, 8);
8105 __ br(Assembler::GT, LEN_OVER_8);
8106 __ ldr(rscratch2, Address(ary1, -8));
8107 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
8108 __ lsrv(rscratch2, rscratch2, rscratch1);
8109 __ tst(rscratch2, UPPER_BIT_MASK);
8110 __ csel(result, zr, result, Assembler::NE);
8111 __ leave();
8112 __ ret(lr);
8113 __ bind(LEN_OVER_8);
8114 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
8115 __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
8116 __ tst(rscratch2, UPPER_BIT_MASK);
8117 __ br(Assembler::NE, RET_NO_POP);
8118 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
8119 __ lsrv(rscratch1, rscratch1, rscratch2);
8120 __ tst(rscratch1, UPPER_BIT_MASK);
8121 __ bind(RET_NO_POP);
8122 __ csel(result, zr, result, Assembler::NE);
8123 __ leave();
8124 __ ret(lr);
8125
8126 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
8127 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
8128
8129 count_positives_long = __ pc(); // 2nd entry point
8130
8131 __ enter();
8132
8133 __ bind(LEN_OVER_15);
8134 __ push(spilled_regs, sp);
8135 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
8136 __ cbz(rscratch2, ALIGNED);
8137 __ ldp(tmp6, tmp1, Address(ary1));
8138 __ mov(tmp5, 16);
8139 __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
8140 __ add(ary1, ary1, rscratch1);
8141 __ orr(tmp6, tmp6, tmp1);
8142 __ tst(tmp6, UPPER_BIT_MASK);
8143 __ br(Assembler::NE, RET_ADJUST);
8144 __ sub(len, len, rscratch1);
8145
8146 __ bind(ALIGNED);
8147 __ cmp(len, large_loop_size);
8148 __ br(Assembler::LT, CHECK_16);
8149 // Perform a 16-byte load as an early return in the pre-loop to handle the case
8150 // where an initially aligned large array has negative values in its starting
8151 // bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 (in the worst case),
8152 // which is slower. Cases with negative bytes further ahead are barely affected;
8153 // in fact they get faster due to the early loads, fewer instructions and
8154 // fewer branches in LARGE_LOOP.
8155 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
8156 __ sub(len, len, 16);
8157 __ orr(tmp6, tmp6, tmp1);
8158 __ tst(tmp6, UPPER_BIT_MASK);
8159 __ br(Assembler::NE, RET_ADJUST_16);
8160 __ cmp(len, large_loop_size);
8161 __ br(Assembler::LT, CHECK_16);
8162
8163 if (SoftwarePrefetchHintDistance >= 0
8164 && SoftwarePrefetchHintDistance >= dcache_line) {
8165 // initial prefetch
8166 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
8167 }
8168 __ bind(LARGE_LOOP);
8169 if (SoftwarePrefetchHintDistance >= 0) {
8170 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
8171 }
8172 // Issue the load instructions first, since that can save a few CPU/memory
8173 // cycles. Also, instead of 4 triples of "orr(...); addr(...); cbnz(...);" (one
8174 // per ldp), it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...),
8175 // which saves 3 instructions per iteration and has fewer branches. The downside
8176 // is that early return is disabled, so all 64 bytes are loaded and checked every time.
8177 __ ldp(tmp2, tmp3, Address(ary1));
8178 __ ldp(tmp4, tmp5, Address(ary1, 16));
8179 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
8180 __ ldp(tmp6, tmp1, Address(ary1, 48));
8181 __ add(ary1, ary1, large_loop_size);
8182 __ sub(len, len, large_loop_size);
8183 __ orr(tmp2, tmp2, tmp3);
8184 __ orr(tmp4, tmp4, tmp5);
8185 __ orr(rscratch1, rscratch1, rscratch2);
8186 __ orr(tmp6, tmp6, tmp1);
8187 __ orr(tmp2, tmp2, tmp4);
8188 __ orr(rscratch1, rscratch1, tmp6);
8189 __ orr(tmp2, tmp2, rscratch1);
8190 __ tst(tmp2, UPPER_BIT_MASK);
8191 __ br(Assembler::NE, RET_ADJUST_LONG);
8192 __ cmp(len, large_loop_size);
8193 __ br(Assembler::GE, LARGE_LOOP);
8194
8195 __ bind(CHECK_16); // small 16-byte load pre-loop
8196 __ cmp(len, (u1)16);
8197 __ br(Assembler::LT, POST_LOOP16);
8198
8199 __ bind(LOOP16); // small 16-byte load loop
8200 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
8201 __ sub(len, len, 16);
8202 __ orr(tmp2, tmp2, tmp3);
8203 __ tst(tmp2, UPPER_BIT_MASK);
8204 __ br(Assembler::NE, RET_ADJUST_16);
8205 __ cmp(len, (u1)16);
8206 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
8207
8208 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
8209 __ cmp(len, (u1)8);
8210 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
8211 __ ldr(tmp3, Address(__ post(ary1, 8)));
8212 __ tst(tmp3, UPPER_BIT_MASK);
8213 __ br(Assembler::NE, RET_ADJUST);
8214 __ sub(len, len, 8);
8215
8216 __ bind(POST_LOOP16_LOAD_TAIL);
8217 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
8218 __ ldr(tmp1, Address(ary1));
8219 __ mov(tmp2, 64);
8220 __ sub(tmp4, tmp2, len, __ LSL, 3);
8221 __ lslv(tmp1, tmp1, tmp4);
8222 __ tst(tmp1, UPPER_BIT_MASK);
8223 __ br(Assembler::NE, RET_ADJUST);
8224 // Fallthrough
8225
8226 __ bind(RET_LEN);
8227 __ pop(spilled_regs, sp);
8228 __ leave();
8229 __ ret(lr);
8230
8231 // The difference (result - len) is the count of bytes guaranteed to be
8232 // positive.
8233
8234 __ bind(RET_ADJUST_LONG);
8235 __ add(len, len, (u1)(large_loop_size - 16));
8236 __ bind(RET_ADJUST_16);
8237 __ add(len, len, 16);
8238 __ bind(RET_ADJUST);
8239 __ pop(spilled_regs, sp);
8240 __ leave();
8241 __ sub(result, result, len);
8242 __ ret(lr);
8243
8244 return entry;
8245 }
8246
8247 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
8248 bool usePrefetch, Label &NOT_EQUAL) {
8249 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8250 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8251 tmp7 = r12, tmp8 = r13;
8252 Label LOOP;
8253
8254 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8255 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8256 __ bind(LOOP);
8257 if (usePrefetch) {
8258 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8259 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8260 }
8261 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8262 __ eor(tmp1, tmp1, tmp2);
8263 __ eor(tmp3, tmp3, tmp4);
8264 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8265 __ orr(tmp1, tmp1, tmp3);
8266 __ cbnz(tmp1, NOT_EQUAL);
8267 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8268 __ eor(tmp5, tmp5, tmp6);
8269 __ eor(tmp7, tmp7, tmp8);
8270 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8271 __ orr(tmp5, tmp5, tmp7);
8272 __ cbnz(tmp5, NOT_EQUAL);
8273 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8274 __ eor(tmp1, tmp1, tmp2);
8275 __ eor(tmp3, tmp3, tmp4);
8276 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8277 __ orr(tmp1, tmp1, tmp3);
8278 __ cbnz(tmp1, NOT_EQUAL);
8279 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8280 __ eor(tmp5, tmp5, tmp6);
8281 __ sub(cnt1, cnt1, 8 * wordSize);
8282 __ eor(tmp7, tmp7, tmp8);
8283 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8284 // tmp6 is not used. MacroAssembler::subs is used here (rather than
8285 // cmp) because subs allows an unlimited range of immediate operands.
8286 __ subs(tmp6, cnt1, loopThreshold);
8287 __ orr(tmp5, tmp5, tmp7);
8288 __ cbnz(tmp5, NOT_EQUAL);
8289 __ br(__ GE, LOOP);
8290 // post-loop
8291 __ eor(tmp1, tmp1, tmp2);
8292 __ eor(tmp3, tmp3, tmp4);
8293 __ orr(tmp1, tmp1, tmp3);
8294 __ sub(cnt1, cnt1, 2 * wordSize);
8295 __ cbnz(tmp1, NOT_EQUAL);
8296 }
8297
8298 void generate_large_array_equals_loop_simd(int loopThreshold,
8299 bool usePrefetch, Label &NOT_EQUAL) {
8300 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8301 tmp2 = rscratch2;
8302 Label LOOP;
8303
8304 __ bind(LOOP);
8305 if (usePrefetch) {
8306 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8307 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8308 }
8309 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
8310 __ sub(cnt1, cnt1, 8 * wordSize);
8311 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
8312 __ subs(tmp1, cnt1, loopThreshold);
8313 __ eor(v0, __ T16B, v0, v4);
8314 __ eor(v1, __ T16B, v1, v5);
8315 __ eor(v2, __ T16B, v2, v6);
8316 __ eor(v3, __ T16B, v3, v7);
8317 __ orr(v0, __ T16B, v0, v1);
8318 __ orr(v1, __ T16B, v2, v3);
8319 __ orr(v0, __ T16B, v0, v1);
8320 __ umov(tmp1, v0, __ D, 0);
8321 __ umov(tmp2, v0, __ D, 1);
8322 __ orr(tmp1, tmp1, tmp2);
8323 __ cbnz(tmp1, NOT_EQUAL);
8324 __ br(__ GE, LOOP);
8325 }
8326
8327 // a1 = r1 - array1 address
8328 // a2 = r2 - array2 address
8329 // result = r0 - return value. Already contains "false"
8330 // cnt1 = r10 - number of elements left to check, reduced by wordSize
8331 // r3-r5 are reserved temporary registers
8332 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
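//
// A hedged scalar sketch of the check performed here (illustrative only;
// cnt_bytes and equal are just local names):
//
//   bool equal = true;
//   for (size_t i = 0; i < cnt_bytes; i += wordSize) {
//     if (*(uint64_t*)(a1 + i) != *(uint64_t*)(a2 + i)) { equal = false; break; }
//   }
//
// The stub unrolls this 8 words per iteration (SIMD or GPR), optionally with
// software prefetching, and handles the unaligned head and the tail separately.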
8333 address generate_large_array_equals() {
8334 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8335 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8336 tmp7 = r12, tmp8 = r13;
8337 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
8338 SMALL_LOOP, POST_LOOP;
8339 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
8340 // calculate if at least 32 prefetched bytes are used
8341 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
8342 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
8343 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
8344 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
8345 tmp5, tmp6, tmp7, tmp8);
8346
8347 __ align(CodeEntryAlignment);
8348
8349 StubId stub_id = StubId::stubgen_large_array_equals_id;
8350 StubCodeMark mark(this, stub_id);
8351
8352 address entry = __ pc();
8353 __ enter();
8354 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
8355 // also advance pointers to use post-increment instead of pre-increment
8356 __ add(a1, a1, wordSize);
8357 __ add(a2, a2, wordSize);
8358 if (AvoidUnalignedAccesses) {
8359 // Both implementations (SIMD/non-SIMD) use relatively large load
8360 // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution time)
8361 // on some CPUs when the address is not at least 16-byte aligned.
8362 // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
8363 // load if needed, at least for the 1st address, to make it 16-byte aligned.
8364 Label ALIGNED16;
8365 __ tbz(a1, 3, ALIGNED16);
8366 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8367 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8368 __ sub(cnt1, cnt1, wordSize);
8369 __ eor(tmp1, tmp1, tmp2);
8370 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
8371 __ bind(ALIGNED16);
8372 }
8373 if (UseSIMDForArrayEquals) {
8374 if (SoftwarePrefetchHintDistance >= 0) {
8375 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8376 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8377 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
8378 /* prfm = */ true, NOT_EQUAL);
8379 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8380 __ br(__ LT, TAIL);
8381 }
8382 __ bind(NO_PREFETCH_LARGE_LOOP);
8383 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
8384 /* prfm = */ false, NOT_EQUAL);
8385 } else {
8386 __ push(spilled_regs, sp);
8387 if (SoftwarePrefetchHintDistance >= 0) {
8388 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8389 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8390 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
8391 /* prfm = */ true, NOT_EQUAL);
8392 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8393 __ br(__ LT, TAIL);
8394 }
8395 __ bind(NO_PREFETCH_LARGE_LOOP);
8396 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
8397 /* prfm = */ false, NOT_EQUAL);
8398 }
8399 __ bind(TAIL);
8400 __ cbz(cnt1, EQUAL);
8401 __ subs(cnt1, cnt1, wordSize);
8402 __ br(__ LE, POST_LOOP);
8403 __ bind(SMALL_LOOP);
8404 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8405 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8406 __ subs(cnt1, cnt1, wordSize);
8407 __ eor(tmp1, tmp1, tmp2);
8408 __ cbnz(tmp1, NOT_EQUAL);
8409 __ br(__ GT, SMALL_LOOP);
8410 __ bind(POST_LOOP);
8411 __ ldr(tmp1, Address(a1, cnt1));
8412 __ ldr(tmp2, Address(a2, cnt1));
8413 __ eor(tmp1, tmp1, tmp2);
8414 __ cbnz(tmp1, NOT_EQUAL);
8415 __ bind(EQUAL);
8416 __ mov(result, true);
8417 __ bind(NOT_EQUAL);
8418 if (!UseSIMDForArrayEquals) {
8419 __ pop(spilled_regs, sp);
8420 }
8421 __ bind(NOT_EQUAL_NO_POP);
8422 __ leave();
8423 __ ret(lr);
8424 return entry;
8425 }
8426
8427 // result = r0 - return value. Contains initial hashcode value on entry.
8428 // ary = r1 - array address
8429 // cnt = r2 - elements count
8430 // Clobbers: v0-v13, rscratch1, rscratch2
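//
// A hedged scalar sketch of the hash being vectorized (the usual Java
// Arrays.hashCode polynomial; illustrative only):
//
//   int h = result;              // initial hashcode passed in r0
//   for (int i = 0; i < cnt; i++) {
//     h = 31 * h + (int)ary[i];  // element widened according to eltype
//   }
//   return h;
//
// The stub evaluates this with four independent SIMD accumulators and
// precomputed powers of 31, then recombines them at the end.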
8431 address generate_large_arrays_hashcode(BasicType eltype) {
8432 const Register result = r0, ary = r1, cnt = r2;
8433 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
8434 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
8435 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
8436 const FloatRegister vpowm = v13;
8437
8438 ARRAYS_HASHCODE_REGISTERS;
8439
8440 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
8441
8442 unsigned int vf; // vectorization factor
8443 bool multiply_by_halves;
8444 Assembler::SIMD_Arrangement load_arrangement;
8445 switch (eltype) {
8446 case T_BOOLEAN:
8447 case T_BYTE:
8448 load_arrangement = Assembler::T8B;
8449 multiply_by_halves = true;
8450 vf = 8;
8451 break;
8452 case T_CHAR:
8453 case T_SHORT:
8454 load_arrangement = Assembler::T8H;
8455 multiply_by_halves = true;
8456 vf = 8;
8457 break;
8458 case T_INT:
8459 load_arrangement = Assembler::T4S;
8460 multiply_by_halves = false;
8461 vf = 4;
8462 break;
8463 default:
8464 ShouldNotReachHere();
8465 }
8466
8467 // Unroll factor
8468 const unsigned uf = 4;
8469
8470 // Effective vectorization factor
8471 const unsigned evf = vf * uf;
8472
8473 __ align(CodeEntryAlignment);
8474
8475 StubId stub_id;
8476 switch (eltype) {
8477 case T_BOOLEAN:
8478 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
8479 break;
8480 case T_BYTE:
8481 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
8482 break;
8483 case T_CHAR:
8484 stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
8485 break;
8486 case T_SHORT:
8487 stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
8488 break;
8489 case T_INT:
8490 stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
8491 break;
8492 default:
8493 stub_id = StubId::NO_STUBID;
8494 ShouldNotReachHere();
8495 };
8496
8497 StubCodeMark mark(this, stub_id);
8498
8499 address entry = __ pc();
8500 __ enter();
8501
8502 // Put the 0th to 3rd powers of 31 together into a single SIMD register. The register will be
8503 // used in the SMALL and LARGE LOOPs' epilogues. The initialization is hoisted here and the
8504 // register's value shouldn't change throughout both loops.
8505 __ movw(rscratch1, intpow(31U, 3));
8506 __ mov(vpow, Assembler::S, 0, rscratch1);
8507 __ movw(rscratch1, intpow(31U, 2));
8508 __ mov(vpow, Assembler::S, 1, rscratch1);
8509 __ movw(rscratch1, intpow(31U, 1));
8510 __ mov(vpow, Assembler::S, 2, rscratch1);
8511 __ movw(rscratch1, intpow(31U, 0));
8512 __ mov(vpow, Assembler::S, 3, rscratch1);
8513
8514 __ mov(vmul0, Assembler::T16B, 0);
8515 __ mov(vmul0, Assembler::S, 3, result);
8516
8517 __ andr(rscratch2, cnt, (uf - 1) * vf);
8518 __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
8519
8520 __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
8521 __ mov(vpowm, Assembler::S, 0, rscratch1);
8522
8523 // SMALL LOOP
8524 __ bind(SMALL_LOOP);
8525
8526 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
8527 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8528 __ subsw(rscratch2, rscratch2, vf);
8529
8530 if (load_arrangement == Assembler::T8B) {
8531 // Extend 8B to 8H to be able to use vector multiply
8532 // instructions
8533 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8534 if (is_signed_subword_type(eltype)) {
8535 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8536 } else {
8537 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8538 }
8539 }
8540
8541 switch (load_arrangement) {
8542 case Assembler::T4S:
8543 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8544 break;
8545 case Assembler::T8B:
8546 case Assembler::T8H:
8547 assert(is_subword_type(eltype), "subword type expected");
8548 if (is_signed_subword_type(eltype)) {
8549 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8550 } else {
8551 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8552 }
8553 break;
8554 default:
8555 __ should_not_reach_here();
8556 }
8557
8558 // Process the upper half of a vector
8559 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8560 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8561 if (is_signed_subword_type(eltype)) {
8562 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8563 } else {
8564 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8565 }
8566 }
8567
8568 __ br(Assembler::HI, SMALL_LOOP);
8569
8570 // SMALL LOOP'S EPILOGUE
8571 __ lsr(rscratch2, cnt, exact_log2(evf));
8572 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
8573
8574 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8575 __ addv(vmul0, Assembler::T4S, vmul0);
8576 __ umov(result, vmul0, Assembler::S, 0);
8577
8578 // TAIL
8579 __ bind(TAIL);
8580
8581 // The andr computes cnt % vf. The subtract, shifted by 3, offsets past vf - 1 - (cnt % vf)
8582 // pairs of load + madd insns, i.e. it executes only cnt % vf load + madd pairs.
8583 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
8584 __ andr(rscratch2, cnt, vf - 1);
8585 __ bind(TAIL_SHORTCUT);
8586 __ adr(rscratch1, BR_BASE);
8587 // For Cortex-A53 offset is 4 because 2 nops are generated.
8588 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
8589 __ movw(rscratch2, 0x1f);
8590 __ br(rscratch1);
8591
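// What follows is effectively a computed jump into an unrolled tail (in the
// spirit of Duff's device): each unrolled step below is one load + one madd
// (plus nops on Cortex-A53), so branching (cnt % vf) steps before BR_BASE
// executes exactly the remaining cnt % vf element updates, with 0x1f == 31
// kept in rscratch2 for the maddw.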
8592 for (size_t i = 0; i < vf - 1; ++i) {
8593 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
8594 eltype);
8595 __ maddw(result, result, rscratch2, rscratch1);
8596 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
8597 // Generate 2nd nop to have 4 instructions per iteration.
8598 if (VM_Version::supports_a53mac()) {
8599 __ nop();
8600 }
8601 }
8602 __ bind(BR_BASE);
8603
8604 __ leave();
8605 __ ret(lr);
8606
8607 // LARGE LOOP
8608 __ bind(LARGE_LOOP_PREHEADER);
8609
8610 __ lsr(rscratch2, cnt, exact_log2(evf));
8611
8612 if (multiply_by_halves) {
8613 // 31^4 - multiplier between lower and upper parts of a register
8614 __ movw(rscratch1, intpow(31U, vf / 2));
8615 __ mov(vpowm, Assembler::S, 1, rscratch1);
8616 // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
8617 __ movw(rscratch1, intpow(31U, evf - vf / 2));
8618 __ mov(vpowm, Assembler::S, 0, rscratch1);
8619 } else {
8620 // 31^16
8621 __ movw(rscratch1, intpow(31U, evf));
8622 __ mov(vpowm, Assembler::S, 0, rscratch1);
8623 }
8624
8625 __ mov(vmul3, Assembler::T16B, 0);
8626 __ mov(vmul2, Assembler::T16B, 0);
8627 __ mov(vmul1, Assembler::T16B, 0);
8628
8629 __ bind(LARGE_LOOP);
8630
8631 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
8632 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
8633 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
8634 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8635
8636 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
8637 Address(__ post(ary, evf * type2aelembytes(eltype))));
8638
8639 if (load_arrangement == Assembler::T8B) {
8640 // Extend 8B to 8H to be able to use vector multiply
8641 // instructions
8642 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8643 if (is_signed_subword_type(eltype)) {
8644 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8645 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8646 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8647 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8648 } else {
8649 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8650 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8651 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8652 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8653 }
8654 }
8655
8656 switch (load_arrangement) {
8657 case Assembler::T4S:
8658 __ addv(vmul3, load_arrangement, vmul3, vdata3);
8659 __ addv(vmul2, load_arrangement, vmul2, vdata2);
8660 __ addv(vmul1, load_arrangement, vmul1, vdata1);
8661 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8662 break;
8663 case Assembler::T8B:
8664 case Assembler::T8H:
8665 assert(is_subword_type(eltype), "subword type expected");
8666 if (is_signed_subword_type(eltype)) {
8667 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8668 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8669 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8670 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8671 } else {
8672 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8673 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8674 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8675 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8676 }
8677 break;
8678 default:
8679 __ should_not_reach_here();
8680 }
8681
8682 // Process the upper half of a vector
8683 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8684 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
8685 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
8686 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
8687 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
8688 if (is_signed_subword_type(eltype)) {
8689 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8690 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8691 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8692 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8693 } else {
8694 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8695 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8696 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8697 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8698 }
8699 }
8700
8701 __ subsw(rscratch2, rscratch2, 1);
8702 __ br(Assembler::HI, LARGE_LOOP);
8703
8704 __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
8705 __ addv(vmul3, Assembler::T4S, vmul3);
8706 __ umov(result, vmul3, Assembler::S, 0);
8707
8708 __ mov(rscratch2, intpow(31U, vf));
8709
8710 __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
8711 __ addv(vmul2, Assembler::T4S, vmul2);
8712 __ umov(rscratch1, vmul2, Assembler::S, 0);
8713 __ maddw(result, result, rscratch2, rscratch1);
8714
8715 __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
8716 __ addv(vmul1, Assembler::T4S, vmul1);
8717 __ umov(rscratch1, vmul1, Assembler::S, 0);
8718 __ maddw(result, result, rscratch2, rscratch1);
8719
8720 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8721 __ addv(vmul0, Assembler::T4S, vmul0);
8722 __ umov(rscratch1, vmul0, Assembler::S, 0);
8723 __ maddw(result, result, rscratch2, rscratch1);
8724
8725 __ andr(rscratch2, cnt, vf - 1);
8726 __ cbnz(rscratch2, TAIL_SHORTCUT);
8727
8728 __ leave();
8729 __ ret(lr);
8730
8731 return entry;
8732 }
8733
8734 address generate_dsin_dcos(bool isCos) {
8735 __ align(CodeEntryAlignment);
8736 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
8737 StubCodeMark mark(this, stub_id);
8738 address start = __ pc();
8739 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
8740 (address)StubRoutines::aarch64::_two_over_pi,
8741 (address)StubRoutines::aarch64::_pio2,
8742 (address)StubRoutines::aarch64::_dsin_coef,
8743 (address)StubRoutines::aarch64::_dcos_coef);
8744 return start;
8745 }
8746
8747 // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
8748 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
8749 Label &DIFF2) {
8750 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
8751 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
8752
8753 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
8754 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8755 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
8756 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
8757
8758 __ fmovd(tmpL, vtmp3);
8759 __ eor(rscratch2, tmp3, tmpL);
8760 __ cbnz(rscratch2, DIFF2);
8761
8762 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8763 __ umov(tmpL, vtmp3, __ D, 1);
8764 __ eor(rscratch2, tmpU, tmpL);
8765 __ cbnz(rscratch2, DIFF1);
8766
8767 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
8768 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8769 __ fmovd(tmpL, vtmp);
8770 __ eor(rscratch2, tmp3, tmpL);
8771 __ cbnz(rscratch2, DIFF2);
8772
8773 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8774 __ umov(tmpL, vtmp, __ D, 1);
8775 __ eor(rscratch2, tmpU, tmpL);
8776 __ cbnz(rscratch2, DIFF1);
8777 }
8778
8779 // r0 = result
8780 // r1 = str1
8781 // r2 = cnt1
8782 // r3 = str2
8783 // r4 = cnt2
8784 // r10 = tmp1
8785 // r11 = tmp2
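//
// A hedged scalar sketch of the comparison this stub accelerates (LU: str1 is
// Latin1 and str2 is UTF-16, UL is the mirrored case; length bookkeeping and
// the first few characters are handled by the caller; latin1/utf16/cnt below
// are illustrative names, not the stub's registers):
//
//   for (int i = 0; i < cnt; i++) {
//     if ((latin1[i] & 0xff) != utf16[i])           // zero-extend the Latin1 byte
//       return str1_char[i] - str2_char[i];         // difference of first mismatch
//   }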
8786 address generate_compare_long_string_different_encoding(bool isLU) {
8787 __ align(CodeEntryAlignment);
8788 StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
8789 StubCodeMark mark(this, stub_id);
8790 address entry = __ pc();
8791 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
8792 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
8793 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
8794 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
8795 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
8796 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
8797 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
8798
8799 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
8800
8801 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
8802 // cnt2 == number of characters left to compare
8803 // Check the already-loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
8804 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8805 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
8806 __ add(str2, str2, isLU ? wordSize : wordSize/2);
8807 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
8808 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. The last 4 are a special case.
8809 __ eor(rscratch2, tmp1, tmp2);
8810 __ mov(rscratch1, tmp2);
8811 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
8812 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
8813 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
8814 __ push(spilled_regs, sp);
8815 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
8816 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
8817
8818 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8819
8820 if (SoftwarePrefetchHintDistance >= 0) {
8821 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8822 __ br(__ LT, NO_PREFETCH);
8823 __ bind(LARGE_LOOP_PREFETCH);
8824 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
8825 __ mov(tmp4, 2);
8826 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8827 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
8828 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8829 __ subs(tmp4, tmp4, 1);
8830 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
8831 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8832 __ mov(tmp4, 2);
8833 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
8834 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8835 __ subs(tmp4, tmp4, 1);
8836 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
8837 __ sub(cnt2, cnt2, 64);
8838 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8839 __ br(__ GE, LARGE_LOOP_PREFETCH);
8840 }
8841 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
8842 __ bind(NO_PREFETCH);
8843 __ subs(cnt2, cnt2, 16);
8844 __ br(__ LT, TAIL);
8845 __ align(OptoLoopAlignment);
8846 __ bind(SMALL_LOOP); // smaller loop
8847 __ subs(cnt2, cnt2, 16);
8848 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8849 __ br(__ GE, SMALL_LOOP);
8850 __ cmn(cnt2, (u1)16);
8851 __ br(__ EQ, LOAD_LAST);
8852 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
8853 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
8854 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
8855 __ ldr(tmp3, Address(cnt1, -8));
8856 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
8857 __ b(LOAD_LAST);
8858 __ bind(DIFF2);
8859 __ mov(tmpU, tmp3);
8860 __ bind(DIFF1);
8861 __ pop(spilled_regs, sp);
8862 __ b(CALCULATE_DIFFERENCE);
8863 __ bind(LOAD_LAST);
8864 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
8865 // No need to load them again
8866 __ mov(tmpU, tmp3);
8867 __ pop(spilled_regs, sp);
8868
8869 // tmp2 points to the address of the last 4 Latin1 characters right now
8870 __ ldrs(vtmp, Address(tmp2));
8871 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8872 __ fmovd(tmpL, vtmp);
8873
8874 __ eor(rscratch2, tmpU, tmpL);
8875 __ cbz(rscratch2, DONE);
8876
8877 // Find the first different characters in the longwords and
8878 // compute their difference.
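// rev + clz locate the lowest-order differing byte: byte-reversing the xor of
// the two words turns "lowest differing byte" into "highest set bit", clz
// gives its bit index, and andr(..., -16) rounds down to a 16-bit character
// boundary so lsrv can shift that character into the low bits of each word.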
8879 __ bind(CALCULATE_DIFFERENCE);
8880 __ rev(rscratch2, rscratch2);
8881 __ clz(rscratch2, rscratch2);
8882 __ andr(rscratch2, rscratch2, -16);
8883 __ lsrv(tmp1, tmp1, rscratch2);
8884 __ uxthw(tmp1, tmp1);
8885 __ lsrv(rscratch1, rscratch1, rscratch2);
8886 __ uxthw(rscratch1, rscratch1);
8887 __ subw(result, tmp1, rscratch1);
8888 __ bind(DONE);
8889 __ ret(lr);
8890 return entry;
8891 }
8892
8893 // r0 = input (float16)
8894 // v0 = result (float)
8895 // v1 = temporary float register
8896 address generate_float16ToFloat() {
8897 __ align(CodeEntryAlignment);
8898 StubId stub_id = StubId::stubgen_hf2f_id;
8899 StubCodeMark mark(this, stub_id);
8900 address entry = __ pc();
8901 BLOCK_COMMENT("Entry:");
8902 __ flt16_to_flt(v0, r0, v1);
8903 __ ret(lr);
8904 return entry;
8905 }
8906
8907 // v0 = input (float)
8908 // r0 = result (float16)
8909 // v1 = temporary float register
8910 address generate_floatToFloat16() {
8911 __ align(CodeEntryAlignment);
8912 StubId stub_id = StubId::stubgen_f2hf_id;
8913 StubCodeMark mark(this, stub_id);
8914 address entry = __ pc();
8915 BLOCK_COMMENT("Entry:");
8916 __ flt_to_flt16(r0, v0, v1);
8917 __ ret(lr);
8918 return entry;
8919 }
8920
8921 address generate_method_entry_barrier() {
8922 __ align(CodeEntryAlignment);
8923 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
8924 StubCodeMark mark(this, stub_id);
8925
8926 Label deoptimize_label;
8927
8928 address start = __ pc();
8929
8930 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
8931
8932 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
8933 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8934 // We can get here despite the nmethod being good, if we have not
8935 // yet applied our cross modification fence (or data fence).
8936 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
8937 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
8938 __ ldrw(rscratch2, rscratch2);
8939 __ strw(rscratch2, thread_epoch_addr);
8940 __ isb();
8941 __ membar(__ LoadLoad);
8942 }
8943
8944 __ set_last_Java_frame(sp, rfp, lr, rscratch1);
8945
8946 __ enter();
8947 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
8948
8949 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
8950
8951 __ push_call_clobbered_registers();
8952
8953 __ mov(c_rarg0, rscratch2);
8954 __ call_VM_leaf
8955 (CAST_FROM_FN_PTR
8956 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
8957
8958 __ reset_last_Java_frame(true);
8959
8960 __ mov(rscratch1, r0);
8961
8962 __ pop_call_clobbered_registers();
8963
8964 __ cbnz(rscratch1, deoptimize_label);
8965
8966 __ leave();
8967 __ ret(lr);
8968
8969 __ BIND(deoptimize_label);
8970
8971 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
8972 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
8973
8974 __ mov(sp, rscratch1);
8975 __ br(rscratch2);
8976
8977 return start;
8978 }
8979
8980 // r0 = result
8981 // r1 = str1
8982 // r2 = cnt1
8983 // r3 = str2
8984 // r4 = cnt2
8985 // r10 = tmp1
8986 // r11 = tmp2
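//
// A hedged scalar sketch of the core comparison (LL compares bytes, UU
// compares 16-bit chars; the first 8 bytes and the length difference are
// handled by the caller; illustrative only):
//
//   for (int i = 0; i < cnt2; i++) {
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   }
//   return 0;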
8987 address generate_compare_long_string_same_encoding(bool isLL) {
8988 __ align(CodeEntryAlignment);
8989 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
8990 StubCodeMark mark(this, stub_id);
8991 address entry = __ pc();
8992 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
8993 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
8994
8995 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
8996
8997 // Exit the large loop when fewer than 64 bytes are left to read or we're about
8998 // to prefetch memory beyond the array boundary.
8999 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
9000
9001 // The caller has already pre-loaded 8 bytes before jumping to the stub, so compare them directly
9002 __ eor(rscratch2, tmp1, tmp2);
9003 __ cbnz(rscratch2, CAL_DIFFERENCE);
9004
9005 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
9006 // update pointers, because of previous read
9007 __ add(str1, str1, wordSize);
9008 __ add(str2, str2, wordSize);
9009 if (SoftwarePrefetchHintDistance >= 0) {
9010 __ align(OptoLoopAlignment);
9011 __ bind(LARGE_LOOP_PREFETCH);
9012 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
9013 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
9014
9015 for (int i = 0; i < 4; i++) {
9016 __ ldp(tmp1, tmp1h, Address(str1, i * 16));
9017 __ ldp(tmp2, tmp2h, Address(str2, i * 16));
9018 __ cmp(tmp1, tmp2);
9019 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9020 __ br(Assembler::NE, DIFF);
9021 }
9022 __ sub(cnt2, cnt2, isLL ? 64 : 32);
9023 __ add(str1, str1, 64);
9024 __ add(str2, str2, 64);
9025 __ subs(rscratch2, cnt2, largeLoopExitCondition);
9026 __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
9027 __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
9028 }
9029
9030 __ subs(rscratch1, cnt2, isLL ? 16 : 8);
9031 __ br(Assembler::LE, LESS16);
9032 __ align(OptoLoopAlignment);
9033 __ bind(LOOP_COMPARE16);
9034 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9035 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9036 __ cmp(tmp1, tmp2);
9037 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9038 __ br(Assembler::NE, DIFF);
9039 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9040 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9041 __ br(Assembler::LT, LESS16);
9042
9043 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9044 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9045 __ cmp(tmp1, tmp2);
9046 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9047 __ br(Assembler::NE, DIFF);
9048 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9049 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9050 __ br(Assembler::GE, LOOP_COMPARE16);
9051 __ cbz(cnt2, LENGTH_DIFF);
9052
9053 __ bind(LESS16);
9054 // compare 8 bytes at a time
9055 __ subs(cnt2, cnt2, isLL ? 8 : 4);
9056 __ br(Assembler::LE, LESS8);
9057 __ ldr(tmp1, Address(__ post(str1, 8)));
9058 __ ldr(tmp2, Address(__ post(str2, 8)));
9059 __ eor(rscratch2, tmp1, tmp2);
9060 __ cbnz(rscratch2, CAL_DIFFERENCE);
9061 __ sub(cnt2, cnt2, isLL ? 8 : 4);
9062
9063 __ bind(LESS8); // directly load last 8 bytes
9064 if (!isLL) {
9065 __ add(cnt2, cnt2, cnt2);
9066 }
9067 __ ldr(tmp1, Address(str1, cnt2));
9068 __ ldr(tmp2, Address(str2, cnt2));
9069 __ eor(rscratch2, tmp1, tmp2);
9070 __ cbz(rscratch2, LENGTH_DIFF);
9071 __ b(CAL_DIFFERENCE);
9072
9073 __ bind(DIFF);
9074 __ cmp(tmp1, tmp2);
9075 __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
9076 __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
9077 // reuse rscratch2 register for the result of eor instruction
9078 __ eor(rscratch2, tmp1, tmp2);
9079
9080 __ bind(CAL_DIFFERENCE);
9081 __ rev(rscratch2, rscratch2);
9082 __ clz(rscratch2, rscratch2);
9083 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
9084 __ lsrv(tmp1, tmp1, rscratch2);
9085 __ lsrv(tmp2, tmp2, rscratch2);
9086 if (isLL) {
9087 __ uxtbw(tmp1, tmp1);
9088 __ uxtbw(tmp2, tmp2);
9089 } else {
9090 __ uxthw(tmp1, tmp1);
9091 __ uxthw(tmp2, tmp2);
9092 }
9093 __ subw(result, tmp1, tmp2);
9094
9095 __ bind(LENGTH_DIFF);
9096 __ ret(lr);
9097 return entry;
9098 }
9099
9100 enum string_compare_mode {
9101 LL,
9102 LU,
9103 UL,
9104 UU,
9105 };
9106
9107 // The following registers are declared in aarch64.ad
9108 // r0 = result
9109 // r1 = str1
9110 // r2 = cnt1
9111 // r3 = str2
9112 // r4 = cnt2
9113 // r10 = tmp1
9114 // r11 = tmp2
9115 // z0 = ztmp1
9116 // z1 = ztmp2
9117 // p0 = pgtmp1
9118 // p1 = pgtmp2
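//
// A hedged sketch of the vector-length-agnostic SVE loop below (illustrative
// only): sve_whilelt builds a predicate covering min(vector length, cnt - idx)
// lanes, both strings are loaded under that predicate (Latin1 lanes are
// widened to 16 bits for LU/UL), and sve_cmp(NE) flags mismatching lanes;
// on a mismatch, sve_brkb + sve_lasta extract the first differing characters.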
9119 address generate_compare_long_string_sve(string_compare_mode mode) {
9120 StubId stub_id;
9121 switch (mode) {
9122 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
9123 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
9124 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
9125 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
9126 default: ShouldNotReachHere();
9127 }
9128
9129 __ align(CodeEntryAlignment);
9130 address entry = __ pc();
9131 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9132 tmp1 = r10, tmp2 = r11;
9133
9134 Label LOOP, DONE, MISMATCH;
9135 Register vec_len = tmp1;
9136 Register idx = tmp2;
9137 // The minimum of the string lengths has been stored in cnt2.
9138 Register cnt = cnt2;
9139 FloatRegister ztmp1 = z0, ztmp2 = z1;
9140 PRegister pgtmp1 = p0, pgtmp2 = p1;
9141
9142 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
9143 switch (mode) { \
9144 case LL: \
9145 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
9146 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
9147 break; \
9148 case LU: \
9149 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
9150 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9151 break; \
9152 case UL: \
9153 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9154 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
9155 break; \
9156 case UU: \
9157 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9158 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9159 break; \
9160 default: \
9161 ShouldNotReachHere(); \
9162 }
9163
9164 StubCodeMark mark(this, stub_id);
9165
9166 __ mov(idx, 0);
9167 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9168
9169 if (mode == LL) {
9170 __ sve_cntb(vec_len);
9171 } else {
9172 __ sve_cnth(vec_len);
9173 }
9174
9175 __ sub(rscratch1, cnt, vec_len);
9176
9177 __ bind(LOOP);
9178
9179 // main loop
9180 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9181 __ add(idx, idx, vec_len);
9182 // Compare strings.
9183 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9184 __ br(__ NE, MISMATCH);
9185 __ cmp(idx, rscratch1);
9186 __ br(__ LT, LOOP);
9187
9188 // post loop, last iteration
9189 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9190
9191 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9192 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9193 __ br(__ EQ, DONE);
9194
9195 __ bind(MISMATCH);
9196
9197 // Crop the vector to find its location.
9198 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
9199 // Extract the first different characters of each string.
9200 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
9201 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
9202
9203 // Compute the difference of the first different characters.
9204 __ sub(result, rscratch1, rscratch2);
9205
9206 __ bind(DONE);
9207 __ ret(lr);
9208 #undef LOAD_PAIR
9209 return entry;
9210 }
9211
9212 void generate_compare_long_strings() {
9213 if (UseSVE == 0) {
9214 StubRoutines::aarch64::_compare_long_string_LL
9215 = generate_compare_long_string_same_encoding(true);
9216 StubRoutines::aarch64::_compare_long_string_UU
9217 = generate_compare_long_string_same_encoding(false);
9218 StubRoutines::aarch64::_compare_long_string_LU
9219 = generate_compare_long_string_different_encoding(true);
9220 StubRoutines::aarch64::_compare_long_string_UL
9221 = generate_compare_long_string_different_encoding(false);
9222 } else {
9223 StubRoutines::aarch64::_compare_long_string_LL
9224 = generate_compare_long_string_sve(LL);
9225 StubRoutines::aarch64::_compare_long_string_UU
9226 = generate_compare_long_string_sve(UU);
9227 StubRoutines::aarch64::_compare_long_string_LU
9228 = generate_compare_long_string_sve(LU);
9229 StubRoutines::aarch64::_compare_long_string_UL
9230 = generate_compare_long_string_sve(UL);
9231 }
9232 }
9233
9234 // R0 = result
9235 // R1 = str2
9236 // R2 = cnt1
9237 // R3 = str1
9238 // R4 = cnt2
9239 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
9240 //
9241 // This generic linear code uses a few additional ideas that make it faster:
9242 // 1) we can safely keep at least the 1st register of the pattern (since its
9243 // length >= 8) in order to skip the initial loading (helps on systems with a
9244 // single load pipeline)
9245 // 2) we can use a "fast" algorithm for finding the first character that needs
9246 // fewer branches (1 branch per loaded register instead of a branch per symbol);
9247 // this is where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f,
9248 // 0x7fff7fff...7fff come from (see the sketch after this comment)
9249 // 3) after loading and analyzing the 1st register of the source string, it can
9250 // be used to search for every occurrence of the 1st character, saving a few
9251 // loads compared to a "simpler-but-slower" implementation
9252 // 4) in order to avoid lots of push/pop operations, the code below heavily
9253 // re-uses/re-initializes/compresses register values, which makes the code larger
9254 // and a bit less readable; however, most of the extra operations are issued during loads or branches, so the penalty is minimal
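//
// A hedged sketch of the SWAR trick behind those constants (byte case;
// illustrative only, 'loaded'/'x'/'match' are just local names): after
// xor'ing a loaded word with the first pattern character replicated into
// every byte (first * 0x0101010101010101), a zero byte marks a candidate
// match and can be found branchlessly:
//
//   uint64_t x     = loaded ^ (first * 0x0101010101010101ULL);
//   uint64_t match = (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
//   // match != 0 iff some byte position holds the first pattern char; the
//   // lowest set bit marks the first candidate (candidates are then verified
//   // by the full comparison loops below)
//
// The 16-bit (UTF-16) variants use 0x0001000100010001 / 0x7fff7fff7fff7fff
// instead, and the code folds the "& ~x & 0x80..." part into orr + bics.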
9255 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
9256 StubId stub_id;
9257 if (str1_isL) {
9258 if (str2_isL) {
9259 stub_id = StubId::stubgen_string_indexof_linear_ll_id;
9260 } else {
9261 stub_id = StubId::stubgen_string_indexof_linear_ul_id;
9262 }
9263 } else {
9264 if (str2_isL) {
9265 ShouldNotReachHere();
9266 } else {
9267 stub_id = StubId::stubgen_string_indexof_linear_uu_id;
9268 }
9269 }
9270 __ align(CodeEntryAlignment);
9271 StubCodeMark mark(this, stub_id);
9272 address entry = __ pc();
9273
9274 int str1_chr_size = str1_isL ? 1 : 2;
9275 int str2_chr_size = str2_isL ? 1 : 2;
9276 int str1_chr_shift = str1_isL ? 0 : 1;
9277 int str2_chr_shift = str2_isL ? 0 : 1;
9278 bool isL = str1_isL && str2_isL;
9279 // parameters
9280 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
9281 // temporary registers
9282 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
9283 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
9284 // redefinitions
9285 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
9286
9287 __ push(spilled_regs, sp);
9288 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
9289 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
9290 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
9291 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
9292 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
9293 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
9294 // Read whole register from str1. It is safe, because length >=8 here
9295 __ ldr(ch1, Address(str1));
9296 // Read whole register from str2. It is safe, because length >=8 here
9297 __ ldr(ch2, Address(str2));
9298 __ sub(cnt2, cnt2, cnt1);
9299 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
9300 if (str1_isL != str2_isL) {
9301 __ eor(v0, __ T16B, v0, v0);
9302 }
9303 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
9304 __ mul(first, first, tmp1);
9305 // check if we have less than 1 register to check
9306 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
9307 if (str1_isL != str2_isL) {
9308 __ fmovd(v1, ch1);
9309 }
9310 __ br(__ LE, L_SMALL);
9311 __ eor(ch2, first, ch2);
9312 if (str1_isL != str2_isL) {
9313 __ zip1(v1, __ T16B, v1, v0);
9314 }
9315 __ sub(tmp2, ch2, tmp1);
9316 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9317 __ bics(tmp2, tmp2, ch2);
9318 if (str1_isL != str2_isL) {
9319 __ fmovd(ch1, v1);
9320 }
9321 __ br(__ NE, L_HAS_ZERO);
9322 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9323 __ add(result, result, wordSize/str2_chr_size);
9324 __ add(str2, str2, wordSize);
9325 __ br(__ LT, L_POST_LOOP);
9326 __ BIND(L_LOOP);
9327 __ ldr(ch2, Address(str2));
9328 __ eor(ch2, first, ch2);
9329 __ sub(tmp2, ch2, tmp1);
9330 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9331 __ bics(tmp2, tmp2, ch2);
9332 __ br(__ NE, L_HAS_ZERO);
9333 __ BIND(L_LOOP_PROCEED);
9334 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9335 __ add(str2, str2, wordSize);
9336 __ add(result, result, wordSize/str2_chr_size);
9337 __ br(__ GE, L_LOOP);
9338 __ BIND(L_POST_LOOP);
9339 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
9340 __ br(__ LE, NOMATCH);
9341 __ ldr(ch2, Address(str2));
9342 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9343 __ eor(ch2, first, ch2);
9344 __ sub(tmp2, ch2, tmp1);
9345 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9346 __ mov(tmp4, -1); // all bits set
9347 __ b(L_SMALL_PROCEED);
9348 __ align(OptoLoopAlignment);
9349 __ BIND(L_SMALL);
9350 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9351 __ eor(ch2, first, ch2);
9352 if (str1_isL != str2_isL) {
9353 __ zip1(v1, __ T16B, v1, v0);
9354 }
9355 __ sub(tmp2, ch2, tmp1);
9356 __ mov(tmp4, -1); // all bits set
9357 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9358 if (str1_isL != str2_isL) {
9359 __ fmovd(ch1, v1); // move converted 4 symbols
9360 }
9361 __ BIND(L_SMALL_PROCEED);
9362 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
9363 __ bic(tmp2, tmp2, ch2);
9364 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
9365 __ rbit(tmp2, tmp2);
9366 __ br(__ EQ, NOMATCH);
9367 __ BIND(L_SMALL_HAS_ZERO_LOOP);
9368 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
9369 __ cmp(cnt1, u1(wordSize/str2_chr_size));
9370 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
9371 if (str2_isL) { // LL
9372 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9373 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9374 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9375 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9376 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9377 } else {
9378 __ mov(ch2, 0xE); // all bits in byte set except last one
9379 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9380 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9381 __ lslv(tmp2, tmp2, tmp4);
9382 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9383 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9384 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9385 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9386 }
9387 __ cmp(ch1, ch2);
9388 __ mov(tmp4, wordSize/str2_chr_size);
9389 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9390 __ BIND(L_SMALL_CMP_LOOP);
9391 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9392 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9393 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9394 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9395 __ add(tmp4, tmp4, 1);
9396 __ cmp(tmp4, cnt1);
9397 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
9398 __ cmp(first, ch2);
9399 __ br(__ EQ, L_SMALL_CMP_LOOP);
9400 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
9401 __ cbz(tmp2, NOMATCH); // no more matches. exit
9402 __ clz(tmp4, tmp2);
9403 __ add(result, result, 1); // advance index
9404 __ add(str2, str2, str2_chr_size); // advance pointer
9405 __ b(L_SMALL_HAS_ZERO_LOOP);
9406 __ align(OptoLoopAlignment);
9407 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
9408 __ cmp(first, ch2);
9409 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9410 __ b(DONE);
9411 __ align(OptoLoopAlignment);
9412 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
9413 if (str2_isL) { // LL
9414 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9415 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9416 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9417 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9418 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9419 } else {
9420 __ mov(ch2, 0xE); // all bits in byte set except last one
9421 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9422 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9423 __ lslv(tmp2, tmp2, tmp4);
9424 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9425 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9426 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9427 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9428 }
9429 __ cmp(ch1, ch2);
9430 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9431 __ b(DONE);
9432 __ align(OptoLoopAlignment);
9433 __ BIND(L_HAS_ZERO);
9434 __ rbit(tmp2, tmp2);
    __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
    // Now pack the two counters (cnt2 and cnt1) into one register. This is
    // safe because both counters are 32-bit and do not change in this loop;
    // they are restored on exit, so the cnt1 register can be reused below.
9439 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
9440 __ sub(result, result, 1);
9441 __ BIND(L_HAS_ZERO_LOOP);
9442 __ mov(cnt1, wordSize/str2_chr_size);
9443 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
9444 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
9445 if (str2_isL) {
9446 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9447 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9448 __ lslv(tmp2, tmp2, tmp4);
9449 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9450 __ add(tmp4, tmp4, 1);
9451 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9452 __ lsl(tmp2, tmp2, 1);
9453 __ mov(tmp4, wordSize/str2_chr_size);
9454 } else {
9455 __ mov(ch2, 0xE);
9456 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9457 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9458 __ lslv(tmp2, tmp2, tmp4);
9459 __ add(tmp4, tmp4, 1);
9460 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9461 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9462 __ lsl(tmp2, tmp2, 1);
9463 __ mov(tmp4, wordSize/str2_chr_size);
9464 __ sub(str2, str2, str2_chr_size);
9465 }
9466 __ cmp(ch1, ch2);
9467 __ mov(tmp4, wordSize/str2_chr_size);
9468 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9469 __ BIND(L_CMP_LOOP);
9470 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9471 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9472 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9473 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9474 __ add(tmp4, tmp4, 1);
9475 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
9476 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
9477 __ cmp(cnt1, ch2);
9478 __ br(__ EQ, L_CMP_LOOP);
9479 __ BIND(L_CMP_LOOP_NOMATCH);
    // no match at this position
9481 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
9482 __ clz(tmp4, tmp2);
9483 __ add(str2, str2, str2_chr_size); // advance pointer
9484 __ b(L_HAS_ZERO_LOOP);
9485 __ align(OptoLoopAlignment);
9486 __ BIND(L_CMP_LOOP_LAST_CMP);
9487 __ cmp(cnt1, ch2);
9488 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9489 __ b(DONE);
9490 __ align(OptoLoopAlignment);
9491 __ BIND(L_CMP_LOOP_LAST_CMP2);
9492 if (str2_isL) {
9493 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9494 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9495 __ lslv(tmp2, tmp2, tmp4);
9496 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9497 __ add(tmp4, tmp4, 1);
9498 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9499 __ lsl(tmp2, tmp2, 1);
9500 } else {
9501 __ mov(ch2, 0xE);
9502 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9503 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9504 __ lslv(tmp2, tmp2, tmp4);
9505 __ add(tmp4, tmp4, 1);
9506 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9507 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9508 __ lsl(tmp2, tmp2, 1);
9509 __ sub(str2, str2, str2_chr_size);
9510 }
9511 __ cmp(ch1, ch2);
9512 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9513 __ b(DONE);
9514 __ align(OptoLoopAlignment);
9515 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
    // 1) Restore the "result" index. Until the L_HAS_ZERO block, result was a
    //    multiple of wordSize/str2_chr_size. The byte octet was analyzed in
    //    L_HAS_ZERO_LOOP, so result was increased by at most
    //    wordSize/str2_chr_size - 1 and the higher bits were not changed.
    //    L_LOOP_PROCEED will increase result by the number of analyzed
    //    characters, so we can simply reset the lower bits of result here:
    //    clear 2 lower bits for UU/UL and 3 bits for LL.
    // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
    // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
    //    (UU/UL) is the index of the last analyzed substring inside the
    //    current octet, so str2 is at the respective start address and needs
    //    to be advanced to the next octet.
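    // Roughly, in C terms (illustrative sketch only, mirroring the code below):
    //   analyzed = result & (wordSize/str2_chr_size - 1); // chars done in this octet
    //   cnt1     = cnt2 >> 32;                            // saved cnt1 in the high half
    //   result  &= ~(wordSize/str2_chr_size - 1);         // reset lower bits
    //   str2    -= analyzed << str2_chr_shift;            // back to the octet start
    //   cnt2     = (uint32_t)cnt2;                        // saved cnt2 in the low half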
9526 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
9527 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
9528 __ bfm(result, zr, 0, 2 - str2_chr_shift);
9529 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
9530 __ movw(cnt2, cnt2);
9531 __ b(L_LOOP_PROCEED);
9532 __ align(OptoLoopAlignment);
9533 __ BIND(NOMATCH);
9534 __ mov(result, -1);
9535 __ BIND(DONE);
9536 __ pop(spilled_regs, sp);
9537 __ ret(lr);
9538 return entry;
9539 }
9540
9541 void generate_string_indexof_stubs() {
9542 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
9543 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
9544 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
9545 }
9546
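  // Interleave the 16 Latin-1 bytes in src1 and src2 with zero bytes and store
  // the resulting 32 UTF-16 chars (64 bytes) to r1. v0 is assumed to hold zero,
  // which is an input precondition of the enclosing stub (see the register
  // comments below).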
9547 void inflate_and_store_2_fp_registers(bool generatePrfm,
9548 FloatRegister src1, FloatRegister src2) {
9549 Register dst = r1;
9550 __ zip1(v1, __ T16B, src1, v0);
9551 __ zip2(v2, __ T16B, src1, v0);
9552 if (generatePrfm) {
9553 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
9554 }
9555 __ zip1(v3, __ T16B, src2, v0);
9556 __ zip2(v4, __ T16B, src2, v0);
9557 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
9558 }
9559
9560 // R0 = src
9561 // R1 = dst
9562 // R2 = len
9563 // R3 = len >> 3
9564 // V0 = 0
9565 // v1 = loaded 8 bytes
9566 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
9567 address generate_large_byte_array_inflate() {
9568 __ align(CodeEntryAlignment);
9569 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
9570 StubCodeMark mark(this, stub_id);
9571 address entry = __ pc();
9572 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
9573 Register src = r0, dst = r1, len = r2, octetCounter = r3;
9574 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
9575
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
9578 __ ldrd(v2, __ post(src, 8));
9579 __ sub(octetCounter, octetCounter, 2);
9580 __ zip1(v1, __ T16B, v1, v0);
9581 __ zip1(v2, __ T16B, v2, v0);
9582 __ st1(v1, v2, __ T16B, __ post(dst, 32));
9583 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9584 __ subs(rscratch1, octetCounter, large_loop_threshold);
9585 __ br(__ LE, LOOP_START);
9586 __ b(LOOP_PRFM_START);
9587 __ bind(LOOP_PRFM);
9588 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9589 __ bind(LOOP_PRFM_START);
9590 __ prfm(Address(src, SoftwarePrefetchHintDistance));
9591 __ sub(octetCounter, octetCounter, 8);
9592 __ subs(rscratch1, octetCounter, large_loop_threshold);
9593 inflate_and_store_2_fp_registers(true, v3, v4);
9594 inflate_and_store_2_fp_registers(true, v5, v6);
9595 __ br(__ GT, LOOP_PRFM);
9596 __ cmp(octetCounter, (u1)8);
9597 __ br(__ LT, DONE);
9598 __ bind(LOOP);
9599 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9600 __ bind(LOOP_START);
9601 __ sub(octetCounter, octetCounter, 8);
9602 __ cmp(octetCounter, (u1)8);
9603 inflate_and_store_2_fp_registers(false, v3, v4);
9604 inflate_and_store_2_fp_registers(false, v5, v6);
9605 __ br(__ GE, LOOP);
9606 __ bind(DONE);
9607 __ ret(lr);
9608 return entry;
9609 }
9610
9611 /**
9612 * Arguments:
9613 *
9614 * Input:
9615 * c_rarg0 - current state address
9616 * c_rarg1 - H key address
9617 * c_rarg2 - data address
9618 * c_rarg3 - number of blocks
9619 *
9620 * Output:
9621 * Updated state at c_rarg0
9622 */
9623 address generate_ghash_processBlocks() {
9624 // Bafflingly, GCM uses little-endian for the byte order, but
9625 // big-endian for the bit order. For example, the polynomial 1 is
9626 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
9627 //
9628 // So, we must either reverse the bytes in each word and do
9629 // everything big-endian or reverse the bits in each byte and do
9630 // it little-endian. On AArch64 it's more idiomatic to reverse
9631 // the bits in each byte (we have an instruction, RBIT, to do
9632 // that) and keep the data in little-endian bit order through the
9633 // calculation, bit-reversing the inputs and outputs.
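    // Note (illustrative): the rev64 + rbit pairs below amount to a full
    // bit-reversal of each 64-bit half of the vector, since reversing the
    // byte order within a doubleword and then the bit order within each byte
    // reverses the bit order of the whole doubleword.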
9634
9635 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
9636 StubCodeMark mark(this, stub_id);
9637 Label polynomial; // local data generated at end of stub
9638 __ align(CodeEntryAlignment);
9639 address start = __ pc();
9640
9641 Register state = c_rarg0;
9642 Register subkeyH = c_rarg1;
9643 Register data = c_rarg2;
9644 Register blocks = c_rarg3;
9645
9646 FloatRegister vzr = v30;
9647 __ eor(vzr, __ T16B, vzr, vzr); // zero register
9648
9649 __ adr(rscratch1, polynomial);
9650 __ ldrq(v24, rscratch1); // The field polynomial
9651
9652 __ ldrq(v0, Address(state));
9653 __ ldrq(v1, Address(subkeyH));
9654
9655 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
9656 __ rbit(v0, __ T16B, v0);
9657 __ rev64(v1, __ T16B, v1);
9658 __ rbit(v1, __ T16B, v1);
9659
9660 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
9661 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
9662
9663 {
9664 Label L_ghash_loop;
9665 __ bind(L_ghash_loop);
9666
9667 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
9668 // reversing each byte
9669 __ rbit(v2, __ T16B, v2);
9670 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
9671
9672 // Multiply state in v2 by subkey in v1
9673 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
9674 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
9675 /*temps*/v6, v3, /*reuse/clobber b*/v2);
9676 // Reduce v7:v5 by the field polynomial
9677 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
9678
9679 __ sub(blocks, blocks, 1);
9680 __ cbnz(blocks, L_ghash_loop);
9681 }
9682
9683 // The bit-reversed result is at this point in v0
9684 __ rev64(v0, __ T16B, v0);
9685 __ rbit(v0, __ T16B, v0);
9686
9687 __ st1(v0, __ T16B, state);
9688 __ ret(lr);
9689
9690 // bind label and generate local polynomial data
9691 __ align(wordSize * 2);
9692 __ bind(polynomial);
9693 __ emit_int64(0x87); // The low-order bits of the field
9694 // polynomial (i.e. p = z^7+z^2+z+1)
9695 // repeated in the low and high parts of a
9696 // 128-bit vector
9697 __ emit_int64(0x87);
9698
9699 return start;
9700 }
9701
9702 address generate_ghash_processBlocks_wide() {
9703 address small = generate_ghash_processBlocks();
9704
9705 StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
9706 StubCodeMark mark(this, stub_id);
9707 Label polynomial; // local data generated after stub
9708 __ align(CodeEntryAlignment);
9709 address start = __ pc();
9710
9711 Register state = c_rarg0;
9712 Register subkeyH = c_rarg1;
9713 Register data = c_rarg2;
9714 Register blocks = c_rarg3;
9715
9716 const int unroll = 4;
9717
9718 __ cmp(blocks, (unsigned char)(unroll * 2));
9719 __ br(__ LT, small);
9720
9721 if (unroll > 1) {
9722 // Save state before entering routine
9723 __ sub(sp, sp, 4 * 16);
9724 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
9725 __ sub(sp, sp, 4 * 16);
9726 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
9727 }
9728
9729 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
9730
9731 if (unroll > 1) {
9732 // And restore state
9733 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
9734 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
9735 }
9736
9737 __ cmp(blocks, (unsigned char)0);
9738 __ br(__ GT, small);
9739
9740 __ ret(lr);
9741
9742 // bind label and generate polynomial data
9743 __ align(wordSize * 2);
9744 __ bind(polynomial);
9745 __ emit_int64(0x87); // The low-order bits of the field
9746 // polynomial (i.e. p = z^7+z^2+z+1)
9747 // repeated in the low and high parts of a
9748 // 128-bit vector
9749 __ emit_int64(0x87);
9750
9751 return start;
9752
9753 }
9754
9755 void generate_base64_encode_simdround(Register src, Register dst,
9756 FloatRegister codec, u8 size) {
9757
9758 FloatRegister in0 = v4, in1 = v5, in2 = v6;
9759 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
9760 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
9761
9762 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9763
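    // Each lane turns a byte triple (b0, b1, b2) into four 6-bit indices;
    // roughly, in C (illustrative sketch of the shifts below):
    //   ind0 = b0 >> 2;
    //   ind1 = ((b0 & 0x3) << 4) | (b1 >> 4);
    //   ind2 = ((b1 & 0xf) << 2) | (b2 >> 6);
    //   ind3 = b2 & 0x3f;
    // Each index is then mapped through the 64-entry codec table with tbl.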
9764 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
9765
9766 __ ushr(ind0, arrangement, in0, 2);
9767
9768 __ ushr(ind1, arrangement, in1, 2);
9769 __ shl(in0, arrangement, in0, 6);
9770 __ orr(ind1, arrangement, ind1, in0);
9771 __ ushr(ind1, arrangement, ind1, 2);
9772
9773 __ ushr(ind2, arrangement, in2, 4);
9774 __ shl(in1, arrangement, in1, 4);
9775 __ orr(ind2, arrangement, in1, ind2);
9776 __ ushr(ind2, arrangement, ind2, 2);
9777
9778 __ shl(ind3, arrangement, in2, 2);
9779 __ ushr(ind3, arrangement, ind3, 2);
9780
9781 __ tbl(out0, arrangement, codec, 4, ind0);
9782 __ tbl(out1, arrangement, codec, 4, ind1);
9783 __ tbl(out2, arrangement, codec, 4, ind2);
9784 __ tbl(out3, arrangement, codec, 4, ind3);
9785
9786 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
9787 }
9788
9789 /**
9790 * Arguments:
9791 *
9792 * Input:
9793 * c_rarg0 - src_start
9794 * c_rarg1 - src_offset
9795 * c_rarg2 - src_length
9796 * c_rarg3 - dest_start
9797 * c_rarg4 - dest_offset
9798 * c_rarg5 - isURL
9799 *
9800 */
9801 address generate_base64_encodeBlock() {
9802
9803 static const char toBase64[64] = {
9804 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9805 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9806 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9807 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9808 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
9809 };
9810
9811 static const char toBase64URL[64] = {
9812 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9813 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9814 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9815 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9816 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
9817 };
9818
9819 __ align(CodeEntryAlignment);
9820 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
9821 StubCodeMark mark(this, stub_id);
9822 address start = __ pc();
9823
9824 Register src = c_rarg0; // source array
9825 Register soff = c_rarg1; // source start offset
9826 Register send = c_rarg2; // source end offset
9827 Register dst = c_rarg3; // dest array
9828 Register doff = c_rarg4; // position for writing to dest array
9829 Register isURL = c_rarg5; // Base64 or URL character set
9830
9831 // c_rarg6 and c_rarg7 are free to use as temps
9832 Register codec = c_rarg6;
9833 Register length = c_rarg7;
9834
9835 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
9836
9837 __ add(src, src, soff);
9838 __ add(dst, dst, doff);
9839 __ sub(length, send, soff);
9840
9841 // load the codec base address
9842 __ lea(codec, ExternalAddress((address) toBase64));
9843 __ cbz(isURL, ProcessData);
9844 __ lea(codec, ExternalAddress((address) toBase64URL));
9845
9846 __ BIND(ProcessData);
9847
    // too short to form a SIMD loop; fall back to the scalar 3-byte loop
9849 __ cmp(length, (u1)24);
9850 __ br(Assembler::LT, Process3B);
9851
9852 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
9853
9854 __ BIND(Process48B);
9855 __ cmp(length, (u1)48);
9856 __ br(Assembler::LT, Process24B);
9857 generate_base64_encode_simdround(src, dst, v0, 16);
9858 __ sub(length, length, 48);
9859 __ b(Process48B);
9860
9861 __ BIND(Process24B);
9862 __ cmp(length, (u1)24);
9863 __ br(Assembler::LT, SIMDExit);
9864 generate_base64_encode_simdround(src, dst, v0, 8);
9865 __ sub(length, length, 24);
9866
9867 __ BIND(SIMDExit);
9868 __ cbz(length, Exit);
9869
9870 __ BIND(Process3B);
9871 // 3 src bytes, 24 bits
9872 __ ldrb(r10, __ post(src, 1));
9873 __ ldrb(r11, __ post(src, 1));
9874 __ ldrb(r12, __ post(src, 1));
9875 __ orrw(r11, r11, r10, Assembler::LSL, 8);
9876 __ orrw(r12, r12, r11, Assembler::LSL, 8);
9877 // codec index
9878 __ ubfmw(r15, r12, 18, 23);
9879 __ ubfmw(r14, r12, 12, 17);
9880 __ ubfmw(r13, r12, 6, 11);
9881 __ andw(r12, r12, 63);
9882 // get the code based on the codec
9883 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
9884 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
9885 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
9886 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
9887 __ strb(r15, __ post(dst, 1));
9888 __ strb(r14, __ post(dst, 1));
9889 __ strb(r13, __ post(dst, 1));
9890 __ strb(r12, __ post(dst, 1));
9891 __ sub(length, length, 3);
9892 __ cbnz(length, Process3B);
9893
9894 __ BIND(Exit);
9895 __ ret(lr);
9896
9897 return start;
9898 }
9899
9900 void generate_base64_decode_simdround(Register src, Register dst,
9901 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
9902
9903 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
9904 FloatRegister out0 = v20, out1 = v21, out2 = v22;
9905
9906 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
9907 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
9908
9909 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
9910
9911 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9912
9913 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
9914
    // We need an unsigned saturating subtract here so that every input value
    // in the range [0, 63] yields a zero index for the higher-half lookup.
9917 __ uqsubv(decH0, __ T16B, in0, v27);
9918 __ uqsubv(decH1, __ T16B, in1, v27);
9919 __ uqsubv(decH2, __ T16B, in2, v27);
9920 __ uqsubv(decH3, __ T16B, in3, v27);
9921
9922 // lower half lookup
9923 __ tbl(decL0, arrangement, codecL, 4, in0);
9924 __ tbl(decL1, arrangement, codecL, 4, in1);
9925 __ tbl(decL2, arrangement, codecL, 4, in2);
9926 __ tbl(decL3, arrangement, codecL, 4, in3);
9927
9928 // higher half lookup
9929 __ tbx(decH0, arrangement, codecH, 4, decH0);
9930 __ tbx(decH1, arrangement, codecH, 4, decH1);
9931 __ tbx(decH2, arrangement, codecH, 4, decH2);
9932 __ tbx(decH3, arrangement, codecH, 4, decH3);
9933
9934 // combine lower and higher
9935 __ orr(decL0, arrangement, decL0, decH0);
9936 __ orr(decL1, arrangement, decL1, decH1);
9937 __ orr(decL2, arrangement, decL2, decH2);
9938 __ orr(decL3, arrangement, decL3, decH3);
9939
    // check for illegal inputs: a value larger than 63 (the 6-bit maximum) marks an illegal character
9941 __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
9942 __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
9943 __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
9944 __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
9945 __ orr(in0, arrangement, decH0, decH1);
9946 __ orr(in1, arrangement, decH2, decH3);
9947 __ orr(in2, arrangement, in0, in1);
9948 __ umaxv(in3, arrangement, in2);
9949 __ umov(rscratch2, in3, __ B, 0);
9950
9951 // get the data to output
9952 __ shl(out0, arrangement, decL0, 2);
9953 __ ushr(out1, arrangement, decL1, 4);
9954 __ orr(out0, arrangement, out0, out1);
9955 __ shl(out1, arrangement, decL1, 4);
9956 __ ushr(out2, arrangement, decL2, 2);
9957 __ orr(out1, arrangement, out1, out2);
9958 __ shl(out2, arrangement, decL2, 6);
9959 __ orr(out2, arrangement, out2, decL3);
9960
9961 __ cbz(rscratch2, NoIllegalData);
9962
9963 // handle illegal input
9964 __ umov(r10, in2, __ D, 0);
9965 if (size == 16) {
9966 __ cbnz(r10, ErrorInLowerHalf);
9967
9968 // illegal input is in higher half, store the lower half now.
9969 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
9970
9971 __ umov(r10, in2, __ D, 1);
9972 __ umov(r11, out0, __ D, 1);
9973 __ umov(r12, out1, __ D, 1);
9974 __ umov(r13, out2, __ D, 1);
9975 __ b(StoreLegalData);
9976
9977 __ BIND(ErrorInLowerHalf);
9978 }
9979 __ umov(r11, out0, __ D, 0);
9980 __ umov(r12, out1, __ D, 0);
9981 __ umov(r13, out2, __ D, 0);
9982
9983 __ BIND(StoreLegalData);
9984 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
9985 __ strb(r11, __ post(dst, 1));
9986 __ strb(r12, __ post(dst, 1));
9987 __ strb(r13, __ post(dst, 1));
9988 __ lsr(r10, r10, 8);
9989 __ lsr(r11, r11, 8);
9990 __ lsr(r12, r12, 8);
9991 __ lsr(r13, r13, 8);
9992 __ b(StoreLegalData);
9993
9994 __ BIND(NoIllegalData);
9995 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
9996 }
9997
9998
9999 /**
10000 * Arguments:
10001 *
10002 * Input:
10003 * c_rarg0 - src_start
10004 * c_rarg1 - src_offset
10005 * c_rarg2 - src_length
10006 * c_rarg3 - dest_start
10007 * c_rarg4 - dest_offset
10008 * c_rarg5 - isURL
10009 * c_rarg6 - isMIME
10010 *
10011 */
10012 address generate_base64_decodeBlock() {
10013
    // The SIMD part of this Base64 decode intrinsic is based on the algorithm
    // outlined at http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords,
    // in the section titled "Base64 decoding".

    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array
    // used in java.util.Base64, except that the trailing character '=' is also
    // treated as an illegal value in this intrinsic: java.util.Base64.fromBase64['=']
    // is -2, while fromBase(URL)64ForNoSIMD['='] is 255 here.
10021 static const uint8_t fromBase64ForNoSIMD[256] = {
10022 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10023 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10024 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10025 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10026 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10027 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
10028 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10029 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10030 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10031 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10032 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10033 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10034 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10035 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10036 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10037 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10038 };
10039
10040 static const uint8_t fromBase64URLForNoSIMD[256] = {
10041 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10042 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10043 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10044 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10045 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10046 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
10047 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10048 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10049 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10050 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10051 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10052 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10053 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10054 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10055 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10056 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10057 };
10058
    // A legal Base64 code value is in the range [0, 127]. We need two table
    // lookups with tbl/tbx and combine the results to get the decoded data.
    // The first lookup uses tbl: out-of-range indices are set to 0 in the
    // destination. The second lookup uses tbx: out-of-range indices leave the
    // destination unchanged. Input values [64, 126] are mapped to indices
    // [65, 127] in the second lookup. The entry at index 64 is 0, so inputs
    // already decoded by the first lookup are left undisturbed when the two
    // results are combined.
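    // Worked example (illustrative, using the standard table below): for the
    // input byte 'A' (65) the low-half tbl index 65 is out of range and yields
    // 0, while the high-half index is 65 - 63 = 2, selecting entry 66, which
    // is 0, the correct decoded value. For '+' (43) the low half yields 62 and
    // the high-half index saturates to 0, selecting entry 64, which is 0, so
    // OR-ing the two halves gives 62.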
10066 static const uint8_t fromBase64ForSIMD[128] = {
10067 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10068 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10069 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10070 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10071 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10072 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10073 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10074 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10075 };
10076
10077 static const uint8_t fromBase64URLForSIMD[128] = {
10078 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10079 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10080 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10081 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10082 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10083 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10084 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10085 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10086 };
10087
10088 __ align(CodeEntryAlignment);
10089 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10090 StubCodeMark mark(this, stub_id);
10091 address start = __ pc();
10092
10093 Register src = c_rarg0; // source array
10094 Register soff = c_rarg1; // source start offset
10095 Register send = c_rarg2; // source end offset
10096 Register dst = c_rarg3; // dest array
10097 Register doff = c_rarg4; // position for writing to dest array
10098 Register isURL = c_rarg5; // Base64 or URL character set
10099 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation
10100
10101 Register length = send; // reuse send as length of source data to process
10102
10103 Register simd_codec = c_rarg6;
10104 Register nosimd_codec = c_rarg7;
10105
10106 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10107
10108 __ enter();
10109
10110 __ add(src, src, soff);
10111 __ add(dst, dst, doff);
10112
10113 __ mov(doff, dst);
10114
10115 __ sub(length, send, soff);
    __ bfm(length, zr, 0, 1);   // clear the two low bits: only whole 4-character groups are decoded here
10117
10118 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10119 __ cbz(isURL, ProcessData);
10120 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10121
10122 __ BIND(ProcessData);
10123 __ mov(rscratch1, length);
10124 __ cmp(length, (u1)144); // 144 = 80 + 64
10125 __ br(Assembler::LT, Process4B);
10126
10127 // In the MIME case, the line length cannot be more than 76
10128 // bytes (see RFC 2045). This is too short a block for SIMD
10129 // to be worthwhile, so we use non-SIMD here.
10130 __ movw(rscratch1, 79);
10131
10132 __ BIND(Process4B);
10133 __ ldrw(r14, __ post(src, 4));
10134 __ ubfxw(r10, r14, 0, 8);
10135 __ ubfxw(r11, r14, 8, 8);
10136 __ ubfxw(r12, r14, 16, 8);
10137 __ ubfxw(r13, r14, 24, 8);
10138 // get the de-code
10139 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10140 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10141 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10142 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10143 // error detection, 255u indicates an illegal input
10144 __ orrw(r14, r10, r11);
10145 __ orrw(r15, r12, r13);
10146 __ orrw(r14, r14, r15);
10147 __ tbnz(r14, 7, Exit);
10148 // recover the data
10149 __ lslw(r14, r10, 10);
10150 __ bfiw(r14, r11, 4, 6);
10151 __ bfmw(r14, r12, 2, 5);
10152 __ rev16w(r14, r14);
10153 __ bfiw(r13, r12, 6, 2);
10154 __ strh(r14, __ post(dst, 2));
10155 __ strb(r13, __ post(dst, 1));
10156 // non-simd loop
10157 __ subsw(rscratch1, rscratch1, 4);
10158 __ br(Assembler::GT, Process4B);
10159
    // If we are exiting here after the 80-byte pre-processing above,
    // rscratch1 == -1; otherwise rscratch1 == 0.
10162 __ cbzw(rscratch1, Exit);
10163 __ sub(length, length, 80);
10164
10165 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10166 __ cbz(isURL, SIMDEnter);
10167 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10168
10169 __ BIND(SIMDEnter);
10170 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10171 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10172 __ mov(rscratch1, 63);
10173 __ dup(v27, __ T16B, rscratch1);
10174
10175 __ BIND(Process64B);
10176 __ cmp(length, (u1)64);
10177 __ br(Assembler::LT, Process32B);
10178 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10179 __ sub(length, length, 64);
10180 __ b(Process64B);
10181
10182 __ BIND(Process32B);
10183 __ cmp(length, (u1)32);
10184 __ br(Assembler::LT, SIMDExit);
10185 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10186 __ sub(length, length, 32);
10187 __ b(Process32B);
10188
10189 __ BIND(SIMDExit);
10190 __ cbz(length, Exit);
10191 __ movw(rscratch1, length);
10192 __ b(Process4B);
10193
10194 __ BIND(Exit);
10195 __ sub(c_rarg0, dst, doff);
10196
10197 __ leave();
10198 __ ret(lr);
10199
10200 return start;
10201 }
10202
10203 // Support for spin waits.
10204 address generate_spin_wait() {
10205 __ align(CodeEntryAlignment);
10206 StubId stub_id = StubId::stubgen_spin_wait_id;
10207 StubCodeMark mark(this, stub_id);
10208 address start = __ pc();
10209
10210 __ spin_wait();
10211 __ ret(lr);
10212
10213 return start;
10214 }
10215
10216 void generate_lookup_secondary_supers_table_stub() {
10217 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10218 StubCodeMark mark(this, stub_id);
10219
10220 const Register
10221 r_super_klass = r0,
10222 r_array_base = r1,
10223 r_array_length = r2,
10224 r_array_index = r3,
10225 r_sub_klass = r4,
10226 r_bitmap = rscratch2,
10227 result = r5;
10228 const FloatRegister
10229 vtemp = v0;
10230
10231 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10232 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10233 Label L_success;
10234 __ enter();
10235 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10236 r_array_base, r_array_length, r_array_index,
10237 vtemp, result, slot,
10238 /*stub_is_near*/true);
10239 __ leave();
10240 __ ret(lr);
10241 }
10242 }
10243
10244 // Slow path implementation for UseSecondarySupersTable.
10245 address generate_lookup_secondary_supers_table_slow_path_stub() {
10246 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10247 StubCodeMark mark(this, stub_id);
10248
10249 address start = __ pc();
10250 const Register
10251 r_super_klass = r0, // argument
10252 r_array_base = r1, // argument
10253 temp1 = r2, // temp
10254 r_array_index = r3, // argument
10255 r_bitmap = rscratch2, // argument
10256 result = r5; // argument
10257
10258 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10259 __ ret(lr);
10260
10261 return start;
10262 }
10263
10264 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10265
10266 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10267 //
10268 // If LSE is in use, generate LSE versions of all the stubs. The
10269 // non-LSE versions are in atomic_aarch64.S.
10270
10271 // class AtomicStubMark records the entry point of a stub and the
10272 // stub pointer which will point to it. The stub pointer is set to
10273 // the entry point when ~AtomicStubMark() is called, which must be
10274 // after ICache::invalidate_range. This ensures safe publication of
10275 // the generated code.
10276 class AtomicStubMark {
10277 address _entry_point;
10278 aarch64_atomic_stub_t *_stub;
10279 MacroAssembler *_masm;
10280 public:
10281 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10282 _masm = masm;
10283 __ align(32);
10284 _entry_point = __ pc();
10285 _stub = stub;
10286 }
10287 ~AtomicStubMark() {
10288 *_stub = (aarch64_atomic_stub_t)_entry_point;
10289 }
10290 };
10291
10292 // NB: For memory_order_conservative we need a trailing membar after
10293 // LSE atomic operations but not a leading membar.
10294 //
10295 // We don't need a leading membar because a clause in the Arm ARM
10296 // says:
10297 //
10298 // Barrier-ordered-before
10299 //
10300 // Barrier instructions order prior Memory effects before subsequent
10301 // Memory effects generated by the same Observer. A read or a write
  // RW1 is Barrier-ordered-before a read or a write RW2 from the same
  // Observer if and only if RW1 appears in program order before RW2
  // and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10305 // instruction with both Acquire and Release semantics.
10306 //
10307 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10308 // and Release semantics, therefore we don't need a leading
10309 // barrier. However, there is no corresponding Barrier-ordered-after
10310 // relationship, therefore we need a trailing membar to prevent a
10311 // later store or load from being reordered with the store in an
10312 // atomic instruction.
10313 //
10314 // This was checked by using the herd7 consistency model simulator
10315 // (http://diy.inria.fr/) with this test case:
10316 //
10317 // AArch64 LseCas
10318 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10319 // P0 | P1;
10320 // LDR W4, [X2] | MOV W3, #0;
10321 // DMB LD | MOV W4, #1;
10322 // LDR W3, [X1] | CASAL W3, W4, [X1];
10323 // | DMB ISH;
10324 // | STR W4, [X2];
10325 // exists
10326 // (0:X3=0 /\ 0:X4=1)
10327 //
10328 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10329 // with the store to x in P1. Without the DMB in P1 this may happen.
10330 //
10331 // At the time of writing we don't know of any AArch64 hardware that
10332 // reorders stores in this way, but the Reference Manual permits it.
10333
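  // Behavioural sketch (illustrative, not an actual declaration): a CAS stub
  // generated by gen_cas_entry() below acts like
  //   prev = *ptr; if (prev == compare_val) *ptr = exchange_val; return prev;
  // with ptr in c_rarg0, compare_val in c_rarg1, exchange_val in c_rarg2 and
  // the previous value returned in r0, ordered according to the requested
  // memory order (plus a trailing full barrier in the conservative case, as
  // explained above).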
10334 void gen_cas_entry(Assembler::operand_size size,
10335 atomic_memory_order order) {
10336 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10337 exchange_val = c_rarg2;
10338 bool acquire, release;
10339 switch (order) {
10340 case memory_order_relaxed:
10341 acquire = false;
10342 release = false;
10343 break;
10344 case memory_order_release:
10345 acquire = false;
10346 release = true;
10347 break;
10348 default:
10349 acquire = true;
10350 release = true;
10351 break;
10352 }
10353 __ mov(prev, compare_val);
10354 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10355 if (order == memory_order_conservative) {
10356 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10357 }
10358 if (size == Assembler::xword) {
10359 __ mov(r0, prev);
10360 } else {
10361 __ movw(r0, prev);
10362 }
10363 __ ret(lr);
10364 }
10365
10366 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10367 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10368 // If not relaxed, then default to conservative. Relaxed is the only
10369 // case we use enough to be worth specializing.
10370 if (order == memory_order_relaxed) {
10371 __ ldadd(size, incr, prev, addr);
10372 } else {
10373 __ ldaddal(size, incr, prev, addr);
10374 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10375 }
10376 if (size == Assembler::xword) {
10377 __ mov(r0, prev);
10378 } else {
10379 __ movw(r0, prev);
10380 }
10381 __ ret(lr);
10382 }
10383
10384 void gen_swpal_entry(Assembler::operand_size size) {
10385 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10386 __ swpal(size, incr, prev, addr);
10387 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10388 if (size == Assembler::xword) {
10389 __ mov(r0, prev);
10390 } else {
10391 __ movw(r0, prev);
10392 }
10393 __ ret(lr);
10394 }
10395
10396 void generate_atomic_entry_points() {
10397 if (! UseLSE) {
10398 return;
10399 }
10400 __ align(CodeEntryAlignment);
10401 StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10402 StubCodeMark mark(this, stub_id);
10403 address first_entry = __ pc();
10404
10405 // ADD, memory_order_conservative
10406 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10407 gen_ldadd_entry(Assembler::word, memory_order_conservative);
10408 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10409 gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10410
10411 // ADD, memory_order_relaxed
10412 AtomicStubMark mark_fetch_add_4_relaxed
10413 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10414 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10415 AtomicStubMark mark_fetch_add_8_relaxed
10416 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10417 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10418
10419 // XCHG, memory_order_conservative
10420 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10421 gen_swpal_entry(Assembler::word);
10422 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10423 gen_swpal_entry(Assembler::xword);
10424
10425 // CAS, memory_order_conservative
10426 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10427 gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10428 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10429 gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10430 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10431 gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10432
10433 // CAS, memory_order_relaxed
10434 AtomicStubMark mark_cmpxchg_1_relaxed
10435 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10436 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10437 AtomicStubMark mark_cmpxchg_4_relaxed
10438 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10439 gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10440 AtomicStubMark mark_cmpxchg_8_relaxed
10441 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10442 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10443
10444 AtomicStubMark mark_cmpxchg_4_release
10445 (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10446 gen_cas_entry(MacroAssembler::word, memory_order_release);
10447 AtomicStubMark mark_cmpxchg_8_release
10448 (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10449 gen_cas_entry(MacroAssembler::xword, memory_order_release);
10450
10451 AtomicStubMark mark_cmpxchg_4_seq_cst
10452 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10453 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10454 AtomicStubMark mark_cmpxchg_8_seq_cst
10455 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10456 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10457
10458 ICache::invalidate_range(first_entry, __ pc() - first_entry);
10459 }
10460 #endif // LINUX
10461
10462 address generate_cont_thaw(Continuation::thaw_kind kind) {
10463 bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10464 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10465
10466 address start = __ pc();
10467
10468 if (return_barrier) {
10469 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10470 __ mov(sp, rscratch1);
10471 }
10472 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10473
10474 if (return_barrier) {
10475 // preserve possible return value from a method returning to the return barrier
10476 __ fmovd(rscratch1, v0);
10477 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10478 }
10479
10480 __ movw(c_rarg1, (return_barrier ? 1 : 0));
10481 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10482 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10483
10484 if (return_barrier) {
10485 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10486 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10487 __ fmovd(v0, rscratch1);
10488 }
10489 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10490
10491
10492 Label thaw_success;
10493 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10494 __ cbnz(rscratch2, thaw_success);
10495 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10496 __ br(rscratch1);
10497 __ bind(thaw_success);
10498
10499 // make room for the thawed frames
10500 __ sub(rscratch1, sp, rscratch2);
10501 __ andr(rscratch1, rscratch1, -16); // align
10502 __ mov(sp, rscratch1);
10503
10504 if (return_barrier) {
10505 // save original return value -- again
10506 __ fmovd(rscratch1, v0);
10507 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10508 }
10509
10510 // If we want, we can templatize thaw by kind, and have three different entries
10511 __ movw(c_rarg1, (uint32_t)kind);
10512
10513 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10514 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10515
10516 if (return_barrier) {
10517 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10518 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10519 __ fmovd(v0, rscratch1);
10520 } else {
10521 __ mov(r0, zr); // return 0 (success) from doYield
10522 }
10523
    // We're now on the yield frame (which is at a higher address than us because sp has been pushed down)
10525 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10526 __ mov(rfp, sp);
10527
10528 if (return_barrier_exception) {
10529 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10530 __ authenticate_return_address(c_rarg1);
10531 __ verify_oop(r0);
10532 // save return value containing the exception oop in callee-saved R19
10533 __ mov(r19, r0);
10534
10535 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10536
10537 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10538 // __ reinitialize_ptrue();
10539
10540 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10541
10542 __ mov(r1, r0); // the exception handler
10543 __ mov(r0, r19); // restore return value containing the exception oop
10544 __ verify_oop(r0);
10545
10546 __ leave();
10547 __ mov(r3, lr);
10548 __ br(r1); // the exception handler
10549 } else {
10550 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10551 __ leave();
10552 __ ret(lr);
10553 }
10554
10555 return start;
10556 }
10557
10558 address generate_cont_thaw() {
10559 if (!Continuations::enabled()) return nullptr;
10560
10561 StubId stub_id = StubId::stubgen_cont_thaw_id;
10562 StubCodeMark mark(this, stub_id);
10563 address start = __ pc();
10564 generate_cont_thaw(Continuation::thaw_top);
10565 return start;
10566 }
10567
10568 address generate_cont_returnBarrier() {
10569 if (!Continuations::enabled()) return nullptr;
10570
10571 // TODO: will probably need multiple return barriers depending on return type
10572 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10573 StubCodeMark mark(this, stub_id);
10574 address start = __ pc();
10575
10576 generate_cont_thaw(Continuation::thaw_return_barrier);
10577
10578 return start;
10579 }
10580
10581 address generate_cont_returnBarrier_exception() {
10582 if (!Continuations::enabled()) return nullptr;
10583
10584 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10585 StubCodeMark mark(this, stub_id);
10586 address start = __ pc();
10587
10588 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10589
10590 return start;
10591 }
10592
10593 address generate_cont_preempt_stub() {
10594 if (!Continuations::enabled()) return nullptr;
10595 StubId stub_id = StubId::stubgen_cont_preempt_id;
10596 StubCodeMark mark(this, stub_id);
10597 address start = __ pc();
10598
10599 __ reset_last_Java_frame(true);
10600
10601 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10602 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10603 __ mov(sp, rscratch2);
10604
10605 Label preemption_cancelled;
10606 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10607 __ cbnz(rscratch1, preemption_cancelled);
10608
10609 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10610 SharedRuntime::continuation_enter_cleanup(_masm);
10611 __ leave();
10612 __ ret(lr);
10613
10614 // We acquired the monitor after freezing the frames so call thaw to continue execution.
10615 __ bind(preemption_cancelled);
10616 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10617 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10618 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10619 __ ldr(rscratch1, Address(rscratch1));
10620 __ br(rscratch1);
10621
10622 return start;
10623 }
10624
10625 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10626 // are represented as long[5], with BITS_PER_LIMB = 26.
10627 // Pack five 26-bit limbs into three 64-bit registers.
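  // Equivalent arithmetic (sketch): with limbs l0..l4, each less than 2^26,
  //   dest0 = l0 | l1 << 26 | l2 << 52        (only the low 12 bits of l2 fit)
  //   dest1 = l2 >> 12 | l3 << 14 | l4 << 40  (only the low 24 bits of l4 fit)
  //   dest2 = l4 >> 24                        (the remaining 2 bits)
  // so dest2:dest1:dest0 is the 130-bit value
  //   l0 + 2^26*l1 + 2^52*l2 + 2^78*l3 + 2^104*l4.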
10628 void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10629 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
10630 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
10631 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10632 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
10633
10634 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
10635 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
10636 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10637 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
10638
10639 if (dest2->is_valid()) {
10640 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
10641 } else {
10642 #ifdef ASSERT
10643 Label OK;
10644 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
10645 __ br(__ EQ, OK);
10646 __ stop("high bits of Poly1305 integer should be zero");
10647 __ should_not_reach_here();
10648 __ bind(OK);
10649 #endif
10650 }
10651 }
10652
10653 // As above, but return only a 128-bit integer, packed into two
10654 // 64-bit registers.
10655 void pack_26(Register dest0, Register dest1, Register src) {
10656 pack_26(dest0, dest1, noreg, src);
10657 }
10658
10659 // Multiply and multiply-accumulate unsigned 64-bit registers.
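  // In C terms (sketch): wide_mul computes the full 128-bit product
  //   prod_hi:prod_lo = (unsigned __int128)n * m
  // and wide_madd adds such a product into the 128-bit accumulator
  // sum_hi:sum_lo, propagating the carry from the low word with adc.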
10660 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10661 __ mul(prod_lo, n, m);
10662 __ umulh(prod_hi, n, m);
10663 }
10664 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10665 wide_mul(rscratch1, rscratch2, n, m);
10666 __ adds(sum_lo, sum_lo, rscratch1);
10667 __ adc(sum_hi, sum_hi, rscratch2);
10668 }
10669
10670 // Poly1305, RFC 7539
10671
10672 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10673 // description of the tricks used to simplify and accelerate this
10674 // computation.
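  // Reference formulation (sketch): for each 16-byte block m the accumulator
  // is updated as
  //   acc = ((acc + m + 2^128) * r) mod (2^130 - 5)
  // where r is the clamped key. The loop below keeps acc as the 192-bit value
  // U_2:U_1:U_0 and only partially reduces it on each iteration; the final
  // reduction and the repacking into 26-bit limbs happen after the loop.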
10675
10676 address generate_poly1305_processBlocks() {
10677 __ align(CodeEntryAlignment);
10678 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10679 StubCodeMark mark(this, stub_id);
10680 address start = __ pc();
10681 Label here;
10682 __ enter();
10683 RegSet callee_saved = RegSet::range(r19, r28);
10684 __ push(callee_saved, sp);
10685
10686 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10687
10688 // Arguments
10689 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10690
10691 // R_n is the 128-bit randomly-generated key, packed into two
10692 // registers. The caller passes this key to us as long[5], with
10693 // BITS_PER_LIMB = 26.
10694 const Register R_0 = *++regs, R_1 = *++regs;
10695 pack_26(R_0, R_1, r_start);
10696
10697 // RR_n is (R_n >> 2) * 5
10698 const Register RR_0 = *++regs, RR_1 = *++regs;
10699 __ lsr(RR_0, R_0, 2);
10700 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10701 __ lsr(RR_1, R_1, 2);
10702 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10703
10704 // U_n is the current checksum
10705 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10706 pack_26(U_0, U_1, U_2, acc_start);
10707
10708 static constexpr int BLOCK_LENGTH = 16;
10709 Label DONE, LOOP;
10710
10711 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10712 __ br(Assembler::LT, DONE); {
10713 __ bind(LOOP);
10714
10715 // S_n is to be the sum of U_n and the next block of data
10716 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10717 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10718 __ adds(S_0, U_0, S_0);
10719 __ adcs(S_1, U_1, S_1);
10720 __ adc(S_2, U_2, zr);
10721 __ add(S_2, S_2, 1);
10722
10723 const Register U_0HI = *++regs, U_1HI = *++regs;
10724
10725 // NB: this logic depends on some of the special properties of
10726 // Poly1305 keys. In particular, because we know that the top
10727 // four bits of R_0 and R_1 are zero, we can add together
10728 // partial products without any risk of needing to propagate a
10729 // carry out.
10730 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10731 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
10732 __ andr(U_2, R_0, 3);
10733 __ mul(U_2, S_2, U_2);
10734
10735 // Recycle registers S_0, S_1, S_2
10736 regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10737
10738 // Partial reduction mod 2**130 - 5
10739 __ adds(U_1, U_0HI, U_1);
10740 __ adc(U_2, U_1HI, U_2);
10741 // Sum now in U_2:U_1:U_0.
10742 // Dead: U_0HI, U_1HI.
10743 regs = (regs.remaining() + U_0HI + U_1HI).begin();
10744
10745 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10746
10747 // First, U_2:U_1:U_0 += (U_2 >> 2)
10748 __ lsr(rscratch1, U_2, 2);
10749 __ andr(U_2, U_2, (u8)3);
10750 __ adds(U_0, U_0, rscratch1);
10751 __ adcs(U_1, U_1, zr);
10752 __ adc(U_2, U_2, zr);
10753 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10754 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10755 __ adcs(U_1, U_1, zr);
10756 __ adc(U_2, U_2, zr);
10757
10758 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10759 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10760 __ br(~ Assembler::LT, LOOP);
10761 }
10762
10763 // Further reduce modulo 2^130 - 5
10764 __ lsr(rscratch1, U_2, 2);
10765 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10766 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10767 __ adcs(U_1, U_1, zr);
10768 __ andr(U_2, U_2, (u1)3);
10769 __ adc(U_2, U_2, zr);
10770
10771 // Unpack the sum into five 26-bit limbs and write to memory.
10772 __ ubfiz(rscratch1, U_0, 0, 26);
10773 __ ubfx(rscratch2, U_0, 26, 26);
10774 __ stp(rscratch1, rscratch2, Address(acc_start));
10775 __ ubfx(rscratch1, U_0, 52, 12);
10776 __ bfi(rscratch1, U_1, 12, 14);
10777 __ ubfx(rscratch2, U_1, 14, 26);
10778 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10779 __ ubfx(rscratch1, U_1, 40, 24);
10780 __ bfi(rscratch1, U_2, 24, 3);
10781 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10782
10783 __ bind(DONE);
10784 __ pop(callee_saved, sp);
10785 __ leave();
10786 __ ret(lr);
10787
10788 return start;
10789 }
10790
10791 // exception handler for upcall stubs
10792 address generate_upcall_stub_exception_handler() {
10793 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10794 StubCodeMark mark(this, stub_id);
10795 address start = __ pc();
10796
10797 // Native caller has no idea how to handle exceptions,
10798 // so we just crash here. Up to callee to catch exceptions.
10799 __ verify_oop(r0);
10800 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10801 __ blr(rscratch1);
10802 __ should_not_reach_here();
10803
10804 return start;
10805 }
10806
10807 // load Method* target of MethodHandle
10808 // j_rarg0 = jobject receiver
10809 // rmethod = result
10810 address generate_upcall_stub_load_target() {
10811 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10812 StubCodeMark mark(this, stub_id);
10813 address start = __ pc();
10814
10815 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10816 // Load target method from receiver
10817 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10818 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10819 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10820 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10821 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10822 noreg, noreg);
10823 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10824
10825 __ ret(lr);
10826
10827 return start;
10828 }
10829
10830 #undef __
10831 #define __ masm->
10832
10833 class MontgomeryMultiplyGenerator : public MacroAssembler {
10834
10835 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10836 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10837
10838 RegSet _toSave;
10839 bool _squaring;
10840
10841 public:
10842 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10843 : MacroAssembler(as->code()), _squaring(squaring) {
10844
10845 // Register allocation
10846
10847 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10848 Pa_base = *regs; // Argument registers
10849 if (squaring)
10850 Pb_base = Pa_base;
10851 else
10852 Pb_base = *++regs;
10853 Pn_base = *++regs;
10854 Rlen = *++regs;
10855 inv = *++regs;
10856 Pm_base = *++regs;
10857
10858 // Working registers:
10859 Ra = *++regs; // The current digit of a, b, n, and m.
10860 Rb = *++regs;
10861 Rm = *++regs;
10862 Rn = *++regs;
10863
10864 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
10865 Pb = *++regs;
10866 Pm = *++regs;
10867 Pn = *++regs;
10868
10869 t0 = *++regs; // Three registers which form a
10870 t1 = *++regs; // triple-precision accumulator.
10871 t2 = *++regs;
10872
10873 Ri = *++regs; // Inner and outer loop indexes.
10874 Rj = *++regs;
10875
10876 Rhi_ab = *++regs; // Product registers: low and high parts
10877 Rlo_ab = *++regs; // of a*b and m*n.
10878 Rhi_mn = *++regs;
10879 Rlo_mn = *++regs;
10880
10881 // r19 and up are callee-saved.
10882 _toSave = RegSet::range(r19, *regs) + Pm_base;
10883 }
10884
10885 private:
10886 void save_regs() {
10887 push(_toSave, sp);
10888 }
10889
10890 void restore_regs() {
10891 pop(_toSave, sp);
10892 }
10893
10894 template <typename T>
10895 void unroll_2(Register count, T block) {
10896 Label loop, end, odd;
10897 tbnz(count, 0, odd);
10898 cbz(count, end);
10899 align(16);
10900 bind(loop);
10901 (this->*block)();
10902 bind(odd);
10903 (this->*block)();
10904 subs(count, count, 2);
10905 br(Assembler::GT, loop);
10906 bind(end);
10907 }
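// unroll_2 emits the block twice per iteration and handles an odd count by
// branching into the middle of the unrolled pair. In C, approximately
// (a sketch, with count treated as signed):
//   if (count & 1) goto odd;
//   if (count == 0) goto end;
//   loop: block();
//   odd:  block();
//   if ((count -= 2) > 0) goto loop;
//   end:  ;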
10908
10909 template <typename T>
10910 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10911 Label loop, end, odd;
10912 tbnz(count, 0, odd);
10913 cbz(count, end);
10914 align(16);
10915 bind(loop);
10916 (this->*block)(d, s, tmp);
10917 bind(odd);
10918 (this->*block)(d, s, tmp);
10919 subs(count, count, 2);
10920 br(Assembler::GT, loop);
10921 bind(end);
10922 }
10923
10924 void pre1(RegisterOrConstant i) {
10925 block_comment("pre1");
10926 // Pa = Pa_base;
10927 // Pb = Pb_base + i;
10928 // Pm = Pm_base;
10929 // Pn = Pn_base + i;
10930 // Ra = *Pa;
10931 // Rb = *Pb;
10932 // Rm = *Pm;
10933 // Rn = *Pn;
10934 ldr(Ra, Address(Pa_base));
10935 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10936 ldr(Rm, Address(Pm_base));
10937 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10938 lea(Pa, Address(Pa_base));
10939 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
10940 lea(Pm, Address(Pm_base));
10941 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
10942
10943 // Zero the m*n result.
10944 mov(Rhi_mn, zr);
10945 mov(Rlo_mn, zr);
10946 }
10947
10948 // The core multiply-accumulate step of a Montgomery
10949 // multiplication. The idea is to schedule operations as a
10950 // pipeline so that instructions with long latencies (loads and
10951 // multiplies) have time to complete before their results are
10952 // used. This helps in-order implementations of the
10953 // architecture the most, but out-of-order ones also benefit.
10954 void step() {
10955 block_comment("step");
10956 // MACC(Ra, Rb, t0, t1, t2);
10957 // Ra = *++Pa;
10958 // Rb = *--Pb;
10959 umulh(Rhi_ab, Ra, Rb);
10960 mul(Rlo_ab, Ra, Rb);
10961 ldr(Ra, pre(Pa, wordSize));
10962 ldr(Rb, pre(Pb, -wordSize));
10963 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
10964 // previous iteration.
10965 // MACC(Rm, Rn, t0, t1, t2);
10966 // Rm = *++Pm;
10967 // Rn = *--Pn;
10968 umulh(Rhi_mn, Rm, Rn);
10969 mul(Rlo_mn, Rm, Rn);
10970 ldr(Rm, pre(Pm, wordSize));
10971 ldr(Rn, pre(Pn, -wordSize));
10972 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10973 }
10974
10975 void post1() {
10976 block_comment("post1");
10977
10978 // MACC(Ra, Rb, t0, t1, t2);
10979 // Ra = *++Pa;
10980 // Rb = *--Pb;
10981 umulh(Rhi_ab, Ra, Rb);
10982 mul(Rlo_ab, Ra, Rb);
10983 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
10984 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
10985
10986 // *Pm = Rm = t0 * inv;
10987 mul(Rm, t0, inv);
10988 str(Rm, Address(Pm));
10989
10990 // MACC(Rm, Rn, t0, t1, t2);
10991 // t0 = t1; t1 = t2; t2 = 0;
10992 umulh(Rhi_mn, Rm, Rn);
10993
10994 #ifndef PRODUCT
10995 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
10996 {
10997 mul(Rlo_mn, Rm, Rn);
10998 add(Rlo_mn, t0, Rlo_mn);
10999 Label ok;
11000 cbz(Rlo_mn, ok); {
11001 stop("broken Montgomery multiply");
11002 } bind(ok);
11003 }
11004 #endif
11005 // We have very carefully set things up so that
11006 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11007 // the lower half of Rm * Rn because we know the result already:
11008 // it must be -t0. t0 + (-t0) must generate a carry iff
11009 // t0 != 0. So, rather than do a mul and an adds we just set
11010 // the carry flag iff t0 is nonzero.
11011 //
11012 // mul(Rlo_mn, Rm, Rn);
11013 // adds(zr, t0, Rlo_mn);
11014 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11015 adcs(t0, t1, Rhi_mn);
11016 adc(t1, t2, zr);
11017 mov(t2, zr);
11018 }
11019
11020 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11021 block_comment("pre2");
11022 // Pa = Pa_base + i-len;
11023 // Pb = Pb_base + len;
11024 // Pm = Pm_base + i-len;
11025 // Pn = Pn_base + len;
11026
11027 if (i.is_register()) {
11028 sub(Rj, i.as_register(), len);
11029 } else {
11030 mov(Rj, i.as_constant());
11031 sub(Rj, Rj, len);
11032 }
11033 // Rj == i-len
11034
11035 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11036 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11037 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11038 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11039
11040 // Ra = *++Pa;
11041 // Rb = *--Pb;
11042 // Rm = *++Pm;
11043 // Rn = *--Pn;
11044 ldr(Ra, pre(Pa, wordSize));
11045 ldr(Rb, pre(Pb, -wordSize));
11046 ldr(Rm, pre(Pm, wordSize));
11047 ldr(Rn, pre(Pn, -wordSize));
11048
11049 mov(Rhi_mn, zr);
11050 mov(Rlo_mn, zr);
11051 }
11052
11053 void post2(RegisterOrConstant i, RegisterOrConstant len) {
11054 block_comment("post2");
11055 if (i.is_constant()) {
11056 mov(Rj, i.as_constant()-len.as_constant());
11057 } else {
11058 sub(Rj, i.as_register(), len);
11059 }
11060
11061 adds(t0, t0, Rlo_mn); // The pending m*n, low part
11062
11063 // As soon as we know the least significant digit of our result,
11064 // store it.
11065 // Pm_base[i-len] = t0;
11066 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11067
11068 // t0 = t1; t1 = t2; t2 = 0;
11069 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11070 adc(t1, t2, zr);
11071 mov(t2, zr);
11072 }
11073
11074 // A carry in t0 after Montgomery multiplication means that we
11075 // should subtract multiples of n from our result in m. We'll
11076 // keep doing that until there is no carry.
11077 void normalize(RegisterOrConstant len) {
11078 block_comment("normalize");
11079 // while (t0)
11080 // t0 = sub(Pm_base, Pn_base, t0, len);
11081 Label loop, post, again;
11082 Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11083 cbz(t0, post); {
11084 bind(again); {
11085 mov(i, zr);
11086 mov(cnt, len);
11087 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11088 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11089 subs(zr, zr, zr); // set carry flag, i.e. no borrow
11090 align(16);
11091 bind(loop); {
11092 sbcs(Rm, Rm, Rn);
11093 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11094 add(i, i, 1);
11095 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11096 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11097 sub(cnt, cnt, 1);
11098 } cbnz(cnt, loop);
11099 sbc(t0, t0, zr);
11100 } cbnz(t0, again);
11101 } bind(post);
11102 }
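// A sketch of the sub() step in the pseudocode above, assuming a compiler
// with unsigned __int128 (not part of the original source):
//   static julong sub(julong Pm[], julong Pn[], julong t0, int len) {
//     julong borrow = 0;
//     for (int i = 0; i < len; i++) {
//       unsigned __int128 d = (unsigned __int128)Pm[i] - Pn[i] - borrow;
//       Pm[i]  = (julong)d;
//       borrow = (julong)(d >> 127);   // 1 if the word subtraction borrowed
//     }
//     return t0 - borrow;              // matches the final sbc(t0, t0, zr)
//   }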
11103
11104 // Move memory at s to d, reversing words.
11105 // Increments d to end of copied memory
11106 // Destroys tmp1, tmp2
11107 // Preserves len
11108 // Leaves s pointing to the address which was in d at start
11109 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11110 assert(tmp1->encoding() < r19->encoding(), "register corruption");
11111 assert(tmp2->encoding() < r19->encoding(), "register corruption");
11112
11113 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11114 mov(tmp1, len);
11115 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11116 sub(s, d, len, ext::uxtw, LogBytesPerWord);
11117 }
11118 // where
11119 void reverse1(Register d, Register s, Register tmp) {
11120 ldr(tmp, pre(s, -wordSize));
11121 ror(tmp, tmp, 32);
11122 str(tmp, post(d, wordSize));
11123 }
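// In C, reverse() together with reverse1() is approximately (a sketch,
// with d0/s0 the incoming values of d and s, and len in 64-bit words):
//   for (int i = 0; i < len; i++) {
//     julong w = s0[len - 1 - i];
//     d0[i] = (w << 32) | (w >> 32);   // swap the 32-bit halves
//   }
//   d = d0 + len;  s = d0;
// which reverses the caller's array of 32-bit ints while repacking it
// into 64-bit words.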
11124
11125 void step_squaring() {
11126 // An extra ACC
11127 step();
11128 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11129 }
11130
11131 void last_squaring(RegisterOrConstant i) {
11132 Label dont;
11133 // if ((i & 1) == 0) {
11134 tbnz(i.as_register(), 0, dont); {
11135 // MACC(Ra, Rb, t0, t1, t2);
11136 // Ra = *++Pa;
11137 // Rb = *--Pb;
11138 umulh(Rhi_ab, Ra, Rb);
11139 mul(Rlo_ab, Ra, Rb);
11140 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11141 } bind(dont);
11142 }
11143
11144 void extra_step_squaring() {
11145 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11146
11147 // MACC(Rm, Rn, t0, t1, t2);
11148 // Rm = *++Pm;
11149 // Rn = *--Pn;
11150 umulh(Rhi_mn, Rm, Rn);
11151 mul(Rlo_mn, Rm, Rn);
11152 ldr(Rm, pre(Pm, wordSize));
11153 ldr(Rn, pre(Pn, -wordSize));
11154 }
11155
11156 void post1_squaring() {
11157 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11158
11159 // *Pm = Rm = t0 * inv;
11160 mul(Rm, t0, inv);
11161 str(Rm, Address(Pm));
11162
11163 // MACC(Rm, Rn, t0, t1, t2);
11164 // t0 = t1; t1 = t2; t2 = 0;
11165 umulh(Rhi_mn, Rm, Rn);
11166
11167 #ifndef PRODUCT
11168 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11169 {
11170 mul(Rlo_mn, Rm, Rn);
11171 add(Rlo_mn, t0, Rlo_mn);
11172 Label ok;
11173 cbz(Rlo_mn, ok); {
11174 stop("broken Montgomery multiply");
11175 } bind(ok);
11176 }
11177 #endif
11178 // We have very carefully set things up so that
11179 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11180 // the lower half of Rm * Rn because we know the result already:
11181 // it must be -t0. t0 + (-t0) must generate a carry iff
11182 // t0 != 0. So, rather than do a mul and an adds we just set
11183 // the carry flag iff t0 is nonzero.
11184 //
11185 // mul(Rlo_mn, Rm, Rn);
11186 // adds(zr, t0, Rlo_mn);
11187 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11188 adcs(t0, t1, Rhi_mn);
11189 adc(t1, t2, zr);
11190 mov(t2, zr);
11191 }
11192
11193 void acc(Register Rhi, Register Rlo,
11194 Register t0, Register t1, Register t2) {
11195 adds(t0, t0, Rlo);
11196 adcs(t1, t1, Rhi);
11197 adc(t2, t2, zr);
11198 }
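// For reference, the MACC/MACC2 steps used in the C approximations below are
// approximately (a sketch assuming unsigned __int128; not part of the source):
//   static void MACC(julong a, julong b, julong &t0, julong &t1, julong &t2) {
//     unsigned __int128 prod = (unsigned __int128)a * b;
//     unsigned __int128 sum  = (unsigned __int128)t0 + (julong)prod;
//     t0  = (julong)sum;
//     sum = (unsigned __int128)t1 + (julong)(prod >> 64) + (julong)(sum >> 64);
//     t1  = (julong)sum;
//     t2 += (julong)(sum >> 64);
//   }
// MACC2(a, b, t0, t1, t2) accumulates 2*a*b the same way (used when squaring).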
11199
11200 public:
11201 /**
11202 * Fast Montgomery multiplication. The derivation of the
11203 * algorithm is in A Cryptographic Library for the Motorola
11204 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11205 *
11206 * Arguments:
11207 *
11208 * Inputs for multiplication:
11209 * c_rarg0 - int array elements a
11210 * c_rarg1 - int array elements b
11211 * c_rarg2 - int array elements n (the modulus)
11212 * c_rarg3 - int length
11213 * c_rarg4 - int inv
11214 * c_rarg5 - int array elements m (the result)
11215 *
11216 * Inputs for squaring:
11217 * c_rarg0 - int array elements a
11218 * c_rarg1 - int array elements n (the modulus)
11219 * c_rarg2 - int length
11220 * c_rarg3 - int inv
11221 * c_rarg4 - int array elements m (the result)
11222 *
11223 */
11224 address generate_multiply() {
11225 Label argh, nothing;
11226 bind(argh);
11227 stop("MontgomeryMultiply total_allocation must be <= 8192");
11228
11229 align(CodeEntryAlignment);
11230 address entry = pc();
11231
11232 cbzw(Rlen, nothing);
11233
11234 enter();
11235
11236 // Make room.
11237 cmpw(Rlen, 512);
11238 br(Assembler::HI, argh);
11239 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11240 andr(sp, Ra, -2 * wordSize);
11241
11242 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11243
11244 {
11245 // Copy input args, reversing as we go. We use Ra as a
11246 // temporary variable.
11247 reverse(Ra, Pa_base, Rlen, t0, t1);
11248 if (!_squaring)
11249 reverse(Ra, Pb_base, Rlen, t0, t1);
11250 reverse(Ra, Pn_base, Rlen, t0, t1);
11251 }
11252
11253 // Push all callee-saved registers and also Pm_base which we'll need
11254 // at the end.
11255 save_regs();
11256
11257 #ifndef PRODUCT
11258 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11259 {
11260 ldr(Rn, Address(Pn_base, 0));
11261 mul(Rlo_mn, Rn, inv);
11262 subs(zr, Rlo_mn, -1);
11263 Label ok;
11264 br(EQ, ok); {
11265 stop("broken inverse in Montgomery multiply");
11266 } bind(ok);
11267 }
11268 #endif
11269
11270 mov(Pm_base, Ra);
11271
11272 mov(t0, zr);
11273 mov(t1, zr);
11274 mov(t2, zr);
11275
11276 block_comment("for (int i = 0; i < len; i++) {");
11277 mov(Ri, zr); {
11278 Label loop, end;
11279 cmpw(Ri, Rlen);
11280 br(Assembler::GE, end);
11281
11282 bind(loop);
11283 pre1(Ri);
11284
11285 block_comment(" for (j = i; j; j--) {"); {
11286 movw(Rj, Ri);
11287 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11288 } block_comment(" } // j");
11289
11290 post1();
11291 addw(Ri, Ri, 1);
11292 cmpw(Ri, Rlen);
11293 br(Assembler::LT, loop);
11294 bind(end);
11295 block_comment("} // i");
11296 }
11297
11298 block_comment("for (int i = len; i < 2*len; i++) {");
11299 mov(Ri, Rlen); {
11300 Label loop, end;
11301 cmpw(Ri, Rlen, Assembler::LSL, 1);
11302 br(Assembler::GE, end);
11303
11304 bind(loop);
11305 pre2(Ri, Rlen);
11306
11307 block_comment(" for (j = len*2-i-1; j; j--) {"); {
11308 lslw(Rj, Rlen, 1);
11309 subw(Rj, Rj, Ri);
11310 subw(Rj, Rj, 1);
11311 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11312 } block_comment(" } // j");
11313
11314 post2(Ri, Rlen);
11315 addw(Ri, Ri, 1);
11316 cmpw(Ri, Rlen, Assembler::LSL, 1);
11317 br(Assembler::LT, loop);
11318 bind(end);
11319 }
11320 block_comment("} // i");
11321
11322 normalize(Rlen);
11323
11324 mov(Ra, Pm_base); // Save Pm_base in Ra
11325 restore_regs(); // Restore caller's Pm_base
11326
11327 // Copy our result into caller's Pm_base
11328 reverse(Pm_base, Ra, Rlen, t0, t1);
11329
11330 leave();
11331 bind(nothing);
11332 ret(lr);
11333
11334 return entry;
11335 }
11336 // In C, approximately:
11337
11338 // void
11339 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11340 // julong Pn_base[], julong Pm_base[],
11341 // julong inv, int len) {
11342 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11343 // julong *Pa, *Pb, *Pn, *Pm;
11344 // julong Ra, Rb, Rn, Rm;
11345
11346 // int i;
11347
11348 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11349
11350 // for (i = 0; i < len; i++) {
11351 // int j;
11352
11353 // Pa = Pa_base;
11354 // Pb = Pb_base + i;
11355 // Pm = Pm_base;
11356 // Pn = Pn_base + i;
11357
11358 // Ra = *Pa;
11359 // Rb = *Pb;
11360 // Rm = *Pm;
11361 // Rn = *Pn;
11362
11363 // int iters = i;
11364 // for (j = 0; iters--; j++) {
11365 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11366 // MACC(Ra, Rb, t0, t1, t2);
11367 // Ra = *++Pa;
11368 // Rb = *--Pb;
11369 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11370 // MACC(Rm, Rn, t0, t1, t2);
11371 // Rm = *++Pm;
11372 // Rn = *--Pn;
11373 // }
11374
11375 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11376 // MACC(Ra, Rb, t0, t1, t2);
11377 // *Pm = Rm = t0 * inv;
11378 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11379 // MACC(Rm, Rn, t0, t1, t2);
11380
11381 // assert(t0 == 0, "broken Montgomery multiply");
11382
11383 // t0 = t1; t1 = t2; t2 = 0;
11384 // }
11385
11386 // for (i = len; i < 2*len; i++) {
11387 // int j;
11388
11389 // Pa = Pa_base + i-len;
11390 // Pb = Pb_base + len;
11391 // Pm = Pm_base + i-len;
11392 // Pn = Pn_base + len;
11393
11394 // Ra = *++Pa;
11395 // Rb = *--Pb;
11396 // Rm = *++Pm;
11397 // Rn = *--Pn;
11398
11399 // int iters = len*2-i-1;
11400 // for (j = i-len+1; iters--; j++) {
11401 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11402 // MACC(Ra, Rb, t0, t1, t2);
11403 // Ra = *++Pa;
11404 // Rb = *--Pb;
11405 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11406 // MACC(Rm, Rn, t0, t1, t2);
11407 // Rm = *++Pm;
11408 // Rn = *--Pn;
11409 // }
11410
11411 // Pm_base[i-len] = t0;
11412 // t0 = t1; t1 = t2; t2 = 0;
11413 // }
11414
11415 // while (t0)
11416 // t0 = sub(Pm_base, Pn_base, t0, len);
11417 // }
11418
11419 /**
11420 * Fast Montgomery squaring. This uses asymptotically 25% fewer
11421 * multiplies than Montgomery multiplication so it should be up to
11422 * 25% faster. However, its loop control is more complex and it
11423 * may actually run slower on some machines.
11424 *
11425 * Arguments:
11426 *
11427 * Inputs:
11428 * c_rarg0 - int array elements a
11429 * c_rarg1 - int array elements n (the modulus)
11430 * c_rarg2 - int length
11431 * c_rarg3 - int inv
11432 * c_rarg4 - int array elements m (the result)
11433 *
11434 */
11435 address generate_square() {
11436 Label argh;
11437 bind(argh);
11438 stop("MontgomeryMultiply total_allocation must be <= 8192");
11439
11440 align(CodeEntryAlignment);
11441 address entry = pc();
11442
11443 enter();
11444
11445 // Make room.
11446 cmpw(Rlen, 512);
11447 br(Assembler::HI, argh);
11448 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11449 andr(sp, Ra, -2 * wordSize);
11450
11451 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11452
11453 {
11454 // Copy input args, reversing as we go. We use Ra as a
11455 // temporary variable.
11456 reverse(Ra, Pa_base, Rlen, t0, t1);
11457 reverse(Ra, Pn_base, Rlen, t0, t1);
11458 }
11459
11460 // Push all callee-saved registers and also Pm_base which we'll need
11461 // at the end.
11462 save_regs();
11463
11464 mov(Pm_base, Ra);
11465
11466 mov(t0, zr);
11467 mov(t1, zr);
11468 mov(t2, zr);
11469
11470 block_comment("for (int i = 0; i < len; i++) {");
11471 mov(Ri, zr); {
11472 Label loop, end;
11473 bind(loop);
11474 cmp(Ri, Rlen);
11475 br(Assembler::GE, end);
11476
11477 pre1(Ri);
11478
11479 block_comment("for (j = (i+1)/2; j; j--) {"); {
11480 add(Rj, Ri, 1);
11481 lsr(Rj, Rj, 1);
11482 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11483 } block_comment(" } // j");
11484
11485 last_squaring(Ri);
11486
11487 block_comment(" for (j = i/2; j; j--) {"); {
11488 lsr(Rj, Ri, 1);
11489 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11490 } block_comment(" } // j");
11491
11492 post1_squaring();
11493 add(Ri, Ri, 1);
11494 cmp(Ri, Rlen);
11495 br(Assembler::LT, loop);
11496
11497 bind(end);
11498 block_comment("} // i");
11499 }
11500
11501 block_comment("for (int i = len; i < 2*len; i++) {");
11502 mov(Ri, Rlen); {
11503 Label loop, end;
11504 bind(loop);
11505 cmp(Ri, Rlen, Assembler::LSL, 1);
11506 br(Assembler::GE, end);
11507
11508 pre2(Ri, Rlen);
11509
11510 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
11511 lsl(Rj, Rlen, 1);
11512 sub(Rj, Rj, Ri);
11513 sub(Rj, Rj, 1);
11514 lsr(Rj, Rj, 1);
11515 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11516 } block_comment(" } // j");
11517
11518 last_squaring(Ri);
11519
11520 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
11521 lsl(Rj, Rlen, 1);
11522 sub(Rj, Rj, Ri);
11523 lsr(Rj, Rj, 1);
11524 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11525 } block_comment(" } // j");
11526
11527 post2(Ri, Rlen);
11528 add(Ri, Ri, 1);
11529 cmp(Ri, Rlen, Assembler::LSL, 1);
11530
11531 br(Assembler::LT, loop);
11532 bind(end);
11533 block_comment("} // i");
11534 }
11535
11536 normalize(Rlen);
11537
11538 mov(Ra, Pm_base); // Save Pm_base in Ra
11539 restore_regs(); // Restore caller's Pm_base
11540
11541 // Copy our result into caller's Pm_base
11542 reverse(Pm_base, Ra, Rlen, t0, t1);
11543
11544 leave();
11545 ret(lr);
11546
11547 return entry;
11548 }
11549 // In C, approximately:
11550
11551 // void
11552 // montgomery_square(julong Pa_base[], julong Pn_base[],
11553 // julong Pm_base[], julong inv, int len) {
11554 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11555 // julong *Pa, *Pb, *Pn, *Pm;
11556 // julong Ra, Rb, Rn, Rm;
11557
11558 // int i;
11559
11560 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11561
11562 // for (i = 0; i < len; i++) {
11563 // int j;
11564
11565 // Pa = Pa_base;
11566 // Pb = Pa_base + i;
11567 // Pm = Pm_base;
11568 // Pn = Pn_base + i;
11569
11570 // Ra = *Pa;
11571 // Rb = *Pb;
11572 // Rm = *Pm;
11573 // Rn = *Pn;
11574
11575 // int iters = (i+1)/2;
11576 // for (j = 0; iters--; j++) {
11577 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11578 // MACC2(Ra, Rb, t0, t1, t2);
11579 // Ra = *++Pa;
11580 // Rb = *--Pb;
11581 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11582 // MACC(Rm, Rn, t0, t1, t2);
11583 // Rm = *++Pm;
11584 // Rn = *--Pn;
11585 // }
11586 // if ((i & 1) == 0) {
11587 // assert(Ra == Pa_base[j], "must be");
11588 // MACC(Ra, Ra, t0, t1, t2);
11589 // }
11590 // iters = i/2;
11591 // assert(iters == i-j, "must be");
11592 // for (; iters--; j++) {
11593 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11594 // MACC(Rm, Rn, t0, t1, t2);
11595 // Rm = *++Pm;
11596 // Rn = *--Pn;
11597 // }
11598
11599 // *Pm = Rm = t0 * inv;
11600 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11601 // MACC(Rm, Rn, t0, t1, t2);
11602
11603 // assert(t0 == 0, "broken Montgomery multiply");
11604
11605 // t0 = t1; t1 = t2; t2 = 0;
11606 // }
11607
11608 // for (i = len; i < 2*len; i++) {
11609 // int start = i-len+1;
11610 // int end = start + (len - start)/2;
11611 // int j;
11612
11613 // Pa = Pa_base + i-len;
11614 // Pb = Pa_base + len;
11615 // Pm = Pm_base + i-len;
11616 // Pn = Pn_base + len;
11617
11618 // Ra = *++Pa;
11619 // Rb = *--Pb;
11620 // Rm = *++Pm;
11621 // Rn = *--Pn;
11622
11623 // int iters = (2*len-i-1)/2;
11624 // assert(iters == end-start, "must be");
11625 // for (j = start; iters--; j++) {
11626 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11627 // MACC2(Ra, Rb, t0, t1, t2);
11628 // Ra = *++Pa;
11629 // Rb = *--Pb;
11630 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11631 // MACC(Rm, Rn, t0, t1, t2);
11632 // Rm = *++Pm;
11633 // Rn = *--Pn;
11634 // }
11635 // if ((i & 1) == 0) {
11636 // assert(Ra == Pa_base[j], "must be");
11637 // MACC(Ra, Ra, t0, t1, t2);
11638 // }
11639 // iters = (2*len-i)/2;
11640 // assert(iters == len-j, "must be");
11641 // for (; iters--; j++) {
11642 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11643 // MACC(Rm, Rn, t0, t1, t2);
11644 // Rm = *++Pm;
11645 // Rn = *--Pn;
11646 // }
11647 // Pm_base[i-len] = t0;
11648 // t0 = t1; t1 = t2; t2 = 0;
11649 // }
11650
11651 // while (t0)
11652 // t0 = sub(Pm_base, Pn_base, t0, len);
11653 // }
11654 };
11655
11656 // Initialization
11657 void generate_preuniverse_stubs() {
11658 // preuniverse stubs are not needed for aarch64
11659 }
11660
11661 void generate_initial_stubs() {
11662 // Generate initial stubs and initialize the entry points.
11663
11664 // Entry points that exist on all platforms. Note: this is code
11665 // that could be shared among different platforms - however, the
11666 // benefit seems to be smaller than the disadvantage of having a
11667 // much more complicated generator structure. See also the comment in
11668 // stubRoutines.hpp.
11669
11670 StubRoutines::_forward_exception_entry = generate_forward_exception();
11671
11672 StubRoutines::_call_stub_entry =
11673 generate_call_stub(StubRoutines::_call_stub_return_address);
11674
11675 // is referenced by megamorphic call
11676 StubRoutines::_catch_exception_entry = generate_catch_exception();
11677
11678 // Initialize table for copy memory (arraycopy) check.
11679 if (UnsafeMemoryAccess::_table == nullptr) {
11680 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11681 }
11682
11683 if (UseCRC32Intrinsics) {
11684 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11685 }
11686
11687 if (UseCRC32CIntrinsics) {
11688 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11689 }
11690
11691 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11692 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11693 }
11694
11695 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11696 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11697 }
11698
11699 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11700 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11701 StubRoutines::_hf2f = generate_float16ToFloat();
11702 StubRoutines::_f2hf = generate_floatToFloat16();
11703 }
11704 }
11705
11706 void generate_continuation_stubs() {
11707 // Continuation stubs:
11708 StubRoutines::_cont_thaw = generate_cont_thaw();
11709 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11710 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11711 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11712 }
11713
11714 void generate_final_stubs() {
11715 // support for verify_oop (must happen after universe_init)
11716 if (VerifyOops) {
11717 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
11718 }
11719
11720 // arraycopy stubs used by compilers
11721 generate_arraycopy_stubs();
11722
11723 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11724
11725 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11726
11727 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11728 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11729
11730 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11731
11732 generate_atomic_entry_points();
11733
11734 #endif // LINUX
11735
11736 #ifdef COMPILER2
11737 if (UseSecondarySupersTable) {
11738 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11739 if (!InlineSecondarySupersTest) {
11740 generate_lookup_secondary_supers_table_stub();
11741 }
11742 }
11743 #endif
11744
11745 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11746
11747 StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11748 }
11749
11750 void generate_compiler_stubs() {
11751 #if COMPILER2_OR_JVMCI
11752
11753 if (UseSVE == 0) {
11754 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11755 }
11756
11757 // array equals stub for large arrays.
11758 if (!UseSimpleArrayEquals) {
11759 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11760 }
11761
11762 // arrays_hashcode stubs for large arrays.
11763 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11764 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11765 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11766 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11767 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11768
11769 // byte_array_inflate stub for large arrays.
11770 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11771
11772 // countPositives stub for large arrays.
11773 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11774
11775 generate_compare_long_strings();
11776
11777 generate_string_indexof_stubs();
11778
11779 #ifdef COMPILER2
11780 if (UseMultiplyToLenIntrinsic) {
11781 StubRoutines::_multiplyToLen = generate_multiplyToLen();
11782 }
11783
11784 if (UseSquareToLenIntrinsic) {
11785 StubRoutines::_squareToLen = generate_squareToLen();
11786 }
11787
11788 if (UseMulAddIntrinsic) {
11789 StubRoutines::_mulAdd = generate_mulAdd();
11790 }
11791
11792 if (UseSIMDForBigIntegerShiftIntrinsics) {
11793 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11794 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
11795 }
11796
11797 if (UseMontgomeryMultiplyIntrinsic) {
11798 StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11799 StubCodeMark mark(this, stub_id);
11800 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11801 StubRoutines::_montgomeryMultiply = g.generate_multiply();
11802 }
11803
11804 if (UseMontgomerySquareIntrinsic) {
11805 StubId stub_id = StubId::stubgen_montgomerySquare_id;
11806 StubCodeMark mark(this, stub_id);
11807 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11808 // We use generate_multiply() rather than generate_square()
11809 // because it's faster for the sizes of modulus we care about.
11810 StubRoutines::_montgomerySquare = g.generate_multiply();
11811 }
11812
11813 #endif // COMPILER2
11814
11815 if (UseChaCha20Intrinsics) {
11816 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11817 }
11818
11819 if (UseKyberIntrinsics) {
11820 StubRoutines::_kyberNtt = generate_kyberNtt();
11821 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11822 StubRoutines::_kyberNttMult = generate_kyberNttMult();
11823 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11824 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11825 StubRoutines::_kyber12To16 = generate_kyber12To16();
11826 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11827 }
11828
11829 if (UseDilithiumIntrinsics) {
11830 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11831 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11832 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11833 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11834 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11835 }
11836
11837 if (UseBASE64Intrinsics) {
11838 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11839 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11840 }
11841
11842 // data cache line writeback
11843 StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11844 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11845
11846 if (UseAESIntrinsics) {
11847 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11848 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11849 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11850 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11851 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11852 }
11853 if (UseGHASHIntrinsics) {
11854 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11855 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11856 }
11857 if (UseAESIntrinsics && UseGHASHIntrinsics) {
11858 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11859 }
11860
11861 if (UseMD5Intrinsics) {
11862 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
11863 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
11864 }
11865 if (UseSHA1Intrinsics) {
11866 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
11867 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
11868 }
11869 if (UseSHA256Intrinsics) {
11870 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
11871 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
11872 }
11873 if (UseSHA512Intrinsics) {
11874 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
11875 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
11876 }
11877 if (UseSHA3Intrinsics) {
11878
11879 StubRoutines::_double_keccak = generate_double_keccak();
11880 if (UseSIMDForSHA3Intrinsic) {
11881 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
11882 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
11883 } else {
11884 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
11885 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
11886 }
11887 }
11888
11889 if (UsePoly1305Intrinsics) {
11890 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11891 }
11892
11893 // generate Adler32 intrinsics code
11894 if (UseAdler32Intrinsics) {
11895 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11896 }
11897
11898 #endif // COMPILER2_OR_JVMCI
11899 }
11900
11901 public:
11902 StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
11903 switch(blob_id) {
11904 case BlobId::stubgen_preuniverse_id:
11905 generate_preuniverse_stubs();
11906 break;
11907 case BlobId::stubgen_initial_id:
11908 generate_initial_stubs();
11909 break;
11910 case BlobId::stubgen_continuation_id:
11911 generate_continuation_stubs();
11912 break;
11913 case BlobId::stubgen_compiler_id:
11914 generate_compiler_stubs();
11915 break;
11916 case BlobId::stubgen_final_id:
11917 generate_final_stubs();
11918 break;
11919 default:
11920 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
11921 break;
11922 };
11923 }
11924 }; // end class declaration
11925
11926 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
11927 StubGenerator g(code, blob_id);
11928 }
11929
11930
11931 #if defined (LINUX)
11932
11933 // Define pointers to atomic stubs and initialize them to point to the
11934 // code in atomic_aarch64.S.
11935
11936 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \
11937 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
11938 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
11939 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
11940 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
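// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands, approximately, to:
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;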
11941
11942 DEFAULT_ATOMIC_OP(fetch_add, 4, )
11943 DEFAULT_ATOMIC_OP(fetch_add, 8, )
11944 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
11945 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
11946 DEFAULT_ATOMIC_OP(xchg, 4, )
11947 DEFAULT_ATOMIC_OP(xchg, 8, )
11948 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
11949 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
11950 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
11951 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
11952 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
11953 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
11954 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
11955 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
11956 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
11957 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
11958
11959 #undef DEFAULT_ATOMIC_OP
11960
11961 #endif // LINUX