1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "code/aotCodeCache.hpp"
31 #include "compiler/oopMap.hpp"
32 #include "gc/shared/barrierSet.hpp"
33 #include "gc/shared/barrierSetAssembler.hpp"
34 #include "gc/shared/gc_globals.hpp"
35 #include "gc/shared/tlab_globals.hpp"
36 #include "interpreter/interpreter.hpp"
37 #include "memory/universe.hpp"
38 #include "nativeInst_aarch64.hpp"
39 #include "oops/instanceOop.hpp"
40 #include "oops/method.hpp"
41 #include "oops/objArrayKlass.hpp"
42 #include "oops/oop.inline.hpp"
43 #include "prims/methodHandles.hpp"
44 #include "prims/upcallLinker.hpp"
45 #include "runtime/arguments.hpp"
46 #include "runtime/atomicAccess.hpp"
47 #include "runtime/continuation.hpp"
48 #include "runtime/continuationEntry.inline.hpp"
49 #include "runtime/frame.inline.hpp"
50 #include "runtime/handles.inline.hpp"
51 #include "runtime/javaThread.hpp"
52 #include "runtime/sharedRuntime.hpp"
53 #include "runtime/stubCodeGenerator.hpp"
54 #include "runtime/stubRoutines.hpp"
55 #include "utilities/align.hpp"
56 #include "utilities/checkedCast.hpp"
57 #include "utilities/debug.hpp"
58 #include "utilities/globalDefinitions.hpp"
59 #include "utilities/intpow.hpp"
60 #include "utilities/powerOfTwo.hpp"
61 #ifdef COMPILER2
62 #include "opto/runtime.hpp"
63 #endif
64 #if INCLUDE_ZGC
65 #include "gc/z/zThreadLocalData.hpp"
66 #endif
67
68 // Declaration and definition of StubGenerator (no .hpp file).
69 // For a more detailed description of the stub routine structure
70 // see the comment in stubRoutines.hpp
71
72 #undef __
73 #define __ _masm->
74
75 #ifdef PRODUCT
76 #define BLOCK_COMMENT(str) /* nothing */
77 #else
78 #define BLOCK_COMMENT(str) __ block_comment(str)
79 #endif
80
81 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
82
83 // Stub Code definitions
84
85 class StubGenerator: public StubCodeGenerator {
86 private:
87
88 #ifdef PRODUCT
89 #define inc_counter_np(counter) ((void)0)
90 #else
91 void inc_counter_np_(uint& counter) {
92 __ incrementw(ExternalAddress((address)&counter));
93 }
94 #define inc_counter_np(counter) \
95 BLOCK_COMMENT("inc_counter " #counter); \
96 inc_counter_np_(counter);
97 #endif
98
99 // Call stubs are used to call Java from C
100 //
101 // Arguments:
102 // c_rarg0: call wrapper address address
103 // c_rarg1: result address
104 // c_rarg2: result type BasicType
105 // c_rarg3: method Method*
106 // c_rarg4: (interpreter) entry point address
107 // c_rarg5: parameters intptr_t*
108 // c_rarg6: parameter size (in words) int
109 // c_rarg7: thread Thread*
110 //
111 // There is no return from the stub itself as any Java result
112 // is written to result
113 //
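  // For reference, C++ callers reach this stub through the CallStub
  // function pointer type declared in stubRoutines.hpp, which mirrors
  // the argument list above -- roughly (a sketch, parameter names
  // approximate):
  //
  //   typedef void (*CallStub)(address call_wrapper, intptr_t* result,
  //                            int result_type, Method* method,
  //                            address entry_point, intptr_t* parameters,
  //                            int size_of_parameters, Thread* thread);
  //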
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, installing the
  // current sp (r31) into fp.
117 //
118 // we save r0-r7, which accounts for all the c arguments.
119 //
120 // TODO: strictly do we need to save them all? they are treated as
121 // volatile by C so could we omit saving the ones we are going to
122 // place in global registers (thread? method?) or those we only use
123 // during setup of the Java call?
124 //
125 // we don't need to save r8 which C uses as an indirect result location
126 // return register.
127 //
128 // we don't need to save r9-r15 which both C and Java treat as
129 // volatile
130 //
131 // we don't need to save r16-18 because Java does not use them
132 //
133 // we save r19-r28 which Java uses as scratch registers and C
134 // expects to be callee-save
135 //
136 // we save the bottom 64 bits of each value stored in v8-v15; it is
137 // the responsibility of the caller to preserve larger values.
138 //
139 // so the stub frame looks like this when we enter Java code
140 //
141 // [ return_from_Java ] <--- sp
142 // [ argument word n ]
143 // ...
144 // -29 [ argument word 1 ]
  //      -28 [ saved Floating-point Control Register ] <--- sp_after_call
  //      -26 [ saved v15            ]
147 // -25 [ saved v14 ]
148 // -24 [ saved v13 ]
149 // -23 [ saved v12 ]
150 // -22 [ saved v11 ]
151 // -21 [ saved v10 ]
152 // -20 [ saved v9 ]
153 // -19 [ saved v8 ]
154 // -18 [ saved r28 ]
155 // -17 [ saved r27 ]
156 // -16 [ saved r26 ]
157 // -15 [ saved r25 ]
158 // -14 [ saved r24 ]
159 // -13 [ saved r23 ]
160 // -12 [ saved r22 ]
161 // -11 [ saved r21 ]
162 // -10 [ saved r20 ]
163 // -9 [ saved r19 ]
164 // -8 [ call wrapper (r0) ]
165 // -7 [ result (r1) ]
166 // -6 [ result type (r2) ]
167 // -5 [ method (r3) ]
168 // -4 [ entry point (r4) ]
169 // -3 [ parameters (r5) ]
170 // -2 [ parameter size (r6) ]
171 // -1 [ thread (r7) ]
172 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
173 // 1 [ saved lr (r30) ]
174
175 // Call stub stack layout word offsets from fp
176 enum call_stub_layout {
177 sp_after_call_off = -28,
178
179 fpcr_off = sp_after_call_off,
180 d15_off = -26,
181 d13_off = -24,
182 d11_off = -22,
183 d9_off = -20,
184
185 r28_off = -18,
186 r26_off = -16,
187 r24_off = -14,
188 r22_off = -12,
189 r20_off = -10,
190 call_wrapper_off = -8,
191 result_off = -7,
192 result_type_off = -6,
193 method_off = -5,
194 entry_point_off = -4,
195 parameter_size_off = -2,
196 thread_off = -1,
197 fp_f = 0,
198 retaddr_off = 1,
199 };
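  // n.b. the odd offsets missing from this enum (e.g. v14 at -25, r19
  // at -9, parameters at -3) are the second slots of the stp/ldp pairs
  // used below; see the layout diagram above for the full picture.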
200
201 address generate_call_stub(address& return_address) {
202 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
203 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
204 "adjust this code");
205
206 StubId stub_id = StubId::stubgen_call_stub_id;
207 StubCodeMark mark(this, stub_id);
208 address start = __ pc();
209
210 const Address sp_after_call (rfp, sp_after_call_off * wordSize);
211
212 const Address fpcr_save (rfp, fpcr_off * wordSize);
213 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
214 const Address result (rfp, result_off * wordSize);
215 const Address result_type (rfp, result_type_off * wordSize);
216 const Address method (rfp, method_off * wordSize);
217 const Address entry_point (rfp, entry_point_off * wordSize);
218 const Address parameter_size(rfp, parameter_size_off * wordSize);
219
220 const Address thread (rfp, thread_off * wordSize);
221
222 const Address d15_save (rfp, d15_off * wordSize);
223 const Address d13_save (rfp, d13_off * wordSize);
224 const Address d11_save (rfp, d11_off * wordSize);
225 const Address d9_save (rfp, d9_off * wordSize);
226
227 const Address r28_save (rfp, r28_off * wordSize);
228 const Address r26_save (rfp, r26_off * wordSize);
229 const Address r24_save (rfp, r24_off * wordSize);
230 const Address r22_save (rfp, r22_off * wordSize);
231 const Address r20_save (rfp, r20_off * wordSize);
232
233 // stub code
234
235 address aarch64_entry = __ pc();
236
237 // set up frame and move sp to end of save area
238 __ enter();
239 __ sub(sp, rfp, -sp_after_call_off * wordSize);
240
241 // save register parameters and Java scratch/global registers
242 // n.b. we save thread even though it gets installed in
243 // rthread because we want to sanity check rthread later
244 __ str(c_rarg7, thread);
245 __ strw(c_rarg6, parameter_size);
246 __ stp(c_rarg4, c_rarg5, entry_point);
247 __ stp(c_rarg2, c_rarg3, result_type);
248 __ stp(c_rarg0, c_rarg1, call_wrapper);
249
250 __ stp(r20, r19, r20_save);
251 __ stp(r22, r21, r22_save);
252 __ stp(r24, r23, r24_save);
253 __ stp(r26, r25, r26_save);
254 __ stp(r28, r27, r28_save);
255
256 __ stpd(v9, v8, d9_save);
257 __ stpd(v11, v10, d11_save);
258 __ stpd(v13, v12, d13_save);
259 __ stpd(v15, v14, d15_save);
260
261 __ get_fpcr(rscratch1);
262 __ str(rscratch1, fpcr_save);
263 // Set FPCR to the state we need. We do want Round to Nearest. We
264 // don't want non-IEEE rounding modes or floating-point traps.
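    // As a reminder, the FPCR fields touched here are:
    //   bits [23:22] RMode - rounding mode (0b00 = round to nearest)
    //   bit  [24]    FZ    - flush-to-zero
    //   bit  [25]    DN    - default NaN
    //   bits [12:8]  IXE, UFE, OFE, DZE, IOE - trap enable bits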
265 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
266 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
267 __ set_fpcr(rscratch1);
268
269 // install Java thread in global register now we have saved
270 // whatever value it held
271 __ mov(rthread, c_rarg7);
272 // And method
273 __ mov(rmethod, c_rarg3);
274
275 // set up the heapbase register
276 __ reinit_heapbase();
277
278 #ifdef ASSERT
279 // make sure we have no pending exceptions
280 {
281 Label L;
282 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
283 __ cmp(rscratch1, (u1)NULL_WORD);
284 __ br(Assembler::EQ, L);
285 __ stop("StubRoutines::call_stub: entered with pending exception");
286 __ BIND(L);
287 }
288 #endif
289 // pass parameters if any
290 __ mov(esp, sp);
291 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
292 __ andr(sp, rscratch1, -2 * wordSize);
293
294 BLOCK_COMMENT("pass parameters if any");
295 Label parameters_done;
296 // parameter count is still in c_rarg6
297 // and parameter pointer identifying param 1 is in c_rarg5
298 __ cbzw(c_rarg6, parameters_done);
299
300 address loop = __ pc();
301 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
302 __ subsw(c_rarg6, c_rarg6, 1);
303 __ push(rscratch1);
304 __ br(Assembler::GT, loop);
305
306 __ BIND(parameters_done);
307
    // call Java entry -- passing Method* and current sp
309 // rmethod: Method*
310 // r19_sender_sp: sender sp
311 BLOCK_COMMENT("call Java function");
312 __ mov(r19_sender_sp, sp);
313 __ blr(c_rarg4);
314
315 // we do this here because the notify will already have been done
316 // if we get to the next instruction via an exception
317 //
318 // n.b. adding this instruction here affects the calculation of
319 // whether or not a routine returns to the call stub (used when
320 // doing stack walks) since the normal test is to check the return
321 // pc against the address saved below. so we may need to allow for
322 // this extra instruction in the check.
323
324 // save current address for use by exception handling code
325
326 return_address = __ pc();
327
328 // store result depending on type (everything that is not
329 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
330 // n.b. this assumes Java returns an integral result in r0
331 // and a floating result in j_farg0
332 __ ldr(j_rarg2, result);
333 Label is_long, is_float, is_double, exit;
334 __ ldr(j_rarg1, result_type);
335 __ cmp(j_rarg1, (u1)T_OBJECT);
336 __ br(Assembler::EQ, is_long);
337 __ cmp(j_rarg1, (u1)T_LONG);
338 __ br(Assembler::EQ, is_long);
339 __ cmp(j_rarg1, (u1)T_FLOAT);
340 __ br(Assembler::EQ, is_float);
341 __ cmp(j_rarg1, (u1)T_DOUBLE);
342 __ br(Assembler::EQ, is_double);
343
344 // handle T_INT case
345 __ strw(r0, Address(j_rarg2));
346
347 __ BIND(exit);
348
349 // pop parameters
350 __ sub(esp, rfp, -sp_after_call_off * wordSize);
351
352 #ifdef ASSERT
353 // verify that threads correspond
354 {
355 Label L, S;
356 __ ldr(rscratch1, thread);
357 __ cmp(rthread, rscratch1);
358 __ br(Assembler::NE, S);
359 __ get_thread(rscratch1);
360 __ cmp(rthread, rscratch1);
361 __ br(Assembler::EQ, L);
362 __ BIND(S);
363 __ stop("StubRoutines::call_stub: threads must correspond");
364 __ BIND(L);
365 }
366 #endif
367
368 __ pop_cont_fastpath(rthread);
369
370 // restore callee-save registers
371 __ ldpd(v15, v14, d15_save);
372 __ ldpd(v13, v12, d13_save);
373 __ ldpd(v11, v10, d11_save);
374 __ ldpd(v9, v8, d9_save);
375
376 __ ldp(r28, r27, r28_save);
377 __ ldp(r26, r25, r26_save);
378 __ ldp(r24, r23, r24_save);
379 __ ldp(r22, r21, r22_save);
380 __ ldp(r20, r19, r20_save);
381
382 // restore fpcr
383 __ ldr(rscratch1, fpcr_save);
384 __ set_fpcr(rscratch1);
385
386 __ ldp(c_rarg0, c_rarg1, call_wrapper);
387 __ ldrw(c_rarg2, result_type);
388 __ ldr(c_rarg3, method);
389 __ ldp(c_rarg4, c_rarg5, entry_point);
390 __ ldp(c_rarg6, c_rarg7, parameter_size);
391
392 // leave frame and return to caller
393 __ leave();
394 __ ret(lr);
395
396 // handle return types different from T_INT
397
398 __ BIND(is_long);
399 __ str(r0, Address(j_rarg2, 0));
400 __ br(Assembler::AL, exit);
401
402 __ BIND(is_float);
403 __ strs(j_farg0, Address(j_rarg2, 0));
404 __ br(Assembler::AL, exit);
405
406 __ BIND(is_double);
407 __ strd(j_farg0, Address(j_rarg2, 0));
408 __ br(Assembler::AL, exit);
409
410 return start;
411 }
412
413 // Return point for a Java call if there's an exception thrown in
414 // Java code. The exception is caught and transformed into a
415 // pending exception stored in JavaThread that can be tested from
416 // within the VM.
417 //
418 // Note: Usually the parameters are removed by the callee. In case
419 // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up
  // the sp.
422 //
423 // r0: exception oop
424
425 address generate_catch_exception() {
426 StubId stub_id = StubId::stubgen_catch_exception_id;
427 StubCodeMark mark(this, stub_id);
428 address start = __ pc();
429
430 // same as in generate_call_stub():
431 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
432 const Address thread (rfp, thread_off * wordSize);
433
434 #ifdef ASSERT
435 // verify that threads correspond
436 {
437 Label L, S;
438 __ ldr(rscratch1, thread);
439 __ cmp(rthread, rscratch1);
440 __ br(Assembler::NE, S);
441 __ get_thread(rscratch1);
442 __ cmp(rthread, rscratch1);
443 __ br(Assembler::EQ, L);
444 __ bind(S);
445 __ stop("StubRoutines::catch_exception: threads must correspond");
446 __ bind(L);
447 }
448 #endif
449
450 // set pending exception
451 __ verify_oop(r0);
452
453 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
454 __ mov(rscratch1, (address)__FILE__);
455 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
456 __ movw(rscratch1, (int)__LINE__);
457 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
458
459 // complete return to VM
460 assert(StubRoutines::_call_stub_return_address != nullptr,
461 "_call_stub_return_address must have been generated before");
462 __ b(StubRoutines::_call_stub_return_address);
463
464 return start;
465 }
466
467 // Continuation point for runtime calls returning with a pending
468 // exception. The pending exception check happened in the runtime
469 // or native call stub. The pending exception in Thread is
470 // converted into a Java-level exception.
471 //
472 // Contract with Java-level exception handlers:
473 // r0: exception
474 // r3: throwing pc
475 //
476 // NOTE: At entry of this stub, exception-pc must be in LR !!
477
478 // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
480
481 address generate_forward_exception() {
482 StubId stub_id = StubId::stubgen_forward_exception_id;
483 StubCodeMark mark(this, stub_id);
484 address start = __ pc();
485
486 // Upon entry, LR points to the return address returning into
487 // Java (interpreted or compiled) code; i.e., the return address
488 // becomes the throwing pc.
489 //
490 // Arguments pushed before the runtime call are still on the stack
491 // but the exception handler will reset the stack pointer ->
492 // ignore them. A potential result in registers can be ignored as
493 // well.
494
495 #ifdef ASSERT
496 // make sure this code is only executed if there is a pending exception
497 {
498 Label L;
499 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
500 __ cbnz(rscratch1, L);
501 __ stop("StubRoutines::forward exception: no pending exception (1)");
502 __ bind(L);
503 }
504 #endif
505
506 // compute exception handler into r19
507
508 // call the VM to find the handler address associated with the
509 // caller address. pass thread in r0 and caller pc (ret address)
510 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
511 // the stack.
512 __ mov(c_rarg1, lr);
513 // lr will be trashed by the VM call so we move it to R19
514 // (callee-saved) because we also need to pass it to the handler
515 // returned by this call.
516 __ mov(r19, lr);
517 BLOCK_COMMENT("call exception_handler_for_return_address");
518 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
519 SharedRuntime::exception_handler_for_return_address),
520 rthread, c_rarg1);
521 // Reinitialize the ptrue predicate register, in case the external runtime
522 // call clobbers ptrue reg, as we may return to SVE compiled code.
523 __ reinitialize_ptrue();
524
525 // we should not really care that lr is no longer the callee
526 // address. we saved the value the handler needs in r19 so we can
527 // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
529 // the PC for the frame above the handler belongs to a compiled
530 // Java method. So, we restore lr here to satisfy that assert.
531 __ mov(lr, r19);
532 // setup r0 & r3 & clear pending exception
533 __ mov(r3, r19);
534 __ mov(r19, r0);
535 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
536 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
537
538 #ifdef ASSERT
539 // make sure exception is set
540 {
541 Label L;
542 __ cbnz(r0, L);
543 __ stop("StubRoutines::forward exception: no pending exception (2)");
544 __ bind(L);
545 }
546 #endif
547
548 // continue at exception handler
549 // r0: exception
550 // r3: throwing pc
551 // r19: exception handler
552 __ verify_oop(r0);
553 __ br(r19);
554
555 return start;
556 }
557
558 // Non-destructive plausibility checks for oops
559 //
560 // Arguments:
561 // r0: oop to verify
562 // rscratch1: error message
563 //
564 // Stack after saving c_rarg3:
565 // [tos + 0]: saved c_rarg3
566 // [tos + 1]: saved c_rarg2
567 // [tos + 2]: saved lr
568 // [tos + 3]: saved rscratch2
569 // [tos + 4]: saved r0
570 // [tos + 5]: saved rscratch1
571 address generate_verify_oop() {
572 StubId stub_id = StubId::stubgen_verify_oop_id;
573 StubCodeMark mark(this, stub_id);
574 address start = __ pc();
575
576 Label exit, error;
577
578 // save c_rarg2 and c_rarg3
579 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
580
581 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
582 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
583 __ ldr(c_rarg3, Address(c_rarg2));
584 __ add(c_rarg3, c_rarg3, 1);
585 __ str(c_rarg3, Address(c_rarg2));
586
587 // object is in r0
588 // make sure object is 'reasonable'
589 __ cbz(r0, exit); // if obj is null it is OK
590
591 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
592 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
593
594 // return if everything seems ok
595 __ bind(exit);
596
597 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
598 __ ret(lr);
599
600 // handle errors
601 __ bind(error);
602 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
603
604 __ push(RegSet::range(r0, r29), sp);
605 // debug(char* msg, int64_t pc, int64_t regs[])
606 __ mov(c_rarg0, rscratch1); // pass address of error message
607 __ mov(c_rarg1, lr); // pass return address
608 __ mov(c_rarg2, sp); // pass address of regs on stack
609 #ifndef PRODUCT
610 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
611 #endif
612 BLOCK_COMMENT("call MacroAssembler::debug");
613 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
614 __ blr(rscratch1);
615 __ hlt(0);
616
617 return start;
618 }
619
620 // Generate indices for iota vector.
621 address generate_iota_indices(StubId stub_id) {
622 __ align(CodeEntryAlignment);
623 StubCodeMark mark(this, stub_id);
624 address start = __ pc();
625 // B
626 __ emit_data64(0x0706050403020100, relocInfo::none);
627 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
628 // H
629 __ emit_data64(0x0003000200010000, relocInfo::none);
630 __ emit_data64(0x0007000600050004, relocInfo::none);
631 // S
632 __ emit_data64(0x0000000100000000, relocInfo::none);
633 __ emit_data64(0x0000000300000002, relocInfo::none);
634 // D
635 __ emit_data64(0x0000000000000000, relocInfo::none);
636 __ emit_data64(0x0000000000000001, relocInfo::none);
637 // S - FP
638 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
639 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
640 // D - FP
641 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
642 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
643 return start;
644 }
645
646 // The inner part of zero_words(). This is the bulk operation,
647 // zeroing words in blocks, possibly using DC ZVA to do it. The
648 // caller is responsible for zeroing the last few words.
649 //
650 // Inputs:
651 // r10: the HeapWord-aligned base address of an array to zero.
652 // r11: the count in HeapWords, r11 > 0.
653 //
654 // Returns r10 and r11, adjusted for the caller to clear.
655 // r10: the base address of the tail of words left to clear.
656 // r11: the number of words in the tail.
657 // r11 < MacroAssembler::zero_words_block_size.
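  //
  // A sketch of the division of labour with the caller
  // (MacroAssembler::zero_words):
  //
  //   (r10, r11) = zero_blocks(r10, r11)  // bulk zeroing, DC ZVA if enabled
  //   // caller then stores zr into the remaining r11 tail words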
658
659 address generate_zero_blocks() {
660 Label done;
661 Label base_aligned;
662
663 Register base = r10, cnt = r11;
664
665 __ align(CodeEntryAlignment);
666 StubId stub_id = StubId::stubgen_zero_blocks_id;
667 StubCodeMark mark(this, stub_id);
668 address start = __ pc();
669
670 if (UseBlockZeroing) {
671 int zva_length = VM_Version::zva_length();
672
673 // Ensure ZVA length can be divided by 16. This is required by
674 // the subsequent operations.
675 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
676
677 __ tbz(base, 3, base_aligned);
678 __ str(zr, Address(__ post(base, 8)));
679 __ sub(cnt, cnt, 1);
680 __ bind(base_aligned);
681
682 // Ensure count >= zva_length * 2 so that it still deserves a zva after
683 // alignment.
684 Label small;
685 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
686 __ subs(rscratch1, cnt, low_limit >> 3);
687 __ br(Assembler::LT, small);
688 __ zero_dcache_blocks(base, cnt);
689 __ bind(small);
690 }
691
692 {
693 // Number of stp instructions we'll unroll
694 const int unroll =
695 MacroAssembler::zero_words_block_size / 2;
696 // Clear the remaining blocks.
697 Label loop;
698 __ subs(cnt, cnt, unroll * 2);
699 __ br(Assembler::LT, done);
700 __ bind(loop);
701 for (int i = 0; i < unroll; i++)
702 __ stp(zr, zr, __ post(base, 16));
703 __ subs(cnt, cnt, unroll * 2);
704 __ br(Assembler::GE, loop);
705 __ bind(done);
706 __ add(cnt, cnt, unroll * 2);
707 }
708
709 __ ret(lr);
710
711 return start;
712 }
713
714
715 typedef enum {
716 copy_forwards = 1,
717 copy_backwards = -1
718 } copy_direction;
719
720 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
721 // for arraycopy stubs.
722 class ArrayCopyBarrierSetHelper : StackObj {
723 BarrierSetAssembler* _bs_asm;
724 MacroAssembler* _masm;
725 DecoratorSet _decorators;
726 BasicType _type;
727 Register _gct1;
728 Register _gct2;
729 Register _gct3;
730 FloatRegister _gcvt1;
731 FloatRegister _gcvt2;
732 FloatRegister _gcvt3;
733
734 public:
735 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
736 DecoratorSet decorators,
737 BasicType type,
738 Register gct1,
739 Register gct2,
740 Register gct3,
741 FloatRegister gcvt1,
742 FloatRegister gcvt2,
743 FloatRegister gcvt3)
744 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
745 _masm(masm),
746 _decorators(decorators),
747 _type(type),
748 _gct1(gct1),
749 _gct2(gct2),
750 _gct3(gct3),
751 _gcvt1(gcvt1),
752 _gcvt2(gcvt2),
753 _gcvt3(gcvt3) {
754 }
755
756 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
757 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
758 dst1, dst2, src,
759 _gct1, _gct2, _gcvt1);
760 }
761
762 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
763 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
764 dst, src1, src2,
765 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
766 }
767
768 void copy_load_at_16(Register dst1, Register dst2, Address src) {
769 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
770 dst1, dst2, src,
771 _gct1);
772 }
773
774 void copy_store_at_16(Address dst, Register src1, Register src2) {
775 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
776 dst, src1, src2,
777 _gct1, _gct2, _gct3);
778 }
779
780 void copy_load_at_8(Register dst, Address src) {
781 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
782 dst, noreg, src,
783 _gct1);
784 }
785
786 void copy_store_at_8(Address dst, Register src) {
787 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
788 dst, src, noreg,
789 _gct1, _gct2, _gct3);
790 }
791 };
792
793 // Bulk copy of blocks of 8 words.
794 //
795 // count is a count of words.
796 //
797 // Precondition: count >= 8
798 //
799 // Postconditions:
800 //
801 // The least significant bit of count contains the remaining count
802 // of words to copy. The rest of count is trash.
803 //
804 // s and d are adjusted to point to the remaining words to copy
805 //
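  // For example (a sketch of the contract): called with count == 13 the
  // stub copies 8 + 4 words and leaves bit 0 of count set, so the
  // caller copies the one remaining word.
  //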
806 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
807 BasicType type;
808 copy_direction direction;
809
810 switch (stub_id) {
811 case StubId::stubgen_copy_byte_f_id:
812 direction = copy_forwards;
813 type = T_BYTE;
814 break;
815 case StubId::stubgen_copy_byte_b_id:
816 direction = copy_backwards;
817 type = T_BYTE;
818 break;
819 case StubId::stubgen_copy_oop_f_id:
820 direction = copy_forwards;
821 type = T_OBJECT;
822 break;
823 case StubId::stubgen_copy_oop_b_id:
824 direction = copy_backwards;
825 type = T_OBJECT;
826 break;
827 case StubId::stubgen_copy_oop_uninit_f_id:
828 direction = copy_forwards;
829 type = T_OBJECT;
830 break;
831 case StubId::stubgen_copy_oop_uninit_b_id:
832 direction = copy_backwards;
833 type = T_OBJECT;
834 break;
835 default:
836 ShouldNotReachHere();
837 }
838
839 int unit = wordSize * direction;
840 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
841
842 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
843 t4 = r7, t5 = r11, t6 = r12, t7 = r13;
844 const Register stride = r14;
845 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
846 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
847 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
848
849 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
850 assert_different_registers(s, d, count, rscratch1, rscratch2);
851
852 Label again, drain;
853
854 __ align(CodeEntryAlignment);
855
856 StubCodeMark mark(this, stub_id);
857
858 address start = __ pc();
859
860 Label unaligned_copy_long;
861 if (AvoidUnalignedAccesses) {
862 __ tbnz(d, 3, unaligned_copy_long);
863 }
864
865 if (direction == copy_forwards) {
866 __ sub(s, s, bias);
867 __ sub(d, d, bias);
868 }
869
870 #ifdef ASSERT
871 // Make sure we are never given < 8 words
872 {
873 Label L;
874 __ cmp(count, (u1)8);
875 __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
877 __ bind(L);
878 }
879 #endif
880
881 // Fill 8 registers
882 if (UseSIMDForMemoryOps) {
883 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
884 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
885 } else {
886 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
887 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
888 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
889 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
890 }
891
892 __ subs(count, count, 16);
893 __ br(Assembler::LO, drain);
894
895 int prefetch = PrefetchCopyIntervalInBytes;
896 bool use_stride = false;
897 if (direction == copy_backwards) {
898 use_stride = prefetch > 256;
899 prefetch = -prefetch;
900 if (use_stride) __ mov(stride, prefetch);
901 }
902
903 __ bind(again);
904
905 if (PrefetchCopyIntervalInBytes > 0)
906 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
907
908 if (UseSIMDForMemoryOps) {
909 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
910 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
911 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
912 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
913 } else {
914 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
915 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
916 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
917 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
918 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
919 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
920 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
921 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
922 }
923
924 __ subs(count, count, 8);
925 __ br(Assembler::HS, again);
926
927 // Drain
928 __ bind(drain);
929 if (UseSIMDForMemoryOps) {
930 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
931 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
932 } else {
933 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
934 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
935 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
936 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
937 }
938
939 {
940 Label L1, L2;
941 __ tbz(count, exact_log2(4), L1);
942 if (UseSIMDForMemoryOps) {
943 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
944 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
945 } else {
946 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
947 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
948 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
949 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
950 }
951 __ bind(L1);
952
953 if (direction == copy_forwards) {
954 __ add(s, s, bias);
955 __ add(d, d, bias);
956 }
957
958 __ tbz(count, 1, L2);
959 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
960 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
961 __ bind(L2);
962 }
963
964 __ ret(lr);
965
966 if (AvoidUnalignedAccesses) {
967 Label drain, again;
968 // Register order for storing. Order is different for backward copy.
969
970 __ bind(unaligned_copy_long);
971
972 // source address is even aligned, target odd aligned
973 //
974 // when forward copying word pairs we read long pairs at offsets
975 // {0, 2, 4, 6} (in long words). when backwards copying we read
976 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
977 // address by -2 in the forwards case so we can compute the
978 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
979 // or -1.
980 //
981 // when forward copying we need to store 1 word, 3 pairs and
982 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
    // zero offset we adjust the destination by -1 which means we
984 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
985 //
    // When backwards copying we need to store 1 word, 3 pairs and
987 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
988 // offsets {1, 3, 5, 7, 8} * unit.
989
990 if (direction == copy_forwards) {
991 __ sub(s, s, 16);
992 __ sub(d, d, 8);
993 }
994
995 // Fill 8 registers
996 //
997 // for forwards copy s was offset by -16 from the original input
998 // value of s so the register contents are at these offsets
    // relative to the 64 byte block addressed by that original input
1000 // and so on for each successive 64 byte block when s is updated
1001 //
1002 // t0 at offset 0, t1 at offset 8
1003 // t2 at offset 16, t3 at offset 24
1004 // t4 at offset 32, t5 at offset 40
1005 // t6 at offset 48, t7 at offset 56
1006
1007 // for backwards copy s was not offset so the register contents
1008 // are at these offsets into the preceding 64 byte block
1009 // relative to that original input and so on for each successive
1010 // preceding 64 byte block when s is updated. this explains the
1011 // slightly counter-intuitive looking pattern of register usage
1012 // in the stp instructions for backwards copy.
1013 //
1014 // t0 at offset -16, t1 at offset -8
1015 // t2 at offset -32, t3 at offset -24
1016 // t4 at offset -48, t5 at offset -40
1017 // t6 at offset -64, t7 at offset -56
1018
1019 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1020 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1021 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1022 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1023
1024 __ subs(count, count, 16);
1025 __ br(Assembler::LO, drain);
1026
1027 int prefetch = PrefetchCopyIntervalInBytes;
1028 bool use_stride = false;
1029 if (direction == copy_backwards) {
1030 use_stride = prefetch > 256;
1031 prefetch = -prefetch;
1032 if (use_stride) __ mov(stride, prefetch);
1033 }
1034
1035 __ bind(again);
1036
1037 if (PrefetchCopyIntervalInBytes > 0)
1038 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1039
1040 if (direction == copy_forwards) {
1041 // allowing for the offset of -8 the store instructions place
      // registers into the target 64 byte block at the following
1043 // offsets
1044 //
1045 // t0 at offset 0
1046 // t1 at offset 8, t2 at offset 16
1047 // t3 at offset 24, t4 at offset 32
1048 // t5 at offset 40, t6 at offset 48
1049 // t7 at offset 56
1050
1051 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1052 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1053 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1054 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1055 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1056 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1057 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1058 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1059 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1060 } else {
1061 // d was not offset when we started so the registers are
      // written into the 64 byte block preceding d with the following
1063 // offsets
1064 //
1065 // t1 at offset -8
1066 // t3 at offset -24, t0 at offset -16
      // t5 at offset -40, t2 at offset -32
1068 // t7 at offset -56, t4 at offset -48
1069 // t6 at offset -64
1070 //
1071 // note that this matches the offsets previously noted for the
1072 // loads
1073
1074 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1075 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1076 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1077 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1078 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1079 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1080 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1081 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1082 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1083 }
1084
1085 __ subs(count, count, 8);
1086 __ br(Assembler::HS, again);
1087
1088 // Drain
1089 //
1090 // this uses the same pattern of offsets and register arguments
1091 // as above
1092 __ bind(drain);
1093 if (direction == copy_forwards) {
1094 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1095 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1096 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1097 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1098 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1099 } else {
1100 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1101 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1102 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1103 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1104 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1105 }
1106 // now we need to copy any remaining part block which may
    // include a 4 word subblock and/or a 2 word subblock.
1108 // bits 2 and 1 in the count are the tell-tale for whether we
1109 // have each such subblock
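    // (e.g. with 6 words still to copy, bits 2 and 1 are both set, so
    // we copy a 4 word subblock followed by a 2 word subblock)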
1110 {
1111 Label L1, L2;
1112 __ tbz(count, exact_log2(4), L1);
1113 // this is the same as above but copying only 4 longs hence
1114 // with only one intervening stp between the str instructions
1115 // but note that the offsets and registers still follow the
1116 // same pattern
1117 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1118 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1119 if (direction == copy_forwards) {
1120 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1121 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1122 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1123 } else {
1124 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1125 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1126 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1127 }
1128 __ bind(L1);
1129
1130 __ tbz(count, 1, L2);
1131 // this is the same as above but copying only 2 longs hence
1132 // there is no intervening stp between the str instructions
1133 // but note that the offset and register patterns are still
1134 // the same
1135 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1136 if (direction == copy_forwards) {
1137 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1138 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1139 } else {
1140 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1141 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1142 }
1143 __ bind(L2);
1144
1145 // for forwards copy we need to re-adjust the offsets we
      // applied so that s and d follow the last words written
1147
1148 if (direction == copy_forwards) {
1149 __ add(s, s, 16);
1150 __ add(d, d, 8);
1151 }
1152
1153 }
1154
1155 __ ret(lr);
1156 }
1157
1158 return start;
1159 }
1160
1161 // Small copy: less than 16 bytes.
1162 //
1163 // NB: Ignores all of the bits of count which represent more than 15
1164 // bytes, so a caller doesn't have to mask them.
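  //
  // For a byte copy (step == 1 or -1), for instance, only bits 3..0 of
  // count are examined: bit 3 selects an 8 byte move, bit 2 a 4 byte
  // move, bit 1 a 2 byte move and bit 0 a single byte move.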
1165
1166 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1167 bool is_backwards = step < 0;
1168 size_t granularity = g_uabs(step);
1169 int direction = is_backwards ? -1 : 1;
1170
1171 Label Lword, Lint, Lshort, Lbyte;
1172
1173 assert(granularity
1174 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1175
1176 const Register t0 = r3;
1177 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1178 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1179
1180 // ??? I don't know if this bit-test-and-branch is the right thing
1181 // to do. It does a lot of jumping, resulting in several
1182 // mispredicted branches. It might make more sense to do this
1183 // with something like Duff's device with a single computed branch.
1184
1185 __ tbz(count, 3 - exact_log2(granularity), Lword);
1186 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1187 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1188 __ bind(Lword);
1189
1190 if (granularity <= sizeof (jint)) {
1191 __ tbz(count, 2 - exact_log2(granularity), Lint);
1192 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1193 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1194 __ bind(Lint);
1195 }
1196
1197 if (granularity <= sizeof (jshort)) {
1198 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1199 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1200 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1201 __ bind(Lshort);
1202 }
1203
1204 if (granularity <= sizeof (jbyte)) {
1205 __ tbz(count, 0, Lbyte);
1206 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1207 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1208 __ bind(Lbyte);
1209 }
1210 }
1211
1212 // All-singing all-dancing memory copy.
1213 //
1214 // Copy count units of memory from s to d. The size of a unit is
1215 // step, which can be positive or negative depending on the direction
1216 // of copy. If is_aligned is false, we align the source address.
1217 //
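  // For example, step == sizeof(jshort) requests a forward copy of
  // count jshorts, while step == -sizeof(jshort) requests the same
  // copy performed backwards (used for overlapping conjoint copies).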
1218
1219 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1220 Register s, Register d, Register count, int step) {
1221 copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1222 bool is_backwards = step < 0;
1223 unsigned int granularity = g_uabs(step);
1224 const Register t0 = r3, t1 = r4;
1225
    // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't
    // matter because we always load all the data before writing anything.
1228 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1229 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1230 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1231 const Register send = r17, dend = r16;
1232 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1233 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1234 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1235
1236 if (PrefetchCopyIntervalInBytes > 0)
1237 __ prfm(Address(s, 0), PLDL1KEEP);
1238 __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1239 __ br(Assembler::HI, copy_big);
1240
1241 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1242 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1243
1244 __ cmp(count, u1(16/granularity));
1245 __ br(Assembler::LS, copy16);
1246
1247 __ cmp(count, u1(64/granularity));
1248 __ br(Assembler::HI, copy80);
1249
1250 __ cmp(count, u1(32/granularity));
1251 __ br(Assembler::LS, copy32);
1252
1253 // 33..64 bytes
1254 if (UseSIMDForMemoryOps) {
1255 bs.copy_load_at_32(v0, v1, Address(s, 0));
1256 bs.copy_load_at_32(v2, v3, Address(send, -32));
1257 bs.copy_store_at_32(Address(d, 0), v0, v1);
1258 bs.copy_store_at_32(Address(dend, -32), v2, v3);
1259 } else {
1260 bs.copy_load_at_16(t0, t1, Address(s, 0));
1261 bs.copy_load_at_16(t2, t3, Address(s, 16));
1262 bs.copy_load_at_16(t4, t5, Address(send, -32));
1263 bs.copy_load_at_16(t6, t7, Address(send, -16));
1264
1265 bs.copy_store_at_16(Address(d, 0), t0, t1);
1266 bs.copy_store_at_16(Address(d, 16), t2, t3);
1267 bs.copy_store_at_16(Address(dend, -32), t4, t5);
1268 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1269 }
1270 __ b(finish);
1271
1272 // 17..32 bytes
1273 __ bind(copy32);
1274 bs.copy_load_at_16(t0, t1, Address(s, 0));
1275 bs.copy_load_at_16(t6, t7, Address(send, -16));
1276
1277 bs.copy_store_at_16(Address(d, 0), t0, t1);
1278 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1279 __ b(finish);
1280
1281 // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1283 __ bind(copy80);
1284 if (UseSIMDForMemoryOps) {
1285 bs.copy_load_at_32(v0, v1, Address(s, 0));
1286 bs.copy_load_at_32(v2, v3, Address(s, 32));
1287 // Unaligned pointers can be an issue for copying.
1288 // The issue has more chances to happen when granularity of data is
1289 // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
1290 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1291 // The most performance drop has been seen for the range 65-80 bytes.
1292 // For such cases using the pair of ldp/stp instead of the third pair of
1293 // ldpq/stpq fixes the performance issue.
1294 if (granularity < sizeof (jint)) {
1295 Label copy96;
1296 __ cmp(count, u1(80/granularity));
1297 __ br(Assembler::HI, copy96);
1298 bs.copy_load_at_16(t0, t1, Address(send, -16));
1299
1300 bs.copy_store_at_32(Address(d, 0), v0, v1);
1301 bs.copy_store_at_32(Address(d, 32), v2, v3);
1302
1303 bs.copy_store_at_16(Address(dend, -16), t0, t1);
1304 __ b(finish);
1305
1306 __ bind(copy96);
1307 }
1308 bs.copy_load_at_32(v4, v5, Address(send, -32));
1309
1310 bs.copy_store_at_32(Address(d, 0), v0, v1);
1311 bs.copy_store_at_32(Address(d, 32), v2, v3);
1312
1313 bs.copy_store_at_32(Address(dend, -32), v4, v5);
1314 } else {
1315 bs.copy_load_at_16(t0, t1, Address(s, 0));
1316 bs.copy_load_at_16(t2, t3, Address(s, 16));
1317 bs.copy_load_at_16(t4, t5, Address(s, 32));
1318 bs.copy_load_at_16(t6, t7, Address(s, 48));
1319 bs.copy_load_at_16(t8, t9, Address(send, -16));
1320
1321 bs.copy_store_at_16(Address(d, 0), t0, t1);
1322 bs.copy_store_at_16(Address(d, 16), t2, t3);
1323 bs.copy_store_at_16(Address(d, 32), t4, t5);
1324 bs.copy_store_at_16(Address(d, 48), t6, t7);
1325 bs.copy_store_at_16(Address(dend, -16), t8, t9);
1326 }
1327 __ b(finish);
1328
1329 // 0..16 bytes
1330 __ bind(copy16);
1331 __ cmp(count, u1(8/granularity));
1332 __ br(Assembler::LO, copy8);
1333
1334 // 8..16 bytes
1335 bs.copy_load_at_8(t0, Address(s, 0));
1336 bs.copy_load_at_8(t1, Address(send, -8));
1337 bs.copy_store_at_8(Address(d, 0), t0);
1338 bs.copy_store_at_8(Address(dend, -8), t1);
1339 __ b(finish);
1340
1341 if (granularity < 8) {
1342 // 4..7 bytes
1343 __ bind(copy8);
1344 __ tbz(count, 2 - exact_log2(granularity), copy4);
1345 __ ldrw(t0, Address(s, 0));
1346 __ ldrw(t1, Address(send, -4));
1347 __ strw(t0, Address(d, 0));
1348 __ strw(t1, Address(dend, -4));
1349 __ b(finish);
1350 if (granularity < 4) {
1351 // 0..3 bytes
1352 __ bind(copy4);
1353 __ cbz(count, finish); // get rid of 0 case
1354 if (granularity == 2) {
1355 __ ldrh(t0, Address(s, 0));
1356 __ strh(t0, Address(d, 0));
1357 } else { // granularity == 1
1358 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1359 // the first and last byte.
1360 // Handle the 3 byte case by loading and storing base + count/2
1361 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
        // This does mean in the 1 byte case we load/store the same
1363 // byte 3 times.
1364 __ lsr(count, count, 1);
1365 __ ldrb(t0, Address(s, 0));
1366 __ ldrb(t1, Address(send, -1));
1367 __ ldrb(t2, Address(s, count));
1368 __ strb(t0, Address(d, 0));
1369 __ strb(t1, Address(dend, -1));
1370 __ strb(t2, Address(d, count));
1371 }
1372 __ b(finish);
1373 }
1374 }
1375
1376 __ bind(copy_big);
1377 if (is_backwards) {
1378 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1379 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1380 }
1381
1382 // Now we've got the small case out of the way we can align the
1383 // source address on a 2-word boundary.
1384
1385 // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for bulk copies of 2-word-aligned data.
1387 // Up until here, we have used t9, which aliases r15, but from here on, that register
1388 // can not be used as a temp register, as it contains the count.
1389
1390 Label aligned;
1391
1392 if (is_aligned) {
1393 // We may have to adjust by 1 word to get s 2-word-aligned.
1394 __ tbz(s, exact_log2(wordSize), aligned);
1395 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1396 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1397 __ sub(count, count, wordSize/granularity);
1398 } else {
1399 if (is_backwards) {
1400 __ andr(r15, s, 2 * wordSize - 1);
1401 } else {
1402 __ neg(r15, s);
1403 __ andr(r15, r15, 2 * wordSize - 1);
1404 }
1405 // r15 is the byte adjustment needed to align s.
1406 __ cbz(r15, aligned);
1407 int shift = exact_log2(granularity);
1408 if (shift > 0) {
1409 __ lsr(r15, r15, shift);
1410 }
1411 __ sub(count, count, r15);
1412
1413 #if 0
1414 // ?? This code is only correct for a disjoint copy. It may or
1415 // may not make sense to use it in that case.
1416
1417 // Copy the first pair; s and d may not be aligned.
1418 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1419 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1420
1421 // Align s and d, adjust count
1422 if (is_backwards) {
1423 __ sub(s, s, r15);
1424 __ sub(d, d, r15);
1425 } else {
1426 __ add(s, s, r15);
1427 __ add(d, d, r15);
1428 }
1429 #else
1430 copy_memory_small(decorators, type, s, d, r15, step);
1431 #endif
1432 }
1433
1434 __ bind(aligned);
1435
1436 // s is now 2-word-aligned.
1437
1438 // We have a count of units and some trailing bytes. Adjust the
1439 // count and do a bulk copy of words. If the shift is zero
1440 // perform a move instead to benefit from zero latency moves.
1441 int shift = exact_log2(wordSize/granularity);
1442 if (shift > 0) {
1443 __ lsr(r15, count, shift);
1444 } else {
1445 __ mov(r15, count);
1446 }
1447 if (direction == copy_forwards) {
1448 if (type != T_OBJECT) {
1449 __ bl(StubRoutines::aarch64::copy_byte_f());
1450 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1451 __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
1452 } else {
1453 __ bl(StubRoutines::aarch64::copy_oop_f());
1454 }
1455 } else {
1456 if (type != T_OBJECT) {
1457 __ bl(StubRoutines::aarch64::copy_byte_b());
1458 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1459 __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
1460 } else {
1461 __ bl(StubRoutines::aarch64::copy_oop_b());
1462 }
1463 }
1464
1465 // And the tail.
1466 copy_memory_small(decorators, type, s, d, count, step);
1467
1468 if (granularity >= 8) __ bind(copy8);
1469 if (granularity >= 4) __ bind(copy4);
1470 __ bind(finish);
1471 }
1472
1473
1474 void clobber_registers() {
1475 #ifdef ASSERT
1476 RegSet clobbered
1477 = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1478 __ mov(rscratch1, (uint64_t)0xdeadbeef);
1479 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1480 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1481 __ mov(*it, rscratch1);
1482 }
1483 #endif
1484
1485 }
1486
1487 // Scan over array at a for count oops, verifying each one.
1488 // Preserves a and count, clobbers rscratch1 and rscratch2.
1489 void verify_oop_array (int size, Register a, Register count, Register temp) {
1490 Label loop, end;
1491 __ mov(rscratch1, a);
1492 __ mov(rscratch2, zr);
1493 __ bind(loop);
1494 __ cmp(rscratch2, count);
1495 __ br(Assembler::HS, end);
1496 if (size == wordSize) {
1497 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1498 __ verify_oop(temp);
1499 } else {
1500 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1501 __ decode_heap_oop(temp); // calls verify_oop
1502 }
1503 __ add(rscratch2, rscratch2, 1);
1504 __ b(loop);
1505 __ bind(end);
1506 }
1507
1508 // Arguments:
1509 // stub_id - is used to name the stub and identify all details of
1510 // how to perform the copy.
1511 //
  //   nopush_entry - is assigned to the stub's post push entry point
  //                  unless it is null
1514 //
1515 // Inputs:
1516 // c_rarg0 - source array address
1517 // c_rarg1 - destination array address
1518 // c_rarg2 - element count, treated as ssize_t, can be zero
1519 //
1520 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1521 // the hardware handle it. The two dwords within qwords that span
1522 // cache line boundaries will still be loaded and stored atomically.
1523 //
1524 // Side Effects: nopush_entry is set to the (post push) entry point
1525 // so it can be used by the corresponding conjoint
1526 // copy method
1527 //
1528 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
1529 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1530 RegSet saved_reg = RegSet::of(s, d, count);
1531 int size;
1532 bool aligned;
1533 bool is_oop;
1534 bool dest_uninitialized;
1535 switch (stub_id) {
1536 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1537 size = sizeof(jbyte);
1538 aligned = false;
1539 is_oop = false;
1540 dest_uninitialized = false;
1541 break;
1542 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1543 size = sizeof(jbyte);
1544 aligned = true;
1545 is_oop = false;
1546 dest_uninitialized = false;
1547 break;
1548 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1549 size = sizeof(jshort);
1550 aligned = false;
1551 is_oop = false;
1552 dest_uninitialized = false;
1553 break;
1554 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1555 size = sizeof(jshort);
1556 aligned = true;
1557 is_oop = false;
1558 dest_uninitialized = false;
1559 break;
1560 case StubId::stubgen_jint_disjoint_arraycopy_id:
1561 size = sizeof(jint);
1562 aligned = false;
1563 is_oop = false;
1564 dest_uninitialized = false;
1565 break;
1566 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1567 size = sizeof(jint);
1568 aligned = true;
1569 is_oop = false;
1570 dest_uninitialized = false;
1571 break;
1572 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1573 // since this is always aligned we can (should!) use the same
1574 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
1575 ShouldNotReachHere();
1576 break;
1577 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1578 size = sizeof(jlong);
1579 aligned = true;
1580 is_oop = false;
1581 dest_uninitialized = false;
1582 break;
1583 case StubId::stubgen_oop_disjoint_arraycopy_id:
1584 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1585 aligned = !UseCompressedOops;
1586 is_oop = true;
1587 dest_uninitialized = false;
1588 break;
1589 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1590 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1591 aligned = !UseCompressedOops;
1592 is_oop = true;
1593 dest_uninitialized = false;
1594 break;
1595 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1596 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1597 aligned = !UseCompressedOops;
1598 is_oop = true;
1599 dest_uninitialized = true;
1600 break;
1601 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1602 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1603 aligned = !UseCompressedOops;
1604 is_oop = true;
1605 dest_uninitialized = true;
1606 break;
1607 default:
1608 ShouldNotReachHere();
1609 break;
1610 }
1611
1612 __ align(CodeEntryAlignment);
1613 StubCodeMark mark(this, stub_id);
1614 address start = __ pc();
1615 __ enter();
1616
1617 if (nopush_entry != nullptr) {
1618 *nopush_entry = __ pc();
1619 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1620 BLOCK_COMMENT("Entry:");
1621 }
1622
1623 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1624 if (dest_uninitialized) {
1625 decorators |= IS_DEST_UNINITIALIZED;
1626 }
1627 if (aligned) {
1628 decorators |= ARRAYCOPY_ALIGNED;
1629 }
1630
1631 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1632 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1633
1634 if (is_oop) {
1635 // save regs before copy_memory
1636 __ push(RegSet::of(d, count), sp);
1637 }
1638 {
1639 // UnsafeMemoryAccess page error: continue after unsafe access
1640 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1641 UnsafeMemoryAccessMark umam(this, add_entry, true);
1642 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1643 }
1644
1645 if (is_oop) {
1646 __ pop(RegSet::of(d, count), sp);
1647 if (VerifyOops)
1648 verify_oop_array(size, d, count, r16);
1649 }
1650
1651 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1652
1653 __ leave();
1654 __ mov(r0, zr); // return 0
1655 __ ret(lr);
1656 return start;
1657 }
1658
1659 // Arguments:
1660 // stub_id - is used to name the stub and identify all details of
1661 // how to perform the copy.
1662 //
  //   nooverlap_target - identifies the (post push) entry for the
1664 // corresponding disjoint copy routine which can be
1665 // jumped to if the ranges do not actually overlap
1666 //
  //   nopush_entry - is assigned to the stub's post push entry point
  //                  unless it is null
1669 //
1670 //
1671 // Inputs:
1672 // c_rarg0 - source array address
1673 // c_rarg1 - destination array address
1674 // c_rarg2 - element count, treated as ssize_t, can be zero
1675 //
1676 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1677 // the hardware handle it. The two dwords within qwords that span
1678 // cache line boundaries will still be loaded and stored atomically.
1679 //
1680 // Side Effects:
1681 // nopush_entry is set to this stub's post-push entry point so it can
1682 // be used as a fallback entry by the unsafe and generic copy stubs
1683 //
1684 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1685 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1686 RegSet saved_regs = RegSet::of(s, d, count);
1687 int size;
1688 bool aligned;
1689 bool is_oop;
1690 bool dest_uninitialized;
1691 switch (stub_id) {
1692 case StubId::stubgen_jbyte_arraycopy_id:
1693 size = sizeof(jbyte);
1694 aligned = false;
1695 is_oop = false;
1696 dest_uninitialized = false;
1697 break;
1698 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1699 size = sizeof(jbyte);
1700 aligned = true;
1701 is_oop = false;
1702 dest_uninitialized = false;
1703 break;
1704 case StubId::stubgen_jshort_arraycopy_id:
1705 size = sizeof(jshort);
1706 aligned = false;
1707 is_oop = false;
1708 dest_uninitialized = false;
1709 break;
1710 case StubId::stubgen_arrayof_jshort_arraycopy_id:
1711 size = sizeof(jshort);
1712 aligned = true;
1713 is_oop = false;
1714 dest_uninitialized = false;
1715 break;
1716 case StubId::stubgen_jint_arraycopy_id:
1717 size = sizeof(jint);
1718 aligned = false;
1719 is_oop = false;
1720 dest_uninitialized = false;
1721 break;
1722 case StubId::stubgen_arrayof_jint_arraycopy_id:
1723 size = sizeof(jint);
1724 aligned = true;
1725 is_oop = false;
1726 dest_uninitialized = false;
1727 break;
1728 case StubId::stubgen_jlong_arraycopy_id:
1729 // since this is always aligned we can (should!) use the same
1730 // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
1731 ShouldNotReachHere();
1732 break;
1733 case StubId::stubgen_arrayof_jlong_arraycopy_id:
1734 size = sizeof(jlong);
1735 aligned = true;
1736 is_oop = false;
1737 dest_uninitialized = false;
1738 break;
1739 case StubId::stubgen_oop_arraycopy_id:
1740 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1741 aligned = !UseCompressedOops;
1742 is_oop = true;
1743 dest_uninitialized = false;
1744 break;
1745 case StubId::stubgen_arrayof_oop_arraycopy_id:
1746 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1747 aligned = !UseCompressedOops;
1748 is_oop = true;
1749 dest_uninitialized = false;
1750 break;
1751 case StubId::stubgen_oop_arraycopy_uninit_id:
1752 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1753 aligned = !UseCompressedOops;
1754 is_oop = true;
1755 dest_uninitialized = true;
1756 break;
1757 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1758 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1759 aligned = !UseCompressedOops;
1760 is_oop = true;
1761 dest_uninitialized = true;
1762 break;
1763 default:
1764 ShouldNotReachHere();
1765 }
1766
1767 StubCodeMark mark(this, stub_id);
1768 address start = __ pc();
1769 __ enter();
1770
1771 if (nopush_entry != nullptr) {
1772 *nopush_entry = __ pc();
1773 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1774 BLOCK_COMMENT("Entry:");
1775 }
1776
1777 // use fwd copy when (d-s) above_equal (count*size)
1778 Label L_overlapping;
1779 __ sub(rscratch1, d, s);
1780 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1781 __ br(Assembler::LO, L_overlapping);
1782 __ b(RuntimeAddress(nooverlap_target));
1783 __ bind(L_overlapping);
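// Illustrative check (comment only): for a jint copy (size == 4) with
// s == 0x1000, d == 0x1008 and count == 4, d - s == 8 is below
// count << 2 == 16, so we fall through and copy backwards; from
// d == 0x1010 upwards we branch to the disjoint (forward) stub. When
// d < s the subtraction wraps to a huge unsigned value, so the forward
// copy is also taken, which is safe for that direction of overlap.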
1784
1785 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1786 if (dest_uninitialized) {
1787 decorators |= IS_DEST_UNINITIALIZED;
1788 }
1789 if (aligned) {
1790 decorators |= ARRAYCOPY_ALIGNED;
1791 }
1792
1793 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1794 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1795
1796 if (is_oop) {
1797 // save regs before copy_memory
1798 __ push(RegSet::of(d, count), sp);
1799 }
1800 {
1801 // UnsafeMemoryAccess page error: continue after unsafe access
1802 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1803 UnsafeMemoryAccessMark umam(this, add_entry, true);
1804 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1805 }
1806 if (is_oop) {
1807 __ pop(RegSet::of(d, count), sp);
1808 if (VerifyOops)
1809 verify_oop_array(size, d, count, r16);
1810 }
1811 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1812 __ leave();
1813 __ mov(r0, zr); // return 0
1814 __ ret(lr);
1815 return start;
1816 }
1817
1818 // Helper for generating a dynamic type check.
1819 // Smashes rscratch1, rscratch2.
1820 void generate_type_check(Register sub_klass,
1821 Register super_check_offset,
1822 Register super_klass,
1823 Register temp1,
1824 Register temp2,
1825 Register result,
1826 Label& L_success) {
1827 assert_different_registers(sub_klass, super_check_offset, super_klass);
1828
1829 BLOCK_COMMENT("type_check:");
1830
1831 Label L_miss;
1832
1833 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
1834 super_check_offset);
1835 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
1836
1837 // Fall through on failure!
1838 __ BIND(L_miss);
1839 }
1840
1841 //
1842 // Generate checkcasting array copy stub
1843 //
1844 // Input:
1845 // c_rarg0 - source array address
1846 // c_rarg1 - destination array address
1847 // c_rarg2 - element count, treated as ssize_t, can be zero
1848 // c_rarg3 - size_t ckoff (super_check_offset)
1849 // c_rarg4 - oop ckval (super_klass)
1850 //
1851 // Output:
1852 // r0 == 0 - success
1853 // r0 == -1^K - failure, where K is partial transfer count
1854 //
1855 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
1856 bool dest_uninitialized;
1857 switch (stub_id) {
1858 case StubId::stubgen_checkcast_arraycopy_id:
1859 dest_uninitialized = false;
1860 break;
1861 case StubId::stubgen_checkcast_arraycopy_uninit_id:
1862 dest_uninitialized = true;
1863 break;
1864 default:
1865 ShouldNotReachHere();
1866 }
1867
1868 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1869
1870 // Input registers (after setup_arg_regs)
1871 const Register from = c_rarg0; // source array address
1872 const Register to = c_rarg1; // destination array address
1873 const Register count = c_rarg2; // elements count
1874 const Register ckoff = c_rarg3; // super_check_offset
1875 const Register ckval = c_rarg4; // super_klass
1876
1877 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1878 RegSet wb_post_saved_regs = RegSet::of(count);
1879
1880 // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1881 const Register copied_oop = r22; // actual oop copied
1882 const Register count_save = r21; // orig elements count
1883 const Register start_to = r20; // destination array start address
1884 const Register r19_klass = r19; // oop._klass
1885
1886 // Registers used as gc temps (r5, r6, r7 are save-on-call)
1887 const Register gct1 = r5, gct2 = r6, gct3 = r7;
1888
1889 //---------------------------------------------------------------
1890 // Assembler stub will be used for this call to arraycopy
1891 // if the two arrays are subtypes of Object[] but the
1892 // destination array type is not equal to or a supertype
1893 // of the source type. Each element must be separately
1894 // checked.
1895
1896 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1897 copied_oop, r19_klass, count_save);
1898
1899 __ align(CodeEntryAlignment);
1900 StubCodeMark mark(this, stub_id);
1901 address start = __ pc();
1902
1903 __ enter(); // required for proper stackwalking of RuntimeStub frame
1904
1905 #ifdef ASSERT
1906 // caller guarantees that the arrays really are different
1907 // otherwise, we would have to make conjoint checks
1908 { Label L;
1909 __ b(L); // conjoint check not yet implemented
1910 __ stop("checkcast_copy within a single array");
1911 __ bind(L);
1912 }
1913 #endif //ASSERT
1914
1915 // Caller of this entry point must set up the argument registers.
1916 if (nopush_entry != nullptr) {
1917 *nopush_entry = __ pc();
1918 BLOCK_COMMENT("Entry:");
1919 }
1920
1921 // Empty array: Nothing to do.
1922 __ cbz(count, L_done);
1923 __ push(RegSet::of(r19, r20, r21, r22), sp);
1924
1925 #ifdef ASSERT
1926 BLOCK_COMMENT("assert consistent ckoff/ckval");
1927 // The ckoff and ckval must be mutually consistent,
1928 // even though caller generates both.
1929 { Label L;
1930 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1931 __ ldrw(start_to, Address(ckval, sco_offset));
1932 __ cmpw(ckoff, start_to);
1933 __ br(Assembler::EQ, L);
1934 __ stop("super_check_offset inconsistent");
1935 __ bind(L);
1936 }
1937 #endif //ASSERT
1938
1939 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1940 bool is_oop = true;
1941 int element_size = UseCompressedOops ? 4 : 8;
1942 if (dest_uninitialized) {
1943 decorators |= IS_DEST_UNINITIALIZED;
1944 }
1945
1946 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1947 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1948
1949 // save the original count
1950 __ mov(count_save, count);
1951
1952 // Copy from low to high addresses
1953 __ mov(start_to, to); // Save destination array start address
1954 __ b(L_load_element);
1955
1956 // ======== begin loop ========
1957 // (Loop is rotated; its entry is L_load_element.)
1958 // Loop control:
1959 // for (; count != 0; count--) {
1960 // copied_oop = load_heap_oop(from++);
1961 // ... generate_type_check ...;
1962 // store_heap_oop(to++, copied_oop);
1963 // }
1964 __ align(OptoLoopAlignment);
1965
1966 __ BIND(L_store_element);
1967 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1968 __ post(to, element_size), copied_oop, noreg,
1969 gct1, gct2, gct3);
1970 __ sub(count, count, 1);
1971 __ cbz(count, L_do_card_marks);
1972
1973 // ======== loop entry is here ========
1974 __ BIND(L_load_element);
1975 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1976 copied_oop, noreg, __ post(from, element_size),
1977 gct1);
1978 __ cbz(copied_oop, L_store_element);
1979
1980 __ load_klass(r19_klass, copied_oop);// query the object klass
1981
1982 BLOCK_COMMENT("type_check:");
1983 generate_type_check(/*sub_klass*/r19_klass,
1984 /*super_check_offset*/ckoff,
1985 /*super_klass*/ckval,
1986 /*temp1*/gct1,
1987 /*temp2*/gct2,
1988 /*result*/r10, L_store_element);
1989
1990 // Fall through on failure!
1991
1992 // ======== end loop ========
1993
1994 // It was a real error; we must depend on the caller to finish the job.
1995 // Register count = remaining oops, count_save = total oops.
1996 // Emit GC store barriers for the oops we have copied and report
1997 // their number to the caller.
1998
1999 __ subs(count, count_save, count); // K = partially copied oop count
2000 __ eon(count, count, zr); // report (-1^K) to caller
2001 __ br(Assembler::EQ, L_done_pop);
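// The flags are still those of the subs above: EQ means no elements were
// copied, so the card-marking epilogue can be skipped. eon with zr is a
// bitwise NOT, so count now holds ~K == -1^K, the documented failure
// return value.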
2002
2003 __ BIND(L_do_card_marks);
2004 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
2005
2006 __ bind(L_done_pop);
2007 __ pop(RegSet::of(r19, r20, r21, r22), sp);
2008 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2009
2010 __ bind(L_done);
2011 __ mov(r0, count);
2012 __ leave();
2013 __ ret(lr);
2014
2015 return start;
2016 }
2017
2018 // Perform range checks on the proposed arraycopy.
2019 // Kills temp, but nothing else.
2020 // Also, clean the sign bits of src_pos and dst_pos.
2021 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2022 Register src_pos, // source position (c_rarg1)
2023 Register dst, // destination array oop (c_rarg2)
2024 Register dst_pos, // destination position (c_rarg3)
2025 Register length,
2026 Register temp,
2027 Label& L_failed) {
2028 BLOCK_COMMENT("arraycopy_range_checks:");
2029
2030 assert_different_registers(rscratch1, temp);
2031
2032 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2033 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2034 __ addw(temp, length, src_pos);
2035 __ cmpw(temp, rscratch1);
2036 __ br(Assembler::HI, L_failed);
2037
2038 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2039 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2040 __ addw(temp, length, dst_pos);
2041 __ cmpw(temp, rscratch1);
2042 __ br(Assembler::HI, L_failed);
2043
2044 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2045 __ movw(src_pos, src_pos);
2046 __ movw(dst_pos, dst_pos);
2047
2048 BLOCK_COMMENT("arraycopy_range_checks done");
2049 }
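// Example (comment only): with src.length == 10, src_pos == 7 and
// length == 5 the first check computes 12; the unsigned HI compare
// against 10 fails and we branch to L_failed. The movw-to-self above
// zero-extends each 32-bit position into its full 64-bit register
// before the callers use it in address arithmetic.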
2050
2051 // These stubs get called from some dumb test routine.
2052 // I'll write them properly when they're called from
2053 // something that's actually doing something.
2054 static void fake_arraycopy_stub(address src, address dst, int count) {
2055 assert(count == 0, "huh?");
2056 }
2057
2058
2059 //
2060 // Generate 'unsafe' array copy stub
2061 // Though just as safe as the other stubs, it takes an unscaled
2062 // size_t argument instead of an element count.
2063 //
2064 // Input:
2065 // c_rarg0 - source array address
2066 // c_rarg1 - destination array address
2067 // c_rarg2 - byte count, treated as ssize_t, can be zero
2068 //
2069 // Examines the alignment of the operands and dispatches
2070 // to a long, int, short, or byte copy loop.
2071 //
2072 address generate_unsafe_copy(address byte_copy_entry,
2073 address short_copy_entry,
2074 address int_copy_entry,
2075 address long_copy_entry) {
2076 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2077
2078 Label L_long_aligned, L_int_aligned, L_short_aligned;
2079 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2080
2081 __ align(CodeEntryAlignment);
2082 StubCodeMark mark(this, stub_id);
2083 address start = __ pc();
2084 __ enter(); // required for proper stackwalking of RuntimeStub frame
2085
2086 // bump this on entry, not on exit:
2087 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2088
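// Dispatch on the joint alignment of s, d and count: or-ing them
// together leaves a low bit clear only if it is clear in all three.
// E.g. s == 0x1000, d == 0x2008 and count == 0x38 give
// (s | d | count) & 7 == 0, so we take the long copy below with
// count >> 3 == 7 elements.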
2089 __ orr(rscratch1, s, d);
2090 __ orr(rscratch1, rscratch1, count);
2091
2092 __ andr(rscratch1, rscratch1, BytesPerLong-1);
2093 __ cbz(rscratch1, L_long_aligned);
2094 __ andr(rscratch1, rscratch1, BytesPerInt-1);
2095 __ cbz(rscratch1, L_int_aligned);
2096 __ tbz(rscratch1, 0, L_short_aligned);
2097 __ b(RuntimeAddress(byte_copy_entry));
2098
2099 __ BIND(L_short_aligned);
2100 __ lsr(count, count, LogBytesPerShort); // size => short_count
2101 __ b(RuntimeAddress(short_copy_entry));
2102 __ BIND(L_int_aligned);
2103 __ lsr(count, count, LogBytesPerInt); // size => int_count
2104 __ b(RuntimeAddress(int_copy_entry));
2105 __ BIND(L_long_aligned);
2106 __ lsr(count, count, LogBytesPerLong); // size => long_count
2107 __ b(RuntimeAddress(long_copy_entry));
2108
2109 return start;
2110 }
2111
2112 //
2113 // Generate generic array copy stubs
2114 //
2115 // Input:
2116 // c_rarg0 - src oop
2117 // c_rarg1 - src_pos (32-bits)
2118 // c_rarg2 - dst oop
2119 // c_rarg3 - dst_pos (32-bits)
2120 // c_rarg4 - element count (32-bits)
2121 //
2122 // Output:
2123 // r0 == 0 - success
2124 // r0 == -1^K - failure, where K is partial transfer count
2125 //
2126 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2127 address int_copy_entry, address oop_copy_entry,
2128 address long_copy_entry, address checkcast_copy_entry) {
2129 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2130
2131 Label L_failed, L_objArray;
2132 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2133
2134 // Input registers
2135 const Register src = c_rarg0; // source array oop
2136 const Register src_pos = c_rarg1; // source position
2137 const Register dst = c_rarg2; // destination array oop
2138 const Register dst_pos = c_rarg3; // destination position
2139 const Register length = c_rarg4;
2140
2141
2142 // Registers used as temps
2143 const Register dst_klass = c_rarg5;
2144
2145 __ align(CodeEntryAlignment);
2146
2147 StubCodeMark mark(this, stub_id);
2148
2149 address start = __ pc();
2150
2151 __ enter(); // required for proper stackwalking of RuntimeStub frame
2152
2153 // bump this on entry, not on exit:
2154 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2155
2156 //-----------------------------------------------------------------------
2157 // Assembler stub will be used for this call to arraycopy
2158 // if the following conditions are met:
2159 //
2160 // (1) src and dst must not be null.
2161 // (2) src_pos must not be negative.
2162 // (3) dst_pos must not be negative.
2163 // (4) length must not be negative.
2164 // (5) src klass and dst klass should be the same and not null.
2165 // (6) src and dst should be arrays.
2166 // (7) src_pos + length must not exceed length of src.
2167 // (8) dst_pos + length must not exceed length of dst.
2168 //
2169
2170 // if (src == nullptr) return -1;
2171 __ cbz(src, L_failed);
2172
2173 // if (src_pos < 0) return -1;
2174 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2175
2176 // if (dst == nullptr) return -1;
2177 __ cbz(dst, L_failed);
2178
2179 // if (dst_pos < 0) return -1;
2180 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2181
2182 // registers used as temp
2183 const Register scratch_length = r16; // elements count to copy
2184 const Register scratch_src_klass = r17; // array klass
2185 const Register lh = r15; // layout helper
2186
2187 // if (length < 0) return -1;
2188 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2189 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2190
2191 __ load_klass(scratch_src_klass, src);
2192 #ifdef ASSERT
2193 // assert(src->klass() != nullptr);
2194 {
2195 BLOCK_COMMENT("assert klasses not null {");
2196 Label L1, L2;
2197 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2198 __ bind(L1);
2199 __ stop("broken null klass");
2200 __ bind(L2);
2201 __ load_klass(rscratch1, dst);
2202 __ cbz(rscratch1, L1); // this would be broken also
2203 BLOCK_COMMENT("} assert klasses not null done");
2204 }
2205 #endif
2206
2207 // Load layout helper (32-bits)
2208 //
2209 // |array_tag| | header_size | element_type | |log2_element_size|
2210 // 32 30 24 16 8 2 0
2211 //
2212 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2213 //
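// Decoding sketch (following the diagram above): the array tag sits in
// the top two bits (the sign-bit test below relies on this), the header
// size is extracted with ubfx using Klass::_lh_header_size_shift and
// Klass::_lh_header_size_mask, and the log2 element size occupies the
// low bits (its shift is 0, as asserted further down).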
2214
2215 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2216
2217 // Handle objArrays completely differently...
2218 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2219 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2220 __ movw(rscratch1, objArray_lh);
2221 __ eorw(rscratch2, lh, rscratch1);
2222 __ cbzw(rscratch2, L_objArray);
2223
2224 // if (src->klass() != dst->klass()) return -1;
2225 __ load_klass(rscratch2, dst);
2226 __ eor(rscratch2, rscratch2, scratch_src_klass);
2227 __ cbnz(rscratch2, L_failed);
2228
2229 // if (!src->is_Array()) return -1;
2230 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2231
2232 // At this point, it is known to be a typeArray (array_tag 0x3).
2233 #ifdef ASSERT
2234 {
2235 BLOCK_COMMENT("assert primitive array {");
2236 Label L;
2237 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2238 __ cmpw(lh, rscratch2);
2239 __ br(Assembler::GE, L);
2240 __ stop("must be a primitive array");
2241 __ bind(L);
2242 BLOCK_COMMENT("} assert primitive array done");
2243 }
2244 #endif
2245
2246 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2247 rscratch2, L_failed);
2248
2249 // TypeArrayKlass
2250 //
2251 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2252 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2253 //
2254
2255 const Register rscratch1_offset = rscratch1; // array offset
2256 const Register r15_elsize = lh; // element size
2257
2258 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2259 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2260 __ add(src, src, rscratch1_offset); // src array offset
2261 __ add(dst, dst, rscratch1_offset); // dst array offset
2262 BLOCK_COMMENT("choose copy loop based on element size");
2263
2264 // next registers should be set before the jump to corresponding stub
2265 const Register from = c_rarg0; // source array address
2266 const Register to = c_rarg1; // destination array address
2267 const Register count = c_rarg2; // elements count
2268
2269 // 'from', 'to', 'count' registers should be set in such order
2270 // since they are the same as 'src', 'src_pos', 'dst'.
2271
2272 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2273
2274 // The possible values of elsize are 0-3, i.e. exact_log2(element
2275 // size in bytes). We do a simple bitwise binary search.
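// E.g. for a long[] the log2 element size is 3: bit 1 is set, so we
// branch to L_copy_ints, where bit 0 (also set) sends us on to
// L_copy_longs; for a short[] (elsize 1) bit 1 is clear and bit 0
// routes us to L_copy_shorts.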
2276 __ BIND(L_copy_bytes);
2277 __ tbnz(r15_elsize, 1, L_copy_ints);
2278 __ tbnz(r15_elsize, 0, L_copy_shorts);
2279 __ lea(from, Address(src, src_pos));// src_addr
2280 __ lea(to, Address(dst, dst_pos));// dst_addr
2281 __ movw(count, scratch_length); // length
2282 __ b(RuntimeAddress(byte_copy_entry));
2283
2284 __ BIND(L_copy_shorts);
2285 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2286 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2287 __ movw(count, scratch_length); // length
2288 __ b(RuntimeAddress(short_copy_entry));
2289
2290 __ BIND(L_copy_ints);
2291 __ tbnz(r15_elsize, 0, L_copy_longs);
2292 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2293 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2294 __ movw(count, scratch_length); // length
2295 __ b(RuntimeAddress(int_copy_entry));
2296
2297 __ BIND(L_copy_longs);
2298 #ifdef ASSERT
2299 {
2300 BLOCK_COMMENT("assert long copy {");
2301 Label L;
2302 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2303 __ cmpw(r15_elsize, LogBytesPerLong);
2304 __ br(Assembler::EQ, L);
2305 __ stop("must be long copy, but elsize is wrong");
2306 __ bind(L);
2307 BLOCK_COMMENT("} assert long copy done");
2308 }
2309 #endif
2310 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2311 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2312 __ movw(count, scratch_length); // length
2313 __ b(RuntimeAddress(long_copy_entry));
2314
2315 // ObjArrayKlass
2316 __ BIND(L_objArray);
2317 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2318
2319 Label L_plain_copy, L_checkcast_copy;
2320 // test array classes for subtyping
2321 __ load_klass(r15, dst);
2322 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2323 __ br(Assembler::NE, L_checkcast_copy);
2324
2325 // Identically typed arrays can be copied without element-wise checks.
2326 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2327 rscratch2, L_failed);
2328
2329 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2330 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2331 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2332 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2333 __ movw(count, scratch_length); // length
2334 __ BIND(L_plain_copy);
2335 __ b(RuntimeAddress(oop_copy_entry));
2336
2337 __ BIND(L_checkcast_copy);
2338 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2339 {
2340 // Before looking at dst.length, make sure dst is also an objArray.
2341 __ ldrw(rscratch1, Address(r15, lh_offset));
2342 __ movw(rscratch2, objArray_lh);
2343 __ eorw(rscratch1, rscratch1, rscratch2);
2344 __ cbnzw(rscratch1, L_failed);
2345
2346 // It is safe to examine both src.length and dst.length.
2347 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2348 r15, L_failed);
2349
2350 __ load_klass(dst_klass, dst); // reload
2351
2352 // Marshal the base address arguments now, freeing registers.
2353 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2354 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2355 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2356 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2357 __ movw(count, length); // length (reloaded)
2358 Register sco_temp = c_rarg3; // this register is free now
2359 assert_different_registers(from, to, count, sco_temp,
2360 dst_klass, scratch_src_klass);
2361 // assert_clean_int(count, sco_temp);
2362
2363 // Generate the type check.
2364 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2365 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2366
2367 // Smashes rscratch1, rscratch2
2368 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2369 L_plain_copy);
2370
2371 // Fetch destination element klass from the ObjArrayKlass header.
2372 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2373 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2374 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2375
2376 // the checkcast_copy loop needs two extra arguments:
2377 assert(c_rarg3 == sco_temp, "#3 already in place");
2378 // Set up arguments for checkcast_copy_entry.
2379 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2380 __ b(RuntimeAddress(checkcast_copy_entry));
2381 }
2382
2383 __ BIND(L_failed);
2384 __ mov(r0, -1);
2385 __ leave(); // required for proper stackwalking of RuntimeStub frame
2386 __ ret(lr);
2387
2388 return start;
2389 }
2390
2391 //
2392 // Generate stub for array fill. If "aligned" is true, the
2393 // "to" address is assumed to be heapword aligned.
2394 //
2395 // Arguments for generated stub:
2396 // to: c_rarg0
2397 // value: c_rarg1
2398 // count: c_rarg2 treated as signed
2399 //
2400 address generate_fill(StubId stub_id) {
2401 BasicType t;
2402 bool aligned;
2403
2404 switch (stub_id) {
2405 case StubId::stubgen_jbyte_fill_id:
2406 t = T_BYTE;
2407 aligned = false;
2408 break;
2409 case StubId::stubgen_jshort_fill_id:
2410 t = T_SHORT;
2411 aligned = false;
2412 break;
2413 case StubId::stubgen_jint_fill_id:
2414 t = T_INT;
2415 aligned = false;
2416 break;
2417 case StubId::stubgen_arrayof_jbyte_fill_id:
2418 t = T_BYTE;
2419 aligned = true;
2420 break;
2421 case StubId::stubgen_arrayof_jshort_fill_id:
2422 t = T_SHORT;
2423 aligned = true;
2424 break;
2425 case StubId::stubgen_arrayof_jint_fill_id:
2426 t = T_INT;
2427 aligned = true;
2428 break;
2429 default:
2430 ShouldNotReachHere();
2431 };
2432
2433 __ align(CodeEntryAlignment);
2434 StubCodeMark mark(this, stub_id);
2435 address start = __ pc();
2436
2437 BLOCK_COMMENT("Entry:");
2438
2439 const Register to = c_rarg0; // destination array address
2440 const Register value = c_rarg1; // value
2441 const Register count = c_rarg2; // elements count
2442
2443 const Register bz_base = r10; // base for block_zero routine
2444 const Register cnt_words = r11; // temp register
2445
2446 __ enter();
2447
2448 Label L_fill_elements, L_exit1;
2449
2450 int shift = -1;
2451 switch (t) {
2452 case T_BYTE:
2453 shift = 0;
2454 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2455 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2456 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2457 __ br(Assembler::LO, L_fill_elements);
2458 break;
2459 case T_SHORT:
2460 shift = 1;
2461 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2462 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2463 __ br(Assembler::LO, L_fill_elements);
2464 break;
2465 case T_INT:
2466 shift = 2;
2467 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2468 __ br(Assembler::LO, L_fill_elements);
2469 break;
2470 default: ShouldNotReachHere();
2471 }
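// For sub-word fills 'value' has now been replicated to 32 bits, e.g. a
// byte fill value of 0xAB becomes 0xABABABAB; the bfi(value, value, 32, 32)
// before the word loop below widens it again to 64 bits.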
2472
2473 // Align the destination address to an 8-byte boundary.
2474 Label L_skip_align1, L_skip_align2, L_skip_align4;
2475 if (!aligned) {
2476 switch (t) {
2477 case T_BYTE:
2478 // One byte misalignment happens only for byte arrays.
2479 __ tbz(to, 0, L_skip_align1);
2480 __ strb(value, Address(__ post(to, 1)));
2481 __ subw(count, count, 1);
2482 __ bind(L_skip_align1);
2483 // Fallthrough
2484 case T_SHORT:
2485 // Two-byte misalignment happens only for byte and short (char) arrays.
2486 __ tbz(to, 1, L_skip_align2);
2487 __ strh(value, Address(__ post(to, 2)));
2488 __ subw(count, count, 2 >> shift);
2489 __ bind(L_skip_align2);
2490 // Fallthrough
2491 case T_INT:
2492 // Align to 8 bytes, we know we are 4 byte aligned to start.
2493 __ tbz(to, 2, L_skip_align4);
2494 __ strw(value, Address(__ post(to, 4)));
2495 __ subw(count, count, 4 >> shift);
2496 __ bind(L_skip_align4);
2497 break;
2498 default: ShouldNotReachHere();
2499 }
2500 }
2501
2502 //
2503 // Fill large chunks
2504 //
2505 __ lsrw(cnt_words, count, 3 - shift); // number of words
2506 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2507 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2508 if (UseBlockZeroing) {
2509 Label non_block_zeroing, rest;
2510 // If the fill value is zero we can use the fast zero_words().
2511 __ cbnz(value, non_block_zeroing);
2512 __ mov(bz_base, to);
2513 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2514 address tpc = __ zero_words(bz_base, cnt_words);
2515 if (tpc == nullptr) {
2516 fatal("CodeCache is full at generate_fill");
2517 }
2518 __ b(rest);
2519 __ bind(non_block_zeroing);
2520 __ fill_words(to, cnt_words, value);
2521 __ bind(rest);
2522 } else {
2523 __ fill_words(to, cnt_words, value);
2524 }
2525
2526 // Remaining count is less than 8 bytes. Fill it by a single store.
2527 // Note that the total length is no less than 8 bytes.
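// E.g. a byte fill of 13 elements (assuming 'to' was already 8-byte
// aligned): the word loop writes 8 bytes and leaves count == 5; 'to' is
// advanced to the end of the region and the str at offset -8 rewrites
// the final 8 bytes, harmlessly overlapping 3 bytes already filled with
// the same value.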
2528 if (t == T_BYTE || t == T_SHORT) {
2529 Label L_exit1;
2530 __ cbzw(count, L_exit1);
2531 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2532 __ str(value, Address(to, -8)); // overwrite some elements
2533 __ bind(L_exit1);
2534 __ leave();
2535 __ ret(lr);
2536 }
2537
2538 // Handle fills of less than 8 bytes.
2539 Label L_fill_2, L_fill_4, L_exit2;
2540 __ bind(L_fill_elements);
2541 switch (t) {
2542 case T_BYTE:
2543 __ tbz(count, 0, L_fill_2);
2544 __ strb(value, Address(__ post(to, 1)));
2545 __ bind(L_fill_2);
2546 __ tbz(count, 1, L_fill_4);
2547 __ strh(value, Address(__ post(to, 2)));
2548 __ bind(L_fill_4);
2549 __ tbz(count, 2, L_exit2);
2550 __ strw(value, Address(to));
2551 break;
2552 case T_SHORT:
2553 __ tbz(count, 0, L_fill_4);
2554 __ strh(value, Address(__ post(to, 2)));
2555 __ bind(L_fill_4);
2556 __ tbz(count, 1, L_exit2);
2557 __ strw(value, Address(to));
2558 break;
2559 case T_INT:
2560 __ cbzw(count, L_exit2);
2561 __ strw(value, Address(to));
2562 break;
2563 default: ShouldNotReachHere();
2564 }
2565 __ bind(L_exit2);
2566 __ leave();
2567 __ ret(lr);
2568 return start;
2569 }
2570
2571 address generate_unsafecopy_common_error_exit() {
2572 address start_pc = __ pc();
2573 __ leave();
2574 __ mov(r0, 0);
2575 __ ret(lr);
2576 return start_pc;
2577 }
2578
2579 //
2580 // Generate 'unsafe' set memory stub
2581 // Though just as safe as the other stubs, it takes an unscaled
2582 // size_t (# bytes) argument instead of an element count.
2583 //
2584 // This fill operation is atomicity preserving: as long as the
2585 // address supplied is sufficiently aligned, all writes of up to 64
2586 // bits in size are single-copy atomic.
2587 //
2588 // Input:
2589 // c_rarg0 - destination array address
2590 // c_rarg1 - byte count (size_t)
2591 // c_rarg2 - byte value
2592 //
2593 address generate_unsafe_setmemory() {
2594 __ align(CodeEntryAlignment);
2595 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
2596 address start = __ pc();
2597
2598 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
2599 Label tail;
2600
2601 UnsafeMemoryAccessMark umam(this, true, false);
2602
2603 __ enter(); // required for proper stackwalking of RuntimeStub frame
2604
2605 __ dup(v0, __ T16B, value);
2606
2607 if (AvoidUnalignedAccesses) {
2608 __ cmp(count, (u1)16);
2609 __ br(__ LO, tail);
2610
2611 __ mov(rscratch1, 16);
2612 __ andr(rscratch2, dest, 15);
2613 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
2614 __ strq(v0, Address(dest));
2615 __ sub(count, count, rscratch1);
2616 __ add(dest, dest, rscratch1);
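// The (possibly unaligned) strq above and the aligned stores that follow
// overlap by design: dest advances by exactly the number of bytes needed
// to reach 16-byte alignment (a full 16 when dest is already aligned),
// so every byte is still written at least once.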
2617 }
2618
2619 __ subs(count, count, (u1)64);
2620 __ br(__ LO, tail);
2621 {
2622 Label again;
2623 __ bind(again);
2624 __ stpq(v0, v0, Address(dest));
2625 __ stpq(v0, v0, Address(dest, 32));
2626
2627 __ subs(count, count, 64);
2628 __ add(dest, dest, 64);
2629 __ br(__ HS, again);
2630 }
2631
2632 __ bind(tail);
2633 // The count of bytes is off by 64, but we don't need to correct
2634 // it because we're only going to use the least-significant few
2635 // count bits from here on.
2636 // __ add(count, count, 64);
2637
2638 {
2639 Label dont;
2640 __ tbz(count, exact_log2(32), dont);
2641 __ stpq(v0, v0, __ post(dest, 32));
2642 __ bind(dont);
2643 }
2644 {
2645 Label dont;
2646 __ tbz(count, exact_log2(16), dont);
2647 __ strq(v0, __ post(dest, 16));
2648 __ bind(dont);
2649 }
2650 {
2651 Label dont;
2652 __ tbz(count, exact_log2(8), dont);
2653 __ strd(v0, __ post(dest, 8));
2654 __ bind(dont);
2655 }
2656
2657 Label finished;
2658 __ tst(count, 7);
2659 __ br(__ EQ, finished);
2660
2661 {
2662 Label dont;
2663 __ tbz(count, exact_log2(4), dont);
2664 __ strs(v0, __ post(dest, 4));
2665 __ bind(dont);
2666 }
2667 {
2668 Label dont;
2669 __ tbz(count, exact_log2(2), dont);
2670 __ bfi(value, value, 8, 8);
2671 __ strh(value, __ post(dest, 2));
2672 __ bind(dont);
2673 }
2674 {
2675 Label dont;
2676 __ tbz(count, exact_log2(1), dont);
2677 __ strb(value, Address(dest));
2678 __ bind(dont);
2679 }
2680
2681 __ bind(finished);
2682 __ leave();
2683 __ ret(lr);
2684
2685 return start;
2686 }
2687
2688 address generate_data_cache_writeback() {
2689 const Register line = c_rarg0; // address of line to write back
2690
2691 __ align(CodeEntryAlignment);
2692
2693 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
2694 StubCodeMark mark(this, stub_id);
2695
2696 address start = __ pc();
2697 __ enter();
2698 __ cache_wb(Address(line, 0));
2699 __ leave();
2700 __ ret(lr);
2701
2702 return start;
2703 }
2704
2705 address generate_data_cache_writeback_sync() {
2706 const Register is_pre = c_rarg0; // pre or post sync
2707
2708 __ align(CodeEntryAlignment);
2709
2710 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
2711 StubCodeMark mark(this, stub_id);
2712
2713 // pre wbsync is a no-op
2714 // post wbsync translates to a memory barrier (x86 would use an sfence)
2715
2716 Label skip;
2717 address start = __ pc();
2718 __ enter();
2719 __ cbnz(is_pre, skip);
2720 __ cache_wbsync(false);
2721 __ bind(skip);
2722 __ leave();
2723 __ ret(lr);
2724
2725 return start;
2726 }
2727
2728 void generate_arraycopy_stubs() {
2729 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
2730 // entry immediately following their stack push. This can be used
2731 // as a post-push branch target for compatible stubs when they
2732 // identify a special case that can be handled by the fallback
2733 // stub, e.g. a disjoint copy stub may be used as a special-case
2734 // fallback for its compatible conjoint copy stub.
2735 //
2736 // A nopush entry is always returned in the following local and
2737 // then published by assigning to the appropriate entry field in
2738 // class StubRoutines. The entry value is then passed to the
2739 // generator for the compatible stub. That means the entry must be
2740 // listed when saving to/restoring from the AOT cache, ensuring
2741 // that the inter-stub jumps are noted at AOT-cache save and
2742 // relocated at AOT cache load.
2743 address nopush_entry;
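// For example, the conjoint StubRoutines::_jbyte_arraycopy generated
// below branches to StubRoutines::_jbyte_disjoint_arraycopy_nopush when
// its source and destination ranges turn out not to overlap.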
2744
2745 // generate the common exit first so later stubs can rely on it if
2746 // they want an UnsafeMemoryAccess exit non-local to the stub
2747 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2748 // register the stub as the default exit with class UnsafeMemoryAccess
2749 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2750
2751 // generate and publish aarch64-specific bulk copy routines first
2752 // so we can call them from other copy stubs
2753 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2754 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2755
2756 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2757 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2758
2759 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2760 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2761
2762 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2763
2764 //*** jbyte
2765 // Always need aligned and unaligned versions
2766 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2767 // disjoint nopush entry is needed by conjoint copy
2768 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2769 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2770 // conjoint nopush entry is needed by generic/unsafe copy
2771 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2772 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2773 // disjoint arrayof nopush entry is needed by conjoint copy
2774 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2775 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2776
2777 //*** jshort
2778 // Always need aligned and unaligned versions
2779 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2780 // disjoint nopush entry is needed by conjoint copy
2781 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
2782 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
2783 // conjoint nopush entry is used by generic/unsafe copy
2784 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
2785 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
2786 // disjoint arrayof nopush entry is needed by conjoint copy
2787 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
2788 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
2789
2790 //*** jint
2791 // Aligned versions
2792 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
2793 // disjoint arrayof nopush entry is needed by conjoint copy
2794 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
2795 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
2796 // On 64-bit we need both aligned and unaligned versions of jint arraycopy.
2797 // jint_arraycopy_nopush always points to the unaligned version.
2798 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
2799 // disjoint nopush entry is needed by conjoint copy
2800 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
2801 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
2802 // conjoint nopush entry is needed by generic/unsafe copy
2803 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
2804
2805 //*** jlong
2806 // It is always aligned
2807 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
2808 // disjoint arrayof nopush entry is needed by conjoint copy
2809 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
2810 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
2811 // conjoint nopush entry is needed by generic/unsafe copy
2812 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
2813 // disjoint normal/nopush and conjoint normal entries are not
2814 // generated since the arrayof versions are the same
2815 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2816 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
2817 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
2818
2819 //*** oops
2820 {
2821 StubRoutines::_arrayof_oop_disjoint_arraycopy
2822 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
2823 // disjoint arrayof nopush entry is needed by conjoint copy
2824 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
2825 StubRoutines::_arrayof_oop_arraycopy
2826 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
2827 // conjoint arrayof nopush entry is needed by generic/unsafe copy
2828 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
2829 // Aligned versions without pre-barriers
2830 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2831 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
2832 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
2833 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
2834 // note that we don't need a returned nopush entry because the
2835 // generic/unsafe copy does not cater for uninit arrays.
2836 StubRoutines::_arrayof_oop_arraycopy_uninit
2837 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
2838 }
2839
2840 // for oop copies reuse arrayof entries for non-arrayof cases
2841 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2842 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
2843 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
2844 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2845 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
2846 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
2847
2848 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
2849 // checkcast nopush entry is needed by generic copy
2850 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
2851 // note that we don't need a returned nopush entry because the
2852 // generic copy does not cater for uninit arrays.
2853 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
2854
2855 // unsafe arraycopy may fall back on the conjoint stubs
2856 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
2857 StubRoutines::_jshort_arraycopy_nopush,
2858 StubRoutines::_jint_arraycopy_nopush,
2859 StubRoutines::_jlong_arraycopy_nopush);
2860
2861 // generic arraycopy may fall back on the conjoint stubs
2862 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
2863 StubRoutines::_jshort_arraycopy_nopush,
2864 StubRoutines::_jint_arraycopy_nopush,
2865 StubRoutines::_oop_arraycopy_nopush,
2866 StubRoutines::_jlong_arraycopy_nopush,
2867 StubRoutines::_checkcast_arraycopy_nopush);
2868
2869 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
2870 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
2871 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
2872 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
2873 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
2874 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
2875 }
2876
2877 void generate_math_stubs() { Unimplemented(); }
2878
2879 // Arguments:
2880 //
2881 // Inputs:
2882 // c_rarg0 - source byte array address
2883 // c_rarg1 - destination byte array address
2884 // c_rarg2 - K (key) in little endian int array
2885 //
2886 address generate_aescrypt_encryptBlock() {
2887 __ align(CodeEntryAlignment);
2888 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2889 StubCodeMark mark(this, stub_id);
2890
2891 const Register from = c_rarg0; // source array address
2892 const Register to = c_rarg1; // destination array address
2893 const Register key = c_rarg2; // key array address
2894 const Register keylen = rscratch1;
2895
2896 address start = __ pc();
2897 __ enter();
2898
2899 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2900
2901 __ aesenc_loadkeys(key, keylen);
2902 __ aesecb_encrypt(from, to, keylen);
2903
2904 __ mov(r0, 0);
2905
2906 __ leave();
2907 __ ret(lr);
2908
2909 return start;
2910 }
2911
2912 // Arguments:
2913 //
2914 // Inputs:
2915 // c_rarg0 - source byte array address
2916 // c_rarg1 - destination byte array address
2917 // c_rarg2 - K (key) in little endian int array
2918 //
2919 address generate_aescrypt_decryptBlock() {
2920 assert(UseAES, "need AES cryptographic extension support");
2921 __ align(CodeEntryAlignment);
2922 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2923 StubCodeMark mark(this, stub_id);
2924 Label L_doLast;
2925
2926 const Register from = c_rarg0; // source array address
2927 const Register to = c_rarg1; // destination array address
2928 const Register key = c_rarg2; // key array address
2929 const Register keylen = rscratch1;
2930
2931 address start = __ pc();
2932 __ enter(); // required for proper stackwalking of RuntimeStub frame
2933
2934 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2935
2936 __ aesecb_decrypt(from, to, key, keylen);
2937
2938 __ mov(r0, 0);
2939
2940 __ leave();
2941 __ ret(lr);
2942
2943 return start;
2944 }
2945
2946 // Arguments:
2947 //
2948 // Inputs:
2949 // c_rarg0 - source byte array address
2950 // c_rarg1 - destination byte array address
2951 // c_rarg2 - K (key) in little endian int array
2952 // c_rarg3 - r vector byte array address
2953 // c_rarg4 - input length
2954 //
2955 // Output:
2956 // r0 - input length
2957 //
2958 address generate_cipherBlockChaining_encryptAESCrypt() {
2959 assert(UseAES, "need AES cryptographic extension support");
2960 __ align(CodeEntryAlignment);
2961 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
2962 StubCodeMark mark(this, stub_id);
2963
2964 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2965
2966 const Register from = c_rarg0; // source array address
2967 const Register to = c_rarg1; // destination array address
2968 const Register key = c_rarg2; // key array address
2969 const Register rvec = c_rarg3; // r byte array initialized from the initial vector (IV)
2970 // and left holding the last ciphertext block on exit
2971 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
2972 const Register keylen = rscratch1;
2973
2974 address start = __ pc();
2975
2976 __ enter();
2977
2978 __ movw(rscratch2, len_reg);
2979
2980 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2981
2982 __ ld1(v0, __ T16B, rvec);
2983
2984 __ cmpw(keylen, 52);
2985 __ br(Assembler::CC, L_loadkeys_44);
2986 __ br(Assembler::EQ, L_loadkeys_52);
2987
2988 __ ld1(v17, v18, __ T16B, __ post(key, 32));
2989 __ rev32(v17, __ T16B, v17);
2990 __ rev32(v18, __ T16B, v18);
2991 __ BIND(L_loadkeys_52);
2992 __ ld1(v19, v20, __ T16B, __ post(key, 32));
2993 __ rev32(v19, __ T16B, v19);
2994 __ rev32(v20, __ T16B, v20);
2995 __ BIND(L_loadkeys_44);
2996 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2997 __ rev32(v21, __ T16B, v21);
2998 __ rev32(v22, __ T16B, v22);
2999 __ rev32(v23, __ T16B, v23);
3000 __ rev32(v24, __ T16B, v24);
3001 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3002 __ rev32(v25, __ T16B, v25);
3003 __ rev32(v26, __ T16B, v26);
3004 __ rev32(v27, __ T16B, v27);
3005 __ rev32(v28, __ T16B, v28);
3006 __ ld1(v29, v30, v31, __ T16B, key);
3007 __ rev32(v29, __ T16B, v29);
3008 __ rev32(v30, __ T16B, v30);
3009 __ rev32(v31, __ T16B, v31);
3010
3011 __ BIND(L_aes_loop);
3012 __ ld1(v1, __ T16B, __ post(from, 16));
3013 __ eor(v0, __ T16B, v0, v1);
3014
3015 __ br(Assembler::CC, L_rounds_44);
3016 __ br(Assembler::EQ, L_rounds_52);
3017
3018 __ aese(v0, v17); __ aesmc(v0, v0);
3019 __ aese(v0, v18); __ aesmc(v0, v0);
3020 __ BIND(L_rounds_52);
3021 __ aese(v0, v19); __ aesmc(v0, v0);
3022 __ aese(v0, v20); __ aesmc(v0, v0);
3023 __ BIND(L_rounds_44);
3024 __ aese(v0, v21); __ aesmc(v0, v0);
3025 __ aese(v0, v22); __ aesmc(v0, v0);
3026 __ aese(v0, v23); __ aesmc(v0, v0);
3027 __ aese(v0, v24); __ aesmc(v0, v0);
3028 __ aese(v0, v25); __ aesmc(v0, v0);
3029 __ aese(v0, v26); __ aesmc(v0, v0);
3030 __ aese(v0, v27); __ aesmc(v0, v0);
3031 __ aese(v0, v28); __ aesmc(v0, v0);
3032 __ aese(v0, v29); __ aesmc(v0, v0);
3033 __ aese(v0, v30);
3034 __ eor(v0, __ T16B, v0, v31);
3035
3036 __ st1(v0, __ T16B, __ post(to, 16));
3037
3038 __ subw(len_reg, len_reg, 16);
3039 __ cbnzw(len_reg, L_aes_loop);
3040
3041 __ st1(v0, __ T16B, rvec);
3042
3043 __ mov(r0, rscratch2);
3044
3045 __ leave();
3046 __ ret(lr);
3047
3048 return start;
3049 }
3050
3051 // Arguments:
3052 //
3053 // Inputs:
3054 // c_rarg0 - source byte array address
3055 // c_rarg1 - destination byte array address
3056 // c_rarg2 - K (key) in little endian int array
3057 // c_rarg3 - r vector byte array address
3058 // c_rarg4 - input length
3059 //
3060 // Output:
3061 // r0 - input length
3062 //
3063 address generate_cipherBlockChaining_decryptAESCrypt() {
3064 assert(UseAES, "need AES cryptographic extension support");
3065 __ align(CodeEntryAlignment);
3066 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
3067 StubCodeMark mark(this, stub_id);
3068
3069 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3070
3071 const Register from = c_rarg0; // source array address
3072 const Register to = c_rarg1; // destination array address
3073 const Register key = c_rarg2; // key array address
3074 const Register rvec = c_rarg3; // r byte array initialized from the initial vector (IV)
3075 // and left holding the last input ciphertext block on exit
3076 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3077 const Register keylen = rscratch1;
3078
3079 address start = __ pc();
3080
3081 __ enter();
3082
3083 __ movw(rscratch2, len_reg);
3084
3085 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3086
3087 __ ld1(v2, __ T16B, rvec);
3088
3089 __ ld1(v31, __ T16B, __ post(key, 16));
3090 __ rev32(v31, __ T16B, v31);
3091
3092 __ cmpw(keylen, 52);
3093 __ br(Assembler::CC, L_loadkeys_44);
3094 __ br(Assembler::EQ, L_loadkeys_52);
3095
3096 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3097 __ rev32(v17, __ T16B, v17);
3098 __ rev32(v18, __ T16B, v18);
3099 __ BIND(L_loadkeys_52);
3100 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3101 __ rev32(v19, __ T16B, v19);
3102 __ rev32(v20, __ T16B, v20);
3103 __ BIND(L_loadkeys_44);
3104 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3105 __ rev32(v21, __ T16B, v21);
3106 __ rev32(v22, __ T16B, v22);
3107 __ rev32(v23, __ T16B, v23);
3108 __ rev32(v24, __ T16B, v24);
3109 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3110 __ rev32(v25, __ T16B, v25);
3111 __ rev32(v26, __ T16B, v26);
3112 __ rev32(v27, __ T16B, v27);
3113 __ rev32(v28, __ T16B, v28);
3114 __ ld1(v29, v30, __ T16B, key);
3115 __ rev32(v29, __ T16B, v29);
3116 __ rev32(v30, __ T16B, v30);
3117
3118 __ BIND(L_aes_loop);
3119 __ ld1(v0, __ T16B, __ post(from, 16));
3120 __ orr(v1, __ T16B, v0, v0);
3121
3122 __ br(Assembler::CC, L_rounds_44);
3123 __ br(Assembler::EQ, L_rounds_52);
3124
3125 __ aesd(v0, v17); __ aesimc(v0, v0);
3126 __ aesd(v0, v18); __ aesimc(v0, v0);
3127 __ BIND(L_rounds_52);
3128 __ aesd(v0, v19); __ aesimc(v0, v0);
3129 __ aesd(v0, v20); __ aesimc(v0, v0);
3130 __ BIND(L_rounds_44);
3131 __ aesd(v0, v21); __ aesimc(v0, v0);
3132 __ aesd(v0, v22); __ aesimc(v0, v0);
3133 __ aesd(v0, v23); __ aesimc(v0, v0);
3134 __ aesd(v0, v24); __ aesimc(v0, v0);
3135 __ aesd(v0, v25); __ aesimc(v0, v0);
3136 __ aesd(v0, v26); __ aesimc(v0, v0);
3137 __ aesd(v0, v27); __ aesimc(v0, v0);
3138 __ aesd(v0, v28); __ aesimc(v0, v0);
3139 __ aesd(v0, v29); __ aesimc(v0, v0);
3140 __ aesd(v0, v30);
3141 __ eor(v0, __ T16B, v0, v31);
3142 __ eor(v0, __ T16B, v0, v2);
3143
3144 __ st1(v0, __ T16B, __ post(to, 16));
3145 __ orr(v2, __ T16B, v1, v1);
3146
3147 __ subw(len_reg, len_reg, 16);
3148 __ cbnzw(len_reg, L_aes_loop);
3149
3150 __ st1(v2, __ T16B, rvec);
3151
3152 __ mov(r0, rscratch2);
3153
3154 __ leave();
3155 __ ret(lr);
3156
3157 return start;
3158 }
3159
3160 // Big-endian 128-bit + 64-bit -> 128-bit addition.
3161 // Inputs: in (the 128-bit addend); preserved unless result aliases it.
3162 // The least-significant 64-bit word is in the upper dword of each vector.
3163 // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
3164 // Output: result
3165 void be_add_128_64(FloatRegister result, FloatRegister in,
3166 FloatRegister inc, FloatRegister tmp) {
3167 assert_different_registers(result, tmp, inc);
3168
3169 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
3170 // input
3171 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
3172 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
3173 // MSD == 0 (must be!) to LSD
3174 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
3175 }
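// Worked example (comment only): with the least-significant dword of
// 'in' equal to 0xFFFFFFFFFFFFFFFF and inc == 1, the addv wraps that
// lane to 0, the unsigned HI compare sets the corresponding lane of tmp
// to all-ones (and the other lane, where inc is 0, to zero), the ext
// swaps the two dwords, and the final subv subtracts -1 from the
// most-significant dword, propagating the carry.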
3176
3177 // CTR AES crypt.
3178 // Arguments:
3179 //
3180 // Inputs:
3181 // c_rarg0 - source byte array address
3182 // c_rarg1 - destination byte array address
3183 // c_rarg2 - K (key) in little endian int array
3184 // c_rarg3 - counter vector byte array address
3185 // c_rarg4 - input length
3186 // c_rarg5 - saved encryptedCounter start
3187 // c_rarg6 - saved used length
3188 //
3189 // Output:
3190 // r0 - input length
3191 //
3192 address generate_counterMode_AESCrypt() {
3193 const Register in = c_rarg0;
3194 const Register out = c_rarg1;
3195 const Register key = c_rarg2;
3196 const Register counter = c_rarg3;
3197 const Register saved_len = c_rarg4, len = r10;
3198 const Register saved_encrypted_ctr = c_rarg5;
3199 const Register used_ptr = c_rarg6, used = r12;
3200
3201 const Register offset = r7;
3202 const Register keylen = r11;
3203
3204 const unsigned char block_size = 16;
3205 const int bulk_width = 4;
3206 // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3207 // performance with larger data sizes, but it also means that the
3208 // fast path isn't used until you have at least 8 blocks, and up
3209 // to 127 bytes of data will be executed on the slow path. For
3210 // that reason, and also so as not to blow away too much icache, 4
3211 // blocks seems like a sensible compromise.
3212
3213 // Algorithm:
3214 //
3215 // if (len == 0) {
3216 // goto DONE;
3217 // }
3218 // int result = len;
3219 // do {
3220 // if (used >= blockSize) {
3221 // if (len >= bulk_width * blockSize) {
3222 // CTR_large_block();
3223 // if (len == 0)
3224 // goto DONE;
3225 // }
3226 // for (;;) {
3227 // 16ByteVector v0 = counter;
3228 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3229 // used = 0;
3230 // if (len < blockSize)
3231 // break; /* goto NEXT */
3232 // 16ByteVector v1 = load16Bytes(in, offset);
3233 // v1 = v1 ^ encryptedCounter;
    //      store16Bytes(v1, out, offset);
3235 // used = blockSize;
3236 // offset += blockSize;
3237 // len -= blockSize;
3238 // if (len == 0)
3239 // goto DONE;
3240 // }
3241 // }
3242 // NEXT:
3243 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3244 // len--;
3245 // } while (len != 0);
3246 // DONE:
3247 // return result;
3248 //
3249 // CTR_large_block()
3250 // Wide bulk encryption of whole blocks.
3251
3252 __ align(CodeEntryAlignment);
3253 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
3254 StubCodeMark mark(this, stub_id);
3255 const address start = __ pc();
3256 __ enter();
3257
3258 Label DONE, CTR_large_block, large_block_return;
3259 __ ldrw(used, Address(used_ptr));
3260 __ cbzw(saved_len, DONE);
3261
3262 __ mov(len, saved_len);
3263 __ mov(offset, 0);
3264
3265 // Compute #rounds for AES based on the length of the key array
3266 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3267
3268 __ aesenc_loadkeys(key, keylen);
3269
3270 {
3271 Label L_CTR_loop, NEXT;
3272
3273 __ bind(L_CTR_loop);
3274
3275 __ cmp(used, block_size);
3276 __ br(__ LO, NEXT);
3277
3278 // Maybe we have a lot of data
3279 __ subsw(rscratch1, len, bulk_width * block_size);
3280 __ br(__ HS, CTR_large_block);
3281 __ BIND(large_block_return);
3282 __ cbzw(len, DONE);
3283
3284 // Setup the counter
3285 __ movi(v4, __ T4S, 0);
3286 __ movi(v5, __ T4S, 1);
3287 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3288
3289 // 128-bit big-endian increment
3290 __ ld1(v0, __ T16B, counter);
3291 __ rev64(v16, __ T16B, v0);
3292 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3293 __ rev64(v16, __ T16B, v16);
3294 __ st1(v16, __ T16B, counter);
3295 // Previous counter value is in v0
3296 // v4 contains { 0, 1 }
3297
3298 {
3299 // We have fewer than bulk_width blocks of data left. Encrypt
3300 // them one by one until there is less than a full block
3301 // remaining, being careful to save both the encrypted counter
3302 // and the counter.
3303
3304 Label inner_loop;
3305 __ bind(inner_loop);
3306 // Counter to encrypt is in v0
3307 __ aesecb_encrypt(noreg, noreg, keylen);
3308 __ st1(v0, __ T16B, saved_encrypted_ctr);
3309
3310 // Do we have a remaining full block?
3311
3312 __ mov(used, 0);
3313 __ cmp(len, block_size);
3314 __ br(__ LO, NEXT);
3315
3316 // Yes, we have a full block
3317 __ ldrq(v1, Address(in, offset));
3318 __ eor(v1, __ T16B, v1, v0);
3319 __ strq(v1, Address(out, offset));
3320 __ mov(used, block_size);
3321 __ add(offset, offset, block_size);
3322
3323 __ subw(len, len, block_size);
3324 __ cbzw(len, DONE);
3325
3326 // Increment the counter, store it back
3327 __ orr(v0, __ T16B, v16, v16);
3328 __ rev64(v16, __ T16B, v16);
3329 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3330 __ rev64(v16, __ T16B, v16);
3331 __ st1(v16, __ T16B, counter); // Save the incremented counter back
3332
3333 __ b(inner_loop);
3334 }
3335
3336 __ BIND(NEXT);
3337
3338 // Encrypt a single byte, and loop.
3339 // We expect this to be a rare event.
3340 __ ldrb(rscratch1, Address(in, offset));
3341 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3342 __ eor(rscratch1, rscratch1, rscratch2);
3343 __ strb(rscratch1, Address(out, offset));
3344 __ add(offset, offset, 1);
3345 __ add(used, used, 1);
      __ subw(len, len, 1);
3347 __ cbnzw(len, L_CTR_loop);
3348 }
3349
3350 __ bind(DONE);
3351 __ strw(used, Address(used_ptr));
3352 __ mov(r0, saved_len);
3353
3354 __ leave(); // required for proper stackwalking of RuntimeStub frame
3355 __ ret(lr);
3356
3357 // Bulk encryption
3358
    __ BIND(CTR_large_block);
3360 assert(bulk_width == 4 || bulk_width == 8, "must be");
3361
3362 if (bulk_width == 8) {
3363 __ sub(sp, sp, 4 * 16);
3364 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3365 }
3366 __ sub(sp, sp, 4 * 16);
3367 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3368 RegSet saved_regs = (RegSet::of(in, out, offset)
3369 + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3370 __ push(saved_regs, sp);
3371 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption
3372 __ add(in, in, offset);
3373 __ add(out, out, offset);
3374
3375 // Keys should already be loaded into the correct registers
3376
3377 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3378 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3379
3380 // AES/CTR loop
3381 {
3382 Label L_CTR_loop;
3383 __ BIND(L_CTR_loop);
3384
3385 // Setup the counters
3386 __ movi(v8, __ T4S, 0);
3387 __ movi(v9, __ T4S, 1);
3388 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3389
3390 for (int i = 0; i < bulk_width; i++) {
3391 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3392 __ rev64(v0_ofs, __ T16B, v16);
3393 be_add_128_64(v16, v16, v8, /*tmp*/v9);
3394 }
3395
3396 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3397
3398 // Encrypt the counters
3399 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3400
3401 if (bulk_width == 8) {
3402 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3403 }
3404
3405 // XOR the encrypted counters with the inputs
3406 for (int i = 0; i < bulk_width; i++) {
3407 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3408 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3409 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3410 }
3411
3412 // Write the encrypted data
3413 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3414 if (bulk_width == 8) {
3415 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3416 }
3417
3418 __ subw(len, len, 16 * bulk_width);
3419 __ cbnzw(len, L_CTR_loop);
3420 }
3421
3422 // Save the counter back where it goes
3423 __ rev64(v16, __ T16B, v16);
3424 __ st1(v16, __ T16B, counter);
3425
3426 __ pop(saved_regs, sp);
3427
3428 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3429 if (bulk_width == 8) {
3430 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3431 }
3432
3433 __ andr(rscratch1, len, -16 * bulk_width);
3434 __ sub(len, len, rscratch1);
3435 __ add(offset, offset, rscratch1);
3436 __ mov(used, 16);
3437 __ strw(used, Address(used_ptr));
3438 __ b(large_block_return);
3439
3440 return start;
3441 }
3442
3443 // Vector AES Galois Counter Mode implementation. Parameters:
3444 //
3445 // in = c_rarg0
3446 // len = c_rarg1
3447 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3448 // out = c_rarg3
3449 // key = c_rarg4
3450 // state = c_rarg5 - GHASH.state
3451 // subkeyHtbl = c_rarg6 - powers of H
3452 // counter = c_rarg7 - 16 bytes of CTR
3453 // return - number of processed bytes
3454 address generate_galoisCounterMode_AESCrypt() {
3455 Label ghash_polynomial; // local data generated after code
3456
3457 __ align(CodeEntryAlignment);
3458 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
3459 StubCodeMark mark(this, stub_id);
3460 address start = __ pc();
3461 __ enter();
3462
3463 const Register in = c_rarg0;
3464 const Register len = c_rarg1;
3465 const Register ct = c_rarg2;
    const Register out = c_rarg3;
3468
3469 const Register key = c_rarg4;
3470 const Register state = c_rarg5;
3471
3472 const Register subkeyHtbl = c_rarg6;
3473
    const Register counter = c_rarg7;  // holds the initial counter value and is
                                       // updated with the incremented counter at the end
3475
3476 const Register keylen = r10;
3477 // Save state before entering routine
3478 __ sub(sp, sp, 4 * 16);
3479 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3480 __ sub(sp, sp, 4 * 16);
3481 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3482
3484 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
3485 __ str(len, __ pre(sp, -2 * wordSize));
3486
3487 Label DONE;
3488 __ cbz(len, DONE);
3489
3490 // Compute #rounds for AES based on the length of the key array
3491 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3492
3493 __ aesenc_loadkeys(key, keylen);
3494 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3495 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3496
3497 // AES/CTR loop
3498 {
3499 Label L_CTR_loop;
3500 __ BIND(L_CTR_loop);
3501
3502 // Setup the counters
3503 __ movi(v8, __ T4S, 0);
3504 __ movi(v9, __ T4S, 1);
3505 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3506
3507 assert(v0->encoding() < v8->encoding(), "");
3508 for (int i = v0->encoding(); i < v8->encoding(); i++) {
3509 FloatRegister f = as_FloatRegister(i);
3510 __ rev32(f, __ T16B, v16);
3511 __ addv(v16, __ T4S, v16, v8);
3512 }
3513
3514 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3515
3516 // Encrypt the counters
3517 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3518
3519 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3520
3521 // XOR the encrypted counters with the inputs
3522 for (int i = 0; i < 8; i++) {
3523 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3524 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3525 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3526 }
3527 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3528 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3529
3530 __ subw(len, len, 16 * 8);
3531 __ cbnzw(len, L_CTR_loop);
3532 }
3533
3534 __ rev32(v16, __ T16B, v16);
3535 __ st1(v16, __ T16B, counter);
3536
3537 __ ldr(len, Address(sp));
3538 __ lsr(len, len, exact_log2(16)); // We want the count of blocks
3539
3540 // GHASH/CTR loop
3541 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3542 len, /*unrolls*/4);
3543
3544 #ifdef ASSERT
3545 { Label L;
3546 __ cmp(len, (unsigned char)0);
3547 __ br(Assembler::EQ, L);
3548 __ stop("stubGenerator: abort");
3549 __ bind(L);
3550 }
3551 #endif
3552
3553 __ bind(DONE);
3554 // Return the number of bytes processed
3555 __ ldr(r0, __ post(sp, 2 * wordSize));
3556
3557 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3558 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3559
3560 __ leave(); // required for proper stackwalking of RuntimeStub frame
3561 __ ret(lr);
3562
3563 // bind label and generate polynomial data
3564 __ align(wordSize * 2);
3565 __ bind(ghash_polynomial);
3566 __ emit_int64(0x87); // The low-order bits of the field
3567 // polynomial (i.e. p = z^7+z^2+z+1)
3568 // repeated in the low and high parts of a
3569 // 128-bit vector
3570 __ emit_int64(0x87);
3571
3572 return start;
3573 }
3574
3575 class Cached64Bytes {
3576 private:
3577 MacroAssembler *_masm;
3578 Register _regs[8];
3579
3580 public:
3581 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "%u registers provided, need 8 to cache 16 4-byte words", rs.size());
3583 auto it = rs.begin();
3584 for (auto &r: _regs) {
3585 r = *it;
3586 ++it;
3587 }
3588 }
3589
3590 void gen_loads(Register base) {
3591 for (int i = 0; i < 8; i += 2) {
3592 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3593 }
3594 }
3595
3596 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3597 void extract_u32(Register dest, int i) {
3598 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3599 }
3600 };
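  // Usage sketch for Cached64Bytes (illustrative): after gen_loads(buf),
  // 4-byte word k of the 64-byte block sits in bits [32*(k%2), 32*(k%2)+31]
  // of _regs[k/2], so
  //
  //   reg_cache.extract_u32(rscratch1, 5);  // rscratch1 = 32-bit word 5 of the block
  //
  // lets the MD5 round helpers below fetch x[k] without reloading it from
  // memory.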
3601
3602 // Utility routines for md5.
3603 // Clobbers r10 and r11.
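  // For reference, the four MD5 round functions (RFC 1321) and the
  // identities the helpers below rely on:
  //   F(b,c,d) = (b & c) | (~b & d)  computed as  ((c ^ d) & b) ^ d   (md5_FF)
  //   G(b,c,d) = (b & d) | (c & ~d)  the two terms are bit-disjoint, so
  //                                  md5_GG adds them instead of ORing
  //   H(b,c,d) = b ^ c ^ d                                            (md5_HH)
  //   I(b,c,d) = c ^ (b | ~d)                                         (md5_II)
  // Each helper then computes r1 = r2 + rol32(r1 + Fn(r2,r3,r4) + x[k] + t, s).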
3604 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3605 int k, int s, int t) {
3606 Register rscratch3 = r10;
3607 Register rscratch4 = r11;
3608
3609 __ eorw(rscratch3, r3, r4);
3610 __ movw(rscratch2, t);
3611 __ andw(rscratch3, rscratch3, r2);
3612 __ addw(rscratch4, r1, rscratch2);
3613 reg_cache.extract_u32(rscratch1, k);
3614 __ eorw(rscratch3, rscratch3, r4);
3615 __ addw(rscratch4, rscratch4, rscratch1);
3616 __ addw(rscratch3, rscratch3, rscratch4);
3617 __ rorw(rscratch2, rscratch3, 32 - s);
3618 __ addw(r1, rscratch2, r2);
3619 }
3620
3621 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3622 int k, int s, int t) {
3623 Register rscratch3 = r10;
3624 Register rscratch4 = r11;
3625
3626 reg_cache.extract_u32(rscratch1, k);
3627 __ movw(rscratch2, t);
3628 __ addw(rscratch4, r1, rscratch2);
3629 __ addw(rscratch4, rscratch4, rscratch1);
3630 __ bicw(rscratch2, r3, r4);
3631 __ andw(rscratch3, r2, r4);
3632 __ addw(rscratch2, rscratch2, rscratch4);
3633 __ addw(rscratch2, rscratch2, rscratch3);
3634 __ rorw(rscratch2, rscratch2, 32 - s);
3635 __ addw(r1, rscratch2, r2);
3636 }
3637
3638 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3639 int k, int s, int t) {
3640 Register rscratch3 = r10;
3641 Register rscratch4 = r11;
3642
3643 __ eorw(rscratch3, r3, r4);
3644 __ movw(rscratch2, t);
3645 __ addw(rscratch4, r1, rscratch2);
3646 reg_cache.extract_u32(rscratch1, k);
3647 __ eorw(rscratch3, rscratch3, r2);
3648 __ addw(rscratch4, rscratch4, rscratch1);
3649 __ addw(rscratch3, rscratch3, rscratch4);
3650 __ rorw(rscratch2, rscratch3, 32 - s);
3651 __ addw(r1, rscratch2, r2);
3652 }
3653
3654 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3655 int k, int s, int t) {
3656 Register rscratch3 = r10;
3657 Register rscratch4 = r11;
3658
3659 __ movw(rscratch3, t);
3660 __ ornw(rscratch2, r2, r4);
3661 __ addw(rscratch4, r1, rscratch3);
3662 reg_cache.extract_u32(rscratch1, k);
3663 __ eorw(rscratch3, rscratch2, r3);
3664 __ addw(rscratch4, rscratch4, rscratch1);
3665 __ addw(rscratch3, rscratch3, rscratch4);
3666 __ rorw(rscratch2, rscratch3, 32 - s);
3667 __ addw(r1, rscratch2, r2);
3668 }
3669
3670 // Arguments:
3671 //
3672 // Inputs:
3673 // c_rarg0 - byte[] source+offset
3674 // c_rarg1 - int[] SHA.state
3675 // c_rarg2 - int offset
3676 // c_rarg3 - int limit
3677 //
3678 address generate_md5_implCompress(StubId stub_id) {
3679 bool multi_block;
3680 switch (stub_id) {
3681 case StubId::stubgen_md5_implCompress_id:
3682 multi_block = false;
3683 break;
3684 case StubId::stubgen_md5_implCompressMB_id:
3685 multi_block = true;
3686 break;
3687 default:
3688 ShouldNotReachHere();
3689 }
3690 __ align(CodeEntryAlignment);
3691
3692 StubCodeMark mark(this, stub_id);
3693 address start = __ pc();
3694
3695 Register buf = c_rarg0;
3696 Register state = c_rarg1;
3697 Register ofs = c_rarg2;
3698 Register limit = c_rarg3;
3699 Register a = r4;
3700 Register b = r5;
3701 Register c = r6;
3702 Register d = r7;
3703 Register rscratch3 = r10;
3704 Register rscratch4 = r11;
3705
3706 Register state_regs[2] = { r12, r13 };
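    // state is int[4] = { a, b, c, d }: after the 64-bit ldp below,
    // state_regs[0] holds a in bits 0..31 and b in bits 32..63, and
    // state_regs[1] holds c and d likewise; the packed layout is rebuilt
    // with orr/LSL 32 before the final store.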
3707 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3708 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
3709
3710 __ push(saved_regs, sp);
3711
3712 __ ldp(state_regs[0], state_regs[1], Address(state));
3713 __ ubfx(a, state_regs[0], 0, 32);
3714 __ ubfx(b, state_regs[0], 32, 32);
3715 __ ubfx(c, state_regs[1], 0, 32);
3716 __ ubfx(d, state_regs[1], 32, 32);
3717
3718 Label md5_loop;
3719 __ BIND(md5_loop);
3720
3721 reg_cache.gen_loads(buf);
3722
3723 // Round 1
3724 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
3725 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
3726 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
3727 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
3728 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
3729 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
3730 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
3731 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
3732 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
3733 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
3734 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3735 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3736 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
3737 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3738 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3739 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3740
3741 // Round 2
3742 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
3743 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
3744 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3745 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
3746 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
3747 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
3748 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3749 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
3750 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
3751 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
3752 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
3753 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
3754 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
3755 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
3756 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
3757 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3758
3759 // Round 3
3760 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
3761 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
3762 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3763 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3764 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
3765 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
3766 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
3767 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3768 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
3769 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
3770 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
3771 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
3772 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
3773 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3774 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3775 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
3776
3777 // Round 4
3778 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
3779 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
3780 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3781 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
3782 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
3783 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
3784 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3785 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
3786 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
3787 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3788 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
3789 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3790 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
3791 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3792 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
3793 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
3794
3795 __ addw(a, state_regs[0], a);
3796 __ ubfx(rscratch2, state_regs[0], 32, 32);
3797 __ addw(b, rscratch2, b);
3798 __ addw(c, state_regs[1], c);
3799 __ ubfx(rscratch4, state_regs[1], 32, 32);
3800 __ addw(d, rscratch4, d);
3801
3802 __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3803 __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3804
3805 if (multi_block) {
3806 __ add(buf, buf, 64);
3807 __ add(ofs, ofs, 64);
3808 __ cmp(ofs, limit);
3809 __ br(Assembler::LE, md5_loop);
3810 __ mov(c_rarg0, ofs); // return ofs
3811 }
3812
3813 // write hash values back in the correct order
3814 __ stp(state_regs[0], state_regs[1], Address(state));
3815
3816 __ pop(saved_regs, sp);
3817
3818 __ ret(lr);
3819
3820 return start;
3821 }
3822
3823 // Arguments:
3824 //
3825 // Inputs:
3826 // c_rarg0 - byte[] source+offset
3827 // c_rarg1 - int[] SHA.state
3828 // c_rarg2 - int offset
3829 // c_rarg3 - int limit
3830 //
3831 address generate_sha1_implCompress(StubId stub_id) {
3832 bool multi_block;
3833 switch (stub_id) {
3834 case StubId::stubgen_sha1_implCompress_id:
3835 multi_block = false;
3836 break;
3837 case StubId::stubgen_sha1_implCompressMB_id:
3838 multi_block = true;
3839 break;
3840 default:
3841 ShouldNotReachHere();
3842 }
3843
3844 __ align(CodeEntryAlignment);
3845
3846 StubCodeMark mark(this, stub_id);
3847 address start = __ pc();
3848
3849 Register buf = c_rarg0;
3850 Register state = c_rarg1;
3851 Register ofs = c_rarg2;
3852 Register limit = c_rarg3;
3853
3854 Label keys;
3855 Label sha1_loop;
3856
3857 // load the keys into v0..v3
3858 __ adr(rscratch1, keys);
3859 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word state into v6, v7
3861 __ ldrq(v6, Address(state, 0));
3862 __ ldrs(v7, Address(state, 16));
3863
3864
3865 __ BIND(sha1_loop);
3866 // load 64 bytes of data into v16..v19
3867 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3868 __ rev32(v16, __ T16B, v16);
3869 __ rev32(v17, __ T16B, v17);
3870 __ rev32(v18, __ T16B, v18);
3871 __ rev32(v19, __ T16B, v19);
3872
3873 // do the sha1
3874 __ addv(v4, __ T4S, v16, v0);
3875 __ orr(v20, __ T16B, v6, v6);
3876
3877 FloatRegister d0 = v16;
3878 FloatRegister d1 = v17;
3879 FloatRegister d2 = v18;
3880 FloatRegister d3 = v19;
3881
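    // Each loop iteration below covers 4 of the 80 SHA-1 rounds:
    // iterations 0..4 use the Ch function (sha1c), 5..9 and 15..19 use
    // Parity (sha1p), and 10..14 use Maj (sha1m), matching rounds
    // 0..19, 20..39/60..79 and 40..59 of FIPS 180-4.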
3882 for (int round = 0; round < 20; round++) {
3883 FloatRegister tmp1 = (round & 1) ? v4 : v5;
3884 FloatRegister tmp2 = (round & 1) ? v21 : v22;
3885 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3886 FloatRegister tmp4 = (round & 1) ? v5 : v4;
3887 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3888
3889 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3890 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3891 __ sha1h(tmp2, __ T4S, v20);
3892 if (round < 5)
3893 __ sha1c(v20, __ T4S, tmp3, tmp4);
3894 else if (round < 10 || round >= 15)
3895 __ sha1p(v20, __ T4S, tmp3, tmp4);
3896 else
3897 __ sha1m(v20, __ T4S, tmp3, tmp4);
3898 if (round < 16) __ sha1su1(d0, __ T4S, d3);
3899
3900 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3901 }
3902
3903 __ addv(v7, __ T2S, v7, v21);
3904 __ addv(v6, __ T4S, v6, v20);
3905
3906 if (multi_block) {
3907 __ add(ofs, ofs, 64);
3908 __ cmp(ofs, limit);
3909 __ br(Assembler::LE, sha1_loop);
3910 __ mov(c_rarg0, ofs); // return ofs
3911 }
3912
3913 __ strq(v6, Address(state, 0));
3914 __ strs(v7, Address(state, 16));
3915
3916 __ ret(lr);
3917
3918 __ bind(keys);
3919 __ emit_int32(0x5a827999);
3920 __ emit_int32(0x6ed9eba1);
3921 __ emit_int32(0x8f1bbcdc);
3922 __ emit_int32(0xca62c1d6);
3923
3924 return start;
3925 }
3926
3927
3928 // Arguments:
3929 //
3930 // Inputs:
3931 // c_rarg0 - byte[] source+offset
3932 // c_rarg1 - int[] SHA.state
3933 // c_rarg2 - int offset
3934 // c_rarg3 - int limit
3935 //
3936 address generate_sha256_implCompress(StubId stub_id) {
3937 bool multi_block;
3938 switch (stub_id) {
3939 case StubId::stubgen_sha256_implCompress_id:
3940 multi_block = false;
3941 break;
3942 case StubId::stubgen_sha256_implCompressMB_id:
3943 multi_block = true;
3944 break;
3945 default:
3946 ShouldNotReachHere();
3947 }
3948
3949 static const uint32_t round_consts[64] = {
3950 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3951 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3952 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3953 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3954 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3955 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3956 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3957 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3958 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3959 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3960 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3961 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3962 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3963 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3964 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3965 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3966 };
3967
3968 __ align(CodeEntryAlignment);
3969
3970 StubCodeMark mark(this, stub_id);
3971 address start = __ pc();
3972
3973 Register buf = c_rarg0;
3974 Register state = c_rarg1;
3975 Register ofs = c_rarg2;
3976 Register limit = c_rarg3;
3977
    Label sha256_loop;
3979
3980 __ stpd(v8, v9, __ pre(sp, -32));
3981 __ stpd(v10, v11, Address(sp, 16));
3982
3983 // dga == v0
3984 // dgb == v1
3985 // dg0 == v2
3986 // dg1 == v3
3987 // dg2 == v4
3988 // t0 == v6
3989 // t1 == v7
3990
3991 // load 16 keys to v16..v31
3992 __ lea(rscratch1, ExternalAddress((address)round_consts));
3993 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3994 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3995 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3996 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3997
3998 // load 8 words (256 bits) state
3999 __ ldpq(v0, v1, state);
4000
    __ BIND(sha256_loop);
4002 // load 64 bytes of data into v8..v11
4003 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
4004 __ rev32(v8, __ T16B, v8);
4005 __ rev32(v9, __ T16B, v9);
4006 __ rev32(v10, __ T16B, v10);
4007 __ rev32(v11, __ T16B, v11);
4008
4009 __ addv(v6, __ T4S, v8, v16);
4010 __ orr(v2, __ T16B, v0, v0);
4011 __ orr(v3, __ T16B, v1, v1);
4012
4013 FloatRegister d0 = v8;
4014 FloatRegister d1 = v9;
4015 FloatRegister d2 = v10;
4016 FloatRegister d3 = v11;
4017
4018
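    // Each loop iteration below performs 4 of the 64 SHA-256 rounds via
    // SHA256H/SHA256H2, consuming the pre-loaded round-constant vectors
    // v16..v31; iterations 0..11 also extend the message schedule with
    // SHA256SU0/SHA256SU1.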
4019 for (int round = 0; round < 16; round++) {
4020 FloatRegister tmp1 = (round & 1) ? v6 : v7;
4021 FloatRegister tmp2 = (round & 1) ? v7 : v6;
4022 FloatRegister tmp3 = (round & 1) ? v2 : v4;
4023 FloatRegister tmp4 = (round & 1) ? v4 : v2;
4024
4025 if (round < 12) __ sha256su0(d0, __ T4S, d1);
4026 __ orr(v4, __ T16B, v2, v2);
4027 if (round < 15)
4028 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
4029 __ sha256h(v2, __ T4S, v3, tmp2);
4030 __ sha256h2(v3, __ T4S, v4, tmp2);
4031 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
4032
4033 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4034 }
4035
4036 __ addv(v0, __ T4S, v0, v2);
4037 __ addv(v1, __ T4S, v1, v3);
4038
4039 if (multi_block) {
4040 __ add(ofs, ofs, 64);
4041 __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
4043 __ mov(c_rarg0, ofs); // return ofs
4044 }
4045
4046 __ ldpd(v10, v11, Address(sp, 16));
4047 __ ldpd(v8, v9, __ post(sp, 32));
4048
4049 __ stpq(v0, v1, state);
4050
4051 __ ret(lr);
4052
4053 return start;
4054 }
4055
4056 // Double rounds for sha512.
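  // Each call performs two SHA-512 rounds on the working state vi0..vi4
  // using SHA512H/SHA512H2, after adding the round-constant pair vrc0 to
  // the schedule word vin0 (into the temporary v5). For dr < 32 it also
  // extends the message schedule with SHA512SU0/SHA512SU1 (into vin0),
  // and for dr < 36 it preloads the next round-constant pair into vrc1
  // from the address in rscratch2.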
4057 void sha512_dround(int dr,
4058 FloatRegister vi0, FloatRegister vi1,
4059 FloatRegister vi2, FloatRegister vi3,
4060 FloatRegister vi4, FloatRegister vrc0,
4061 FloatRegister vrc1, FloatRegister vin0,
4062 FloatRegister vin1, FloatRegister vin2,
4063 FloatRegister vin3, FloatRegister vin4) {
4064 if (dr < 36) {
4065 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
4066 }
4067 __ addv(v5, __ T2D, vrc0, vin0);
4068 __ ext(v6, __ T16B, vi2, vi3, 8);
4069 __ ext(v5, __ T16B, v5, v5, 8);
4070 __ ext(v7, __ T16B, vi1, vi2, 8);
4071 __ addv(vi3, __ T2D, vi3, v5);
4072 if (dr < 32) {
4073 __ ext(v5, __ T16B, vin3, vin4, 8);
4074 __ sha512su0(vin0, __ T2D, vin1);
4075 }
4076 __ sha512h(vi3, __ T2D, v6, v7);
4077 if (dr < 32) {
4078 __ sha512su1(vin0, __ T2D, vin2, v5);
4079 }
4080 __ addv(vi4, __ T2D, vi1, vi3);
4081 __ sha512h2(vi3, __ T2D, vi1, vi0);
4082 }
4083
4084 // Arguments:
4085 //
4086 // Inputs:
4087 // c_rarg0 - byte[] source+offset
4088 // c_rarg1 - int[] SHA.state
4089 // c_rarg2 - int offset
4090 // c_rarg3 - int limit
4091 //
4092 address generate_sha512_implCompress(StubId stub_id) {
4093 bool multi_block;
4094 switch (stub_id) {
4095 case StubId::stubgen_sha512_implCompress_id:
4096 multi_block = false;
4097 break;
4098 case StubId::stubgen_sha512_implCompressMB_id:
4099 multi_block = true;
4100 break;
4101 default:
4102 ShouldNotReachHere();
4103 }
4104
4105 static const uint64_t round_consts[80] = {
4106 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
4107 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
4108 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
4109 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
4110 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
4111 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
4112 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
4113 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
4114 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
4115 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
4116 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
4117 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
4118 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
4119 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
4120 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
4121 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
4122 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
4123 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
4124 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
4125 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
4126 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
4127 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
4128 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
4129 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
4130 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
4131 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
4132 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
4133 };
4134
4135 __ align(CodeEntryAlignment);
4136
4137 StubCodeMark mark(this, stub_id);
4138 address start = __ pc();
4139
4140 Register buf = c_rarg0;
4141 Register state = c_rarg1;
4142 Register ofs = c_rarg2;
4143 Register limit = c_rarg3;
4144
4145 __ stpd(v8, v9, __ pre(sp, -64));
4146 __ stpd(v10, v11, Address(sp, 16));
4147 __ stpd(v12, v13, Address(sp, 32));
4148 __ stpd(v14, v15, Address(sp, 48));
4149
4150 Label sha512_loop;
4151
4152 // load state
4153 __ ld1(v8, v9, v10, v11, __ T2D, state);
4154
4155 // load first 4 round constants
4156 __ lea(rscratch1, ExternalAddress((address)round_consts));
4157 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
4158
4159 __ BIND(sha512_loop);
4160 // load 128B of data into v12..v19
4161 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
4162 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
4163 __ rev64(v12, __ T16B, v12);
4164 __ rev64(v13, __ T16B, v13);
4165 __ rev64(v14, __ T16B, v14);
4166 __ rev64(v15, __ T16B, v15);
4167 __ rev64(v16, __ T16B, v16);
4168 __ rev64(v17, __ T16B, v17);
4169 __ rev64(v18, __ T16B, v18);
4170 __ rev64(v19, __ T16B, v19);
4171
4172 __ mov(rscratch2, rscratch1);
4173
4174 __ mov(v0, __ T16B, v8);
4175 __ mov(v1, __ T16B, v9);
4176 __ mov(v2, __ T16B, v10);
4177 __ mov(v3, __ T16B, v11);
4178
4179 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4180 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4181 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4182 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4183 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4184 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4185 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4186 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4187 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4188 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4189 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4190 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4191 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4192 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4193 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4194 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4195 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4196 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4197 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4198 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4199 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4200 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4201 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4202 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4203 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4204 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4205 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4206 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4207 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4208 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4209 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4210 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4211 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
4212 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4213 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4214 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4215 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4216 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4217 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4218 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4219
4220 __ addv(v8, __ T2D, v8, v0);
4221 __ addv(v9, __ T2D, v9, v1);
4222 __ addv(v10, __ T2D, v10, v2);
4223 __ addv(v11, __ T2D, v11, v3);
4224
4225 if (multi_block) {
4226 __ add(ofs, ofs, 128);
4227 __ cmp(ofs, limit);
4228 __ br(Assembler::LE, sha512_loop);
4229 __ mov(c_rarg0, ofs); // return ofs
4230 }
4231
4232 __ st1(v8, v9, v10, v11, __ T2D, state);
4233
4234 __ ldpd(v14, v15, Address(sp, 48));
4235 __ ldpd(v12, v13, Address(sp, 32));
4236 __ ldpd(v10, v11, Address(sp, 16));
4237 __ ldpd(v8, v9, __ post(sp, 64));
4238
4239 __ ret(lr);
4240
4241 return start;
4242 }
4243
4244 // Execute one round of keccak of two computations in parallel.
4245 // One of the states should be loaded into the lower halves of
4246 // the vector registers v0-v24, the other should be loaded into
4247 // the upper halves of those registers. The ld1r instruction loads
4248 // the round constant into both halves of register v31.
4249 // Intermediate results c0...c5 and d0...d5 are computed
4250 // in registers v25...v30.
4251 // All vector instructions that are used operate on both register
4252 // halves in parallel.
  // If only a single computation is needed, one can load only the lower halves.
4254 void keccak_round(Register rscratch1) {
4255 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
    __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
    __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4258 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4259 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4260 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4261 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4262 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4263 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4264 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4265
4266 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4267 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4268 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4269 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4270 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4271
4272 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4273 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
    __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4275 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4276 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4277 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4278 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4279 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4280 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4281 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4282 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4283 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4284 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4285 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4286 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4287 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4288 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4289 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4290 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4291 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4292 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4293 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4294 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4295 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4296 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4297
    __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21' & a22)
4299 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4300 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4301 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4302 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4303
4304 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4305
4306 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4307 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4308 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4309 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4310 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
4311
4312 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
4313 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
4314 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
4315 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
4316 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
4317
4318 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
4319 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
4320 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
4321 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7)
4322 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8')
4323
4324 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
4325 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
4326 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
4327 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3)
4328 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4')
4329
4330 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
4331 }
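  // For reference, one Keccak-f[1600] round in scalar form (illustrative C;
  // 'rol64' is a hypothetical 64-bit rotate-left helper), matching the
  // grouping of the vector code above:
  //
  //   // theta
  //   for (x = 0; x < 5; x++)
  //     c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
  //   for (x = 0; x < 5; x++)
  //     d[x] = c[(x + 4) % 5] ^ rol64(c[(x + 1) % 5], 1);
  //   // rho + pi: every destination lane becomes rol64(a[src] ^ d[src % 5], r[src])
  //   // chi:      a[x + 5*y] ^= ~a[(x+1)%5 + 5*y] & a[(x+2)%5 + 5*y]
  //   // iota:     a[0] ^= round_constant[i]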
4332
4333 // Arguments:
4334 //
4335 // Inputs:
4336 // c_rarg0 - byte[] source+offset
4337 // c_rarg1 - byte[] SHA.state
4338 // c_rarg2 - int block_size
4339 // c_rarg3 - int offset
4340 // c_rarg4 - int limit
4341 //
4342 address generate_sha3_implCompress(StubId stub_id) {
4343 bool multi_block;
4344 switch (stub_id) {
4345 case StubId::stubgen_sha3_implCompress_id:
4346 multi_block = false;
4347 break;
4348 case StubId::stubgen_sha3_implCompressMB_id:
4349 multi_block = true;
4350 break;
4351 default:
4352 ShouldNotReachHere();
4353 }
4354
4355 static const uint64_t round_consts[24] = {
4356 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4357 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4358 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4359 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4360 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4361 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4362 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4363 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4364 };
4365
4366 __ align(CodeEntryAlignment);
4367
4368 StubCodeMark mark(this, stub_id);
4369 address start = __ pc();
4370
4371 Register buf = c_rarg0;
4372 Register state = c_rarg1;
4373 Register block_size = c_rarg2;
4374 Register ofs = c_rarg3;
4375 Register limit = c_rarg4;
4376
4377 Label sha3_loop, rounds24_loop;
4378 Label sha3_512_or_sha3_384, shake128;
4379
4380 __ stpd(v8, v9, __ pre(sp, -64));
4381 __ stpd(v10, v11, Address(sp, 16));
4382 __ stpd(v12, v13, Address(sp, 32));
4383 __ stpd(v14, v15, Address(sp, 48));
4384
4385 // load state
4386 __ add(rscratch1, state, 32);
4387 __ ld1(v0, v1, v2, v3, __ T1D, state);
4388 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4389 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4390 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4391 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4392 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4393 __ ld1(v24, __ T1D, rscratch1);
4394
4395 __ BIND(sha3_loop);
4396
4397 // 24 keccak rounds
4398 __ movw(rscratch2, 24);
4399
4400 // load round_constants base
4401 __ lea(rscratch1, ExternalAddress((address) round_consts));
4402
4403 // load input
4404 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4405 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4406 __ eor(v0, __ T8B, v0, v25);
4407 __ eor(v1, __ T8B, v1, v26);
4408 __ eor(v2, __ T8B, v2, v27);
4409 __ eor(v3, __ T8B, v3, v28);
4410 __ eor(v4, __ T8B, v4, v29);
4411 __ eor(v5, __ T8B, v5, v30);
4412 __ eor(v6, __ T8B, v6, v31);
4413
4414 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4415 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4416
4417 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4418 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4419 __ eor(v7, __ T8B, v7, v25);
4420 __ eor(v8, __ T8B, v8, v26);
4421 __ eor(v9, __ T8B, v9, v27);
4422 __ eor(v10, __ T8B, v10, v28);
4423 __ eor(v11, __ T8B, v11, v29);
4424 __ eor(v12, __ T8B, v12, v30);
4425 __ eor(v13, __ T8B, v13, v31);
4426
4427 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4428 __ eor(v14, __ T8B, v14, v25);
4429 __ eor(v15, __ T8B, v15, v26);
4430 __ eor(v16, __ T8B, v16, v27);
4431
4432 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4433 __ andw(c_rarg5, block_size, 48);
4434 __ cbzw(c_rarg5, rounds24_loop);
4435
4436 __ tbnz(block_size, 5, shake128);
4437 // block_size == 144, bit5 == 0, SHA3-224
4438 __ ldrd(v28, __ post(buf, 8));
4439 __ eor(v17, __ T8B, v17, v28);
4440 __ b(rounds24_loop);
4441
4442 __ BIND(shake128);
4443 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4444 __ eor(v17, __ T8B, v17, v28);
4445 __ eor(v18, __ T8B, v18, v29);
4446 __ eor(v19, __ T8B, v19, v30);
4447 __ eor(v20, __ T8B, v20, v31);
4448 __ b(rounds24_loop); // block_size == 168, SHAKE128
4449
4450 __ BIND(sha3_512_or_sha3_384);
4451 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4452 __ eor(v7, __ T8B, v7, v25);
4453 __ eor(v8, __ T8B, v8, v26);
4454 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4455
4456 // SHA3-384
4457 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4458 __ eor(v9, __ T8B, v9, v27);
4459 __ eor(v10, __ T8B, v10, v28);
4460 __ eor(v11, __ T8B, v11, v29);
4461 __ eor(v12, __ T8B, v12, v30);
4462
4463 __ BIND(rounds24_loop);
4464 __ subw(rscratch2, rscratch2, 1);
4465
4466 keccak_round(rscratch1);
4467
4468 __ cbnzw(rscratch2, rounds24_loop);
4469
4470 if (multi_block) {
4471 __ add(ofs, ofs, block_size);
4472 __ cmp(ofs, limit);
4473 __ br(Assembler::LE, sha3_loop);
4474 __ mov(c_rarg0, ofs); // return ofs
4475 }
4476
4477 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
4478 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4479 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4480 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4481 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4482 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4483 __ st1(v24, __ T1D, state);
4484
4485 // restore callee-saved registers
4486 __ ldpd(v14, v15, Address(sp, 48));
4487 __ ldpd(v12, v13, Address(sp, 32));
4488 __ ldpd(v10, v11, Address(sp, 16));
4489 __ ldpd(v8, v9, __ post(sp, 64));
4490
4491 __ ret(lr);
4492
4493 return start;
4494 }
4495
4496 // Inputs:
4497 // c_rarg0 - long[] state0
4498 // c_rarg1 - long[] state1
4499 address generate_double_keccak() {
4500 static const uint64_t round_consts[24] = {
4501 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4502 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4503 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4504 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4505 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4506 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4507 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4508 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4509 };
4510
    // Implements the double_keccak() method of the
    // sun.security.provider.SHA3Parallel class
4513 __ align(CodeEntryAlignment);
4514 StubCodeMark mark(this, "StubRoutines", "double_keccak");
4515 address start = __ pc();
4516 __ enter();
4517
4518 Register state0 = c_rarg0;
4519 Register state1 = c_rarg1;
4520
4521 Label rounds24_loop;
4522
4523 // save callee-saved registers
4524 __ stpd(v8, v9, __ pre(sp, -64));
4525 __ stpd(v10, v11, Address(sp, 16));
4526 __ stpd(v12, v13, Address(sp, 32));
4527 __ stpd(v14, v15, Address(sp, 48));
4528
4529 // load states
4530 __ add(rscratch1, state0, 32);
4531 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
4532 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
4533 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
4534 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
4535 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
4536 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
4537 __ ld1(v24, __ D, 0, rscratch1);
4538 __ add(rscratch1, state1, 32);
4539 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
4540 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
4541 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
4542 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
4543 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
4544 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
4545 __ ld1(v24, __ D, 1, rscratch1);
4546
4547 // 24 keccak rounds
4548 __ movw(rscratch2, 24);
4549
4550 // load round_constants base
4551 __ lea(rscratch1, ExternalAddress((address) round_consts));
4552
4553 __ BIND(rounds24_loop);
4554 __ subw(rscratch2, rscratch2, 1);
4555 keccak_round(rscratch1);
4556 __ cbnzw(rscratch2, rounds24_loop);
4557
4558 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
4559 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
4560 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
4561 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
4562 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
4563 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
4564 __ st1(v24, __ D, 0, state0);
4565 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
4566 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
4567 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
4568 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
4569 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
4570 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
4571 __ st1(v24, __ D, 1, state1);
4572
4573 // restore callee-saved vector registers
4574 __ ldpd(v14, v15, Address(sp, 48));
4575 __ ldpd(v12, v13, Address(sp, 32));
4576 __ ldpd(v10, v11, Address(sp, 16));
4577 __ ldpd(v8, v9, __ post(sp, 64));
4578
4579 __ leave(); // required for proper stackwalking of RuntimeStub frame
4580 __ mov(r0, zr); // return 0
4581 __ ret(lr);
4582
4583 return start;
4584 }
4585
4586 // ChaCha20 block function. This version parallelizes the 32-bit
4587 // state elements on each of 16 vectors, producing 4 blocks of
4588 // keystream at a time.
4589 //
4590 // state (int[16]) = c_rarg0
4591 // keystream (byte[256]) = c_rarg1
4592 // return - number of bytes of produced keystream (always 256)
4593 //
4594 // This implementation takes each 32-bit integer from the state
4595 // array and broadcasts it across all 4 32-bit lanes of a vector register
4596 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
4597 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
4598 // the quarter round schedule is implemented as outlined in RFC 7539 section
4599 // 2.3. However, instead of sequentially processing the 3 quarter round
4600 // operations represented by one QUARTERROUND function, we instead stack all
4601 // the adds, xors and left-rotations from the first 4 quarter rounds together
4602 // and then do the same for the second set of 4 quarter rounds. This removes
4603 // some latency that would otherwise be incurred by waiting for an add to
4604 // complete before performing an xor (which depends on the result of the
4605 // add), etc. An adjustment happens between the first and second groups of 4
4606 // quarter rounds, but this is done only in the inputs to the macro functions
4607 // that generate the assembly instructions - these adjustments themselves are
4608 // not part of the resulting assembly.
  // The 4 registers v0-v3 are used during the quarter round operations as
  // scratch registers. Once the 20 rounds are complete, these 4 scratch
  // registers become the vectors involved in adding the start state back
  // onto the post-QR working state. After the adds are complete, each of
  // the 16 vectors writes its first lane back to the keystream buffer,
  // followed by the second lane from all vectors, and so on.
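  // For reference, one scalar quarter round as defined in RFC 7539
  // section 2.1 (the vector code below performs four of these in
  // parallel for each a/b/c/d register set):
  //
  //   a += b;  d ^= a;  d = rol32(d, 16);
  //   c += d;  b ^= c;  b = rol32(b, 12);
  //   a += b;  d ^= a;  d = rol32(d, 8);
  //   c += d;  b ^= c;  b = rol32(b, 7);
  //
  // where rol32 is a 32-bit rotate-left.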
4615 address generate_chacha20Block_blockpar() {
4616 Label L_twoRounds, L_cc20_const;
4617 __ align(CodeEntryAlignment);
4618 StubId stub_id = StubId::stubgen_chacha20Block_id;
4619 StubCodeMark mark(this, stub_id);
4620 address start = __ pc();
4621 __ enter();
4622
4623 int i, j;
4624 const Register state = c_rarg0;
4625 const Register keystream = c_rarg1;
4626 const Register loopCtr = r10;
4627 const Register tmpAddr = r11;
4628 const FloatRegister ctrAddOverlay = v28;
4629 const FloatRegister lrot8Tbl = v29;
4630
4631 // Organize SIMD registers in an array that facilitates
4632 // putting repetitive opcodes into loop structures. It is
4633 // important that each grouping of 4 registers is monotonically
4634 // increasing to support the requirements of multi-register
4635 // instructions (e.g. ld4r, st4, etc.)
4636 const FloatRegister workSt[16] = {
4637 v4, v5, v6, v7, v16, v17, v18, v19,
4638 v20, v21, v22, v23, v24, v25, v26, v27
4639 };
4640
    // Pull in constant data. The first 16 bytes are the add overlay,
    // which is applied to the vector holding the counter (state[12]).
    // The second 16 bytes are the byte-index pattern used by the tbl
    // instruction to perform the 8-bit left rotation.
4645 __ adr(tmpAddr, L_cc20_const);
4646 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
4647
    // Load from memory and interlace across 16 SIMD registers,
    // with each word from memory being broadcast to all lanes of
    // each successive SIMD register.
    //      Addr(0) -> All lanes in workSt[i]
    //      Addr(4) -> All lanes in workSt[i + 1], etc.
4653 __ mov(tmpAddr, state);
4654 for (i = 0; i < 16; i += 4) {
4655 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4656 __ post(tmpAddr, 16));
4657 }
4658 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4659
4660 // Before entering the loop, create 5 4-register arrays. These
4661 // will hold the 4 registers that represent the a/b/c/d fields
4662 // in the quarter round operation. For instance the "b" field
4663 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
4664 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
4665 // since it is part of a diagonal organization. The aSet and scratch
4666 // register sets are defined at declaration time because they do not change
4667 // organization at any point during the 20-round processing.
4668 FloatRegister aSet[4] = { v4, v5, v6, v7 };
4669 FloatRegister bSet[4];
4670 FloatRegister cSet[4];
4671 FloatRegister dSet[4];
4672 FloatRegister scratch[4] = { v0, v1, v2, v3 };
4673
4674 // Set up the 10 iteration loop and perform all 8 quarter round ops
4675 __ mov(loopCtr, 10);
4676 __ BIND(L_twoRounds);
4677
4678 // Set to columnar organization and do the following 4 quarter-rounds:
4679 // QUARTERROUND(0, 4, 8, 12)
4680 // QUARTERROUND(1, 5, 9, 13)
4681 // QUARTERROUND(2, 6, 10, 14)
4682 // QUARTERROUND(3, 7, 11, 15)
4683 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
4684 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
4685 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
4686
4687 __ cc20_qr_add4(aSet, bSet); // a += b
4688 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4689 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4690
4691 __ cc20_qr_add4(cSet, dSet); // c += d
4692 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4693 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4694
4695 __ cc20_qr_add4(aSet, bSet); // a += b
4696 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4697 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4698
4699 __ cc20_qr_add4(cSet, dSet); // c += d
4700 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
4702
4703 // Set to diagonal organization and do the next 4 quarter-rounds:
4704 // QUARTERROUND(0, 5, 10, 15)
4705 // QUARTERROUND(1, 6, 11, 12)
4706 // QUARTERROUND(2, 7, 8, 13)
4707 // QUARTERROUND(3, 4, 9, 14)
4708 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
4709 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
4710 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
4711
4712 __ cc20_qr_add4(aSet, bSet); // a += b
4713 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4714 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4715
4716 __ cc20_qr_add4(cSet, dSet); // c += d
4717 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4718 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4719
4720 __ cc20_qr_add4(aSet, bSet); // a += b
4721 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4722 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4723
4724 __ cc20_qr_add4(cSet, dSet); // c += d
4725 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
4727
4728 // Decrement and iterate
4729 __ sub(loopCtr, loopCtr, 1);
4730 __ cbnz(loopCtr, L_twoRounds);
4731
4732 __ mov(tmpAddr, state);
4733
4734 // Add the starting state back to the post-loop keystream
4735 // state. We read/interlace the state array from memory into
4736 // 4 registers similar to what we did in the beginning. Then
4737 // add the counter overlay onto workSt[12] at the end.
4738 for (i = 0; i < 16; i += 4) {
4739 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
4740 __ addv(workSt[i], __ T4S, workSt[i], v0);
4741 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
4742 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
4743 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
4744 }
4745 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4746
4747 // Write working state into the keystream buffer. This is accomplished
4748 // by taking the lane "i" from each of the four vectors and writing
4749 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
4750 // repeating with the next 4 vectors until all 16 vectors have been used.
4751 // Then move to the next lane and repeat the process until all lanes have
4752 // been written.
4753 for (i = 0; i < 4; i++) {
4754 for (j = 0; j < 16; j += 4) {
4755 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4756 __ post(keystream, 16));
4757 }
4758 }
4759
4760 __ mov(r0, 256); // Return length of output keystream
4761 __ leave();
4762 __ ret(lr);
4763
4764 // bind label and generate local constant data used by this stub
4765 // The constant data is broken into two 128-bit segments to be loaded
4766 // onto FloatRegisters. The first 128 bits are a counter add overlay
4767 // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
    // The second 128 bits are a table constant used for 8-bit left rotations.
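    // n.b. the rotation table's byte indices select source bytes {3, 0, 1, 2}
    // within each 32-bit lane, so, used as a tbl permutation, it rotates
    // every 32-bit element left by 8 bits.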
4769 __ BIND(L_cc20_const);
4770 __ emit_int64(0x0000000100000000UL);
4771 __ emit_int64(0x0000000300000002UL);
4772 __ emit_int64(0x0605040702010003UL);
4773 __ emit_int64(0x0E0D0C0F0A09080BUL);
4774
4775 return start;
4776 }
4777
4778 // Helpers to schedule parallel operation bundles across vector
4779 // register sequences of size 2, 4 or 8.
4780
4781 // Implement various primitive computations across vector sequences
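
  // As an illustrative example (assuming the default unit stride between
  // the registers of a VSeq), given
  //   VSeq<4> va(0), vb(4), vc(8);
  // a call such as
  //   vs_addv(va, __ T8H, vb, vc);
  // expands into four independent instructions
  //   addv v0, T8H, v4, v8
  //   addv v1, T8H, v5, v9
  //   addv v2, T8H, v6, v10
  //   addv v3, T8H, v7, v11
  // which the hardware is free to execute in parallel.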
4782
4783 template<int N>
4784 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4785 const VSeq<N>& v1, const VSeq<N>& v2) {
4786 // output must not be constant
4787 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4788 // output cannot overwrite pending inputs
4789 assert(!vs_write_before_read(v, v1), "output overwrites input");
4790 assert(!vs_write_before_read(v, v2), "output overwrites input");
4791 for (int i = 0; i < N; i++) {
4792 __ addv(v[i], T, v1[i], v2[i]);
4793 }
4794 }
4795
4796 template<int N>
4797 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4798 const VSeq<N>& v1, const VSeq<N>& v2) {
4799 // output must not be constant
4800 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4801 // output cannot overwrite pending inputs
4802 assert(!vs_write_before_read(v, v1), "output overwrites input");
4803 assert(!vs_write_before_read(v, v2), "output overwrites input");
4804 for (int i = 0; i < N; i++) {
4805 __ subv(v[i], T, v1[i], v2[i]);
4806 }
4807 }
4808
4809 template<int N>
4810 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4811 const VSeq<N>& v1, const VSeq<N>& v2) {
4812 // output must not be constant
4813 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4814 // output cannot overwrite pending inputs
4815 assert(!vs_write_before_read(v, v1), "output overwrites input");
4816 assert(!vs_write_before_read(v, v2), "output overwrites input");
4817 for (int i = 0; i < N; i++) {
4818 __ mulv(v[i], T, v1[i], v2[i]);
4819 }
4820 }
4821
4822 template<int N>
4823 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
4824 // output must not be constant
4825 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4826 // output cannot overwrite pending inputs
4827 assert(!vs_write_before_read(v, v1), "output overwrites input");
4828 for (int i = 0; i < N; i++) {
4829 __ negr(v[i], T, v1[i]);
4830 }
4831 }
4832
4833 template<int N>
4834 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4835 const VSeq<N>& v1, int shift) {
4836 // output must not be constant
4837 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4838 // output cannot overwrite pending inputs
4839 assert(!vs_write_before_read(v, v1), "output overwrites input");
4840 for (int i = 0; i < N; i++) {
4841 __ sshr(v[i], T, v1[i], shift);
4842 }
4843 }
4844
4845 template<int N>
4846 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4847 // output must not be constant
4848 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4849 // output cannot overwrite pending inputs
4850 assert(!vs_write_before_read(v, v1), "output overwrites input");
4851 assert(!vs_write_before_read(v, v2), "output overwrites input");
4852 for (int i = 0; i < N; i++) {
4853 __ andr(v[i], __ T16B, v1[i], v2[i]);
4854 }
4855 }
4856
4857 template<int N>
4858 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4859 // output must not be constant
4860 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4861 // output cannot overwrite pending inputs
4862 assert(!vs_write_before_read(v, v1), "output overwrites input");
4863 assert(!vs_write_before_read(v, v2), "output overwrites input");
4864 for (int i = 0; i < N; i++) {
4865 __ orr(v[i], __ T16B, v1[i], v2[i]);
4866 }
4867 }
4868
4869 template<int N>
4870 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
4871 // output must not be constant
4872 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4873 // output cannot overwrite pending inputs
4874 assert(!vs_write_before_read(v, v1), "output overwrites input");
4875 for (int i = 0; i < N; i++) {
4876 __ notr(v[i], __ T16B, v1[i]);
4877 }
4878 }
4879
4880 template<int N>
4881 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
4882 // output must not be constant
4883 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4884 // output cannot overwrite pending inputs
4885 assert(!vs_write_before_read(v, v1), "output overwrites input");
4886 assert(!vs_write_before_read(v, v2), "output overwrites input");
4887 for (int i = 0; i < N; i++) {
4888 __ sqdmulh(v[i], T, v1[i], v2[i]);
4889 }
4890 }
4891
4892 template<int N>
4893 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
4894 // output must not be constant
4895 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4896 // output cannot overwrite pending inputs
4897 assert(!vs_write_before_read(v, v1), "output overwrites input");
4898 assert(!vs_write_before_read(v, v2), "output overwrites input");
4899 for (int i = 0; i < N; i++) {
4900 __ mlsv(v[i], T, v1[i], v2[i]);
4901 }
4902 }
4903
4904 // load N/2 successive pairs of quadword values from memory in order
4905 // into N successive vector registers of the sequence via the
4906 // address supplied in base.
4907 template<int N>
4908 void vs_ldpq(const VSeq<N>& v, Register base) {
4909 for (int i = 0; i < N; i += 2) {
4910 __ ldpq(v[i], v[i+1], Address(base, 32 * i));
4911 }
4912 }
4913
4914 // load N/2 successive pairs of quadword values from memory in order
4915 // into N vector registers of the sequence via the address supplied
4916 // in base using post-increment addressing
4917 template<int N>
4918 void vs_ldpq_post(const VSeq<N>& v, Register base) {
4919 static_assert((N & (N - 1)) == 0, "sequence length must be even");
4920 for (int i = 0; i < N; i += 2) {
4921 __ ldpq(v[i], v[i+1], __ post(base, 32));
4922 }
4923 }
4924
4925 // store N successive vector registers of the sequence into N/2
4926 // successive pairs of quadword memory locations via the address
4927 // supplied in base using post-increment addressing
4928 template<int N>
4929 void vs_stpq_post(const VSeq<N>& v, Register base) {
4930 static_assert((N & (N - 1)) == 0, "sequence length must be even");
4931 for (int i = 0; i < N; i += 2) {
4932 __ stpq(v[i], v[i+1], __ post(base, 32));
4933 }
4934 }
4935
4936 // load N/2 pairs of quadword values from memory de-interleaved into
4937 // N vector registers 2 at a time via the address supplied in base
4938 // using post-increment addressing.
4939 template<int N>
4940 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4941 static_assert((N & (N - 1)) == 0, "sequence length must be even");
4942 for (int i = 0; i < N; i += 2) {
4943 __ ld2(v[i], v[i+1], T, __ post(base, 32));
4944 }
4945 }
4946
4947 // store N vector registers interleaved into N/2 pairs of quadword
4948 // memory locations via the address supplied in base using
4949 // post-increment addressing.
4950 template<int N>
4951 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4952 static_assert((N & (N - 1)) == 0, "sequence length must be even");
4953 for (int i = 0; i < N; i += 2) {
4954 __ st2(v[i], v[i+1], T, __ post(base, 32));
4955 }
4956 }
4957
4958 // load N quadword values from memory de-interleaved into N vector
4959 // registers 3 elements at a time via the address supplied in base.
4960 template<int N>
4961 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4962 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4963 for (int i = 0; i < N; i += 3) {
4964 __ ld3(v[i], v[i+1], v[i+2], T, base);
4965 }
4966 }
4967
4968 // load N quadword values from memory de-interleaved into N vector
4969 // registers 3 elements at a time via the address supplied in base
4970 // using post-increment addressing.
4971 template<int N>
4972 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4973 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4974 for (int i = 0; i < N; i += 3) {
4975 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
4976 }
4977 }
4978
4979 // load N/2 pairs of quadword values from memory into N vector
4980 // registers via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
4982 // offsets array
4983 template<int N>
4984 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
4985 for (int i = 0; i < N/2; i++) {
4986 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4987 }
4988 }
4989
4990 // store N vector registers into N/2 pairs of quadword memory
4991 // locations via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
4993 // offsets array
4994 template<int N>
  void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
4996 for (int i = 0; i < N/2; i++) {
4997 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4998 }
4999 }
5000
5001 // load N single quadword values from memory into N vector registers
5002 // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
5004 // array
5005 template<int N>
5006 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5007 int start, int (&offsets)[N]) {
5008 for (int i = 0; i < N; i++) {
5009 __ ldr(v[i], T, Address(base, start + offsets[i]));
5010 }
5011 }
5012
5013 // store N vector registers into N single quadword memory locations
5014 // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
5016 // array
5017 template<int N>
5018 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5019 int start, int (&offsets)[N]) {
5020 for (int i = 0; i < N; i++) {
5021 __ str(v[i], T, Address(base, start + offsets[i]));
5022 }
5023 }
5024
5025 // load N/2 pairs of quadword values from memory de-interleaved into
5026 // N vector registers 2 at a time via the address supplied in base
  // with each pair indexed using the start offset plus the
5028 // corresponding entry in the offsets array
5029 template<int N>
5030 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5031 Register tmp, int start, int (&offsets)[N/2]) {
5032 for (int i = 0; i < N/2; i++) {
5033 __ add(tmp, base, start + offsets[i]);
5034 __ ld2(v[2*i], v[2*i+1], T, tmp);
5035 }
5036 }
5037
5038 // store N vector registers 2 at a time interleaved into N/2 pairs
5039 // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
5041 // corresponding entry in the offsets array
5042 template<int N>
5043 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5044 Register tmp, int start, int (&offsets)[N/2]) {
5045 for (int i = 0; i < N/2; i++) {
5046 __ add(tmp, base, start + offsets[i]);
5047 __ st2(v[2*i], v[2*i+1], T, tmp);
5048 }
5049 }
5050
5051 // Helper routines for various flavours of Montgomery multiply
5052
5053 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
5054 // multiplications in parallel
5055 //
5056
5057 // See the montMul() method of the sun.security.provider.ML_DSA
5058 // class.
5059 //
  // Computes 4x4S results or 4x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
5062 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5063 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5064 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5065 // Outputs: va - 4x4S or 4x8H vector register sequences
5066 // vb, vc, vtmp and vq must all be disjoint
5067 // va must be disjoint from all other inputs/temps or must equal vc
5068 // va must have a non-zero delta i.e. it must not be a constant vseq.
5069 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
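  // As a rough per-lane sketch for the 8H case (R = 2^16, q = MONT_Q,
  // qinv = MONT_Q_INV_MOD_R), the instruction bundle below computes
  //   aHigh = (2 * b * c) >> 16          // sqdmulh
  //   aLow  = (b * c) & 0xffff           // mulv
  //   m     = (aLow * qinv) & 0xffff     // mulv
  //   n     = (2 * m * q) >> 16          // sqdmulh
  //   a     = (aHigh - n) / 2            // shsubv
  //         = (b * c - m * q) >> 16, i.e. b * c * 2^-16 (mod q),
  // with the 4S case analogous using R = 2^32.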
5070 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5071 Assembler::SIMD_Arrangement T,
5072 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5073 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5074 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5075 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5076 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5077
5078 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5079 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5080
5081 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5082
5083 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5084 assert(vs_disjoint(va, vb), "va and vb overlap");
5085 assert(vs_disjoint(va, vq), "va and vq overlap");
5086 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5087 assert(!va.is_constant(), "output vector must identify 4 different registers");
5088
5089 // schedule 4 streams of instructions across the vector sequences
5090 for (int i = 0; i < 4; i++) {
5091 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5092 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5093 }
5094
5095 for (int i = 0; i < 4; i++) {
5096 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5097 }
5098
5099 for (int i = 0; i < 4; i++) {
5100 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5101 }
5102
5103 for (int i = 0; i < 4; i++) {
5104 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5105 }
5106 }
5107
  // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
5109 // multiplications in parallel
5110 //
5111
5112 // See the montMul() method of the sun.security.provider.ML_DSA
5113 // class.
5114 //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
  // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
  //         vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
5121 // vb, vc, vtmp and vq must all be disjoint
5122 // va must be disjoint from all other inputs/temps or must equal vc
5123 // va must have a non-zero delta i.e. it must not be a constant vseq.
5124 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5125 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5126 Assembler::SIMD_Arrangement T,
5127 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5128 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5129 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5130 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5131 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5132
5133 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5134 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5135
5136 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5137
5138 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5139 assert(vs_disjoint(va, vb), "va and vb overlap");
5140 assert(vs_disjoint(va, vq), "va and vq overlap");
5141 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5142 assert(!va.is_constant(), "output vector must identify 2 different registers");
5143
5144 // schedule 2 streams of instructions across the vector sequences
5145 for (int i = 0; i < 2; i++) {
5146 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5147 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5148 }
5149
5150 for (int i = 0; i < 2; i++) {
5151 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5152 }
5153
5154 for (int i = 0; i < 2; i++) {
5155 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5156 }
5157
5158 for (int i = 0; i < 2; i++) {
5159 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5160 }
5161 }
5162
5163 // Perform 16 16-bit Montgomery multiplications in parallel.
5164 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5165 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5166 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5167 // It will assert that the register use is valid
5168 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5169 }
5170
5171 // Perform 32 16-bit Montgomery multiplications in parallel.
5172 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5173 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5174 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5175 // It will assert that the register use is valid
5176 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5177 }
5178
5179 // Perform 64 16-bit Montgomery multiplications in parallel.
5180 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
5181 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5182 // Schedule two successive 4x8H multiplies via the montmul helper
5183 // on the front and back halves of va, vb and vc. The helper will
5184 // assert that the register use has no overlap conflicts on each
5185 // individual call but we also need to ensure that the necessary
5186 // disjoint/equality constraints are met across both calls.
5187
5188 // vb, vc, vtmp and vq must be disjoint. va must either be
5189 // disjoint from all other registers or equal vc
5190
5191 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5192 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5193 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5194
5195 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5196 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5197
5198 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5199
5200 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5201 assert(vs_disjoint(va, vb), "va and vb overlap");
5202 assert(vs_disjoint(va, vq), "va and vq overlap");
5203 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5204
5205 // we multiply the front and back halves of each sequence 4 at a
5206 // time because
5207 //
5208 // 1) we are currently only able to get 4-way instruction
5209 // parallelism at best
5210 //
5211 // 2) we need registers for the constants in vq and temporary
5212 // scratch registers to hold intermediate results so vtmp can only
5213 // be a VSeq<4> which means we only have 4 scratch slots
5214
5215 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5216 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5217 }
5218
5219 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5220 const VSeq<4>& vc,
5221 const VSeq<4>& vtmp,
5222 const VSeq<2>& vq) {
5223 // compute a = montmul(a1, c)
5224 kyber_montmul32(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
5226 vs_subv(va1, __ T8H, va0, vc);
5227 // and a0 = a0 + a
5228 vs_addv(va0, __ T8H, va0, vc);
5229 }
5230
5231 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5232 const VSeq<4>& vb,
5233 const VSeq<4>& vtmp1,
5234 const VSeq<4>& vtmp2,
5235 const VSeq<2>& vq) {
5236 // compute c = a0 - a1
5237 vs_subv(vtmp1, __ T8H, va0, va1);
5238 // output a0 = a0 + a1
5239 vs_addv(va0, __ T8H, va0, va1);
5240 // output a1 = b montmul c
5241 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5242 }
5243
5244 void load64shorts(const VSeq<8>& v, Register shorts) {
5245 vs_ldpq_post(v, shorts);
5246 }
5247
5248 void load32shorts(const VSeq<4>& v, Register shorts) {
5249 vs_ldpq_post(v, shorts);
5250 }
5251
5252 void store64shorts(VSeq<8> v, Register tmpAddr) {
5253 vs_stpq_post(v, tmpAddr);
5254 }
5255
5256 // Kyber NTT function.
5257 // Implements
5258 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5259 //
5260 // coeffs (short[256]) = c_rarg0
5261 // ntt_zetas (short[256]) = c_rarg1
5262 address generate_kyberNtt() {
5263
5264 __ align(CodeEntryAlignment);
5265 StubId stub_id = StubId::stubgen_kyberNtt_id;
5266 StubCodeMark mark(this, stub_id);
5267 address start = __ pc();
5268 __ enter();
5269
5270 const Register coeffs = c_rarg0;
5271 const Register zetas = c_rarg1;
5272
5273 const Register kyberConsts = r10;
5274 const Register tmpAddr = r11;
5275
5276 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5277 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5278 VSeq<2> vq(30); // n.b. constants overlap vs3
5279
5280 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5281 // load the montmul constants
5282 vs_ldpq(vq, kyberConsts);
5283
5284 // Each level corresponds to an iteration of the outermost loop of the
5285 // Java method seilerNTT(int[] coeffs). There are some differences
5286 // from what is done in the seilerNTT() method, though:
    // 1. The computation uses 16-bit signed values; we do not convert them
    // to ints here.
    // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
    // this array for each level, which makes it easier to fill up the vector
    // registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    // multiplications (that way there should not be any overflow during
    // the inverse NTT computation); here we use R = 2^16 so that we can
    // use the 16-bit arithmetic in the vector unit.
5296 //
5297 // On each level, we fill up the vector registers in such a way that the
5298 // array elements that need to be multiplied by the zetas go into one
5299 // set of vector registers while the corresponding ones that don't need to
    // be multiplied go into another set.
5301 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5302 // registers interleaving the steps of 4 identical computations,
5303 // each done on 8 16-bit values per register.
5304
    // At levels 0-3 the coefficients that get multiplied by the zetas, and
    // the ones those products are added to or subtracted from, occur in
    // discrete blocks whose size is some multiple of 32.
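
    // As a scalar sketch, each level applies a Cooley-Tukey butterfly to
    // pairs of coefficient blocks:
    //   t = montmul(zeta, coeffs[j + len]);
    //   coeffs[j + len] = coeffs[j] - t;
    //   coeffs[j]       = coeffs[j] + t;
    // with the vector code below performing 64 such butterflies per step.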
5308
5309 // level 0
5310 __ add(tmpAddr, coeffs, 256);
5311 load64shorts(vs1, tmpAddr);
5312 load64shorts(vs2, zetas);
5313 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5314 __ add(tmpAddr, coeffs, 0);
5315 load64shorts(vs1, tmpAddr);
5316 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5317 vs_addv(vs1, __ T8H, vs1, vs2);
5318 __ add(tmpAddr, coeffs, 0);
5319 vs_stpq_post(vs1, tmpAddr);
5320 __ add(tmpAddr, coeffs, 256);
5321 vs_stpq_post(vs3, tmpAddr);
5322 // restore montmul constants
5323 vs_ldpq(vq, kyberConsts);
5324 load64shorts(vs1, tmpAddr);
5325 load64shorts(vs2, zetas);
5326 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5327 __ add(tmpAddr, coeffs, 128);
5328 load64shorts(vs1, tmpAddr);
5329 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5330 vs_addv(vs1, __ T8H, vs1, vs2);
5331 __ add(tmpAddr, coeffs, 128);
5332 store64shorts(vs1, tmpAddr);
5333 __ add(tmpAddr, coeffs, 384);
5334 store64shorts(vs3, tmpAddr);
5335
5336 // level 1
5337 // restore montmul constants
5338 vs_ldpq(vq, kyberConsts);
5339 __ add(tmpAddr, coeffs, 128);
5340 load64shorts(vs1, tmpAddr);
5341 load64shorts(vs2, zetas);
5342 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5343 __ add(tmpAddr, coeffs, 0);
5344 load64shorts(vs1, tmpAddr);
5345 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5346 vs_addv(vs1, __ T8H, vs1, vs2);
5347 __ add(tmpAddr, coeffs, 0);
5348 store64shorts(vs1, tmpAddr);
5349 store64shorts(vs3, tmpAddr);
5350 vs_ldpq(vq, kyberConsts);
5351 __ add(tmpAddr, coeffs, 384);
5352 load64shorts(vs1, tmpAddr);
5353 load64shorts(vs2, zetas);
5354 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5355 __ add(tmpAddr, coeffs, 256);
5356 load64shorts(vs1, tmpAddr);
5357 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5358 vs_addv(vs1, __ T8H, vs1, vs2);
5359 __ add(tmpAddr, coeffs, 256);
5360 store64shorts(vs1, tmpAddr);
5361 store64shorts(vs3, tmpAddr);
5362
5363 // level 2
5364 vs_ldpq(vq, kyberConsts);
5365 int offsets1[4] = { 0, 32, 128, 160 };
5366 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5367 load64shorts(vs2, zetas);
5368 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5369 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5371 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5372 vs_addv(vs1, __ T8H, vs1, vs2);
5373 __ add(tmpAddr, coeffs, 0);
5374 vs_stpq_post(vs_front(vs1), tmpAddr);
5375 vs_stpq_post(vs_front(vs3), tmpAddr);
5376 vs_stpq_post(vs_back(vs1), tmpAddr);
5377 vs_stpq_post(vs_back(vs3), tmpAddr);
5378 vs_ldpq(vq, kyberConsts);
5379 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5380 load64shorts(vs2, zetas);
5381 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5382 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5384 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5385 vs_addv(vs1, __ T8H, vs1, vs2);
5386 __ add(tmpAddr, coeffs, 256);
5387 vs_stpq_post(vs_front(vs1), tmpAddr);
5388 vs_stpq_post(vs_front(vs3), tmpAddr);
5389 vs_stpq_post(vs_back(vs1), tmpAddr);
5390 vs_stpq_post(vs_back(vs3), tmpAddr);
5391
5392 // level 3
5393 vs_ldpq(vq, kyberConsts);
5394 int offsets2[4] = { 0, 64, 128, 192 };
5395 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5396 load64shorts(vs2, zetas);
5397 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5398 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5399 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5400 vs_addv(vs1, __ T8H, vs1, vs2);
5401 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5402 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5403
5404 vs_ldpq(vq, kyberConsts);
5405 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5406 load64shorts(vs2, zetas);
5407 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5408 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5409 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5410 vs_addv(vs1, __ T8H, vs1, vs2);
5411 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5412 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5413
5414 // level 4
5415 // At level 4 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
5417
5418 vs_ldpq(vq, kyberConsts);
5419 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5420 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5421 load64shorts(vs2, zetas);
5422 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5423 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5424 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5425 vs_addv(vs1, __ T8H, vs1, vs2);
5426 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5427 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5428
5429 vs_ldpq(vq, kyberConsts);
5430 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5431 load64shorts(vs2, zetas);
5432 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5433 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5434 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5435 vs_addv(vs1, __ T8H, vs1, vs2);
5436 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5437 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5438
5439 // level 5
    // At level 5 related coefficients occur in discrete blocks of size 8, so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 2D.
5442
5443 vs_ldpq(vq, kyberConsts);
5444 int offsets4[4] = { 0, 32, 64, 96 };
5445 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5446 load32shorts(vs_front(vs2), zetas);
5447 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5448 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5449 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5450 load32shorts(vs_front(vs2), zetas);
5451 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5452 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5453 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5454 load32shorts(vs_front(vs2), zetas);
5455 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5456 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5457
5458 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5459 load32shorts(vs_front(vs2), zetas);
5460 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5461 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5462
5463 // level 6
    // At level 6 related coefficients occur in discrete blocks of size 4, so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 4S.
5466
5467 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5468 load32shorts(vs_front(vs2), zetas);
5469 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5470 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5471 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5473 load32shorts(vs_front(vs2), zetas);
5474 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5475 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5476
5477 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5478 load32shorts(vs_front(vs2), zetas);
5479 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5480 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5481
5482 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5483 load32shorts(vs_front(vs2), zetas);
5484 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5485 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5486
5487 __ leave(); // required for proper stackwalking of RuntimeStub frame
5488 __ mov(r0, zr); // return 0
5489 __ ret(lr);
5490
5491 return start;
5492 }
5493
5494 // Kyber Inverse NTT function
5495 // Implements
5496 // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
5497 //
5498 // coeffs (short[256]) = c_rarg0
5499 // ntt_zetas (short[256]) = c_rarg1
5500 address generate_kyberInverseNtt() {
5501
5502 __ align(CodeEntryAlignment);
5503 StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
5504 StubCodeMark mark(this, stub_id);
5505 address start = __ pc();
5506 __ enter();
5507
5508 const Register coeffs = c_rarg0;
5509 const Register zetas = c_rarg1;
5510
5511 const Register kyberConsts = r10;
5512 const Register tmpAddr = r11;
5513 const Register tmpAddr2 = c_rarg2;
5514
5515 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5516 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5517 VSeq<2> vq(30); // n.b. constants overlap vs3
5518
5519 __ lea(kyberConsts,
5520 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5521
5522 // level 0
    // At level 0 related coefficients occur in discrete blocks of size 4, so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 4S.
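
    // As a scalar sketch, each level of the inverse transform applies a
    // Gentleman-Sande butterfly:
    //   t               = coeffs[j];
    //   coeffs[j]       = t + coeffs[j + len];
    //   coeffs[j + len] = montmul(zeta, t - coeffs[j + len]);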
5525
5526 vs_ldpq(vq, kyberConsts);
5527 int offsets4[4] = { 0, 32, 64, 96 };
5528 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5529 load32shorts(vs_front(vs2), zetas);
5530 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5531 vs_front(vs2), vs_back(vs2), vtmp, vq);
5532 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5533 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5534 load32shorts(vs_front(vs2), zetas);
5535 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5536 vs_front(vs2), vs_back(vs2), vtmp, vq);
5537 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5538 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5539 load32shorts(vs_front(vs2), zetas);
5540 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5541 vs_front(vs2), vs_back(vs2), vtmp, vq);
5542 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5543 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5544 load32shorts(vs_front(vs2), zetas);
5545 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5546 vs_front(vs2), vs_back(vs2), vtmp, vq);
5547 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5548
5549 // level 1
    // At level 1 related coefficients occur in discrete blocks of size 8, so
    // they need to be loaded interleaved using an ld2 operation with
    // arrangement 2D.
5552
5553 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5554 load32shorts(vs_front(vs2), zetas);
5555 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5556 vs_front(vs2), vs_back(vs2), vtmp, vq);
5557 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5558 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5559 load32shorts(vs_front(vs2), zetas);
5560 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5561 vs_front(vs2), vs_back(vs2), vtmp, vq);
5562 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5563
5564 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5565 load32shorts(vs_front(vs2), zetas);
5566 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5567 vs_front(vs2), vs_back(vs2), vtmp, vq);
5568 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5569 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5570 load32shorts(vs_front(vs2), zetas);
5571 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5572 vs_front(vs2), vs_back(vs2), vtmp, vq);
5573 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5574
5575 // level 2
5576 // At level 2 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
5578
5579 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5580 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5581 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
5582 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5583 vs_subv(vs1, __ T8H, vs1, vs2);
5584 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
5585 load64shorts(vs2, zetas);
5586 vs_ldpq(vq, kyberConsts);
5587 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5588 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
5589
5590 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5591 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5592 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5593 vs_subv(vs1, __ T8H, vs1, vs2);
5594 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
5595 load64shorts(vs2, zetas);
5596 vs_ldpq(vq, kyberConsts);
5597 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5598 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5599
5600 // Barrett reduction at indexes where overflow may happen
5601
5602 // load q and the multiplier for the Barrett reduction
5603 __ add(tmpAddr, kyberConsts, 16);
5604 vs_ldpq(vq, tmpAddr);
5605
5606 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
5607 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
5608 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
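
    // As a scalar sketch of the reduction applied to each 16-bit lane below
    // (vq1 replicating q, vq2 the Barrett multiplier):
    //   t = (2 * a * multiplier) >> 16;   // sqdmulh
    //   t = t >> 11;                      // so t ~= (a * multiplier) >> 26
    //   a = a - t * q;                    // mlsv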
5609 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5610 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5611 vs_sshr(vs2, __ T8H, vs2, 11);
5612 vs_mlsv(vs1, __ T8H, vs2, vq1);
5613 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5614 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5615 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5616 vs_sshr(vs2, __ T8H, vs2, 11);
5617 vs_mlsv(vs1, __ T8H, vs2, vq1);
5618 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5619
5620 // level 3
5621 // From level 3 upwards coefficients occur in discrete blocks whose size is
5622 // some multiple of 32 so can be loaded using ldpq and suitable indexes.
5623
5624 int offsets2[4] = { 0, 64, 128, 192 };
5625 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5626 vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
5627 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5628 vs_subv(vs1, __ T8H, vs1, vs2);
5629 vs_stpq_indexed(vs3, coeffs, 0, offsets2);
5630 load64shorts(vs2, zetas);
5631 vs_ldpq(vq, kyberConsts);
5632 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5633 vs_stpq_indexed(vs2, coeffs, 32, offsets2);
5634
5635 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5636 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5637 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5638 vs_subv(vs1, __ T8H, vs1, vs2);
5639 vs_stpq_indexed(vs3, coeffs, 256, offsets2);
5640 load64shorts(vs2, zetas);
5641 vs_ldpq(vq, kyberConsts);
5642 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5643 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5644
5645 // level 4
5646
5647 int offsets1[4] = { 0, 32, 128, 160 };
5648 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5649 vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
5650 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5651 vs_subv(vs1, __ T8H, vs1, vs2);
5652 vs_stpq_indexed(vs3, coeffs, 0, offsets1);
5653 load64shorts(vs2, zetas);
5654 vs_ldpq(vq, kyberConsts);
5655 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5656 vs_stpq_indexed(vs2, coeffs, 64, offsets1);
5657
5658 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5659 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5660 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5661 vs_subv(vs1, __ T8H, vs1, vs2);
5662 vs_stpq_indexed(vs3, coeffs, 256, offsets1);
5663 load64shorts(vs2, zetas);
5664 vs_ldpq(vq, kyberConsts);
5665 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5666 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5667
5668 // level 5
5669
5670 __ add(tmpAddr, coeffs, 0);
5671 load64shorts(vs1, tmpAddr);
5672 __ add(tmpAddr, coeffs, 128);
5673 load64shorts(vs2, tmpAddr);
5674 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5675 vs_subv(vs1, __ T8H, vs1, vs2);
5676 __ add(tmpAddr, coeffs, 0);
5677 store64shorts(vs3, tmpAddr);
5678 load64shorts(vs2, zetas);
5679 vs_ldpq(vq, kyberConsts);
5680 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5681 __ add(tmpAddr, coeffs, 128);
5682 store64shorts(vs2, tmpAddr);
5683
5684 load64shorts(vs1, tmpAddr);
5685 __ add(tmpAddr, coeffs, 384);
5686 load64shorts(vs2, tmpAddr);
5687 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5688 vs_subv(vs1, __ T8H, vs1, vs2);
5689 __ add(tmpAddr, coeffs, 256);
5690 store64shorts(vs3, tmpAddr);
5691 load64shorts(vs2, zetas);
5692 vs_ldpq(vq, kyberConsts);
5693 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5694 __ add(tmpAddr, coeffs, 384);
5695 store64shorts(vs2, tmpAddr);
5696
5697 // Barrett reduction at indexes where overflow may happen
5698
5699 // load q and the multiplier for the Barrett reduction
5700 __ add(tmpAddr, kyberConsts, 16);
5701 vs_ldpq(vq, tmpAddr);
5702
5703 int offsets0[2] = { 0, 256 };
5704 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5705 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5706 vs_sshr(vs2, __ T8H, vs2, 11);
5707 vs_mlsv(vs1, __ T8H, vs2, vq1);
5708 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5709
5710 // level 6
5711
5712 __ add(tmpAddr, coeffs, 0);
5713 load64shorts(vs1, tmpAddr);
5714 __ add(tmpAddr, coeffs, 256);
5715 load64shorts(vs2, tmpAddr);
5716 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5717 vs_subv(vs1, __ T8H, vs1, vs2);
5718 __ add(tmpAddr, coeffs, 0);
5719 store64shorts(vs3, tmpAddr);
5720 load64shorts(vs2, zetas);
5721 vs_ldpq(vq, kyberConsts);
5722 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5723 __ add(tmpAddr, coeffs, 256);
5724 store64shorts(vs2, tmpAddr);
5725
5726 __ add(tmpAddr, coeffs, 128);
5727 load64shorts(vs1, tmpAddr);
5728 __ add(tmpAddr, coeffs, 384);
5729 load64shorts(vs2, tmpAddr);
5730 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5731 vs_subv(vs1, __ T8H, vs1, vs2);
5732 __ add(tmpAddr, coeffs, 128);
5733 store64shorts(vs3, tmpAddr);
5734 load64shorts(vs2, zetas);
5735 vs_ldpq(vq, kyberConsts);
5736 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5737 __ add(tmpAddr, coeffs, 384);
5738 store64shorts(vs2, tmpAddr);
5739
5740 // multiply by 2^-n
5741
5742 // load toMont(2^-n mod q)
5743 __ add(tmpAddr, kyberConsts, 48);
5744 __ ldr(v29, __ Q, tmpAddr);
5745
5746 vs_ldpq(vq, kyberConsts);
5747 __ add(tmpAddr, coeffs, 0);
5748 load64shorts(vs1, tmpAddr);
5749 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5750 __ add(tmpAddr, coeffs, 0);
5751 store64shorts(vs2, tmpAddr);
5752
    // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
5754 load64shorts(vs1, tmpAddr);
5755 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5756 __ add(tmpAddr, coeffs, 128);
5757 store64shorts(vs2, tmpAddr);
5758
5759 // now tmpAddr contains coeffs + 256
5760 load64shorts(vs1, tmpAddr);
5761 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5762 __ add(tmpAddr, coeffs, 256);
5763 store64shorts(vs2, tmpAddr);
5764
5765 // now tmpAddr contains coeffs + 384
5766 load64shorts(vs1, tmpAddr);
5767 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5768 __ add(tmpAddr, coeffs, 384);
5769 store64shorts(vs2, tmpAddr);
5770
5771 __ leave(); // required for proper stackwalking of RuntimeStub frame
5772 __ mov(r0, zr); // return 0
5773 __ ret(lr);
5774
5775 return start;
5776 }
5777
5778 // Kyber multiply polynomials in the NTT domain.
5779 // Implements
5780 // static int implKyberNttMult(
5781 // short[] result, short[] ntta, short[] nttb, short[] zetas) {}
5782 //
5783 // result (short[256]) = c_rarg0
5784 // ntta (short[256]) = c_rarg1
5785 // nttb (short[256]) = c_rarg2
5786 // zetas (short[128]) = c_rarg3
5787 address generate_kyberNttMult() {
5788
5789 __ align(CodeEntryAlignment);
5790 StubId stub_id = StubId::stubgen_kyberNttMult_id;
5791 StubCodeMark mark(this, stub_id);
5792 address start = __ pc();
5793 __ enter();
5794
5795 const Register result = c_rarg0;
5796 const Register ntta = c_rarg1;
5797 const Register nttb = c_rarg2;
5798 const Register zetas = c_rarg3;
5799
5800 const Register kyberConsts = r10;
5801 const Register limit = r11;
5802
5803 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
5804 VSeq<4> vs3(16), vs4(20);
5805 VSeq<2> vq(30); // pair of constants for montmul: q, qinv
5806 VSeq<2> vz(28); // pair of zetas
5807 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
5808
5809 __ lea(kyberConsts,
5810 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5811
5812 Label kyberNttMult_loop;
5813
5814 __ add(limit, result, 512);
5815
5816 // load q and qinv
5817 vs_ldpq(vq, kyberConsts);
5818
5819 // load R^2 mod q (to convert back from Montgomery representation)
5820 __ add(kyberConsts, kyberConsts, 64);
5821 __ ldr(v27, __ Q, kyberConsts);
5822
5823 __ BIND(kyberNttMult_loop);
5824
5825 // load 16 zetas
5826 vs_ldpq_post(vz, zetas);
5827
5828 // load 2 sets of 32 coefficients from the two input arrays
    // interleaved as shorts, i.e. pairs of shorts adjacent in memory
5830 // are striped across pairs of vector registers
5831 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
5832 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
5833 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
5834 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
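
    // For reference, each pair of output coefficients follows the Kyber
    // base-case (degree-1) multiplication in the NTT domain:
    //   r0 = a0*b0 + a1*b1*zeta
    //   r1 = a0*b1 + a1*b0
    // with every product Montgomery-reduced; the montmuls and additions
    // below assemble these terms 8 lanes per register at a time.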
5835
5836 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
5837 // i.e. montmul the first and second halves of vs1 in order and
5838 // then with one sequence reversed storing the two results in vs3
5839 //
5840 // vs3[0] <- montmul(a0, b0)
5841 // vs3[1] <- montmul(a1, b1)
5842 // vs3[2] <- montmul(a0, b1)
5843 // vs3[3] <- montmul(a1, b0)
5844 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
5845 kyber_montmul16(vs_back(vs3),
5846 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
5847
5848 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
5849 // i.e. montmul the first and second halves of vs4 in order and
5850 // then with one sequence reversed storing the two results in vs1
5851 //
5852 // vs1[0] <- montmul(a2, b2)
5853 // vs1[1] <- montmul(a3, b3)
5854 // vs1[2] <- montmul(a2, b3)
5855 // vs1[3] <- montmul(a3, b2)
5856 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
5857 kyber_montmul16(vs_back(vs1),
5858 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
5859
    // montmul the second result of each cross-product, i.e. (a1*b1, a3*b3),
    // by a zeta.
5861 // We can schedule two montmuls at a time if we use a suitable vector
5862 // sequence <vs3[1], vs1[1]>.
5863 int delta = vs1[1]->encoding() - vs3[1]->encoding();
5864 VSeq<2> vs5(vs3[1], delta);
5865
5866 // vs3[1] <- montmul(montmul(a1, b1), z0)
5867 // vs1[1] <- montmul(montmul(a3, b3), z1)
5868 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
5869
5870 // add results in pairs storing in vs3
5871 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
5872 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
5873 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
5874
5875 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
5876 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
5877 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
5878
5879 // vs1 <- montmul(vs3, montRSquareModQ)
5880 kyber_montmul32(vs1, vs3, vc, vs2, vq);
5881
5882 // store back the two pairs of result vectors de-interleaved as 8H elements
    // i.e. storing each pair of shorts striped across a register pair adjacent
5884 // in memory
5885 vs_st2_post(vs1, __ T8H, result);
5886
5887 __ cmp(result, limit);
5888 __ br(Assembler::NE, kyberNttMult_loop);
5889
5890 __ leave(); // required for proper stackwalking of RuntimeStub frame
5891 __ mov(r0, zr); // return 0
5892 __ ret(lr);
5893
5894 return start;
5895 }
5896
5897 // Kyber add 2 polynomials.
5898 // Implements
5899 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
5900 //
5901 // result (short[256]) = c_rarg0
5902 // a (short[256]) = c_rarg1
5903 // b (short[256]) = c_rarg2
5904 address generate_kyberAddPoly_2() {
5905
5906 __ align(CodeEntryAlignment);
5907 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
5908 StubCodeMark mark(this, stub_id);
5909 address start = __ pc();
5910 __ enter();
5911
5912 const Register result = c_rarg0;
5913 const Register a = c_rarg1;
5914 const Register b = c_rarg2;
5915
5916 const Register kyberConsts = r11;
5917
5918 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
5919 // So, we can load, add and store the data in 3 groups of 11,
5920 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
5921 // registers. A further constraint is that the mapping needs
5922 // to skip callee saves. So, we allocate the register
5923 // sequences using two 8 sequences, two 2 sequences and two
5924 // single registers.
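    // As a concrete tally: the first two loop iterations below each handle
    // 11 quadwords (an 8 sequence + a 2 sequence + 1 single register) and
    // the last handles 10, i.e. 32 quadwords = 256 shorts in total.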
5925 VSeq<8> vs1_1(0);
5926 VSeq<2> vs1_2(16);
5927 FloatRegister vs1_3 = v28;
5928 VSeq<8> vs2_1(18);
5929 VSeq<2> vs2_2(26);
5930 FloatRegister vs2_3 = v29;
5931
5932 // two constant vector sequences
5933 VSeq<8> vc_1(31, 0);
5934 VSeq<2> vc_2(31, 0);
5935
5936 FloatRegister vc_3 = v31;
5937 __ lea(kyberConsts,
5938 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5939
5940 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
5941 for (int i = 0; i < 3; i++) {
5942 // load 80 or 88 values from a into vs1_1/2/3
5943 vs_ldpq_post(vs1_1, a);
5944 vs_ldpq_post(vs1_2, a);
5945 if (i < 2) {
5946 __ ldr(vs1_3, __ Q, __ post(a, 16));
5947 }
5948 // load 80 or 88 values from b into vs2_1/2/3
5949 vs_ldpq_post(vs2_1, b);
5950 vs_ldpq_post(vs2_2, b);
5951 if (i < 2) {
5952 __ ldr(vs2_3, __ Q, __ post(b, 16));
5953 }
5954 // sum 80 or 88 values across vs1 and vs2 into vs1
5955 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
5956 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
5957 if (i < 2) {
5958 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
5959 }
5960 // add constant to all 80 or 88 results
5961 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
5962 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
5963 if (i < 2) {
5964 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
5965 }
5966 // store 80 or 88 values
5967 vs_stpq_post(vs1_1, result);
5968 vs_stpq_post(vs1_2, result);
5969 if (i < 2) {
5970 __ str(vs1_3, __ Q, __ post(result, 16));
5971 }
5972 }
5973
5974 __ leave(); // required for proper stackwalking of RuntimeStub frame
5975 __ mov(r0, zr); // return 0
5976 __ ret(lr);
5977
5978 return start;
5979 }
5980
5981 // Kyber add 3 polynomials.
5982 // Implements
5983 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
5984 //
5985 // result (short[256]) = c_rarg0
5986 // a (short[256]) = c_rarg1
5987 // b (short[256]) = c_rarg2
5988 // c (short[256]) = c_rarg3
5989 address generate_kyberAddPoly_3() {
5990
5991 __ align(CodeEntryAlignment);
5992 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
5993 StubCodeMark mark(this, stub_id);
5994 address start = __ pc();
5995 __ enter();
5996
5997 const Register result = c_rarg0;
5998 const Register a = c_rarg1;
5999 const Register b = c_rarg2;
6000 const Register c = c_rarg3;
6001
6002 const Register kyberConsts = r11;
6003
6004 // As above we sum 256 sets of values in total i.e. 32 x 8H
6005 // quadwords. So, we can load, add and store the data in 3
6006 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6007 // of 10 or 11 registers. A further constraint is that the
6008 // mapping needs to skip callee saves. So, we allocate the
6009 // register sequences using two 8 sequences, two 2 sequences
6010 // and two single registers.
6011 VSeq<8> vs1_1(0);
6012 VSeq<2> vs1_2(16);
6013 FloatRegister vs1_3 = v28;
6014 VSeq<8> vs2_1(18);
6015 VSeq<2> vs2_2(26);
6016 FloatRegister vs2_3 = v29;
6017
6018 // two constant vector sequences
6019 VSeq<8> vc_1(31, 0);
6020 VSeq<2> vc_2(31, 0);
6021
6022 FloatRegister vc_3 = v31;
6023
6024 __ lea(kyberConsts,
6025 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6026
6027 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6028 for (int i = 0; i < 3; i++) {
6029 // load 80 or 88 values from a into vs1_1/2/3
6030 vs_ldpq_post(vs1_1, a);
6031 vs_ldpq_post(vs1_2, a);
6032 if (i < 2) {
6033 __ ldr(vs1_3, __ Q, __ post(a, 16));
6034 }
6035 // load 80 or 88 values from b into vs2_1/2/3
6036 vs_ldpq_post(vs2_1, b);
6037 vs_ldpq_post(vs2_2, b);
6038 if (i < 2) {
6039 __ ldr(vs2_3, __ Q, __ post(b, 16));
6040 }
6041 // sum 80 or 88 values across vs1 and vs2 into vs1
6042 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6043 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6044 if (i < 2) {
6045 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6046 }
6047 // load 80 or 88 values from c into vs2_1/2/3
6048 vs_ldpq_post(vs2_1, c);
6049 vs_ldpq_post(vs2_2, c);
6050 if (i < 2) {
6051 __ ldr(vs2_3, __ Q, __ post(c, 16));
6052 }
6053 // sum 80 or 88 values across vs1 and vs2 into vs1
6054 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6055 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6056 if (i < 2) {
6057 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6058 }
6059 // add constant to all 80 or 88 results
6060 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6061 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6062 if (i < 2) {
6063 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6064 }
6065 // store 80 or 88 values
6066 vs_stpq_post(vs1_1, result);
6067 vs_stpq_post(vs1_2, result);
6068 if (i < 2) {
6069 __ str(vs1_3, __ Q, __ post(result, 16));
6070 }
6071 }
6072
6073 __ leave(); // required for proper stackwalking of RuntimeStub frame
6074 __ mov(r0, zr); // return 0
6075 __ ret(lr);
6076
6077 return start;
6078 }
6079
6080 // Kyber parse XOF output to polynomial coefficient candidates
6081 // or decodePoly(12, ...).
6082 // Implements
6083 // static int implKyber12To16(
6084 // byte[] condensed, int index, short[] parsed, int parsedLength) {}
6085 //
6086 // (parsedLength or (parsedLength - 48) must be divisible by 64.)
6087 //
6088 // condensed (byte[]) = c_rarg0
6089 // condensedIndex = c_rarg1
6090 // parsed (short[112 or 256]) = c_rarg2
6091 // parsedLength (112 or 256) = c_rarg3
6092 address generate_kyber12To16() {
6093 Label L_F00, L_loop, L_end;
6094
6095 __ align(CodeEntryAlignment);
6096 StubId stub_id = StubId::stubgen_kyber12To16_id;
6097 StubCodeMark mark(this, stub_id);
6098 address start = __ pc();
6099 __ enter();
6100
6101 const Register condensed = c_rarg0;
6102 const Register condensedOffs = c_rarg1;
6103 const Register parsed = c_rarg2;
6104 const Register parsedLength = c_rarg3;
6105
6106 const Register tmpAddr = r11;
6107
6108 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
6109 // quadwords so we need a 6 vector sequence for the inputs.
6110 // Parsing produces 64 shorts, employing two 8 vector
6111 // sequences to store and combine the intermediate data.
6112 VSeq<6> vin(24);
6113 VSeq<8> va(0), vb(16);
6114
6115 __ adr(tmpAddr, L_F00);
6116 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6117 __ add(condensed, condensed, condensedOffs);
6118
6119 __ BIND(L_loop);
6120 // load 96 (6 x 16B) byte values
6121 vs_ld3_post(vin, __ T16B, condensed);
6122
6123 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6124 // holds 48 (16x3) contiguous bytes from memory striped
6125 // horizontally across each of the 16 byte lanes. Equivalently,
6126 // that is 16 pairs of 12-bit integers. Likewise the back half
6127 // holds the next 48 bytes in the same arrangement.
6128
6129 // Each vector in the front half can also be viewed as a vertical
6130 // strip across the 16 pairs of 12 bit integers. Each byte in
6131 // vin[0] stores the low 8 bits of the first int in a pair. Each
6132 // byte in vin[1] stores the high 4 bits of the first int and the
6133 // low 4 bits of the second int. Each byte in vin[2] stores the
6134 // high 8 bits of the second int. Likewise the vectors in second
6135 // half.
6136
6137 // Converting the data to 16-bit shorts requires first of all
6138 // expanding each of the 6 x 16B vectors into 6 corresponding
6139 // pairs of 8H vectors. Mask, shift and add operations on the
6140 // resulting vector pairs can be used to combine 4 and 8 bit
6141 // parts of related 8H vector elements.
6142 //
    // The middle vectors (vin[1] and vin[4]) are actually expanded
6144 // twice, one copy manipulated to provide the lower 4 bits
6145 // belonging to the first short in a pair and another copy
6146 // manipulated to provide the higher 4 bits belonging to the
    // second short in a pair. This is why the vector sequences va
6148 // and vb used to hold the expanded 8H elements are of length 8.
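
    // As a concrete sketch of the packing, each 3-byte group encodes two
    // 12-bit values p0 and p1 as
    //   byte0 = p0 & 0xff
    //   byte1 = ((p0 >> 8) & 0xf) | ((p1 & 0xf) << 4)
    //   byte2 = (p1 >> 4) & 0xff
    // so the shorts recovered below are
    //   p0 = byte0 | ((byte1 & 0xf) << 8)
    //   p1 = (byte1 >> 4) | (byte2 << 4)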
6149
6150 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6151 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6152 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6153 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6154 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6155 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6156 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6157 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6158
6159 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6160 // and vb[4:5]
6161 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6162 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6163 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6164 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6165 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6166 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6167
6168 // shift lo byte of copy 1 of the middle stripe into the high byte
6169 __ shl(va[2], __ T8H, va[2], 8);
6170 __ shl(va[3], __ T8H, va[3], 8);
6171 __ shl(vb[2], __ T8H, vb[2], 8);
6172 __ shl(vb[3], __ T8H, vb[3], 8);
6173
6174 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6175 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6176 // are in bit positions [4..11].
6177 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6178 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6179 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6180 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
6181
6182 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
6183 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
6184 // copy2
6185 __ andr(va[2], __ T16B, va[2], v31);
6186 __ andr(va[3], __ T16B, va[3], v31);
6187 __ ushr(va[4], __ T8H, va[4], 4);
6188 __ ushr(va[5], __ T8H, va[5], 4);
6189 __ andr(vb[2], __ T16B, vb[2], v31);
6190 __ andr(vb[3], __ T16B, vb[3], v31);
6191 __ ushr(vb[4], __ T8H, vb[4], 4);
6192 __ ushr(vb[5], __ T8H, vb[5], 4);
6193
6194 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
6195 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
6196 // n.b. the ordering ensures: i) inputs are consumed before they
6197 // are overwritten ii) the order of 16-bit results across successive
6198 // pairs of vectors in va and then vb reflects the order of the
6199 // corresponding 12-bit inputs
6200 __ addv(va[0], __ T8H, va[0], va[2]);
6201 __ addv(va[2], __ T8H, va[1], va[3]);
6202 __ addv(va[1], __ T8H, va[4], va[6]);
6203 __ addv(va[3], __ T8H, va[5], va[7]);
6204 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6205 __ addv(vb[2], __ T8H, vb[1], vb[3]);
6206 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6207 __ addv(vb[3], __ T8H, vb[5], vb[7]);
6208
6209 // store 64 results interleaved as shorts
6210 vs_st2_post(vs_front(va), __ T8H, parsed);
6211 vs_st2_post(vs_front(vb), __ T8H, parsed);
6212
6213 __ sub(parsedLength, parsedLength, 64);
6214 __ cmp(parsedLength, (u1)64);
6215 __ br(Assembler::GE, L_loop);
6216 __ cbz(parsedLength, L_end);
6217
6218 // if anything is left it should be a final 72 bytes of input
6219 // i.e. the final 48 12-bit values. So we handle this by loading
6220 // 48 bytes into all 16B lanes of front(vin) and only 24
6221 // bytes into the lower 8B halves of back(vin)
6222 vs_ld3_post(vs_front(vin), __ T16B, condensed);
6223 vs_ld3(vs_back(vin), __ T8B, condensed);
6224
6225 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6226 // n.b. target elements 2 and 3 of va duplicate elements 4 and
6227 // 5 and target element 2 of vb duplicates element 4.
6228 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6229 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6230 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6231 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6232 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6233 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6234
6235 // This time expand just the lower 8 lanes
6236 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6237 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6238 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6239
6240 // shift lo byte of copy 1 of the middle stripe into the high byte
6241 __ shl(va[2], __ T8H, va[2], 8);
6242 __ shl(va[3], __ T8H, va[3], 8);
6243 __ shl(vb[2], __ T8H, vb[2], 8);
6244
6245 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
6246 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
6247 // int are in bit positions [4..11].
6248 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6249 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6250 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6251
6252 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
6253 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
6254 // copy2
6255 __ andr(va[2], __ T16B, va[2], v31);
6256 __ andr(va[3], __ T16B, va[3], v31);
6257 __ ushr(va[4], __ T8H, va[4], 4);
6258 __ ushr(va[5], __ T8H, va[5], 4);
6259 __ andr(vb[2], __ T16B, vb[2], v31);
6260 __ ushr(vb[4], __ T8H, vb[4], 4);
6261
6262
6263
6264 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
6265 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
6266
6267 // n.b. ordering ensures: i) inputs are consumed before they are
6268 // overwritten ii) order of 16-bit results across successive
6269 // pairs of vectors in va and then lower half of vb reflects order
6270 // of corresponding 12-bit inputs
6271 __ addv(va[0], __ T8H, va[0], va[2]);
6272 __ addv(va[2], __ T8H, va[1], va[3]);
6273 __ addv(va[1], __ T8H, va[4], va[6]);
6274 __ addv(va[3], __ T8H, va[5], va[7]);
6275 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6276 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6277
6278 // store 48 results interleaved as shorts
6279 vs_st2_post(vs_front(va), __ T8H, parsed);
6280 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
6281
6282 __ BIND(L_end);
6283
6284 __ leave(); // required for proper stackwalking of RuntimeStub frame
6285 __ mov(r0, zr); // return 0
6286 __ ret(lr);
6287
6288 // bind label and generate constant data used by this stub
6289 __ BIND(L_F00);
6290 __ emit_int64(0x0f000f000f000f00);
6291 __ emit_int64(0x0f000f000f000f00);
6292
6293 return start;
6294 }
6295
6296 // Kyber Barrett reduce function.
6297 // Implements
6298 // static int implKyberBarrettReduce(short[] coeffs) {}
6299 //
6300 // coeffs (short[256]) = c_rarg0
6301 address generate_kyberBarrettReduce() {
6302
6303 __ align(CodeEntryAlignment);
6304 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6305 StubCodeMark mark(this, stub_id);
6306 address start = __ pc();
6307 __ enter();
6308
6309 const Register coeffs = c_rarg0;
6310
6311 const Register kyberConsts = r10;
6312 const Register result = r11;
6313
6314 // As above we process a total of 256 values i.e. 32 x
6315 // 8H quadwords. So, we can load, reduce and store the data in 3
6316 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6317 // of 10 or 11 registers. A further constraint is that the
6318 // mapping needs to skip callee saves. So, we allocate the
6319 // register sequences using two 8 sequences, two 2 sequences
6320 // and two single registers.
6321 VSeq<8> vs1_1(0);
6322 VSeq<2> vs1_2(16);
6323 FloatRegister vs1_3 = v28;
6324 VSeq<8> vs2_1(18);
6325 VSeq<2> vs2_2(26);
6326 FloatRegister vs2_3 = v29;
6327
6328 // we also need a pair of corresponding constant sequences
6329
6330 VSeq<8> vc1_1(30, 0);
6331 VSeq<2> vc1_2(30, 0);
6332 FloatRegister vc1_3 = v30; // for kyber_q
6333
6334 VSeq<8> vc2_1(31, 0);
6335 VSeq<2> vc2_2(31, 0);
6336 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
6337
6338 __ add(result, coeffs, 0);
6339 __ lea(kyberConsts,
6340 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6341
6342 // load q and the multiplier for the Barrett reduction
6343 __ add(kyberConsts, kyberConsts, 16);
6344 __ ldpq(vc1_3, vc2_3, kyberConsts);
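// For reference, a scalar sketch of the per-lane Barrett reduction the
// loop below performs. The constants are assumed to match the reference
// Kyber values (q = 3329, multiplier = floor((2^26 + q/2) / q) = 20159);
// treat them as illustrative rather than values read from this file:
//   short barrett_reduce(short a) {
//     int t = (int)(((int64_t)a * 20159) >> 26);
//     return (short)(a - t * 3329);
//   }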
6345
6346 for (int i = 0; i < 3; i++) {
6347 // load 80 or 88 coefficients
6348 vs_ldpq_post(vs1_1, coeffs);
6349 vs_ldpq_post(vs1_2, coeffs);
6350 if (i < 2) {
6351 __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
6352 }
6353
6354 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
6355 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
6356 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
6357 if (i < 2) {
6358 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
6359 }
6360
6361 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
6362 vs_sshr(vs2_1, __ T8H, vs2_1, 11);
6363 vs_sshr(vs2_2, __ T8H, vs2_2, 11);
6364 if (i < 2) {
6365 __ sshr(vs2_3, __ T8H, vs2_3, 11);
6366 }
6367
6368 // vs1 <- vs1 - vs2 * kyber_q
6369 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
6370 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
6371 if (i < 2) {
6372 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
6373 }
6374
6375 vs_stpq_post(vs1_1, result);
6376 vs_stpq_post(vs1_2, result);
6377 if (i < 2) {
6378 __ str(vs1_3, __ Q, __ post(result, 16));
6379 }
6380 }
6381
6382 __ leave(); // required for proper stackwalking of RuntimeStub frame
6383 __ mov(r0, zr); // return 0
6384 __ ret(lr);
6385
6386 return start;
6387 }
6388
6389
6390 // Dilithium-specific montmul helper routines that generate parallel
6391 // code for, respectively, a single 4x4s vector sequence montmul or
6392 // two such multiplies in a row.
6393
6394 // Perform 16 32-bit Montgomery multiplications in parallel
6395 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
6396 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6397 // Use the helper routine to schedule a 4x4S Montgomery multiply.
6398 // It will assert that the register use is valid
6399 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
6400 }
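// For reference, a scalar sketch of one 32-bit Montgomery multiply
// modulo the Dilithium q (a sketch only; q = 8380417 and
// qinv = q^-1 mod 2^32 = 58728449 are the standard reference constants
// and are assumed here, not read from the constants table):
//   int32_t montmul(int32_t b, int32_t c) {
//     int64_t a = (int64_t)b * c;
//     int32_t t = (int32_t)a * 58728449;              // a * q^-1 mod 2^32
//     return (int32_t)((a - (int64_t)t * 8380417) >> 32);
//   }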
6401
6402 // Perform 2x16 32-bit Montgomery multiplications in parallel
6403 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
6404 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6405 // Schedule two successive 4x4S multiplies via the montmul helper
6406 // on the front and back halves of va, vb and vc. The helper will
6407 // assert that the register use has no overlap conflicts on each
6408 // individual call but we also need to ensure that the necessary
6409 // disjoint/equality constraints are met across both calls.
6410
6411 // vb, vc, vtmp and vq must be disjoint. va must either be
6412 // disjoint from all other registers or equal vc
6413
6414 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6415 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6416 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6417
6418 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6419 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6420
6421 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6422
6423 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6424 assert(vs_disjoint(va, vb), "va and vb overlap");
6425 assert(vs_disjoint(va, vq), "va and vq overlap");
6426 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6427
6428 // We multiply the front and back halves of each sequence 4 at a
6429 // time because
6430 //
6431 // 1) we are currently only able to get 4-way instruction
6432 // parallelism at best
6433 //
6434 // 2) we need registers for the constants in vq and temporary
6435 // scratch registers to hold intermediate results so vtmp can only
6436 // be a VSeq<4> which means we only have 4 scratch slots.
6437
6438 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6439 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6440 }
6441
6442 // Perform combined montmul then add/sub on 4x4S vectors.
6443 void dilithium_montmul16_sub_add(
6444 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6445 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6446 // compute a = montmul(a1, c)
6447 dilithium_montmul16(vc, va1, vc, vtmp, vq);
6448 // output a1 = a0 - a
6449 vs_subv(va1, __ T4S, va0, vc);
6450 // and a0 = a0 + a
6451 vs_addv(va0, __ T4S, va0, vc);
6452 }
6453
6454 // Perform combined add/sub then montmul on 4x4S vectors.
6455 void dilithium_sub_add_montmul16(
6456 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6457 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6458 // compute c = a0 - a1
6459 vs_subv(vtmp1, __ T4S, va0, va1);
6460 // output a0 = a0 + a1
6461 vs_addv(va0, __ T4S, va0, va1);
6462 // output a1 = b montmul c
6463 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6464 }
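// For reference, the two helpers above implement the scalar NTT
// butterflies (a sketch; montmul is the Montgomery multiply described
// above):
//   forward (dilithium_montmul16_sub_add):
//     t = montmul(zeta, a1);  a1 = a0 - t;  a0 = a0 + t;
//   inverse (dilithium_sub_add_montmul16):
//     t = a0 - a1;  a0 = a0 + a1;  a1 = montmul(zeta, t);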
6465
6466 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6467 // in the Java implementation come in sequences of at least 8, so we
6468 // can use ldpq to collect the corresponding data into pairs of vector
6469 // registers.
6470 // We collect the coefficients corresponding to the 'j+l' indexes into
6471 // the vector registers v0-v7, the zetas into the vector registers v16-v23
6472 // then we do the (Montgomery) multiplications by the zetas in parallel
6473 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6474 // v0-v7, then do the additions into v24-v31 and the subtractions into
6475 // v0-v7 and finally save the results back to the coeffs array.
6476 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6477 const Register coeffs, const Register zetas) {
6478 int c1 = 0;
6479 int c2 = 512;
6480 int startIncr;
6481 // don't use callee save registers v8 - v15
6482 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6483 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6484 VSeq<2> vq(30); // n.b. constants overlap vs3
6485 int offsets[4] = { 0, 32, 64, 96 };
6486
6487 for (int level = 0; level < 5; level++) {
6488 int c1Start = c1;
6489 int c2Start = c2;
6490 if (level == 3) {
6491 offsets[1] = 32;
6492 offsets[2] = 128;
6493 offsets[3] = 160;
6494 } else if (level == 4) {
6495 offsets[1] = 64;
6496 offsets[2] = 128;
6497 offsets[3] = 192;
6498 }
6499
6500 // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
6501 // time at 4 different offsets and multiply them in order by the
6502 // next set of input values. So we employ indexed load and store
6503 // pair instructions with arrangement 4S.
6504 for (int i = 0; i < 4; i++) {
6505 // reload q and qinv
6506 vs_ldpq(vq, dilithiumConsts); // qInv, q
6507 // load 8x4S coefficients via second start pos == c2
6508 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
6509 // load next 8x4S inputs == b
6510 vs_ldpq_post(vs2, zetas);
6511 // compute a == c2 * b mod MONT_Q
6512 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6513 // load 8x4s coefficients via first start pos == c1
6514 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6515 // compute a1 = c1 + a
6516 vs_addv(vs3, __ T4S, vs1, vs2);
6517 // compute a2 = c1 - a
6518 vs_subv(vs1, __ T4S, vs1, vs2);
6519 // output a1 and a2
6520 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6521 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
6522
6523 int k = 4 * level + i;
6524
6525 if (k > 7) {
6526 startIncr = 256;
6527 } else if (k == 5) {
6528 startIncr = 384;
6529 } else {
6530 startIncr = 128;
6531 }
6532
6533 c1Start += startIncr;
6534 c2Start += startIncr;
6535 }
6536
6537 c2 /= 2;
6538 }
6539 }
6540
6541 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
6542 // Implements the method
6543 // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
6544 // of the sun.security.provider.ML_DSA class.
6545 //
6546 // coeffs (int[256]) = c_rarg0
6547 // zetas (int[256]) = c_rarg1
6548 address generate_dilithiumAlmostNtt() {
6549
6550 __ align(CodeEntryAlignment);
6551 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
6552 StubCodeMark mark(this, stub_id);
6553 address start = __ pc();
6554 __ enter();
6555
6556 const Register coeffs = c_rarg0;
6557 const Register zetas = c_rarg1;
6558
6559 const Register tmpAddr = r9;
6560 const Register dilithiumConsts = r10;
6561 const Register result = r11;
6562 // don't use callee save registers v8 - v15
6563 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6564 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6565 VSeq<2> vq(30); // n.b. constants overlap vs3
6566 int offsets[4] = { 0, 32, 64, 96};
6567 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6568 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6569 __ add(result, coeffs, 0);
6570 __ lea(dilithiumConsts,
6571 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6572
6573 // Each level represents one iteration of the outer for loop of the Java version.
6574
6575 // level 0-4
6576 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
6577
6578 // level 5
6579
6580 // At level 5 the coefficients we need to combine with the zetas
6581 // are grouped in memory in blocks of size 4. So, for both sets of
6582 // coefficients we load 4 adjacent values at 8 different offsets
6583 // using an indexed ldr with register variant Q and multiply them
6584 // in sequence order by the next set of inputs. Likewise we store
6585 // the results using an indexed str with register variant Q.
6586 for (int i = 0; i < 1024; i += 256) {
6587 // reload constants q, qinv each iteration as they get clobbered later
6588 vs_ldpq(vq, dilithiumConsts); // qInv, q
6589 // load 32 (8x4S) coefficients via first offsets = c1
6590 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6591 // load next 32 (8x4S) inputs = b
6592 vs_ldpq_post(vs2, zetas);
6593 // a = b montmul c1
6594 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6595 // load 32 (8x4S) coefficients via second offsets = c2
6596 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
6597 // add/sub with result of multiply
6598 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a
6599 vs_subv(vs1, __ T4S, vs1, vs2); // a2 = c2 - a
6600 // write back new coefficients using same offsets
6601 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
6602 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
6603 }
6604
6605 // level 6
6606 // At level 6 the coefficients we need to combine with the zetas
6607 // are grouped in memory in pairs, the first pair being add/sub
6608 // inputs and the second pair montmul inputs. We can still implement
6609 // the montmul+sub+add using 4-way parallelism but only if we
6610 // combine the coefficients with the zetas 16 at a time. We load 8
6611 // adjacent values at 4 different offsets using an ld2 load with
6612 // arrangement 2D. That interleaves the lower and upper halves of
6613 // each pair of quadwords into successive vector registers. We
6614 // then need to montmul the 4 odd elements of the coefficients
6615 // register sequence by the zetas in order and then add/sub the 4
6616 // even elements of the coefficients register sequence. We use an
6617 // equivalent st2 operation to store the results back into memory
6618 // de-interleaved.
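// For reference (an illustration of the AArch64 semantics relied on
// here): ld2 {v0.2d, v1.2d}, [x] reads four doublewords d0..d3 and
// de-interleaves them as v0 = {d0, d2}, v1 = {d1, d3}, so the first
// register of each pair collects the add/sub inputs and the second the
// montmul inputs; st2 performs the inverse re-interleaving on store.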
6619 for (int i = 0; i < 1024; i += 128) {
6620 // reload constants q, qinv each iteration as they get clobbered later
6621 vs_ldpq(vq, dilithiumConsts); // qInv, q
6622 // load interleaved 16 (4x2D) coefficients via offsets
6623 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6624 // load next 16 (4x4S) inputs
6625 vs_ldpq_post(vs_front(vs2), zetas);
6626 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6627 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6628 vs_front(vs2), vtmp, vq);
6629 // store interleaved 16 (4x2D) coefficients via offsets
6630 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6631 }
6632
6633 // level 7
6634 // At level 7 the coefficients we need to combine with the zetas
6635 // occur singly with montmul inputs alternating with add/sub
6636 // inputs. Once again we can use 4-way parallelism to combine 16
6637 // zetas at a time. However, we have to load 8 adjacent values at
6638 // 4 different offsets using an ld2 load with arrangement 4S. That
6639 // interleaves the even words of each pair into one
6640 // coefficients vector register and the odd words of the pair
6641 // into the next register. We then need to montmul the 4 odd
6642 // elements of the coefficients register sequence by the zetas in
6643 // order and then add/sub the 4 even elements of the coefficients
6644 // register sequence. We use an equivalent st2 operation to store
6645 // the results back into memory de-interleaved.
6646
6647 for (int i = 0; i < 1024; i += 128) {
6648 // reload constants q, qinv each iteration as they get clobbered later
6649 vs_ldpq(vq, dilithiumConsts); // qInv, q
6650 // load interleaved 16 (4x4S) coefficients via offsets
6651 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6652 // load next 16 (4x4S) inputs
6653 vs_ldpq_post(vs_front(vs2), zetas);
6654 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6655 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6656 vs_front(vs2), vtmp, vq);
6657 // store interleaved 16 (4x4S) coefficients via offsets
6658 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6659 }
6660 __ leave(); // required for proper stackwalking of RuntimeStub frame
6661 __ mov(r0, zr); // return 0
6662 __ ret(lr);
6663
6664 return start;
6665 }
6666
6667 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6668 // in the Java implementation come in sequences of at least 8, so we
6669 // can use ldpq to collect the corresponding data into pairs of vector
6670 // registers
6671 // We collect the coefficients that correspond to the 'j's into vs1
6672 // the coefficients that correspond to the 'j+l's into vs2 then
6673 // do the additions into vs3 and the subtractions into vs1 then
6674 // save the result of the additions, load the zetas into vs2
6675 // do the (Montgomery) multiplications by zeta in parallel into vs2
6676 // finally save the results back to the coeffs array
6677 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
6678 const Register coeffs, const Register zetas) {
6679 int c1 = 0;
6680 int c2 = 32;
6681 int startIncr;
6682 int offsets[4];
6683 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6684 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6685 VSeq<2> vq(30); // n.b. constants overlap vs3
6686
6687 offsets[0] = 0;
6688
6689 for (int level = 3; level < 8; level++) {
6690 int c1Start = c1;
6691 int c2Start = c2;
6692 if (level == 3) {
6693 offsets[1] = 64;
6694 offsets[2] = 128;
6695 offsets[3] = 192;
6696 } else if (level == 4) {
6697 offsets[1] = 32;
6698 offsets[2] = 128;
6699 offsets[3] = 160;
6700 } else {
6701 offsets[1] = 32;
6702 offsets[2] = 64;
6703 offsets[3] = 96;
6704 }
6705
6706 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
6707 // time at 4 different offsets and multiply them in order by the
6708 // next set of input values. So we employ indexed load and store
6709 // pair instructions with arrangement 4S.
6710 for (int i = 0; i < 4; i++) {
6711 // load v1 32 (8x4S) coefficients relative to first start index
6712 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6713 // load v2 32 (8x4S) coefficients relative to second start index
6714 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
6715 // a0 = v1 + v2 -- n.b. clobbers vq
6716 vs_addv(vs3, __ T4S, vs1, vs2);
6717 // a1 = v1 - v2
6718 vs_subv(vs1, __ T4S, vs1, vs2);
6719 // save a0 relative to first start index
6720 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6721 // load constants q, qinv each iteration as they get clobbered above
6722 vs_ldpq(vq, dilithiumConsts); // qInv, q
6723 // load b next 32 (8x4S) inputs
6724 vs_ldpq_post(vs2, zetas);
6725 // a = a1 montmul b
6726 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6727 // save a relative to second start index
6728 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
6729
6730 int k = 4 * level + i;
6731
6732 if (k < 24) {
6733 startIncr = 256;
6734 } else if (k == 25) {
6735 startIncr = 384;
6736 } else {
6737 startIncr = 128;
6738 }
6739
6740 c1Start += startIncr;
6741 c2Start += startIncr;
6742 }
6743
6744 c2 *= 2;
6745 }
6746 }
6747
6748 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
6749 // Implements the method
6750 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
6751 // the sun.security.provider.ML_DSA class.
6752 //
6753 // coeffs (int[256]) = c_rarg0
6754 // zetas (int[256]) = c_rarg1
6755 address generate_dilithiumAlmostInverseNtt() {
6756
6757 __ align(CodeEntryAlignment);
6758 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
6759 StubCodeMark mark(this, stub_id);
6760 address start = __ pc();
6761 __ enter();
6762
6763 const Register coeffs = c_rarg0;
6764 const Register zetas = c_rarg1;
6765
6766 const Register tmpAddr = r9;
6767 const Register dilithiumConsts = r10;
6768 const Register result = r11;
6769 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6770 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6771 VSeq<2> vq(30); // n.b. constants overlap vs3
6772 int offsets[4] = { 0, 32, 64, 96 };
6773 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6774 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6775
6776 __ add(result, coeffs, 0);
6777 __ lea(dilithiumConsts,
6778 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6779
6780 // Each level represents one iteration of the outer for loop of the Java version
6781
6782 // level 0
6783 // At level 0 we need to interleave adjacent quartets of
6784 // coefficients before we multiply and add/sub by the next 16
6785 // zetas just as we did for level 7 in the multiply code. So we
6786 // load and store the values using an ld2/st2 with arrangement 4S.
6787 for (int i = 0; i < 1024; i += 128) {
6788 // load constants q, qinv
6789 // n.b. this can be moved out of the loop as they do not get
6790 // clobbered by first two loops
6791 vs_ldpq(vq, dilithiumConsts); // qInv, q
6792 // a0/a1 load interleaved 32 (8x4S) coefficients
6793 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6794 // b load next 32 (8x4S) inputs
6795 vs_ldpq_post(vs_front(vs2), zetas);
6796 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6797 // n.b. second half of vs2 provides temporary register storage
6798 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6799 vs_front(vs2), vs_back(vs2), vtmp, vq);
6800 // a0/a1 store interleaved 32 (8x4S) coefficients
6801 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6802 }
6803
6804 // level 1
6805 // At level 1 we need to interleave pairs of adjacent pairs of
6806 // coefficients before we multiply by the next 16 zetas just as we
6807 // did for level 6 in the multiply code. So we load and store the
6808 // values using an ld2/st2 with arrangement 2D.
6809 for (int i = 0; i < 1024; i += 128) {
6810 // a0/a1 load interleaved 32 (8x2D) coefficients
6811 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6812 // b load next 16 (4x4S) inputs
6813 vs_ldpq_post(vs_front(vs2), zetas);
6814 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6815 // n.b. second half of vs2 provides temporary register storage
6816 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6817 vs_front(vs2), vs_back(vs2), vtmp, vq);
6818 // a0/a1 store interleaved 32 (8x2D) coefficients
6819 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6820 }
6821
6822 // level 2
6823 // At level 2 coefficients come in blocks of 4. So, we load 4
6824 // adjacent coefficients at 8 distinct offsets for both the first
6825 // and second coefficient sequences, using an ldr with register
6826 // variant Q then combine them with next set of 32 zetas. Likewise
6827 // we store the results using an str with register variant Q.
6828 for (int i = 0; i < 1024; i += 256) {
6829 // c0 load 32 (8x4S) coefficients via first offsets
6830 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6831 // c1 load 32 (8x4S) coefficients via second offsets
6832 vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
6833 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
6834 vs_addv(vs3, __ T4S, vs1, vs2);
6835 // c = c0 - c1
6836 vs_subv(vs1, __ T4S, vs1, vs2);
6837 // store a0 32 (8x4S) coefficients via first offsets
6838 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
6839 // b load 32 (8x4S) next inputs
6840 vs_ldpq_post(vs2, zetas);
6841 // reload constants q, qinv -- they were clobbered earlier
6842 vs_ldpq(vq, dilithiumConsts); // qInv, q
6843 // compute a1 = b montmul c
6844 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6845 // store a1 32 (8x4S) coefficients via second offsets
6846 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
6847 }
6848
6849 // level 3-7
6850 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
6851
6852 __ leave(); // required for proper stackwalking of RuntimeStub frame
6853 __ mov(r0, zr); // return 0
6854 __ ret(lr);
6855
6856 return start;
6857 }
6858
6859 // Dilithium multiply polynomials in the NTT domain.
6860 // Straightforward implementation of the method
6861 // static int implDilithiumNttMult(
6862 // int[] result, int[] ntta, int[] nttb) {} of
6863 // the sun.security.provider.ML_DSA class.
6864 //
6865 // result (int[256]) = c_rarg0
6866 // poly1 (int[256]) = c_rarg1
6867 // poly2 (int[256]) = c_rarg2
6868 address generate_dilithiumNttMult() {
6869
6870 __ align(CodeEntryAlignment);
6871 StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
6872 StubCodeMark mark(this, stub_id);
6873 address start = __ pc();
6874 __ enter();
6875
6876 Label L_loop;
6877
6878 const Register result = c_rarg0;
6879 const Register poly1 = c_rarg1;
6880 const Register poly2 = c_rarg2;
6881
6882 const Register dilithiumConsts = r10;
6883 const Register len = r11;
6884
6885 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6886 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6887 VSeq<2> vq(30); // n.b. constants overlap vs3
6888 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
6889
6890 __ lea(dilithiumConsts,
6891 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6892
6893 // load constants q, qinv
6894 vs_ldpq(vq, dilithiumConsts); // qInv, q
6895 // load constant rSquare into v29
6896 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
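// For reference, per coefficient the loop below computes
//   result[i] = montmul(rsquare, montmul(poly1[i], poly2[i]))
// Assuming montmul(x, y) = x * y * R^-1 mod q with R = 2^32 and
// rsquare = R^2 mod q, the two R^-1 factors cancel against rsquare, so
// the net effect is the ordinary product poly1[i] * poly2[i] mod q.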
6897
6898 __ mov(len, zr);
6899 __ add(len, len, 1024);
6900
6901 __ BIND(L_loop);
6902
6903 // b load 32 (8x4S) next inputs from poly1
6904 vs_ldpq_post(vs1, poly1);
6905 // c load 32 (8x4S) next inputs from poly2
6906 vs_ldpq_post(vs2, poly2);
6907 // compute a = b montmul c
6908 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6909 // compute a = rsquare montmul a
6910 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
6911 // save a 32 (8x4S) results
6912 vs_stpq_post(vs2, result);
6913
6914 __ sub(len, len, 128);
6915 __ cmp(len, (u1)128);
6916 __ br(Assembler::GE, L_loop);
6917
6918 __ leave(); // required for proper stackwalking of RuntimeStub frame
6919 __ mov(r0, zr); // return 0
6920 __ ret(lr);
6921
6922 return start;
6923 }
6924
6925 // Dilithium Montgomery multiply an array by a constant.
6926 // A straightforward implementation of the method
6927 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
6928 // of the sun.security.provider.ML_DSA class.
6929 //
6930 // coeffs (int[256]) = c_rarg0
6931 // constant (int) = c_rarg1
6932 address generate_dilithiumMontMulByConstant() {
6933
6934 __ align(CodeEntryAlignment);
6935 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
6936 StubCodeMark mark(this, stub_id);
6937 address start = __ pc();
6938 __ enter();
6939
6940 Label L_loop;
6941
6942 const Register coeffs = c_rarg0;
6943 const Register constant = c_rarg1;
6944
6945 const Register dilithiumConsts = r10;
6946 const Register result = r11;
6947 const Register len = r12;
6948
6949 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6950 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6951 VSeq<2> vq(30); // n.b. constants overlap vs3
6952 VSeq<8> vconst(29, 0); // for montmul by constant
6953
6954 // results track inputs
6955 __ add(result, coeffs, 0);
6956 __ lea(dilithiumConsts,
6957 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6958
6959 // load constants q, qinv -- they do not get clobbered in the loop below
6960 vs_ldpq(vq, dilithiumConsts); // qInv, q
6961 // copy caller supplied constant across vconst
6962 __ dup(vconst[0], __ T4S, constant);
6963 __ mov(len, zr);
6964 __ add(len, len, 1024);
6965
6966 __ BIND(L_loop);
6967
6968 // load next 32 inputs
6969 vs_ldpq_post(vs2, coeffs);
6970 // mont mul by constant
6971 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
6972 // write next 32 results
6973 vs_stpq_post(vs2, result);
6974
6975 __ sub(len, len, 128);
6976 __ cmp(len, (u1)128);
6977 __ br(Assembler::GE, L_loop);
6978
6979 __ leave(); // required for proper stackwalking of RuntimeStub frame
6980 __ mov(r0, zr); // return 0
6981 __ ret(lr);
6982
6983 return start;
6984 }
6985
6986 // Dilithium decompose poly.
6987 // Implements the method
6988 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {}
6989 // of the sun.security.provider.ML_DSA class
6990 //
6991 // input (int[256]) = c_rarg0
6992 // lowPart (int[256]) = c_rarg1
6993 // highPart (int[256]) = c_rarg2
6994 // twoGamma2 (int) = c_rarg3
6995 // multiplier (int) = c_rarg4
6996 address generate_dilithiumDecomposePoly() {
6997
6998 __ align(CodeEntryAlignment);
6999 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
7000 StubCodeMark mark(this, stub_id);
7001 address start = __ pc();
7002 Label L_loop;
7003
7004 const Register input = c_rarg0;
7005 const Register lowPart = c_rarg1;
7006 const Register highPart = c_rarg2;
7007 const Register twoGamma2 = c_rarg3;
7008 const Register multiplier = c_rarg4;
7009
7010 const Register len = r9;
7011 const Register dilithiumConsts = r10;
7012 const Register tmp = r11;
7013
7014 // 6 independent sets of 4x4s values
7015 VSeq<4> vs1(0), vs2(4), vs3(8);
7016 VSeq<4> vs4(12), vs5(16), vtmp(20);
7017
7018 // 7 constants for cross-multiplying
7019 VSeq<4> one(25, 0);
7020 VSeq<4> qminus1(26, 0);
7021 VSeq<4> g2(27, 0);
7022 VSeq<4> twog2(28, 0);
7023 VSeq<4> mult(29, 0);
7024 VSeq<4> q(30, 0);
7025 VSeq<4> qadd(31, 0);
7026
7027 __ enter();
7028
7029 __ lea(dilithiumConsts,
7030 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7031
7032 // save callee-saved registers
7033 __ stpd(v8, v9, __ pre(sp, -64));
7034 __ stpd(v10, v11, Address(sp, 16));
7035 __ stpd(v12, v13, Address(sp, 32));
7036 __ stpd(v14, v15, Address(sp, 48));
7037
7038 // populate constant registers
7039 __ mov(tmp, zr);
7040 __ add(tmp, tmp, 1);
7041 __ dup(one[0], __ T4S, tmp); // 1
7042 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
7043 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
7044 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
7045 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma2 reduce
7046 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
7047 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
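// For reference, the per-coefficient computation the loop below carries
// out, collected from the step-by-step comments into one scalar sketch
// (q and the other constants are those loaded above):
//   rplus    = coeff mod q;                               // shift/add reduction
//   quotient = (rplus * multiplier) >> 22;
//   r0       = rplus - quotient * twoGamma2;
//   mask = (twoGamma2 - r0) >> 22;      r0 -= mask & twoGamma2;  quotient += mask & 1;
//   mask = (twoGamma2 / 2 - r0) >> 31;  r0 -= mask & twoGamma2;  quotient += mask & 1;
//   r1 = rplus - r0 - (q - 1);
//   r1 = (r1 | -r1) >> 31;              // 0 if rplus - r0 == q - 1, else -1
//   r0 += ~r1;  r1 &= quotient;
//   lowPart[m] = r0;  highPart[m] = r1;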
7048
7049 __ mov(len, zr);
7050 __ add(len, len, 1024);
7051
7052 __ BIND(L_loop);
7053
7054 // load next 4x4S inputs interleaved: rplus --> vs1
7055 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
7056
7057 // rplus = rplus - ((rplus + qadd) >> 23) * q
7058 vs_addv(vtmp, __ T4S, vs1, qadd);
7059 vs_sshr(vtmp, __ T4S, vtmp, 23);
7060 vs_mulv(vtmp, __ T4S, vtmp, q);
7061 vs_subv(vs1, __ T4S, vs1, vtmp);
7062
7063 // rplus = rplus + ((rplus >> 31) & dilithium_q);
7064 vs_sshr(vtmp, __ T4S, vs1, 31);
7065 vs_andr(vtmp, vtmp, q);
7066 vs_addv(vs1, __ T4S, vs1, vtmp);
7067
7068 // quotient --> vs2
7069 // int quotient = (rplus * multiplier) >> 22;
7070 vs_mulv(vtmp, __ T4S, vs1, mult);
7071 vs_sshr(vs2, __ T4S, vtmp, 22);
7072
7073 // r0 --> vs3
7074 // int r0 = rplus - quotient * twoGamma2;
7075 vs_mulv(vtmp, __ T4S, vs2, twog2);
7076 vs_subv(vs3, __ T4S, vs1, vtmp);
7077
7078 // mask --> vs4
7079 // int mask = (twoGamma2 - r0) >> 22;
7080 vs_subv(vtmp, __ T4S, twog2, vs3);
7081 vs_sshr(vs4, __ T4S, vtmp, 22);
7082
7083 // r0 -= (mask & twoGamma2);
7084 vs_andr(vtmp, vs4, twog2);
7085 vs_subv(vs3, __ T4S, vs3, vtmp);
7086
7087 // quotient += (mask & 1);
7088 vs_andr(vtmp, vs4, one);
7089 vs_addv(vs2, __ T4S, vs2, vtmp);
7090
7091 // mask = (twoGamma2 / 2 - r0) >> 31;
7092 vs_subv(vtmp, __ T4S, g2, vs3);
7093 vs_sshr(vs4, __ T4S, vtmp, 31);
7094
7095 // r0 -= (mask & twoGamma2);
7096 vs_andr(vtmp, vs4, twog2);
7097 vs_subv(vs3, __ T4S, vs3, vtmp);
7098
7099 // quotient += (mask & 1);
7100 vs_andr(vtmp, vs4, one);
7101 vs_addv(vs2, __ T4S, vs2, vtmp);
7102
7103 // r1 --> vs5
7104 // int r1 = rplus - r0 - (dilithium_q - 1);
7105 vs_subv(vtmp, __ T4S, vs1, vs3);
7106 vs_subv(vs5, __ T4S, vtmp, qminus1);
7107
7108 // r1 --> vs1 (overwriting rplus)
7109 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
7110 vs_negr(vtmp, __ T4S, vs5);
7111 vs_orr(vtmp, vs5, vtmp);
7112 vs_sshr(vs1, __ T4S, vtmp, 31);
7113
7114 // r0 += ~r1;
7115 vs_notr(vtmp, vs1);
7116 vs_addv(vs3, __ T4S, vs3, vtmp);
7117
7118 // r1 = r1 & quotient;
7119 vs_andr(vs1, vs2, vs1);
7120
7121 // store results interleaved
7122 // lowPart[m] = r0;
7123 // highPart[m] = r1;
7124 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
7125 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
7126
7127 __ sub(len, len, 64);
7128 __ cmp(len, (u1)64);
7129 __ br(Assembler::GE, L_loop);
7130
7131 // restore callee-saved vector registers
7132 __ ldpd(v14, v15, Address(sp, 48));
7133 __ ldpd(v12, v13, Address(sp, 32));
7134 __ ldpd(v10, v11, Address(sp, 16));
7135 __ ldpd(v8, v9, __ post(sp, 64));
7136
7137 __ leave(); // required for proper stackwalking of RuntimeStub frame
7138 __ mov(r0, zr); // return 0
7139 __ ret(lr);
7140
7141 return start;
7142 }
7143
7144 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
7145 Register tmp0, Register tmp1, Register tmp2) {
7146 __ bic(tmp0, a2, a1); // for a0
7147 __ bic(tmp1, a3, a2); // for a1
7148 __ bic(tmp2, a4, a3); // for a2
7149 __ eor(a2, a2, tmp2);
7150 __ bic(tmp2, a0, a4); // for a3
7151 __ eor(a3, a3, tmp2);
7152 __ bic(tmp2, a1, a0); // for a4
7153 __ eor(a0, a0, tmp0);
7154 __ eor(a1, a1, tmp1);
7155 __ eor(a4, a4, tmp2);
7156 }
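// i.e. bcax5 applies the Keccak chi step to one 5-lane row, using the
// original lane values throughout (hence the three temporaries):
//   a0 ^= ~a1 & a2;  a1 ^= ~a2 & a3;  a2 ^= ~a3 & a4;
//   a3 ^= ~a4 & a0;  a4 ^= ~a0 & a1;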
7157
7158 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
7159 Register a0, Register a1, Register a2, Register a3, Register a4,
7160 Register a5, Register a6, Register a7, Register a8, Register a9,
7161 Register a10, Register a11, Register a12, Register a13, Register a14,
7162 Register a15, Register a16, Register a17, Register a18, Register a19,
7163 Register a20, Register a21, Register a22, Register a23, Register a24,
7164 Register tmp0, Register tmp1, Register tmp2) {
7165 __ eor3(tmp1, a4, a9, a14);
7166 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
7167 __ eor3(tmp2, a1, a6, a11);
7168 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
7169 __ rax1(tmp2, tmp0, tmp1); // d0
7170 {
7171
7172 Register tmp3, tmp4;
7173 if (can_use_fp && can_use_r18) {
7174 tmp3 = rfp;
7175 tmp4 = r18_tls;
7176 } else {
7177 tmp3 = a4;
7178 tmp4 = a9;
7179 __ stp(tmp3, tmp4, __ pre(sp, -16));
7180 }
7181
7182 __ eor3(tmp3, a0, a5, a10);
7183 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
7184 __ eor(a0, a0, tmp2);
7185 __ eor(a5, a5, tmp2);
7186 __ eor(a10, a10, tmp2);
7187 __ eor(a15, a15, tmp2);
7188 __ eor(a20, a20, tmp2); // d0(tmp2)
7189 __ eor3(tmp3, a2, a7, a12);
7190 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
7191 __ rax1(tmp3, tmp4, tmp2); // d1
7192 __ eor(a1, a1, tmp3);
7193 __ eor(a6, a6, tmp3);
7194 __ eor(a11, a11, tmp3);
7195 __ eor(a16, a16, tmp3);
7196 __ eor(a21, a21, tmp3); // d1(tmp3)
7197 __ rax1(tmp3, tmp2, tmp0); // d3
7198 __ eor3(tmp2, a3, a8, a13);
7199 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
7200 __ eor(a3, a3, tmp3);
7201 __ eor(a8, a8, tmp3);
7202 __ eor(a13, a13, tmp3);
7203 __ eor(a18, a18, tmp3);
7204 __ eor(a23, a23, tmp3);
7205 __ rax1(tmp2, tmp1, tmp0); // d2
7206 __ eor(a2, a2, tmp2);
7207 __ eor(a7, a7, tmp2);
7208 __ eor(a12, a12, tmp2);
7209 __ rax1(tmp0, tmp0, tmp4); // d4
7210 if (!can_use_fp || !can_use_r18) {
7211 __ ldp(tmp3, tmp4, __ post(sp, 16));
7212 }
7213 __ eor(a17, a17, tmp2);
7214 __ eor(a22, a22, tmp2);
7215 __ eor(a4, a4, tmp0);
7216 __ eor(a9, a9, tmp0);
7217 __ eor(a14, a14, tmp0);
7218 __ eor(a19, a19, tmp0);
7219 __ eor(a24, a24, tmp0);
7220 }
7221
7222 __ rol(tmp0, a10, 3);
7223 __ rol(a10, a1, 1);
7224 __ rol(a1, a6, 44);
7225 __ rol(a6, a9, 20);
7226 __ rol(a9, a22, 61);
7227 __ rol(a22, a14, 39);
7228 __ rol(a14, a20, 18);
7229 __ rol(a20, a2, 62);
7230 __ rol(a2, a12, 43);
7231 __ rol(a12, a13, 25);
7232 __ rol(a13, a19, 8) ;
7233 __ rol(a19, a23, 56);
7234 __ rol(a23, a15, 41);
7235 __ rol(a15, a4, 27);
7236 __ rol(a4, a24, 14);
7237 __ rol(a24, a21, 2);
7238 __ rol(a21, a8, 55);
7239 __ rol(a8, a16, 45);
7240 __ rol(a16, a5, 36);
7241 __ rol(a5, a3, 28);
7242 __ rol(a3, a18, 21);
7243 __ rol(a18, a17, 15);
7244 __ rol(a17, a11, 10);
7245 __ rol(a11, a7, 6);
7246 __ mov(a7, tmp0);
7247
7248 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
7249 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
7250 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
7251 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
7252 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
7253
7254 __ ldr(tmp1, __ post(rc, 8));
7255 __ eor(a0, a0, tmp1);
7256
7257 }
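// For reference, one pass of keccak_round_gpr corresponds to a single
// Keccak-f[1600] round over the 5x5 lane state a[x][y]:
//   theta: c[x] = a[x][0] ^ a[x][1] ^ a[x][2] ^ a[x][3] ^ a[x][4];
//          d[x] = c[x-1] ^ rotl64(c[x+1], 1);  a[x][y] ^= d[x];
//   rho/pi: the chain of rol() calls permutes the lanes and applies the
//          per-lane rotation amounts in one pass;
//   chi:   bcax5 on each of the five rows;
//   iota:  a[0][0] ^= the round constant loaded from rc.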
7258
7259 // Arguments:
7260 //
7261 // Inputs:
7262 // c_rarg0 - byte[] source+offset
7263 // c_rarg1 - byte[] SHA.state
7264 // c_rarg2 - int block_size
7265 // c_rarg3 - int offset
7266 // c_rarg4 - int limit
7267 //
7268 address generate_sha3_implCompress_gpr(StubId stub_id) {
7269 bool multi_block;
7270 switch (stub_id) {
7271 case StubId::stubgen_sha3_implCompress_id:
7272 multi_block = false;
7273 break;
7274 case StubId::stubgen_sha3_implCompressMB_id:
7275 multi_block = true;
7276 break;
7277 default:
7278 ShouldNotReachHere();
7279 }
7280
7281 static const uint64_t round_consts[24] = {
7282 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
7283 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
7284 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
7285 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
7286 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
7287 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
7288 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
7289 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
7290 };
7291
7292 __ align(CodeEntryAlignment);
7293 StubCodeMark mark(this, stub_id);
7294 address start = __ pc();
7295
7296 Register buf = c_rarg0;
7297 Register state = c_rarg1;
7298 Register block_size = c_rarg2;
7299 Register ofs = c_rarg3;
7300 Register limit = c_rarg4;
7301
7302 // use r3..r17, r19..r28 to keep a0..a24.
7303 // a0..a24 are respective locals from SHA3.java
7304 Register a0 = r25,
7305 a1 = r26,
7306 a2 = r27,
7307 a3 = r3,
7308 a4 = r4,
7309 a5 = r5,
7310 a6 = r6,
7311 a7 = r7,
7312 a8 = rscratch1, // r8
7313 a9 = rscratch2, // r9
7314 a10 = r10,
7315 a11 = r11,
7316 a12 = r12,
7317 a13 = r13,
7318 a14 = r14,
7319 a15 = r15,
7320 a16 = r16,
7321 a17 = r17,
7322 a18 = r28,
7323 a19 = r19,
7324 a20 = r20,
7325 a21 = r21,
7326 a22 = r22,
7327 a23 = r23,
7328 a24 = r24;
7329
7330 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
7331
7332 Label sha3_loop, rounds24_preloop, loop_body;
7333 Label sha3_512_or_sha3_384, shake128;
7334
7335 bool can_use_r18 = false;
7336 #ifndef R18_RESERVED
7337 can_use_r18 = true;
7338 #endif
7339 bool can_use_fp = !PreserveFramePointer;
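// The Keccak rate (block_size, in bytes) identifies the variant being
// computed; the dispatch below relies on this mapping:
//   72 -> SHA3-512, 104 -> SHA3-384, 136 -> SHA3-256 or SHAKE256,
//   144 -> SHA3-224, 168 -> SHAKE128
// (for the fixed-length digests, rate = 200 - 2 * digest size in bytes).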
7340
7341 __ enter();
7342
7343 // save almost all yet unsaved gpr registers on stack
7344 __ str(block_size, __ pre(sp, -128));
7345 if (multi_block) {
7346 __ stpw(ofs, limit, Address(sp, 8));
7347 }
7348 // 8 bytes at sp+16 will be used to keep buf
7349 __ stp(r19, r20, Address(sp, 32));
7350 __ stp(r21, r22, Address(sp, 48));
7351 __ stp(r23, r24, Address(sp, 64));
7352 __ stp(r25, r26, Address(sp, 80));
7353 __ stp(r27, r28, Address(sp, 96));
7354 if (can_use_r18 && can_use_fp) {
7355 __ stp(r18_tls, state, Address(sp, 112));
7356 } else {
7357 __ str(state, Address(sp, 112));
7358 }
7359
7360 // begin sha3 calculations: loading a0..a24 from state array
7361 __ ldp(a0, a1, state);
7362 __ ldp(a2, a3, Address(state, 16));
7363 __ ldp(a4, a5, Address(state, 32));
7364 __ ldp(a6, a7, Address(state, 48));
7365 __ ldp(a8, a9, Address(state, 64));
7366 __ ldp(a10, a11, Address(state, 80));
7367 __ ldp(a12, a13, Address(state, 96));
7368 __ ldp(a14, a15, Address(state, 112));
7369 __ ldp(a16, a17, Address(state, 128));
7370 __ ldp(a18, a19, Address(state, 144));
7371 __ ldp(a20, a21, Address(state, 160));
7372 __ ldp(a22, a23, Address(state, 176));
7373 __ ldr(a24, Address(state, 192));
7374
7375 __ BIND(sha3_loop);
7376
7377 // load input
7378 __ ldp(tmp3, tmp2, __ post(buf, 16));
7379 __ eor(a0, a0, tmp3);
7380 __ eor(a1, a1, tmp2);
7381 __ ldp(tmp3, tmp2, __ post(buf, 16));
7382 __ eor(a2, a2, tmp3);
7383 __ eor(a3, a3, tmp2);
7384 __ ldp(tmp3, tmp2, __ post(buf, 16));
7385 __ eor(a4, a4, tmp3);
7386 __ eor(a5, a5, tmp2);
7387 __ ldr(tmp3, __ post(buf, 8));
7388 __ eor(a6, a6, tmp3);
7389
7390 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
7391 __ tbz(block_size, 7, sha3_512_or_sha3_384);
7392
7393 __ ldp(tmp3, tmp2, __ post(buf, 16));
7394 __ eor(a7, a7, tmp3);
7395 __ eor(a8, a8, tmp2);
7396 __ ldp(tmp3, tmp2, __ post(buf, 16));
7397 __ eor(a9, a9, tmp3);
7398 __ eor(a10, a10, tmp2);
7399 __ ldp(tmp3, tmp2, __ post(buf, 16));
7400 __ eor(a11, a11, tmp3);
7401 __ eor(a12, a12, tmp2);
7402 __ ldp(tmp3, tmp2, __ post(buf, 16));
7403 __ eor(a13, a13, tmp3);
7404 __ eor(a14, a14, tmp2);
7405 __ ldp(tmp3, tmp2, __ post(buf, 16));
7406 __ eor(a15, a15, tmp3);
7407 __ eor(a16, a16, tmp2);
7408
7409 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
7410 __ andw(tmp2, block_size, 48);
7411 __ cbzw(tmp2, rounds24_preloop);
7412 __ tbnz(block_size, 5, shake128);
7413 // block_size == 144, bit5 == 0, SHA3-224
7414 __ ldr(tmp3, __ post(buf, 8));
7415 __ eor(a17, a17, tmp3);
7416 __ b(rounds24_preloop);
7417
7418 __ BIND(shake128);
7419 __ ldp(tmp3, tmp2, __ post(buf, 16));
7420 __ eor(a17, a17, tmp3);
7421 __ eor(a18, a18, tmp2);
7422 __ ldp(tmp3, tmp2, __ post(buf, 16));
7423 __ eor(a19, a19, tmp3);
7424 __ eor(a20, a20, tmp2);
7425 __ b(rounds24_preloop); // block_size == 168, SHAKE128
7426
7427 __ BIND(sha3_512_or_sha3_384);
7428 __ ldp(tmp3, tmp2, __ post(buf, 16));
7429 __ eor(a7, a7, tmp3);
7430 __ eor(a8, a8, tmp2);
7431 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
7432
7433 // SHA3-384
7434 __ ldp(tmp3, tmp2, __ post(buf, 16));
7435 __ eor(a9, a9, tmp3);
7436 __ eor(a10, a10, tmp2);
7437 __ ldp(tmp3, tmp2, __ post(buf, 16));
7438 __ eor(a11, a11, tmp3);
7439 __ eor(a12, a12, tmp2);
7440
7441 __ BIND(rounds24_preloop);
7442 __ fmovs(v0, 24.0); // float loop counter,
7443 __ fmovs(v1, 1.0); // exact representation
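// n.b. the round counter is kept in FP registers because all general
// purpose registers are needed for the 25 state lanes; 24.0 and 1.0 are
// exactly representable, so the fsubs/fcmps sequence below runs the
// round body exactly 24 times.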
7444
7445 __ str(buf, Address(sp, 16));
7446 __ lea(tmp3, ExternalAddress((address) round_consts));
7447
7448 __ BIND(loop_body);
7449 keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
7450 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
7451 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
7452 tmp0, tmp1, tmp2);
7453 __ fsubs(v0, v0, v1);
7454 __ fcmps(v0, 0.0);
7455 __ br(__ NE, loop_body);
7456
7457 if (multi_block) {
7458 __ ldrw(block_size, sp); // block_size
7459 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
7460 __ addw(tmp2, tmp2, block_size);
7461 __ cmpw(tmp2, tmp1);
7462 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
7463 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
7464 __ br(Assembler::LE, sha3_loop);
7465 __ movw(c_rarg0, tmp2); // return offset
7466 }
7467 if (can_use_fp && can_use_r18) {
7468 __ ldp(r18_tls, state, Address(sp, 112));
7469 } else {
7470 __ ldr(state, Address(sp, 112));
7471 }
7472 // save calculated sha3 state
7473 __ stp(a0, a1, Address(state));
7474 __ stp(a2, a3, Address(state, 16));
7475 __ stp(a4, a5, Address(state, 32));
7476 __ stp(a6, a7, Address(state, 48));
7477 __ stp(a8, a9, Address(state, 64));
7478 __ stp(a10, a11, Address(state, 80));
7479 __ stp(a12, a13, Address(state, 96));
7480 __ stp(a14, a15, Address(state, 112));
7481 __ stp(a16, a17, Address(state, 128));
7482 __ stp(a18, a19, Address(state, 144));
7483 __ stp(a20, a21, Address(state, 160));
7484 __ stp(a22, a23, Address(state, 176));
7485 __ str(a24, Address(state, 192));
7486
7487 // restore required registers from stack
7488 __ ldp(r19, r20, Address(sp, 32));
7489 __ ldp(r21, r22, Address(sp, 48));
7490 __ ldp(r23, r24, Address(sp, 64));
7491 __ ldp(r25, r26, Address(sp, 80));
7492 __ ldp(r27, r28, Address(sp, 96));
7493 if (can_use_fp && can_use_r18) {
7494 __ add(rfp, sp, 128); // leave() will copy rfp to sp below
7495 } // else no need to recalculate rfp, since it wasn't changed
7496
7497 __ leave();
7498
7499 __ ret(lr);
7500
7501 return start;
7502 }
7503
7504 /**
7505 * Arguments:
7506 *
7507 * Inputs:
7508 * c_rarg0 - int crc
7509 * c_rarg1 - byte* buf
7510 * c_rarg2 - int length
7511 *
7512 * Output:
7513 * r0 - int crc result
7514 */
7515 address generate_updateBytesCRC32() {
7516 assert(UseCRC32Intrinsics, "what are we doing here?");
7517
7518 __ align(CodeEntryAlignment);
7519 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7520 StubCodeMark mark(this, stub_id);
7521
7522 address start = __ pc();
7523
7524 const Register crc = c_rarg0; // crc
7525 const Register buf = c_rarg1; // source java byte array address
7526 const Register len = c_rarg2; // length
7527 const Register table0 = c_rarg3; // crc_table address
7528 const Register table1 = c_rarg4;
7529 const Register table2 = c_rarg5;
7530 const Register table3 = c_rarg6;
7531 const Register tmp3 = c_rarg7;
7532
7533 BLOCK_COMMENT("Entry:");
7534 __ enter(); // required for proper stackwalking of RuntimeStub frame
7535
7536 __ kernel_crc32(crc, buf, len,
7537 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7538
7539 __ leave(); // required for proper stackwalking of RuntimeStub frame
7540 __ ret(lr);
7541
7542 return start;
7543 }
7544
7545 /**
7546 * Arguments:
7547 *
7548 * Inputs:
7549 * c_rarg0 - int crc
7550 * c_rarg1 - byte* buf
7551 * c_rarg2 - int length
7552 * c_rarg3 - int* table
7553 *
7554 * Output:
7555 * r0 - int crc result
7556 */
7557 address generate_updateBytesCRC32C() {
7558 assert(UseCRC32CIntrinsics, "what are we doing here?");
7559
7560 __ align(CodeEntryAlignment);
7561 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
7562 StubCodeMark mark(this, stub_id);
7563
7564 address start = __ pc();
7565
7566 const Register crc = c_rarg0; // crc
7567 const Register buf = c_rarg1; // source java byte array address
7568 const Register len = c_rarg2; // length
7569 const Register table0 = c_rarg3; // crc_table address
7570 const Register table1 = c_rarg4;
7571 const Register table2 = c_rarg5;
7572 const Register table3 = c_rarg6;
7573 const Register tmp3 = c_rarg7;
7574
7575 BLOCK_COMMENT("Entry:");
7576 __ enter(); // required for proper stackwalking of RuntimeStub frame
7577
7578 __ kernel_crc32c(crc, buf, len,
7579 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7580
7581 __ leave(); // required for proper stackwalking of RuntimeStub frame
7582 __ ret(lr);
7583
7584 return start;
7585 }
7586
7587 /**
7588 * Arguments:
7589 *
7590 * Inputs:
7591 * c_rarg0 - int adler
7592 * c_rarg1 - byte* buff
7593 * c_rarg2 - int len
7594 *
7595 * Output:
7596 * c_rarg0 - int adler result
7597 */
7598 address generate_updateBytesAdler32() {
7599 __ align(CodeEntryAlignment);
7600 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
7601 StubCodeMark mark(this, stub_id);
7602 address start = __ pc();
7603
7604 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
7605
7606 // Aliases
7607 Register adler = c_rarg0;
7608 Register s1 = c_rarg0;
7609 Register s2 = c_rarg3;
7610 Register buff = c_rarg1;
7611 Register len = c_rarg2;
7612 Register nmax = r4;
7613 Register base = r5;
7614 Register count = r6;
7615 Register temp0 = rscratch1;
7616 Register temp1 = rscratch2;
7617 FloatRegister vbytes = v0;
7618 FloatRegister vs1acc = v1;
7619 FloatRegister vs2acc = v2;
7620 FloatRegister vtable = v3;
7621
7622 // Max number of bytes we can process before having to take the mod
7623 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
7624 uint64_t BASE = 0xfff1;
7625 uint64_t NMAX = 0x15B0;
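// n.b. the mod-BASE reductions below avoid a division: since
// 2^16 == 15 (mod BASE), x mod BASE is computed by folding, e.g.
//   uint32_t mod_base(uint32_t x) {       // any 32-bit x
//     x = (x & 0xffff) + 15 * (x >> 16);  // now x < 2^20
//     x = (x & 0xffff) + 15 * (x >> 16);  // now x < BASE + 240
//     return x >= 0xfff1 ? x - 0xfff1 : x;
//   }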
7626
7627 __ mov(base, BASE);
7628 __ mov(nmax, NMAX);
7629
7630 // Load accumulation coefficients for the upper 16 bits
7631 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
7632 __ ld1(vtable, __ T16B, Address(temp0));
7633
7634 // s1 is initialized to the lower 16 bits of adler
7635 // s2 is initialized to the upper 16 bits of adler
7636 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
7637 __ uxth(s1, adler); // s1 = (adler & 0xffff)
7638
7639 // The pipelined loop needs at least 16 elements for 1 iteration
7640 // It does check this, but it is more effective to skip to the cleanup loop
7641 __ cmp(len, (u1)16);
7642 __ br(Assembler::HS, L_nmax);
7643 __ cbz(len, L_combine);
7644
7645 __ bind(L_simple_by1_loop);
7646 __ ldrb(temp0, Address(__ post(buff, 1)));
7647 __ add(s1, s1, temp0);
7648 __ add(s2, s2, s1);
7649 __ subs(len, len, 1);
7650 __ br(Assembler::HI, L_simple_by1_loop);
7651
7652 // s1 = s1 % BASE
7653 __ subs(temp0, s1, base);
7654 __ csel(s1, temp0, s1, Assembler::HS);
7655
7656 // s2 = s2 % BASE
7657 __ lsr(temp0, s2, 16);
7658 __ lsl(temp1, temp0, 4);
7659 __ sub(temp1, temp1, temp0);
7660 __ add(s2, temp1, s2, ext::uxth);
7661
7662 __ subs(temp0, s2, base);
7663 __ csel(s2, temp0, s2, Assembler::HS);
7664
7665 __ b(L_combine);
7666
7667 __ bind(L_nmax);
7668 __ subs(len, len, nmax);
7669 __ sub(count, nmax, 16);
7670 __ br(Assembler::LO, L_by16);
7671
7672 __ bind(L_nmax_loop);
7673
7674 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7675 vbytes, vs1acc, vs2acc, vtable);
7676
7677 __ subs(count, count, 16);
7678 __ br(Assembler::HS, L_nmax_loop);
7679
7680 // s1 = s1 % BASE
7681 __ lsr(temp0, s1, 16);
7682 __ lsl(temp1, temp0, 4);
7683 __ sub(temp1, temp1, temp0);
7684 __ add(temp1, temp1, s1, ext::uxth);
7685
7686 __ lsr(temp0, temp1, 16);
7687 __ lsl(s1, temp0, 4);
7688 __ sub(s1, s1, temp0);
7689 __ add(s1, s1, temp1, ext::uxth);
7690
7691 __ subs(temp0, s1, base);
7692 __ csel(s1, temp0, s1, Assembler::HS);
7693
7694 // s2 = s2 % BASE
7695 __ lsr(temp0, s2, 16);
7696 __ lsl(temp1, temp0, 4);
7697 __ sub(temp1, temp1, temp0);
7698 __ add(temp1, temp1, s2, ext::uxth);
7699
7700 __ lsr(temp0, temp1, 16);
7701 __ lsl(s2, temp0, 4);
7702 __ sub(s2, s2, temp0);
7703 __ add(s2, s2, temp1, ext::uxth);
7704
7705 __ subs(temp0, s2, base);
7706 __ csel(s2, temp0, s2, Assembler::HS);
7707
7708 __ subs(len, len, nmax);
7709 __ sub(count, nmax, 16);
7710 __ br(Assembler::HS, L_nmax_loop);
7711
7712 __ bind(L_by16);
7713 __ adds(len, len, count);
7714 __ br(Assembler::LO, L_by1);
7715
7716 __ bind(L_by16_loop);
7717
7718 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7719 vbytes, vs1acc, vs2acc, vtable);
7720
7721 __ subs(len, len, 16);
7722 __ br(Assembler::HS, L_by16_loop);
7723
7724 __ bind(L_by1);
7725 __ adds(len, len, 15);
7726 __ br(Assembler::LO, L_do_mod);
7727
7728 __ bind(L_by1_loop);
7729 __ ldrb(temp0, Address(__ post(buff, 1)));
7730 __ add(s1, temp0, s1);
7731 __ add(s2, s2, s1);
7732 __ subs(len, len, 1);
7733 __ br(Assembler::HS, L_by1_loop);
7734
7735 __ bind(L_do_mod);
7736 // s1 = s1 % BASE
7737 __ lsr(temp0, s1, 16);
7738 __ lsl(temp1, temp0, 4);
7739 __ sub(temp1, temp1, temp0);
7740 __ add(temp1, temp1, s1, ext::uxth);
7741
7742 __ lsr(temp0, temp1, 16);
7743 __ lsl(s1, temp0, 4);
7744 __ sub(s1, s1, temp0);
7745 __ add(s1, s1, temp1, ext::uxth);
7746
7747 __ subs(temp0, s1, base);
7748 __ csel(s1, temp0, s1, Assembler::HS);
7749
7750 // s2 = s2 % BASE
7751 __ lsr(temp0, s2, 16);
7752 __ lsl(temp1, temp0, 4);
7753 __ sub(temp1, temp1, temp0);
7754 __ add(temp1, temp1, s2, ext::uxth);
7755
7756 __ lsr(temp0, temp1, 16);
7757 __ lsl(s2, temp0, 4);
7758 __ sub(s2, s2, temp0);
7759 __ add(s2, s2, temp1, ext::uxth);
7760
7761 __ subs(temp0, s2, base);
7762 __ csel(s2, temp0, s2, Assembler::HS);
7763
7764 // Combine lower bits and higher bits
7765 __ bind(L_combine);
7766 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
7767
7768 __ ret(lr);
7769
7770 return start;
7771 }
7772
7773 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
7774 Register temp0, Register temp1, FloatRegister vbytes,
7775 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
7776 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
7777 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
7778 // In non-vectorized code, we update s1 and s2 as:
7779 // s1 <- s1 + b1
7780 // s2 <- s2 + s1
7781 // s1 <- s1 + b2
7782 // s2 <- s2 + s1
7783 // ...
7784 // s1 <- s1 + b16
7785 // s2 <- s2 + s1
7786 // Putting above assignments together, we have:
7787 // s1_new = s1 + b1 + b2 + ... + b16
7788 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
7789 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
7790 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
7791 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
7792
7793 // s2 = s2 + s1 * 16
7794 __ add(s2, s2, s1, Assembler::LSL, 4);
7795
7796 // vs1acc = b1 + b2 + b3 + ... + b16
7797 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
7798 __ umullv(vs2acc, __ T8B, vtable, vbytes);
7799 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
7800 __ uaddlv(vs1acc, __ T16B, vbytes);
7801 __ uaddlv(vs2acc, __ T8H, vs2acc);
7802
7803 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
7804 __ fmovd(temp0, vs1acc);
7805 __ fmovd(temp1, vs2acc);
7806 __ add(s1, s1, temp0);
7807 __ add(s2, s2, temp1);
7808 }
7809
7810 /**
7811 * Arguments:
7812 *
7813 * Input:
7814 * c_rarg0 - x address
7815 * c_rarg1 - x length
7816 * c_rarg2 - y address
7817 * c_rarg3 - y length
7818 * c_rarg4 - z address
7819 */
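// z is expected to have room for xlen + ylen ints, as in the
// java.math.BigInteger.multiplyToLen contract; the multiplication itself is
// done by MacroAssembler::multiply_to_len.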
7820 address generate_multiplyToLen() {
7821 __ align(CodeEntryAlignment);
7822 StubId stub_id = StubId::stubgen_multiplyToLen_id;
7823 StubCodeMark mark(this, stub_id);
7824
7825 address start = __ pc();
7826
7827 if (AOTCodeCache::load_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start)) {
7828 return start;
7829 }
7830 const Register x = r0;
7831 const Register xlen = r1;
7832 const Register y = r2;
7833 const Register ylen = r3;
7834 const Register z = r4;
7835
7836 const Register tmp0 = r5;
7837 const Register tmp1 = r10;
7838 const Register tmp2 = r11;
7839 const Register tmp3 = r12;
7840 const Register tmp4 = r13;
7841 const Register tmp5 = r14;
7842 const Register tmp6 = r15;
7843 const Register tmp7 = r16;
7844
7845 BLOCK_COMMENT("Entry:");
7846 __ enter(); // required for proper stackwalking of RuntimeStub frame
7847 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7848 __ leave(); // required for proper stackwalking of RuntimeStub frame
7849 __ ret(lr);
7850
7851 AOTCodeCache::store_stub(this, vmIntrinsics::_multiplyToLen, "multiplyToLen", start);
7852 return start;
7853 }
7854
7855 address generate_squareToLen() {
// The squareToLen algorithm for sizes 1..127 described in the Java code is
// faster than multiply_to_len on some CPUs and slower on others, but
// multiply_to_len shows slightly better results overall.
7859 __ align(CodeEntryAlignment);
7860 StubId stub_id = StubId::stubgen_squareToLen_id;
7861 StubCodeMark mark(this, stub_id);
7862 address start = __ pc();
7863
7864 if (AOTCodeCache::load_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start)) {
7865 return start;
7866 }
7867 const Register x = r0;
7868 const Register xlen = r1;
7869 const Register z = r2;
7870 const Register y = r4; // == x
7871 const Register ylen = r5; // == xlen
7872
7873 const Register tmp0 = r3;
7874 const Register tmp1 = r10;
7875 const Register tmp2 = r11;
7876 const Register tmp3 = r12;
7877 const Register tmp4 = r13;
7878 const Register tmp5 = r14;
7879 const Register tmp6 = r15;
7880 const Register tmp7 = r16;
7881
7882 RegSet spilled_regs = RegSet::of(y, ylen);
7883 BLOCK_COMMENT("Entry:");
7884 __ enter();
7885 __ push(spilled_regs, sp);
7886 __ mov(y, x);
7887 __ mov(ylen, xlen);
7888 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7889 __ pop(spilled_regs, sp);
7890 __ leave();
7891 __ ret(lr);
7892
7893 AOTCodeCache::store_stub(this, vmIntrinsics::_squareToLen, "squareToLen", start);
7894 return start;
7895 }
7896
7897 address generate_mulAdd() {
7898 __ align(CodeEntryAlignment);
7899 StubId stub_id = StubId::stubgen_mulAdd_id;
7900 StubCodeMark mark(this, stub_id);
7901
7902 address start = __ pc();
7903
7904 if (AOTCodeCache::load_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start)) {
7905 return start;
7906 }
7907 const Register out = r0;
7908 const Register in = r1;
7909 const Register offset = r2;
7910 const Register len = r3;
7911 const Register k = r4;
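
// The stub adds the product of the int array 'in' (length 'len') and the int
// multiplier 'k' into 'out' at the given offset and returns the carry,
// mirroring the java.math.BigInteger.implMulAdd contract; the actual work is
// done by MacroAssembler::mul_add.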
7912
7913 BLOCK_COMMENT("Entry:");
7914 __ enter();
7915 __ mul_add(out, in, offset, len, k);
7916 __ leave();
7917 __ ret(lr);
7918
7919 AOTCodeCache::store_stub(this, vmIntrinsics::_mulAdd, "mulAdd", start);
7920 return start;
7921 }
7922
7923 // Arguments:
7924 //
7925 // Input:
7926 // c_rarg0 - newArr address
7927 // c_rarg1 - oldArr address
7928 // c_rarg2 - newIdx
7929 // c_rarg3 - shiftCount
7930 // c_rarg4 - numIter
7931 //
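// For each i in [0, numIter), the stub computes (a sketch of the scalar
// recurrence that the SIMD and tail code below implement):
//   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
//                      | (oldArr[i]     <<  (32 - shiftCount))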
7932 address generate_bigIntegerRightShift() {
7933 __ align(CodeEntryAlignment);
7934 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
7935 StubCodeMark mark(this, stub_id);
7936 address start = __ pc();
7937
7938 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
7939
7940 Register newArr = c_rarg0;
7941 Register oldArr = c_rarg1;
7942 Register newIdx = c_rarg2;
7943 Register shiftCount = c_rarg3;
7944 Register numIter = c_rarg4;
7945 Register idx = numIter;
7946
7947 Register newArrCur = rscratch1;
7948 Register shiftRevCount = rscratch2;
7949 Register oldArrCur = r13;
7950 Register oldArrNext = r14;
7951
7952 FloatRegister oldElem0 = v0;
7953 FloatRegister oldElem1 = v1;
7954 FloatRegister newElem = v2;
7955 FloatRegister shiftVCount = v3;
7956 FloatRegister shiftVRevCount = v4;
7957
7958 __ cbz(idx, Exit);
7959
7960 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
7961
7962 // left shift count
7963 __ movw(shiftRevCount, 32);
7964 __ subw(shiftRevCount, shiftRevCount, shiftCount);
7965
// numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
7967 __ cmp(numIter, (u1)4);
7968 __ br(Assembler::LT, ShiftThree);
7969
7970 __ dup(shiftVCount, __ T4S, shiftCount);
7971 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
7972 __ negr(shiftVCount, __ T4S, shiftVCount);
7973
7974 __ BIND(ShiftSIMDLoop);
7975
7976 // Calculate the load addresses
7977 __ sub(idx, idx, 4);
7978 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
7979 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
7980 __ add(oldArrCur, oldArrNext, 4);
7981
7982 // Load 4 words and process
7983 __ ld1(oldElem0, __ T4S, Address(oldArrCur));
7984 __ ld1(oldElem1, __ T4S, Address(oldArrNext));
7985 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
7986 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
7987 __ orr(newElem, __ T16B, oldElem0, oldElem1);
7988 __ st1(newElem, __ T4S, Address(newArrCur));
7989
7990 __ cmp(idx, (u1)4);
7991 __ br(Assembler::LT, ShiftTwoLoop);
7992 __ b(ShiftSIMDLoop);
7993
7994 __ BIND(ShiftTwoLoop);
7995 __ cbz(idx, Exit);
7996 __ cmp(idx, (u1)1);
7997 __ br(Assembler::EQ, ShiftOne);
7998
7999 // Calculate the load addresses
8000 __ sub(idx, idx, 2);
8001 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
8002 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
8003 __ add(oldArrCur, oldArrNext, 4);
8004
8005 // Load 2 words and process
8006 __ ld1(oldElem0, __ T2S, Address(oldArrCur));
8007 __ ld1(oldElem1, __ T2S, Address(oldArrNext));
8008 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8009 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8010 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8011 __ st1(newElem, __ T2S, Address(newArrCur));
8012 __ b(ShiftTwoLoop);
8013
8014 __ BIND(ShiftThree);
8015 __ tbz(idx, 1, ShiftOne);
8016 __ tbz(idx, 0, ShiftTwo);
8017 __ ldrw(r10, Address(oldArr, 12));
8018 __ ldrw(r11, Address(oldArr, 8));
8019 __ lsrvw(r10, r10, shiftCount);
8020 __ lslvw(r11, r11, shiftRevCount);
8021 __ orrw(r12, r10, r11);
8022 __ strw(r12, Address(newArr, 8));
8023
8024 __ BIND(ShiftTwo);
8025 __ ldrw(r10, Address(oldArr, 8));
8026 __ ldrw(r11, Address(oldArr, 4));
8027 __ lsrvw(r10, r10, shiftCount);
8028 __ lslvw(r11, r11, shiftRevCount);
8029 __ orrw(r12, r10, r11);
8030 __ strw(r12, Address(newArr, 4));
8031
8032 __ BIND(ShiftOne);
8033 __ ldrw(r10, Address(oldArr, 4));
8034 __ ldrw(r11, Address(oldArr));
8035 __ lsrvw(r10, r10, shiftCount);
8036 __ lslvw(r11, r11, shiftRevCount);
8037 __ orrw(r12, r10, r11);
8038 __ strw(r12, Address(newArr));
8039
8040 __ BIND(Exit);
8041 __ ret(lr);
8042
8043 return start;
8044 }
8045
8046 // Arguments:
8047 //
8048 // Input:
8049 // c_rarg0 - newArr address
8050 // c_rarg1 - oldArr address
8051 // c_rarg2 - newIdx
8052 // c_rarg3 - shiftCount
8053 // c_rarg4 - numIter
8054 //
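// For each i in [0, numIter), the stub computes (a sketch of the scalar
// recurrence that the SIMD and tail code below implement):
//   newArr[newIdx + i] = (oldArr[i]     <<  shiftCount)
//                      | (oldArr[i + 1] >>> (32 - shiftCount))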
8055 address generate_bigIntegerLeftShift() {
8056 __ align(CodeEntryAlignment);
8057 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
8058 StubCodeMark mark(this, stub_id);
8059 address start = __ pc();
8060
8061 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
8062
8063 Register newArr = c_rarg0;
8064 Register oldArr = c_rarg1;
8065 Register newIdx = c_rarg2;
8066 Register shiftCount = c_rarg3;
8067 Register numIter = c_rarg4;
8068
8069 Register shiftRevCount = rscratch1;
8070 Register oldArrNext = rscratch2;
8071
8072 FloatRegister oldElem0 = v0;
8073 FloatRegister oldElem1 = v1;
8074 FloatRegister newElem = v2;
8075 FloatRegister shiftVCount = v3;
8076 FloatRegister shiftVRevCount = v4;
8077
8078 __ cbz(numIter, Exit);
8079
8080 __ add(oldArrNext, oldArr, 4);
8081 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8082
8083 // right shift count
8084 __ movw(shiftRevCount, 32);
8085 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8086
// numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
8088 __ cmp(numIter, (u1)4);
8089 __ br(Assembler::LT, ShiftThree);
8090
8091 __ dup(shiftVCount, __ T4S, shiftCount);
8092 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8093 __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
8094
8095 __ BIND(ShiftSIMDLoop);
8096
8097 // load 4 words and process
8098 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
8099 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
8100 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8101 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8102 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8103 __ st1(newElem, __ T4S, __ post(newArr, 16));
8104 __ sub(numIter, numIter, 4);
8105
8106 __ cmp(numIter, (u1)4);
8107 __ br(Assembler::LT, ShiftTwoLoop);
8108 __ b(ShiftSIMDLoop);
8109
8110 __ BIND(ShiftTwoLoop);
8111 __ cbz(numIter, Exit);
8112 __ cmp(numIter, (u1)1);
8113 __ br(Assembler::EQ, ShiftOne);
8114
8115 // load 2 words and process
8116 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
8117 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
8118 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8119 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8120 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8121 __ st1(newElem, __ T2S, __ post(newArr, 8));
8122 __ sub(numIter, numIter, 2);
8123 __ b(ShiftTwoLoop);
8124
8125 __ BIND(ShiftThree);
8126 __ ldrw(r10, __ post(oldArr, 4));
8127 __ ldrw(r11, __ post(oldArrNext, 4));
8128 __ lslvw(r10, r10, shiftCount);
8129 __ lsrvw(r11, r11, shiftRevCount);
8130 __ orrw(r12, r10, r11);
8131 __ strw(r12, __ post(newArr, 4));
8132 __ tbz(numIter, 1, Exit);
8133 __ tbz(numIter, 0, ShiftOne);
8134
8135 __ BIND(ShiftTwo);
8136 __ ldrw(r10, __ post(oldArr, 4));
8137 __ ldrw(r11, __ post(oldArrNext, 4));
8138 __ lslvw(r10, r10, shiftCount);
8139 __ lsrvw(r11, r11, shiftRevCount);
8140 __ orrw(r12, r10, r11);
8141 __ strw(r12, __ post(newArr, 4));
8142
8143 __ BIND(ShiftOne);
8144 __ ldrw(r10, Address(oldArr));
8145 __ ldrw(r11, Address(oldArrNext));
8146 __ lslvw(r10, r10, shiftCount);
8147 __ lsrvw(r11, r11, shiftRevCount);
8148 __ orrw(r12, r10, r11);
8149 __ strw(r12, Address(newArr));
8150
8151 __ BIND(Exit);
8152 __ ret(lr);
8153
8154 return start;
8155 }
8156
8157 address generate_count_positives(address &count_positives_long) {
8158 const u1 large_loop_size = 64;
8159 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
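// 0x80 replicated into every byte: a byte is negative iff its top bit is set,
// so testing a 64-bit word against this mask detects any negative byte among
// the 8 bytes loaded.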
8160 int dcache_line = VM_Version::dcache_line_size();
8161
8162 Register ary1 = r1, len = r2, result = r0;
8163
8164 __ align(CodeEntryAlignment);
8165
8166 StubId stub_id = StubId::stubgen_count_positives_id;
8167 StubCodeMark mark(this, stub_id);
8168
8169 address entry = __ pc();
8170
8171 __ enter();
8172 // precondition: a copy of len is already in result
8173 // __ mov(result, len);
8174
8175 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
8176 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
8177
8178 __ cmp(len, (u1)15);
8179 __ br(Assembler::GT, LEN_OVER_15);
// The only case when execution falls into this code is when the pointer is
// near the end of a memory page and we have to avoid reading the next page.
8182 __ add(ary1, ary1, len);
8183 __ subs(len, len, 8);
8184 __ br(Assembler::GT, LEN_OVER_8);
8185 __ ldr(rscratch2, Address(ary1, -8));
8186 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
8187 __ lsrv(rscratch2, rscratch2, rscratch1);
8188 __ tst(rscratch2, UPPER_BIT_MASK);
8189 __ csel(result, zr, result, Assembler::NE);
8190 __ leave();
8191 __ ret(lr);
8192 __ bind(LEN_OVER_8);
8193 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
__ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
8195 __ tst(rscratch2, UPPER_BIT_MASK);
8196 __ br(Assembler::NE, RET_NO_POP);
8197 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
8198 __ lsrv(rscratch1, rscratch1, rscratch2);
8199 __ tst(rscratch1, UPPER_BIT_MASK);
8200 __ bind(RET_NO_POP);
8201 __ csel(result, zr, result, Assembler::NE);
8202 __ leave();
8203 __ ret(lr);
8204
8205 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
8206 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
8207
8208 count_positives_long = __ pc(); // 2nd entry point
8209
8210 __ enter();
8211
8212 __ bind(LEN_OVER_15);
8213 __ push(spilled_regs, sp);
8214 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
8215 __ cbz(rscratch2, ALIGNED);
8216 __ ldp(tmp6, tmp1, Address(ary1));
8217 __ mov(tmp5, 16);
8218 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
8219 __ add(ary1, ary1, rscratch1);
8220 __ orr(tmp6, tmp6, tmp1);
8221 __ tst(tmp6, UPPER_BIT_MASK);
8222 __ br(Assembler::NE, RET_ADJUST);
8223 __ sub(len, len, rscratch1);
8224
8225 __ bind(ALIGNED);
8226 __ cmp(len, large_loop_size);
8227 __ br(Assembler::LT, CHECK_16);
// Perform a 16-byte load as an early return in the pre-loop to handle the
// situation where an initially aligned large array has negative values in its
// starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
// worst case, which is slower. Cases with negative bytes further ahead are not
// affected much; in fact they get faster thanks to the early loads, fewer
// instructions and fewer branches in LARGE_LOOP.
8234 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
8235 __ sub(len, len, 16);
8236 __ orr(tmp6, tmp6, tmp1);
8237 __ tst(tmp6, UPPER_BIT_MASK);
8238 __ br(Assembler::NE, RET_ADJUST_16);
8239 __ cmp(len, large_loop_size);
8240 __ br(Assembler::LT, CHECK_16);
8241
8242 if (SoftwarePrefetchHintDistance >= 0
8243 && SoftwarePrefetchHintDistance >= dcache_line) {
8244 // initial prefetch
8245 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
8246 }
8247 __ bind(LARGE_LOOP);
8248 if (SoftwarePrefetchHintDistance >= 0) {
8249 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
8250 }
// Issue the load instructions first, since that can save a few CPU/memory
// cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one
// per ldp), it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...),
// which saves 3 instructions and has fewer branches. The downside is that this
// approach disables the early return, so all 64 bytes are loaded and checked
// every time.
8256 __ ldp(tmp2, tmp3, Address(ary1));
8257 __ ldp(tmp4, tmp5, Address(ary1, 16));
8258 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
8259 __ ldp(tmp6, tmp1, Address(ary1, 48));
8260 __ add(ary1, ary1, large_loop_size);
8261 __ sub(len, len, large_loop_size);
8262 __ orr(tmp2, tmp2, tmp3);
8263 __ orr(tmp4, tmp4, tmp5);
8264 __ orr(rscratch1, rscratch1, rscratch2);
8265 __ orr(tmp6, tmp6, tmp1);
8266 __ orr(tmp2, tmp2, tmp4);
8267 __ orr(rscratch1, rscratch1, tmp6);
8268 __ orr(tmp2, tmp2, rscratch1);
8269 __ tst(tmp2, UPPER_BIT_MASK);
8270 __ br(Assembler::NE, RET_ADJUST_LONG);
8271 __ cmp(len, large_loop_size);
8272 __ br(Assembler::GE, LARGE_LOOP);
8273
8274 __ bind(CHECK_16); // small 16-byte load pre-loop
8275 __ cmp(len, (u1)16);
8276 __ br(Assembler::LT, POST_LOOP16);
8277
8278 __ bind(LOOP16); // small 16-byte load loop
8279 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
8280 __ sub(len, len, 16);
8281 __ orr(tmp2, tmp2, tmp3);
8282 __ tst(tmp2, UPPER_BIT_MASK);
8283 __ br(Assembler::NE, RET_ADJUST_16);
8284 __ cmp(len, (u1)16);
8285 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
8286
8287 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
8288 __ cmp(len, (u1)8);
8289 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
8290 __ ldr(tmp3, Address(__ post(ary1, 8)));
8291 __ tst(tmp3, UPPER_BIT_MASK);
8292 __ br(Assembler::NE, RET_ADJUST);
8293 __ sub(len, len, 8);
8294
8295 __ bind(POST_LOOP16_LOAD_TAIL);
8296 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
8297 __ ldr(tmp1, Address(ary1));
8298 __ mov(tmp2, 64);
8299 __ sub(tmp4, tmp2, len, __ LSL, 3);
8300 __ lslv(tmp1, tmp1, tmp4);
8301 __ tst(tmp1, UPPER_BIT_MASK);
8302 __ br(Assembler::NE, RET_ADJUST);
8303 // Fallthrough
8304
8305 __ bind(RET_LEN);
8306 __ pop(spilled_regs, sp);
8307 __ leave();
8308 __ ret(lr);
8309
// The difference result - len is the count of bytes that are guaranteed to be
// positive.
8312
8313 __ bind(RET_ADJUST_LONG);
8314 __ add(len, len, (u1)(large_loop_size - 16));
8315 __ bind(RET_ADJUST_16);
8316 __ add(len, len, 16);
8317 __ bind(RET_ADJUST);
8318 __ pop(spilled_regs, sp);
8319 __ leave();
8320 __ sub(result, result, len);
8321 __ ret(lr);
8322
8323 return entry;
8324 }
8325
8326 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
8327 bool usePrefetch, Label &NOT_EQUAL) {
8328 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8329 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8330 tmp7 = r12, tmp8 = r13;
8331 Label LOOP;
8332
8333 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8334 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8335 __ bind(LOOP);
8336 if (usePrefetch) {
8337 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8338 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8339 }
8340 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8341 __ eor(tmp1, tmp1, tmp2);
8342 __ eor(tmp3, tmp3, tmp4);
8343 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8344 __ orr(tmp1, tmp1, tmp3);
8345 __ cbnz(tmp1, NOT_EQUAL);
8346 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8347 __ eor(tmp5, tmp5, tmp6);
8348 __ eor(tmp7, tmp7, tmp8);
8349 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8350 __ orr(tmp5, tmp5, tmp7);
8351 __ cbnz(tmp5, NOT_EQUAL);
8352 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8353 __ eor(tmp1, tmp1, tmp2);
8354 __ eor(tmp3, tmp3, tmp4);
8355 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8356 __ orr(tmp1, tmp1, tmp3);
8357 __ cbnz(tmp1, NOT_EQUAL);
8358 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8359 __ eor(tmp5, tmp5, tmp6);
8360 __ sub(cnt1, cnt1, 8 * wordSize);
8361 __ eor(tmp7, tmp7, tmp8);
8362 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
// tmp6 is not used. MacroAssembler::subs is used here (rather than
// cmp) because subs allows an unrestricted range of immediate operands.
8365 __ subs(tmp6, cnt1, loopThreshold);
8366 __ orr(tmp5, tmp5, tmp7);
8367 __ cbnz(tmp5, NOT_EQUAL);
8368 __ br(__ GE, LOOP);
8369 // post-loop
8370 __ eor(tmp1, tmp1, tmp2);
8371 __ eor(tmp3, tmp3, tmp4);
8372 __ orr(tmp1, tmp1, tmp3);
8373 __ sub(cnt1, cnt1, 2 * wordSize);
8374 __ cbnz(tmp1, NOT_EQUAL);
8375 }
8376
8377 void generate_large_array_equals_loop_simd(int loopThreshold,
8378 bool usePrefetch, Label &NOT_EQUAL) {
8379 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8380 tmp2 = rscratch2;
8381 Label LOOP;
8382
8383 __ bind(LOOP);
8384 if (usePrefetch) {
8385 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8386 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8387 }
8388 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
8389 __ sub(cnt1, cnt1, 8 * wordSize);
8390 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
8391 __ subs(tmp1, cnt1, loopThreshold);
8392 __ eor(v0, __ T16B, v0, v4);
8393 __ eor(v1, __ T16B, v1, v5);
8394 __ eor(v2, __ T16B, v2, v6);
8395 __ eor(v3, __ T16B, v3, v7);
8396 __ orr(v0, __ T16B, v0, v1);
8397 __ orr(v1, __ T16B, v2, v3);
8398 __ orr(v0, __ T16B, v0, v1);
8399 __ umov(tmp1, v0, __ D, 0);
8400 __ umov(tmp2, v0, __ D, 1);
8401 __ orr(tmp1, tmp1, tmp2);
8402 __ cbnz(tmp1, NOT_EQUAL);
8403 __ br(__ GE, LOOP);
8404 }
8405
8406 // a1 = r1 - array1 address
8407 // a2 = r2 - array2 address
8408 // result = r0 - return value. Already contains "false"
8409 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
8410 // r3-r5 are reserved temporary registers
8411 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
8412 address generate_large_array_equals() {
8413 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8414 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8415 tmp7 = r12, tmp8 = r13;
8416 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
8417 SMALL_LOOP, POST_LOOP;
8418 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
// loop threshold chosen so that at least 32 of the prefetched bytes are used
8420 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
8421 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
8422 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
8423 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
8424 tmp5, tmp6, tmp7, tmp8);
8425
8426 __ align(CodeEntryAlignment);
8427
8428 StubId stub_id = StubId::stubgen_large_array_equals_id;
8429 StubCodeMark mark(this, stub_id);
8430
8431 address entry = __ pc();
8432 __ enter();
8433 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
8434 // also advance pointers to use post-increment instead of pre-increment
8435 __ add(a1, a1, wordSize);
8436 __ add(a2, a2, wordSize);
8437 if (AvoidUnalignedAccesses) {
// Both implementations (SIMD/non-SIMD) use relatively large load
// instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
// time) on some CPUs when the address is not at least 16-byte aligned.
// Arrays are currently 8-byte aligned, so an additional 8-byte load can be
// issued if needed to make at least the first address 16-byte aligned.
8443 Label ALIGNED16;
8444 __ tbz(a1, 3, ALIGNED16);
8445 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8446 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8447 __ sub(cnt1, cnt1, wordSize);
8448 __ eor(tmp1, tmp1, tmp2);
8449 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
8450 __ bind(ALIGNED16);
8451 }
8452 if (UseSIMDForArrayEquals) {
8453 if (SoftwarePrefetchHintDistance >= 0) {
8454 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8455 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8456 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
8457 /* prfm = */ true, NOT_EQUAL);
8458 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8459 __ br(__ LT, TAIL);
8460 }
8461 __ bind(NO_PREFETCH_LARGE_LOOP);
8462 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
8463 /* prfm = */ false, NOT_EQUAL);
8464 } else {
8465 __ push(spilled_regs, sp);
8466 if (SoftwarePrefetchHintDistance >= 0) {
8467 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8468 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8469 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
8470 /* prfm = */ true, NOT_EQUAL);
8471 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8472 __ br(__ LT, TAIL);
8473 }
8474 __ bind(NO_PREFETCH_LARGE_LOOP);
8475 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
8476 /* prfm = */ false, NOT_EQUAL);
8477 }
8478 __ bind(TAIL);
8479 __ cbz(cnt1, EQUAL);
8480 __ subs(cnt1, cnt1, wordSize);
8481 __ br(__ LE, POST_LOOP);
8482 __ bind(SMALL_LOOP);
8483 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8484 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8485 __ subs(cnt1, cnt1, wordSize);
8486 __ eor(tmp1, tmp1, tmp2);
8487 __ cbnz(tmp1, NOT_EQUAL);
8488 __ br(__ GT, SMALL_LOOP);
8489 __ bind(POST_LOOP);
8490 __ ldr(tmp1, Address(a1, cnt1));
8491 __ ldr(tmp2, Address(a2, cnt1));
8492 __ eor(tmp1, tmp1, tmp2);
8493 __ cbnz(tmp1, NOT_EQUAL);
8494 __ bind(EQUAL);
8495 __ mov(result, true);
8496 __ bind(NOT_EQUAL);
8497 if (!UseSIMDForArrayEquals) {
8498 __ pop(spilled_regs, sp);
8499 }
8500 __ bind(NOT_EQUAL_NO_POP);
8501 __ leave();
8502 __ ret(lr);
8503 return entry;
8504 }
8505
8506 // result = r0 - return value. Contains initial hashcode value on entry.
8507 // ary = r1 - array address
8508 // cnt = r2 - elements count
8509 // Clobbers: v0-v13, rscratch1, rscratch2
8510 address generate_large_arrays_hashcode(BasicType eltype) {
8511 const Register result = r0, ary = r1, cnt = r2;
8512 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
8513 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
8514 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
8515 const FloatRegister vpowm = v13;
8516
8517 ARRAYS_HASHCODE_REGISTERS;
8518
8519 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
8520
8521 unsigned int vf; // vectorization factor
8522 bool multiply_by_halves;
8523 Assembler::SIMD_Arrangement load_arrangement;
8524 switch (eltype) {
8525 case T_BOOLEAN:
8526 case T_BYTE:
8527 load_arrangement = Assembler::T8B;
8528 multiply_by_halves = true;
8529 vf = 8;
8530 break;
8531 case T_CHAR:
8532 case T_SHORT:
8533 load_arrangement = Assembler::T8H;
8534 multiply_by_halves = true;
8535 vf = 8;
8536 break;
8537 case T_INT:
8538 load_arrangement = Assembler::T4S;
8539 multiply_by_halves = false;
8540 vf = 4;
8541 break;
8542 default:
8543 ShouldNotReachHere();
8544 }
8545
8546 // Unroll factor
8547 const unsigned uf = 4;
8548
8549 // Effective vectorization factor
8550 const unsigned evf = vf * uf;
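
// The scalar recurrence h = 31 * h + a[i] is vectorized by keeping four
// interleaved partial hashes in the lanes of vmul0 (plus vmul1..vmul3 in the
// unrolled LARGE LOOP): each step scales the lanes by a power of 31 held in
// vpowm and folds in the next vf elements, and the epilogues recombine the
// lanes using the weights <31^3, 31^2, 31^1, 31^0> kept in vpow.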
8551
8552 __ align(CodeEntryAlignment);
8553
8554 StubId stub_id;
8555 switch (eltype) {
8556 case T_BOOLEAN:
8557 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
8558 break;
8559 case T_BYTE:
8560 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
8561 break;
8562 case T_CHAR:
8563 stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
8564 break;
8565 case T_SHORT:
8566 stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
8567 break;
8568 case T_INT:
8569 stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
8570 break;
8571 default:
8572 stub_id = StubId::NO_STUBID;
8573 ShouldNotReachHere();
8574 };
8575
8576 StubCodeMark mark(this, stub_id);
8577
8578 address entry = __ pc();
8579 __ enter();
8580
// Put the 0th..3rd powers of 31 into a single SIMD register. The register is used in
// the SMALL and LARGE LOOPs' epilogues. The initialization is hoisted here and the
// register's value does not change throughout either loop.
8584 __ movw(rscratch1, intpow(31U, 3));
8585 __ mov(vpow, Assembler::S, 0, rscratch1);
8586 __ movw(rscratch1, intpow(31U, 2));
8587 __ mov(vpow, Assembler::S, 1, rscratch1);
8588 __ movw(rscratch1, intpow(31U, 1));
8589 __ mov(vpow, Assembler::S, 2, rscratch1);
8590 __ movw(rscratch1, intpow(31U, 0));
8591 __ mov(vpow, Assembler::S, 3, rscratch1);
8592
8593 __ mov(vmul0, Assembler::T16B, 0);
8594 __ mov(vmul0, Assembler::S, 3, result);
8595
8596 __ andr(rscratch2, cnt, (uf - 1) * vf);
8597 __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
8598
8599 __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
8600 __ mov(vpowm, Assembler::S, 0, rscratch1);
8601
8602 // SMALL LOOP
8603 __ bind(SMALL_LOOP);
8604
8605 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
8606 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8607 __ subsw(rscratch2, rscratch2, vf);
8608
8609 if (load_arrangement == Assembler::T8B) {
8610 // Extend 8B to 8H to be able to use vector multiply
8611 // instructions
8612 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8613 if (is_signed_subword_type(eltype)) {
8614 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8615 } else {
8616 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8617 }
8618 }
8619
8620 switch (load_arrangement) {
8621 case Assembler::T4S:
8622 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8623 break;
8624 case Assembler::T8B:
8625 case Assembler::T8H:
8626 assert(is_subword_type(eltype), "subword type expected");
8627 if (is_signed_subword_type(eltype)) {
8628 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8629 } else {
8630 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8631 }
8632 break;
8633 default:
8634 __ should_not_reach_here();
8635 }
8636
8637 // Process the upper half of a vector
8638 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8639 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8640 if (is_signed_subword_type(eltype)) {
8641 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8642 } else {
8643 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8644 }
8645 }
8646
8647 __ br(Assembler::HI, SMALL_LOOP);
8648
// SMALL LOOP'S EPILOGUE
8650 __ lsr(rscratch2, cnt, exact_log2(evf));
8651 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
8652
8653 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8654 __ addv(vmul0, Assembler::T4S, vmul0);
8655 __ umov(result, vmul0, Assembler::S, 0);
8656
8657 // TAIL
8658 __ bind(TAIL);
8659
// The andr computes cnt % vf. The subtract, shifted by 3 (4 on Cortex-A53), jumps past
// vf - 1 - (cnt % vf) pairs of load + madd insns, i.e. only cnt % vf load + madd pairs
// are executed.
8662 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
8663 __ andr(rscratch2, cnt, vf - 1);
8664 __ bind(TAIL_SHORTCUT);
8665 __ adr(rscratch1, BR_BASE);
// For Cortex-A53 the shift is 4 because 2 nops are generated.
8667 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
8668 __ movw(rscratch2, 0x1f);
8669 __ br(rscratch1);
8670
8671 for (size_t i = 0; i < vf - 1; ++i) {
8672 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
8673 eltype);
8674 __ maddw(result, result, rscratch2, rscratch1);
8675 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
8676 // Generate 2nd nop to have 4 instructions per iteration.
8677 if (VM_Version::supports_a53mac()) {
8678 __ nop();
8679 }
8680 }
8681 __ bind(BR_BASE);
8682
8683 __ leave();
8684 __ ret(lr);
8685
8686 // LARGE LOOP
8687 __ bind(LARGE_LOOP_PREHEADER);
8688
8689 __ lsr(rscratch2, cnt, exact_log2(evf));
8690
8691 if (multiply_by_halves) {
8692 // 31^4 - multiplier between lower and upper parts of a register
8693 __ movw(rscratch1, intpow(31U, vf / 2));
8694 __ mov(vpowm, Assembler::S, 1, rscratch1);
// 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
8696 __ movw(rscratch1, intpow(31U, evf - vf / 2));
8697 __ mov(vpowm, Assembler::S, 0, rscratch1);
8698 } else {
8699 // 31^16
8700 __ movw(rscratch1, intpow(31U, evf));
8701 __ mov(vpowm, Assembler::S, 0, rscratch1);
8702 }
8703
8704 __ mov(vmul3, Assembler::T16B, 0);
8705 __ mov(vmul2, Assembler::T16B, 0);
8706 __ mov(vmul1, Assembler::T16B, 0);
8707
8708 __ bind(LARGE_LOOP);
8709
8710 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
8711 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
8712 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
8713 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8714
8715 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
8716 Address(__ post(ary, evf * type2aelembytes(eltype))));
8717
8718 if (load_arrangement == Assembler::T8B) {
8719 // Extend 8B to 8H to be able to use vector multiply
8720 // instructions
8721 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8722 if (is_signed_subword_type(eltype)) {
8723 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8724 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8725 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8726 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8727 } else {
8728 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8729 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8730 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8731 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8732 }
8733 }
8734
8735 switch (load_arrangement) {
8736 case Assembler::T4S:
8737 __ addv(vmul3, load_arrangement, vmul3, vdata3);
8738 __ addv(vmul2, load_arrangement, vmul2, vdata2);
8739 __ addv(vmul1, load_arrangement, vmul1, vdata1);
8740 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8741 break;
8742 case Assembler::T8B:
8743 case Assembler::T8H:
8744 assert(is_subword_type(eltype), "subword type expected");
8745 if (is_signed_subword_type(eltype)) {
8746 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8747 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8748 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8749 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8750 } else {
8751 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8752 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8753 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8754 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8755 }
8756 break;
8757 default:
8758 __ should_not_reach_here();
8759 }
8760
8761 // Process the upper half of a vector
8762 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8763 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
8764 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
8765 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
8766 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
8767 if (is_signed_subword_type(eltype)) {
8768 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8769 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8770 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8771 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8772 } else {
8773 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8774 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8775 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8776 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8777 }
8778 }
8779
8780 __ subsw(rscratch2, rscratch2, 1);
8781 __ br(Assembler::HI, LARGE_LOOP);
8782
8783 __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
8784 __ addv(vmul3, Assembler::T4S, vmul3);
8785 __ umov(result, vmul3, Assembler::S, 0);
8786
8787 __ mov(rscratch2, intpow(31U, vf));
8788
8789 __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
8790 __ addv(vmul2, Assembler::T4S, vmul2);
8791 __ umov(rscratch1, vmul2, Assembler::S, 0);
8792 __ maddw(result, result, rscratch2, rscratch1);
8793
8794 __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
8795 __ addv(vmul1, Assembler::T4S, vmul1);
8796 __ umov(rscratch1, vmul1, Assembler::S, 0);
8797 __ maddw(result, result, rscratch2, rscratch1);
8798
8799 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8800 __ addv(vmul0, Assembler::T4S, vmul0);
8801 __ umov(rscratch1, vmul0, Assembler::S, 0);
8802 __ maddw(result, result, rscratch2, rscratch1);
8803
8804 __ andr(rscratch2, cnt, vf - 1);
8805 __ cbnz(rscratch2, TAIL_SHORTCUT);
8806
8807 __ leave();
8808 __ ret(lr);
8809
8810 return entry;
8811 }
8812
8813 address generate_dsin_dcos(bool isCos) {
8814 __ align(CodeEntryAlignment);
8815 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
8816 StubCodeMark mark(this, stub_id);
8817 address start = __ pc();
8818 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
8819 (address)StubRoutines::aarch64::_two_over_pi,
8820 (address)StubRoutines::aarch64::_pio2,
8821 (address)StubRoutines::aarch64::_dsin_coef,
8822 (address)StubRoutines::aarch64::_dcos_coef);
8823 return start;
8824 }
8825
// Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings.
8827 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
8828 Label &DIFF2) {
8829 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
8830 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
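
// tmp2 points at the Latin1 string and cnt1 at the UTF-16 string (both set up
// by the caller). zip1/zip2 against the zero register vtmpZ widen the 16
// Latin1 bytes to UTF-16, so they can be compared 8 bytes (4 characters) at a
// time with values loaded from the UTF-16 string.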
8831
8832 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
8833 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8834 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
8835 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
8836
8837 __ fmovd(tmpL, vtmp3);
8838 __ eor(rscratch2, tmp3, tmpL);
8839 __ cbnz(rscratch2, DIFF2);
8840
8841 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8842 __ umov(tmpL, vtmp3, __ D, 1);
8843 __ eor(rscratch2, tmpU, tmpL);
8844 __ cbnz(rscratch2, DIFF1);
8845
8846 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
8847 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8848 __ fmovd(tmpL, vtmp);
8849 __ eor(rscratch2, tmp3, tmpL);
8850 __ cbnz(rscratch2, DIFF2);
8851
8852 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8853 __ umov(tmpL, vtmp, __ D, 1);
8854 __ eor(rscratch2, tmpU, tmpL);
8855 __ cbnz(rscratch2, DIFF1);
8856 }
8857
8858 // r0 = result
8859 // r1 = str1
8860 // r2 = cnt1
8861 // r3 = str2
8862 // r4 = cnt2
8863 // r10 = tmp1
8864 // r11 = tmp2
8865 address generate_compare_long_string_different_encoding(bool isLU) {
8866 __ align(CodeEntryAlignment);
8867 StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
8868 StubCodeMark mark(this, stub_id);
8869 address entry = __ pc();
8870 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
8871 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
8872 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
8873 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
8874 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
8875 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
8876 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
8877
8878 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
8879
8880 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
8881 // cnt2 == amount of characters left to compare
// Check the already loaded first 4 symbols (vtmp and tmp2(LU)/tmp1(UL))
8883 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8884 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
8885 __ add(str2, str2, isLU ? wordSize : wordSize/2);
8886 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
__ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. The last 4 are a special case.
8888 __ eor(rscratch2, tmp1, tmp2);
8889 __ mov(rscratch1, tmp2);
8890 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
8891 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
8892 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
8893 __ push(spilled_regs, sp);
8894 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
8895 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
8896
8897 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8898
8899 if (SoftwarePrefetchHintDistance >= 0) {
8900 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8901 __ br(__ LT, NO_PREFETCH);
8902 __ bind(LARGE_LOOP_PREFETCH);
8903 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
8904 __ mov(tmp4, 2);
8905 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8906 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
8907 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8908 __ subs(tmp4, tmp4, 1);
8909 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
8910 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8911 __ mov(tmp4, 2);
8912 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
8913 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8914 __ subs(tmp4, tmp4, 1);
8915 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
8916 __ sub(cnt2, cnt2, 64);
8917 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8918 __ br(__ GE, LARGE_LOOP_PREFETCH);
8919 }
8920 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
8921 __ bind(NO_PREFETCH);
8922 __ subs(cnt2, cnt2, 16);
8923 __ br(__ LT, TAIL);
8924 __ align(OptoLoopAlignment);
8925 __ bind(SMALL_LOOP); // smaller loop
8926 __ subs(cnt2, cnt2, 16);
8927 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8928 __ br(__ GE, SMALL_LOOP);
8929 __ cmn(cnt2, (u1)16);
8930 __ br(__ EQ, LOAD_LAST);
8931 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
8932 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
8933 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
8934 __ ldr(tmp3, Address(cnt1, -8));
8935 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
8936 __ b(LOAD_LAST);
8937 __ bind(DIFF2);
8938 __ mov(tmpU, tmp3);
8939 __ bind(DIFF1);
8940 __ pop(spilled_regs, sp);
8941 __ b(CALCULATE_DIFFERENCE);
8942 __ bind(LOAD_LAST);
8943 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
8944 // No need to load it again
8945 __ mov(tmpU, tmp3);
8946 __ pop(spilled_regs, sp);
8947
8948 // tmp2 points to the address of the last 4 Latin1 characters right now
8949 __ ldrs(vtmp, Address(tmp2));
8950 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8951 __ fmovd(tmpL, vtmp);
8952
8953 __ eor(rscratch2, tmpU, tmpL);
8954 __ cbz(rscratch2, DONE);
8955
8956 // Find the first different characters in the longwords and
8957 // compute their difference.
8958 __ bind(CALCULATE_DIFFERENCE);
8959 __ rev(rscratch2, rscratch2);
8960 __ clz(rscratch2, rscratch2);
8961 __ andr(rscratch2, rscratch2, -16);
8962 __ lsrv(tmp1, tmp1, rscratch2);
8963 __ uxthw(tmp1, tmp1);
8964 __ lsrv(rscratch1, rscratch1, rscratch2);
8965 __ uxthw(rscratch1, rscratch1);
8966 __ subw(result, tmp1, rscratch1);
8967 __ bind(DONE);
8968 __ ret(lr);
8969 return entry;
8970 }
8971
8972 // r0 = input (float16)
8973 // v0 = result (float)
8974 // v1 = temporary float register
8975 address generate_float16ToFloat() {
8976 __ align(CodeEntryAlignment);
8977 StubId stub_id = StubId::stubgen_hf2f_id;
8978 StubCodeMark mark(this, stub_id);
8979 address entry = __ pc();
8980 BLOCK_COMMENT("Entry:");
8981 __ flt16_to_flt(v0, r0, v1);
8982 __ ret(lr);
8983 return entry;
8984 }
8985
8986 // v0 = input (float)
8987 // r0 = result (float16)
8988 // v1 = temporary float register
8989 address generate_floatToFloat16() {
8990 __ align(CodeEntryAlignment);
8991 StubId stub_id = StubId::stubgen_f2hf_id;
8992 StubCodeMark mark(this, stub_id);
8993 address entry = __ pc();
8994 BLOCK_COMMENT("Entry:");
8995 __ flt_to_flt16(r0, v0, v1);
8996 __ ret(lr);
8997 return entry;
8998 }
8999
9000 address generate_method_entry_barrier() {
9001 __ align(CodeEntryAlignment);
9002 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
9003 StubCodeMark mark(this, stub_id);
9004
9005 Label deoptimize_label;
9006
9007 address start = __ pc();
9008
9009 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
9010
9011 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
9012 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
9013 // We can get here despite the nmethod being good, if we have not
9014 // yet applied our cross modification fence (or data fence).
9015 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
9016 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
9017 __ ldrw(rscratch2, rscratch2);
9018 __ strw(rscratch2, thread_epoch_addr);
9019 __ isb();
9020 __ membar(__ LoadLoad);
9021 }
9022
9023 __ set_last_Java_frame(sp, rfp, lr, rscratch1);
9024
9025 __ enter();
9026 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
9027
9028 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
9029
9030 __ push_call_clobbered_registers();
9031
9032 __ mov(c_rarg0, rscratch2);
9033 __ call_VM_leaf
9034 (CAST_FROM_FN_PTR
9035 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
9036
9037 __ reset_last_Java_frame(true);
9038
9039 __ mov(rscratch1, r0);
9040
9041 __ pop_call_clobbered_registers();
9042
9043 __ cbnz(rscratch1, deoptimize_label);
9044
9045 __ leave();
9046 __ ret(lr);
9047
9048 __ BIND(deoptimize_label);
9049
9050 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
9051 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
9052
9053 __ mov(sp, rscratch1);
9054 __ br(rscratch2);
9055
9056 return start;
9057 }
9058
9059 // r0 = result
9060 // r1 = str1
9061 // r2 = cnt1
9062 // r3 = str2
9063 // r4 = cnt2
9064 // r10 = tmp1
9065 // r11 = tmp2
9066 address generate_compare_long_string_same_encoding(bool isLL) {
9067 __ align(CodeEntryAlignment);
9068 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
9069 StubCodeMark mark(this, stub_id);
9070 address entry = __ pc();
9071 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9072 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
9073
9074 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
9075
// Exit the large loop when fewer than 64 bytes are left to read or we are
// about to prefetch memory beyond the array boundary.
9078 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
9079
// The caller pre-loads 8 bytes before jumping to the stub, so compare them directly.
9081 __ eor(rscratch2, tmp1, tmp2);
9082 __ cbnz(rscratch2, CAL_DIFFERENCE);
9083
9084 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
9085 // update pointers, because of previous read
9086 __ add(str1, str1, wordSize);
9087 __ add(str2, str2, wordSize);
9088 if (SoftwarePrefetchHintDistance >= 0) {
9089 __ align(OptoLoopAlignment);
9090 __ bind(LARGE_LOOP_PREFETCH);
9091 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
9092 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
9093
9094 for (int i = 0; i < 4; i++) {
9095 __ ldp(tmp1, tmp1h, Address(str1, i * 16));
9096 __ ldp(tmp2, tmp2h, Address(str2, i * 16));
9097 __ cmp(tmp1, tmp2);
9098 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9099 __ br(Assembler::NE, DIFF);
9100 }
9101 __ sub(cnt2, cnt2, isLL ? 64 : 32);
9102 __ add(str1, str1, 64);
9103 __ add(str2, str2, 64);
9104 __ subs(rscratch2, cnt2, largeLoopExitCondition);
9105 __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
9106 __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
9107 }
9108
9109 __ subs(rscratch1, cnt2, isLL ? 16 : 8);
9110 __ br(Assembler::LE, LESS16);
9111 __ align(OptoLoopAlignment);
9112 __ bind(LOOP_COMPARE16);
9113 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9114 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9115 __ cmp(tmp1, tmp2);
9116 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9117 __ br(Assembler::NE, DIFF);
9118 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9119 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9120 __ br(Assembler::LT, LESS16);
9121
9122 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9123 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9124 __ cmp(tmp1, tmp2);
9125 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9126 __ br(Assembler::NE, DIFF);
9127 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9128 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9129 __ br(Assembler::GE, LOOP_COMPARE16);
9130 __ cbz(cnt2, LENGTH_DIFF);
9131
9132 __ bind(LESS16);
// compare 8 bytes at a time
9134 __ subs(cnt2, cnt2, isLL ? 8 : 4);
9135 __ br(Assembler::LE, LESS8);
9136 __ ldr(tmp1, Address(__ post(str1, 8)));
9137 __ ldr(tmp2, Address(__ post(str2, 8)));
9138 __ eor(rscratch2, tmp1, tmp2);
9139 __ cbnz(rscratch2, CAL_DIFFERENCE);
9140 __ sub(cnt2, cnt2, isLL ? 8 : 4);
9141
9142 __ bind(LESS8); // directly load last 8 bytes
9143 if (!isLL) {
9144 __ add(cnt2, cnt2, cnt2);
9145 }
9146 __ ldr(tmp1, Address(str1, cnt2));
9147 __ ldr(tmp2, Address(str2, cnt2));
9148 __ eor(rscratch2, tmp1, tmp2);
9149 __ cbz(rscratch2, LENGTH_DIFF);
9150 __ b(CAL_DIFFERENCE);
9151
9152 __ bind(DIFF);
9153 __ cmp(tmp1, tmp2);
9154 __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
9155 __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
9156 // reuse rscratch2 register for the result of eor instruction
9157 __ eor(rscratch2, tmp1, tmp2);
9158
9159 __ bind(CAL_DIFFERENCE);
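// rscratch2 = tmp1 ^ tmp2 is non-zero here. The strings were loaded
// little-endian, so rev + clz yields the bit index of the first differing
// byte; rounding that down to a character boundary and shifting both words
// right by that amount brings the first differing characters to bit 0, where
// they are zero-extended and subtracted.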
9160 __ rev(rscratch2, rscratch2);
9161 __ clz(rscratch2, rscratch2);
9162 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
9163 __ lsrv(tmp1, tmp1, rscratch2);
9164 __ lsrv(tmp2, tmp2, rscratch2);
9165 if (isLL) {
9166 __ uxtbw(tmp1, tmp1);
9167 __ uxtbw(tmp2, tmp2);
9168 } else {
9169 __ uxthw(tmp1, tmp1);
9170 __ uxthw(tmp2, tmp2);
9171 }
9172 __ subw(result, tmp1, tmp2);
9173
9174 __ bind(LENGTH_DIFF);
9175 __ ret(lr);
9176 return entry;
9177 }
9178
9179 enum string_compare_mode {
9180 LL,
9181 LU,
9182 UL,
9183 UU,
9184 };
9185
9186 // The following registers are declared in aarch64.ad
9187 // r0 = result
9188 // r1 = str1
9189 // r2 = cnt1
9190 // r3 = str2
9191 // r4 = cnt2
9192 // r10 = tmp1
9193 // r11 = tmp2
9194 // z0 = ztmp1
9195 // z1 = ztmp2
9196 // p0 = pgtmp1
9197 // p1 = pgtmp2
9198 address generate_compare_long_string_sve(string_compare_mode mode) {
9199 StubId stub_id;
9200 switch (mode) {
9201 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
9202 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
9203 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
9204 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
9205 default: ShouldNotReachHere();
9206 }
9207
9208 __ align(CodeEntryAlignment);
9209 address entry = __ pc();
9210 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9211 tmp1 = r10, tmp2 = r11;
9212
9213 Label LOOP, DONE, MISMATCH;
9214 Register vec_len = tmp1;
9215 Register idx = tmp2;
9216 // The minimum of the string lengths has been stored in cnt2.
9217 Register cnt = cnt2;
9218 FloatRegister ztmp1 = z0, ztmp2 = z1;
9219 PRegister pgtmp1 = p0, pgtmp2 = p1;
9220
9221 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
9222 switch (mode) { \
9223 case LL: \
9224 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
9225 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
9226 break; \
9227 case LU: \
9228 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
9229 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9230 break; \
9231 case UL: \
9232 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9233 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
9234 break; \
9235 case UU: \
9236 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9237 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9238 break; \
9239 default: \
9240 ShouldNotReachHere(); \
9241 }
9242
9243 StubCodeMark mark(this, stub_id);
9244
9245 __ mov(idx, 0);
9246 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9247
9248 if (mode == LL) {
9249 __ sve_cntb(vec_len);
9250 } else {
9251 __ sve_cnth(vec_len);
9252 }
9253
9254 __ sub(rscratch1, cnt, vec_len);
9255
9256 __ bind(LOOP);
9257
9258 // main loop
9259 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9260 __ add(idx, idx, vec_len);
9261 // Compare strings.
9262 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9263 __ br(__ NE, MISMATCH);
9264 __ cmp(idx, rscratch1);
9265 __ br(__ LT, LOOP);
9266
9267 // post loop, last iteration
9268 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9269
9270 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9271 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9272 __ br(__ EQ, DONE);
9273
9274 __ bind(MISMATCH);
9275
9276 // Crop the vector to find its location.
9277 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
9278 // Extract the first different characters of each string.
9279 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
9280 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
9281
9282 // Compute the difference of the first different characters.
9283 __ sub(result, rscratch1, rscratch2);
9284
9285 __ bind(DONE);
9286 __ ret(lr);
9287 #undef LOAD_PAIR
9288 return entry;
9289 }
9290
9291 void generate_compare_long_strings() {
9292 if (UseSVE == 0) {
9293 StubRoutines::aarch64::_compare_long_string_LL
9294 = generate_compare_long_string_same_encoding(true);
9295 StubRoutines::aarch64::_compare_long_string_UU
9296 = generate_compare_long_string_same_encoding(false);
9297 StubRoutines::aarch64::_compare_long_string_LU
9298 = generate_compare_long_string_different_encoding(true);
9299 StubRoutines::aarch64::_compare_long_string_UL
9300 = generate_compare_long_string_different_encoding(false);
9301 } else {
9302 StubRoutines::aarch64::_compare_long_string_LL
9303 = generate_compare_long_string_sve(LL);
9304 StubRoutines::aarch64::_compare_long_string_UU
9305 = generate_compare_long_string_sve(UU);
9306 StubRoutines::aarch64::_compare_long_string_LU
9307 = generate_compare_long_string_sve(LU);
9308 StubRoutines::aarch64::_compare_long_string_UL
9309 = generate_compare_long_string_sve(UL);
9310 }
9311 }
9312
9313 // R0 = result
9314 // R1 = str2
9315 // R2 = cnt1
9316 // R3 = str1
9317 // R4 = cnt2
9318 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
9319 //
// This generic linear code uses a few additional ideas that make it faster:
// 1) we can safely keep at least the 1st register of the pattern (since its
// length >= 8) in order to skip the initial load (helps on systems with a
// single load pipeline)
// 2) we can use a "fast" algorithm for finding the first pattern character
// with fewer branches (1 branch per loaded register instead of one branch per
// character); this is where constants like 0x0101...01, 0x00010001...0001,
// 0x7f7f...7f, 0x7fff7fff...7fff come from (see the note below)
// 3) after loading and analyzing the 1st register of the source string, it
// can be reused to search for every occurrence of the 1st character, saving
// a few loads compared with a simpler-but-slower implementation
// 4) in order to avoid lots of push/pop operations, the code below heavily
// re-uses/re-initializes/compresses register values, which makes the code
// larger and a bit less readable; however, most of the extra operations are
// issued during loads or branches, so the penalty is minimal
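//
// The "fast" single-character search in (2) is the standard SWAR zero-byte
// test applied to x = ch2 ^ first (first pattern character replicated into
// every lane):
//   (x - 0x0101...01) & ~x & 0x8080...80
// is non-zero exactly when some lane of str2 holds the first pattern
// character, and the lowest set high bit marks the first such lane (the
// 0x7f7f...7f constant builds the ~x & 0x80...80 part via orr + bics). Every
// candidate found this way is verified by the following compare loop.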
9334 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
9335 StubId stub_id;
9336 if (str1_isL) {
9337 if (str2_isL) {
9338 stub_id = StubId::stubgen_string_indexof_linear_ll_id;
9339 } else {
9340 stub_id = StubId::stubgen_string_indexof_linear_ul_id;
9341 }
9342 } else {
9343 if (str2_isL) {
9344 ShouldNotReachHere();
9345 } else {
9346 stub_id = StubId::stubgen_string_indexof_linear_uu_id;
9347 }
9348 }
9349 __ align(CodeEntryAlignment);
9350 StubCodeMark mark(this, stub_id);
9351 address entry = __ pc();
9352
9353 int str1_chr_size = str1_isL ? 1 : 2;
9354 int str2_chr_size = str2_isL ? 1 : 2;
9355 int str1_chr_shift = str1_isL ? 0 : 1;
9356 int str2_chr_shift = str2_isL ? 0 : 1;
9357 bool isL = str1_isL && str2_isL;
9358 // parameters
9359 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
9360 // temporary registers
9361 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
9362 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
9363 // redefinitions
9364 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
9365
9366 __ push(spilled_regs, sp);
9367 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
9368 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
9369 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
9370 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
9371 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
9372 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
9373 // Read whole register from str1. It is safe, because length >=8 here
9374 __ ldr(ch1, Address(str1));
9375 // Read whole register from str2. It is safe, because length >=8 here
9376 __ ldr(ch2, Address(str2));
9377 __ sub(cnt2, cnt2, cnt1);
9378 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
9379 if (str1_isL != str2_isL) {
9380 __ eor(v0, __ T16B, v0, v0);
9381 }
9382 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
9383 __ mul(first, first, tmp1);
9384 // check if we have less than 1 register to check
9385 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
9386 if (str1_isL != str2_isL) {
9387 __ fmovd(v1, ch1);
9388 }
9389 __ br(__ LE, L_SMALL);
9390 __ eor(ch2, first, ch2);
9391 if (str1_isL != str2_isL) {
9392 __ zip1(v1, __ T16B, v1, v0);
9393 }
9394 __ sub(tmp2, ch2, tmp1);
9395 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9396 __ bics(tmp2, tmp2, ch2);
9397 if (str1_isL != str2_isL) {
9398 __ fmovd(ch1, v1);
9399 }
9400 __ br(__ NE, L_HAS_ZERO);
9401 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9402 __ add(result, result, wordSize/str2_chr_size);
9403 __ add(str2, str2, wordSize);
9404 __ br(__ LT, L_POST_LOOP);
9405 __ BIND(L_LOOP);
9406 __ ldr(ch2, Address(str2));
9407 __ eor(ch2, first, ch2);
9408 __ sub(tmp2, ch2, tmp1);
9409 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9410 __ bics(tmp2, tmp2, ch2);
9411 __ br(__ NE, L_HAS_ZERO);
9412 __ BIND(L_LOOP_PROCEED);
9413 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9414 __ add(str2, str2, wordSize);
9415 __ add(result, result, wordSize/str2_chr_size);
9416 __ br(__ GE, L_LOOP);
9417 __ BIND(L_POST_LOOP);
9418 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
9419 __ br(__ LE, NOMATCH);
9420 __ ldr(ch2, Address(str2));
9421 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9422 __ eor(ch2, first, ch2);
9423 __ sub(tmp2, ch2, tmp1);
9424 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9425 __ mov(tmp4, -1); // all bits set
9426 __ b(L_SMALL_PROCEED);
9427 __ align(OptoLoopAlignment);
9428 __ BIND(L_SMALL);
9429 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9430 __ eor(ch2, first, ch2);
9431 if (str1_isL != str2_isL) {
9432 __ zip1(v1, __ T16B, v1, v0);
9433 }
9434 __ sub(tmp2, ch2, tmp1);
9435 __ mov(tmp4, -1); // all bits set
9436 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9437 if (str1_isL != str2_isL) {
9438 __ fmovd(ch1, v1); // move converted 4 symbols
9439 }
9440 __ BIND(L_SMALL_PROCEED);
__ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the unused bit positions
9442 __ bic(tmp2, tmp2, ch2);
9443 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
9444 __ rbit(tmp2, tmp2);
9445 __ br(__ EQ, NOMATCH);
9446 __ BIND(L_SMALL_HAS_ZERO_LOOP);
__ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
9448 __ cmp(cnt1, u1(wordSize/str2_chr_size));
9449 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
9450 if (str2_isL) { // LL
9451 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9452 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9453 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9454 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9455 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9456 } else {
9457 __ mov(ch2, 0xE); // all bits in byte set except last one
9458 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9459 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9460 __ lslv(tmp2, tmp2, tmp4);
9461 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9462 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9463 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9464 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9465 }
9466 __ cmp(ch1, ch2);
9467 __ mov(tmp4, wordSize/str2_chr_size);
9468 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9469 __ BIND(L_SMALL_CMP_LOOP);
9470 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9471 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9472 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9473 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9474 __ add(tmp4, tmp4, 1);
9475 __ cmp(tmp4, cnt1);
9476 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
9477 __ cmp(first, ch2);
9478 __ br(__ EQ, L_SMALL_CMP_LOOP);
9479 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
9480 __ cbz(tmp2, NOMATCH); // no more matches. exit
9481 __ clz(tmp4, tmp2);
9482 __ add(result, result, 1); // advance index
9483 __ add(str2, str2, str2_chr_size); // advance pointer
9484 __ b(L_SMALL_HAS_ZERO_LOOP);
9485 __ align(OptoLoopAlignment);
9486 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
9487 __ cmp(first, ch2);
9488 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9489 __ b(DONE);
9490 __ align(OptoLoopAlignment);
9491 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
9492 if (str2_isL) { // LL
9493 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9494 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9495 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9496 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9497 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9498 } else {
9499 __ mov(ch2, 0xE); // all bits in byte set except last one
9500 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9501 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9502 __ lslv(tmp2, tmp2, tmp4);
9503 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9504 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9505 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9506 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9507 }
9508 __ cmp(ch1, ch2);
9509 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9510 __ b(DONE);
9511 __ align(OptoLoopAlignment);
9512 __ BIND(L_HAS_ZERO);
9513 __ rbit(tmp2, tmp2);
__ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
// Now pack the two counters (cnt2 and cnt1) into one register. This is fine
// because both counters are 32-bit and are not changed in this loop; they are
// simply restored on exit, so cnt1 can be re-used in this loop.
9518 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
9519 __ sub(result, result, 1);
9520 __ BIND(L_HAS_ZERO_LOOP);
9521 __ mov(cnt1, wordSize/str2_chr_size);
9522 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
__ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of only 8 bytes to compare
9524 if (str2_isL) {
9525 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9526 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9527 __ lslv(tmp2, tmp2, tmp4);
9528 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9529 __ add(tmp4, tmp4, 1);
9530 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9531 __ lsl(tmp2, tmp2, 1);
9532 __ mov(tmp4, wordSize/str2_chr_size);
9533 } else {
9534 __ mov(ch2, 0xE);
9535 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9536 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9537 __ lslv(tmp2, tmp2, tmp4);
9538 __ add(tmp4, tmp4, 1);
9539 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9540 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9541 __ lsl(tmp2, tmp2, 1);
9542 __ mov(tmp4, wordSize/str2_chr_size);
9543 __ sub(str2, str2, str2_chr_size);
9544 }
9545 __ cmp(ch1, ch2);
9546 __ mov(tmp4, wordSize/str2_chr_size);
9547 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9548 __ BIND(L_CMP_LOOP);
9549 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9550 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9551 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9552 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9553 __ add(tmp4, tmp4, 1);
9554 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
9555 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
9556 __ cmp(cnt1, ch2);
9557 __ br(__ EQ, L_CMP_LOOP);
9558 __ BIND(L_CMP_LOOP_NOMATCH);
// no match at the current candidate position
9560 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
9561 __ clz(tmp4, tmp2);
9562 __ add(str2, str2, str2_chr_size); // advance pointer
9563 __ b(L_HAS_ZERO_LOOP);
9564 __ align(OptoLoopAlignment);
9565 __ BIND(L_CMP_LOOP_LAST_CMP);
9566 __ cmp(cnt1, ch2);
9567 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9568 __ b(DONE);
9569 __ align(OptoLoopAlignment);
9570 __ BIND(L_CMP_LOOP_LAST_CMP2);
9571 if (str2_isL) {
9572 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9573 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9574 __ lslv(tmp2, tmp2, tmp4);
9575 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9576 __ add(tmp4, tmp4, 1);
9577 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9578 __ lsl(tmp2, tmp2, 1);
9579 } else {
9580 __ mov(ch2, 0xE);
9581 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9582 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9583 __ lslv(tmp2, tmp2, tmp4);
9584 __ add(tmp4, tmp4, 1);
9585 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9586 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9587 __ lsl(tmp2, tmp2, 1);
9588 __ sub(str2, str2, str2_chr_size);
9589 }
9590 __ cmp(ch1, ch2);
9591 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9592 __ b(DONE);
9593 __ align(OptoLoopAlignment);
9594 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
// 1) Restore the "result" index. The index was a multiple of
// wordSize/str2_chr_size until the L_HAS_ZERO block. A byte octet was
// analyzed in L_HAS_ZERO_LOOP, so result was increased by at most
// wordSize/str2_chr_size - 1 and the respective higher bits were not
// changed. L_LOOP_PROCEED will increase result by the number of analyzed
// characters, so we can just reset the lower bits of result here:
// clear the 2 lower bits for UU/UL and the 3 lower bits for LL.
// 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
// 3) Make str2 ready for the next str2 octet: result & 7 (LL) or result & 3
// (UU/UL) is the index of the last analyzed substring inside the current
// octet, so str2 is moved back to the start of the current octet here and
// L_LOOP_PROCEED then advances it to the next one.
9605 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
9606 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
9607 __ bfm(result, zr, 0, 2 - str2_chr_shift);
9608 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
9609 __ movw(cnt2, cnt2);
9610 __ b(L_LOOP_PROCEED);
9611 __ align(OptoLoopAlignment);
9612 __ BIND(NOMATCH);
9613 __ mov(result, -1);
9614 __ BIND(DONE);
9615 __ pop(spilled_regs, sp);
9616 __ ret(lr);
9617 return entry;
9618 }
9619
9620 void generate_string_indexof_stubs() {
9621 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
9622 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
9623 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
9624 }
9625
9626 void inflate_and_store_2_fp_registers(bool generatePrfm,
9627 FloatRegister src1, FloatRegister src2) {
9628 Register dst = r1;
9629 __ zip1(v1, __ T16B, src1, v0);
9630 __ zip2(v2, __ T16B, src1, v0);
9631 if (generatePrfm) {
9632 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
9633 }
9634 __ zip1(v3, __ T16B, src2, v0);
9635 __ zip2(v4, __ T16B, src2, v0);
9636 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
9637 }
9638
9639 // R0 = src
9640 // R1 = dst
9641 // R2 = len
9642 // R3 = len >> 3
9643 // V0 = 0
9644 // v1 = loaded 8 bytes
9645 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
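// Note (informal): inflating zero-extends each Latin-1 byte to a 16-bit
// char; the zip1/zip2 instructions below interleave the source bytes with
// the zero vector v0 to do this 16 bytes at a time (e.g. one lane holding
// 0x41 'A' becomes the 16-bit value 0x0041).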
9646 address generate_large_byte_array_inflate() {
9647 __ align(CodeEntryAlignment);
9648 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
9649 StubCodeMark mark(this, stub_id);
9650 address entry = __ pc();
9651 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
9652 Register src = r0, dst = r1, len = r2, octetCounter = r3;
9653 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
9654
// Do one more 8-byte read so that the address is 16-byte aligned in most
// cases; this also lets us use a single store instruction.
9657 __ ldrd(v2, __ post(src, 8));
9658 __ sub(octetCounter, octetCounter, 2);
9659 __ zip1(v1, __ T16B, v1, v0);
9660 __ zip1(v2, __ T16B, v2, v0);
9661 __ st1(v1, v2, __ T16B, __ post(dst, 32));
9662 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9663 __ subs(rscratch1, octetCounter, large_loop_threshold);
9664 __ br(__ LE, LOOP_START);
9665 __ b(LOOP_PRFM_START);
9666 __ bind(LOOP_PRFM);
9667 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9668 __ bind(LOOP_PRFM_START);
9669 __ prfm(Address(src, SoftwarePrefetchHintDistance));
9670 __ sub(octetCounter, octetCounter, 8);
9671 __ subs(rscratch1, octetCounter, large_loop_threshold);
9672 inflate_and_store_2_fp_registers(true, v3, v4);
9673 inflate_and_store_2_fp_registers(true, v5, v6);
9674 __ br(__ GT, LOOP_PRFM);
9675 __ cmp(octetCounter, (u1)8);
9676 __ br(__ LT, DONE);
9677 __ bind(LOOP);
9678 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9679 __ bind(LOOP_START);
9680 __ sub(octetCounter, octetCounter, 8);
9681 __ cmp(octetCounter, (u1)8);
9682 inflate_and_store_2_fp_registers(false, v3, v4);
9683 inflate_and_store_2_fp_registers(false, v5, v6);
9684 __ br(__ GE, LOOP);
9685 __ bind(DONE);
9686 __ ret(lr);
9687 return entry;
9688 }
9689
9690 /**
9691 * Arguments:
9692 *
9693 * Input:
9694 * c_rarg0 - current state address
9695 * c_rarg1 - H key address
9696 * c_rarg2 - data address
9697 * c_rarg3 - number of blocks
9698 *
9699 * Output:
9700 * Updated state at c_rarg0
9701 */
9702 address generate_ghash_processBlocks() {
9703 // Bafflingly, GCM uses little-endian for the byte order, but
9704 // big-endian for the bit order. For example, the polynomial 1 is
9705 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
9706 //
9707 // So, we must either reverse the bytes in each word and do
9708 // everything big-endian or reverse the bits in each byte and do
9709 // it little-endian. On AArch64 it's more idiomatic to reverse
9710 // the bits in each byte (we have an instruction, RBIT, to do
9711 // that) and keep the data in little-endian bit order through the
9712 // calculation, bit-reversing the inputs and outputs.
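//
// Informally, for each 16-byte block the loop below computes
//   state = (state ^ block) * H
// in GF(2^128) with the GCM reduction polynomial x^128 + x^7 + x^2 + x + 1
// (whose low-order bits are the 0x87 constant emitted after the code):
// ghash_multiply forms the 256-bit carry-less product and ghash_reduce
// folds it back down to 128 bits.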
9713
9714 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
9715 StubCodeMark mark(this, stub_id);
9716 Label polynomial; // local data generated at end of stub
9717 __ align(CodeEntryAlignment);
9718 address start = __ pc();
9719
9720 Register state = c_rarg0;
9721 Register subkeyH = c_rarg1;
9722 Register data = c_rarg2;
9723 Register blocks = c_rarg3;
9724
9725 FloatRegister vzr = v30;
9726 __ eor(vzr, __ T16B, vzr, vzr); // zero register
9727
9728 __ adr(rscratch1, polynomial);
9729 __ ldrq(v24, rscratch1); // The field polynomial
9730
9731 __ ldrq(v0, Address(state));
9732 __ ldrq(v1, Address(subkeyH));
9733
9734 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
9735 __ rbit(v0, __ T16B, v0);
9736 __ rev64(v1, __ T16B, v1);
9737 __ rbit(v1, __ T16B, v1);
9738
9739 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
9740 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
9741
9742 {
9743 Label L_ghash_loop;
9744 __ bind(L_ghash_loop);
9745
9746 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
9747 // reversing each byte
9748 __ rbit(v2, __ T16B, v2);
9749 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
9750
9751 // Multiply state in v2 by subkey in v1
9752 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
9753 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
9754 /*temps*/v6, v3, /*reuse/clobber b*/v2);
9755 // Reduce v7:v5 by the field polynomial
9756 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
9757
9758 __ sub(blocks, blocks, 1);
9759 __ cbnz(blocks, L_ghash_loop);
9760 }
9761
9762 // The bit-reversed result is at this point in v0
9763 __ rev64(v0, __ T16B, v0);
9764 __ rbit(v0, __ T16B, v0);
9765
9766 __ st1(v0, __ T16B, state);
9767 __ ret(lr);
9768
9769 // bind label and generate local polynomial data
9770 __ align(wordSize * 2);
9771 __ bind(polynomial);
9772 __ emit_int64(0x87); // The low-order bits of the field
9773 // polynomial (i.e. p = z^7+z^2+z+1)
9774 // repeated in the low and high parts of a
9775 // 128-bit vector
9776 __ emit_int64(0x87);
9777
9778 return start;
9779 }
9780
9781 address generate_ghash_processBlocks_wide() {
9782 address small = generate_ghash_processBlocks();
9783
9784 StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
9785 StubCodeMark mark(this, stub_id);
9786 Label polynomial; // local data generated after stub
9787 __ align(CodeEntryAlignment);
9788 address start = __ pc();
9789
9790 Register state = c_rarg0;
9791 Register subkeyH = c_rarg1;
9792 Register data = c_rarg2;
9793 Register blocks = c_rarg3;
9794
9795 const int unroll = 4;
9796
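// Blocks are processed 'unroll' at a time; inputs shorter than 2 * unroll
// blocks, and any remainder left after the wide loop, are delegated to the
// single-block stub generated above.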
9797 __ cmp(blocks, (unsigned char)(unroll * 2));
9798 __ br(__ LT, small);
9799
9800 if (unroll > 1) {
9801 // Save state before entering routine
9802 __ sub(sp, sp, 4 * 16);
9803 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
9804 __ sub(sp, sp, 4 * 16);
9805 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
9806 }
9807
9808 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
9809
9810 if (unroll > 1) {
9811 // And restore state
9812 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
9813 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
9814 }
9815
9816 __ cmp(blocks, (unsigned char)0);
9817 __ br(__ GT, small);
9818
9819 __ ret(lr);
9820
9821 // bind label and generate polynomial data
9822 __ align(wordSize * 2);
9823 __ bind(polynomial);
9824 __ emit_int64(0x87); // The low-order bits of the field
9825 // polynomial (i.e. p = z^7+z^2+z+1)
9826 // repeated in the low and high parts of a
9827 // 128-bit vector
9828 __ emit_int64(0x87);
9829
9830 return start;
9831
9832 }
9833
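// One SIMD round of Base64 encoding: ld3 loads 3 * size source bytes
// de-interleaved into in0..in2, the shifts derive the usual four 6-bit
// indices per byte triple (informally:
//   ind0 = b0 >> 2
//   ind1 = ((b0 & 0x3) << 4) | (b1 >> 4)
//   ind2 = ((b1 & 0xf) << 2) | (b2 >> 6)
//   ind3 = b2 & 0x3f ),
// tbl maps each index through the 64-byte codec table, and st4 stores
// 4 * size output bytes interleaved.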
9834 void generate_base64_encode_simdround(Register src, Register dst,
9835 FloatRegister codec, u8 size) {
9836
9837 FloatRegister in0 = v4, in1 = v5, in2 = v6;
9838 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
9839 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
9840
9841 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9842
9843 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
9844
9845 __ ushr(ind0, arrangement, in0, 2);
9846
9847 __ ushr(ind1, arrangement, in1, 2);
9848 __ shl(in0, arrangement, in0, 6);
9849 __ orr(ind1, arrangement, ind1, in0);
9850 __ ushr(ind1, arrangement, ind1, 2);
9851
9852 __ ushr(ind2, arrangement, in2, 4);
9853 __ shl(in1, arrangement, in1, 4);
9854 __ orr(ind2, arrangement, in1, ind2);
9855 __ ushr(ind2, arrangement, ind2, 2);
9856
9857 __ shl(ind3, arrangement, in2, 2);
9858 __ ushr(ind3, arrangement, ind3, 2);
9859
9860 __ tbl(out0, arrangement, codec, 4, ind0);
9861 __ tbl(out1, arrangement, codec, 4, ind1);
9862 __ tbl(out2, arrangement, codec, 4, ind2);
9863 __ tbl(out3, arrangement, codec, 4, ind3);
9864
9865 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
9866 }
9867
9868 /**
9869 * Arguments:
9870 *
9871 * Input:
9872 * c_rarg0 - src_start
9873 * c_rarg1 - src_offset
9874 * c_rarg2 - src_length
9875 * c_rarg3 - dest_start
9876 * c_rarg4 - dest_offset
9877 * c_rarg5 - isURL
9878 *
9879 */
9880 address generate_base64_encodeBlock() {
9881
9882 static const char toBase64[64] = {
9883 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9884 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9885 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9886 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9887 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
9888 };
9889
9890 static const char toBase64URL[64] = {
9891 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9892 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9893 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9894 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9895 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
9896 };
9897
9898 __ align(CodeEntryAlignment);
9899 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
9900 StubCodeMark mark(this, stub_id);
9901 address start = __ pc();
9902
9903 Register src = c_rarg0; // source array
9904 Register soff = c_rarg1; // source start offset
9905 Register send = c_rarg2; // source end offset
9906 Register dst = c_rarg3; // dest array
9907 Register doff = c_rarg4; // position for writing to dest array
9908 Register isURL = c_rarg5; // Base64 or URL character set
9909
9910 // c_rarg6 and c_rarg7 are free to use as temps
9911 Register codec = c_rarg6;
9912 Register length = c_rarg7;
9913
9914 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
9915
9916 __ add(src, src, soff);
9917 __ add(dst, dst, doff);
9918 __ sub(length, send, soff);
9919
9920 // load the codec base address
9921 __ lea(codec, ExternalAddress((address) toBase64));
9922 __ cbz(isURL, ProcessData);
9923 __ lea(codec, ExternalAddress((address) toBase64URL));
9924
9925 __ BIND(ProcessData);
9926
// too short to form a SIMD loop, fall back to byte-by-byte processing
9928 __ cmp(length, (u1)24);
9929 __ br(Assembler::LT, Process3B);
9930
9931 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
9932
9933 __ BIND(Process48B);
9934 __ cmp(length, (u1)48);
9935 __ br(Assembler::LT, Process24B);
9936 generate_base64_encode_simdround(src, dst, v0, 16);
9937 __ sub(length, length, 48);
9938 __ b(Process48B);
9939
9940 __ BIND(Process24B);
9941 __ cmp(length, (u1)24);
9942 __ br(Assembler::LT, SIMDExit);
9943 generate_base64_encode_simdround(src, dst, v0, 8);
9944 __ sub(length, length, 24);
9945
9946 __ BIND(SIMDExit);
9947 __ cbz(length, Exit);
9948
9949 __ BIND(Process3B);
9950 // 3 src bytes, 24 bits
9951 __ ldrb(r10, __ post(src, 1));
9952 __ ldrb(r11, __ post(src, 1));
9953 __ ldrb(r12, __ post(src, 1));
9954 __ orrw(r11, r11, r10, Assembler::LSL, 8);
9955 __ orrw(r12, r12, r11, Assembler::LSL, 8);
// split the 24-bit value into four 6-bit codec indices
9957 __ ubfmw(r15, r12, 18, 23);
9958 __ ubfmw(r14, r12, 12, 17);
9959 __ ubfmw(r13, r12, 6, 11);
9960 __ andw(r12, r12, 63);
// look up the output characters in the codec table
9962 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
9963 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
9964 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
9965 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
9966 __ strb(r15, __ post(dst, 1));
9967 __ strb(r14, __ post(dst, 1));
9968 __ strb(r13, __ post(dst, 1));
9969 __ strb(r12, __ post(dst, 1));
9970 __ sub(length, length, 3);
9971 __ cbnz(length, Process3B);
9972
9973 __ BIND(Exit);
9974 __ ret(lr);
9975
9976 return start;
9977 }
9978
9979 void generate_base64_decode_simdround(Register src, Register dst,
9980 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
9981
9982 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
9983 FloatRegister out0 = v20, out1 = v21, out2 = v22;
9984
9985 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
9986 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
9987
9988 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
9989
9990 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9991
9992 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
9993
// We need an unsigned saturating subtract so that all input values in the
// range [0, 63] produce index 0 for the higher-half lookup.
9996 __ uqsubv(decH0, __ T16B, in0, v27);
9997 __ uqsubv(decH1, __ T16B, in1, v27);
9998 __ uqsubv(decH2, __ T16B, in2, v27);
9999 __ uqsubv(decH3, __ T16B, in3, v27);
10000
10001 // lower half lookup
10002 __ tbl(decL0, arrangement, codecL, 4, in0);
10003 __ tbl(decL1, arrangement, codecL, 4, in1);
10004 __ tbl(decL2, arrangement, codecL, 4, in2);
10005 __ tbl(decL3, arrangement, codecL, 4, in3);
10006
10007 // higher half lookup
10008 __ tbx(decH0, arrangement, codecH, 4, decH0);
10009 __ tbx(decH1, arrangement, codecH, 4, decH1);
10010 __ tbx(decH2, arrangement, codecH, 4, decH2);
10011 __ tbx(decH3, arrangement, codecH, 4, decH3);
10012
10013 // combine lower and higher
10014 __ orr(decL0, arrangement, decL0, decH0);
10015 __ orr(decL1, arrangement, decL1, decH1);
10016 __ orr(decL2, arrangement, decL2, decH2);
10017 __ orr(decL3, arrangement, decL3, decH3);
10018
// check for illegal inputs, i.e. values larger than 63 (the 6-bit maximum)
10020 __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10021 __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10022 __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10023 __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10024 __ orr(in0, arrangement, decH0, decH1);
10025 __ orr(in1, arrangement, decH2, decH3);
10026 __ orr(in2, arrangement, in0, in1);
10027 __ umaxv(in3, arrangement, in2);
10028 __ umov(rscratch2, in3, __ B, 0);
10029
// Recombine the four 6-bit values into three output bytes:
// out0 = (decL0 << 2) | (decL1 >> 4)
// out1 = (decL1 << 4) | (decL2 >> 2)
// out2 = (decL2 << 6) | decL3
10031 __ shl(out0, arrangement, decL0, 2);
10032 __ ushr(out1, arrangement, decL1, 4);
10033 __ orr(out0, arrangement, out0, out1);
10034 __ shl(out1, arrangement, decL1, 4);
10035 __ ushr(out2, arrangement, decL2, 2);
10036 __ orr(out1, arrangement, out1, out2);
10037 __ shl(out2, arrangement, decL2, 6);
10038 __ orr(out2, arrangement, out2, decL3);
10039
10040 __ cbz(rscratch2, NoIllegalData);
10041
10042 // handle illegal input
10043 __ umov(r10, in2, __ D, 0);
10044 if (size == 16) {
10045 __ cbnz(r10, ErrorInLowerHalf);
10046
10047 // illegal input is in higher half, store the lower half now.
10048 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10049
10050 __ umov(r10, in2, __ D, 1);
10051 __ umov(r11, out0, __ D, 1);
10052 __ umov(r12, out1, __ D, 1);
10053 __ umov(r13, out2, __ D, 1);
10054 __ b(StoreLegalData);
10055
10056 __ BIND(ErrorInLowerHalf);
10057 }
10058 __ umov(r11, out0, __ D, 0);
10059 __ umov(r12, out1, __ D, 0);
10060 __ umov(r13, out2, __ D, 0);
10061
10062 __ BIND(StoreLegalData);
10063 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10064 __ strb(r11, __ post(dst, 1));
10065 __ strb(r12, __ post(dst, 1));
10066 __ strb(r13, __ post(dst, 1));
10067 __ lsr(r10, r10, 8);
10068 __ lsr(r11, r11, 8);
10069 __ lsr(r12, r12, 8);
10070 __ lsr(r13, r13, 8);
10071 __ b(StoreLegalData);
10072
10073 __ BIND(NoIllegalData);
10074 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10075 }
10076
10077
10078 /**
10079 * Arguments:
10080 *
10081 * Input:
10082 * c_rarg0 - src_start
10083 * c_rarg1 - src_offset
10084 * c_rarg2 - src_length
10085 * c_rarg3 - dest_start
10086 * c_rarg4 - dest_offset
10087 * c_rarg5 - isURL
10088 * c_rarg6 - isMIME
10089 *
10090 */
10091 address generate_base64_decodeBlock() {
10092
10093 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
// on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in the section
10095 // titled "Base64 decoding".
10096
// The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
// java.util.Base64, except that the trailing character '=' is also treated as an
// illegal value in this intrinsic: java.util.Base64.fromBase64['='] = -2, while
// fromBase(URL)64ForNoSIMD['='] = 255 here.
10100 static const uint8_t fromBase64ForNoSIMD[256] = {
10101 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10102 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10103 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10104 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10105 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10106 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
10107 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10108 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10109 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10110 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10111 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10112 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10113 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10114 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10115 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10116 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10117 };
10118
10119 static const uint8_t fromBase64URLForNoSIMD[256] = {
10120 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10121 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10122 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10123 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10124 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10125 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
10126 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10127 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10128 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10129 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10130 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10131 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10132 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10133 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10134 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10135 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10136 };
10137
// A legal Base64 code value is in the range [0, 127]. We need two table vector
// lookups, with tbl and tbx, and combine their results to get the decoded data.
// The 1st lookup uses tbl: out-of-range indices are set to 0 in the destination.
// The 2nd lookup uses tbx: out-of-range indices leave the destination unchanged.
// Input [64..126] is mapped to index [65, 127] in the second lookup. The value
// at index 64 is set to 0, so that we know the decoded data was already obtained
// by the 1st lookup.
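// An informal sketch of the combined lookup for one input byte 'in', with
// 'table' being one of the 128-entry arrays below (edge cases such as
// in == 127 omitted):
//   low  = (in <= 63) ? table[in] : 0;       // tbl over entries 0..63
//   high = (in >= 64) ? table[in + 1] : 0;   // tbx over entries 64..127,
//                                            // index = uqsub(in, 63)
//   dec  = low | high;                       // 255 marks an illegal input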
10145 static const uint8_t fromBase64ForSIMD[128] = {
10146 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10147 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10148 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10149 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10150 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10151 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10152 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10153 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10154 };
10155
10156 static const uint8_t fromBase64URLForSIMD[128] = {
10157 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10158 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10159 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10160 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10161 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10162 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10163 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10164 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10165 };
10166
10167 __ align(CodeEntryAlignment);
10168 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10169 StubCodeMark mark(this, stub_id);
10170 address start = __ pc();
10171
10172 Register src = c_rarg0; // source array
10173 Register soff = c_rarg1; // source start offset
10174 Register send = c_rarg2; // source end offset
10175 Register dst = c_rarg3; // dest array
10176 Register doff = c_rarg4; // position for writing to dest array
10177 Register isURL = c_rarg5; // Base64 or URL character set
10178 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation
10179
10180 Register length = send; // reuse send as length of source data to process
10181
10182 Register simd_codec = c_rarg6;
10183 Register nosimd_codec = c_rarg7;
10184
10185 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10186
10187 __ enter();
10188
10189 __ add(src, src, soff);
10190 __ add(dst, dst, doff);
10191
10192 __ mov(doff, dst);
10193
10194 __ sub(length, send, soff);
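// clear the two low bits of length, i.e. round it down to a whole number
// of 4-character input groups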
10195 __ bfm(length, zr, 0, 1);
10196
10197 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10198 __ cbz(isURL, ProcessData);
10199 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10200
10201 __ BIND(ProcessData);
10202 __ mov(rscratch1, length);
10203 __ cmp(length, (u1)144); // 144 = 80 + 64
10204 __ br(Assembler::LT, Process4B);
10205
10206 // In the MIME case, the line length cannot be more than 76
10207 // bytes (see RFC 2045). This is too short a block for SIMD
10208 // to be worthwhile, so we use non-SIMD here.
10209 __ movw(rscratch1, 79);
10210
10211 __ BIND(Process4B);
10212 __ ldrw(r14, __ post(src, 4));
10213 __ ubfxw(r10, r14, 0, 8);
10214 __ ubfxw(r11, r14, 8, 8);
10215 __ ubfxw(r12, r14, 16, 8);
10216 __ ubfxw(r13, r14, 24, 8);
// look up the decoded values
10218 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10219 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10220 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10221 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10222 // error detection, 255u indicates an illegal input
10223 __ orrw(r14, r10, r11);
10224 __ orrw(r15, r12, r13);
10225 __ orrw(r14, r14, r15);
10226 __ tbnz(r14, 7, Exit);
// reassemble the three decoded bytes from the four 6-bit values
10228 __ lslw(r14, r10, 10);
10229 __ bfiw(r14, r11, 4, 6);
10230 __ bfmw(r14, r12, 2, 5);
10231 __ rev16w(r14, r14);
10232 __ bfiw(r13, r12, 6, 2);
10233 __ strh(r14, __ post(dst, 2));
10234 __ strb(r13, __ post(dst, 1));
10235 // non-simd loop
10236 __ subsw(rscratch1, rscratch1, 4);
10237 __ br(Assembler::GT, Process4B);
10238
// if exiting from the 80-byte pre-processing pass above (length >= 144),
// rscratch1 == -1; otherwise, rscratch1 == 0.
10241 __ cbzw(rscratch1, Exit);
10242 __ sub(length, length, 80);
10243
10244 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10245 __ cbz(isURL, SIMDEnter);
10246 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10247
10248 __ BIND(SIMDEnter);
10249 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10250 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10251 __ mov(rscratch1, 63);
10252 __ dup(v27, __ T16B, rscratch1);
10253
10254 __ BIND(Process64B);
10255 __ cmp(length, (u1)64);
10256 __ br(Assembler::LT, Process32B);
10257 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10258 __ sub(length, length, 64);
10259 __ b(Process64B);
10260
10261 __ BIND(Process32B);
10262 __ cmp(length, (u1)32);
10263 __ br(Assembler::LT, SIMDExit);
10264 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10265 __ sub(length, length, 32);
10266 __ b(Process32B);
10267
10268 __ BIND(SIMDExit);
10269 __ cbz(length, Exit);
10270 __ movw(rscratch1, length);
10271 __ b(Process4B);
10272
10273 __ BIND(Exit);
10274 __ sub(c_rarg0, dst, doff);
10275
10276 __ leave();
10277 __ ret(lr);
10278
10279 return start;
10280 }
10281
10282 // Support for spin waits.
10283 address generate_spin_wait() {
10284 __ align(CodeEntryAlignment);
10285 StubId stub_id = StubId::stubgen_spin_wait_id;
10286 StubCodeMark mark(this, stub_id);
10287 address start = __ pc();
10288
10289 __ spin_wait();
10290 __ ret(lr);
10291
10292 return start;
10293 }
10294
10295 void generate_lookup_secondary_supers_table_stub() {
10296 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10297 StubCodeMark mark(this, stub_id);
10298
10299 const Register
10300 r_super_klass = r0,
10301 r_array_base = r1,
10302 r_array_length = r2,
10303 r_array_index = r3,
10304 r_sub_klass = r4,
10305 r_bitmap = rscratch2,
10306 result = r5;
10307 const FloatRegister
10308 vtemp = v0;
10309
10310 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10311 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10312 Label L_success;
10313 __ enter();
10314 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10315 r_array_base, r_array_length, r_array_index,
10316 vtemp, result, slot,
10317 /*stub_is_near*/true);
10318 __ leave();
10319 __ ret(lr);
10320 }
10321 }
10322
10323 // Slow path implementation for UseSecondarySupersTable.
10324 address generate_lookup_secondary_supers_table_slow_path_stub() {
10325 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10326 StubCodeMark mark(this, stub_id);
10327
10328 address start = __ pc();
10329 const Register
10330 r_super_klass = r0, // argument
10331 r_array_base = r1, // argument
10332 temp1 = r2, // temp
10333 r_array_index = r3, // argument
10334 r_bitmap = rscratch2, // argument
10335 result = r5; // argument
10336
10337 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10338 __ ret(lr);
10339
10340 return start;
10341 }
10342
10343 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10344
10345 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10346 //
10347 // If LSE is in use, generate LSE versions of all the stubs. The
10348 // non-LSE versions are in atomic_aarch64.S.
10349
10350 // class AtomicStubMark records the entry point of a stub and the
10351 // stub pointer which will point to it. The stub pointer is set to
10352 // the entry point when ~AtomicStubMark() is called, which must be
10353 // after ICache::invalidate_range. This ensures safe publication of
10354 // the generated code.
10355 class AtomicStubMark {
10356 address _entry_point;
10357 aarch64_atomic_stub_t *_stub;
10358 MacroAssembler *_masm;
10359 public:
10360 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10361 _masm = masm;
10362 __ align(32);
10363 _entry_point = __ pc();
10364 _stub = stub;
10365 }
10366 ~AtomicStubMark() {
10367 *_stub = (aarch64_atomic_stub_t)_entry_point;
10368 }
10369 };
10370
10371 // NB: For memory_order_conservative we need a trailing membar after
10372 // LSE atomic operations but not a leading membar.
10373 //
10374 // We don't need a leading membar because a clause in the Arm ARM
10375 // says:
10376 //
10377 // Barrier-ordered-before
10378 //
10379 // Barrier instructions order prior Memory effects before subsequent
10380 // Memory effects generated by the same Observer. A read or a write
// RW1 is Barrier-ordered-before a read or a write RW2 from the same
// Observer if and only if RW1 appears in program order before RW2
// and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10384 // instruction with both Acquire and Release semantics.
10385 //
10386 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10387 // and Release semantics, therefore we don't need a leading
10388 // barrier. However, there is no corresponding Barrier-ordered-after
10389 // relationship, therefore we need a trailing membar to prevent a
10390 // later store or load from being reordered with the store in an
10391 // atomic instruction.
10392 //
10393 // This was checked by using the herd7 consistency model simulator
10394 // (http://diy.inria.fr/) with this test case:
10395 //
10396 // AArch64 LseCas
10397 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10398 // P0 | P1;
10399 // LDR W4, [X2] | MOV W3, #0;
10400 // DMB LD | MOV W4, #1;
10401 // LDR W3, [X1] | CASAL W3, W4, [X1];
10402 // | DMB ISH;
10403 // | STR W4, [X2];
10404 // exists
10405 // (0:X3=0 /\ 0:X4=1)
10406 //
10407 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10408 // with the store to x in P1. Without the DMB in P1 this may happen.
10409 //
10410 // At the time of writing we don't know of any AArch64 hardware that
10411 // reorders stores in this way, but the Reference Manual permits it.
10412
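// Each helper below uses the same simple calling convention: the memory
// address arrives in c_rarg0, the operand(s) in c_rarg1 (and c_rarg2 for
// the CAS exchange value), and the previous memory value is returned in
// r0 (w0 for sub-64-bit operand sizes).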
10413 void gen_cas_entry(Assembler::operand_size size,
10414 atomic_memory_order order) {
10415 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10416 exchange_val = c_rarg2;
10417 bool acquire, release;
10418 switch (order) {
10419 case memory_order_relaxed:
10420 acquire = false;
10421 release = false;
10422 break;
10423 case memory_order_release:
10424 acquire = false;
10425 release = true;
10426 break;
10427 default:
10428 acquire = true;
10429 release = true;
10430 break;
10431 }
10432 __ mov(prev, compare_val);
10433 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10434 if (order == memory_order_conservative) {
10435 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10436 }
10437 if (size == Assembler::xword) {
10438 __ mov(r0, prev);
10439 } else {
10440 __ movw(r0, prev);
10441 }
10442 __ ret(lr);
10443 }
10444
10445 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10446 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10447 // If not relaxed, then default to conservative. Relaxed is the only
10448 // case we use enough to be worth specializing.
10449 if (order == memory_order_relaxed) {
10450 __ ldadd(size, incr, prev, addr);
10451 } else {
10452 __ ldaddal(size, incr, prev, addr);
10453 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10454 }
10455 if (size == Assembler::xword) {
10456 __ mov(r0, prev);
10457 } else {
10458 __ movw(r0, prev);
10459 }
10460 __ ret(lr);
10461 }
10462
10463 void gen_swpal_entry(Assembler::operand_size size) {
10464 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10465 __ swpal(size, incr, prev, addr);
10466 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10467 if (size == Assembler::xword) {
10468 __ mov(r0, prev);
10469 } else {
10470 __ movw(r0, prev);
10471 }
10472 __ ret(lr);
10473 }
10474
10475 void generate_atomic_entry_points() {
10476 if (! UseLSE) {
10477 return;
10478 }
10479 __ align(CodeEntryAlignment);
10480 StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10481 StubCodeMark mark(this, stub_id);
10482 address first_entry = __ pc();
10483
10484 // ADD, memory_order_conservative
10485 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10486 gen_ldadd_entry(Assembler::word, memory_order_conservative);
10487 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10488 gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10489
10490 // ADD, memory_order_relaxed
10491 AtomicStubMark mark_fetch_add_4_relaxed
10492 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10493 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10494 AtomicStubMark mark_fetch_add_8_relaxed
10495 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10496 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10497
10498 // XCHG, memory_order_conservative
10499 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10500 gen_swpal_entry(Assembler::word);
10501 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10502 gen_swpal_entry(Assembler::xword);
10503
10504 // CAS, memory_order_conservative
10505 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10506 gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10507 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10508 gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10509 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10510 gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10511
10512 // CAS, memory_order_relaxed
10513 AtomicStubMark mark_cmpxchg_1_relaxed
10514 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10515 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10516 AtomicStubMark mark_cmpxchg_4_relaxed
10517 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10518 gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10519 AtomicStubMark mark_cmpxchg_8_relaxed
10520 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10521 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10522
10523 AtomicStubMark mark_cmpxchg_4_release
10524 (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10525 gen_cas_entry(MacroAssembler::word, memory_order_release);
10526 AtomicStubMark mark_cmpxchg_8_release
10527 (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10528 gen_cas_entry(MacroAssembler::xword, memory_order_release);
10529
10530 AtomicStubMark mark_cmpxchg_4_seq_cst
10531 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10532 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10533 AtomicStubMark mark_cmpxchg_8_seq_cst
10534 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10535 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10536
10537 ICache::invalidate_range(first_entry, __ pc() - first_entry);
10538 }
10539 #endif // LINUX
10540
10541 address generate_cont_thaw(Continuation::thaw_kind kind) {
10542 bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10543 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10544
10545 address start = __ pc();
10546
10547 if (return_barrier) {
10548 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10549 __ mov(sp, rscratch1);
10550 }
10551 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10552
10553 if (return_barrier) {
10554 // preserve possible return value from a method returning to the return barrier
10555 __ fmovd(rscratch1, v0);
10556 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10557 }
10558
10559 __ movw(c_rarg1, (return_barrier ? 1 : 0));
10560 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10561 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10562
10563 if (return_barrier) {
10564 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10565 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10566 __ fmovd(v0, rscratch1);
10567 }
10568 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10569
10570
10571 Label thaw_success;
10572 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10573 __ cbnz(rscratch2, thaw_success);
10574 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10575 __ br(rscratch1);
10576 __ bind(thaw_success);
10577
10578 // make room for the thawed frames
10579 __ sub(rscratch1, sp, rscratch2);
10580 __ andr(rscratch1, rscratch1, -16); // align
10581 __ mov(sp, rscratch1);
10582
10583 if (return_barrier) {
10584 // save original return value -- again
10585 __ fmovd(rscratch1, v0);
10586 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10587 }
10588
10589 // If we want, we can templatize thaw by kind, and have three different entries
10590 __ movw(c_rarg1, (uint32_t)kind);
10591
10592 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10593 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10594
10595 if (return_barrier) {
10596 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10597 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10598 __ fmovd(v0, rscratch1);
10599 } else {
10600 __ mov(r0, zr); // return 0 (success) from doYield
10601 }
10602
// we're now on the yield frame (which is at an address above us because the sp has been pushed down)
10604 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10605 __ mov(rfp, sp);
10606
10607 if (return_barrier_exception) {
10608 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10609 __ authenticate_return_address(c_rarg1);
10610 __ verify_oop(r0);
10611 // save return value containing the exception oop in callee-saved R19
10612 __ mov(r19, r0);
10613
10614 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10615
10616 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10617 // __ reinitialize_ptrue();
10618
10619 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10620
10621 __ mov(r1, r0); // the exception handler
10622 __ mov(r0, r19); // restore return value containing the exception oop
10623 __ verify_oop(r0);
10624
10625 __ leave();
10626 __ mov(r3, lr);
10627 __ br(r1); // the exception handler
10628 } else {
10629 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10630 __ leave();
10631 __ ret(lr);
10632 }
10633
10634 return start;
10635 }
10636
10637 address generate_cont_thaw() {
10638 if (!Continuations::enabled()) return nullptr;
10639
10640 StubId stub_id = StubId::stubgen_cont_thaw_id;
10641 StubCodeMark mark(this, stub_id);
10642 address start = __ pc();
10643 generate_cont_thaw(Continuation::thaw_top);
10644 return start;
10645 }
10646
10647 address generate_cont_returnBarrier() {
10648 if (!Continuations::enabled()) return nullptr;
10649
10650 // TODO: will probably need multiple return barriers depending on return type
10651 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10652 StubCodeMark mark(this, stub_id);
10653 address start = __ pc();
10654
10655 generate_cont_thaw(Continuation::thaw_return_barrier);
10656
10657 return start;
10658 }
10659
10660 address generate_cont_returnBarrier_exception() {
10661 if (!Continuations::enabled()) return nullptr;
10662
10663 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10664 StubCodeMark mark(this, stub_id);
10665 address start = __ pc();
10666
10667 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10668
10669 return start;
10670 }
10671
10672 address generate_cont_preempt_stub() {
10673 if (!Continuations::enabled()) return nullptr;
10674 StubId stub_id = StubId::stubgen_cont_preempt_id;
10675 StubCodeMark mark(this, stub_id);
10676 address start = __ pc();
10677
10678 __ reset_last_Java_frame(true);
10679
10680 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10681 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10682 __ mov(sp, rscratch2);
10683
10684 Label preemption_cancelled;
10685 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10686 __ cbnz(rscratch1, preemption_cancelled);
10687
10688 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10689 SharedRuntime::continuation_enter_cleanup(_masm);
10690 __ leave();
10691 __ ret(lr);
10692
10693 // We acquired the monitor after freezing the frames so call thaw to continue execution.
10694 __ bind(preemption_cancelled);
10695 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10696 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10697 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10698 __ ldr(rscratch1, Address(rscratch1));
10699 __ br(rscratch1);
10700
10701 return start;
10702 }
10703
10704 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10705 // are represented as long[5], with BITS_PER_LIMB = 26.
10706 // Pack five 26-bit limbs into three 64-bit registers.
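// Resulting layout (informal):
//   dest0 = limb0 | limb1 << 26 | (limb2 & 0xfff) << 52
//   dest1 = limb2 >> 12 | limb3 << 14 | (limb4 & 0xffffff) << 40
//   dest2 = limb4 >> 24   (at most 2 bits; asserted to be zero when dest2
//                          is not supplied)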
10707 void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10708 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
10709 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
10710 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10711 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
10712
10713 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
10714 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
10715 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10716 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
10717
10718 if (dest2->is_valid()) {
10719 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
10720 } else {
10721 #ifdef ASSERT
10722 Label OK;
10723 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
10724 __ br(__ EQ, OK);
10725 __ stop("high bits of Poly1305 integer should be zero");
10726 __ should_not_reach_here();
10727 __ bind(OK);
10728 #endif
10729 }
10730 }
10731
10732 // As above, but return only a 128-bit integer, packed into two
10733 // 64-bit registers.
10734 void pack_26(Register dest0, Register dest1, Register src) {
10735 pack_26(dest0, dest1, noreg, src);
10736 }
10737
// Multiply unsigned 64-bit registers into a 128-bit (hi:lo) product, and
// multiply-accumulate such a product into a 128-bit (hi:lo) sum.
10739 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10740 __ mul(prod_lo, n, m);
10741 __ umulh(prod_hi, n, m);
10742 }
10743 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10744 wide_mul(rscratch1, rscratch2, n, m);
10745 __ adds(sum_lo, sum_lo, rscratch1);
10746 __ adc(sum_hi, sum_hi, rscratch2);
10747 }
10748
10749 // Poly1305, RFC 7539
10750
10751 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10752 // description of the tricks used to simplify and accelerate this
10753 // computation.
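//
// As a reminder (informal sketch, not a specification): for each 16-byte
// block m of the input the stub below computes
//   acc = ((acc + m + 2^128) * r) mod (2^130 - 5)
// where r is the clamped key half and acc the 130-bit accumulator, both
// passed in as long[5] arrays of 26-bit limbs. The final addition of the
// second key half and the truncation to 128 bits are not part of this stub.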
10754
10755 address generate_poly1305_processBlocks() {
10756 __ align(CodeEntryAlignment);
10757 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10758 StubCodeMark mark(this, stub_id);
10759 address start = __ pc();
10760 Label here;
10761 __ enter();
10762 RegSet callee_saved = RegSet::range(r19, r28);
10763 __ push(callee_saved, sp);
10764
10765 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10766
10767 // Arguments
10768 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10769
10770 // R_n is the 128-bit randomly-generated key, packed into two
10771 // registers. The caller passes this key to us as long[5], with
10772 // BITS_PER_LIMB = 26.
10773 const Register R_0 = *++regs, R_1 = *++regs;
10774 pack_26(R_0, R_1, r_start);
10775
10776 // RR_n is (R_n >> 2) * 5
10777 const Register RR_0 = *++regs, RR_1 = *++regs;
10778 __ lsr(RR_0, R_0, 2);
10779 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10780 __ lsr(RR_1, R_1, 2);
10781 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
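// Why (R_n >> 2) * 5 is useful (informal): modulo p = 2^130 - 5 we have
// 2^130 == 5, so a term of the form X * R_n * 2^128 can be rewritten as
// X * RR_n + X * (R_n & 3) * 2^128. Key clamping guarantees R_1 & 3 == 0,
// so only the R_0 & 3 part needs the separate handling seen in the main
// loop below.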
10782
10783 // U_n is the current checksum
10784 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10785 pack_26(U_0, U_1, U_2, acc_start);
10786
10787 static constexpr int BLOCK_LENGTH = 16;
10788 Label DONE, LOOP;
10789
10790 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10791 __ br(Assembler::LT, DONE); {
10792 __ bind(LOOP);
10793
10794 // S_n is to be the sum of U_n and the next block of data
10795 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10796 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10797 __ adds(S_0, U_0, S_0);
10798 __ adcs(S_1, U_1, S_1);
10799 __ adc(S_2, U_2, zr);
10800 __ add(S_2, S_2, 1);
10801
10802 const Register U_0HI = *++regs, U_1HI = *++regs;
10803
10804 // NB: this logic depends on some of the special properties of
10805 // Poly1305 keys. In particular, because we know that the top
10806 // four bits of R_0 and R_1 are zero, we can add together
10807 // partial products without any risk of needing to propagate a
10808 // carry out.
10809 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10810 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
10811 __ andr(U_2, R_0, 3);
10812 __ mul(U_2, S_2, U_2);
10813
10814 // Recycle registers S_0, S_1, S_2
10815 regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10816
10817 // Partial reduction mod 2**130 - 5
10818 __ adds(U_1, U_0HI, U_1);
10819 __ adc(U_2, U_1HI, U_2);
10820 // Sum now in U_2:U_1:U_0.
10821 // Dead: U_0HI, U_1HI.
10822 regs = (regs.remaining() + U_0HI + U_1HI).begin();
10823
10824 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10825
10826 // First, U_2:U_1:U_0 += (U_2 >> 2)
10827 __ lsr(rscratch1, U_2, 2);
10828 __ andr(U_2, U_2, (u8)3);
10829 __ adds(U_0, U_0, rscratch1);
10830 __ adcs(U_1, U_1, zr);
10831 __ adc(U_2, U_2, zr);
10832 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10833 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10834 __ adcs(U_1, U_1, zr);
10835 __ adc(U_2, U_2, zr);
10836
10837 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10838 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10839 __ br(~ Assembler::LT, LOOP);
10840 }
10841
10842 // Further reduce modulo 2^130 - 5
10843 __ lsr(rscratch1, U_2, 2);
10844 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10845 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10846 __ adcs(U_1, U_1, zr);
10847 __ andr(U_2, U_2, (u1)3);
10848 __ adc(U_2, U_2, zr);
10849
10850 // Unpack the sum into five 26-bit limbs and write to memory.
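// The limb layout written back to acc is:
//   acc[0] = U_0[25:0]               acc[1] = U_0[51:26]
//   acc[2] = U_1[13:0]:U_0[63:52]    acc[3] = U_1[39:14]
//   acc[4] = U_2[2:0]:U_1[63:40]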
10851 __ ubfiz(rscratch1, U_0, 0, 26);
10852 __ ubfx(rscratch2, U_0, 26, 26);
10853 __ stp(rscratch1, rscratch2, Address(acc_start));
10854 __ ubfx(rscratch1, U_0, 52, 12);
10855 __ bfi(rscratch1, U_1, 12, 14);
10856 __ ubfx(rscratch2, U_1, 14, 26);
10857 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10858 __ ubfx(rscratch1, U_1, 40, 24);
10859 __ bfi(rscratch1, U_2, 24, 3);
10860 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10861
10862 __ bind(DONE);
10863 __ pop(callee_saved, sp);
10864 __ leave();
10865 __ ret(lr);
10866
10867 return start;
10868 }
10869
10870 // exception handler for upcall stubs
10871 address generate_upcall_stub_exception_handler() {
10872 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10873 StubCodeMark mark(this, stub_id);
10874 address start = __ pc();
10875
10876 // The native caller has no way to handle exceptions,
10877 // so we just crash here. It is up to the callee to catch exceptions.
10878 __ verify_oop(r0);
10879 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10880 __ blr(rscratch1);
10881 __ should_not_reach_here();
10882
10883 return start;
10884 }
10885
10886 // load Method* target of MethodHandle
10887 // j_rarg0 = jobject receiver
10888 // rmethod = result
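//
// In Java terms this walks, approximately:
//   rmethod = receiver.form.vmentry.method.vmtarget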
10889 address generate_upcall_stub_load_target() {
10890 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10891 StubCodeMark mark(this, stub_id);
10892 address start = __ pc();
10893
10894 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10895 // Load target method from receiver
10896 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10897 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10898 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10899 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10900 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10901 noreg, noreg);
10902 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10903
10904 __ ret(lr);
10905
10906 return start;
10907 }
10908
10909 #undef __
10910 #define __ masm->
10911
10912 class MontgomeryMultiplyGenerator : public MacroAssembler {
10913
10914 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10915 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10916
10917 RegSet _toSave;
10918 bool _squaring;
10919
10920 public:
10921 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10922 : MacroAssembler(as->code()), _squaring(squaring) {
10923
10924 // Register allocation
10925
10926 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10927 Pa_base = *regs; // Argument registers
10928 if (squaring)
10929 Pb_base = Pa_base;
10930 else
10931 Pb_base = *++regs;
10932 Pn_base = *++regs;
10933 Rlen= *++regs;
10934 inv = *++regs;
10935 Pm_base = *++regs;
10936
10937 // Working registers:
10938 Ra = *++regs; // The current digit of a, b, n, and m.
10939 Rb = *++regs;
10940 Rm = *++regs;
10941 Rn = *++regs;
10942
10943 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
10944 Pb = *++regs;
10945 Pm = *++regs;
10946 Pn = *++regs;
10947
10948 t0 = *++regs; // Three registers which form a
10949 t1 = *++regs; // triple-precision accumulator.
10950 t2 = *++regs;
10951
10952 Ri = *++regs; // Inner and outer loop indexes.
10953 Rj = *++regs;
10954
10955 Rhi_ab = *++regs; // Product registers: low and high parts
10956 Rlo_ab = *++regs; // of a*b and m*n.
10957 Rhi_mn = *++regs;
10958 Rlo_mn = *++regs;
10959
10960 // r19 and up are callee-saved.
10961 _toSave = RegSet::range(r19, *regs) + Pm_base;
10962 }
10963
10964 private:
10965 void save_regs() {
10966 push(_toSave, sp);
10967 }
10968
10969 void restore_regs() {
10970 pop(_toSave, sp);
10971 }
10972
10973 template <typename T>
10974 void unroll_2(Register count, T block) {
10975 Label loop, end, odd;
10976 tbnz(count, 0, odd);
10977 cbz(count, end);
10978 align(16);
10979 bind(loop);
10980 (this->*block)();
10981 bind(odd);
10982 (this->*block)();
10983 subs(count, count, 2);
10984 br(Assembler::GT, loop);
10985 bind(end);
10986 }
10987
10988 template <typename T>
10989 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10990 Label loop, end, odd;
10991 tbnz(count, 0, odd);
10992 cbz(count, end);
10993 align(16);
10994 bind(loop);
10995 (this->*block)(d, s, tmp);
10996 bind(odd);
10997 (this->*block)(d, s, tmp);
10998 subs(count, count, 2);
10999 br(Assembler::GT, loop);
11000 bind(end);
11001 }
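// In C, approximately (both overloads; with an odd count the first,
// unpaired call enters the loop body at "odd"):
//
//   if (count & 1) goto odd;
//   if (count == 0) goto end;
//   do {
//     block();
//   odd:
//     block();
//     count -= 2;
//   } while (count > 0);
//   end: ;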
11002
11003 void pre1(RegisterOrConstant i) {
11004 block_comment("pre1");
11005 // Pa = Pa_base;
11006 // Pb = Pb_base + i;
11007 // Pm = Pm_base;
11008 // Pn = Pn_base + i;
11009 // Ra = *Pa;
11010 // Rb = *Pb;
11011 // Rm = *Pm;
11012 // Rn = *Pn;
11013 ldr(Ra, Address(Pa_base));
11014 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11015 ldr(Rm, Address(Pm_base));
11016 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11017 lea(Pa, Address(Pa_base));
11018 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11019 lea(Pm, Address(Pm_base));
11020 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11021
11022 // Zero the m*n result.
11023 mov(Rhi_mn, zr);
11024 mov(Rlo_mn, zr);
11025 }
11026
11027 // The core multiply-accumulate step of a Montgomery
11028 // multiplication. The idea is to schedule operations as a
11029 // pipeline so that instructions with long latencies (loads and
11030 // multiplies) have time to complete before their results are
11031 // used. This most benefits in-order implementations of the
11032 // architecture but out-of-order ones also benefit.
11033 void step() {
11034 block_comment("step");
11035 // MACC(Ra, Rb, t0, t1, t2);
11036 // Ra = *++Pa;
11037 // Rb = *--Pb;
11038 umulh(Rhi_ab, Ra, Rb);
11039 mul(Rlo_ab, Ra, Rb);
11040 ldr(Ra, pre(Pa, wordSize));
11041 ldr(Rb, pre(Pb, -wordSize));
11042 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11043 // previous iteration.
11044 // MACC(Rm, Rn, t0, t1, t2);
11045 // Rm = *++Pm;
11046 // Rn = *--Pn;
11047 umulh(Rhi_mn, Rm, Rn);
11048 mul(Rlo_mn, Rm, Rn);
11049 ldr(Rm, pre(Pm, wordSize));
11050 ldr(Rn, pre(Pn, -wordSize));
11051 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11052 }
11053
11054 void post1() {
11055 block_comment("post1");
11056
11057 // MACC(Ra, Rb, t0, t1, t2);
11058 // Ra = *++Pa;
11059 // Rb = *--Pb;
11060 umulh(Rhi_ab, Ra, Rb);
11061 mul(Rlo_ab, Ra, Rb);
11062 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11063 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11064
11065 // *Pm = Rm = t0 * inv;
11066 mul(Rm, t0, inv);
11067 str(Rm, Address(Pm));
11068
11069 // MACC(Rm, Rn, t0, t1, t2);
11070 // t0 = t1; t1 = t2; t2 = 0;
11071 umulh(Rhi_mn, Rm, Rn);
11072
11073 #ifndef PRODUCT
11074 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11075 {
11076 mul(Rlo_mn, Rm, Rn);
11077 add(Rlo_mn, t0, Rlo_mn);
11078 Label ok;
11079 cbz(Rlo_mn, ok); {
11080 stop("broken Montgomery multiply");
11081 } bind(ok);
11082 }
11083 #endif
11084 // We have very carefully set things up so that
11085 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11086 // the lower half of Rm * Rn because we know the result already:
11087 // it must be -t0. t0 + (-t0) must generate a carry iff
11088 // t0 != 0. So, rather than do a mul and an adds we just set
11089 // the carry flag iff t0 is nonzero.
11090 //
11091 // mul(Rlo_mn, Rm, Rn);
11092 // adds(zr, t0, Rlo_mn);
11093 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11094 adcs(t0, t1, Rhi_mn);
11095 adc(t1, t2, zr);
11096 mov(t2, zr);
11097 }
11098
11099 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11100 block_comment("pre2");
11101 // Pa = Pa_base + i-len;
11102 // Pb = Pb_base + len;
11103 // Pm = Pm_base + i-len;
11104 // Pn = Pn_base + len;
11105
11106 if (i.is_register()) {
11107 sub(Rj, i.as_register(), len);
11108 } else {
11109 mov(Rj, i.as_constant());
11110 sub(Rj, Rj, len);
11111 }
11112 // Rj == i-len
11113
11114 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11115 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11116 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11117 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11118
11119 // Ra = *++Pa;
11120 // Rb = *--Pb;
11121 // Rm = *++Pm;
11122 // Rn = *--Pn;
11123 ldr(Ra, pre(Pa, wordSize));
11124 ldr(Rb, pre(Pb, -wordSize));
11125 ldr(Rm, pre(Pm, wordSize));
11126 ldr(Rn, pre(Pn, -wordSize));
11127
11128 mov(Rhi_mn, zr);
11129 mov(Rlo_mn, zr);
11130 }
11131
11132 void post2(RegisterOrConstant i, RegisterOrConstant len) {
11133 block_comment("post2");
11134 if (i.is_constant()) {
11135 mov(Rj, i.as_constant()-len.as_constant());
11136 } else {
11137 sub(Rj, i.as_register(), len);
11138 }
11139
11140 adds(t0, t0, Rlo_mn); // The pending m*n, low part
11141
11142 // As soon as we know the least significant digit of our result,
11143 // store it.
11144 // Pm_base[i-len] = t0;
11145 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11146
11147 // t0 = t1; t1 = t2; t2 = 0;
11148 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11149 adc(t1, t2, zr);
11150 mov(t2, zr);
11151 }
11152
11153 // A carry in t0 after Montgomery multiplication means that we
11154 // should subtract multiples of n from our result in m. We'll
11155 // keep doing that until there is no carry.
11156 void normalize(RegisterOrConstant len) {
11157 block_comment("normalize");
11158 // while (t0)
11159 // t0 = sub(Pm_base, Pn_base, t0, len);
11160 Label loop, post, again;
11161 Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11162 cbz(t0, post); {
11163 bind(again); {
11164 mov(i, zr);
11165 mov(cnt, len);
11166 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11167 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11168 subs(zr, zr, zr); // set carry flag, i.e. no borrow
11169 align(16);
11170 bind(loop); {
11171 sbcs(Rm, Rm, Rn);
11172 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11173 add(i, i, 1);
11174 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11175 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11176 sub(cnt, cnt, 1);
11177 } cbnz(cnt, loop);
11178 sbc(t0, t0, zr);
11179 } cbnz(t0, again);
11180 } bind(post);
11181 }
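// where sub() in the comment above is, approximately (a sketch shown
// only to make the C model self-contained; it subtracts n from m with
// borrow and deducts the final borrow from t0):
//
//   julong sub(julong Pm[], julong Pn[], julong t0, int len) {
//     julong borrow = 0;
//     for (int i = 0; i < len; i++) {
//       julong d = Pm[i];
//       julong r = d - Pn[i] - borrow;
//       borrow = (d < Pn[i]) || (borrow && d == Pn[i]);
//       Pm[i] = r;
//     }
//     return t0 - borrow;
//   }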
11182
11183 // Move memory at s to d, reversing words.
11184 // Increments d to end of copied memory
11185 // Destroys tmp1, tmp2
11186 // Preserves len
11187 // Leaves s pointing to the address which was in d at start
11188 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11189 assert(tmp1->encoding() < r19->encoding(), "register corruption");
11190 assert(tmp2->encoding() < r19->encoding(), "register corruption");
11191
11192 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11193 mov(tmp1, len);
11194 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11195 sub(s, d, len, ext::uxtw, LogBytesPerWord);
11196 }
11197 // where
11198 void reverse1(Register d, Register s, Register tmp) {
11199 ldr(tmp, pre(s, -wordSize));
11200 ror(tmp, tmp, 32);
11201 str(tmp, post(d, wordSize));
11202 }
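// In C, approximately (a sketch): copy len 64-bit words from s to d in
// reverse order, swapping the 32-bit halves of each word, which turns
// an array of 32-bit digits stored most-significant-first into an
// array of 64-bit digits stored least-significant-first:
//
//   void reverse(julong *d, julong *s, int len) {
//     for (int i = 0; i < len; i++) {
//       julong x = s[len - 1 - i];
//       d[i] = (x >> 32) | (x << 32);
//     }
//   }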
11203
11204 void step_squaring() {
11205 // An extra ACC
11206 step();
11207 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11208 }
11209
11210 void last_squaring(RegisterOrConstant i) {
11211 Label dont;
11212 // if ((i & 1) == 0) {
11213 tbnz(i.as_register(), 0, dont); {
11214 // MACC(Ra, Rb, t0, t1, t2);
11215 // Ra = *++Pa;
11216 // Rb = *--Pb;
11217 umulh(Rhi_ab, Ra, Rb);
11218 mul(Rlo_ab, Ra, Rb);
11219 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11220 } bind(dont);
11221 }
11222
11223 void extra_step_squaring() {
11224 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11225
11226 // MACC(Rm, Rn, t0, t1, t2);
11227 // Rm = *++Pm;
11228 // Rn = *--Pn;
11229 umulh(Rhi_mn, Rm, Rn);
11230 mul(Rlo_mn, Rm, Rn);
11231 ldr(Rm, pre(Pm, wordSize));
11232 ldr(Rn, pre(Pn, -wordSize));
11233 }
11234
11235 void post1_squaring() {
11236 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11237
11238 // *Pm = Rm = t0 * inv;
11239 mul(Rm, t0, inv);
11240 str(Rm, Address(Pm));
11241
11242 // MACC(Rm, Rn, t0, t1, t2);
11243 // t0 = t1; t1 = t2; t2 = 0;
11244 umulh(Rhi_mn, Rm, Rn);
11245
11246 #ifndef PRODUCT
11247 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11248 {
11249 mul(Rlo_mn, Rm, Rn);
11250 add(Rlo_mn, t0, Rlo_mn);
11251 Label ok;
11252 cbz(Rlo_mn, ok); {
11253 stop("broken Montgomery multiply");
11254 } bind(ok);
11255 }
11256 #endif
11257 // We have very carefully set things up so that
11258 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11259 // the lower half of Rm * Rn because we know the result already:
11260 // it must be -t0. t0 + (-t0) must generate a carry iff
11261 // t0 != 0. So, rather than do a mul and an adds we just set
11262 // the carry flag iff t0 is nonzero.
11263 //
11264 // mul(Rlo_mn, Rm, Rn);
11265 // adds(zr, t0, Rlo_mn);
11266 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11267 adcs(t0, t1, Rhi_mn);
11268 adc(t1, t2, zr);
11269 mov(t2, zr);
11270 }
11271
11272 void acc(Register Rhi, Register Rlo,
11273 Register t0, Register t1, Register t2) {
11274 adds(t0, t0, Rlo);
11275 adcs(t1, t1, Rhi);
11276 adc(t2, t2, zr);
11277 }
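// In C, the MACC operation used in the models below is, approximately
// (a sketch, assuming an unsigned __int128 type; t2:t1:t0 is the
// triple-precision accumulator):
//
//   void MACC(julong a, julong b, julong &t0, julong &t1, julong &t2) {
//     unsigned __int128 p = (unsigned __int128)a * b;
//     unsigned __int128 s = ((unsigned __int128)t1 << 64 | t0) + p;
//     t2 += (s < p);                       // carry out of the low 128 bits
//     t0 = (julong)s;
//     t1 = (julong)(s >> 64);
//   }
//
// MACC2 accumulates the same product twice (used by the squaring code).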
11278
11279 public:
11280 /**
11281 * Fast Montgomery multiplication. The derivation of the
11282 * algorithm is in A Cryptographic Library for the Motorola
11283 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11284 *
11285 * Arguments:
11286 *
11287 * Inputs for multiplication:
11288 * c_rarg0 - int array elements a
11289 * c_rarg1 - int array elements b
11290 * c_rarg2 - int array elements n (the modulus)
11291 * c_rarg3 - int length
11292 * c_rarg4 - int inv
11293 * c_rarg5 - int array elements m (the result)
11294 *
11295 * Inputs for squaring:
11296 * c_rarg0 - int array elements a
11297 * c_rarg1 - int array elements n (the modulus)
11298 * c_rarg2 - int length
11299 * c_rarg3 - int inv
11300 * c_rarg4 - int array elements m (the result)
11301 *
11302 */
11303 address generate_multiply() {
11304 Label argh, nothing;
11305 bind(argh);
11306 stop("MontgomeryMultiply total_allocation must be <= 8192");
11307
11308 align(CodeEntryAlignment);
11309 address entry = pc();
11310
11311 cbzw(Rlen, nothing);
11312
11313 enter();
11314
11315 // Make room.
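// Rlen counts jints; after the check below it is at most 512, so we
// reserve Rlen * 16 bytes (at most 8192) and re-align sp to 16 bytes.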
11316 cmpw(Rlen, 512);
11317 br(Assembler::HI, argh);
11318 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11319 andr(sp, Ra, -2 * wordSize);
11320
11321 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11322
11323 {
11324 // Copy input args, reversing as we go. We use Ra as a
11325 // temporary variable.
11326 reverse(Ra, Pa_base, Rlen, t0, t1);
11327 if (!_squaring)
11328 reverse(Ra, Pb_base, Rlen, t0, t1);
11329 reverse(Ra, Pn_base, Rlen, t0, t1);
11330 }
11331
11332 // Push all call-saved registers and also Pm_base which we'll need
11333 // at the end.
11334 save_regs();
11335
11336 #ifndef PRODUCT
11337 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11338 {
11339 ldr(Rn, Address(Pn_base, 0));
11340 mul(Rlo_mn, Rn, inv);
11341 subs(zr, Rlo_mn, -1);
11342 Label ok;
11343 br(EQ, ok); {
11344 stop("broken inverse in Montgomery multiply");
11345 } bind(ok);
11346 }
11347 #endif
11348
11349 mov(Pm_base, Ra);
11350
11351 mov(t0, zr);
11352 mov(t1, zr);
11353 mov(t2, zr);
11354
11355 block_comment("for (int i = 0; i < len; i++) {");
11356 mov(Ri, zr); {
11357 Label loop, end;
11358 cmpw(Ri, Rlen);
11359 br(Assembler::GE, end);
11360
11361 bind(loop);
11362 pre1(Ri);
11363
11364 block_comment(" for (j = i; j; j--) {"); {
11365 movw(Rj, Ri);
11366 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11367 } block_comment(" } // j");
11368
11369 post1();
11370 addw(Ri, Ri, 1);
11371 cmpw(Ri, Rlen);
11372 br(Assembler::LT, loop);
11373 bind(end);
11374 block_comment("} // i");
11375 }
11376
11377 block_comment("for (int i = len; i < 2*len; i++) {");
11378 mov(Ri, Rlen); {
11379 Label loop, end;
11380 cmpw(Ri, Rlen, Assembler::LSL, 1);
11381 br(Assembler::GE, end);
11382
11383 bind(loop);
11384 pre2(Ri, Rlen);
11385
11386 block_comment(" for (j = len*2-i-1; j; j--) {"); {
11387 lslw(Rj, Rlen, 1);
11388 subw(Rj, Rj, Ri);
11389 subw(Rj, Rj, 1);
11390 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11391 } block_comment(" } // j");
11392
11393 post2(Ri, Rlen);
11394 addw(Ri, Ri, 1);
11395 cmpw(Ri, Rlen, Assembler::LSL, 1);
11396 br(Assembler::LT, loop);
11397 bind(end);
11398 }
11399 block_comment("} // i");
11400
11401 normalize(Rlen);
11402
11403 mov(Ra, Pm_base); // Save Pm_base in Ra
11404 restore_regs(); // Restore caller's Pm_base
11405
11406 // Copy our result into caller's Pm_base
11407 reverse(Pm_base, Ra, Rlen, t0, t1);
11408
11409 leave();
11410 bind(nothing);
11411 ret(lr);
11412
11413 return entry;
11414 }
11415 // In C, approximately:
11416
11417 // void
11418 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11419 // julong Pn_base[], julong Pm_base[],
11420 // julong inv, int len) {
11421 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11422 // julong *Pa, *Pb, *Pn, *Pm;
11423 // julong Ra, Rb, Rn, Rm;
11424
11425 // int i;
11426
11427 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11428
11429 // for (i = 0; i < len; i++) {
11430 // int j;
11431
11432 // Pa = Pa_base;
11433 // Pb = Pb_base + i;
11434 // Pm = Pm_base;
11435 // Pn = Pn_base + i;
11436
11437 // Ra = *Pa;
11438 // Rb = *Pb;
11439 // Rm = *Pm;
11440 // Rn = *Pn;
11441
11442 // int iters = i;
11443 // for (j = 0; iters--; j++) {
11444 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11445 // MACC(Ra, Rb, t0, t1, t2);
11446 // Ra = *++Pa;
11447 // Rb = *--Pb;
11448 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11449 // MACC(Rm, Rn, t0, t1, t2);
11450 // Rm = *++Pm;
11451 // Rn = *--Pn;
11452 // }
11453
11454 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11455 // MACC(Ra, Rb, t0, t1, t2);
11456 // *Pm = Rm = t0 * inv;
11457 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11458 // MACC(Rm, Rn, t0, t1, t2);
11459
11460 // assert(t0 == 0, "broken Montgomery multiply");
11461
11462 // t0 = t1; t1 = t2; t2 = 0;
11463 // }
11464
11465 // for (i = len; i < 2*len; i++) {
11466 // int j;
11467
11468 // Pa = Pa_base + i-len;
11469 // Pb = Pb_base + len;
11470 // Pm = Pm_base + i-len;
11471 // Pn = Pn_base + len;
11472
11473 // Ra = *++Pa;
11474 // Rb = *--Pb;
11475 // Rm = *++Pm;
11476 // Rn = *--Pn;
11477
11478 // int iters = len*2-i-1;
11479 // for (j = i-len+1; iters--; j++) {
11480 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11481 // MACC(Ra, Rb, t0, t1, t2);
11482 // Ra = *++Pa;
11483 // Rb = *--Pb;
11484 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11485 // MACC(Rm, Rn, t0, t1, t2);
11486 // Rm = *++Pm;
11487 // Rn = *--Pn;
11488 // }
11489
11490 // Pm_base[i-len] = t0;
11491 // t0 = t1; t1 = t2; t2 = 0;
11492 // }
11493
11494 // while (t0)
11495 // t0 = sub(Pm_base, Pn_base, t0, len);
11496 // }
11497
11498 /**
11499 * Fast Montgomery squaring. This uses asymptotically 25% fewer
11500 * multiplies than Montgomery multiplication so it should be up to
11501 * 25% faster. However, its loop control is more complex and it
11502 * may actually run slower on some machines.
11503 *
11504 * Arguments:
11505 *
11506 * Inputs:
11507 * c_rarg0 - int array elements a
11508 * c_rarg1 - int array elements n (the modulus)
11509 * c_rarg2 - int length
11510 * c_rarg3 - int inv
11511 * c_rarg4 - int array elements m (the result)
11512 *
11513 */
11514 address generate_square() {
11515 Label argh;
11516 bind(argh);
11517 stop("MontgomeryMultiply total_allocation must be <= 8192");
11518
11519 align(CodeEntryAlignment);
11520 address entry = pc();
11521
11522 enter();
11523
11524 // Make room.
11525 cmpw(Rlen, 512);
11526 br(Assembler::HI, argh);
11527 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11528 andr(sp, Ra, -2 * wordSize);
11529
11530 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11531
11532 {
11533 // Copy input args, reversing as we go. We use Ra as a
11534 // temporary variable.
11535 reverse(Ra, Pa_base, Rlen, t0, t1);
11536 reverse(Ra, Pn_base, Rlen, t0, t1);
11537 }
11538
11539 // Push all call-saved registers and also Pm_base which we'll need
11540 // at the end.
11541 save_regs();
11542
11543 mov(Pm_base, Ra);
11544
11545 mov(t0, zr);
11546 mov(t1, zr);
11547 mov(t2, zr);
11548
11549 block_comment("for (int i = 0; i < len; i++) {");
11550 mov(Ri, zr); {
11551 Label loop, end;
11552 bind(loop);
11553 cmp(Ri, Rlen);
11554 br(Assembler::GE, end);
11555
11556 pre1(Ri);
11557
11558 block_comment("for (j = (i+1)/2; j; j--) {"); {
11559 add(Rj, Ri, 1);
11560 lsr(Rj, Rj, 1);
11561 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11562 } block_comment(" } // j");
11563
11564 last_squaring(Ri);
11565
11566 block_comment(" for (j = i/2; j; j--) {"); {
11567 lsr(Rj, Ri, 1);
11568 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11569 } block_comment(" } // j");
11570
11571 post1_squaring();
11572 add(Ri, Ri, 1);
11573 cmp(Ri, Rlen);
11574 br(Assembler::LT, loop);
11575
11576 bind(end);
11577 block_comment("} // i");
11578 }
11579
11580 block_comment("for (int i = len; i < 2*len; i++) {");
11581 mov(Ri, Rlen); {
11582 Label loop, end;
11583 bind(loop);
11584 cmp(Ri, Rlen, Assembler::LSL, 1);
11585 br(Assembler::GE, end);
11586
11587 pre2(Ri, Rlen);
11588
11589 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
11590 lsl(Rj, Rlen, 1);
11591 sub(Rj, Rj, Ri);
11592 sub(Rj, Rj, 1);
11593 lsr(Rj, Rj, 1);
11594 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11595 } block_comment(" } // j");
11596
11597 last_squaring(Ri);
11598
11599 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
11600 lsl(Rj, Rlen, 1);
11601 sub(Rj, Rj, Ri);
11602 lsr(Rj, Rj, 1);
11603 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11604 } block_comment(" } // j");
11605
11606 post2(Ri, Rlen);
11607 add(Ri, Ri, 1);
11608 cmp(Ri, Rlen, Assembler::LSL, 1);
11609
11610 br(Assembler::LT, loop);
11611 bind(end);
11612 block_comment("} // i");
11613 }
11614
11615 normalize(Rlen);
11616
11617 mov(Ra, Pm_base); // Save Pm_base in Ra
11618 restore_regs(); // Restore caller's Pm_base
11619
11620 // Copy our result into caller's Pm_base
11621 reverse(Pm_base, Ra, Rlen, t0, t1);
11622
11623 leave();
11624 ret(lr);
11625
11626 return entry;
11627 }
11628 // In C, approximately:
11629
11630 // void
11631 // montgomery_square(julong Pa_base[], julong Pn_base[],
11632 // julong Pm_base[], julong inv, int len) {
11633 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11634 // julong *Pa, *Pb, *Pn, *Pm;
11635 // julong Ra, Rb, Rn, Rm;
11636
11637 // int i;
11638
11639 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11640
11641 // for (i = 0; i < len; i++) {
11642 // int j;
11643
11644 // Pa = Pa_base;
11645 // Pb = Pa_base + i;
11646 // Pm = Pm_base;
11647 // Pn = Pn_base + i;
11648
11649 // Ra = *Pa;
11650 // Rb = *Pb;
11651 // Rm = *Pm;
11652 // Rn = *Pn;
11653
11654 // int iters = (i+1)/2;
11655 // for (j = 0; iters--; j++) {
11656 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11657 // MACC2(Ra, Rb, t0, t1, t2);
11658 // Ra = *++Pa;
11659 // Rb = *--Pb;
11660 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11661 // MACC(Rm, Rn, t0, t1, t2);
11662 // Rm = *++Pm;
11663 // Rn = *--Pn;
11664 // }
11665 // if ((i & 1) == 0) {
11666 // assert(Ra == Pa_base[j], "must be");
11667 // MACC(Ra, Ra, t0, t1, t2);
11668 // }
11669 // iters = i/2;
11670 // assert(iters == i-j, "must be");
11671 // for (; iters--; j++) {
11672 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11673 // MACC(Rm, Rn, t0, t1, t2);
11674 // Rm = *++Pm;
11675 // Rn = *--Pn;
11676 // }
11677
11678 // *Pm = Rm = t0 * inv;
11679 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11680 // MACC(Rm, Rn, t0, t1, t2);
11681
11682 // assert(t0 == 0, "broken Montgomery multiply");
11683
11684 // t0 = t1; t1 = t2; t2 = 0;
11685 // }
11686
11687 // for (i = len; i < 2*len; i++) {
11688 // int start = i-len+1;
11689 // int end = start + (len - start)/2;
11690 // int j;
11691
11692 // Pa = Pa_base + i-len;
11693 // Pb = Pa_base + len;
11694 // Pm = Pm_base + i-len;
11695 // Pn = Pn_base + len;
11696
11697 // Ra = *++Pa;
11698 // Rb = *--Pb;
11699 // Rm = *++Pm;
11700 // Rn = *--Pn;
11701
11702 // int iters = (2*len-i-1)/2;
11703 // assert(iters == end-start, "must be");
11704 // for (j = start; iters--; j++) {
11705 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11706 // MACC2(Ra, Rb, t0, t1, t2);
11707 // Ra = *++Pa;
11708 // Rb = *--Pb;
11709 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11710 // MACC(Rm, Rn, t0, t1, t2);
11711 // Rm = *++Pm;
11712 // Rn = *--Pn;
11713 // }
11714 // if ((i & 1) == 0) {
11715 // assert(Ra == Pa_base[j], "must be");
11716 // MACC(Ra, Ra, t0, t1, t2);
11717 // }
11718 // iters = (2*len-i)/2;
11719 // assert(iters == len-j, "must be");
11720 // for (; iters--; j++) {
11721 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11722 // MACC(Rm, Rn, t0, t1, t2);
11723 // Rm = *++Pm;
11724 // Rn = *--Pn;
11725 // }
11726 // Pm_base[i-len] = t0;
11727 // t0 = t1; t1 = t2; t2 = 0;
11728 // }
11729
11730 // while (t0)
11731 // t0 = sub(Pm_base, Pn_base, t0, len);
11732 // }
11733 };
11734
11735 // Initialization
11736 void generate_preuniverse_stubs() {
11737 // preuniverse stubs are not needed for aarch64
11738 }
11739
11740 void generate_initial_stubs() {
11741 // Generate the initial stubs and initialize the entry points
11742
11743 // entry points that exist in all platforms. Note: this is code
11744 // that could be shared among different platforms - however, the
11745 // benefit seems to be smaller than the disadvantage of having a
11746 // much more complicated generator structure. See also the comment
11747 // in stubRoutines.hpp.
11748
11749 StubRoutines::_forward_exception_entry = generate_forward_exception();
11750
11751 StubRoutines::_call_stub_entry =
11752 generate_call_stub(StubRoutines::_call_stub_return_address);
11753
11754 // is referenced by megamorphic call
11755 StubRoutines::_catch_exception_entry = generate_catch_exception();
11756
11757 // Initialize table for copy memory (arraycopy) check.
11758 if (UnsafeMemoryAccess::_table == nullptr) {
11759 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11760 }
11761
11762 if (UseCRC32Intrinsics) {
11763 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11764 }
11765
11766 if (UseCRC32CIntrinsics) {
11767 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11768 }
11769
11770 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11771 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11772 }
11773
11774 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11775 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11776 }
11777
11778 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11779 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11780 StubRoutines::_hf2f = generate_float16ToFloat();
11781 StubRoutines::_f2hf = generate_floatToFloat16();
11782 }
11783 }
11784
11785 void generate_continuation_stubs() {
11786 // Continuation stubs:
11787 StubRoutines::_cont_thaw = generate_cont_thaw();
11788 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11789 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11790 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11791 }
11792
11793 void generate_final_stubs() {
11794 // support for verify_oop (must happen after universe_init)
11795 if (VerifyOops) {
11796 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
11797 }
11798
11799 // arraycopy stubs used by compilers
11800 generate_arraycopy_stubs();
11801
11802 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11803
11804 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11805
11806 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11807 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11808
11809 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11810
11811 generate_atomic_entry_points();
11812
11813 #endif // LINUX
11814
11815 #ifdef COMPILER2
11816 if (UseSecondarySupersTable) {
11817 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11818 if (! InlineSecondarySupersTest) {
11819 generate_lookup_secondary_supers_table_stub();
11820 }
11821 }
11822 #endif
11823
11824 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11825
11826 StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11827 }
11828
11829 void generate_compiler_stubs() {
11830 #if COMPILER2_OR_JVMCI
11831
11832 if (UseSVE == 0) {
11833 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11834 }
11835
11836 // array equals stub for large arrays.
11837 if (!UseSimpleArrayEquals) {
11838 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11839 }
11840
11841 // arrays_hashcode stubs for large arrays.
11842 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11843 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11844 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11845 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11846 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11847
11848 // byte_array_inflate stub for large arrays.
11849 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11850
11851 // countPositives stub for large arrays.
11852 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11853
11854 generate_compare_long_strings();
11855
11856 generate_string_indexof_stubs();
11857
11858 #ifdef COMPILER2
11859 if (UseMultiplyToLenIntrinsic) {
11860 StubRoutines::_multiplyToLen = generate_multiplyToLen();
11861 }
11862
11863 if (UseSquareToLenIntrinsic) {
11864 StubRoutines::_squareToLen = generate_squareToLen();
11865 }
11866
11867 if (UseMulAddIntrinsic) {
11868 StubRoutines::_mulAdd = generate_mulAdd();
11869 }
11870
11871 if (UseSIMDForBigIntegerShiftIntrinsics) {
11872 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11873 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
11874 }
11875
11876 if (UseMontgomeryMultiplyIntrinsic) {
11877 StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11878 StubCodeMark mark(this, stub_id);
11879 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11880 StubRoutines::_montgomeryMultiply = g.generate_multiply();
11881 }
11882
11883 if (UseMontgomerySquareIntrinsic) {
11884 StubId stub_id = StubId::stubgen_montgomerySquare_id;
11885 StubCodeMark mark(this, stub_id);
11886 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11887 // We use generate_multiply() rather than generate_square()
11888 // because it's faster for the sizes of modulus we care about.
11889 StubRoutines::_montgomerySquare = g.generate_multiply();
11890 }
11891
11892 #endif // COMPILER2
11893
11894 if (UseChaCha20Intrinsics) {
11895 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11896 }
11897
11898 if (UseKyberIntrinsics) {
11899 StubRoutines::_kyberNtt = generate_kyberNtt();
11900 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11901 StubRoutines::_kyberNttMult = generate_kyberNttMult();
11902 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11903 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11904 StubRoutines::_kyber12To16 = generate_kyber12To16();
11905 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11906 }
11907
11908 if (UseDilithiumIntrinsics) {
11909 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11910 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11911 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11912 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11913 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11914 }
11915
11916 if (UseBASE64Intrinsics) {
11917 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11918 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11919 }
11920
11921 // data cache line writeback
11922 StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11923 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11924
11925 if (UseAESIntrinsics) {
11926 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11927 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11928 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11929 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11930 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11931 }
11932 if (UseGHASHIntrinsics) {
11933 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11934 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11935 }
11936 if (UseAESIntrinsics && UseGHASHIntrinsics) {
11937 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11938 }
11939
11940 if (UseMD5Intrinsics) {
11941 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
11942 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
11943 }
11944 if (UseSHA1Intrinsics) {
11945 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
11946 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
11947 }
11948 if (UseSHA256Intrinsics) {
11949 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
11950 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
11951 }
11952 if (UseSHA512Intrinsics) {
11953 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
11954 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
11955 }
11956 if (UseSHA3Intrinsics) {
11957
11958 StubRoutines::_double_keccak = generate_double_keccak();
11959 if (UseSIMDForSHA3Intrinsic) {
11960 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
11961 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
11962 } else {
11963 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
11964 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
11965 }
11966 }
11967
11968 if (UsePoly1305Intrinsics) {
11969 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11970 }
11971
11972 // generate Adler32 intrinsics code
11973 if (UseAdler32Intrinsics) {
11974 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11975 }
11976
11977 #endif // COMPILER2_OR_JVMCI
11978 }
11979
11980 public:
11981 StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
11982 switch(blob_id) {
11983 case BlobId::stubgen_preuniverse_id:
11984 generate_preuniverse_stubs();
11985 break;
11986 case BlobId::stubgen_initial_id:
11987 generate_initial_stubs();
11988 break;
11989 case BlobId::stubgen_continuation_id:
11990 generate_continuation_stubs();
11991 break;
11992 case BlobId::stubgen_compiler_id:
11993 generate_compiler_stubs();
11994 break;
11995 case BlobId::stubgen_final_id:
11996 generate_final_stubs();
11997 break;
11998 default:
11999 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12000 break;
12001 };
12002 }
12003 }; // end class declaration
12004
12005 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
12006 StubGenerator g(code, blob_id);
12007 }
12008
12009
12010 #if defined (LINUX)
12011
12012 // Define pointers to atomic stubs and initialize them to point to the
12013 // code in atomic_aarch64.S.
12014
12015 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \
12016 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12017 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
12018 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12019 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
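
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;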
12020
12021 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12022 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12023 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12024 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12025 DEFAULT_ATOMIC_OP(xchg, 4, )
12026 DEFAULT_ATOMIC_OP(xchg, 8, )
12027 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12028 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12029 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12030 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12031 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12032 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12033 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12034 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12035 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12036 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12037
12038 #undef DEFAULT_ATOMIC_OP
12039
12040 #endif // LINUX