1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "compiler/oopMap.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/interpreter.hpp"
36 #include "memory/universe.hpp"
37 #include "nativeInst_aarch64.hpp"
38 #include "oops/instanceOop.hpp"
39 #include "oops/method.hpp"
40 #include "oops/objArrayKlass.hpp"
41 #include "oops/oop.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "prims/upcallLinker.hpp"
44 #include "runtime/arguments.hpp"
45 #include "runtime/atomicAccess.hpp"
46 #include "runtime/continuation.hpp"
47 #include "runtime/continuationEntry.inline.hpp"
48 #include "runtime/frame.inline.hpp"
49 #include "runtime/handles.inline.hpp"
50 #include "runtime/javaThread.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubCodeGenerator.hpp"
53 #include "runtime/stubRoutines.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/checkedCast.hpp"
56 #include "utilities/debug.hpp"
57 #include "utilities/globalDefinitions.hpp"
58 #include "utilities/intpow.hpp"
59 #include "utilities/powerOfTwo.hpp"
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_ZGC
64 #include "gc/z/zThreadLocalData.hpp"
65 #endif
66
67 // Declaration and definition of StubGenerator (no .hpp file).
68 // For a more detailed description of the stub routine structure
69 // see the comment in stubRoutines.hpp
70
71 #undef __
72 #define __ _masm->
73
74 #ifdef PRODUCT
75 #define BLOCK_COMMENT(str) /* nothing */
76 #else
77 #define BLOCK_COMMENT(str) __ block_comment(str)
78 #endif
79
80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
81
82 // Stub Code definitions
83
84 class StubGenerator: public StubCodeGenerator {
85 private:
86
87 #ifdef PRODUCT
88 #define inc_counter_np(counter) ((void)0)
89 #else
90 void inc_counter_np_(uint& counter) {
91 __ incrementw(ExternalAddress((address)&counter));
92 }
93 #define inc_counter_np(counter) \
94 BLOCK_COMMENT("inc_counter " #counter); \
95 inc_counter_np_(counter);
96 #endif
97
98 // Call stubs are used to call Java from C
99 //
100 // Arguments:
101 // c_rarg0: call wrapper address address
102 // c_rarg1: result address
103 // c_rarg2: result type BasicType
104 // c_rarg3: method Method*
105 // c_rarg4: (interpreter) entry point address
106 // c_rarg5: parameters intptr_t*
107 // c_rarg6: parameter size (in words) int
108 // c_rarg7: thread Thread*
109 //
110 // There is no return from the stub itself as any Java result
111 // is written to result
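  //
  // The stub is reached through the StubRoutines::call_stub() function
  // pointer. As a rough sketch (the real call site is JavaCalls::call_helper
  // in javaCalls.cpp), the C++ caller looks like:
  //
  //   StubRoutines::call_stub()(
  //     (address)&link,              // call wrapper
  //     result_val_address,          // where to store the Java result
  //     result_type,                 // BasicType of the result
  //     method(),                    // Method* to invoke
  //     entry_point,                 // interpreter entry point
  //     parameter_address,           // intptr_t* pointing at the parameters
  //     args->size_of_parameters(),  // parameter size in words
  //     thread);                     // current thread (via the CHECK macro)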
112 //
113 // we save r30 (lr) as the return PC at the base of the frame and
114 // link r29 (fp) below it as the frame pointer installing sp (r31)
115 // into fp.
116 //
117 // we save r0-r7, which accounts for all the c arguments.
118 //
119 // TODO: strictly do we need to save them all? they are treated as
120 // volatile by C so could we omit saving the ones we are going to
121 // place in global registers (thread? method?) or those we only use
122 // during setup of the Java call?
123 //
124 // we don't need to save r8 which C uses as an indirect result location
125 // return register.
126 //
127 // we don't need to save r9-r15 which both C and Java treat as
128 // volatile
129 //
130 // we don't need to save r16-18 because Java does not use them
131 //
132 // we save r19-r28 which Java uses as scratch registers and C
133 // expects to be callee-save
134 //
135 // we save the bottom 64 bits of each value stored in v8-v15; it is
136 // the responsibility of the caller to preserve larger values.
137 //
138 // so the stub frame looks like this when we enter Java code
139 //
140 // [ return_from_Java ] <--- sp
141 // [ argument word n ]
142 // ...
143 // -29 [ argument word 1 ]
144 // -28 [ saved Floating-point Control Register ]
145 // -26 [ saved v15 ] <--- sp_after_call
146 // -25 [ saved v14 ]
147 // -24 [ saved v13 ]
148 // -23 [ saved v12 ]
149 // -22 [ saved v11 ]
150 // -21 [ saved v10 ]
151 // -20 [ saved v9 ]
152 // -19 [ saved v8 ]
153 // -18 [ saved r28 ]
154 // -17 [ saved r27 ]
155 // -16 [ saved r26 ]
156 // -15 [ saved r25 ]
157 // -14 [ saved r24 ]
158 // -13 [ saved r23 ]
159 // -12 [ saved r22 ]
160 // -11 [ saved r21 ]
161 // -10 [ saved r20 ]
162 // -9 [ saved r19 ]
163 // -8 [ call wrapper (r0) ]
164 // -7 [ result (r1) ]
165 // -6 [ result type (r2) ]
166 // -5 [ method (r3) ]
167 // -4 [ entry point (r4) ]
168 // -3 [ parameters (r5) ]
169 // -2 [ parameter size (r6) ]
170 // -1 [ thread (r7) ]
171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
172 // 1 [ saved lr (r30) ]
173
174 // Call stub stack layout word offsets from fp
175 enum call_stub_layout {
176 sp_after_call_off = -28,
177
178 fpcr_off = sp_after_call_off,
179 d15_off = -26,
180 d13_off = -24,
181 d11_off = -22,
182 d9_off = -20,
183
184 r28_off = -18,
185 r26_off = -16,
186 r24_off = -14,
187 r22_off = -12,
188 r20_off = -10,
189 call_wrapper_off = -8,
190 result_off = -7,
191 result_type_off = -6,
192 method_off = -5,
193 entry_point_off = -4,
194 parameter_size_off = -2,
195 thread_off = -1,
196 fp_f = 0,
197 retaddr_off = 1,
198 };
199
200 address generate_call_stub(address& return_address) {
201 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
202 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
203 "adjust this code");
204
205 StubId stub_id = StubId::stubgen_call_stub_id;
206 StubCodeMark mark(this, stub_id);
207 address start = __ pc();
208
209 const Address sp_after_call (rfp, sp_after_call_off * wordSize);
210
211 const Address fpcr_save (rfp, fpcr_off * wordSize);
212 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
213 const Address result (rfp, result_off * wordSize);
214 const Address result_type (rfp, result_type_off * wordSize);
215 const Address method (rfp, method_off * wordSize);
216 const Address entry_point (rfp, entry_point_off * wordSize);
217 const Address parameter_size(rfp, parameter_size_off * wordSize);
218
219 const Address thread (rfp, thread_off * wordSize);
220
221 const Address d15_save (rfp, d15_off * wordSize);
222 const Address d13_save (rfp, d13_off * wordSize);
223 const Address d11_save (rfp, d11_off * wordSize);
224 const Address d9_save (rfp, d9_off * wordSize);
225
226 const Address r28_save (rfp, r28_off * wordSize);
227 const Address r26_save (rfp, r26_off * wordSize);
228 const Address r24_save (rfp, r24_off * wordSize);
229 const Address r22_save (rfp, r22_off * wordSize);
230 const Address r20_save (rfp, r20_off * wordSize);
231
232 // stub code
233
234 address aarch64_entry = __ pc();
235
236 // set up frame and move sp to end of save area
237 __ enter();
238 __ sub(sp, rfp, -sp_after_call_off * wordSize);
239
240 // save register parameters and Java scratch/global registers
241 // n.b. we save thread even though it gets installed in
242 // rthread because we want to sanity check rthread later
243 __ str(c_rarg7, thread);
244 __ strw(c_rarg6, parameter_size);
245 __ stp(c_rarg4, c_rarg5, entry_point);
246 __ stp(c_rarg2, c_rarg3, result_type);
247 __ stp(c_rarg0, c_rarg1, call_wrapper);
248
249 __ stp(r20, r19, r20_save);
250 __ stp(r22, r21, r22_save);
251 __ stp(r24, r23, r24_save);
252 __ stp(r26, r25, r26_save);
253 __ stp(r28, r27, r28_save);
254
255 __ stpd(v9, v8, d9_save);
256 __ stpd(v11, v10, d11_save);
257 __ stpd(v13, v12, d13_save);
258 __ stpd(v15, v14, d15_save);
259
260 __ get_fpcr(rscratch1);
261 __ str(rscratch1, fpcr_save);
262 // Set FPCR to the state we need. We do want Round to Nearest. We
263 // don't want non-IEEE rounding modes or floating-point traps.
264 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
265 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
266 __ set_fpcr(rscratch1);
267
268 // install Java thread in global register now we have saved
269 // whatever value it held
270 __ mov(rthread, c_rarg7);
271 // And method
272 __ mov(rmethod, c_rarg3);
273
274 // set up the heapbase register
275 __ reinit_heapbase();
276
277 #ifdef ASSERT
278 // make sure we have no pending exceptions
279 {
280 Label L;
281 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
282 __ cmp(rscratch1, (u1)NULL_WORD);
283 __ br(Assembler::EQ, L);
284 __ stop("StubRoutines::call_stub: entered with pending exception");
285 __ BIND(L);
286 }
287 #endif
288 // pass parameters if any
289 __ mov(esp, sp);
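    // drop sp below the space needed for the parameters and keep it
    // 16-byte aligned (the andr clears the low four bits)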
290 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
291 __ andr(sp, rscratch1, -2 * wordSize);
292
293 BLOCK_COMMENT("pass parameters if any");
294 Label parameters_done;
295 // parameter count is still in c_rarg6
296 // and parameter pointer identifying param 1 is in c_rarg5
297 __ cbzw(c_rarg6, parameters_done);
298
299 address loop = __ pc();
300 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
301 __ subsw(c_rarg6, c_rarg6, 1);
302 __ push(rscratch1);
303 __ br(Assembler::GT, loop);
304
305 __ BIND(parameters_done);
306
307     // call Java entry -- passing the method and current sp
308 // rmethod: Method*
309 // r19_sender_sp: sender sp
310 BLOCK_COMMENT("call Java function");
311 __ mov(r19_sender_sp, sp);
312 __ blr(c_rarg4);
313
314 // we do this here because the notify will already have been done
315 // if we get to the next instruction via an exception
316 //
317 // n.b. adding this instruction here affects the calculation of
318 // whether or not a routine returns to the call stub (used when
319 // doing stack walks) since the normal test is to check the return
320 // pc against the address saved below. so we may need to allow for
321 // this extra instruction in the check.
322
323 // save current address for use by exception handling code
324
325 return_address = __ pc();
326
327 // store result depending on type (everything that is not
328 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
329 // n.b. this assumes Java returns an integral result in r0
330 // and a floating result in j_farg0
331 // All of j_rargN may be used to return inline type fields so be careful
332 // not to clobber those.
333 // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
334 // assignment of Rresult below.
335 Register Rresult = r14, Rresult_type = r15;
336 __ ldr(Rresult, result);
337 Label is_long, is_float, is_double, check_prim, exit;
338 __ ldr(Rresult_type, result_type);
339 __ cmp(Rresult_type, (u1)T_OBJECT);
340 __ br(Assembler::EQ, check_prim);
341 __ cmp(Rresult_type, (u1)T_LONG);
342 __ br(Assembler::EQ, is_long);
343 __ cmp(Rresult_type, (u1)T_FLOAT);
344 __ br(Assembler::EQ, is_float);
345 __ cmp(Rresult_type, (u1)T_DOUBLE);
346 __ br(Assembler::EQ, is_double);
347
348 // handle T_INT case
349 __ strw(r0, Address(Rresult));
350
351 __ BIND(exit);
352
353 // pop parameters
354 __ sub(esp, rfp, -sp_after_call_off * wordSize);
355
356 #ifdef ASSERT
357 // verify that threads correspond
358 {
359 Label L, S;
360 __ ldr(rscratch1, thread);
361 __ cmp(rthread, rscratch1);
362 __ br(Assembler::NE, S);
363 __ get_thread(rscratch1);
364 __ cmp(rthread, rscratch1);
365 __ br(Assembler::EQ, L);
366 __ BIND(S);
367 __ stop("StubRoutines::call_stub: threads must correspond");
368 __ BIND(L);
369 }
370 #endif
371
372 __ pop_cont_fastpath(rthread);
373
374 // restore callee-save registers
375 __ ldpd(v15, v14, d15_save);
376 __ ldpd(v13, v12, d13_save);
377 __ ldpd(v11, v10, d11_save);
378 __ ldpd(v9, v8, d9_save);
379
380 __ ldp(r28, r27, r28_save);
381 __ ldp(r26, r25, r26_save);
382 __ ldp(r24, r23, r24_save);
383 __ ldp(r22, r21, r22_save);
384 __ ldp(r20, r19, r20_save);
385
386 // restore fpcr
387 __ ldr(rscratch1, fpcr_save);
388 __ set_fpcr(rscratch1);
389
390 __ ldp(c_rarg0, c_rarg1, call_wrapper);
391 __ ldrw(c_rarg2, result_type);
392 __ ldr(c_rarg3, method);
393 __ ldp(c_rarg4, c_rarg5, entry_point);
394 __ ldp(c_rarg6, c_rarg7, parameter_size);
395
396 // leave frame and return to caller
397 __ leave();
398 __ ret(lr);
399
400 // handle return types different from T_INT
401 __ BIND(check_prim);
402 if (InlineTypeReturnedAsFields) {
403 // Check for scalarized return value
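      // (the low bit of r0 is the tag: if it is set, the remaining bits hold
      // the klass pointer used below to locate the pack handler)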
404 __ tbz(r0, 0, is_long);
405 // Load pack handler address
406 __ andr(rscratch1, r0, -2);
407 __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
408 __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
409 __ blr(rscratch1);
410 __ b(exit);
411 }
412
413 __ BIND(is_long);
414 __ str(r0, Address(Rresult, 0));
415 __ br(Assembler::AL, exit);
416
417 __ BIND(is_float);
418 __ strs(j_farg0, Address(Rresult, 0));
419 __ br(Assembler::AL, exit);
420
421 __ BIND(is_double);
422 __ strd(j_farg0, Address(Rresult, 0));
423 __ br(Assembler::AL, exit);
424
425 return start;
426 }
427
428 // Return point for a Java call if there's an exception thrown in
429 // Java code. The exception is caught and transformed into a
430 // pending exception stored in JavaThread that can be tested from
431 // within the VM.
432 //
433 // Note: Usually the parameters are removed by the callee. In case
434 // of an exception crossing an activation frame boundary, that is
435   // not the case if the callee is compiled code => need to set up
436   // sp.
437 //
438 // r0: exception oop
439
440 address generate_catch_exception() {
441 StubId stub_id = StubId::stubgen_catch_exception_id;
442 StubCodeMark mark(this, stub_id);
443 address start = __ pc();
444
445 // same as in generate_call_stub():
446 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
447 const Address thread (rfp, thread_off * wordSize);
448
449 #ifdef ASSERT
450 // verify that threads correspond
451 {
452 Label L, S;
453 __ ldr(rscratch1, thread);
454 __ cmp(rthread, rscratch1);
455 __ br(Assembler::NE, S);
456 __ get_thread(rscratch1);
457 __ cmp(rthread, rscratch1);
458 __ br(Assembler::EQ, L);
459 __ bind(S);
460 __ stop("StubRoutines::catch_exception: threads must correspond");
461 __ bind(L);
462 }
463 #endif
464
465 // set pending exception
466 __ verify_oop(r0);
467
468 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
469 __ mov(rscratch1, (address)__FILE__);
470 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
471 __ movw(rscratch1, (int)__LINE__);
472 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
473
474 // complete return to VM
475 assert(StubRoutines::_call_stub_return_address != nullptr,
476 "_call_stub_return_address must have been generated before");
477 __ b(StubRoutines::_call_stub_return_address);
478
479 return start;
480 }
481
482 // Continuation point for runtime calls returning with a pending
483 // exception. The pending exception check happened in the runtime
484 // or native call stub. The pending exception in Thread is
485 // converted into a Java-level exception.
486 //
487 // Contract with Java-level exception handlers:
488 // r0: exception
489 // r3: throwing pc
490 //
491 // NOTE: At entry of this stub, exception-pc must be in LR !!
492
493 // NOTE: this is always used as a jump target within generated code
494   // so it just needs to be generated code with no prolog
495
496 address generate_forward_exception() {
497 StubId stub_id = StubId::stubgen_forward_exception_id;
498 StubCodeMark mark(this, stub_id);
499 address start = __ pc();
500
501 // Upon entry, LR points to the return address returning into
502 // Java (interpreted or compiled) code; i.e., the return address
503 // becomes the throwing pc.
504 //
505 // Arguments pushed before the runtime call are still on the stack
506 // but the exception handler will reset the stack pointer ->
507 // ignore them. A potential result in registers can be ignored as
508 // well.
509
510 #ifdef ASSERT
511 // make sure this code is only executed if there is a pending exception
512 {
513 Label L;
514 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
515 __ cbnz(rscratch1, L);
516 __ stop("StubRoutines::forward exception: no pending exception (1)");
517 __ bind(L);
518 }
519 #endif
520
521 // compute exception handler into r19
522
523 // call the VM to find the handler address associated with the
524 // caller address. pass thread in r0 and caller pc (ret address)
525 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
526 // the stack.
527 __ mov(c_rarg1, lr);
528 // lr will be trashed by the VM call so we move it to R19
529 // (callee-saved) because we also need to pass it to the handler
530 // returned by this call.
531 __ mov(r19, lr);
532 BLOCK_COMMENT("call exception_handler_for_return_address");
533 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
534 SharedRuntime::exception_handler_for_return_address),
535 rthread, c_rarg1);
536 // Reinitialize the ptrue predicate register, in case the external runtime
537 // call clobbers ptrue reg, as we may return to SVE compiled code.
538 __ reinitialize_ptrue();
539
540 // we should not really care that lr is no longer the callee
541 // address. we saved the value the handler needs in r19 so we can
542 // just copy it to r3. however, the C2 handler will push its own
543     // frame and then call into the VM, and the VM code asserts that
544 // the PC for the frame above the handler belongs to a compiled
545 // Java method. So, we restore lr here to satisfy that assert.
546 __ mov(lr, r19);
547 // setup r0 & r3 & clear pending exception
548 __ mov(r3, r19);
549 __ mov(r19, r0);
550 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
551 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
552
553 #ifdef ASSERT
554 // make sure exception is set
555 {
556 Label L;
557 __ cbnz(r0, L);
558 __ stop("StubRoutines::forward exception: no pending exception (2)");
559 __ bind(L);
560 }
561 #endif
562
563 // continue at exception handler
564 // r0: exception
565 // r3: throwing pc
566 // r19: exception handler
567 __ verify_oop(r0);
568 __ br(r19);
569
570 return start;
571 }
572
573 // Non-destructive plausibility checks for oops
574 //
575 // Arguments:
576 // r0: oop to verify
577 // rscratch1: error message
578 //
579 // Stack after saving c_rarg3:
580 // [tos + 0]: saved c_rarg3
581 // [tos + 1]: saved c_rarg2
582 // [tos + 2]: saved lr
583 // [tos + 3]: saved rscratch2
584 // [tos + 4]: saved r0
585 // [tos + 5]: saved rscratch1
586 address generate_verify_oop() {
587 StubId stub_id = StubId::stubgen_verify_oop_id;
588 StubCodeMark mark(this, stub_id);
589 address start = __ pc();
590
591 Label exit, error;
592
593 // save c_rarg2 and c_rarg3
594 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
595
596 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
597 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
598 __ ldr(c_rarg3, Address(c_rarg2));
599 __ add(c_rarg3, c_rarg3, 1);
600 __ str(c_rarg3, Address(c_rarg2));
601
602 // object is in r0
603 // make sure object is 'reasonable'
604 __ cbz(r0, exit); // if obj is null it is OK
605
606 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
607 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
608
609 // return if everything seems ok
610 __ bind(exit);
611
612 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
613 __ ret(lr);
614
615 // handle errors
616 __ bind(error);
617 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
618
619 __ push(RegSet::range(r0, r29), sp);
620 // debug(char* msg, int64_t pc, int64_t regs[])
621 __ mov(c_rarg0, rscratch1); // pass address of error message
622 __ mov(c_rarg1, lr); // pass return address
623 __ mov(c_rarg2, sp); // pass address of regs on stack
624 #ifndef PRODUCT
625 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
626 #endif
627 BLOCK_COMMENT("call MacroAssembler::debug");
628 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
629 __ blr(rscratch1);
630 __ hlt(0);
631
632 return start;
633 }
634
635 // Generate indices for iota vector.
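  // The table emitted below provides, for each element size (byte, half,
  // word and double word), a 128-bit constant whose lanes hold the indices
  // 0, 1, 2, ..., followed by single- and double-precision variants holding
  // 0.0, 1.0, ...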
636 address generate_iota_indices(StubId stub_id) {
637 __ align(CodeEntryAlignment);
638 StubCodeMark mark(this, stub_id);
639 address start = __ pc();
640 // B
641 __ emit_data64(0x0706050403020100, relocInfo::none);
642 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
643 // H
644 __ emit_data64(0x0003000200010000, relocInfo::none);
645 __ emit_data64(0x0007000600050004, relocInfo::none);
646 // S
647 __ emit_data64(0x0000000100000000, relocInfo::none);
648 __ emit_data64(0x0000000300000002, relocInfo::none);
649 // D
650 __ emit_data64(0x0000000000000000, relocInfo::none);
651 __ emit_data64(0x0000000000000001, relocInfo::none);
652 // S - FP
653 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
654 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
655 // D - FP
656 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
657 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
658 return start;
659 }
660
661 // The inner part of zero_words(). This is the bulk operation,
662 // zeroing words in blocks, possibly using DC ZVA to do it. The
663 // caller is responsible for zeroing the last few words.
664 //
665 // Inputs:
666 // r10: the HeapWord-aligned base address of an array to zero.
667 // r11: the count in HeapWords, r11 > 0.
668 //
669 // Returns r10 and r11, adjusted for the caller to clear.
670 // r10: the base address of the tail of words left to clear.
671 // r11: the number of words in the tail.
672 // r11 < MacroAssembler::zero_words_block_size.
673
674 address generate_zero_blocks() {
675 Label done;
676 Label base_aligned;
677
678 Register base = r10, cnt = r11;
679
680 __ align(CodeEntryAlignment);
681 StubId stub_id = StubId::stubgen_zero_blocks_id;
682 StubCodeMark mark(this, stub_id);
683 address start = __ pc();
684
685 if (UseBlockZeroing) {
686 int zva_length = VM_Version::zva_length();
687
688 // Ensure ZVA length can be divided by 16. This is required by
689 // the subsequent operations.
690 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
691
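      // If base is only 8-byte aligned (bit 3 set), zero a single word first
      // so that the block-zeroing code below sees a 16-byte aligned base.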
692 __ tbz(base, 3, base_aligned);
693 __ str(zr, Address(__ post(base, 8)));
694 __ sub(cnt, cnt, 1);
695 __ bind(base_aligned);
696
697 // Ensure count >= zva_length * 2 so that it still deserves a zva after
698 // alignment.
699 Label small;
700 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
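      // low_limit is a byte count while cnt is in words, hence the shift
      // right by LogBytesPerWord (3) below.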
701 __ subs(rscratch1, cnt, low_limit >> 3);
702 __ br(Assembler::LT, small);
703 __ zero_dcache_blocks(base, cnt);
704 __ bind(small);
705 }
706
707 {
708 // Number of stp instructions we'll unroll
709 const int unroll =
710 MacroAssembler::zero_words_block_size / 2;
711 // Clear the remaining blocks.
712 Label loop;
713 __ subs(cnt, cnt, unroll * 2);
714 __ br(Assembler::LT, done);
715 __ bind(loop);
716 for (int i = 0; i < unroll; i++)
717 __ stp(zr, zr, __ post(base, 16));
718 __ subs(cnt, cnt, unroll * 2);
719 __ br(Assembler::GE, loop);
720 __ bind(done);
721 __ add(cnt, cnt, unroll * 2);
722 }
723
724 __ ret(lr);
725
726 return start;
727 }
728
729
730 typedef enum {
731 copy_forwards = 1,
732 copy_backwards = -1
733 } copy_direction;
734
735 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
736 // for arraycopy stubs.
737 class ArrayCopyBarrierSetHelper : StackObj {
738 BarrierSetAssembler* _bs_asm;
739 MacroAssembler* _masm;
740 DecoratorSet _decorators;
741 BasicType _type;
742 Register _gct1;
743 Register _gct2;
744 Register _gct3;
745 FloatRegister _gcvt1;
746 FloatRegister _gcvt2;
747 FloatRegister _gcvt3;
748
749 public:
750 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
751 DecoratorSet decorators,
752 BasicType type,
753 Register gct1,
754 Register gct2,
755 Register gct3,
756 FloatRegister gcvt1,
757 FloatRegister gcvt2,
758 FloatRegister gcvt3)
759 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
760 _masm(masm),
761 _decorators(decorators),
762 _type(type),
763 _gct1(gct1),
764 _gct2(gct2),
765 _gct3(gct3),
766 _gcvt1(gcvt1),
767 _gcvt2(gcvt2),
768 _gcvt3(gcvt3) {
769 }
770
771 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
772 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
773 dst1, dst2, src,
774 _gct1, _gct2, _gcvt1);
775 }
776
777 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
778 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
779 dst, src1, src2,
780 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
781 }
782
783 void copy_load_at_16(Register dst1, Register dst2, Address src) {
784 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
785 dst1, dst2, src,
786 _gct1);
787 }
788
789 void copy_store_at_16(Address dst, Register src1, Register src2) {
790 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
791 dst, src1, src2,
792 _gct1, _gct2, _gct3);
793 }
794
795 void copy_load_at_8(Register dst, Address src) {
796 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
797 dst, noreg, src,
798 _gct1);
799 }
800
801 void copy_store_at_8(Address dst, Register src) {
802 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
803 dst, src, noreg,
804 _gct1, _gct2, _gct3);
805 }
806 };
807
808 // Bulk copy of blocks of 8 words.
809 //
810 // count is a count of words.
811 //
812 // Precondition: count >= 8
813 //
814 // Postconditions:
815 //
816 // The least significant bit of count contains the remaining count
817 // of words to copy. The rest of count is trash.
818 //
819 // s and d are adjusted to point to the remaining words to copy
820 //
821 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
822 BasicType type;
823 copy_direction direction;
824
825 switch (stub_id) {
826 case StubId::stubgen_copy_byte_f_id:
827 direction = copy_forwards;
828 type = T_BYTE;
829 break;
830 case StubId::stubgen_copy_byte_b_id:
831 direction = copy_backwards;
832 type = T_BYTE;
833 break;
834 case StubId::stubgen_copy_oop_f_id:
835 direction = copy_forwards;
836 type = T_OBJECT;
837 break;
838 case StubId::stubgen_copy_oop_b_id:
839 direction = copy_backwards;
840 type = T_OBJECT;
841 break;
842 case StubId::stubgen_copy_oop_uninit_f_id:
843 direction = copy_forwards;
844 type = T_OBJECT;
845 break;
846 case StubId::stubgen_copy_oop_uninit_b_id:
847 direction = copy_backwards;
848 type = T_OBJECT;
849 break;
850 default:
851 ShouldNotReachHere();
852 }
853
854 int unit = wordSize * direction;
855 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
856
857 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
858 t4 = r7, t5 = r11, t6 = r12, t7 = r13;
859 const Register stride = r14;
860 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
861 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
862 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
863
864 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
865 assert_different_registers(s, d, count, rscratch1, rscratch2);
866
867 Label again, drain;
868
869 __ align(CodeEntryAlignment);
870
871 StubCodeMark mark(this, stub_id);
872
873 address start = __ pc();
874
875 Label unaligned_copy_long;
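    // If unaligned accesses are to be avoided and d is only 8-byte aligned
    // (bit 3 set), take the unaligned_copy_long path below, which stores a
    // single word first so that the subsequent paired stores are 16-byte
    // aligned.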
876 if (AvoidUnalignedAccesses) {
877 __ tbnz(d, 3, unaligned_copy_long);
878 }
879
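    // For a forwards copy, bias s and d downwards by one load/store width
    // (16 bytes, or 32 with SIMD) so that the fixed positive offsets plus
    // the final 8-word pre-increment used in the copy loop walk each
    // 64-byte block starting at the original addresses.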
880 if (direction == copy_forwards) {
881 __ sub(s, s, bias);
882 __ sub(d, d, bias);
883 }
884
885 #ifdef ASSERT
886 // Make sure we are never given < 8 words
887 {
888 Label L;
889 __ cmp(count, (u1)8);
890 __ br(Assembler::GE, L);
891       __ stop("generate_copy_longs called with < 8 words");
892 __ bind(L);
893 }
894 #endif
895
896 // Fill 8 registers
897 if (UseSIMDForMemoryOps) {
898 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
899 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
900 } else {
901 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
902 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
903 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
904 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
905 }
906
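    // Eight words are now in flight. Bias count down by 16 (the 8 already
    // loaded plus the 8 the main loop loads each pass) so the loop below can
    // simply continue while the biased count stays non-negative; the drain
    // code stores the last 8 words still held in registers.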
907 __ subs(count, count, 16);
908 __ br(Assembler::LO, drain);
909
910 int prefetch = PrefetchCopyIntervalInBytes;
911 bool use_stride = false;
912 if (direction == copy_backwards) {
913 use_stride = prefetch > 256;
914 prefetch = -prefetch;
915 if (use_stride) __ mov(stride, prefetch);
916 }
917
918 __ bind(again);
919
920 if (PrefetchCopyIntervalInBytes > 0)
921 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
922
923 if (UseSIMDForMemoryOps) {
924 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
925 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
926 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
927 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
928 } else {
929 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
930 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
931 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
932 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
933 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
934 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
935 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
936 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
937 }
938
939 __ subs(count, count, 8);
940 __ br(Assembler::HS, again);
941
942 // Drain
943 __ bind(drain);
944 if (UseSIMDForMemoryOps) {
945 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
946 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
947 } else {
948 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
949 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
950 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
951 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
952 }
953
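    // Copy any remaining part block, which may include a 4-word subblock
    // and/or a 2-word subblock; bits 2 and 1 of count tell us which are
    // present.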
954 {
955 Label L1, L2;
956 __ tbz(count, exact_log2(4), L1);
957 if (UseSIMDForMemoryOps) {
958 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
959 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
960 } else {
961 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
962 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
963 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
964 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
965 }
966 __ bind(L1);
967
968 if (direction == copy_forwards) {
969 __ add(s, s, bias);
970 __ add(d, d, bias);
971 }
972
973 __ tbz(count, 1, L2);
974 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
975 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
976 __ bind(L2);
977 }
978
979 __ ret(lr);
980
981 if (AvoidUnalignedAccesses) {
982 Label drain, again;
983 // Register order for storing. Order is different for backward copy.
984
985 __ bind(unaligned_copy_long);
986
987 // source address is even aligned, target odd aligned
988 //
989 // when forward copying word pairs we read long pairs at offsets
990 // {0, 2, 4, 6} (in long words). when backwards copying we read
991 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
992 // address by -2 in the forwards case so we can compute the
993 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
994 // or -1.
995 //
996 // when forward copying we need to store 1 word, 3 pairs and
997 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
998     // zero offset, we adjust the destination by -1, which means we
999     // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
1000 //
1001     // When backwards copying we need to store 1 word, 3 pairs and
1002 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
1003 // offsets {1, 3, 5, 7, 8} * unit.
1004
1005 if (direction == copy_forwards) {
1006 __ sub(s, s, 16);
1007 __ sub(d, d, 8);
1008 }
1009
1010 // Fill 8 registers
1011 //
1012 // for forwards copy s was offset by -16 from the original input
1013 // value of s so the register contents are at these offsets
1014 // relative to the 64 bit block addressed by that original input
1015 // and so on for each successive 64 byte block when s is updated
1016 //
1017 // t0 at offset 0, t1 at offset 8
1018 // t2 at offset 16, t3 at offset 24
1019 // t4 at offset 32, t5 at offset 40
1020 // t6 at offset 48, t7 at offset 56
1021
1022 // for backwards copy s was not offset so the register contents
1023 // are at these offsets into the preceding 64 byte block
1024 // relative to that original input and so on for each successive
1025 // preceding 64 byte block when s is updated. this explains the
1026 // slightly counter-intuitive looking pattern of register usage
1027 // in the stp instructions for backwards copy.
1028 //
1029 // t0 at offset -16, t1 at offset -8
1030 // t2 at offset -32, t3 at offset -24
1031 // t4 at offset -48, t5 at offset -40
1032 // t6 at offset -64, t7 at offset -56
1033
1034 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1035 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1036 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1037 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1038
1039 __ subs(count, count, 16);
1040 __ br(Assembler::LO, drain);
1041
1042 int prefetch = PrefetchCopyIntervalInBytes;
1043 bool use_stride = false;
1044 if (direction == copy_backwards) {
1045 use_stride = prefetch > 256;
1046 prefetch = -prefetch;
1047 if (use_stride) __ mov(stride, prefetch);
1048 }
1049
1050 __ bind(again);
1051
1052 if (PrefetchCopyIntervalInBytes > 0)
1053 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1054
1055 if (direction == copy_forwards) {
1056 // allowing for the offset of -8 the store instructions place
1057 // registers into the target 64 bit block at the following
1058 // offsets
1059 //
1060 // t0 at offset 0
1061 // t1 at offset 8, t2 at offset 16
1062 // t3 at offset 24, t4 at offset 32
1063 // t5 at offset 40, t6 at offset 48
1064 // t7 at offset 56
1065
1066 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1067 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1068 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1069 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1070 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1071 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1072 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1073 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1074 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1075 } else {
1076 // d was not offset when we started so the registers are
1077 // written into the 64 bit block preceding d with the following
1078 // offsets
1079 //
1080 // t1 at offset -8
1081 // t3 at offset -24, t0 at offset -16
1082         // t5 at offset -40, t2 at offset -32
1083 // t7 at offset -56, t4 at offset -48
1084 // t6 at offset -64
1085 //
1086 // note that this matches the offsets previously noted for the
1087 // loads
1088
1089 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1090 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1091 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1092 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1093 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1094 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1095 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1096 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1097 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1098 }
1099
1100 __ subs(count, count, 8);
1101 __ br(Assembler::HS, again);
1102
1103 // Drain
1104 //
1105 // this uses the same pattern of offsets and register arguments
1106 // as above
1107 __ bind(drain);
1108 if (direction == copy_forwards) {
1109 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1110 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1111 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1112 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1113 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1114 } else {
1115 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1116 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1117 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1118 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1119 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1120 }
1121 // now we need to copy any remaining part block which may
1122 // include a 4 word block subblock and/or a 2 word subblock.
1123 // bits 2 and 1 in the count are the tell-tale for whether we
1124 // have each such subblock
1125 {
1126 Label L1, L2;
1127 __ tbz(count, exact_log2(4), L1);
1128 // this is the same as above but copying only 4 longs hence
1129 // with only one intervening stp between the str instructions
1130 // but note that the offsets and registers still follow the
1131 // same pattern
1132 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1133 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1134 if (direction == copy_forwards) {
1135 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1136 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1137 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1138 } else {
1139 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1140 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1141 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1142 }
1143 __ bind(L1);
1144
1145 __ tbz(count, 1, L2);
1146 // this is the same as above but copying only 2 longs hence
1147 // there is no intervening stp between the str instructions
1148 // but note that the offset and register patterns are still
1149 // the same
1150 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1151 if (direction == copy_forwards) {
1152 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1153 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1154 } else {
1155 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1156 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1157 }
1158 __ bind(L2);
1159
1160 // for forwards copy we need to re-adjust the offsets we
1161       // applied so that s and d follow the last words written
1162
1163 if (direction == copy_forwards) {
1164 __ add(s, s, 16);
1165 __ add(d, d, 8);
1166 }
1167
1168 }
1169
1170 __ ret(lr);
1171 }
1172
1173 return start;
1174 }
1175
1176 // Small copy: less than 16 bytes.
1177 //
1178 // NB: Ignores all of the bits of count which represent more than 15
1179 // bytes, so a caller doesn't have to mask them.
1180
1181 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1182 bool is_backwards = step < 0;
1183 size_t granularity = g_uabs(step);
1184 int direction = is_backwards ? -1 : 1;
1185
1186 Label Lword, Lint, Lshort, Lbyte;
1187
1188 assert(granularity
1189 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1190
1191 const Register t0 = r3;
1192 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1193 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1194
1195 // ??? I don't know if this bit-test-and-branch is the right thing
1196 // to do. It does a lot of jumping, resulting in several
1197 // mispredicted branches. It might make more sense to do this
1198 // with something like Duff's device with a single computed branch.
1199
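    // count is in elements of size 'granularity', so bit
    // (3 - exact_log2(granularity)) of count is set exactly when at least
    // 8 more bytes remain to copy; the 4, 2 and 1 byte tails are tested the
    // same way below.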
1200 __ tbz(count, 3 - exact_log2(granularity), Lword);
1201 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1202 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1203 __ bind(Lword);
1204
1205 if (granularity <= sizeof (jint)) {
1206 __ tbz(count, 2 - exact_log2(granularity), Lint);
1207 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1208 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1209 __ bind(Lint);
1210 }
1211
1212 if (granularity <= sizeof (jshort)) {
1213 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1214 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1215 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1216 __ bind(Lshort);
1217 }
1218
1219 if (granularity <= sizeof (jbyte)) {
1220 __ tbz(count, 0, Lbyte);
1221 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1222 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1223 __ bind(Lbyte);
1224 }
1225 }
1226
1227 // All-singing all-dancing memory copy.
1228 //
1229 // Copy count units of memory from s to d. The size of a unit is
1230 // step, which can be positive or negative depending on the direction
1231 // of copy. If is_aligned is false, we align the source address.
1232 //
1233
1234 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1235 Register s, Register d, Register count, int step) {
1236 copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1237 bool is_backwards = step < 0;
1238 unsigned int granularity = g_uabs(step);
1239 const Register t0 = r3, t1 = r4;
1240
1241     // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't matter because we always
1242 // load all the data before writing anything
1243 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1244 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1245 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1246 const Register send = r17, dend = r16;
1247 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1248 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1249 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1250
1251 if (PrefetchCopyIntervalInBytes > 0)
1252 __ prfm(Address(s, 0), PLDL1KEEP);
1253 __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1254 __ br(Assembler::HI, copy_big);
1255
1256 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1257 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
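    // send and dend point just past the last source and destination
    // elements. Each size class below loads a head from s and a tail from
    // send, then stores to d and dend; since every load happens before any
    // store, the overlap between the two halves is harmless.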
1258
1259 __ cmp(count, u1(16/granularity));
1260 __ br(Assembler::LS, copy16);
1261
1262 __ cmp(count, u1(64/granularity));
1263 __ br(Assembler::HI, copy80);
1264
1265 __ cmp(count, u1(32/granularity));
1266 __ br(Assembler::LS, copy32);
1267
1268 // 33..64 bytes
1269 if (UseSIMDForMemoryOps) {
1270 bs.copy_load_at_32(v0, v1, Address(s, 0));
1271 bs.copy_load_at_32(v2, v3, Address(send, -32));
1272 bs.copy_store_at_32(Address(d, 0), v0, v1);
1273 bs.copy_store_at_32(Address(dend, -32), v2, v3);
1274 } else {
1275 bs.copy_load_at_16(t0, t1, Address(s, 0));
1276 bs.copy_load_at_16(t2, t3, Address(s, 16));
1277 bs.copy_load_at_16(t4, t5, Address(send, -32));
1278 bs.copy_load_at_16(t6, t7, Address(send, -16));
1279
1280 bs.copy_store_at_16(Address(d, 0), t0, t1);
1281 bs.copy_store_at_16(Address(d, 16), t2, t3);
1282 bs.copy_store_at_16(Address(dend, -32), t4, t5);
1283 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1284 }
1285 __ b(finish);
1286
1287 // 17..32 bytes
1288 __ bind(copy32);
1289 bs.copy_load_at_16(t0, t1, Address(s, 0));
1290 bs.copy_load_at_16(t6, t7, Address(send, -16));
1291
1292 bs.copy_store_at_16(Address(d, 0), t0, t1);
1293 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1294 __ b(finish);
1295
1296 // 65..80/96 bytes
1297     // (96 bytes if SIMD because we do 32 bytes per instruction)
1298 __ bind(copy80);
1299 if (UseSIMDForMemoryOps) {
1300 bs.copy_load_at_32(v0, v1, Address(s, 0));
1301 bs.copy_load_at_32(v2, v3, Address(s, 32));
1302 // Unaligned pointers can be an issue for copying.
1303 // The issue has more chances to happen when granularity of data is
1304 // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
1305 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1306 // The most performance drop has been seen for the range 65-80 bytes.
1307 // For such cases using the pair of ldp/stp instead of the third pair of
1308 // ldpq/stpq fixes the performance issue.
1309 if (granularity < sizeof (jint)) {
1310 Label copy96;
1311 __ cmp(count, u1(80/granularity));
1312 __ br(Assembler::HI, copy96);
1313 bs.copy_load_at_16(t0, t1, Address(send, -16));
1314
1315 bs.copy_store_at_32(Address(d, 0), v0, v1);
1316 bs.copy_store_at_32(Address(d, 32), v2, v3);
1317
1318 bs.copy_store_at_16(Address(dend, -16), t0, t1);
1319 __ b(finish);
1320
1321 __ bind(copy96);
1322 }
1323 bs.copy_load_at_32(v4, v5, Address(send, -32));
1324
1325 bs.copy_store_at_32(Address(d, 0), v0, v1);
1326 bs.copy_store_at_32(Address(d, 32), v2, v3);
1327
1328 bs.copy_store_at_32(Address(dend, -32), v4, v5);
1329 } else {
1330 bs.copy_load_at_16(t0, t1, Address(s, 0));
1331 bs.copy_load_at_16(t2, t3, Address(s, 16));
1332 bs.copy_load_at_16(t4, t5, Address(s, 32));
1333 bs.copy_load_at_16(t6, t7, Address(s, 48));
1334 bs.copy_load_at_16(t8, t9, Address(send, -16));
1335
1336 bs.copy_store_at_16(Address(d, 0), t0, t1);
1337 bs.copy_store_at_16(Address(d, 16), t2, t3);
1338 bs.copy_store_at_16(Address(d, 32), t4, t5);
1339 bs.copy_store_at_16(Address(d, 48), t6, t7);
1340 bs.copy_store_at_16(Address(dend, -16), t8, t9);
1341 }
1342 __ b(finish);
1343
1344 // 0..16 bytes
1345 __ bind(copy16);
1346 __ cmp(count, u1(8/granularity));
1347 __ br(Assembler::LO, copy8);
1348
1349 // 8..16 bytes
1350 bs.copy_load_at_8(t0, Address(s, 0));
1351 bs.copy_load_at_8(t1, Address(send, -8));
1352 bs.copy_store_at_8(Address(d, 0), t0);
1353 bs.copy_store_at_8(Address(dend, -8), t1);
1354 __ b(finish);
1355
1356 if (granularity < 8) {
1357 // 4..7 bytes
1358 __ bind(copy8);
1359 __ tbz(count, 2 - exact_log2(granularity), copy4);
1360 __ ldrw(t0, Address(s, 0));
1361 __ ldrw(t1, Address(send, -4));
1362 __ strw(t0, Address(d, 0));
1363 __ strw(t1, Address(dend, -4));
1364 __ b(finish);
1365 if (granularity < 4) {
1366 // 0..3 bytes
1367 __ bind(copy4);
1368 __ cbz(count, finish); // get rid of 0 case
1369 if (granularity == 2) {
1370 __ ldrh(t0, Address(s, 0));
1371 __ strh(t0, Address(d, 0));
1372 } else { // granularity == 1
1373 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1374 // the first and last byte.
1375 // Handle the 3 byte case by loading and storing base + count/2
1376 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1377         // This does mean that in the 1 byte case we load/store the same
1378 // byte 3 times.
1379 __ lsr(count, count, 1);
1380 __ ldrb(t0, Address(s, 0));
1381 __ ldrb(t1, Address(send, -1));
1382 __ ldrb(t2, Address(s, count));
1383 __ strb(t0, Address(d, 0));
1384 __ strb(t1, Address(dend, -1));
1385 __ strb(t2, Address(d, count));
1386 }
1387 __ b(finish);
1388 }
1389 }
1390
1391 __ bind(copy_big);
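    // A backwards copy walks down from the end of the arrays, so advance
    // s and d to just past their last elements before starting.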
1392 if (is_backwards) {
1393 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1394 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1395 }
1396
1397 // Now we've got the small case out of the way we can align the
1398 // source address on a 2-word boundary.
1399
1400 // Here we will materialize a count in r15, which is used by copy_memory_small
1401 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1402 // Up until here, we have used t9, which aliases r15, but from here on, that register
1403 // can not be used as a temp register, as it contains the count.
1404
1405 Label aligned;
1406
1407 if (is_aligned) {
1408 // We may have to adjust by 1 word to get s 2-word-aligned.
1409 __ tbz(s, exact_log2(wordSize), aligned);
1410 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1411 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1412 __ sub(count, count, wordSize/granularity);
1413 } else {
1414 if (is_backwards) {
1415 __ andr(r15, s, 2 * wordSize - 1);
1416 } else {
1417 __ neg(r15, s);
1418 __ andr(r15, r15, 2 * wordSize - 1);
1419 }
1420 // r15 is the byte adjustment needed to align s.
1421 __ cbz(r15, aligned);
1422 int shift = exact_log2(granularity);
1423 if (shift > 0) {
1424 __ lsr(r15, r15, shift);
1425 }
1426 __ sub(count, count, r15);
1427
1428 #if 0
1429 // ?? This code is only correct for a disjoint copy. It may or
1430 // may not make sense to use it in that case.
1431
1432 // Copy the first pair; s and d may not be aligned.
1433 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1434 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1435
1436 // Align s and d, adjust count
1437 if (is_backwards) {
1438 __ sub(s, s, r15);
1439 __ sub(d, d, r15);
1440 } else {
1441 __ add(s, s, r15);
1442 __ add(d, d, r15);
1443 }
1444 #else
1445 copy_memory_small(decorators, type, s, d, r15, step);
1446 #endif
1447 }
1448
1449 __ bind(aligned);
1450
1451 // s is now 2-word-aligned.
1452
1453 // We have a count of units and some trailing bytes. Adjust the
1454 // count and do a bulk copy of words. If the shift is zero
1455 // perform a move instead to benefit from zero latency moves.
1456 int shift = exact_log2(wordSize/granularity);
1457 if (shift > 0) {
1458 __ lsr(r15, count, shift);
1459 } else {
1460 __ mov(r15, count);
1461 }
1462 if (direction == copy_forwards) {
1463 if (type != T_OBJECT) {
1464 __ bl(StubRoutines::aarch64::copy_byte_f());
1465 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1466 __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
1467 } else {
1468 __ bl(StubRoutines::aarch64::copy_oop_f());
1469 }
1470 } else {
1471 if (type != T_OBJECT) {
1472 __ bl(StubRoutines::aarch64::copy_byte_b());
1473 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1474 __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
1475 } else {
1476 __ bl(StubRoutines::aarch64::copy_oop_b());
1477 }
1478 }
1479
1480 // And the tail.
1481 copy_memory_small(decorators, type, s, d, count, step);
1482
1483 if (granularity >= 8) __ bind(copy8);
1484 if (granularity >= 4) __ bind(copy4);
1485 __ bind(finish);
1486 }
1487
1488
1489 void clobber_registers() {
1490 #ifdef ASSERT
1491 RegSet clobbered
1492 = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1493 __ mov(rscratch1, (uint64_t)0xdeadbeef);
1494 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
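    // rscratch1 now holds 0xdeadbeefdeadbeef; splat it into the remaining
    // call-clobbered registers so stale uses are easy to spot when debugging.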
1495 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1496 __ mov(*it, rscratch1);
1497 }
1498 #endif
1499
1500 }
1501
1502 // Scan over array at a for count oops, verifying each one.
1503 // Preserves a and count, clobbers rscratch1 and rscratch2.
1504 void verify_oop_array (int size, Register a, Register count, Register temp) {
1505 Label loop, end;
1506 __ mov(rscratch1, a);
1507 __ mov(rscratch2, zr);
1508 __ bind(loop);
1509 __ cmp(rscratch2, count);
1510 __ br(Assembler::HS, end);
1511 if (size == wordSize) {
1512 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1513 __ verify_oop(temp);
1514 } else {
1515 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1516 __ decode_heap_oop(temp); // calls verify_oop
1517 }
1518 __ add(rscratch2, rscratch2, 1);
1519 __ b(loop);
1520 __ bind(end);
1521 }
1522
1523 // Arguments:
1524 // stub_id - is used to name the stub and identify all details of
1525 // how to perform the copy.
1526 //
1527   //   nopush_entry - is assigned the stub's post push entry point unless
1528 // it is null
1529 //
1530 // Inputs:
1531 // c_rarg0 - source array address
1532 // c_rarg1 - destination array address
1533 // c_rarg2 - element count, treated as ssize_t, can be zero
1534 //
1535 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1536 // the hardware handle it. The two dwords within qwords that span
1537 // cache line boundaries will still be loaded and stored atomically.
1538 //
1539 // Side Effects: nopush_entry is set to the (post push) entry point
1540 // so it can be used by the corresponding conjoint
1541 // copy method
1542 //
1543 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
1544 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1545 RegSet saved_reg = RegSet::of(s, d, count);
1546 int size;
1547 bool aligned;
1548 bool is_oop;
1549 bool dest_uninitialized;
1550 switch (stub_id) {
1551 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1552 size = sizeof(jbyte);
1553 aligned = false;
1554 is_oop = false;
1555 dest_uninitialized = false;
1556 break;
1557 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1558 size = sizeof(jbyte);
1559 aligned = true;
1560 is_oop = false;
1561 dest_uninitialized = false;
1562 break;
1563 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1564 size = sizeof(jshort);
1565 aligned = false;
1566 is_oop = false;
1567 dest_uninitialized = false;
1568 break;
1569 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1570 size = sizeof(jshort);
1571 aligned = true;
1572 is_oop = false;
1573 dest_uninitialized = false;
1574 break;
1575 case StubId::stubgen_jint_disjoint_arraycopy_id:
1576 size = sizeof(jint);
1577 aligned = false;
1578 is_oop = false;
1579 dest_uninitialized = false;
1580 break;
1581 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1582 size = sizeof(jint);
1583 aligned = true;
1584 is_oop = false;
1585 dest_uninitialized = false;
1586 break;
1587 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1588 // since this is always aligned we can (should!) use the same
1589 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
1590 ShouldNotReachHere();
1591 break;
1592 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1593 size = sizeof(jlong);
1594 aligned = true;
1595 is_oop = false;
1596 dest_uninitialized = false;
1597 break;
1598 case StubId::stubgen_oop_disjoint_arraycopy_id:
1599 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1600 aligned = !UseCompressedOops;
1601 is_oop = true;
1602 dest_uninitialized = false;
1603 break;
1604 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1605 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1606 aligned = !UseCompressedOops;
1607 is_oop = true;
1608 dest_uninitialized = false;
1609 break;
1610 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1611 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1612 aligned = !UseCompressedOops;
1613 is_oop = true;
1614 dest_uninitialized = true;
1615 break;
1616 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1617 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1618 aligned = !UseCompressedOops;
1619 is_oop = true;
1620 dest_uninitialized = true;
1621 break;
1622 default:
1623 ShouldNotReachHere();
1624 break;
1625 }
1626
1627 __ align(CodeEntryAlignment);
1628 StubCodeMark mark(this, stub_id);
1629 address start = __ pc();
1630 __ enter();
1631
1632 if (nopush_entry != nullptr) {
1633 *nopush_entry = __ pc();
1634 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1635 BLOCK_COMMENT("Entry:");
1636 }
1637
1638 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1639 if (dest_uninitialized) {
1640 decorators |= IS_DEST_UNINITIALIZED;
1641 }
1642 if (aligned) {
1643 decorators |= ARRAYCOPY_ALIGNED;
1644 }
1645
1646 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1647 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1648
1649 if (is_oop) {
1650 // save regs before copy_memory
1651 __ push(RegSet::of(d, count), sp);
1652 }
1653 {
1654 // UnsafeMemoryAccess page error: continue after unsafe access
1655 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1656 UnsafeMemoryAccessMark umam(this, add_entry, true);
1657 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1658 }
1659
1660 if (is_oop) {
1661 __ pop(RegSet::of(d, count), sp);
1662 if (VerifyOops)
1663 verify_oop_array(size, d, count, r16);
1664 }
1665
1666 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1667
1668 __ leave();
1669 __ mov(r0, zr); // return 0
1670 __ ret(lr);
1671 return start;
1672 }
1673
1674 // Arguments:
1675 // stub_id - is used to name the stub and identify all details of
1676 // how to perform the copy.
1677 //
1678 // nooverlap_target - identifies the (post-push) entry of the
1679 // corresponding disjoint copy routine, which is
1680 // jumped to if the ranges do not actually overlap
1681 //
1682 // nopush_entry - if non-null, is assigned the stub's post-push
1683 // entry point
1684 //
1685 //
1686 // Inputs:
1687 // c_rarg0 - source array address
1688 // c_rarg1 - destination array address
1689 // c_rarg2 - element count, treated as ssize_t, can be zero
1690 //
1691 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1692 // the hardware handle it. The two dwords within qwords that span
1693 // cache line boundaries will still be loaded and stored atomically.
1694 //
1695 // Side Effects:
1696 // nopush_entry is set to this stub's post-push entry point so it
1697 // can be used as a fallback by the unsafe/generic copy stubs
1698 //
1699 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1700 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1701 RegSet saved_regs = RegSet::of(s, d, count);
1702 int size;
1703 bool aligned;
1704 bool is_oop;
1705 bool dest_uninitialized;
1706 switch (stub_id) {
1707 case StubId::stubgen_jbyte_arraycopy_id:
1708 size = sizeof(jbyte);
1709 aligned = false;
1710 is_oop = false;
1711 dest_uninitialized = false;
1712 break;
1713 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1714 size = sizeof(jbyte);
1715 aligned = true;
1716 is_oop = false;
1717 dest_uninitialized = false;
1718 break;
1719 case StubId::stubgen_jshort_arraycopy_id:
1720 size = sizeof(jshort);
1721 aligned = false;
1722 is_oop = false;
1723 dest_uninitialized = false;
1724 break;
1725 case StubId::stubgen_arrayof_jshort_arraycopy_id:
1726 size = sizeof(jshort);
1727 aligned = true;
1728 is_oop = false;
1729 dest_uninitialized = false;
1730 break;
1731 case StubId::stubgen_jint_arraycopy_id:
1732 size = sizeof(jint);
1733 aligned = false;
1734 is_oop = false;
1735 dest_uninitialized = false;
1736 break;
1737 case StubId::stubgen_arrayof_jint_arraycopy_id:
1738 size = sizeof(jint);
1739 aligned = true;
1740 is_oop = false;
1741 dest_uninitialized = false;
1742 break;
1743 case StubId::stubgen_jlong_arraycopy_id:
1744 // since this is always aligned we can (should!) use the same
1745 // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
1746 ShouldNotReachHere();
1747 break;
1748 case StubId::stubgen_arrayof_jlong_arraycopy_id:
1749 size = sizeof(jlong);
1750 aligned = true;
1751 is_oop = false;
1752 dest_uninitialized = false;
1753 break;
1754 case StubId::stubgen_oop_arraycopy_id:
1755 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1756 aligned = !UseCompressedOops;
1757 is_oop = true;
1758 dest_uninitialized = false;
1759 break;
1760 case StubId::stubgen_arrayof_oop_arraycopy_id:
1761 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1762 aligned = !UseCompressedOops;
1763 is_oop = true;
1764 dest_uninitialized = false;
1765 break;
1766 case StubId::stubgen_oop_arraycopy_uninit_id:
1767 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1768 aligned = !UseCompressedOops;
1769 is_oop = true;
1770 dest_uninitialized = true;
1771 break;
1772 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1773 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1774 aligned = !UseCompressedOops;
1775 is_oop = true;
1776 dest_uninitialized = true;
1777 break;
1778 default:
1779 ShouldNotReachHere();
1780 }
1781
1782 StubCodeMark mark(this, stub_id);
1783 address start = __ pc();
1784 __ enter();
1785
1786 if (nopush_entry != nullptr) {
1787 *nopush_entry = __ pc();
1788 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1789 BLOCK_COMMENT("Entry:");
1790 }
1791
1792 // use fwd copy when (d-s) above_equal (count*size)
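// In C terms this test is roughly (a sketch, not the emitted code):
//   if ((uintptr_t)(d - s) >= (uintptr_t)count << log2(size))
//     goto nooverlap_target;   // forward (disjoint) copy is safe
//   // otherwise d lies inside [s, s + count*size): fall through and copy backwards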
1793 Label L_overlapping;
1794 __ sub(rscratch1, d, s);
1795 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1796 __ br(Assembler::LO, L_overlapping);
1797 __ b(RuntimeAddress(nooverlap_target));
1798 __ bind(L_overlapping);
1799
1800 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1801 if (dest_uninitialized) {
1802 decorators |= IS_DEST_UNINITIALIZED;
1803 }
1804 if (aligned) {
1805 decorators |= ARRAYCOPY_ALIGNED;
1806 }
1807
1808 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1809 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1810
1811 if (is_oop) {
1812 // save regs before copy_memory
1813 __ push(RegSet::of(d, count), sp);
1814 }
1815 {
1816 // UnsafeMemoryAccess page error: continue after unsafe access
1817 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1818 UnsafeMemoryAccessMark umam(this, add_entry, true);
1819 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1820 }
1821 if (is_oop) {
1822 __ pop(RegSet::of(d, count), sp);
1823 if (VerifyOops)
1824 verify_oop_array(size, d, count, r16);
1825 }
1826 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1827 __ leave();
1828 __ mov(r0, zr); // return 0
1829 __ ret(lr);
1830 return start;
1831 }
1832
1833 // Helper for generating a dynamic type check.
1834 // Smashes rscratch1, rscratch2.
1835 void generate_type_check(Register sub_klass,
1836 Register super_check_offset,
1837 Register super_klass,
1838 Register temp1,
1839 Register temp2,
1840 Register result,
1841 Label& L_success) {
1842 assert_different_registers(sub_klass, super_check_offset, super_klass);
1843
1844 BLOCK_COMMENT("type_check:");
1845
1846 Label L_miss;
1847
1848 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
1849 super_check_offset);
1850 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
1851
1852 // Fall through on failure!
1853 __ BIND(L_miss);
1854 }
1855
1856 //
1857 // Generate checkcasting array copy stub
1858 //
1859 // Input:
1860 // c_rarg0 - source array address
1861 // c_rarg1 - destination array address
1862 // c_rarg2 - element count, treated as ssize_t, can be zero
1863 // c_rarg3 - size_t ckoff (super_check_offset)
1864 // c_rarg4 - oop ckval (super_klass)
1865 //
1866 // Output:
1867 // r0 == 0 - success
1868 // r0 == -1^K - failure, where K is partial transfer count
1869 //
1870 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
1871 bool dest_uninitialized;
1872 switch (stub_id) {
1873 case StubId::stubgen_checkcast_arraycopy_id:
1874 dest_uninitialized = false;
1875 break;
1876 case StubId::stubgen_checkcast_arraycopy_uninit_id:
1877 dest_uninitialized = true;
1878 break;
1879 default:
1880 ShouldNotReachHere();
1881 }
1882
1883 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1884
1885 // Input registers (after setup_arg_regs)
1886 const Register from = c_rarg0; // source array address
1887 const Register to = c_rarg1; // destination array address
1888 const Register count = c_rarg2; // elements count
1889 const Register ckoff = c_rarg3; // super_check_offset
1890 const Register ckval = c_rarg4; // super_klass
1891
1892 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1893
1894 // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1895 const Register copied_oop = r22; // actual oop copied
1896 const Register count_save = r21; // original elements count
1897 const Register start_to = r20; // destination array start address
1898 const Register r19_klass = r19; // oop._klass
1899
1900 // Registers used as gc temps (r5, r6, r7 are save-on-call)
1901 const Register gct1 = r5, gct2 = r6, gct3 = r7;
1902
1903 //---------------------------------------------------------------
1904 // Assembler stub will be used for this call to arraycopy
1905 // if the two arrays are subtypes of Object[] but the
1906 // destination array type is not equal to or a supertype
1907 // of the source type. Each element must be separately
1908 // checked.
1909
1910 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1911 copied_oop, r19_klass, count_save);
1912
1913 __ align(CodeEntryAlignment);
1914 StubCodeMark mark(this, stub_id);
1915 address start = __ pc();
1916
1917 __ enter(); // required for proper stackwalking of RuntimeStub frame
1918
1919 #ifdef ASSERT
1920 // caller guarantees that the arrays really are different
1921 // otherwise, we would have to make conjoint checks
1922 { Label L;
1923 __ b(L); // conjoint check not yet implemented
1924 __ stop("checkcast_copy within a single array");
1925 __ bind(L);
1926 }
1927 #endif //ASSERT
1928
1929 // Caller of this entry point must set up the argument registers.
1930 if (nopush_entry != nullptr) {
1931 *nopush_entry = __ pc();
1932 BLOCK_COMMENT("Entry:");
1933 }
1934
1935 // Empty array: Nothing to do.
1936 __ cbz(count, L_done);
1937 __ push(RegSet::of(r19, r20, r21, r22), sp);
1938
1939 #ifdef ASSERT
1940 BLOCK_COMMENT("assert consistent ckoff/ckval");
1941 // The ckoff and ckval must be mutually consistent,
1942 // even though caller generates both.
1943 { Label L;
1944 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1945 __ ldrw(start_to, Address(ckval, sco_offset));
1946 __ cmpw(ckoff, start_to);
1947 __ br(Assembler::EQ, L);
1948 __ stop("super_check_offset inconsistent");
1949 __ bind(L);
1950 }
1951 #endif //ASSERT
1952
1953 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1954 bool is_oop = true;
1955 int element_size = UseCompressedOops ? 4 : 8;
1956 if (dest_uninitialized) {
1957 decorators |= IS_DEST_UNINITIALIZED;
1958 }
1959
1960 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1961 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1962
1963 // save the original count
1964 __ mov(count_save, count);
1965
1966 // Copy from low to high addresses
1967 __ mov(start_to, to); // Save destination array start address
1968 __ b(L_load_element);
1969
1970 // ======== begin loop ========
1971 // (Loop is rotated; its entry is L_load_element.)
1972 // Loop control:
1973 // for (; count != 0; count--) {
1974 // copied_oop = load_heap_oop(from++);
1975 // ... generate_type_check ...;
1976 // store_heap_oop(to++, copied_oop);
1977 // }
1978 __ align(OptoLoopAlignment);
1979
1980 __ BIND(L_store_element);
1981 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1982 __ post(to, element_size), copied_oop, noreg,
1983 gct1, gct2, gct3);
1984 __ sub(count, count, 1);
1985 __ cbz(count, L_do_card_marks);
1986
1987 // ======== loop entry is here ========
1988 __ BIND(L_load_element);
1989 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1990 copied_oop, noreg, __ post(from, element_size),
1991 gct1);
1992 __ cbz(copied_oop, L_store_element);
1993
1994 __ load_klass(r19_klass, copied_oop);// query the object klass
1995
1996 BLOCK_COMMENT("type_check:");
1997 generate_type_check(/*sub_klass*/r19_klass,
1998 /*super_check_offset*/ckoff,
1999 /*super_klass*/ckval,
2000 /*temp1*/gct1,
2001 /*temp2*/gct2,
2002 /*result*/r10, L_store_element);
2003
2004 // Fall through on failure!
2005
2006 // ======== end loop ========
2007
2008 // It was a real error; we must depend on the caller to finish the job.
2009 // Register count = remaining oops, count_save = total oops.
2010 // Emit GC store barriers for the oops we have copied and report
2011 // their number to the caller.
2012
2013 __ subs(count, count_save, count); // K = partially copied oop count
2014 __ eon(count, count, zr); // report (-1^K) to caller
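// (eon with zr is a bitwise NOT, so r0 will report ~K; the EQ branch below
// still tests the flags set by the subs above, i.e. K == 0 means no oops
// were copied and card marking can be skipped)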
2015 __ br(Assembler::EQ, L_done_pop);
2016
2017 __ BIND(L_do_card_marks);
2018 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
2019
2020 __ bind(L_done_pop);
2021 __ pop(RegSet::of(r19, r20, r21, r22), sp);
2022 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2023
2024 __ bind(L_done);
2025 __ mov(r0, count);
2026 __ leave();
2027 __ ret(lr);
2028
2029 return start;
2030 }
2031
2032 // Perform range checks on the proposed arraycopy.
2033 // Kills temp, but nothing else.
2034 // Also, clean the sign bits of src_pos and dst_pos.
2035 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2036 Register src_pos, // source position (c_rarg1)
2037 Register dst, // destination array oop (c_rarg2)
2038 Register dst_pos, // destination position (c_rarg3)
2039 Register length,
2040 Register temp,
2041 Label& L_failed) {
2042 BLOCK_COMMENT("arraycopy_range_checks:");
2043
2044 assert_different_registers(rscratch1, temp);
2045
2046 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2047 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2048 __ addw(temp, length, src_pos);
2049 __ cmpw(temp, rscratch1);
2050 __ br(Assembler::HI, L_failed);
2051
2052 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2053 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2054 __ addw(temp, length, dst_pos);
2055 __ cmpw(temp, rscratch1);
2056 __ br(Assembler::HI, L_failed);
2057
2058 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
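// (a 32-bit register-to-register move writes the w form of the register,
// which zero-extends into bits 63:32)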
2059 __ movw(src_pos, src_pos);
2060 __ movw(dst_pos, dst_pos);
2061
2062 BLOCK_COMMENT("arraycopy_range_checks done");
2063 }
2064
2065 // These stubs get called from some dumb test routine.
2066 // I'll write them properly when they're called from
2067 // something that's actually doing something.
2068 static void fake_arraycopy_stub(address src, address dst, int count) {
2069 assert(count == 0, "huh?");
2070 }
2071
2072
2073 //
2074 // Generate 'unsafe' array copy stub
2075 // Though just as safe as the other stubs, it takes an unscaled
2076 // size_t argument instead of an element count.
2077 //
2078 // Input:
2079 // c_rarg0 - source array address
2080 // c_rarg1 - destination array address
2081 // c_rarg2 - byte count, treated as ssize_t, can be zero
2082 //
2083 // Examines the alignment of the operands and dispatches
2084 // to a long, int, short, or byte copy loop.
2085 //
2086 address generate_unsafe_copy(address byte_copy_entry,
2087 address short_copy_entry,
2088 address int_copy_entry,
2089 address long_copy_entry) {
2090 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2091
2092 Label L_long_aligned, L_int_aligned, L_short_aligned;
2093 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2094
2095 __ align(CodeEntryAlignment);
2096 StubCodeMark mark(this, stub_id);
2097 address start = __ pc();
2098 __ enter(); // required for proper stackwalking of RuntimeStub frame
2099
2100 // bump this on entry, not on exit:
2101 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2102
2103 __ orr(rscratch1, s, d);
2104 __ orr(rscratch1, rscratch1, count);
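// rscratch1 now holds s | d | count, so its low bits record the worst-case
// misalignment of source, destination and byte count combined.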
2105
2106 __ andr(rscratch1, rscratch1, BytesPerLong-1);
2107 __ cbz(rscratch1, L_long_aligned);
2108 __ andr(rscratch1, rscratch1, BytesPerInt-1);
2109 __ cbz(rscratch1, L_int_aligned);
2110 __ tbz(rscratch1, 0, L_short_aligned);
2111 __ b(RuntimeAddress(byte_copy_entry));
2112
2113 __ BIND(L_short_aligned);
2114 __ lsr(count, count, LogBytesPerShort); // size => short_count
2115 __ b(RuntimeAddress(short_copy_entry));
2116 __ BIND(L_int_aligned);
2117 __ lsr(count, count, LogBytesPerInt); // size => int_count
2118 __ b(RuntimeAddress(int_copy_entry));
2119 __ BIND(L_long_aligned);
2120 __ lsr(count, count, LogBytesPerLong); // size => long_count
2121 __ b(RuntimeAddress(long_copy_entry));
2122
2123 return start;
2124 }
2125
2126 //
2127 // Generate generic array copy stubs
2128 //
2129 // Input:
2130 // c_rarg0 - src oop
2131 // c_rarg1 - src_pos (32-bits)
2132 // c_rarg2 - dst oop
2133 // c_rarg3 - dst_pos (32-bits)
2134 // c_rarg4 - element count (32-bits)
2135 //
2136 // Output:
2137 // r0 == 0 - success
2138 // r0 == -1^K - failure, where K is partial transfer count
2139 //
2140 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2141 address int_copy_entry, address oop_copy_entry,
2142 address long_copy_entry, address checkcast_copy_entry) {
2143 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2144
2145 Label L_failed, L_objArray;
2146 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2147
2148 // Input registers
2149 const Register src = c_rarg0; // source array oop
2150 const Register src_pos = c_rarg1; // source position
2151 const Register dst = c_rarg2; // destination array oop
2152 const Register dst_pos = c_rarg3; // destination position
2153 const Register length = c_rarg4;
2154
2155
2156 // Registers used as temps
2157 const Register dst_klass = c_rarg5;
2158
2159 __ align(CodeEntryAlignment);
2160
2161 StubCodeMark mark(this, stub_id);
2162
2163 address start = __ pc();
2164
2165 __ enter(); // required for proper stackwalking of RuntimeStub frame
2166
2167 // bump this on entry, not on exit:
2168 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2169
2170 //-----------------------------------------------------------------------
2171 // Assembler stub will be used for this call to arraycopy
2172 // if the following conditions are met:
2173 //
2174 // (1) src and dst must not be null.
2175 // (2) src_pos must not be negative.
2176 // (3) dst_pos must not be negative.
2177 // (4) length must not be negative.
2178 // (5) src klass and dst klass should be the same and not null.
2179 // (6) src and dst should be arrays.
2180 // (7) src_pos + length must not exceed length of src.
2181 // (8) dst_pos + length must not exceed length of dst.
2182 //
2183
2184 // if (src == nullptr) return -1;
2185 __ cbz(src, L_failed);
2186
2187 // if (src_pos < 0) return -1;
2188 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2189
2190 // if (dst == nullptr) return -1;
2191 __ cbz(dst, L_failed);
2192
2193 // if (dst_pos < 0) return -1;
2194 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2195
2196 // registers used as temp
2197 const Register scratch_length = r16; // elements count to copy
2198 const Register scratch_src_klass = r17; // array klass
2199 const Register lh = r15; // layout helper
2200
2201 // if (length < 0) return -1;
2202 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2203 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2204
2205 __ load_klass(scratch_src_klass, src);
2206 #ifdef ASSERT
2207 // assert(src->klass() != nullptr);
2208 {
2209 BLOCK_COMMENT("assert klasses not null {");
2210 Label L1, L2;
2211 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2212 __ bind(L1);
2213 __ stop("broken null klass");
2214 __ bind(L2);
2215 __ load_klass(rscratch1, dst);
2216 __ cbz(rscratch1, L1); // this would be broken also
2217 BLOCK_COMMENT("} assert klasses not null done");
2218 }
2219 #endif
2220
2221 // Load layout helper (32-bits)
2222 //
2223 //  |array_tag|     | header_size | element_type |     |log2_element_size|
2224 // 32        30    24            16              8     2                 0
2225 //
2226 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2227 //
2228
2229 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2230
2231 // Handle objArrays completely differently...
2232 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2233 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2234 __ movw(rscratch1, objArray_lh);
2235 __ eorw(rscratch2, lh, rscratch1);
2236 __ cbzw(rscratch2, L_objArray);
2237
2238 // if (src->klass() != dst->klass()) return -1;
2239 __ load_klass(rscratch2, dst);
2240 __ eor(rscratch2, rscratch2, scratch_src_klass);
2241 __ cbnz(rscratch2, L_failed);
2242
2243 // Check for flat inline type array -> return -1
2244 __ test_flat_array_oop(src, rscratch2, L_failed);
2245
2246 // Check for null-free (non-flat) inline type array -> handle as object array
2247 __ test_null_free_array_oop(src, rscratch2, L_objArray);
2248
2249 // if (!src->is_Array()) return -1;
2250 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
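// (array layout helpers are negative because the array tag occupies the top
// bits, so a clear sign bit means src is not an array)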
2251
2252 // At this point, it is known to be a typeArray (array_tag 0x3).
2253 #ifdef ASSERT
2254 {
2255 BLOCK_COMMENT("assert primitive array {");
2256 Label L;
2257 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2258 __ cmpw(lh, rscratch2);
2259 __ br(Assembler::GE, L);
2260 __ stop("must be a primitive array");
2261 __ bind(L);
2262 BLOCK_COMMENT("} assert primitive array done");
2263 }
2264 #endif
2265
2266 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2267 rscratch2, L_failed);
2268
2269 // TypeArrayKlass
2270 //
2271 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2272 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2273 //
2274
2275 const Register rscratch1_offset = rscratch1; // array offset
2276 const Register r15_elsize = lh; // element size
2277
2278 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2279 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2280 __ add(src, src, rscratch1_offset); // src array offset
2281 __ add(dst, dst, rscratch1_offset); // dst array offset
2282 BLOCK_COMMENT("choose copy loop based on element size");
2283
2284 // next registers should be set before the jump to corresponding stub
2285 const Register from = c_rarg0; // source array address
2286 const Register to = c_rarg1; // destination array address
2287 const Register count = c_rarg2; // elements count
2288
2289 // 'from', 'to', 'count' must be set in this exact order, since
2290 // they alias the incoming 'src', 'src_pos' and 'dst' registers.
2291
2292 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2293
2294 // The possible values of elsize are 0-3, i.e. exact_log2(element
2295 // size in bytes). We do a simple bitwise binary search.
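// (sketch) the two low bits select the loop:
//   00 -> byte, 01 -> short, 10 -> int, 11 -> long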
2296 __ BIND(L_copy_bytes);
2297 __ tbnz(r15_elsize, 1, L_copy_ints);
2298 __ tbnz(r15_elsize, 0, L_copy_shorts);
2299 __ lea(from, Address(src, src_pos));// src_addr
2300 __ lea(to, Address(dst, dst_pos));// dst_addr
2301 __ movw(count, scratch_length); // length
2302 __ b(RuntimeAddress(byte_copy_entry));
2303
2304 __ BIND(L_copy_shorts);
2305 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2306 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2307 __ movw(count, scratch_length); // length
2308 __ b(RuntimeAddress(short_copy_entry));
2309
2310 __ BIND(L_copy_ints);
2311 __ tbnz(r15_elsize, 0, L_copy_longs);
2312 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2313 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2314 __ movw(count, scratch_length); // length
2315 __ b(RuntimeAddress(int_copy_entry));
2316
2317 __ BIND(L_copy_longs);
2318 #ifdef ASSERT
2319 {
2320 BLOCK_COMMENT("assert long copy {");
2321 Label L;
2322 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2323 __ cmpw(r15_elsize, LogBytesPerLong);
2324 __ br(Assembler::EQ, L);
2325 __ stop("must be long copy, but elsize is wrong");
2326 __ bind(L);
2327 BLOCK_COMMENT("} assert long copy done");
2328 }
2329 #endif
2330 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2331 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2332 __ movw(count, scratch_length); // length
2333 __ b(RuntimeAddress(long_copy_entry));
2334
2335 // ObjArrayKlass
2336 __ BIND(L_objArray);
2337 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2338
2339 Label L_plain_copy, L_checkcast_copy;
2340 // test array classes for subtyping
2341 __ load_klass(r15, dst);
2342 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2343 __ br(Assembler::NE, L_checkcast_copy);
2344
2345 // Identically typed arrays can be copied without element-wise checks.
2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2347 rscratch2, L_failed);
2348
2349 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2350 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2351 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2352 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2353 __ movw(count, scratch_length); // length
2354 __ BIND(L_plain_copy);
2355 __ b(RuntimeAddress(oop_copy_entry));
2356
2357 __ BIND(L_checkcast_copy);
2358 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2359 {
2360 // Before looking at dst.length, make sure dst is also an objArray.
2361 __ ldrw(rscratch1, Address(r15, lh_offset));
2362 __ movw(rscratch2, objArray_lh);
2363 __ eorw(rscratch1, rscratch1, rscratch2);
2364 __ cbnzw(rscratch1, L_failed);
2365
2366 // It is safe to examine both src.length and dst.length.
2367 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2368 r15, L_failed);
2369
2370 __ load_klass(dst_klass, dst); // reload
2371
2372 // Marshal the base address arguments now, freeing registers.
2373 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2374 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2375 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2376 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2377 __ movw(count, length); // length (reloaded)
2378 Register sco_temp = c_rarg3; // this register is free now
2379 assert_different_registers(from, to, count, sco_temp,
2380 dst_klass, scratch_src_klass);
2381 // assert_clean_int(count, sco_temp);
2382
2383 // Generate the type check.
2384 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2385 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2386
2387 // Smashes rscratch1, rscratch2
2388 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2389 L_plain_copy);
2390
2391 // Fetch destination element klass from the ObjArrayKlass header.
2392 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2393 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2394 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2395
2396 // the checkcast_copy loop needs two extra arguments:
2397 assert(c_rarg3 == sco_temp, "#3 already in place");
2398 // Set up arguments for checkcast_copy_entry.
2399 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2400 __ b(RuntimeAddress(checkcast_copy_entry));
2401 }
2402
2403 __ BIND(L_failed);
2404 __ mov(r0, -1);
2405 __ leave(); // required for proper stackwalking of RuntimeStub frame
2406 __ ret(lr);
2407
2408 return start;
2409 }
2410
2411 //
2412 // Generate stub for array fill. If "aligned" is true, the
2413 // "to" address is assumed to be heapword aligned.
2414 //
2415 // Arguments for generated stub:
2416 // to: c_rarg0
2417 // value: c_rarg1
2418 // count: c_rarg2 treated as signed
2419 //
2420 address generate_fill(StubId stub_id) {
2421 BasicType t;
2422 bool aligned;
2423
2424 switch (stub_id) {
2425 case StubId::stubgen_jbyte_fill_id:
2426 t = T_BYTE;
2427 aligned = false;
2428 break;
2429 case StubId::stubgen_jshort_fill_id:
2430 t = T_SHORT;
2431 aligned = false;
2432 break;
2433 case StubId::stubgen_jint_fill_id:
2434 t = T_INT;
2435 aligned = false;
2436 break;
2437 case StubId::stubgen_arrayof_jbyte_fill_id:
2438 t = T_BYTE;
2439 aligned = true;
2440 break;
2441 case StubId::stubgen_arrayof_jshort_fill_id:
2442 t = T_SHORT;
2443 aligned = true;
2444 break;
2445 case StubId::stubgen_arrayof_jint_fill_id:
2446 t = T_INT;
2447 aligned = true;
2448 break;
2449 default:
2450 ShouldNotReachHere();
2451 };
2452
2453 __ align(CodeEntryAlignment);
2454 StubCodeMark mark(this, stub_id);
2455 address start = __ pc();
2456
2457 BLOCK_COMMENT("Entry:");
2458
2459 const Register to = c_rarg0; // destination array address
2460 const Register value = c_rarg1; // value
2461 const Register count = c_rarg2; // elements count
2462
2463 const Register bz_base = r10; // base for block_zero routine
2464 const Register cnt_words = r11; // temp register
2465
2466 __ enter();
2467
2468 Label L_fill_elements, L_exit1;
2469
2470 int shift = -1;
2471 switch (t) {
2472 case T_BYTE:
2473 shift = 0;
2474 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2475 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2476 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2477 __ br(Assembler::LO, L_fill_elements);
2478 break;
2479 case T_SHORT:
2480 shift = 1;
2481 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2482 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2483 __ br(Assembler::LO, L_fill_elements);
2484 break;
2485 case T_INT:
2486 shift = 2;
2487 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2488 __ br(Assembler::LO, L_fill_elements);
2489 break;
2490 default: ShouldNotReachHere();
2491 }
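// Whichever path is taken below, 'value' now holds the fill pattern
// replicated across the low 32 bits (e.g. a byte fill of 0xAB is now
// 0xABABABAB); the bulk path widens it to 64 bits further down.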
2492
2493 // Align source address at 8 bytes address boundary.
2494 Label L_skip_align1, L_skip_align2, L_skip_align4;
2495 if (!aligned) {
2496 switch (t) {
2497 case T_BYTE:
2498 // One byte misalignment happens only for byte arrays.
2499 __ tbz(to, 0, L_skip_align1);
2500 __ strb(value, Address(__ post(to, 1)));
2501 __ subw(count, count, 1);
2502 __ bind(L_skip_align1);
2503 // Fallthrough
2504 case T_SHORT:
2505 // Two bytes misalignment happens only for byte and short (char) arrays.
2506 __ tbz(to, 1, L_skip_align2);
2507 __ strh(value, Address(__ post(to, 2)));
2508 __ subw(count, count, 2 >> shift);
2509 __ bind(L_skip_align2);
2510 // Fallthrough
2511 case T_INT:
2512 // Align to 8 bytes, we know we are 4 byte aligned to start.
2513 __ tbz(to, 2, L_skip_align4);
2514 __ strw(value, Address(__ post(to, 4)));
2515 __ subw(count, count, 4 >> shift);
2516 __ bind(L_skip_align4);
2517 break;
2518 default: ShouldNotReachHere();
2519 }
2520 }
2521
2522 //
2523 // Fill large chunks
2524 //
2525 __ lsrw(cnt_words, count, 3 - shift); // number of words
2526 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2527 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2528 if (UseBlockZeroing) {
2529 Label non_block_zeroing, rest;
2530 // If the fill value is zero we can use the fast zero_words().
2531 __ cbnz(value, non_block_zeroing);
2532 __ mov(bz_base, to);
2533 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2534 address tpc = __ zero_words(bz_base, cnt_words);
2535 if (tpc == nullptr) {
2536 fatal("CodeCache is full at generate_fill");
2537 }
2538 __ b(rest);
2539 __ bind(non_block_zeroing);
2540 __ fill_words(to, cnt_words, value);
2541 __ bind(rest);
2542 } else {
2543 __ fill_words(to, cnt_words, value);
2544 }
2545
2546 // Remaining count is less than 8 bytes. Fill it by a single store.
2547 // Note that the total length is no less than 8 bytes.
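// (sketch) 'to' is advanced to the end of the region and an 8-byte store is
// made at (end - 8); it may rewrite a few bytes that were already filled,
// which is harmless because every byte receives the same value.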
2548 if (t == T_BYTE || t == T_SHORT) {
2549 Label L_exit1;
2550 __ cbzw(count, L_exit1);
2551 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2552 __ str(value, Address(to, -8)); // overwrite some elements
2553 __ bind(L_exit1);
2554 __ leave();
2555 __ ret(lr);
2556 }
2557
2558 // Handle fills of less than 8 bytes.
2559 Label L_fill_2, L_fill_4, L_exit2;
2560 __ bind(L_fill_elements);
2561 switch (t) {
2562 case T_BYTE:
2563 __ tbz(count, 0, L_fill_2);
2564 __ strb(value, Address(__ post(to, 1)));
2565 __ bind(L_fill_2);
2566 __ tbz(count, 1, L_fill_4);
2567 __ strh(value, Address(__ post(to, 2)));
2568 __ bind(L_fill_4);
2569 __ tbz(count, 2, L_exit2);
2570 __ strw(value, Address(to));
2571 break;
2572 case T_SHORT:
2573 __ tbz(count, 0, L_fill_4);
2574 __ strh(value, Address(__ post(to, 2)));
2575 __ bind(L_fill_4);
2576 __ tbz(count, 1, L_exit2);
2577 __ strw(value, Address(to));
2578 break;
2579 case T_INT:
2580 __ cbzw(count, L_exit2);
2581 __ strw(value, Address(to));
2582 break;
2583 default: ShouldNotReachHere();
2584 }
2585 __ bind(L_exit2);
2586 __ leave();
2587 __ ret(lr);
2588 return start;
2589 }
2590
2591 address generate_unsafecopy_common_error_exit() {
2592 address start_pc = __ pc();
2593 __ leave();
2594 __ mov(r0, 0);
2595 __ ret(lr);
2596 return start_pc;
2597 }
2598
2599 //
2600 // Generate 'unsafe' set memory stub
2601 // Though just as safe as the other stubs, it takes an unscaled
2602 // size_t (# bytes) argument instead of an element count.
2603 //
2604 // This fill operation is atomicity preserving: as long as the
2605 // address supplied is sufficiently aligned, all writes of up to 64
2606 // bits in size are single-copy atomic.
2607 //
2608 // Input:
2609 // c_rarg0 - destination array address
2610 // c_rarg1 - byte count (size_t)
2611 // c_rarg2 - byte value
2612 //
2613 address generate_unsafe_setmemory() {
2614 __ align(CodeEntryAlignment);
2615 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
2616 address start = __ pc();
2617
2618 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
2619 Label tail;
2620
2621 UnsafeMemoryAccessMark umam(this, true, false);
2622
2623 __ enter(); // required for proper stackwalking of RuntimeStub frame
2624
2625 __ dup(v0, __ T16B, value);
2626
2627 if (AvoidUnalignedAccesses) {
2628 __ cmp(count, (u1)16);
2629 __ br(__ LO, tail);
2630
2631 __ mov(rscratch1, 16);
2632 __ andr(rscratch2, dest, 15);
2633 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
2634 __ strq(v0, Address(dest));
2635 __ sub(count, count, rscratch1);
2636 __ add(dest, dest, rscratch1);
2637 }
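// (sketch) the strq above is an unaligned 16-byte store that covers at least
// the head bytes needed to reach a 16-byte boundary; dest then advances to
// that boundary. Bytes written twice simply receive the same value again,
// so the overlap is harmless.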
2638
2639 __ subs(count, count, (u1)64);
2640 __ br(__ LO, tail);
2641 {
2642 Label again;
2643 __ bind(again);
2644 __ stpq(v0, v0, Address(dest));
2645 __ stpq(v0, v0, Address(dest, 32));
2646
2647 __ subs(count, count, 64);
2648 __ add(dest, dest, 64);
2649 __ br(__ HS, again);
2650 }
2651
2652 __ bind(tail);
2653 // The count of bytes is off by 64, but we don't need to correct
2654 // it because we're only going to use the least-significant few
2655 // count bits from here on.
2656 // __ add(count, count, 64);
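// Each block below tests one size bit of count (32, 16, 8, then, after the
// 7-byte check, 4, 2 and 1) and issues at most one store per bit, so the
// tail completes without a loop.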
2657
2658 {
2659 Label dont;
2660 __ tbz(count, exact_log2(32), dont);
2661 __ stpq(v0, v0, __ post(dest, 32));
2662 __ bind(dont);
2663 }
2664 {
2665 Label dont;
2666 __ tbz(count, exact_log2(16), dont);
2667 __ strq(v0, __ post(dest, 16));
2668 __ bind(dont);
2669 }
2670 {
2671 Label dont;
2672 __ tbz(count, exact_log2(8), dont);
2673 __ strd(v0, __ post(dest, 8));
2674 __ bind(dont);
2675 }
2676
2677 Label finished;
2678 __ tst(count, 7);
2679 __ br(__ EQ, finished);
2680
2681 {
2682 Label dont;
2683 __ tbz(count, exact_log2(4), dont);
2684 __ strs(v0, __ post(dest, 4));
2685 __ bind(dont);
2686 }
2687 {
2688 Label dont;
2689 __ tbz(count, exact_log2(2), dont);
2690 __ bfi(value, value, 8, 8);
2691 __ strh(value, __ post(dest, 2));
2692 __ bind(dont);
2693 }
2694 {
2695 Label dont;
2696 __ tbz(count, exact_log2(1), dont);
2697 __ strb(value, Address(dest));
2698 __ bind(dont);
2699 }
2700
2701 __ bind(finished);
2702 __ leave();
2703 __ ret(lr);
2704
2705 return start;
2706 }
2707
2708 address generate_data_cache_writeback() {
2709 const Register line = c_rarg0; // address of line to write back
2710
2711 __ align(CodeEntryAlignment);
2712
2713 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
2714 StubCodeMark mark(this, stub_id);
2715
2716 address start = __ pc();
2717 __ enter();
2718 __ cache_wb(Address(line, 0));
2719 __ leave();
2720 __ ret(lr);
2721
2722 return start;
2723 }
2724
2725 address generate_data_cache_writeback_sync() {
2726 const Register is_pre = c_rarg0; // pre or post sync
2727
2728 __ align(CodeEntryAlignment);
2729
2730 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
2731 StubCodeMark mark(this, stub_id);
2732
2733 // pre wbsync is a no-op
2734 // post wbsync needs a memory barrier (the equivalent of an x86 sfence)
2735
2736 Label skip;
2737 address start = __ pc();
2738 __ enter();
2739 __ cbnz(is_pre, skip);
2740 __ cache_wbsync(false);
2741 __ bind(skip);
2742 __ leave();
2743 __ ret(lr);
2744
2745 return start;
2746 }
2747
2748 void generate_arraycopy_stubs() {
2749 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
2750 // entry immediately following their stack push. This can be used
2751 // as a post-push branch target for compatible stubs when they
2752 // identify a special case that can be handled by the fallback
2753 // stub, e.g. a disjoint copy stub may be used as a special-case
2754 // fallback for its compatible conjoint copy stub.
2755 //
2756 // A nopush entry is always returned via the following local and
2757 // then published by assigning to the appropriate entry field in
2758 // class StubRoutines. The entry value is then passed to the
2759 // generator for the compatible stub. That means the entry must be
2760 // listed when saving to/restoring from the AOT cache, ensuring
2761 // that the inter-stub jumps are noted at AOT-cache save and
2762 // relocated at AOT cache load.
2763 address nopush_entry;
2764
2765 // generate the common exit first so later stubs can rely on it if
2766 // they want an UnsafeMemoryAccess exit non-local to the stub
2767 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2768 // register the stub as the default exit with class UnsafeMemoryAccess
2769 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2770
2771 // generate and publish aarch64-specific bulk copy routines first
2772 // so we can call them from other copy stubs
2773 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2774 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2775
2776 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2777 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2778
2779 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2780 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2781
2782 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2783
2784 //*** jbyte
2785 // Always need aligned and unaligned versions
2786 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2787 // disjoint nopush entry is needed by conjoint copy
2788 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2789 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2790 // conjoint nopush entry is needed by generic/unsafe copy
2791 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2792 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2793 // disjoint arrayof nopush entry is needed by conjoint copy
2794 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2795 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2796
2797 //*** jshort
2798 // Always need aligned and unaligned versions
2799 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2800 // disjoint nopush entry is needed by conjoint copy
2801 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
2802 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
2803 // conjoint nopush entry is used by generic/unsafe copy
2804 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
2805 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
2806 // disjoint arrayof nopush entry is needed by conjoint copy
2807 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
2808 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
2809
2810 //*** jint
2811 // Aligned versions
2812 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
2813 // disjoint arrayof nopush entry is needed by conjoint copy
2814 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
2815 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
2816 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2817 // jint_arraycopy_nopush always points to the unaligned version
2818 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
2819 // disjoint nopush entry is needed by conjoint copy
2820 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
2821 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
2822 // conjoint nopush entry is needed by generic/unsafe copy
2823 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
2824
2825 //*** jlong
2826 // It is always aligned
2827 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
2828 // disjoint arrayof nopush entry is needed by conjoint copy
2829 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
2830 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
2831 // conjoint nopush entry is needed by generic/unsafe copy
2832 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
2833 // disjoint normal/nopush and conjoint normal entries are not
2834 // generated since the arrayof versions are the same
2835 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2836 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
2837 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
2838
2839 //*** oops
2840 {
2841 StubRoutines::_arrayof_oop_disjoint_arraycopy
2842 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
2843 // disjoint arrayof nopush entry is needed by conjoint copy
2844 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
2845 StubRoutines::_arrayof_oop_arraycopy
2846 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
2847 // conjoint arrayof nopush entry is needed by generic/unsafe copy
2848 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
2849 // Aligned versions without pre-barriers
2850 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2851 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
2852 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
2853 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
2854 // note that we don't need a returned nopush entry because the
2855 // generic/unsafe copy does not cater for uninit arrays.
2856 StubRoutines::_arrayof_oop_arraycopy_uninit
2857 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
2858 }
2859
2860 // for oop copies reuse arrayof entries for non-arrayof cases
2861 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2862 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
2863 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
2864 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2865 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
2866 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
2867
2868 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
2869 // checkcast nopush entry is needed by generic copy
2870 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
2871 // note that we don't need a returned nopush entry because the
2872 // generic copy does not cater for uninit arrays.
2873 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
2874
2875 // unsafe arraycopy may fallback on conjoint stubs
2876 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
2877 StubRoutines::_jshort_arraycopy_nopush,
2878 StubRoutines::_jint_arraycopy_nopush,
2879 StubRoutines::_jlong_arraycopy_nopush);
2880
2881 // generic arraycopy may fallback on conjoint stubs
2882 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
2883 StubRoutines::_jshort_arraycopy_nopush,
2884 StubRoutines::_jint_arraycopy_nopush,
2885 StubRoutines::_oop_arraycopy_nopush,
2886 StubRoutines::_jlong_arraycopy_nopush,
2887 StubRoutines::_checkcast_arraycopy_nopush);
2888
2889 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
2890 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
2891 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
2892 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
2893 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
2894 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
2895 }
2896
2897 void generate_math_stubs() { Unimplemented(); }
2898
2899 // Arguments:
2900 //
2901 // Inputs:
2902 // c_rarg0 - source byte array address
2903 // c_rarg1 - destination byte array address
2904 // c_rarg2 - K (key) in little endian int array
2905 //
2906 address generate_aescrypt_encryptBlock() {
2907 __ align(CodeEntryAlignment);
2908 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2909 StubCodeMark mark(this, stub_id);
2910
2911 const Register from = c_rarg0; // source array address
2912 const Register to = c_rarg1; // destination array address
2913 const Register key = c_rarg2; // key array address
2914 const Register keylen = rscratch1;
2915
2916 address start = __ pc();
2917 __ enter();
2918
2919 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2920
2921 __ aesenc_loadkeys(key, keylen);
2922 __ aesecb_encrypt(from, to, keylen);
2923
2924 __ mov(r0, 0);
2925
2926 __ leave();
2927 __ ret(lr);
2928
2929 return start;
2930 }
2931
2932 // Arguments:
2933 //
2934 // Inputs:
2935 // c_rarg0 - source byte array address
2936 // c_rarg1 - destination byte array address
2937 // c_rarg2 - K (key) in little endian int array
2938 //
2939 address generate_aescrypt_decryptBlock() {
2940 assert(UseAES, "need AES cryptographic extension support");
2941 __ align(CodeEntryAlignment);
2942 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2943 StubCodeMark mark(this, stub_id);
2944 Label L_doLast;
2945
2946 const Register from = c_rarg0; // source array address
2947 const Register to = c_rarg1; // destination array address
2948 const Register key = c_rarg2; // key array address
2949 const Register keylen = rscratch1;
2950
2951 address start = __ pc();
2952 __ enter(); // required for proper stackwalking of RuntimeStub frame
2953
2954 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2955
2956 __ aesecb_decrypt(from, to, key, keylen);
2957
2958 __ mov(r0, 0);
2959
2960 __ leave();
2961 __ ret(lr);
2962
2963 return start;
2964 }
2965
2966 // Arguments:
2967 //
2968 // Inputs:
2969 // c_rarg0 - source byte array address
2970 // c_rarg1 - destination byte array address
2971 // c_rarg2 - K (key) in little endian int array
2972 // c_rarg3 - r vector byte array address
2973 // c_rarg4 - input length
2974 //
2975 // Output:
2976 // x0 - input length
2977 //
2978 address generate_cipherBlockChaining_encryptAESCrypt() {
2979 assert(UseAES, "need AES cryptographic extension support");
2980 __ align(CodeEntryAlignment);
2981 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
2982 StubCodeMark mark(this, stub_id);
2983
2984 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2985
2986 const Register from = c_rarg0; // source array address
2987 const Register to = c_rarg1; // destination array address
2988 const Register key = c_rarg2; // key array address
2989 const Register rvec = c_rarg3; // r vector byte array address: initialized from the init vector
2990 // and left holding the last ciphertext block
2991 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
2992 const Register keylen = rscratch1;
2993
2994 address start = __ pc();
2995
2996 __ enter();
2997
2998 __ movw(rscratch2, len_reg);
2999
3000 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3001
3002 __ ld1(v0, __ T16B, rvec);
3003
3004 __ cmpw(keylen, 52);
3005 __ br(Assembler::CC, L_loadkeys_44);
3006 __ br(Assembler::EQ, L_loadkeys_52);
3007
3008 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3009 __ rev32(v17, __ T16B, v17);
3010 __ rev32(v18, __ T16B, v18);
3011 __ BIND(L_loadkeys_52);
3012 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3013 __ rev32(v19, __ T16B, v19);
3014 __ rev32(v20, __ T16B, v20);
3015 __ BIND(L_loadkeys_44);
3016 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3017 __ rev32(v21, __ T16B, v21);
3018 __ rev32(v22, __ T16B, v22);
3019 __ rev32(v23, __ T16B, v23);
3020 __ rev32(v24, __ T16B, v24);
3021 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3022 __ rev32(v25, __ T16B, v25);
3023 __ rev32(v26, __ T16B, v26);
3024 __ rev32(v27, __ T16B, v27);
3025 __ rev32(v28, __ T16B, v28);
3026 __ ld1(v29, v30, v31, __ T16B, key);
3027 __ rev32(v29, __ T16B, v29);
3028 __ rev32(v30, __ T16B, v30);
3029 __ rev32(v31, __ T16B, v31);
3030
3031 __ BIND(L_aes_loop);
3032 __ ld1(v1, __ T16B, __ post(from, 16));
3033 __ eor(v0, __ T16B, v0, v1);
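// CBC: XOR the plaintext block into the chaining value in v0 (the previous
// ciphertext block, initially the IV) before encrypting it.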
3034
3035 __ br(Assembler::CC, L_rounds_44);
3036 __ br(Assembler::EQ, L_rounds_52);
3037
3038 __ aese(v0, v17); __ aesmc(v0, v0);
3039 __ aese(v0, v18); __ aesmc(v0, v0);
3040 __ BIND(L_rounds_52);
3041 __ aese(v0, v19); __ aesmc(v0, v0);
3042 __ aese(v0, v20); __ aesmc(v0, v0);
3043 __ BIND(L_rounds_44);
3044 __ aese(v0, v21); __ aesmc(v0, v0);
3045 __ aese(v0, v22); __ aesmc(v0, v0);
3046 __ aese(v0, v23); __ aesmc(v0, v0);
3047 __ aese(v0, v24); __ aesmc(v0, v0);
3048 __ aese(v0, v25); __ aesmc(v0, v0);
3049 __ aese(v0, v26); __ aesmc(v0, v0);
3050 __ aese(v0, v27); __ aesmc(v0, v0);
3051 __ aese(v0, v28); __ aesmc(v0, v0);
3052 __ aese(v0, v29); __ aesmc(v0, v0);
3053 __ aese(v0, v30);
3054 __ eor(v0, __ T16B, v0, v31);
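// Final round: AESE without AESMC (the last AES round has no MixColumns),
// followed by an XOR with the final round key held in v31.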
3055
3056 __ st1(v0, __ T16B, __ post(to, 16));
3057
3058 __ subw(len_reg, len_reg, 16);
3059 __ cbnzw(len_reg, L_aes_loop);
3060
3061 __ st1(v0, __ T16B, rvec);
3062
3063 __ mov(r0, rscratch2);
3064
3065 __ leave();
3066 __ ret(lr);
3067
3068 return start;
3069 }
3070
3071 // Arguments:
3072 //
3073 // Inputs:
3074 // c_rarg0 - source byte array address
3075 // c_rarg1 - destination byte array address
3076 // c_rarg2 - K (key) in little endian int array
3077 // c_rarg3 - r vector byte array address
3078 // c_rarg4 - input length
3079 //
3080 // Output:
3081 // r0 - input length
3082 //
3083 address generate_cipherBlockChaining_decryptAESCrypt() {
3084 assert(UseAES, "need AES cryptographic extension support");
3085 __ align(CodeEntryAlignment);
3086 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
3087 StubCodeMark mark(this, stub_id);
3088
3089 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3090
3091 const Register from = c_rarg0; // source array address
3092 const Register to = c_rarg1; // destination array address
3093 const Register key = c_rarg2; // key array address
3094 const Register rvec = c_rarg3; // r vector byte array address: initialized from the init vector
3095 // and left holding the last ciphertext block
3096 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3097 const Register keylen = rscratch1;
3098
3099 address start = __ pc();
3100
3101 __ enter();
3102
3103 __ movw(rscratch2, len_reg);
3104
3105 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3106
3107 __ ld1(v2, __ T16B, rvec);
3108
3109 __ ld1(v31, __ T16B, __ post(key, 16));
3110 __ rev32(v31, __ T16B, v31);
3111
3112 __ cmpw(keylen, 52);
3113 __ br(Assembler::CC, L_loadkeys_44);
3114 __ br(Assembler::EQ, L_loadkeys_52);
3115
3116 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3117 __ rev32(v17, __ T16B, v17);
3118 __ rev32(v18, __ T16B, v18);
3119 __ BIND(L_loadkeys_52);
3120 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3121 __ rev32(v19, __ T16B, v19);
3122 __ rev32(v20, __ T16B, v20);
3123 __ BIND(L_loadkeys_44);
3124 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3125 __ rev32(v21, __ T16B, v21);
3126 __ rev32(v22, __ T16B, v22);
3127 __ rev32(v23, __ T16B, v23);
3128 __ rev32(v24, __ T16B, v24);
3129 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3130 __ rev32(v25, __ T16B, v25);
3131 __ rev32(v26, __ T16B, v26);
3132 __ rev32(v27, __ T16B, v27);
3133 __ rev32(v28, __ T16B, v28);
3134 __ ld1(v29, v30, __ T16B, key);
3135 __ rev32(v29, __ T16B, v29);
3136 __ rev32(v30, __ T16B, v30);
3137
3138 __ BIND(L_aes_loop);
3139 __ ld1(v0, __ T16B, __ post(from, 16));
3140 __ orr(v1, __ T16B, v0, v0);
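// keep a copy of the input ciphertext block; it becomes the chaining value
// for the next iteration (see the move into v2 after the store below).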
3141
3142 __ br(Assembler::CC, L_rounds_44);
3143 __ br(Assembler::EQ, L_rounds_52);
3144
3145 __ aesd(v0, v17); __ aesimc(v0, v0);
3146 __ aesd(v0, v18); __ aesimc(v0, v0);
3147 __ BIND(L_rounds_52);
3148 __ aesd(v0, v19); __ aesimc(v0, v0);
3149 __ aesd(v0, v20); __ aesimc(v0, v0);
3150 __ BIND(L_rounds_44);
3151 __ aesd(v0, v21); __ aesimc(v0, v0);
3152 __ aesd(v0, v22); __ aesimc(v0, v0);
3153 __ aesd(v0, v23); __ aesimc(v0, v0);
3154 __ aesd(v0, v24); __ aesimc(v0, v0);
3155 __ aesd(v0, v25); __ aesimc(v0, v0);
3156 __ aesd(v0, v26); __ aesimc(v0, v0);
3157 __ aesd(v0, v27); __ aesimc(v0, v0);
3158 __ aesd(v0, v28); __ aesimc(v0, v0);
3159 __ aesd(v0, v29); __ aesimc(v0, v0);
3160 __ aesd(v0, v30);
3161 __ eor(v0, __ T16B, v0, v31);
3162 __ eor(v0, __ T16B, v0, v2);
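// CBC decryption: after the final round key (v31), XOR with v2, the previous
// ciphertext block (initially the IV), to recover the plaintext.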
3163
3164 __ st1(v0, __ T16B, __ post(to, 16));
3165 __ orr(v2, __ T16B, v1, v1);
3166
3167 __ subw(len_reg, len_reg, 16);
3168 __ cbnzw(len_reg, L_aes_loop);
3169
3170 __ st1(v2, __ T16B, rvec);
3171
3172 __ mov(r0, rscratch2);
3173
3174 __ leave();
3175 __ ret(lr);
3176
3177 return start;
3178 }
3179
3180 // Big-endian 128-bit + 64-bit -> 128-bit addition.
3181 // Inputs: 'in' (128 bits) and 'inc' (the 64-bit increment); both are preserved.
3182 // The least-significant 64-bit word is in the upper dword of each vector.
3183 // inc's lower dword must be zero.
3184 // Output: result
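// For example (a sketch): with in = { msw, lsw } (lsw in the upper dword) and
// inc = { 0, k }, the addv produces { msw, lsw + k }. If lsw + k wrapped, the
// unsigned compare sets that lane of tmp to all ones; the ext swaps the dwords
// and the subv then computes msw - (-1) = msw + 1, propagating the carry.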
3185 void be_add_128_64(FloatRegister result, FloatRegister in,
3186 FloatRegister inc, FloatRegister tmp) {
3187 assert_different_registers(result, tmp, inc);
3188
3189 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
3190 // input
3191 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
3192 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
3193 // MSD == 0 (must be!) to LSD
3194 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
3195 }
3196
3197 // CTR AES crypt.
3198 // Arguments:
3199 //
3200 // Inputs:
3201 // c_rarg0 - source byte array address
3202 // c_rarg1 - destination byte array address
3203 // c_rarg2 - K (key) in little endian int array
3204 // c_rarg3 - counter vector byte array address
3205 // c_rarg4 - input length
3206 // c_rarg5 - saved encryptedCounter start
3207 // c_rarg6 - saved used length
3208 //
3209 // Output:
3210 // r0 - input length
3211 //
3212 address generate_counterMode_AESCrypt() {
3213 const Register in = c_rarg0;
3214 const Register out = c_rarg1;
3215 const Register key = c_rarg2;
3216 const Register counter = c_rarg3;
3217 const Register saved_len = c_rarg4, len = r10;
3218 const Register saved_encrypted_ctr = c_rarg5;
3219 const Register used_ptr = c_rarg6, used = r12;
3220
3221 const Register offset = r7;
3222 const Register keylen = r11;
3223
3224 const unsigned char block_size = 16;
3225 const int bulk_width = 4;
3226 // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3227 // performance with larger data sizes, but it also means that the
3228 // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be processed on the slow path. For
3230 // that reason, and also so as not to blow away too much icache, 4
3231 // blocks seems like a sensible compromise.
3232
3233 // Algorithm:
3234 //
3235 // if (len == 0) {
3236 // goto DONE;
3237 // }
3238 // int result = len;
3239 // do {
3240 // if (used >= blockSize) {
3241 // if (len >= bulk_width * blockSize) {
3242 // CTR_large_block();
3243 // if (len == 0)
3244 // goto DONE;
3245 // }
3246 // for (;;) {
3247 // 16ByteVector v0 = counter;
3248 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3249 // used = 0;
3250 // if (len < blockSize)
3251 // break; /* goto NEXT */
3252 // 16ByteVector v1 = load16Bytes(in, offset);
3253 // v1 = v1 ^ encryptedCounter;
    //            store16Bytes(v1, out, offset);
3255 // used = blockSize;
3256 // offset += blockSize;
3257 // len -= blockSize;
3258 // if (len == 0)
3259 // goto DONE;
3260 // }
3261 // }
3262 // NEXT:
3263 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3264 // len--;
3265 // } while (len != 0);
3266 // DONE:
3267 // return result;
3268 //
3269 // CTR_large_block()
3270 // Wide bulk encryption of whole blocks.
3271
3272 __ align(CodeEntryAlignment);
3273 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
3274 StubCodeMark mark(this, stub_id);
3275 const address start = __ pc();
3276 __ enter();
3277
3278 Label DONE, CTR_large_block, large_block_return;
3279 __ ldrw(used, Address(used_ptr));
3280 __ cbzw(saved_len, DONE);
3281
3282 __ mov(len, saved_len);
3283 __ mov(offset, 0);
3284
3285 // Compute #rounds for AES based on the length of the key array
3286 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3287
3288 __ aesenc_loadkeys(key, keylen);
3289
3290 {
3291 Label L_CTR_loop, NEXT;
3292
3293 __ bind(L_CTR_loop);
3294
3295 __ cmp(used, block_size);
3296 __ br(__ LO, NEXT);
3297
3298 // Maybe we have a lot of data
3299 __ subsw(rscratch1, len, bulk_width * block_size);
3300 __ br(__ HS, CTR_large_block);
3301 __ BIND(large_block_return);
3302 __ cbzw(len, DONE);
3303
3304 // Setup the counter
3305 __ movi(v4, __ T4S, 0);
3306 __ movi(v5, __ T4S, 1);
3307 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3308
3309 // 128-bit big-endian increment
3310 __ ld1(v0, __ T16B, counter);
3311 __ rev64(v16, __ T16B, v0);
3312 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3313 __ rev64(v16, __ T16B, v16);
3314 __ st1(v16, __ T16B, counter);
3315 // Previous counter value is in v0
3316 // v4 contains { 0, 1 }
3317
3318 {
3319 // We have fewer than bulk_width blocks of data left. Encrypt
3320 // them one by one until there is less than a full block
3321 // remaining, being careful to save both the encrypted counter
3322 // and the counter.
3323
3324 Label inner_loop;
3325 __ bind(inner_loop);
3326 // Counter to encrypt is in v0
3327 __ aesecb_encrypt(noreg, noreg, keylen);
3328 __ st1(v0, __ T16B, saved_encrypted_ctr);
3329
3330 // Do we have a remaining full block?
3331
3332 __ mov(used, 0);
3333 __ cmp(len, block_size);
3334 __ br(__ LO, NEXT);
3335
3336 // Yes, we have a full block
3337 __ ldrq(v1, Address(in, offset));
3338 __ eor(v1, __ T16B, v1, v0);
3339 __ strq(v1, Address(out, offset));
3340 __ mov(used, block_size);
3341 __ add(offset, offset, block_size);
3342
3343 __ subw(len, len, block_size);
3344 __ cbzw(len, DONE);
3345
3346 // Increment the counter, store it back
3347 __ orr(v0, __ T16B, v16, v16);
3348 __ rev64(v16, __ T16B, v16);
3349 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3350 __ rev64(v16, __ T16B, v16);
3351 __ st1(v16, __ T16B, counter); // Save the incremented counter back
3352
3353 __ b(inner_loop);
3354 }
3355
3356 __ BIND(NEXT);
3357
3358 // Encrypt a single byte, and loop.
3359 // We expect this to be a rare event.
3360 __ ldrb(rscratch1, Address(in, offset));
3361 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3362 __ eor(rscratch1, rscratch1, rscratch2);
3363 __ strb(rscratch1, Address(out, offset));
3364 __ add(offset, offset, 1);
3365 __ add(used, used, 1);
      __ subw(len, len, 1);
3367 __ cbnzw(len, L_CTR_loop);
3368 }
3369
3370 __ bind(DONE);
3371 __ strw(used, Address(used_ptr));
3372 __ mov(r0, saved_len);
3373
3374 __ leave(); // required for proper stackwalking of RuntimeStub frame
3375 __ ret(lr);
3376
3377 // Bulk encryption
3378
    __ BIND(CTR_large_block);
3380 assert(bulk_width == 4 || bulk_width == 8, "must be");
3381
3382 if (bulk_width == 8) {
3383 __ sub(sp, sp, 4 * 16);
3384 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3385 }
3386 __ sub(sp, sp, 4 * 16);
3387 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3388 RegSet saved_regs = (RegSet::of(in, out, offset)
3389 + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3390 __ push(saved_regs, sp);
3391 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption
3392 __ add(in, in, offset);
3393 __ add(out, out, offset);
3394
3395 // Keys should already be loaded into the correct registers
3396
3397 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3398 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3399
3400 // AES/CTR loop
3401 {
3402 Label L_CTR_loop;
3403 __ BIND(L_CTR_loop);
3404
3405 // Setup the counters
3406 __ movi(v8, __ T4S, 0);
3407 __ movi(v9, __ T4S, 1);
3408 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3409
3410 for (int i = 0; i < bulk_width; i++) {
3411 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3412 __ rev64(v0_ofs, __ T16B, v16);
3413 be_add_128_64(v16, v16, v8, /*tmp*/v9);
3414 }
3415
3416 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3417
3418 // Encrypt the counters
3419 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3420
3421 if (bulk_width == 8) {
3422 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3423 }
3424
3425 // XOR the encrypted counters with the inputs
3426 for (int i = 0; i < bulk_width; i++) {
3427 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3428 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3429 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3430 }
3431
3432 // Write the encrypted data
3433 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3434 if (bulk_width == 8) {
3435 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3436 }
3437
3438 __ subw(len, len, 16 * bulk_width);
3439 __ cbnzw(len, L_CTR_loop);
3440 }
3441
3442 // Save the counter back where it goes
3443 __ rev64(v16, __ T16B, v16);
3444 __ st1(v16, __ T16B, counter);
3445
3446 __ pop(saved_regs, sp);
3447
3448 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3449 if (bulk_width == 8) {
3450 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3451 }
3452
3453 __ andr(rscratch1, len, -16 * bulk_width);
3454 __ sub(len, len, rscratch1);
3455 __ add(offset, offset, rscratch1);
3456 __ mov(used, 16);
3457 __ strw(used, Address(used_ptr));
3458 __ b(large_block_return);
3459
3460 return start;
3461 }
3462
3463 // Vector AES Galois Counter Mode implementation. Parameters:
3464 //
3465 // in = c_rarg0
3466 // len = c_rarg1
3467 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3468 // out = c_rarg3
3469 // key = c_rarg4
3470 // state = c_rarg5 - GHASH.state
3471 // subkeyHtbl = c_rarg6 - powers of H
3472 // counter = c_rarg7 - 16 bytes of CTR
3473 // return - number of processed bytes
3474 address generate_galoisCounterMode_AESCrypt() {
3475 Label ghash_polynomial; // local data generated after code
3476
3477 __ align(CodeEntryAlignment);
3478 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
3479 StubCodeMark mark(this, stub_id);
3480 address start = __ pc();
3481 __ enter();
3482
3483 const Register in = c_rarg0;
3484 const Register len = c_rarg1;
3485 const Register ct = c_rarg2;
    const Register out = c_rarg3;

    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    // read as the initial counter and updated with the incremented
    // counter at the end
    const Register counter = c_rarg7;
3495
3496 const Register keylen = r10;
3497 // Save state before entering routine
3498 __ sub(sp, sp, 4 * 16);
3499 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3500 __ sub(sp, sp, 4 * 16);
3501 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3502
3504 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
3505 __ str(len, __ pre(sp, -2 * wordSize));
3506
3507 Label DONE;
3508 __ cbz(len, DONE);
3509
3510 // Compute #rounds for AES based on the length of the key array
3511 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3512
3513 __ aesenc_loadkeys(key, keylen);
3514 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3515 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
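    // GCM increments only the last 32-bit word of the counter block
    // (big-endian, wrapping mod 2^32), so the lane-wise add of
    // { 0, 0, 0, 1 } to the byte-reversed counter in the loop below
    // implements inc32(); no carry is propagated into the other words.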
3516
3517 // AES/CTR loop
3518 {
3519 Label L_CTR_loop;
3520 __ BIND(L_CTR_loop);
3521
3522 // Setup the counters
3523 __ movi(v8, __ T4S, 0);
3524 __ movi(v9, __ T4S, 1);
3525 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3526
      assert(v0->encoding() < v8->encoding(), "counter registers v0..v7 must precede the input registers");
3528 for (int i = v0->encoding(); i < v8->encoding(); i++) {
3529 FloatRegister f = as_FloatRegister(i);
3530 __ rev32(f, __ T16B, v16);
3531 __ addv(v16, __ T4S, v16, v8);
3532 }
3533
3534 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3535
3536 // Encrypt the counters
3537 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3538
3539 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3540
3541 // XOR the encrypted counters with the inputs
3542 for (int i = 0; i < 8; i++) {
3543 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3544 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3545 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3546 }
3547 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3548 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3549
3550 __ subw(len, len, 16 * 8);
3551 __ cbnzw(len, L_CTR_loop);
3552 }
3553
3554 __ rev32(v16, __ T16B, v16);
3555 __ st1(v16, __ T16B, counter);
3556
3557 __ ldr(len, Address(sp));
3558 __ lsr(len, len, exact_log2(16)); // We want the count of blocks
3559
3560 // GHASH/CTR loop
3561 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3562 len, /*unrolls*/4);
3563
3564 #ifdef ASSERT
3565 { Label L;
3566 __ cmp(len, (unsigned char)0);
3567 __ br(Assembler::EQ, L);
3568 __ stop("stubGenerator: abort");
3569 __ bind(L);
3570 }
3571 #endif
3572
3573 __ bind(DONE);
3574 // Return the number of bytes processed
3575 __ ldr(r0, __ post(sp, 2 * wordSize));
3576
3577 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3578 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3579
3580 __ leave(); // required for proper stackwalking of RuntimeStub frame
3581 __ ret(lr);
3582
3583 // bind label and generate polynomial data
3584 __ align(wordSize * 2);
3585 __ bind(ghash_polynomial);
    __ emit_int64(0x87);  // The low-order terms of the GHASH field
                          // polynomial z^128 + z^7 + z^2 + z + 1
                          // (0x87 encodes z^7 + z^2 + z + 1), repeated
                          // in the low and high halves of a 128-bit
                          // vector
    __ emit_int64(0x87);
3591
3592 return start;
3593 }
3594
3595 class Cached64Bytes {
3596 private:
3597 MacroAssembler *_masm;
3598 Register _regs[8];
3599
3600 public:
3601 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "need 8 registers to cache 64 bytes (16 4-byte words), got %u", rs.size());
3603 auto it = rs.begin();
3604 for (auto &r: _regs) {
3605 r = *it;
3606 ++it;
3607 }
3608 }
3609
3610 void gen_loads(Register base) {
3611 for (int i = 0; i < 8; i += 2) {
3612 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3613 }
3614 }
3615
    // Generate code that extracts the i-th unsigned 32-bit word from the cached 64 bytes.
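    // For example, extract_u32(dest, 5) copies bits [63:32] of _regs[2],
    // i.e. the sixth little-endian 32-bit word of the cached block.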
3617 void extract_u32(Register dest, int i) {
3618 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3619 }
3620 };
3621
3622 // Utility routines for md5.
3623 // Clobbers r10 and r11.
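  //
  // For reference, these helpers implement the standard MD5 round
  // computations (RFC 1321), with x[k] the k-th little-endian word of the
  // 64-byte block and rotl a 32-bit left rotation:
  //   FF: a = b + rotl(a + ((b & c) | (~b & d)) + x[k] + t, s)
  //   GG: a = b + rotl(a + ((b & d) | (c & ~d)) + x[k] + t, s)
  //   HH: a = b + rotl(a + (b ^ c ^ d)          + x[k] + t, s)
  //   II: a = b + rotl(a + (c ^ (b | ~d))       + x[k] + t, s)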
3624 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3625 int k, int s, int t) {
3626 Register rscratch3 = r10;
3627 Register rscratch4 = r11;
3628
3629 __ eorw(rscratch3, r3, r4);
3630 __ movw(rscratch2, t);
3631 __ andw(rscratch3, rscratch3, r2);
3632 __ addw(rscratch4, r1, rscratch2);
3633 reg_cache.extract_u32(rscratch1, k);
3634 __ eorw(rscratch3, rscratch3, r4);
3635 __ addw(rscratch4, rscratch4, rscratch1);
3636 __ addw(rscratch3, rscratch3, rscratch4);
3637 __ rorw(rscratch2, rscratch3, 32 - s);
3638 __ addw(r1, rscratch2, r2);
3639 }
3640
3641 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3642 int k, int s, int t) {
3643 Register rscratch3 = r10;
3644 Register rscratch4 = r11;
3645
3646 reg_cache.extract_u32(rscratch1, k);
3647 __ movw(rscratch2, t);
3648 __ addw(rscratch4, r1, rscratch2);
3649 __ addw(rscratch4, rscratch4, rscratch1);
3650 __ bicw(rscratch2, r3, r4);
3651 __ andw(rscratch3, r2, r4);
3652 __ addw(rscratch2, rscratch2, rscratch4);
3653 __ addw(rscratch2, rscratch2, rscratch3);
3654 __ rorw(rscratch2, rscratch2, 32 - s);
3655 __ addw(r1, rscratch2, r2);
3656 }
3657
3658 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3659 int k, int s, int t) {
3660 Register rscratch3 = r10;
3661 Register rscratch4 = r11;
3662
3663 __ eorw(rscratch3, r3, r4);
3664 __ movw(rscratch2, t);
3665 __ addw(rscratch4, r1, rscratch2);
3666 reg_cache.extract_u32(rscratch1, k);
3667 __ eorw(rscratch3, rscratch3, r2);
3668 __ addw(rscratch4, rscratch4, rscratch1);
3669 __ addw(rscratch3, rscratch3, rscratch4);
3670 __ rorw(rscratch2, rscratch3, 32 - s);
3671 __ addw(r1, rscratch2, r2);
3672 }
3673
3674 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3675 int k, int s, int t) {
3676 Register rscratch3 = r10;
3677 Register rscratch4 = r11;
3678
3679 __ movw(rscratch3, t);
3680 __ ornw(rscratch2, r2, r4);
3681 __ addw(rscratch4, r1, rscratch3);
3682 reg_cache.extract_u32(rscratch1, k);
3683 __ eorw(rscratch3, rscratch2, r3);
3684 __ addw(rscratch4, rscratch4, rscratch1);
3685 __ addw(rscratch3, rscratch3, rscratch4);
3686 __ rorw(rscratch2, rscratch3, 32 - s);
3687 __ addw(r1, rscratch2, r2);
3688 }
3689
3690 // Arguments:
3691 //
3692 // Inputs:
3693 // c_rarg0 - byte[] source+offset
3694 // c_rarg1 - int[] SHA.state
3695 // c_rarg2 - int offset
3696 // c_rarg3 - int limit
3697 //
3698 address generate_md5_implCompress(StubId stub_id) {
3699 bool multi_block;
3700 switch (stub_id) {
3701 case StubId::stubgen_md5_implCompress_id:
3702 multi_block = false;
3703 break;
3704 case StubId::stubgen_md5_implCompressMB_id:
3705 multi_block = true;
3706 break;
3707 default:
3708 ShouldNotReachHere();
3709 }
3710 __ align(CodeEntryAlignment);
3711
3712 StubCodeMark mark(this, stub_id);
3713 address start = __ pc();
3714
3715 Register buf = c_rarg0;
3716 Register state = c_rarg1;
3717 Register ofs = c_rarg2;
3718 Register limit = c_rarg3;
3719 Register a = r4;
3720 Register b = r5;
3721 Register c = r6;
3722 Register d = r7;
3723 Register rscratch3 = r10;
3724 Register rscratch4 = r11;
3725
3726 Register state_regs[2] = { r12, r13 };
3727 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3728 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
3729
3730 __ push(saved_regs, sp);
3731
3732 __ ldp(state_regs[0], state_regs[1], Address(state));
3733 __ ubfx(a, state_regs[0], 0, 32);
3734 __ ubfx(b, state_regs[0], 32, 32);
3735 __ ubfx(c, state_regs[1], 0, 32);
3736 __ ubfx(d, state_regs[1], 32, 32);
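    // The ldp/ubfx sequence above unpacks the MD5 state: each 64-bit state
    // register packs two 32-bit words (little-endian), so state_regs[0]
    // holds b:a and state_regs[1] holds d:c, with a and c in the low halves.
    // The orr instructions at the end of the loop repack them the same way.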
3737
3738 Label md5_loop;
3739 __ BIND(md5_loop);
3740
3741 reg_cache.gen_loads(buf);
3742
3743 // Round 1
3744 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
3745 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
3746 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
3747 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
3748 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
3749 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
3750 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
3751 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
3752 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
3753 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
3754 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3755 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3756 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
3757 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3758 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3759 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3760
3761 // Round 2
3762 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
3763 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
3764 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3765 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
3766 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
3767 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
3768 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3769 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
3770 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
3771 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
3772 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
3773 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
3774 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
3775 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
3776 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
3777 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3778
3779 // Round 3
3780 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
3781 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
3782 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3783 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3784 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
3785 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
3786 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
3787 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3788 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
3789 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
3790 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
3791 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
3792 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
3793 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3794 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3795 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
3796
3797 // Round 4
3798 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
3799 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
3800 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3801 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
3802 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
3803 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
3804 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3805 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
3806 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
3807 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3808 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
3809 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3810 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
3811 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3812 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
3813 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
3814
3815 __ addw(a, state_regs[0], a);
3816 __ ubfx(rscratch2, state_regs[0], 32, 32);
3817 __ addw(b, rscratch2, b);
3818 __ addw(c, state_regs[1], c);
3819 __ ubfx(rscratch4, state_regs[1], 32, 32);
3820 __ addw(d, rscratch4, d);
3821
3822 __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3823 __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3824
3825 if (multi_block) {
3826 __ add(buf, buf, 64);
3827 __ add(ofs, ofs, 64);
3828 __ cmp(ofs, limit);
3829 __ br(Assembler::LE, md5_loop);
3830 __ mov(c_rarg0, ofs); // return ofs
3831 }
3832
3833 // write hash values back in the correct order
3834 __ stp(state_regs[0], state_regs[1], Address(state));
3835
3836 __ pop(saved_regs, sp);
3837
3838 __ ret(lr);
3839
3840 return start;
3841 }
3842
3843 // Arguments:
3844 //
3845 // Inputs:
3846 // c_rarg0 - byte[] source+offset
3847 // c_rarg1 - int[] SHA.state
3848 // c_rarg2 - int offset
3849 // c_rarg3 - int limit
3850 //
3851 address generate_sha1_implCompress(StubId stub_id) {
3852 bool multi_block;
3853 switch (stub_id) {
3854 case StubId::stubgen_sha1_implCompress_id:
3855 multi_block = false;
3856 break;
3857 case StubId::stubgen_sha1_implCompressMB_id:
3858 multi_block = true;
3859 break;
3860 default:
3861 ShouldNotReachHere();
3862 }
3863
3864 __ align(CodeEntryAlignment);
3865
3866 StubCodeMark mark(this, stub_id);
3867 address start = __ pc();
3868
3869 Register buf = c_rarg0;
3870 Register state = c_rarg1;
3871 Register ofs = c_rarg2;
3872 Register limit = c_rarg3;
3873
3874 Label keys;
3875 Label sha1_loop;
3876
3877 // load the keys into v0..v3
3878 __ adr(rscratch1, keys);
3879 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3880 // load 5 words state into v6, v7
3881 __ ldrq(v6, Address(state, 0));
3882 __ ldrs(v7, Address(state, 16));
3883
3884
3885 __ BIND(sha1_loop);
3886 // load 64 bytes of data into v16..v19
3887 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3888 __ rev32(v16, __ T16B, v16);
3889 __ rev32(v17, __ T16B, v17);
3890 __ rev32(v18, __ T16B, v18);
3891 __ rev32(v19, __ T16B, v19);
3892
3893 // do the sha1
3894 __ addv(v4, __ T4S, v16, v0);
3895 __ orr(v20, __ T16B, v6, v6);
3896
3897 FloatRegister d0 = v16;
3898 FloatRegister d1 = v17;
3899 FloatRegister d2 = v18;
3900 FloatRegister d3 = v19;
3901
3902 for (int round = 0; round < 20; round++) {
3903 FloatRegister tmp1 = (round & 1) ? v4 : v5;
3904 FloatRegister tmp2 = (round & 1) ? v21 : v22;
3905 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3906 FloatRegister tmp4 = (round & 1) ? v5 : v4;
3907 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3908
3909 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3910 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3911 __ sha1h(tmp2, __ T4S, v20);
3912 if (round < 5)
3913 __ sha1c(v20, __ T4S, tmp3, tmp4);
3914 else if (round < 10 || round >= 15)
3915 __ sha1p(v20, __ T4S, tmp3, tmp4);
3916 else
3917 __ sha1m(v20, __ T4S, tmp3, tmp4);
3918 if (round < 16) __ sha1su1(d0, __ T4S, d3);
3919
3920 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3921 }
3922
3923 __ addv(v7, __ T2S, v7, v21);
3924 __ addv(v6, __ T4S, v6, v20);
3925
3926 if (multi_block) {
3927 __ add(ofs, ofs, 64);
3928 __ cmp(ofs, limit);
3929 __ br(Assembler::LE, sha1_loop);
3930 __ mov(c_rarg0, ofs); // return ofs
3931 }
3932
3933 __ strq(v6, Address(state, 0));
3934 __ strs(v7, Address(state, 16));
3935
3936 __ ret(lr);
3937
3938 __ bind(keys);
3939 __ emit_int32(0x5a827999);
3940 __ emit_int32(0x6ed9eba1);
3941 __ emit_int32(0x8f1bbcdc);
3942 __ emit_int32(0xca62c1d6);
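    // One constant per group of 20 SHA-1 rounds (K0 for rounds 0-19 through
    // K3 for rounds 60-79); the ld4r at the top of this stub replicates each
    // of them across all four lanes of v0..v3.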
3943
3944 return start;
3945 }
3946
3947
3948 // Arguments:
3949 //
3950 // Inputs:
3951 // c_rarg0 - byte[] source+offset
3952 // c_rarg1 - int[] SHA.state
3953 // c_rarg2 - int offset
3954 // c_rarg3 - int limit
3955 //
3956 address generate_sha256_implCompress(StubId stub_id) {
3957 bool multi_block;
3958 switch (stub_id) {
3959 case StubId::stubgen_sha256_implCompress_id:
3960 multi_block = false;
3961 break;
3962 case StubId::stubgen_sha256_implCompressMB_id:
3963 multi_block = true;
3964 break;
3965 default:
3966 ShouldNotReachHere();
3967 }
3968
3969 static const uint32_t round_consts[64] = {
3970 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3971 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3972 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3973 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3974 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3975 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3976 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3977 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3978 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3979 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3980 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3981 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3982 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3983 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3984 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3985 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3986 };
3987
3988 __ align(CodeEntryAlignment);
3989
3990 StubCodeMark mark(this, stub_id);
3991 address start = __ pc();
3992
3993 Register buf = c_rarg0;
3994 Register state = c_rarg1;
3995 Register ofs = c_rarg2;
3996 Register limit = c_rarg3;
3997
    Label sha256_loop;
3999
4000 __ stpd(v8, v9, __ pre(sp, -32));
4001 __ stpd(v10, v11, Address(sp, 16));
4002
4003 // dga == v0
4004 // dgb == v1
4005 // dg0 == v2
4006 // dg1 == v3
4007 // dg2 == v4
4008 // t0 == v6
4009 // t1 == v7
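    //
    // Roughly: v0/v1 hold the incoming state, v2/v3 the working state (with
    // v4 saving v2 across sha256h), v6/v7 alternate as the w + k sums fed to
    // sha256h/sha256h2, and v8..v11 carry the message schedule.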
4010
4011 // load 16 keys to v16..v31
4012 __ lea(rscratch1, ExternalAddress((address)round_consts));
4013 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
4014 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
4015 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
4016 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
4017
4018 // load 8 words (256 bits) state
4019 __ ldpq(v0, v1, state);
4020
    __ BIND(sha256_loop);
4022 // load 64 bytes of data into v8..v11
4023 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
4024 __ rev32(v8, __ T16B, v8);
4025 __ rev32(v9, __ T16B, v9);
4026 __ rev32(v10, __ T16B, v10);
4027 __ rev32(v11, __ T16B, v11);
4028
4029 __ addv(v6, __ T4S, v8, v16);
4030 __ orr(v2, __ T16B, v0, v0);
4031 __ orr(v3, __ T16B, v1, v1);
4032
4033 FloatRegister d0 = v8;
4034 FloatRegister d1 = v9;
4035 FloatRegister d2 = v10;
4036 FloatRegister d3 = v11;
4037
4038
4039 for (int round = 0; round < 16; round++) {
4040 FloatRegister tmp1 = (round & 1) ? v6 : v7;
4041 FloatRegister tmp2 = (round & 1) ? v7 : v6;
4042 FloatRegister tmp3 = (round & 1) ? v2 : v4;
4043 FloatRegister tmp4 = (round & 1) ? v4 : v2;
4044
4045 if (round < 12) __ sha256su0(d0, __ T4S, d1);
4046 __ orr(v4, __ T16B, v2, v2);
4047 if (round < 15)
4048 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
4049 __ sha256h(v2, __ T4S, v3, tmp2);
4050 __ sha256h2(v3, __ T4S, v4, tmp2);
4051 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
4052
4053 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4054 }
4055
4056 __ addv(v0, __ T4S, v0, v2);
4057 __ addv(v1, __ T4S, v1, v3);
4058
4059 if (multi_block) {
4060 __ add(ofs, ofs, 64);
4061 __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
4063 __ mov(c_rarg0, ofs); // return ofs
4064 }
4065
4066 __ ldpd(v10, v11, Address(sp, 16));
4067 __ ldpd(v8, v9, __ post(sp, 32));
4068
4069 __ stpq(v0, v1, state);
4070
4071 __ ret(lr);
4072
4073 return start;
4074 }
4075
4076 // Double rounds for sha512.
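  // Each call processes two of the 80 SHA-512 rounds, so the stub below
  // issues 40 of them. For dr < 36 the next pair of round constants is
  // loaded into vrc1, and for dr < 32 the next message-schedule pair is
  // computed with sha512su0/sha512su1.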
4077 void sha512_dround(int dr,
4078 FloatRegister vi0, FloatRegister vi1,
4079 FloatRegister vi2, FloatRegister vi3,
4080 FloatRegister vi4, FloatRegister vrc0,
4081 FloatRegister vrc1, FloatRegister vin0,
4082 FloatRegister vin1, FloatRegister vin2,
4083 FloatRegister vin3, FloatRegister vin4) {
4084 if (dr < 36) {
4085 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
4086 }
4087 __ addv(v5, __ T2D, vrc0, vin0);
4088 __ ext(v6, __ T16B, vi2, vi3, 8);
4089 __ ext(v5, __ T16B, v5, v5, 8);
4090 __ ext(v7, __ T16B, vi1, vi2, 8);
4091 __ addv(vi3, __ T2D, vi3, v5);
4092 if (dr < 32) {
4093 __ ext(v5, __ T16B, vin3, vin4, 8);
4094 __ sha512su0(vin0, __ T2D, vin1);
4095 }
4096 __ sha512h(vi3, __ T2D, v6, v7);
4097 if (dr < 32) {
4098 __ sha512su1(vin0, __ T2D, vin2, v5);
4099 }
4100 __ addv(vi4, __ T2D, vi1, vi3);
4101 __ sha512h2(vi3, __ T2D, vi1, vi0);
4102 }
4103
4104 // Arguments:
4105 //
4106 // Inputs:
4107 // c_rarg0 - byte[] source+offset
4108 // c_rarg1 - int[] SHA.state
4109 // c_rarg2 - int offset
4110 // c_rarg3 - int limit
4111 //
4112 address generate_sha512_implCompress(StubId stub_id) {
4113 bool multi_block;
4114 switch (stub_id) {
4115 case StubId::stubgen_sha512_implCompress_id:
4116 multi_block = false;
4117 break;
4118 case StubId::stubgen_sha512_implCompressMB_id:
4119 multi_block = true;
4120 break;
4121 default:
4122 ShouldNotReachHere();
4123 }
4124
4125 static const uint64_t round_consts[80] = {
4126 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
4127 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
4128 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
4129 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
4130 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
4131 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
4132 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
4133 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
4134 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
4135 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
4136 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
4137 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
4138 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
4139 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
4140 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
4141 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
4142 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
4143 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
4144 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
4145 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
4146 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
4147 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
4148 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
4149 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
4150 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
4151 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
4152 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
4153 };
4154
4155 __ align(CodeEntryAlignment);
4156
4157 StubCodeMark mark(this, stub_id);
4158 address start = __ pc();
4159
4160 Register buf = c_rarg0;
4161 Register state = c_rarg1;
4162 Register ofs = c_rarg2;
4163 Register limit = c_rarg3;
4164
4165 __ stpd(v8, v9, __ pre(sp, -64));
4166 __ stpd(v10, v11, Address(sp, 16));
4167 __ stpd(v12, v13, Address(sp, 32));
4168 __ stpd(v14, v15, Address(sp, 48));
4169
4170 Label sha512_loop;
4171
4172 // load state
4173 __ ld1(v8, v9, v10, v11, __ T2D, state);
4174
4175 // load first 4 round constants
4176 __ lea(rscratch1, ExternalAddress((address)round_consts));
4177 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
4178
4179 __ BIND(sha512_loop);
4180 // load 128B of data into v12..v19
4181 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
4182 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
4183 __ rev64(v12, __ T16B, v12);
4184 __ rev64(v13, __ T16B, v13);
4185 __ rev64(v14, __ T16B, v14);
4186 __ rev64(v15, __ T16B, v15);
4187 __ rev64(v16, __ T16B, v16);
4188 __ rev64(v17, __ T16B, v17);
4189 __ rev64(v18, __ T16B, v18);
4190 __ rev64(v19, __ T16B, v19);
4191
4192 __ mov(rscratch2, rscratch1);
4193
4194 __ mov(v0, __ T16B, v8);
4195 __ mov(v1, __ T16B, v9);
4196 __ mov(v2, __ T16B, v10);
4197 __ mov(v3, __ T16B, v11);
4198
4199 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4200 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4201 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4202 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4203 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4204 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4205 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4206 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4207 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4208 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4209 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4210 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4211 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4212 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4213 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4214 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4215 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4216 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4217 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4218 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4219 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4220 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4221 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4222 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4223 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4224 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4225 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4226 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4227 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4228 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4229 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4230 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4231 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
4232 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4233 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4234 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4235 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4236 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4237 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4238 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4239
4240 __ addv(v8, __ T2D, v8, v0);
4241 __ addv(v9, __ T2D, v9, v1);
4242 __ addv(v10, __ T2D, v10, v2);
4243 __ addv(v11, __ T2D, v11, v3);
4244
4245 if (multi_block) {
4246 __ add(ofs, ofs, 128);
4247 __ cmp(ofs, limit);
4248 __ br(Assembler::LE, sha512_loop);
4249 __ mov(c_rarg0, ofs); // return ofs
4250 }
4251
4252 __ st1(v8, v9, v10, v11, __ T2D, state);
4253
4254 __ ldpd(v14, v15, Address(sp, 48));
4255 __ ldpd(v12, v13, Address(sp, 32));
4256 __ ldpd(v10, v11, Address(sp, 16));
4257 __ ldpd(v8, v9, __ post(sp, 64));
4258
4259 __ ret(lr);
4260
4261 return start;
4262 }
4263
4264 // Execute one round of keccak of two computations in parallel.
4265 // One of the states should be loaded into the lower halves of
4266 // the vector registers v0-v24, the other should be loaded into
4267 // the upper halves of those registers. The ld1r instruction loads
4268 // the round constant into both halves of register v31.
4269 // Intermediate results c0...c5 and d0...d5 are computed
4270 // in registers v25...v30.
4271 // All vector instructions that are used operate on both register
4272 // halves in parallel.
  // If only a single computation is needed, one can load just the lower halves.
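  //
  // For reference, one Keccak-f[1600] round computes (all indices mod 5,
  // r[x,y] being the fixed rotation offsets):
  //   theta:  c[x] = a[x,0] ^ a[x,1] ^ a[x,2] ^ a[x,3] ^ a[x,4]
  //           d[x] = c[x-1] ^ rol(c[x+1], 1);  a[x,y] ^= d[x]
  //   rho/pi: b[y, 2*x + 3*y] = rol(a[x,y], r[x,y])
  //   chi:    a[x,y] = b[x,y] ^ (~b[x+1,y] & b[x+2,y])
  //   iota:   a[0,0] ^= rc[i]
  // The register comments below use a flat numbering a0..a24 for the state.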
4274 void keccak_round(Register rscratch1) {
4275 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
    __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
    __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4278 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4279 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4280 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4281 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4282 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4283 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4284 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4285
4286 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4287 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4288 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4289 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4290 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4291
4292 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4293 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
    __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4295 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4296 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4297 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4298 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4299 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4300 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4301 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4302 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4303 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4304 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4305 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4306 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4307 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4308 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4309 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4310 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4311 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4312 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4313 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4314 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4315 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4316 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4317
4318 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22')
4319 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4320 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4321 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4322 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4323
4324 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4325
4326 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4327 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4328 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4329 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4330 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
4331
4332 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
4333 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
4334 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
4335 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
4336 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
4337
4338 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
4339 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
4340 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
4341 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7)
4342 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8')
4343
4344 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
4345 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
4346 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
4347 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3)
4348 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4')
4349
4350 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
4351 }
4352
4353 // Arguments:
4354 //
4355 // Inputs:
4356 // c_rarg0 - byte[] source+offset
4357 // c_rarg1 - byte[] SHA.state
4358 // c_rarg2 - int block_size
4359 // c_rarg3 - int offset
4360 // c_rarg4 - int limit
4361 //
4362 address generate_sha3_implCompress(StubId stub_id) {
4363 bool multi_block;
4364 switch (stub_id) {
4365 case StubId::stubgen_sha3_implCompress_id:
4366 multi_block = false;
4367 break;
4368 case StubId::stubgen_sha3_implCompressMB_id:
4369 multi_block = true;
4370 break;
4371 default:
4372 ShouldNotReachHere();
4373 }
4374
4375 static const uint64_t round_consts[24] = {
4376 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4377 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4378 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4379 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4380 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4381 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4382 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4383 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4384 };
4385
4386 __ align(CodeEntryAlignment);
4387
4388 StubCodeMark mark(this, stub_id);
4389 address start = __ pc();
4390
4391 Register buf = c_rarg0;
4392 Register state = c_rarg1;
4393 Register block_size = c_rarg2;
4394 Register ofs = c_rarg3;
4395 Register limit = c_rarg4;
4396
4397 Label sha3_loop, rounds24_loop;
4398 Label sha3_512_or_sha3_384, shake128;
4399
4400 __ stpd(v8, v9, __ pre(sp, -64));
4401 __ stpd(v10, v11, Address(sp, 16));
4402 __ stpd(v12, v13, Address(sp, 32));
4403 __ stpd(v14, v15, Address(sp, 48));
4404
4405 // load state
4406 __ add(rscratch1, state, 32);
4407 __ ld1(v0, v1, v2, v3, __ T1D, state);
4408 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4409 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4410 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4411 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4412 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4413 __ ld1(v24, __ T1D, rscratch1);
4414
4415 __ BIND(sha3_loop);
4416
4417 // 24 keccak rounds
4418 __ movw(rscratch2, 24);
4419
4420 // load round_constants base
4421 __ lea(rscratch1, ExternalAddress((address) round_consts));
4422
4423 // load input
4424 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4425 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4426 __ eor(v0, __ T8B, v0, v25);
4427 __ eor(v1, __ T8B, v1, v26);
4428 __ eor(v2, __ T8B, v2, v27);
4429 __ eor(v3, __ T8B, v3, v28);
4430 __ eor(v4, __ T8B, v4, v29);
4431 __ eor(v5, __ T8B, v5, v30);
4432 __ eor(v6, __ T8B, v6, v31);
4433
4434 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4435 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4436
4437 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4438 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4439 __ eor(v7, __ T8B, v7, v25);
4440 __ eor(v8, __ T8B, v8, v26);
4441 __ eor(v9, __ T8B, v9, v27);
4442 __ eor(v10, __ T8B, v10, v28);
4443 __ eor(v11, __ T8B, v11, v29);
4444 __ eor(v12, __ T8B, v12, v30);
4445 __ eor(v13, __ T8B, v13, v31);
4446
4447 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4448 __ eor(v14, __ T8B, v14, v25);
4449 __ eor(v15, __ T8B, v15, v26);
4450 __ eor(v16, __ T8B, v16, v27);
4451
4452 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4453 __ andw(c_rarg5, block_size, 48);
4454 __ cbzw(c_rarg5, rounds24_loop);
4455
4456 __ tbnz(block_size, 5, shake128);
4457 // block_size == 144, bit5 == 0, SHA3-224
4458 __ ldrd(v28, __ post(buf, 8));
4459 __ eor(v17, __ T8B, v17, v28);
4460 __ b(rounds24_loop);
4461
4462 __ BIND(shake128);
4463 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4464 __ eor(v17, __ T8B, v17, v28);
4465 __ eor(v18, __ T8B, v18, v29);
4466 __ eor(v19, __ T8B, v19, v30);
4467 __ eor(v20, __ T8B, v20, v31);
4468 __ b(rounds24_loop); // block_size == 168, SHAKE128
4469
4470 __ BIND(sha3_512_or_sha3_384);
4471 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4472 __ eor(v7, __ T8B, v7, v25);
4473 __ eor(v8, __ T8B, v8, v26);
4474 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4475
4476 // SHA3-384
4477 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4478 __ eor(v9, __ T8B, v9, v27);
4479 __ eor(v10, __ T8B, v10, v28);
4480 __ eor(v11, __ T8B, v11, v29);
4481 __ eor(v12, __ T8B, v12, v30);
4482
4483 __ BIND(rounds24_loop);
4484 __ subw(rscratch2, rscratch2, 1);
4485
4486 keccak_round(rscratch1);
4487
4488 __ cbnzw(rscratch2, rounds24_loop);
4489
4490 if (multi_block) {
4491 __ add(ofs, ofs, block_size);
4492 __ cmp(ofs, limit);
4493 __ br(Assembler::LE, sha3_loop);
4494 __ mov(c_rarg0, ofs); // return ofs
4495 }
4496
4497 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
4498 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4499 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4500 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4501 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4502 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4503 __ st1(v24, __ T1D, state);
4504
4505 // restore callee-saved registers
4506 __ ldpd(v14, v15, Address(sp, 48));
4507 __ ldpd(v12, v13, Address(sp, 32));
4508 __ ldpd(v10, v11, Address(sp, 16));
4509 __ ldpd(v8, v9, __ post(sp, 64));
4510
4511 __ ret(lr);
4512
4513 return start;
4514 }
4515
4516 // Inputs:
4517 // c_rarg0 - long[] state0
4518 // c_rarg1 - long[] state1
4519 address generate_double_keccak() {
4520 static const uint64_t round_consts[24] = {
4521 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4522 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4523 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4524 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4525 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4526 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4527 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4528 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4529 };
4530
    // Implements the double_keccak() method of the
    // sun.security.provider.SHA3Parallel class
4533 __ align(CodeEntryAlignment);
4534 StubCodeMark mark(this, "StubRoutines", "double_keccak");
4535 address start = __ pc();
4536 __ enter();
4537
4538 Register state0 = c_rarg0;
4539 Register state1 = c_rarg1;
4540
4541 Label rounds24_loop;
4542
4543 // save callee-saved registers
4544 __ stpd(v8, v9, __ pre(sp, -64));
4545 __ stpd(v10, v11, Address(sp, 16));
4546 __ stpd(v12, v13, Address(sp, 32));
4547 __ stpd(v14, v15, Address(sp, 48));
4548
4549 // load states
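    // The D-lane index of the ld4/st4 instructions selects which 64-bit half
    // of each vector is used: lane 0 (lower halves) carries state0 and lane 1
    // (upper halves) carries state1, matching the dual-lane convention
    // expected by keccak_round().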
4550 __ add(rscratch1, state0, 32);
4551 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
4552 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
4553 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
4554 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
4555 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
4556 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
4557 __ ld1(v24, __ D, 0, rscratch1);
4558 __ add(rscratch1, state1, 32);
4559 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
4560 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
4561 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
4562 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
4563 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
4564 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
4565 __ ld1(v24, __ D, 1, rscratch1);
4566
4567 // 24 keccak rounds
4568 __ movw(rscratch2, 24);
4569
4570 // load round_constants base
4571 __ lea(rscratch1, ExternalAddress((address) round_consts));
4572
4573 __ BIND(rounds24_loop);
4574 __ subw(rscratch2, rscratch2, 1);
4575 keccak_round(rscratch1);
4576 __ cbnzw(rscratch2, rounds24_loop);
4577
4578 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
4579 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
4580 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
4581 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
4582 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
4583 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
4584 __ st1(v24, __ D, 0, state0);
4585 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
4586 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
4587 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
4588 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
4589 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
4590 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
4591 __ st1(v24, __ D, 1, state1);
4592
4593 // restore callee-saved vector registers
4594 __ ldpd(v14, v15, Address(sp, 48));
4595 __ ldpd(v12, v13, Address(sp, 32));
4596 __ ldpd(v10, v11, Address(sp, 16));
4597 __ ldpd(v8, v9, __ post(sp, 64));
4598
4599 __ leave(); // required for proper stackwalking of RuntimeStub frame
4600 __ mov(r0, zr); // return 0
4601 __ ret(lr);
4602
4603 return start;
4604 }
4605
4606 // ChaCha20 block function. This version parallelizes the 32-bit
4607 // state elements on each of 16 vectors, producing 4 blocks of
4608 // keystream at a time.
4609 //
4610 // state (int[16]) = c_rarg0
4611 // keystream (byte[256]) = c_rarg1
4612 // return - number of bytes of produced keystream (always 256)
4613 //
4614 // This implementation takes each 32-bit integer from the state
4615 // array and broadcasts it across all 4 32-bit lanes of a vector register
4616 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
4617 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
4618 // the quarter round schedule is implemented as outlined in RFC 7539 section
4619 // 2.3. However, instead of sequentially processing the 3 quarter round
4620 // operations represented by one QUARTERROUND function, we instead stack all
4621 // the adds, xors and left-rotations from the first 4 quarter rounds together
4622 // and then do the same for the second set of 4 quarter rounds. This removes
4623 // some latency that would otherwise be incurred by waiting for an add to
4624 // complete before performing an xor (which depends on the result of the
4625 // add), etc. An adjustment happens between the first and second groups of 4
4626 // quarter rounds, but this is done only in the inputs to the macro functions
4627 // that generate the assembly instructions - these adjustments themselves are
4628 // not part of the resulting assembly.
4629 // The 4 registers v0-v3 are used during the quarter round operations as
4630 // scratch registers. Once the 20 rounds are complete, these 4 scratch
4631 // registers become the vectors involved in adding the start state back onto
4632 // the post-QR working state. After the adds are complete, each of the 16
4633 // vectors write their first lane back to the keystream buffer, followed
4634 // by the second lane from all vectors and so on.
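  //
  // For reference, one ChaCha20 quarter round (RFC 7539 section 2.1) is:
  //   a += b;  d ^= a;  d <<<= 16;
  //   c += d;  b ^= c;  b <<<= 12;
  //   a += b;  d ^= a;  d <<<= 8;
  //   c += d;  b ^= c;  b <<<= 7;
  // which is what the stacked cc20_qr_* sequences below perform on four
  // (a, b, c, d) column or diagonal groups at once.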
4635 address generate_chacha20Block_blockpar() {
4636 Label L_twoRounds, L_cc20_const;
4637 __ align(CodeEntryAlignment);
4638 StubId stub_id = StubId::stubgen_chacha20Block_id;
4639 StubCodeMark mark(this, stub_id);
4640 address start = __ pc();
4641 __ enter();
4642
4643 int i, j;
4644 const Register state = c_rarg0;
4645 const Register keystream = c_rarg1;
4646 const Register loopCtr = r10;
4647 const Register tmpAddr = r11;
4648 const FloatRegister ctrAddOverlay = v28;
4649 const FloatRegister lrot8Tbl = v29;
4650
4651 // Organize SIMD registers in an array that facilitates
4652 // putting repetitive opcodes into loop structures. It is
4653 // important that each grouping of 4 registers is monotonically
4654 // increasing to support the requirements of multi-register
4655 // instructions (e.g. ld4r, st4, etc.)
4656 const FloatRegister workSt[16] = {
4657 v4, v5, v6, v7, v16, v17, v18, v19,
4658 v20, v21, v22, v23, v24, v25, v26, v27
4659 };
4660
4661 // Pull in constant data. The first 16 bytes are the add overlay
4662 // which is applied to the vector holding the counter (state[12]).
4663 // The second 16 bytes is the index register for the 8-bit left
4664 // rotation tbl instruction.
4665 __ adr(tmpAddr, L_cc20_const);
4666 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
4667
4668 // Load from memory and interlace across 16 SIMD registers,
4669 // With each word from memory being broadcast to all lanes of
4670 // each successive SIMD register.
4671 // Addr(0) -> All lanes in workSt[i]
4672 // Addr(4) -> All lanes workSt[i + 1], etc.
4673 __ mov(tmpAddr, state);
4674 for (i = 0; i < 16; i += 4) {
4675 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4676 __ post(tmpAddr, 16));
4677 }
4678 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4679
4680 // Before entering the loop, create 5 4-register arrays. These
4681 // will hold the 4 registers that represent the a/b/c/d fields
4682 // in the quarter round operation. For instance the "b" field
4683 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
4684 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
4685 // since it is part of a diagonal organization. The aSet and scratch
4686 // register sets are defined at declaration time because they do not change
4687 // organization at any point during the 20-round processing.
4688 FloatRegister aSet[4] = { v4, v5, v6, v7 };
4689 FloatRegister bSet[4];
4690 FloatRegister cSet[4];
4691 FloatRegister dSet[4];
4692 FloatRegister scratch[4] = { v0, v1, v2, v3 };
4693
4694 // Set up the 10 iteration loop and perform all 8 quarter round ops
4695 __ mov(loopCtr, 10);
4696 __ BIND(L_twoRounds);
4697
4698 // Set to columnar organization and do the following 4 quarter-rounds:
4699 // QUARTERROUND(0, 4, 8, 12)
4700 // QUARTERROUND(1, 5, 9, 13)
4701 // QUARTERROUND(2, 6, 10, 14)
4702 // QUARTERROUND(3, 7, 11, 15)
4703 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
4704 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
4705 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
4706
4707 __ cc20_qr_add4(aSet, bSet); // a += b
4708 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4709 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4710
4711 __ cc20_qr_add4(cSet, dSet); // c += d
4712 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4713 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4714
4715 __ cc20_qr_add4(aSet, bSet); // a += b
4716 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4717 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4718
4719 __ cc20_qr_add4(cSet, dSet); // c += d
4720 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
4722
4723 // Set to diagonal organization and do the next 4 quarter-rounds:
4724 // QUARTERROUND(0, 5, 10, 15)
4725 // QUARTERROUND(1, 6, 11, 12)
4726 // QUARTERROUND(2, 7, 8, 13)
4727 // QUARTERROUND(3, 4, 9, 14)
4728 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
4729 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
4730 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
4731
4732 __ cc20_qr_add4(aSet, bSet); // a += b
4733 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4734 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4735
4736 __ cc20_qr_add4(cSet, dSet); // c += d
4737 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4738 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4739
4740 __ cc20_qr_add4(aSet, bSet); // a += b
4741 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4742 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4743
4744 __ cc20_qr_add4(cSet, dSet); // c += d
4745 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
4747
4748 // Decrement and iterate
4749 __ sub(loopCtr, loopCtr, 1);
4750 __ cbnz(loopCtr, L_twoRounds);
4751
4752 __ mov(tmpAddr, state);
4753
4754 // Add the starting state back to the post-loop keystream
4755 // state. We read/interlace the state array from memory into
4756 // 4 registers similar to what we did in the beginning. Then
4757 // add the counter overlay onto workSt[12] at the end.
4758 for (i = 0; i < 16; i += 4) {
4759 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
4760 __ addv(workSt[i], __ T4S, workSt[i], v0);
4761 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
4762 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
4763 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
4764 }
4765 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4766
4767 // Write working state into the keystream buffer. This is accomplished
4768 // by taking the lane "i" from each of the four vectors and writing
4769 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
4770 // repeating with the next 4 vectors until all 16 vectors have been used.
4771 // Then move to the next lane and repeat the process until all lanes have
4772 // been written.
4773 for (i = 0; i < 4; i++) {
4774 for (j = 0; j < 16; j += 4) {
4775 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4776 __ post(keystream, 16));
4777 }
4778 }
4779
4780 __ mov(r0, 256); // Return length of output keystream
4781 __ leave();
4782 __ ret(lr);
4783
4784 // bind label and generate local constant data used by this stub
4785 // The constant data is broken into two 128-bit segments to be loaded
4786 // onto FloatRegisters. The first 128 bits are a counter add overlay
4787 // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
    // The second 128 bits are a table constant used for 8-bit left rotations.
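    //
    // As an illustration (not generated code): read as bytes, the second
    // constant is the index vector
    //   { 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14 }
    // and tbl sets output byte i to input byte idx[i], which for each
    // little-endian 32-bit word w amounts to
    //   w = (w << 8) | (w >> 24);   // i.e. rotate left by 8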
4789 __ BIND(L_cc20_const);
4790 __ emit_int64(0x0000000100000000UL);
4791 __ emit_int64(0x0000000300000002UL);
4792 __ emit_int64(0x0605040702010003UL);
4793 __ emit_int64(0x0E0D0C0F0A09080BUL);
4794
4795 return start;
4796 }
4797
4798 // Helpers to schedule parallel operation bundles across vector
4799 // register sequences of size 2, 4 or 8.
4800
4801 // Implement various primitive computations across vector sequences
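  //
  // For example, given VSeq<4> va(0), vb(4) and vc(8), a call such as
  //   vs_addv(va, __ T8H, vb, vc);
  // simply unrolls to the four instructions
  //   addv(v0, T8H, v4, v8) ... addv(v3, T8H, v7, v11)
  // (a sketch assuming the default register delta of 1 between
  // successive elements of each sequence).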
4802
4803 template<int N>
4804 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4805 const VSeq<N>& v1, const VSeq<N>& v2) {
4806 // output must not be constant
4807 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4808 // output cannot overwrite pending inputs
4809 assert(!vs_write_before_read(v, v1), "output overwrites input");
4810 assert(!vs_write_before_read(v, v2), "output overwrites input");
4811 for (int i = 0; i < N; i++) {
4812 __ addv(v[i], T, v1[i], v2[i]);
4813 }
4814 }
4815
4816 template<int N>
4817 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4818 const VSeq<N>& v1, const VSeq<N>& v2) {
4819 // output must not be constant
4820 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4821 // output cannot overwrite pending inputs
4822 assert(!vs_write_before_read(v, v1), "output overwrites input");
4823 assert(!vs_write_before_read(v, v2), "output overwrites input");
4824 for (int i = 0; i < N; i++) {
4825 __ subv(v[i], T, v1[i], v2[i]);
4826 }
4827 }
4828
4829 template<int N>
4830 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4831 const VSeq<N>& v1, const VSeq<N>& v2) {
4832 // output must not be constant
4833 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4834 // output cannot overwrite pending inputs
4835 assert(!vs_write_before_read(v, v1), "output overwrites input");
4836 assert(!vs_write_before_read(v, v2), "output overwrites input");
4837 for (int i = 0; i < N; i++) {
4838 __ mulv(v[i], T, v1[i], v2[i]);
4839 }
4840 }
4841
4842 template<int N>
4843 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
4844 // output must not be constant
4845 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4846 // output cannot overwrite pending inputs
4847 assert(!vs_write_before_read(v, v1), "output overwrites input");
4848 for (int i = 0; i < N; i++) {
4849 __ negr(v[i], T, v1[i]);
4850 }
4851 }
4852
4853 template<int N>
4854 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4855 const VSeq<N>& v1, int shift) {
4856 // output must not be constant
4857 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4858 // output cannot overwrite pending inputs
4859 assert(!vs_write_before_read(v, v1), "output overwrites input");
4860 for (int i = 0; i < N; i++) {
4861 __ sshr(v[i], T, v1[i], shift);
4862 }
4863 }
4864
4865 template<int N>
4866 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4867 // output must not be constant
4868 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4869 // output cannot overwrite pending inputs
4870 assert(!vs_write_before_read(v, v1), "output overwrites input");
4871 assert(!vs_write_before_read(v, v2), "output overwrites input");
4872 for (int i = 0; i < N; i++) {
4873 __ andr(v[i], __ T16B, v1[i], v2[i]);
4874 }
4875 }
4876
4877 template<int N>
4878 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4879 // output must not be constant
4880 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4881 // output cannot overwrite pending inputs
4882 assert(!vs_write_before_read(v, v1), "output overwrites input");
4883 assert(!vs_write_before_read(v, v2), "output overwrites input");
4884 for (int i = 0; i < N; i++) {
4885 __ orr(v[i], __ T16B, v1[i], v2[i]);
4886 }
4887 }
4888
4889 template<int N>
4890 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
4891 // output must not be constant
4892 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4893 // output cannot overwrite pending inputs
4894 assert(!vs_write_before_read(v, v1), "output overwrites input");
4895 for (int i = 0; i < N; i++) {
4896 __ notr(v[i], __ T16B, v1[i]);
4897 }
4898 }
4899
4900 template<int N>
4901 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
4902 // output must not be constant
4903 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4904 // output cannot overwrite pending inputs
4905 assert(!vs_write_before_read(v, v1), "output overwrites input");
4906 assert(!vs_write_before_read(v, v2), "output overwrites input");
4907 for (int i = 0; i < N; i++) {
4908 __ sqdmulh(v[i], T, v1[i], v2[i]);
4909 }
4910 }
4911
4912 template<int N>
4913 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
4914 // output must not be constant
4915 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4916 // output cannot overwrite pending inputs
4917 assert(!vs_write_before_read(v, v1), "output overwrites input");
4918 assert(!vs_write_before_read(v, v2), "output overwrites input");
4919 for (int i = 0; i < N; i++) {
4920 __ mlsv(v[i], T, v1[i], v2[i]);
4921 }
4922 }
4923
4924 // load N/2 successive pairs of quadword values from memory in order
4925 // into N successive vector registers of the sequence via the
4926 // address supplied in base.
4927 template<int N>
4928 void vs_ldpq(const VSeq<N>& v, Register base) {
4929 for (int i = 0; i < N; i += 2) {
4930 __ ldpq(v[i], v[i+1], Address(base, 32 * i));
4931 }
4932 }
4933
4934 // load N/2 successive pairs of quadword values from memory in order
4935 // into N vector registers of the sequence via the address supplied
4936 // in base using post-increment addressing
4937 template<int N>
4938 void vs_ldpq_post(const VSeq<N>& v, Register base) {
    static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
4940 for (int i = 0; i < N; i += 2) {
4941 __ ldpq(v[i], v[i+1], __ post(base, 32));
4942 }
4943 }
4944
4945 // store N successive vector registers of the sequence into N/2
4946 // successive pairs of quadword memory locations via the address
4947 // supplied in base using post-increment addressing
4948 template<int N>
4949 void vs_stpq_post(const VSeq<N>& v, Register base) {
    static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
4951 for (int i = 0; i < N; i += 2) {
4952 __ stpq(v[i], v[i+1], __ post(base, 32));
4953 }
4954 }
4955
4956 // load N/2 pairs of quadword values from memory de-interleaved into
4957 // N vector registers 2 at a time via the address supplied in base
4958 // using post-increment addressing.
4959 template<int N>
4960 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
4962 for (int i = 0; i < N; i += 2) {
4963 __ ld2(v[i], v[i+1], T, __ post(base, 32));
4964 }
4965 }
4966
4967 // store N vector registers interleaved into N/2 pairs of quadword
4968 // memory locations via the address supplied in base using
4969 // post-increment addressing.
4970 template<int N>
4971 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
4973 for (int i = 0; i < N; i += 2) {
4974 __ st2(v[i], v[i+1], T, __ post(base, 32));
4975 }
4976 }
4977
4978 // load N quadword values from memory de-interleaved into N vector
4979 // registers 3 elements at a time via the address supplied in base.
4980 template<int N>
4981 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4982 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4983 for (int i = 0; i < N; i += 3) {
4984 __ ld3(v[i], v[i+1], v[i+2], T, base);
4985 }
4986 }
4987
4988 // load N quadword values from memory de-interleaved into N vector
4989 // registers 3 elements at a time via the address supplied in base
4990 // using post-increment addressing.
4991 template<int N>
4992 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4993 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4994 for (int i = 0; i < N; i += 3) {
4995 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
4996 }
4997 }
4998
4999 // load N/2 pairs of quadword values from memory into N vector
5000 // registers via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
5002 // offsets array
5003 template<int N>
5004 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
5005 for (int i = 0; i < N/2; i++) {
5006 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5007 }
5008 }
5009
5010 // store N vector registers into N/2 pairs of quadword memory
5011 // locations via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
5013 // offsets array
5014 template<int N>
5015 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
5016 for (int i = 0; i < N/2; i++) {
5017 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5018 }
5019 }
5020
5021 // load N single quadword values from memory into N vector registers
5022 // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
5024 // array
5025 template<int N>
5026 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5027 int start, int (&offsets)[N]) {
5028 for (int i = 0; i < N; i++) {
5029 __ ldr(v[i], T, Address(base, start + offsets[i]));
5030 }
5031 }
5032
5033 // store N vector registers into N single quadword memory locations
5034 // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
5036 // array
5037 template<int N>
5038 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5039 int start, int (&offsets)[N]) {
5040 for (int i = 0; i < N; i++) {
5041 __ str(v[i], T, Address(base, start + offsets[i]));
5042 }
5043 }
5044
5045 // load N/2 pairs of quadword values from memory de-interleaved into
5046 // N vector registers 2 at a time via the address supplied in base
  // with each pair indexed using the start offset plus the
5048 // corresponding entry in the offsets array
5049 template<int N>
5050 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5051 Register tmp, int start, int (&offsets)[N/2]) {
5052 for (int i = 0; i < N/2; i++) {
5053 __ add(tmp, base, start + offsets[i]);
5054 __ ld2(v[2*i], v[2*i+1], T, tmp);
5055 }
5056 }
5057
5058 // store N vector registers 2 at a time interleaved into N/2 pairs
5059 // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
5061 // corresponding entry in the offsets array
5062 template<int N>
5063 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5064 Register tmp, int start, int (&offsets)[N/2]) {
5065 for (int i = 0; i < N/2; i++) {
5066 __ add(tmp, base, start + offsets[i]);
5067 __ st2(v[2*i], v[2*i+1], T, tmp);
5068 }
5069 }
5070
5071 // Helper routines for various flavours of Montgomery multiply
5072
5073 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
5074 // multiplications in parallel
5075 //
5076
5077 // See the montMul() method of the sun.security.provider.ML_DSA
5078 // class.
5079 //
  // Computes 4x4S results or 4x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
5082 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5083 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5084 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5085 // Outputs: va - 4x4S or 4x8H vector register sequences
5086 // vb, vc, vtmp and vq must all be disjoint
5087 // va must be disjoint from all other inputs/temps or must equal vc
5088 // va must have a non-zero delta i.e. it must not be a constant vseq.
5089 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
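  //
  // As a scalar reference for the 16-bit case (an illustrative sketch of
  // the arithmetic only, not generated code; sqdmulh saturation corner
  // cases are ignored):
  //   int32_t prod  = (int32_t)b * c;
  //   int16_t aHigh = (int16_t)((2 * prod) >> 16);            // sqdmulh
  //   int16_t aLow  = (int16_t)prod;                          // mulv
  //   int16_t m     = (int16_t)(aLow * qinv);                 // mulv, qinv = q^-1 mod 2^16
  //   int16_t n     = (int16_t)((2 * (int32_t)m * q) >> 16);  // sqdmulh
  //   int16_t a     = (int16_t)((aHigh - n) >> 1);            // shsubv
  // Since m * q == aLow == b * c (mod 2^16), b * c - m * q is a multiple
  // of 2^16 and so a == b * c * 2^-16 (mod q).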
5090 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5091 Assembler::SIMD_Arrangement T,
5092 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5093 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5094 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5095 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5096 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5097
5098 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5099 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5100
5101 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5102
5103 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5104 assert(vs_disjoint(va, vb), "va and vb overlap");
5105 assert(vs_disjoint(va, vq), "va and vq overlap");
5106 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5107 assert(!va.is_constant(), "output vector must identify 4 different registers");
5108
5109 // schedule 4 streams of instructions across the vector sequences
5110 for (int i = 0; i < 4; i++) {
5111 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5112 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5113 }
5114
5115 for (int i = 0; i < 4; i++) {
5116 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5117 }
5118
5119 for (int i = 0; i < 4; i++) {
5120 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5121 }
5122
5123 for (int i = 0; i < 4; i++) {
5124 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5125 }
5126 }
5127
  // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
5129 // multiplications in parallel
5130 //
5131
5132 // See the montMul() method of the sun.security.provider.ML_DSA
5133 // class.
5134 //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
  // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
  //         vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
5141 // vb, vc, vtmp and vq must all be disjoint
5142 // va must be disjoint from all other inputs/temps or must equal vc
5143 // va must have a non-zero delta i.e. it must not be a constant vseq.
5144 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5145 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5146 Assembler::SIMD_Arrangement T,
5147 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5148 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5149 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5150 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5151 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5152
5153 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5154 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5155
5156 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5157
5158 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5159 assert(vs_disjoint(va, vb), "va and vb overlap");
5160 assert(vs_disjoint(va, vq), "va and vq overlap");
5161 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5162 assert(!va.is_constant(), "output vector must identify 2 different registers");
5163
5164 // schedule 2 streams of instructions across the vector sequences
5165 for (int i = 0; i < 2; i++) {
5166 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5167 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5168 }
5169
5170 for (int i = 0; i < 2; i++) {
5171 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5172 }
5173
5174 for (int i = 0; i < 2; i++) {
5175 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5176 }
5177
5178 for (int i = 0; i < 2; i++) {
5179 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5180 }
5181 }
5182
5183 // Perform 16 16-bit Montgomery multiplications in parallel.
5184 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5185 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5186 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5187 // It will assert that the register use is valid
5188 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5189 }
5190
5191 // Perform 32 16-bit Montgomery multiplications in parallel.
5192 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5193 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5194 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5195 // It will assert that the register use is valid
5196 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5197 }
5198
5199 // Perform 64 16-bit Montgomery multiplications in parallel.
5200 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
5201 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5202 // Schedule two successive 4x8H multiplies via the montmul helper
5203 // on the front and back halves of va, vb and vc. The helper will
5204 // assert that the register use has no overlap conflicts on each
5205 // individual call but we also need to ensure that the necessary
5206 // disjoint/equality constraints are met across both calls.
5207
5208 // vb, vc, vtmp and vq must be disjoint. va must either be
5209 // disjoint from all other registers or equal vc
5210
5211 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5212 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5213 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5214
5215 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5216 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5217
5218 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5219
5220 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5221 assert(vs_disjoint(va, vb), "va and vb overlap");
5222 assert(vs_disjoint(va, vq), "va and vq overlap");
5223 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5224
5225 // we multiply the front and back halves of each sequence 4 at a
5226 // time because
5227 //
5228 // 1) we are currently only able to get 4-way instruction
5229 // parallelism at best
5230 //
5231 // 2) we need registers for the constants in vq and temporary
5232 // scratch registers to hold intermediate results so vtmp can only
5233 // be a VSeq<4> which means we only have 4 scratch slots
5234
5235 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5236 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5237 }
5238
5239 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5240 const VSeq<4>& vc,
5241 const VSeq<4>& vtmp,
5242 const VSeq<2>& vq) {
5243 // compute a = montmul(a1, c)
5244 kyber_montmul32(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
5246 vs_subv(va1, __ T8H, va0, vc);
5247 // and a0 = a0 + a
5248 vs_addv(va0, __ T8H, va0, vc);
5249 }
5250
5251 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5252 const VSeq<4>& vb,
5253 const VSeq<4>& vtmp1,
5254 const VSeq<4>& vtmp2,
5255 const VSeq<2>& vq) {
5256 // compute c = a0 - a1
5257 vs_subv(vtmp1, __ T8H, va0, va1);
5258 // output a0 = a0 + a1
5259 vs_addv(va0, __ T8H, va0, va1);
5260 // output a1 = b montmul c
5261 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5262 }
5263
5264 void load64shorts(const VSeq<8>& v, Register shorts) {
5265 vs_ldpq_post(v, shorts);
5266 }
5267
5268 void load32shorts(const VSeq<4>& v, Register shorts) {
5269 vs_ldpq_post(v, shorts);
5270 }
5271
5272 void store64shorts(VSeq<8> v, Register tmpAddr) {
5273 vs_stpq_post(v, tmpAddr);
5274 }
5275
5276 // Kyber NTT function.
5277 // Implements
5278 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5279 //
5280 // coeffs (short[256]) = c_rarg0
5281 // ntt_zetas (short[256]) = c_rarg1
5282 address generate_kyberNtt() {
5283
5284 __ align(CodeEntryAlignment);
5285 StubId stub_id = StubId::stubgen_kyberNtt_id;
5286 StubCodeMark mark(this, stub_id);
5287 address start = __ pc();
5288 __ enter();
5289
5290 const Register coeffs = c_rarg0;
5291 const Register zetas = c_rarg1;
5292
5293 const Register kyberConsts = r10;
5294 const Register tmpAddr = r11;
5295
5296 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5297 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5298 VSeq<2> vq(30); // n.b. constants overlap vs3
5299
5300 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5301 // load the montmul constants
5302 vs_ldpq(vq, kyberConsts);
5303
5304 // Each level corresponds to an iteration of the outermost loop of the
5305 // Java method seilerNTT(int[] coeffs). There are some differences
5306 // from what is done in the seilerNTT() method, though:
    // 1. The computation uses 16-bit signed values; we do not convert them
    // to ints here.
    // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
    // this array for each level, which makes it easier to fill up the vector
    // registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    // multiplications (that way there should not be any overflow during the
    // inverse NTT computation); here we use R = 2^16 so that we can use
    // 16-bit arithmetic in the vector unit.
5316 //
5317 // On each level, we fill up the vector registers in such a way that the
5318 // array elements that need to be multiplied by the zetas go into one
5319 // set of vector registers while the corresponding ones that don't need to
5320 // be multiplied, go into another set.
5321 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5322 // registers interleaving the steps of 4 identical computations,
5323 // each done on 8 16-bit values per register.
5324
5325 // At levels 0-3 the coefficients multiplied by or added/subtracted
5326 // to the zetas occur in discrete blocks whose size is some multiple
5327 // of 32.
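    //
    // As a scalar sketch of what each level computes (illustrative only,
    // not generated code), every (coefficient, zeta) pairing is one
    // Cooley-Tukey butterfly:
    //   int16_t t = montmul(zeta, coeffs[j + len]);
    //   coeffs[j + len] = (int16_t)(coeffs[j] - t);
    //   coeffs[j]       = (int16_t)(coeffs[j] + t);
    // Each montmul64/subv/addv group below performs 64 such butterflies
    // in parallel.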
5328
5329 // level 0
5330 __ add(tmpAddr, coeffs, 256);
5331 load64shorts(vs1, tmpAddr);
5332 load64shorts(vs2, zetas);
5333 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5334 __ add(tmpAddr, coeffs, 0);
5335 load64shorts(vs1, tmpAddr);
5336 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5337 vs_addv(vs1, __ T8H, vs1, vs2);
5338 __ add(tmpAddr, coeffs, 0);
5339 vs_stpq_post(vs1, tmpAddr);
5340 __ add(tmpAddr, coeffs, 256);
5341 vs_stpq_post(vs3, tmpAddr);
5342 // restore montmul constants
5343 vs_ldpq(vq, kyberConsts);
5344 load64shorts(vs1, tmpAddr);
5345 load64shorts(vs2, zetas);
5346 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5347 __ add(tmpAddr, coeffs, 128);
5348 load64shorts(vs1, tmpAddr);
5349 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5350 vs_addv(vs1, __ T8H, vs1, vs2);
5351 __ add(tmpAddr, coeffs, 128);
5352 store64shorts(vs1, tmpAddr);
5353 __ add(tmpAddr, coeffs, 384);
5354 store64shorts(vs3, tmpAddr);
5355
5356 // level 1
5357 // restore montmul constants
5358 vs_ldpq(vq, kyberConsts);
5359 __ add(tmpAddr, coeffs, 128);
5360 load64shorts(vs1, tmpAddr);
5361 load64shorts(vs2, zetas);
5362 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5363 __ add(tmpAddr, coeffs, 0);
5364 load64shorts(vs1, tmpAddr);
5365 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5366 vs_addv(vs1, __ T8H, vs1, vs2);
5367 __ add(tmpAddr, coeffs, 0);
5368 store64shorts(vs1, tmpAddr);
5369 store64shorts(vs3, tmpAddr);
5370 vs_ldpq(vq, kyberConsts);
5371 __ add(tmpAddr, coeffs, 384);
5372 load64shorts(vs1, tmpAddr);
5373 load64shorts(vs2, zetas);
5374 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5375 __ add(tmpAddr, coeffs, 256);
5376 load64shorts(vs1, tmpAddr);
5377 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5378 vs_addv(vs1, __ T8H, vs1, vs2);
5379 __ add(tmpAddr, coeffs, 256);
5380 store64shorts(vs1, tmpAddr);
5381 store64shorts(vs3, tmpAddr);
5382
5383 // level 2
5384 vs_ldpq(vq, kyberConsts);
5385 int offsets1[4] = { 0, 32, 128, 160 };
5386 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5387 load64shorts(vs2, zetas);
5388 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5389 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5390 // kyber_subv_addv64();
5391 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5392 vs_addv(vs1, __ T8H, vs1, vs2);
5393 __ add(tmpAddr, coeffs, 0);
5394 vs_stpq_post(vs_front(vs1), tmpAddr);
5395 vs_stpq_post(vs_front(vs3), tmpAddr);
5396 vs_stpq_post(vs_back(vs1), tmpAddr);
5397 vs_stpq_post(vs_back(vs3), tmpAddr);
5398 vs_ldpq(vq, kyberConsts);
5399 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5400 load64shorts(vs2, zetas);
5401 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5402 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5403 // kyber_subv_addv64();
5404 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5405 vs_addv(vs1, __ T8H, vs1, vs2);
5406 __ add(tmpAddr, coeffs, 256);
5407 vs_stpq_post(vs_front(vs1), tmpAddr);
5408 vs_stpq_post(vs_front(vs3), tmpAddr);
5409 vs_stpq_post(vs_back(vs1), tmpAddr);
5410 vs_stpq_post(vs_back(vs3), tmpAddr);
5411
5412 // level 3
5413 vs_ldpq(vq, kyberConsts);
5414 int offsets2[4] = { 0, 64, 128, 192 };
5415 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5416 load64shorts(vs2, zetas);
5417 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5418 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5419 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5420 vs_addv(vs1, __ T8H, vs1, vs2);
5421 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5422 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5423
5424 vs_ldpq(vq, kyberConsts);
5425 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5426 load64shorts(vs2, zetas);
5427 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5428 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5429 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5430 vs_addv(vs1, __ T8H, vs1, vs2);
5431 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5432 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5433
5434 // level 4
5435 // At level 4 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
5437
5438 vs_ldpq(vq, kyberConsts);
5439 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5440 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5441 load64shorts(vs2, zetas);
5442 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5443 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5444 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5445 vs_addv(vs1, __ T8H, vs1, vs2);
5446 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5447 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5448
5449 vs_ldpq(vq, kyberConsts);
5450 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5451 load64shorts(vs2, zetas);
5452 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5453 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5454 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5455 vs_addv(vs1, __ T8H, vs1, vs2);
5456 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5457 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5458
5459 // level 5
5460 // At level 5 related coefficients occur in discrete blocks of size 8 so
    // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
5462
5463 vs_ldpq(vq, kyberConsts);
5464 int offsets4[4] = { 0, 32, 64, 96 };
5465 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5466 load32shorts(vs_front(vs2), zetas);
5467 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5468 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5469 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5470 load32shorts(vs_front(vs2), zetas);
5471 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5472 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5473 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5474 load32shorts(vs_front(vs2), zetas);
5475 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5476 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5477
5478 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5479 load32shorts(vs_front(vs2), zetas);
5480 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5481 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5482
5483 // level 6
5484 // At level 6 related coefficients occur in discrete blocks of size 4 so
    // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
5486
5487 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5488 load32shorts(vs_front(vs2), zetas);
5489 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5490 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5491 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5492 // __ ldpq(v18, v19, __ post(zetas, 32));
5493 load32shorts(vs_front(vs2), zetas);
5494 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5495 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5496
5497 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5498 load32shorts(vs_front(vs2), zetas);
5499 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5500 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5501
5502 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5503 load32shorts(vs_front(vs2), zetas);
5504 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5505 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5506
5507 __ leave(); // required for proper stackwalking of RuntimeStub frame
5508 __ mov(r0, zr); // return 0
5509 __ ret(lr);
5510
5511 return start;
5512 }
5513
5514 // Kyber Inverse NTT function
5515 // Implements
5516 // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
5517 //
5518 // coeffs (short[256]) = c_rarg0
5519 // ntt_zetas (short[256]) = c_rarg1
5520 address generate_kyberInverseNtt() {
5521
5522 __ align(CodeEntryAlignment);
5523 StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
5524 StubCodeMark mark(this, stub_id);
5525 address start = __ pc();
5526 __ enter();
5527
5528 const Register coeffs = c_rarg0;
5529 const Register zetas = c_rarg1;
5530
5531 const Register kyberConsts = r10;
5532 const Register tmpAddr = r11;
5533 const Register tmpAddr2 = c_rarg2;
5534
5535 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5536 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5537 VSeq<2> vq(30); // n.b. constants overlap vs3
5538
5539 __ lea(kyberConsts,
5540 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5541
5542 // level 0
5543 // At level 0 related coefficients occur in discrete blocks of size 4 so
    // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
5545
5546 vs_ldpq(vq, kyberConsts);
5547 int offsets4[4] = { 0, 32, 64, 96 };
5548 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5549 load32shorts(vs_front(vs2), zetas);
5550 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5551 vs_front(vs2), vs_back(vs2), vtmp, vq);
5552 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5553 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5554 load32shorts(vs_front(vs2), zetas);
5555 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5556 vs_front(vs2), vs_back(vs2), vtmp, vq);
5557 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5558 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5559 load32shorts(vs_front(vs2), zetas);
5560 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5561 vs_front(vs2), vs_back(vs2), vtmp, vq);
5562 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5563 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5564 load32shorts(vs_front(vs2), zetas);
5565 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5566 vs_front(vs2), vs_back(vs2), vtmp, vq);
5567 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5568
5569 // level 1
5570 // At level 1 related coefficients occur in discrete blocks of size 8 so
    // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
5572
5573 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5574 load32shorts(vs_front(vs2), zetas);
5575 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5576 vs_front(vs2), vs_back(vs2), vtmp, vq);
5577 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5578 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5579 load32shorts(vs_front(vs2), zetas);
5580 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5581 vs_front(vs2), vs_back(vs2), vtmp, vq);
5582 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5583
5584 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5585 load32shorts(vs_front(vs2), zetas);
5586 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5587 vs_front(vs2), vs_back(vs2), vtmp, vq);
5588 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5589 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5590 load32shorts(vs_front(vs2), zetas);
5591 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5592 vs_front(vs2), vs_back(vs2), vtmp, vq);
5593 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5594
5595 // level 2
5596 // At level 2 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded using an ldr at 8 distinct offsets.
5598
5599 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5600 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5601 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
5602 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5603 vs_subv(vs1, __ T8H, vs1, vs2);
5604 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
5605 load64shorts(vs2, zetas);
5606 vs_ldpq(vq, kyberConsts);
5607 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5608 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
5609
5610 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5611 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5612 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5613 vs_subv(vs1, __ T8H, vs1, vs2);
5614 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
5615 load64shorts(vs2, zetas);
5616 vs_ldpq(vq, kyberConsts);
5617 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5618 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5619
5620 // Barrett reduction at indexes where overflow may happen
5621
5622 // load q and the multiplier for the Barrett reduction
5623 __ add(tmpAddr, kyberConsts, 16);
5624 vs_ldpq(vq, tmpAddr);
5625
5626 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
5627 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
5628 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
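    // As a sketch of the reduction below (assuming the constant pair
    // loaded above is <q, barrettMultiplier> with barrettMultiplier
    // approximately 2^26 / q), each 16-bit lane a is updated as
    //   int16_t t = (int16_t)((2 * (int32_t)a * barrettMultiplier) >> 16); // sqdmulh
    //   t = (int16_t)(t >> 11);                                            // sshr
    //   a = (int16_t)(a - t * q);                                          // mlsv
    // i.e. a Barrett reduction of a modulo q.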
5629 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5630 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5631 vs_sshr(vs2, __ T8H, vs2, 11);
5632 vs_mlsv(vs1, __ T8H, vs2, vq1);
5633 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5634 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5635 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5636 vs_sshr(vs2, __ T8H, vs2, 11);
5637 vs_mlsv(vs1, __ T8H, vs2, vq1);
5638 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5639
5640 // level 3
5641 // From level 3 upwards coefficients occur in discrete blocks whose size is
5642 // some multiple of 32 so can be loaded using ldpq and suitable indexes.
5643
5644 int offsets2[4] = { 0, 64, 128, 192 };
5645 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5646 vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
5647 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5648 vs_subv(vs1, __ T8H, vs1, vs2);
5649 vs_stpq_indexed(vs3, coeffs, 0, offsets2);
5650 load64shorts(vs2, zetas);
5651 vs_ldpq(vq, kyberConsts);
5652 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5653 vs_stpq_indexed(vs2, coeffs, 32, offsets2);
5654
5655 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5656 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5657 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5658 vs_subv(vs1, __ T8H, vs1, vs2);
5659 vs_stpq_indexed(vs3, coeffs, 256, offsets2);
5660 load64shorts(vs2, zetas);
5661 vs_ldpq(vq, kyberConsts);
5662 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5663 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5664
5665 // level 4
5666
5667 int offsets1[4] = { 0, 32, 128, 160 };
5668 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5669 vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
5670 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5671 vs_subv(vs1, __ T8H, vs1, vs2);
5672 vs_stpq_indexed(vs3, coeffs, 0, offsets1);
5673 load64shorts(vs2, zetas);
5674 vs_ldpq(vq, kyberConsts);
5675 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5676 vs_stpq_indexed(vs2, coeffs, 64, offsets1);
5677
5678 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5679 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5680 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5681 vs_subv(vs1, __ T8H, vs1, vs2);
5682 vs_stpq_indexed(vs3, coeffs, 256, offsets1);
5683 load64shorts(vs2, zetas);
5684 vs_ldpq(vq, kyberConsts);
5685 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5686 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5687
5688 // level 5
5689
5690 __ add(tmpAddr, coeffs, 0);
5691 load64shorts(vs1, tmpAddr);
5692 __ add(tmpAddr, coeffs, 128);
5693 load64shorts(vs2, tmpAddr);
5694 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5695 vs_subv(vs1, __ T8H, vs1, vs2);
5696 __ add(tmpAddr, coeffs, 0);
5697 store64shorts(vs3, tmpAddr);
5698 load64shorts(vs2, zetas);
5699 vs_ldpq(vq, kyberConsts);
5700 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5701 __ add(tmpAddr, coeffs, 128);
5702 store64shorts(vs2, tmpAddr);
5703
5704 load64shorts(vs1, tmpAddr);
5705 __ add(tmpAddr, coeffs, 384);
5706 load64shorts(vs2, tmpAddr);
5707 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5708 vs_subv(vs1, __ T8H, vs1, vs2);
5709 __ add(tmpAddr, coeffs, 256);
5710 store64shorts(vs3, tmpAddr);
5711 load64shorts(vs2, zetas);
5712 vs_ldpq(vq, kyberConsts);
5713 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5714 __ add(tmpAddr, coeffs, 384);
5715 store64shorts(vs2, tmpAddr);
5716
5717 // Barrett reduction at indexes where overflow may happen
5718
5719 // load q and the multiplier for the Barrett reduction
5720 __ add(tmpAddr, kyberConsts, 16);
5721 vs_ldpq(vq, tmpAddr);
5722
5723 int offsets0[2] = { 0, 256 };
5724 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5725 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5726 vs_sshr(vs2, __ T8H, vs2, 11);
5727 vs_mlsv(vs1, __ T8H, vs2, vq1);
5728 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5729
5730 // level 6
5731
5732 __ add(tmpAddr, coeffs, 0);
5733 load64shorts(vs1, tmpAddr);
5734 __ add(tmpAddr, coeffs, 256);
5735 load64shorts(vs2, tmpAddr);
5736 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5737 vs_subv(vs1, __ T8H, vs1, vs2);
5738 __ add(tmpAddr, coeffs, 0);
5739 store64shorts(vs3, tmpAddr);
5740 load64shorts(vs2, zetas);
5741 vs_ldpq(vq, kyberConsts);
5742 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5743 __ add(tmpAddr, coeffs, 256);
5744 store64shorts(vs2, tmpAddr);
5745
5746 __ add(tmpAddr, coeffs, 128);
5747 load64shorts(vs1, tmpAddr);
5748 __ add(tmpAddr, coeffs, 384);
5749 load64shorts(vs2, tmpAddr);
5750 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5751 vs_subv(vs1, __ T8H, vs1, vs2);
5752 __ add(tmpAddr, coeffs, 128);
5753 store64shorts(vs3, tmpAddr);
5754 load64shorts(vs2, zetas);
5755 vs_ldpq(vq, kyberConsts);
5756 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5757 __ add(tmpAddr, coeffs, 384);
5758 store64shorts(vs2, tmpAddr);
5759
5760 // multiply by 2^-n
5761
5762 // load toMont(2^-n mod q)
5763 __ add(tmpAddr, kyberConsts, 48);
5764 __ ldr(v29, __ Q, tmpAddr);
5765
5766 vs_ldpq(vq, kyberConsts);
5767 __ add(tmpAddr, coeffs, 0);
5768 load64shorts(vs1, tmpAddr);
5769 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5770 __ add(tmpAddr, coeffs, 0);
5771 store64shorts(vs2, tmpAddr);
5772
    // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
5774 load64shorts(vs1, tmpAddr);
5775 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5776 __ add(tmpAddr, coeffs, 128);
5777 store64shorts(vs2, tmpAddr);
5778
5779 // now tmpAddr contains coeffs + 256
5780 load64shorts(vs1, tmpAddr);
5781 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5782 __ add(tmpAddr, coeffs, 256);
5783 store64shorts(vs2, tmpAddr);
5784
5785 // now tmpAddr contains coeffs + 384
5786 load64shorts(vs1, tmpAddr);
5787 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5788 __ add(tmpAddr, coeffs, 384);
5789 store64shorts(vs2, tmpAddr);
5790
5791 __ leave(); // required for proper stackwalking of RuntimeStub frame
5792 __ mov(r0, zr); // return 0
5793 __ ret(lr);
5794
5795 return start;
5796 }
5797
5798 // Kyber multiply polynomials in the NTT domain.
5799 // Implements
5800 // static int implKyberNttMult(
5801 // short[] result, short[] ntta, short[] nttb, short[] zetas) {}
5802 //
5803 // result (short[256]) = c_rarg0
5804 // ntta (short[256]) = c_rarg1
5805 // nttb (short[256]) = c_rarg2
5806 // zetas (short[128]) = c_rarg3
5807 address generate_kyberNttMult() {
5808
5809 __ align(CodeEntryAlignment);
5810 StubId stub_id = StubId::stubgen_kyberNttMult_id;
5811 StubCodeMark mark(this, stub_id);
5812 address start = __ pc();
5813 __ enter();
5814
5815 const Register result = c_rarg0;
5816 const Register ntta = c_rarg1;
5817 const Register nttb = c_rarg2;
5818 const Register zetas = c_rarg3;
5819
5820 const Register kyberConsts = r10;
5821 const Register limit = r11;
5822
5823 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
5824 VSeq<4> vs3(16), vs4(20);
5825 VSeq<2> vq(30); // pair of constants for montmul: q, qinv
5826 VSeq<2> vz(28); // pair of zetas
5827 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
5828
5829 __ lea(kyberConsts,
5830 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5831
5832 Label kyberNttMult_loop;
5833
5834 __ add(limit, result, 512);
5835
5836 // load q and qinv
5837 vs_ldpq(vq, kyberConsts);
5838
5839 // load R^2 mod q (to convert back from Montgomery representation)
5840 __ add(kyberConsts, kyberConsts, 64);
5841 __ ldr(v27, __ Q, kyberConsts);
5842
5843 __ BIND(kyberNttMult_loop);
5844
5845 // load 16 zetas
5846 vs_ldpq_post(vz, zetas);
5847
5848 // load 2 sets of 32 coefficients from the two input arrays
5849 // interleaved as shorts. i.e. pairs of shorts adjacent in memory
5850 // are striped across pairs of vector registers
5851 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
5852 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
5853 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
5854 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
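    // For instance, after the first ld2 above vs1[0] holds the even-indexed
    // shorts of ntta (the a0 of each <a0, a1> pair) and vs1[1] holds the
    // odd-indexed shorts (the a1 of each pair).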
5855
5856 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
5857 // i.e. montmul the first and second halves of vs1 in order and
5858 // then with one sequence reversed storing the two results in vs3
5859 //
5860 // vs3[0] <- montmul(a0, b0)
5861 // vs3[1] <- montmul(a1, b1)
5862 // vs3[2] <- montmul(a0, b1)
5863 // vs3[3] <- montmul(a1, b0)
5864 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
5865 kyber_montmul16(vs_back(vs3),
5866 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
5867
5868 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
5869 // i.e. montmul the first and second halves of vs4 in order and
5870 // then with one sequence reversed storing the two results in vs1
5871 //
5872 // vs1[0] <- montmul(a2, b2)
5873 // vs1[1] <- montmul(a3, b3)
5874 // vs1[2] <- montmul(a2, b3)
5875 // vs1[3] <- montmul(a3, b2)
5876 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
5877 kyber_montmul16(vs_back(vs1),
5878 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
5879
5880 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
5881 // We can schedule two montmuls at a time if we use a suitable vector
5882 // sequence <vs3[1], vs1[1]>.
5883 int delta = vs1[1]->encoding() - vs3[1]->encoding();
5884 VSeq<2> vs5(vs3[1], delta);
5885
5886 // vs3[1] <- montmul(montmul(a1, b1), z0)
5887 // vs1[1] <- montmul(montmul(a3, b3), z1)
5888 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
5889
5890 // add results in pairs storing in vs3
5891 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
5892 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
5893 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
5894
5895 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
5896 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
5897 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
5898
5899 // vs1 <- montmul(vs3, montRSquareModQ)
5900 kyber_montmul32(vs1, vs3, vc, vs2, vq);
5901
5902 // store back the two pairs of result vectors de-interleaved as 8H elements
    // i.e. storing each pair of shorts striped across a register pair adjacent
5904 // in memory
5905 vs_st2_post(vs1, __ T8H, result);
5906
5907 __ cmp(result, limit);
5908 __ br(Assembler::NE, kyberNttMult_loop);
5909
5910 __ leave(); // required for proper stackwalking of RuntimeStub frame
5911 __ mov(r0, zr); // return 0
5912 __ ret(lr);
5913
5914 return start;
5915 }
5916
5917 // Kyber add 2 polynomials.
5918 // Implements
5919 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
5920 //
5921 // result (short[256]) = c_rarg0
5922 // a (short[256]) = c_rarg1
5923 // b (short[256]) = c_rarg2
5924 address generate_kyberAddPoly_2() {
5925
5926 __ align(CodeEntryAlignment);
5927 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
5928 StubCodeMark mark(this, stub_id);
5929 address start = __ pc();
5930 __ enter();
5931
5932 const Register result = c_rarg0;
5933 const Register a = c_rarg1;
5934 const Register b = c_rarg2;
5935
5936 const Register kyberConsts = r11;
5937
5938 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
5939 // So, we can load, add and store the data in 3 groups of 11,
5940 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
5941 // registers. A further constraint is that the mapping needs
5942 // to skip callee saves. So, we allocate the register
5943 // sequences using two 8 sequences, two 2 sequences and two
5944 // single registers.
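    // (Each pass therefore covers 8 + 2 + 1 = 11 quadwords, except the
    // last which covers 8 + 2 = 10, for a total of 11 + 11 + 10 = 32.)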
5945 VSeq<8> vs1_1(0);
5946 VSeq<2> vs1_2(16);
5947 FloatRegister vs1_3 = v28;
5948 VSeq<8> vs2_1(18);
5949 VSeq<2> vs2_2(26);
5950 FloatRegister vs2_3 = v29;
5951
5952 // two constant vector sequences
5953 VSeq<8> vc_1(31, 0);
5954 VSeq<2> vc_2(31, 0);
5955
5956 FloatRegister vc_3 = v31;
5957 __ lea(kyberConsts,
5958 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5959
5960 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
5961 for (int i = 0; i < 3; i++) {
5962 // load 80 or 88 values from a into vs1_1/2/3
5963 vs_ldpq_post(vs1_1, a);
5964 vs_ldpq_post(vs1_2, a);
5965 if (i < 2) {
5966 __ ldr(vs1_3, __ Q, __ post(a, 16));
5967 }
5968 // load 80 or 88 values from b into vs2_1/2/3
5969 vs_ldpq_post(vs2_1, b);
5970 vs_ldpq_post(vs2_2, b);
5971 if (i < 2) {
5972 __ ldr(vs2_3, __ Q, __ post(b, 16));
5973 }
5974 // sum 80 or 88 values across vs1 and vs2 into vs1
5975 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
5976 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
5977 if (i < 2) {
5978 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
5979 }
5980 // add constant to all 80 or 88 results
5981 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
5982 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
5983 if (i < 2) {
5984 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
5985 }
5986 // store 80 or 88 values
5987 vs_stpq_post(vs1_1, result);
5988 vs_stpq_post(vs1_2, result);
5989 if (i < 2) {
5990 __ str(vs1_3, __ Q, __ post(result, 16));
5991 }
5992 }
5993
5994 __ leave(); // required for proper stackwalking of RuntimeStub frame
5995 __ mov(r0, zr); // return 0
5996 __ ret(lr);
5997
5998 return start;
5999 }
6000
6001 // Kyber add 3 polynomials.
6002 // Implements
6003 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
6004 //
6005 // result (short[256]) = c_rarg0
6006 // a (short[256]) = c_rarg1
6007 // b (short[256]) = c_rarg2
6008 // c (short[256]) = c_rarg3
6009 address generate_kyberAddPoly_3() {
6010
6011 __ align(CodeEntryAlignment);
6012 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
6013 StubCodeMark mark(this, stub_id);
6014 address start = __ pc();
6015 __ enter();
6016
6017 const Register result = c_rarg0;
6018 const Register a = c_rarg1;
6019 const Register b = c_rarg2;
6020 const Register c = c_rarg3;
6021
6022 const Register kyberConsts = r11;
6023
6024 // As above we sum 256 sets of values in total i.e. 32 x 8H
6025 // quadwords. So, we can load, add and store the data in 3
6026 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6027 // of 10 or 11 registers. A further constraint is that the
6028 // mapping needs to skip callee saves. So, we allocate the
6029 // register sequences using two 8 sequences, two 2 sequences
6030 // and two single registers.
6031 VSeq<8> vs1_1(0);
6032 VSeq<2> vs1_2(16);
6033 FloatRegister vs1_3 = v28;
6034 VSeq<8> vs2_1(18);
6035 VSeq<2> vs2_2(26);
6036 FloatRegister vs2_3 = v29;
6037
6038 // two constant vector sequences
6039 VSeq<8> vc_1(31, 0);
6040 VSeq<2> vc_2(31, 0);
6041
6042 FloatRegister vc_3 = v31;
6043
6044 __ lea(kyberConsts,
6045 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6046
6047 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6048 for (int i = 0; i < 3; i++) {
6049 // load 80 or 88 values from a into vs1_1/2/3
6050 vs_ldpq_post(vs1_1, a);
6051 vs_ldpq_post(vs1_2, a);
6052 if (i < 2) {
6053 __ ldr(vs1_3, __ Q, __ post(a, 16));
6054 }
6055 // load 80 or 88 values from b into vs2_1/2/3
6056 vs_ldpq_post(vs2_1, b);
6057 vs_ldpq_post(vs2_2, b);
6058 if (i < 2) {
6059 __ ldr(vs2_3, __ Q, __ post(b, 16));
6060 }
6061 // sum 80 or 88 values across vs1 and vs2 into vs1
6062 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6063 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6064 if (i < 2) {
6065 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6066 }
6067 // load 80 or 88 values from c into vs2_1/2/3
6068 vs_ldpq_post(vs2_1, c);
6069 vs_ldpq_post(vs2_2, c);
6070 if (i < 2) {
6071 __ ldr(vs2_3, __ Q, __ post(c, 16));
6072 }
6073 // sum 80 or 88 values across vs1 and vs2 into vs1
6074 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6075 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6076 if (i < 2) {
6077 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6078 }
6079 // add constant to all 80 or 88 results
6080 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6081 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6082 if (i < 2) {
6083 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6084 }
6085 // store 80 or 88 values
6086 vs_stpq_post(vs1_1, result);
6087 vs_stpq_post(vs1_2, result);
6088 if (i < 2) {
6089 __ str(vs1_3, __ Q, __ post(result, 16));
6090 }
6091 }
6092
6093 __ leave(); // required for proper stackwalking of RuntimeStub frame
6094 __ mov(r0, zr); // return 0
6095 __ ret(lr);
6096
6097 return start;
6098 }
6099
6100 // Kyber parse XOF output to polynomial coefficient candidates
6101 // or decodePoly(12, ...).
6102 // Implements
6103 // static int implKyber12To16(
6104 // byte[] condensed, int index, short[] parsed, int parsedLength) {}
6105 //
6106 // (parsedLength or (parsedLength - 48) must be divisible by 64.)
6107 //
6108 // condensed (byte[]) = c_rarg0
6109 // condensedIndex = c_rarg1
6110 // parsed (short[112 or 256]) = c_rarg2
6111 // parsedLength (112 or 256) = c_rarg3
6112 address generate_kyber12To16() {
6113 Label L_F00, L_loop, L_end;
6114
6115 __ align(CodeEntryAlignment);
6116 StubId stub_id = StubId::stubgen_kyber12To16_id;
6117 StubCodeMark mark(this, stub_id);
6118 address start = __ pc();
6119 __ enter();
6120
6121 const Register condensed = c_rarg0;
6122 const Register condensedOffs = c_rarg1;
6123 const Register parsed = c_rarg2;
6124 const Register parsedLength = c_rarg3;
6125
6126 const Register tmpAddr = r11;
6127
6128 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
6129 // quadwords so we need a 6 vector sequence for the inputs.
6130 // Parsing produces 64 shorts, employing two 8 vector
6131 // sequences to store and combine the intermediate data.
6132 VSeq<6> vin(24);
6133 VSeq<8> va(0), vb(16);
6134
6135 __ adr(tmpAddr, L_F00);
6136 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6137 __ add(condensed, condensed, condensedOffs);
6138
6139 __ BIND(L_loop);
6140 // load 96 (6 x 16B) byte values
6141 vs_ld3_post(vin, __ T16B, condensed);
6142
6143 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6144 // holds 48 (16x3) contiguous bytes from memory striped
6145 // horizontally across each of the 16 byte lanes. Equivalently,
6146 // that is 16 pairs of 12-bit integers. Likewise the back half
6147 // holds the next 48 bytes in the same arrangement.
6148
6149 // Each vector in the front half can also be viewed as a vertical
6150 // strip across the 16 pairs of 12 bit integers. Each byte in
6151 // vin[0] stores the low 8 bits of the first int in a pair. Each
6152 // byte in vin[1] stores the high 4 bits of the first int and the
6153 // low 4 bits of the second int. Each byte in vin[2] stores the
6154 // high 8 bits of the second int. Likewise the vectors in second
6155 // half.
6156
6157 // Converting the data to 16-bit shorts requires first of all
6158 // expanding each of the 6 x 16B vectors into 6 corresponding
6159 // pairs of 8H vectors. Mask, shift and add operations on the
6160 // resulting vector pairs can be used to combine 4 and 8 bit
6161 // parts of related 8H vector elements.
6162 //
    // The middle vectors (vin[1] and vin[4]) are actually expanded
    // twice, one copy manipulated to supply the high 4 bits of the
    // first short in a pair (the low nibble of the middle byte) and
    // the other copy manipulated to supply the low 4 bits of the
    // second short (its high nibble). This is why the vector sequences
    // va and vb used to hold the expanded 8H elements are of length 8.
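    //
    // As a scalar sketch (illustrative only, not generated code), each
    // 3-byte group (b0, b1, b2) decodes to two 12-bit values:
    //   int16_t s0 = (int16_t)(b0 | ((b1 & 0x0f) << 8));  // low 8 bits + high 4 bits
    //   int16_t s1 = (int16_t)((b1 >> 4) | (b2 << 4));    // low 4 bits + high 8 bits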
6169
6170 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6171 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6172 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6173 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6174 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6175 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6176 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6177 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6178
6179 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6180 // and vb[4:5]
6181 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6182 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6183 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6184 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6185 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6186 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6187
6188 // shift lo byte of copy 1 of the middle stripe into the high byte
6189 __ shl(va[2], __ T8H, va[2], 8);
6190 __ shl(va[3], __ T8H, va[3], 8);
6191 __ shl(vb[2], __ T8H, vb[2], 8);
6192 __ shl(vb[3], __ T8H, vb[3], 8);
6193
6194 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6195 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6196 // are in bit positions [4..11].
6197 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6198 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6199 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6200 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
6201
6202 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
6203 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
6204 // copy2
6205 __ andr(va[2], __ T16B, va[2], v31);
6206 __ andr(va[3], __ T16B, va[3], v31);
6207 __ ushr(va[4], __ T8H, va[4], 4);
6208 __ ushr(va[5], __ T8H, va[5], 4);
6209 __ andr(vb[2], __ T16B, vb[2], v31);
6210 __ andr(vb[3], __ T16B, vb[3], v31);
6211 __ ushr(vb[4], __ T8H, vb[4], 4);
6212 __ ushr(vb[5], __ T8H, vb[5], 4);
6213
6214 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
6215 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
6216 // n.b. the ordering ensures: i) inputs are consumed before they
6217 // are overwritten ii) the order of 16-bit results across successive
6218 // pairs of vectors in va and then vb reflects the order of the
6219 // corresponding 12-bit inputs
6220 __ addv(va[0], __ T8H, va[0], va[2]);
6221 __ addv(va[2], __ T8H, va[1], va[3]);
6222 __ addv(va[1], __ T8H, va[4], va[6]);
6223 __ addv(va[3], __ T8H, va[5], va[7]);
6224 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6225 __ addv(vb[2], __ T8H, vb[1], vb[3]);
6226 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6227 __ addv(vb[3], __ T8H, vb[5], vb[7]);
6228
6229 // store 64 results interleaved as shorts
6230 vs_st2_post(vs_front(va), __ T8H, parsed);
6231 vs_st2_post(vs_front(vb), __ T8H, parsed);
6232
6233 __ sub(parsedLength, parsedLength, 64);
6234 __ cmp(parsedLength, (u1)64);
6235 __ br(Assembler::GE, L_loop);
6236 __ cbz(parsedLength, L_end);
6237
    // If anything is left it should be a final 72 bytes of input,
    // i.e. a final 48 12-bit values. So we handle this by loading
    // 48 bytes into all 16B lanes of front(vin) and only 24
    // bytes into the lower 8B lanes of back(vin).
6242 vs_ld3_post(vs_front(vin), __ T16B, condensed);
6243 vs_ld3(vs_back(vin), __ T8B, condensed);
6244
6245 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6246 // n.b. target elements 2 and 3 of va duplicate elements 4 and
6247 // 5 and target element 2 of vb duplicates element 4.
6248 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6249 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6250 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6251 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6252 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6253 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6254
6255 // This time expand just the lower 8 lanes
6256 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6257 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6258 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6259
6260 // shift lo byte of copy 1 of the middle stripe into the high byte
6261 __ shl(va[2], __ T8H, va[2], 8);
6262 __ shl(va[3], __ T8H, va[3], 8);
6263 __ shl(vb[2], __ T8H, vb[2], 8);
6264
6265 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
6266 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
6267 // int are in bit positions [4..11].
6268 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6269 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6270 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6271
6272 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
6273 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
6274 // copy2
6275 __ andr(va[2], __ T16B, va[2], v31);
6276 __ andr(va[3], __ T16B, va[3], v31);
6277 __ ushr(va[4], __ T8H, va[4], 4);
6278 __ ushr(va[5], __ T8H, va[5], 4);
6279 __ andr(vb[2], __ T16B, vb[2], v31);
6280 __ ushr(vb[4], __ T8H, vb[4], 4);
6281

    // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in a pair and
    // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in a pair
    // n.b. the ordering ensures: i) inputs are consumed before they are
    // overwritten ii) the order of 16-bit results across successive
    // pairs of vectors in va and then the lower half of vb reflects the
    // order of the corresponding 12-bit inputs
6291 __ addv(va[0], __ T8H, va[0], va[2]);
6292 __ addv(va[2], __ T8H, va[1], va[3]);
6293 __ addv(va[1], __ T8H, va[4], va[6]);
6294 __ addv(va[3], __ T8H, va[5], va[7]);
6295 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6296 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6297
6298 // store 48 results interleaved as shorts
6299 vs_st2_post(vs_front(va), __ T8H, parsed);
6300 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
6301
6302 __ BIND(L_end);
6303
6304 __ leave(); // required for proper stackwalking of RuntimeStub frame
6305 __ mov(r0, zr); // return 0
6306 __ ret(lr);
6307
6308 // bind label and generate constant data used by this stub
6309 __ BIND(L_F00);
6310 __ emit_int64(0x0f000f000f000f00);
6311 __ emit_int64(0x0f000f000f000f00);
6312
6313 return start;
6314 }
6315
6316 // Kyber Barrett reduce function.
6317 // Implements
6318 // static int implKyberBarrettReduce(short[] coeffs) {}
6319 //
6320 // coeffs (short[256]) = c_rarg0
6321 address generate_kyberBarrettReduce() {
6322
6323 __ align(CodeEntryAlignment);
6324 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6325 StubCodeMark mark(this, stub_id);
6326 address start = __ pc();
6327 __ enter();
6328
6329 const Register coeffs = c_rarg0;
6330
6331 const Register kyberConsts = r10;
6332 const Register result = r11;
6333
    // As above we process 256 16-bit coefficients in total i.e. 32 x
    // 8H quadwords. So, we can load, reduce and store the data in 3
    // groups of 11, 11 and 10 quadwords at a time i.e. we need to map
    // sets of 10 or 11 registers. A further constraint is that the
    // mapping needs to skip callee saves. So, we allocate the
    // register sequences using two 8 sequences, two 2 sequences
    // and two single registers.
6341 VSeq<8> vs1_1(0);
6342 VSeq<2> vs1_2(16);
6343 FloatRegister vs1_3 = v28;
6344 VSeq<8> vs2_1(18);
6345 VSeq<2> vs2_2(26);
6346 FloatRegister vs2_3 = v29;
6347
6348 // we also need a pair of corresponding constant sequences
6349
6350 VSeq<8> vc1_1(30, 0);
6351 VSeq<2> vc1_2(30, 0);
6352 FloatRegister vc1_3 = v30; // for kyber_q
6353
6354 VSeq<8> vc2_1(31, 0);
6355 VSeq<2> vc2_2(31, 0);
6356 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
6357
6358 __ add(result, coeffs, 0);
6359 __ lea(kyberConsts,
6360 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6361
6362 // load q and the multiplier for the Barrett reduction
6363 __ add(kyberConsts, kyberConsts, 16);
6364 __ ldpq(vc1_3, vc2_3, kyberConsts);
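
    // As a scalar sketch, each 16-bit lane below goes through a standard
    // Barrett reduction (constants as loaded above from _kyberConsts,
    // with kyber_q == 3329; the shift count mirrors the sqdmulh/sshr
    // sequence in the loop):
    //
    //   short barrettReduce(short a) {
    //     int t = (a * barrettMultiplier) >> 26; // quotient estimate
    //     return (short)(a - t * kyber_q);       // reduced representative of a mod q
    //   }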
6365
6366 for (int i = 0; i < 3; i++) {
6367 // load 80 or 88 coefficients
6368 vs_ldpq_post(vs1_1, coeffs);
6369 vs_ldpq_post(vs1_2, coeffs);
6370 if (i < 2) {
6371 __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
6372 }
6373
6374 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
6375 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
6376 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
6377 if (i < 2) {
6378 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
6379 }
6380
6381 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
6382 vs_sshr(vs2_1, __ T8H, vs2_1, 11);
6383 vs_sshr(vs2_2, __ T8H, vs2_2, 11);
6384 if (i < 2) {
6385 __ sshr(vs2_3, __ T8H, vs2_3, 11);
6386 }
6387
6388 // vs1 <- vs1 - vs2 * kyber_q
6389 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
6390 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
6391 if (i < 2) {
6392 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
6393 }
6394
6395 vs_stpq_post(vs1_1, result);
6396 vs_stpq_post(vs1_2, result);
6397 if (i < 2) {
6398 __ str(vs1_3, __ Q, __ post(result, 16));
6399 }
6400 }
6401
6402 __ leave(); // required for proper stackwalking of RuntimeStub frame
6403 __ mov(r0, zr); // return 0
6404 __ ret(lr);
6405
6406 return start;
6407 }
6408
6409
6410 // Dilithium-specific montmul helper routines that generate parallel
6411 // code for, respectively, a single 4x4s vector sequence montmul or
6412 // two such multiplies in a row.
6413
6414 // Perform 16 32-bit Montgomery multiplications in parallel
6415 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
6416 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6417 // Use the helper routine to schedule a 4x4S Montgomery multiply.
6418 // It will assert that the register use is valid
6419 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
6420 }
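
  // As a scalar sketch (one common formulation; constant names are
  // illustrative), each lane of a montmul computes the signed Montgomery
  // product a * b * 2^-32 mod q, with q == 8380417:
  //
  //   int montMul(int a, int b) {
  //     long ab = (long)a * b;
  //     int m = (int)ab * qInv;                  // qInv == q^-1 mod 2^32
  //     return (int)((ab - (long)m * q) >> 32);  // == a * b * 2^-32 (mod q)
  //   }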
6421
6422 // Perform 2x16 32-bit Montgomery multiplications in parallel
6423 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
6424 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6425 // Schedule two successive 4x4S multiplies via the montmul helper
6426 // on the front and back halves of va, vb and vc. The helper will
6427 // assert that the register use has no overlap conflicts on each
6428 // individual call but we also need to ensure that the necessary
6429 // disjoint/equality constraints are met across both calls.
6430
6431 // vb, vc, vtmp and vq must be disjoint. va must either be
6432 // disjoint from all other registers or equal vc
6433
6434 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6435 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6436 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6437
6438 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6439 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6440
6441 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6442
6443 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6444 assert(vs_disjoint(va, vb), "va and vb overlap");
6445 assert(vs_disjoint(va, vq), "va and vq overlap");
6446 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6447
6448 // We multiply the front and back halves of each sequence 4 at a
6449 // time because
6450 //
6451 // 1) we are currently only able to get 4-way instruction
6452 // parallelism at best
6453 //
6454 // 2) we need registers for the constants in vq and temporary
6455 // scratch registers to hold intermediate results so vtmp can only
6456 // be a VSeq<4> which means we only have 4 scratch slots.
6457
6458 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6459 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6460 }
6461
6462 // Perform combined montmul then add/sub on 4x4S vectors.
6463 void dilithium_montmul16_sub_add(
6464 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6465 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6466 // compute a = montmul(a1, c)
6467 dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
6469 vs_subv(va1, __ T4S, va0, vc);
6470 // and a0 = a0 + a
6471 vs_addv(va0, __ T4S, va0, vc);
6472 }
6473
  // Perform combined add/sub then montmul on 4x4S vectors.
6475 void dilithium_sub_add_montmul16(
6476 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6477 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6478 // compute c = a0 - a1
6479 vs_subv(vtmp1, __ T4S, va0, va1);
6480 // output a0 = a0 + a1
6481 vs_addv(va0, __ T4S, va0, va1);
6482 // output a1 = b montmul c
6483 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6484 }
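
  // Taken together, the two helpers above implement the forward
  // (Cooley-Tukey) and inverse (Gentleman-Sande) NTT butterflies on 16
  // lanes at a time. As a scalar sketch per lane:
  //
  //   // montmul16_sub_add:  t = montMul(a1, zeta); a1 = a0 - t; a0 = a0 + t;
  //   // sub_add_montmul16:  t = a0 - a1; a0 = a0 + a1; a1 = montMul(t, zeta);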
6485
6486 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6487 // in the Java implementation come in sequences of at least 8, so we
6488 // can use ldpq to collect the corresponding data into pairs of vector
6489 // registers.
6490 // We collect the coefficients corresponding to the 'j+l' indexes into
6491 // the vector registers v0-v7, the zetas into the vector registers v16-v23
6492 // then we do the (Montgomery) multiplications by the zetas in parallel
6493 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6494 // v0-v7, then do the additions into v24-v31 and the subtractions into
6495 // v0-v7 and finally save the results back to the coeffs array.
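  //
  // Schematically this parallelizes the inner loop of the Java version,
  // which per butterfly does roughly
  //
  //   int t = montMul(zeta, coeffs[j + l]);
  //   coeffs[j + l] = coeffs[j] - t;
  //   coeffs[j]     = coeffs[j] + t;
  //
  // for 32 (j, j + l) index pairs per call of dilithium_montmul32.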
6496 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6497 const Register coeffs, const Register zetas) {
6498 int c1 = 0;
6499 int c2 = 512;
6500 int startIncr;
6501 // don't use callee save registers v8 - v15
6502 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6503 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6504 VSeq<2> vq(30); // n.b. constants overlap vs3
6505 int offsets[4] = { 0, 32, 64, 96 };
6506
6507 for (int level = 0; level < 5; level++) {
6508 int c1Start = c1;
6509 int c2Start = c2;
6510 if (level == 3) {
6511 offsets[1] = 32;
6512 offsets[2] = 128;
6513 offsets[3] = 160;
6514 } else if (level == 4) {
6515 offsets[1] = 64;
6516 offsets[2] = 128;
6517 offsets[3] = 192;
6518 }
6519
      // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
6521 // time at 4 different offsets and multiply them in order by the
6522 // next set of input values. So we employ indexed load and store
6523 // pair instructions with arrangement 4S.
6524 for (int i = 0; i < 4; i++) {
6525 // reload q and qinv
6526 vs_ldpq(vq, dilithiumConsts); // qInv, q
6527 // load 8x4S coefficients via second start pos == c2
6528 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
6529 // load next 8x4S inputs == b
6530 vs_ldpq_post(vs2, zetas);
6531 // compute a == c2 * b mod MONT_Q
6532 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6533 // load 8x4s coefficients via first start pos == c1
6534 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6535 // compute a1 = c1 + a
6536 vs_addv(vs3, __ T4S, vs1, vs2);
6537 // compute a2 = c1 - a
6538 vs_subv(vs1, __ T4S, vs1, vs2);
6539 // output a1 and a2
6540 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6541 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
6542
6543 int k = 4 * level + i;
6544
6545 if (k > 7) {
6546 startIncr = 256;
6547 } else if (k == 5) {
6548 startIncr = 384;
6549 } else {
6550 startIncr = 128;
6551 }
6552
6553 c1Start += startIncr;
6554 c2Start += startIncr;
6555 }
6556
6557 c2 /= 2;
6558 }
6559 }
6560
6561 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
6562 // Implements the method
6563 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
  // of the sun.security.provider.ML_DSA class.
6565 //
6566 // coeffs (int[256]) = c_rarg0
6567 // zetas (int[256]) = c_rarg1
6568 address generate_dilithiumAlmostNtt() {
6569
6570 __ align(CodeEntryAlignment);
6571 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
6572 StubCodeMark mark(this, stub_id);
6573 address start = __ pc();
6574 __ enter();
6575
6576 const Register coeffs = c_rarg0;
6577 const Register zetas = c_rarg1;
6578
6579 const Register tmpAddr = r9;
6580 const Register dilithiumConsts = r10;
6581 const Register result = r11;
6582 // don't use callee save registers v8 - v15
6583 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6584 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6585 VSeq<2> vq(30); // n.b. constants overlap vs3
6586 int offsets[4] = { 0, 32, 64, 96};
6587 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6588 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6589 __ add(result, coeffs, 0);
6590 __ lea(dilithiumConsts,
6591 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6592
6593 // Each level represents one iteration of the outer for loop of the Java version.
6594
6595 // level 0-4
6596 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
6597
6598 // level 5
6599
6600 // At level 5 the coefficients we need to combine with the zetas
6601 // are grouped in memory in blocks of size 4. So, for both sets of
6602 // coefficients we load 4 adjacent values at 8 different offsets
6603 // using an indexed ldr with register variant Q and multiply them
6604 // in sequence order by the next set of inputs. Likewise we store
    // the results using an indexed str with register variant Q.
6606 for (int i = 0; i < 1024; i += 256) {
6607 // reload constants q, qinv each iteration as they get clobbered later
6608 vs_ldpq(vq, dilithiumConsts); // qInv, q
6609 // load 32 (8x4S) coefficients via first offsets = c1
6610 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6611 // load next 32 (8x4S) inputs = b
6612 vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
6614 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6615 // load 32 (8x4S) coefficients via second offsets = c2
6616 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
      // add/sub the montmul result with the second set of coefficients
      vs_addv(vs3, __ T4S, vs1, vs2); // a0 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2); // a1 = c2 - a
6620 // write back new coefficients using same offsets
6621 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
6622 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
6623 }
6624
6625 // level 6
6626 // At level 6 the coefficients we need to combine with the zetas
    // are grouped in memory in pairs, with one pair of each quartet
    // supplying the montmul inputs and the other pair the add/sub
    // inputs. We can still implement
6629 // the montmul+sub+add using 4-way parallelism but only if we
6630 // combine the coefficients with the zetas 16 at a time. We load 8
6631 // adjacent values at 4 different offsets using an ld2 load with
6632 // arrangement 2D. That interleaves the lower and upper halves of
6633 // each pair of quadwords into successive vector registers. We
6634 // then need to montmul the 4 even elements of the coefficients
6635 // register sequence by the zetas in order and then add/sub the 4
6636 // odd elements of the coefficients register sequence. We use an
6637 // equivalent st2 operation to store the results back into memory
6638 // de-interleaved.
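    // As an illustrative example, for one pair of quadwords holding
    // doublewords d0, d1, d2, d3 (each doubleword being two adjacent
    // coefficients), the ld2/T2D load de-interleaves them so that one
    // register holds { d0, d2 } and its partner holds { d1, d3 }; the
    // matching st2 re-interleaves them on the way back to memory.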
6639 for (int i = 0; i < 1024; i += 128) {
6640 // reload constants q, qinv each iteration as they get clobbered later
6641 vs_ldpq(vq, dilithiumConsts); // qInv, q
6642 // load interleaved 16 (4x2D) coefficients via offsets
6643 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6644 // load next 16 (4x4S) inputs
6645 vs_ldpq_post(vs_front(vs2), zetas);
6646 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6647 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6648 vs_front(vs2), vtmp, vq);
6649 // store interleaved 16 (4x2D) coefficients via offsets
6650 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6651 }
6652
6653 // level 7
6654 // At level 7 the coefficients we need to combine with the zetas
    // occur singly with montmul inputs alternating with add/sub
6656 // inputs. Once again we can use 4-way parallelism to combine 16
6657 // zetas at a time. However, we have to load 8 adjacent values at
6658 // 4 different offsets using an ld2 load with arrangement 4S. That
    // interleaves the odd words of each pair into one
6660 // coefficients vector register and the even words of the pair
6661 // into the next register. We then need to montmul the 4 even
6662 // elements of the coefficients register sequence by the zetas in
6663 // order and then add/sub the 4 odd elements of the coefficients
6664 // register sequence. We use an equivalent st2 operation to store
6665 // the results back into memory de-interleaved.
6666
6667 for (int i = 0; i < 1024; i += 128) {
6668 // reload constants q, qinv each iteration as they get clobbered later
6669 vs_ldpq(vq, dilithiumConsts); // qInv, q
6670 // load interleaved 16 (4x4S) coefficients via offsets
6671 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6672 // load next 16 (4x4S) inputs
6673 vs_ldpq_post(vs_front(vs2), zetas);
6674 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6675 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6676 vs_front(vs2), vtmp, vq);
6677 // store interleaved 16 (4x4S) coefficients via offsets
6678 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6679 }
6680 __ leave(); // required for proper stackwalking of RuntimeStub frame
6681 __ mov(r0, zr); // return 0
6682 __ ret(lr);
6683
6684 return start;
6685 }
6686
6687 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6688 // in the Java implementation come in sequences of at least 8, so we
6689 // can use ldpq to collect the corresponding data into pairs of vector
6690 // registers
  // We collect the coefficients that correspond to the 'j's into vs1,
  // the coefficients that correspond to the 'j+l's into vs2, then
6693 // do the additions into vs3 and the subtractions into vs1 then
6694 // save the result of the additions, load the zetas into vs2
6695 // do the (Montgomery) multiplications by zeta in parallel into vs2
6696 // finally save the results back to the coeffs array
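  //
  // Schematically this parallelizes the inner loop of the Java version,
  // which per butterfly does roughly
  //
  //   int t = coeffs[j];
  //   coeffs[j]     = t + coeffs[j + l];
  //   coeffs[j + l] = montMul(t - coeffs[j + l], zeta);
  //
  // for 32 (j, j + l) index pairs per call of dilithium_montmul32.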
6697 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
6698 const Register coeffs, const Register zetas) {
6699 int c1 = 0;
6700 int c2 = 32;
6701 int startIncr;
6702 int offsets[4];
6703 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6704 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6705 VSeq<2> vq(30); // n.b. constants overlap vs3
6706
6707 offsets[0] = 0;
6708
6709 for (int level = 3; level < 8; level++) {
6710 int c1Start = c1;
6711 int c2Start = c2;
6712 if (level == 3) {
6713 offsets[1] = 64;
6714 offsets[2] = 128;
6715 offsets[3] = 192;
6716 } else if (level == 4) {
6717 offsets[1] = 32;
6718 offsets[2] = 128;
6719 offsets[3] = 160;
6720 } else {
6721 offsets[1] = 32;
6722 offsets[2] = 64;
6723 offsets[3] = 96;
6724 }
6725
6726 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
6727 // time at 4 different offsets and multiply them in order by the
6728 // next set of input values. So we employ indexed load and store
6729 // pair instructions with arrangement 4S.
6730 for (int i = 0; i < 4; i++) {
6731 // load v1 32 (8x4S) coefficients relative to first start index
6732 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6733 // load v2 32 (8x4S) coefficients relative to second start index
6734 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
        // a0 = v1 + v2 -- n.b. clobbers vq
6736 vs_addv(vs3, __ T4S, vs1, vs2);
6737 // a1 = v1 - v2
6738 vs_subv(vs1, __ T4S, vs1, vs2);
        // save a0 relative to first start index
6740 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6741 // load constants q, qinv each iteration as they get clobbered above
6742 vs_ldpq(vq, dilithiumConsts); // qInv, q
6743 // load b next 32 (8x4S) inputs
6744 vs_ldpq_post(vs2, zetas);
6745 // a = a1 montmul b
6746 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6747 // save a relative to second start index
6748 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
6749
6750 int k = 4 * level + i;
6751
6752 if (k < 24) {
6753 startIncr = 256;
6754 } else if (k == 25) {
6755 startIncr = 384;
6756 } else {
6757 startIncr = 128;
6758 }
6759
6760 c1Start += startIncr;
6761 c2Start += startIncr;
6762 }
6763
6764 c2 *= 2;
6765 }
6766 }
6767
6768 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
6769 // Implements the method
6770 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
6771 // the sun.security.provider.ML_DSA class.
6772 //
6773 // coeffs (int[256]) = c_rarg0
6774 // zetas (int[256]) = c_rarg1
6775 address generate_dilithiumAlmostInverseNtt() {
6776
6777 __ align(CodeEntryAlignment);
6778 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
6779 StubCodeMark mark(this, stub_id);
6780 address start = __ pc();
6781 __ enter();
6782
6783 const Register coeffs = c_rarg0;
6784 const Register zetas = c_rarg1;
6785
6786 const Register tmpAddr = r9;
6787 const Register dilithiumConsts = r10;
6788 const Register result = r11;
6789 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6790 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6791 VSeq<2> vq(30); // n.b. constants overlap vs3
6792 int offsets[4] = { 0, 32, 64, 96 };
6793 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6794 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6795
6796 __ add(result, coeffs, 0);
6797 __ lea(dilithiumConsts,
6798 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6799
6800 // Each level represents one iteration of the outer for loop of the Java version
6801
6802 // level 0
6803 // At level 0 we need to interleave adjacent quartets of
6804 // coefficients before we multiply and add/sub by the next 16
6805 // zetas just as we did for level 7 in the multiply code. So we
6806 // load and store the values using an ld2/st2 with arrangement 4S.
6807 for (int i = 0; i < 1024; i += 128) {
6808 // load constants q, qinv
6809 // n.b. this can be moved out of the loop as they do not get
6810 // clobbered by first two loops
6811 vs_ldpq(vq, dilithiumConsts); // qInv, q
6812 // a0/a1 load interleaved 32 (8x4S) coefficients
6813 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6814 // b load next 32 (8x4S) inputs
6815 vs_ldpq_post(vs_front(vs2), zetas);
6816 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6817 // n.b. second half of vs2 provides temporary register storage
6818 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6819 vs_front(vs2), vs_back(vs2), vtmp, vq);
6820 // a0/a1 store interleaved 32 (8x4S) coefficients
6821 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6822 }
6823
6824 // level 1
6825 // At level 1 we need to interleave pairs of adjacent pairs of
6826 // coefficients before we multiply by the next 16 zetas just as we
6827 // did for level 6 in the multiply code. So we load and store the
    // values using an ld2/st2 with arrangement 2D.
6829 for (int i = 0; i < 1024; i += 128) {
6830 // a0/a1 load interleaved 32 (8x2D) coefficients
6831 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6832 // b load next 16 (4x4S) inputs
6833 vs_ldpq_post(vs_front(vs2), zetas);
6834 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6835 // n.b. second half of vs2 provides temporary register storage
6836 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6837 vs_front(vs2), vs_back(vs2), vtmp, vq);
6838 // a0/a1 store interleaved 32 (8x2D) coefficients
6839 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6840 }
6841
6842 // level 2
6843 // At level 2 coefficients come in blocks of 4. So, we load 4
6844 // adjacent coefficients at 8 distinct offsets for both the first
6845 // and second coefficient sequences, using an ldr with register
6846 // variant Q then combine them with next set of 32 zetas. Likewise
6847 // we store the results using an str with register variant Q.
6848 for (int i = 0; i < 1024; i += 256) {
6849 // c0 load 32 (8x4S) coefficients via first offsets
6850 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6851 // c1 load 32 (8x4S) coefficients via second offsets
      vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
6853 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
6854 vs_addv(vs3, __ T4S, vs1, vs2);
6855 // c = c0 - c1
6856 vs_subv(vs1, __ T4S, vs1, vs2);
6857 // store a0 32 (8x4S) coefficients via first offsets
6858 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
6859 // b load 32 (8x4S) next inputs
6860 vs_ldpq_post(vs2, zetas);
6861 // reload constants q, qinv -- they were clobbered earlier
6862 vs_ldpq(vq, dilithiumConsts); // qInv, q
6863 // compute a1 = b montmul c
6864 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6865 // store a1 32 (8x4S) coefficients via second offsets
6866 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
6867 }
6868
6869 // level 3-7
6870 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
6871
6872 __ leave(); // required for proper stackwalking of RuntimeStub frame
6873 __ mov(r0, zr); // return 0
6874 __ ret(lr);
6875
6876 return start;
6877 }
6878
6879 // Dilithium multiply polynomials in the NTT domain.
6880 // Straightforward implementation of the method
6881 // static int implDilithiumNttMult(
  //         int[] result, int[] ntta, int[] nttb) {} of
6883 // the sun.security.provider.ML_DSA class.
6884 //
6885 // result (int[256]) = c_rarg0
6886 // poly1 (int[256]) = c_rarg1
6887 // poly2 (int[256]) = c_rarg2
6888 address generate_dilithiumNttMult() {
6889
6890 __ align(CodeEntryAlignment);
6891 StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
6892 StubCodeMark mark(this, stub_id);
6893 address start = __ pc();
6894 __ enter();
6895
6896 Label L_loop;
6897
6898 const Register result = c_rarg0;
6899 const Register poly1 = c_rarg1;
6900 const Register poly2 = c_rarg2;
6901
6902 const Register dilithiumConsts = r10;
6903 const Register len = r11;
6904
6905 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6906 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6907 VSeq<2> vq(30); // n.b. constants overlap vs3
6908 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
6909
6910 __ lea(dilithiumConsts,
6911 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6912
6913 // load constants q, qinv
6914 vs_ldpq(vq, dilithiumConsts); // qInv, q
6915 // load constant rSquare into v29
6916 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
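
    // As a scalar sketch, each lane of the loop below computes
    //
    //   result[i] = montMul(rSquare, montMul(poly1[i], poly2[i]))
    //
    // where montMul(a, b) == a * b * 2^-32 (mod q); the extra multiply
    // by rSquare == 2^64 mod q cancels the two Montgomery factors, so
    // the net effect is the plain product mod q.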
6917
6918 __ mov(len, zr);
6919 __ add(len, len, 1024);
6920
6921 __ BIND(L_loop);
6922
6923 // b load 32 (8x4S) next inputs from poly1
6924 vs_ldpq_post(vs1, poly1);
6925 // c load 32 (8x4S) next inputs from poly2
6926 vs_ldpq_post(vs2, poly2);
6927 // compute a = b montmul c
6928 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6929 // compute a = rsquare montmul a
6930 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
6931 // save a 32 (8x4S) results
6932 vs_stpq_post(vs2, result);
6933
6934 __ sub(len, len, 128);
6935 __ cmp(len, (u1)128);
6936 __ br(Assembler::GE, L_loop);
6937
6938 __ leave(); // required for proper stackwalking of RuntimeStub frame
6939 __ mov(r0, zr); // return 0
6940 __ ret(lr);
6941
6942 return start;
6943 }
6944
  // Dilithium Montgomery multiply an array by a constant.
6946 // A straightforward implementation of the method
6947 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  // of the sun.security.provider.ML_DSA class
6949 //
6950 // coeffs (int[256]) = c_rarg0
6951 // constant (int) = c_rarg1
6952 address generate_dilithiumMontMulByConstant() {
6953
6954 __ align(CodeEntryAlignment);
6955 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
6956 StubCodeMark mark(this, stub_id);
6957 address start = __ pc();
6958 __ enter();
6959
6960 Label L_loop;
6961
6962 const Register coeffs = c_rarg0;
6963 const Register constant = c_rarg1;
6964
6965 const Register dilithiumConsts = r10;
6966 const Register result = r11;
6967 const Register len = r12;
6968
6969 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6970 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6971 VSeq<2> vq(30); // n.b. constants overlap vs3
6972 VSeq<8> vconst(29, 0); // for montmul by constant
6973
6974 // results track inputs
6975 __ add(result, coeffs, 0);
6976 __ lea(dilithiumConsts,
6977 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6978
6979 // load constants q, qinv -- they do not get clobbered by first two loops
6980 vs_ldpq(vq, dilithiumConsts); // qInv, q
6981 // copy caller supplied constant across vconst
6982 __ dup(vconst[0], __ T4S, constant);
6983 __ mov(len, zr);
6984 __ add(len, len, 1024);
6985
6986 __ BIND(L_loop);
6987
6988 // load next 32 inputs
6989 vs_ldpq_post(vs2, coeffs);
6990 // mont mul by constant
6991 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
6992 // write next 32 results
6993 vs_stpq_post(vs2, result);
6994
6995 __ sub(len, len, 128);
6996 __ cmp(len, (u1)128);
6997 __ br(Assembler::GE, L_loop);
6998
6999 __ leave(); // required for proper stackwalking of RuntimeStub frame
7000 __ mov(r0, zr); // return 0
7001 __ ret(lr);
7002
7003 return start;
7004 }
7005
7006 // Dilithium decompose poly.
7007 // Implements the method
  //   static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
  //       int[] highPart, int twoGamma2, int multiplier) {}
7009 // of the sun.security.provider.ML_DSA class
7010 //
7011 // input (int[256]) = c_rarg0
7012 // lowPart (int[256]) = c_rarg1
7013 // highPart (int[256]) = c_rarg2
7014 // twoGamma2 (int) = c_rarg3
7015 // multiplier (int) = c_rarg4
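  //
  // As a sketch, per coefficient r this implements the ML-DSA Decompose
  // step (FIPS 204): with rplus = r mod q, find r0 with
  // -gamma2 < r0 <= gamma2 and rplus == r1 * twoGamma2 + r0, except that
  // when rplus - r0 == q - 1 the outputs are adjusted to r1 = 0 and
  // r0 = r0 - 1. lowPart receives r0 and highPart receives r1.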
7016 address generate_dilithiumDecomposePoly() {
7017
7018 __ align(CodeEntryAlignment);
7019 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
7020 StubCodeMark mark(this, stub_id);
7021 address start = __ pc();
7022 Label L_loop;
7023
7024 const Register input = c_rarg0;
7025 const Register lowPart = c_rarg1;
7026 const Register highPart = c_rarg2;
7027 const Register twoGamma2 = c_rarg3;
7028 const Register multiplier = c_rarg4;
7029
7030 const Register len = r9;
7031 const Register dilithiumConsts = r10;
7032 const Register tmp = r11;
7033
7034 // 6 independent sets of 4x4s values
7035 VSeq<4> vs1(0), vs2(4), vs3(8);
7036 VSeq<4> vs4(12), vs5(16), vtmp(20);
7037
7038 // 7 constants for cross-multiplying
7039 VSeq<4> one(25, 0);
7040 VSeq<4> qminus1(26, 0);
7041 VSeq<4> g2(27, 0);
7042 VSeq<4> twog2(28, 0);
7043 VSeq<4> mult(29, 0);
7044 VSeq<4> q(30, 0);
7045 VSeq<4> qadd(31, 0);
7046
7047 __ enter();
7048
7049 __ lea(dilithiumConsts,
7050 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7051
7052 // save callee-saved registers
7053 __ stpd(v8, v9, __ pre(sp, -64));
7054 __ stpd(v10, v11, Address(sp, 16));
7055 __ stpd(v12, v13, Address(sp, 32));
7056 __ stpd(v14, v15, Address(sp, 48));
7057
7058 // populate constant registers
7059 __ mov(tmp, zr);
7060 __ add(tmp, tmp, 1);
7061 __ dup(one[0], __ T4S, tmp); // 1
7062 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
7063 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
7064 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
7065 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
7066 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
7067 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
7068
7069 __ mov(len, zr);
7070 __ add(len, len, 1024);
7071
7072 __ BIND(L_loop);
7073
7074 // load next 4x4S inputs interleaved: rplus --> vs1
7075 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
7076
7077 // rplus = rplus - ((rplus + qadd) >> 23) * q
7078 vs_addv(vtmp, __ T4S, vs1, qadd);
7079 vs_sshr(vtmp, __ T4S, vtmp, 23);
7080 vs_mulv(vtmp, __ T4S, vtmp, q);
7081 vs_subv(vs1, __ T4S, vs1, vtmp);
7082
7083 // rplus = rplus + ((rplus >> 31) & dilithium_q);
7084 vs_sshr(vtmp, __ T4S, vs1, 31);
7085 vs_andr(vtmp, vtmp, q);
7086 vs_addv(vs1, __ T4S, vs1, vtmp);
7087
7088 // quotient --> vs2
7089 // int quotient = (rplus * multiplier) >> 22;
7090 vs_mulv(vtmp, __ T4S, vs1, mult);
7091 vs_sshr(vs2, __ T4S, vtmp, 22);
7092
7093 // r0 --> vs3
7094 // int r0 = rplus - quotient * twoGamma2;
7095 vs_mulv(vtmp, __ T4S, vs2, twog2);
7096 vs_subv(vs3, __ T4S, vs1, vtmp);
7097
7098 // mask --> vs4
7099 // int mask = (twoGamma2 - r0) >> 22;
7100 vs_subv(vtmp, __ T4S, twog2, vs3);
7101 vs_sshr(vs4, __ T4S, vtmp, 22);
7102
7103 // r0 -= (mask & twoGamma2);
7104 vs_andr(vtmp, vs4, twog2);
7105 vs_subv(vs3, __ T4S, vs3, vtmp);
7106
7107 // quotient += (mask & 1);
7108 vs_andr(vtmp, vs4, one);
7109 vs_addv(vs2, __ T4S, vs2, vtmp);
7110
7111 // mask = (twoGamma2 / 2 - r0) >> 31;
7112 vs_subv(vtmp, __ T4S, g2, vs3);
7113 vs_sshr(vs4, __ T4S, vtmp, 31);
7114
7115 // r0 -= (mask & twoGamma2);
7116 vs_andr(vtmp, vs4, twog2);
7117 vs_subv(vs3, __ T4S, vs3, vtmp);
7118
7119 // quotient += (mask & 1);
7120 vs_andr(vtmp, vs4, one);
7121 vs_addv(vs2, __ T4S, vs2, vtmp);
7122
7123 // r1 --> vs5
7124 // int r1 = rplus - r0 - (dilithium_q - 1);
7125 vs_subv(vtmp, __ T4S, vs1, vs3);
7126 vs_subv(vs5, __ T4S, vtmp, qminus1);
7127
7128 // r1 --> vs1 (overwriting rplus)
7129 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
7130 vs_negr(vtmp, __ T4S, vs5);
7131 vs_orr(vtmp, vs5, vtmp);
7132 vs_sshr(vs1, __ T4S, vtmp, 31);
7133
7134 // r0 += ~r1;
7135 vs_notr(vtmp, vs1);
7136 vs_addv(vs3, __ T4S, vs3, vtmp);
7137
7138 // r1 = r1 & quotient;
7139 vs_andr(vs1, vs2, vs1);
7140
    // store results interleaved
7142 // lowPart[m] = r0;
7143 // highPart[m] = r1;
7144 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
7145 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
7146
7147 __ sub(len, len, 64);
7148 __ cmp(len, (u1)64);
7149 __ br(Assembler::GE, L_loop);
7150
7151 // restore callee-saved vector registers
7152 __ ldpd(v14, v15, Address(sp, 48));
7153 __ ldpd(v12, v13, Address(sp, 32));
7154 __ ldpd(v10, v11, Address(sp, 16));
7155 __ ldpd(v8, v9, __ post(sp, 64));
7156
7157 __ leave(); // required for proper stackwalking of RuntimeStub frame
7158 __ mov(r0, zr); // return 0
7159 __ ret(lr);
7160
7161 return start;
7162 }
7163
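  // Helper for the Keccak chi step: updates one 5-lane row in place as
  //
  //   a[i] ^= (~a[(i + 1) % 5]) & a[(i + 2) % 5]
  //
  // using three scratch registers so every input is read before it is
  // overwritten.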
7164 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
7165 Register tmp0, Register tmp1, Register tmp2) {
7166 __ bic(tmp0, a2, a1); // for a0
7167 __ bic(tmp1, a3, a2); // for a1
7168 __ bic(tmp2, a4, a3); // for a2
7169 __ eor(a2, a2, tmp2);
7170 __ bic(tmp2, a0, a4); // for a3
7171 __ eor(a3, a3, tmp2);
7172 __ bic(tmp2, a1, a0); // for a4
7173 __ eor(a0, a0, tmp0);
7174 __ eor(a1, a1, tmp1);
7175 __ eor(a4, a4, tmp2);
7176 }
7177
7178 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
7179 Register a0, Register a1, Register a2, Register a3, Register a4,
7180 Register a5, Register a6, Register a7, Register a8, Register a9,
7181 Register a10, Register a11, Register a12, Register a13, Register a14,
7182 Register a15, Register a16, Register a17, Register a18, Register a19,
7183 Register a20, Register a21, Register a22, Register a23, Register a24,
7184 Register tmp0, Register tmp1, Register tmp2) {
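    // One round of Keccak-f[1600] on the 25 lanes a0..a24 (sketch of the
    // structure below): theta (the eor3/rax1 block computes the column
    // parities c0..c4 and the rotated differences d0..d4 and xors them
    // into every lane), rho and pi folded into the chain of rol
    // instructions, chi via bcax5 on each 5-lane row, and iota as the
    // final xor of the next round constant into lane a0.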
7185 __ eor3(tmp1, a4, a9, a14);
7186 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
7187 __ eor3(tmp2, a1, a6, a11);
7188 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
7189 __ rax1(tmp2, tmp0, tmp1); // d0
7190 {
7191
7192 Register tmp3, tmp4;
7193 if (can_use_fp && can_use_r18) {
7194 tmp3 = rfp;
7195 tmp4 = r18_tls;
7196 } else {
7197 tmp3 = a4;
7198 tmp4 = a9;
7199 __ stp(tmp3, tmp4, __ pre(sp, -16));
7200 }
7201
7202 __ eor3(tmp3, a0, a5, a10);
7203 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
7204 __ eor(a0, a0, tmp2);
7205 __ eor(a5, a5, tmp2);
7206 __ eor(a10, a10, tmp2);
7207 __ eor(a15, a15, tmp2);
7208 __ eor(a20, a20, tmp2); // d0(tmp2)
7209 __ eor3(tmp3, a2, a7, a12);
7210 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
7211 __ rax1(tmp3, tmp4, tmp2); // d1
7212 __ eor(a1, a1, tmp3);
7213 __ eor(a6, a6, tmp3);
7214 __ eor(a11, a11, tmp3);
7215 __ eor(a16, a16, tmp3);
7216 __ eor(a21, a21, tmp3); // d1(tmp3)
7217 __ rax1(tmp3, tmp2, tmp0); // d3
7218 __ eor3(tmp2, a3, a8, a13);
7219 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
7220 __ eor(a3, a3, tmp3);
7221 __ eor(a8, a8, tmp3);
7222 __ eor(a13, a13, tmp3);
7223 __ eor(a18, a18, tmp3);
7224 __ eor(a23, a23, tmp3);
7225 __ rax1(tmp2, tmp1, tmp0); // d2
7226 __ eor(a2, a2, tmp2);
7227 __ eor(a7, a7, tmp2);
7228 __ eor(a12, a12, tmp2);
7229 __ rax1(tmp0, tmp0, tmp4); // d4
7230 if (!can_use_fp || !can_use_r18) {
7231 __ ldp(tmp3, tmp4, __ post(sp, 16));
7232 }
7233 __ eor(a17, a17, tmp2);
7234 __ eor(a22, a22, tmp2);
7235 __ eor(a4, a4, tmp0);
7236 __ eor(a9, a9, tmp0);
7237 __ eor(a14, a14, tmp0);
7238 __ eor(a19, a19, tmp0);
7239 __ eor(a24, a24, tmp0);
7240 }
7241
7242 __ rol(tmp0, a10, 3);
7243 __ rol(a10, a1, 1);
7244 __ rol(a1, a6, 44);
7245 __ rol(a6, a9, 20);
7246 __ rol(a9, a22, 61);
7247 __ rol(a22, a14, 39);
7248 __ rol(a14, a20, 18);
7249 __ rol(a20, a2, 62);
7250 __ rol(a2, a12, 43);
7251 __ rol(a12, a13, 25);
7252 __ rol(a13, a19, 8) ;
7253 __ rol(a19, a23, 56);
7254 __ rol(a23, a15, 41);
7255 __ rol(a15, a4, 27);
7256 __ rol(a4, a24, 14);
7257 __ rol(a24, a21, 2);
7258 __ rol(a21, a8, 55);
7259 __ rol(a8, a16, 45);
7260 __ rol(a16, a5, 36);
7261 __ rol(a5, a3, 28);
7262 __ rol(a3, a18, 21);
7263 __ rol(a18, a17, 15);
7264 __ rol(a17, a11, 10);
7265 __ rol(a11, a7, 6);
7266 __ mov(a7, tmp0);
7267
7268 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
7269 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
7270 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
7271 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
7272 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
7273
7274 __ ldr(tmp1, __ post(rc, 8));
7275 __ eor(a0, a0, tmp1);
7276
7277 }
7278
7279 // Arguments:
7280 //
7281 // Inputs:
7282 // c_rarg0 - byte[] source+offset
7283 // c_rarg1 - byte[] SHA.state
7284 // c_rarg2 - int block_size
7285 // c_rarg3 - int offset
7286 // c_rarg4 - int limit
7287 //
7288 address generate_sha3_implCompress_gpr(StubId stub_id) {
7289 bool multi_block;
7290 switch (stub_id) {
7291 case StubId::stubgen_sha3_implCompress_id:
7292 multi_block = false;
7293 break;
7294 case StubId::stubgen_sha3_implCompressMB_id:
7295 multi_block = true;
7296 break;
7297 default:
7298 ShouldNotReachHere();
7299 }
7300
7301 static const uint64_t round_consts[24] = {
7302 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
7303 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
7304 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
7305 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
7306 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
7307 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
7308 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
7309 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
7310 };
7311
7312 __ align(CodeEntryAlignment);
7313 StubCodeMark mark(this, stub_id);
7314 address start = __ pc();
7315
7316 Register buf = c_rarg0;
7317 Register state = c_rarg1;
7318 Register block_size = c_rarg2;
7319 Register ofs = c_rarg3;
7320 Register limit = c_rarg4;
7321
    // use r3..r17, r19..r28 to keep a0..a24.
7323 // a0..a24 are respective locals from SHA3.java
7324 Register a0 = r25,
7325 a1 = r26,
7326 a2 = r27,
7327 a3 = r3,
7328 a4 = r4,
7329 a5 = r5,
7330 a6 = r6,
7331 a7 = r7,
7332 a8 = rscratch1, // r8
7333 a9 = rscratch2, // r9
7334 a10 = r10,
7335 a11 = r11,
7336 a12 = r12,
7337 a13 = r13,
7338 a14 = r14,
7339 a15 = r15,
7340 a16 = r16,
7341 a17 = r17,
7342 a18 = r28,
7343 a19 = r19,
7344 a20 = r20,
7345 a21 = r21,
7346 a22 = r22,
7347 a23 = r23,
7348 a24 = r24;
7349
7350 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
7351
7352 Label sha3_loop, rounds24_preloop, loop_body;
7353 Label sha3_512_or_sha3_384, shake128;
7354
7355 bool can_use_r18 = false;
7356 #ifndef R18_RESERVED
7357 can_use_r18 = true;
7358 #endif
7359 bool can_use_fp = !PreserveFramePointer;
7360
7361 __ enter();
7362
7363 // save almost all yet unsaved gpr registers on stack
7364 __ str(block_size, __ pre(sp, -128));
7365 if (multi_block) {
7366 __ stpw(ofs, limit, Address(sp, 8));
7367 }
7368 // 8 bytes at sp+16 will be used to keep buf
7369 __ stp(r19, r20, Address(sp, 32));
7370 __ stp(r21, r22, Address(sp, 48));
7371 __ stp(r23, r24, Address(sp, 64));
7372 __ stp(r25, r26, Address(sp, 80));
7373 __ stp(r27, r28, Address(sp, 96));
7374 if (can_use_r18 && can_use_fp) {
7375 __ stp(r18_tls, state, Address(sp, 112));
7376 } else {
7377 __ str(state, Address(sp, 112));
7378 }
7379
    // begin sha3 calculations: loading a0..a24 from state array
7381 __ ldp(a0, a1, state);
7382 __ ldp(a2, a3, Address(state, 16));
7383 __ ldp(a4, a5, Address(state, 32));
7384 __ ldp(a6, a7, Address(state, 48));
7385 __ ldp(a8, a9, Address(state, 64));
7386 __ ldp(a10, a11, Address(state, 80));
7387 __ ldp(a12, a13, Address(state, 96));
7388 __ ldp(a14, a15, Address(state, 112));
7389 __ ldp(a16, a17, Address(state, 128));
7390 __ ldp(a18, a19, Address(state, 144));
7391 __ ldp(a20, a21, Address(state, 160));
7392 __ ldp(a22, a23, Address(state, 176));
7393 __ ldr(a24, Address(state, 192));
7394
7395 __ BIND(sha3_loop);
7396
7397 // load input
7398 __ ldp(tmp3, tmp2, __ post(buf, 16));
7399 __ eor(a0, a0, tmp3);
7400 __ eor(a1, a1, tmp2);
7401 __ ldp(tmp3, tmp2, __ post(buf, 16));
7402 __ eor(a2, a2, tmp3);
7403 __ eor(a3, a3, tmp2);
7404 __ ldp(tmp3, tmp2, __ post(buf, 16));
7405 __ eor(a4, a4, tmp3);
7406 __ eor(a5, a5, tmp2);
7407 __ ldr(tmp3, __ post(buf, 8));
7408 __ eor(a6, a6, tmp3);
7409
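    // The rate in bytes (block_size) identifies the digest: 72 (SHA3-512),
    // 104 (SHA3-384), 136 (SHA3-256/SHAKE256), 144 (SHA3-224) and
    // 168 (SHAKE128). Bits 7, 5 and 4 of block_size are enough to tell
    // these apart, which is what the tbz/andw/tbnz tests below rely on.
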
7410 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
7411 __ tbz(block_size, 7, sha3_512_or_sha3_384);
7412
7413 __ ldp(tmp3, tmp2, __ post(buf, 16));
7414 __ eor(a7, a7, tmp3);
7415 __ eor(a8, a8, tmp2);
7416 __ ldp(tmp3, tmp2, __ post(buf, 16));
7417 __ eor(a9, a9, tmp3);
7418 __ eor(a10, a10, tmp2);
7419 __ ldp(tmp3, tmp2, __ post(buf, 16));
7420 __ eor(a11, a11, tmp3);
7421 __ eor(a12, a12, tmp2);
7422 __ ldp(tmp3, tmp2, __ post(buf, 16));
7423 __ eor(a13, a13, tmp3);
7424 __ eor(a14, a14, tmp2);
7425 __ ldp(tmp3, tmp2, __ post(buf, 16));
7426 __ eor(a15, a15, tmp3);
7427 __ eor(a16, a16, tmp2);
7428
7429 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
7430 __ andw(tmp2, block_size, 48);
7431 __ cbzw(tmp2, rounds24_preloop);
7432 __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
7434 __ ldr(tmp3, __ post(buf, 8));
7435 __ eor(a17, a17, tmp3);
7436 __ b(rounds24_preloop);
7437
7438 __ BIND(shake128);
7439 __ ldp(tmp3, tmp2, __ post(buf, 16));
7440 __ eor(a17, a17, tmp3);
7441 __ eor(a18, a18, tmp2);
7442 __ ldp(tmp3, tmp2, __ post(buf, 16));
7443 __ eor(a19, a19, tmp3);
7444 __ eor(a20, a20, tmp2);
7445 __ b(rounds24_preloop); // block_size == 168, SHAKE128
7446
7447 __ BIND(sha3_512_or_sha3_384);
7448 __ ldp(tmp3, tmp2, __ post(buf, 16));
7449 __ eor(a7, a7, tmp3);
7450 __ eor(a8, a8, tmp2);
7451 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
7452
7453 // SHA3-384
7454 __ ldp(tmp3, tmp2, __ post(buf, 16));
7455 __ eor(a9, a9, tmp3);
7456 __ eor(a10, a10, tmp2);
7457 __ ldp(tmp3, tmp2, __ post(buf, 16));
7458 __ eor(a11, a11, tmp3);
7459 __ eor(a12, a12, tmp2);
7460
7461 __ BIND(rounds24_preloop);
7462 __ fmovs(v0, 24.0); // float loop counter,
7463 __ fmovs(v1, 1.0); // exact representation
7464
7465 __ str(buf, Address(sp, 16));
7466 __ lea(tmp3, ExternalAddress((address) round_consts));
7467
7468 __ BIND(loop_body);
7469 keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
7470 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
7471 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
7472 tmp0, tmp1, tmp2);
7473 __ fsubs(v0, v0, v1);
7474 __ fcmps(v0, 0.0);
7475 __ br(__ NE, loop_body);
7476
7477 if (multi_block) {
7478 __ ldrw(block_size, sp); // block_size
7479 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
7480 __ addw(tmp2, tmp2, block_size);
7481 __ cmpw(tmp2, tmp1);
7482 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
7483 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
7484 __ br(Assembler::LE, sha3_loop);
7485 __ movw(c_rarg0, tmp2); // return offset
7486 }
7487 if (can_use_fp && can_use_r18) {
7488 __ ldp(r18_tls, state, Address(sp, 112));
7489 } else {
7490 __ ldr(state, Address(sp, 112));
7491 }
7492 // save calculated sha3 state
7493 __ stp(a0, a1, Address(state));
7494 __ stp(a2, a3, Address(state, 16));
7495 __ stp(a4, a5, Address(state, 32));
7496 __ stp(a6, a7, Address(state, 48));
7497 __ stp(a8, a9, Address(state, 64));
7498 __ stp(a10, a11, Address(state, 80));
7499 __ stp(a12, a13, Address(state, 96));
7500 __ stp(a14, a15, Address(state, 112));
7501 __ stp(a16, a17, Address(state, 128));
7502 __ stp(a18, a19, Address(state, 144));
7503 __ stp(a20, a21, Address(state, 160));
7504 __ stp(a22, a23, Address(state, 176));
7505 __ str(a24, Address(state, 192));
7506
7507 // restore required registers from stack
7508 __ ldp(r19, r20, Address(sp, 32));
7509 __ ldp(r21, r22, Address(sp, 48));
7510 __ ldp(r23, r24, Address(sp, 64));
7511 __ ldp(r25, r26, Address(sp, 80));
7512 __ ldp(r27, r28, Address(sp, 96));
7513 if (can_use_fp && can_use_r18) {
7514 __ add(rfp, sp, 128); // leave() will copy rfp to sp below
7515 } // else no need to recalculate rfp, since it wasn't changed
7516
7517 __ leave();
7518
7519 __ ret(lr);
7520
7521 return start;
7522 }
7523
7524 /**
7525 * Arguments:
7526 *
7527 * Inputs:
7528 * c_rarg0 - int crc
7529 * c_rarg1 - byte* buf
7530 * c_rarg2 - int length
7531 *
7532 * Output:
   *    r0 - int crc result
7534 */
7535 address generate_updateBytesCRC32() {
7536 assert(UseCRC32Intrinsics, "what are we doing here?");
7537
7538 __ align(CodeEntryAlignment);
7539 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7540 StubCodeMark mark(this, stub_id);
7541
7542 address start = __ pc();
7543
7544 const Register crc = c_rarg0; // crc
7545 const Register buf = c_rarg1; // source java byte array address
7546 const Register len = c_rarg2; // length
7547 const Register table0 = c_rarg3; // crc_table address
7548 const Register table1 = c_rarg4;
7549 const Register table2 = c_rarg5;
7550 const Register table3 = c_rarg6;
7551 const Register tmp3 = c_rarg7;
7552
7553 BLOCK_COMMENT("Entry:");
7554 __ enter(); // required for proper stackwalking of RuntimeStub frame
7555
7556 __ kernel_crc32(crc, buf, len,
7557 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7558
7559 __ leave(); // required for proper stackwalking of RuntimeStub frame
7560 __ ret(lr);
7561
7562 return start;
7563 }
7564
7565 /**
7566 * Arguments:
7567 *
7568 * Inputs:
7569 * c_rarg0 - int crc
7570 * c_rarg1 - byte* buf
7571 * c_rarg2 - int length
7572 * c_rarg3 - int* table
7573 *
7574 * Output:
7575 * r0 - int crc result
7576 */
7577 address generate_updateBytesCRC32C() {
7578 assert(UseCRC32CIntrinsics, "what are we doing here?");
7579
7580 __ align(CodeEntryAlignment);
7581 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
7582 StubCodeMark mark(this, stub_id);
7583
7584 address start = __ pc();
7585
7586 const Register crc = c_rarg0; // crc
7587 const Register buf = c_rarg1; // source java byte array address
7588 const Register len = c_rarg2; // length
7589 const Register table0 = c_rarg3; // crc_table address
7590 const Register table1 = c_rarg4;
7591 const Register table2 = c_rarg5;
7592 const Register table3 = c_rarg6;
7593 const Register tmp3 = c_rarg7;
7594
7595 BLOCK_COMMENT("Entry:");
7596 __ enter(); // required for proper stackwalking of RuntimeStub frame
7597
7598 __ kernel_crc32c(crc, buf, len,
7599 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7600
7601 __ leave(); // required for proper stackwalking of RuntimeStub frame
7602 __ ret(lr);
7603
7604 return start;
7605 }
7606
7607 /***
7608 * Arguments:
7609 *
7610 * Inputs:
7611 * c_rarg0 - int adler
7612 * c_rarg1 - byte* buff
7613 * c_rarg2 - int len
7614 *
7615 * Output:
7616 * c_rarg0 - int adler result
7617 */
7618 address generate_updateBytesAdler32() {
7619 __ align(CodeEntryAlignment);
7620 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
7621 StubCodeMark mark(this, stub_id);
7622 address start = __ pc();
7623
7624 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
7625
7626 // Aliases
7627 Register adler = c_rarg0;
7628 Register s1 = c_rarg0;
7629 Register s2 = c_rarg3;
7630 Register buff = c_rarg1;
7631 Register len = c_rarg2;
7632 Register nmax = r4;
7633 Register base = r5;
7634 Register count = r6;
7635 Register temp0 = rscratch1;
7636 Register temp1 = rscratch2;
7637 FloatRegister vbytes = v0;
7638 FloatRegister vs1acc = v1;
7639 FloatRegister vs2acc = v2;
7640 FloatRegister vtable = v3;
7641
7642 // Max number of bytes we can process before having to take the mod
7643 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
7644 uint64_t BASE = 0xfff1;
7645 uint64_t NMAX = 0x15B0;
7646
7647 __ mov(base, BASE);
7648 __ mov(nmax, NMAX);
7649
7650 // Load accumulation coefficients for the upper 16 bits
7651 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
7652 __ ld1(vtable, __ T16B, Address(temp0));
7653
7654 // s1 is initialized to the lower 16 bits of adler
7655 // s2 is initialized to the upper 16 bits of adler
7656 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
7657 __ uxth(s1, adler); // s1 = (adler & 0xffff)
7658
    // The pipelined loop needs at least 16 elements for one iteration.
    // It does check this itself, but it is cheaper to skip straight to
    // the cleanup loop for short inputs.
7661 __ cmp(len, (u1)16);
7662 __ br(Assembler::HS, L_nmax);
7663 __ cbz(len, L_combine);
7664
7665 __ bind(L_simple_by1_loop);
7666 __ ldrb(temp0, Address(__ post(buff, 1)));
7667 __ add(s1, s1, temp0);
7668 __ add(s2, s2, s1);
7669 __ subs(len, len, 1);
7670 __ br(Assembler::HI, L_simple_by1_loop);
7671
7672 // s1 = s1 % BASE
7673 __ subs(temp0, s1, base);
7674 __ csel(s1, temp0, s1, Assembler::HS);
7675
7676 // s2 = s2 % BASE
7677 __ lsr(temp0, s2, 16);
7678 __ lsl(temp1, temp0, 4);
7679 __ sub(temp1, temp1, temp0);
7680 __ add(s2, temp1, s2, ext::uxth);
7681
7682 __ subs(temp0, s2, base);
7683 __ csel(s2, temp0, s2, Assembler::HS);
7684
7685 __ b(L_combine);
7686
7687 __ bind(L_nmax);
7688 __ subs(len, len, nmax);
7689 __ sub(count, nmax, 16);
7690 __ br(Assembler::LO, L_by16);
7691
7692 __ bind(L_nmax_loop);
7693
7694 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7695 vbytes, vs1acc, vs2acc, vtable);
7696
7697 __ subs(count, count, 16);
7698 __ br(Assembler::HS, L_nmax_loop);
7699
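    // The reductions below rely on 2^16 == 15 (mod BASE): writing
    // x = (x >> 16) * 2^16 + (x & 0xffff), x mod BASE can be folded as
    // (x >> 16) * 15 + (x & 0xffff). Two folds bring the accumulators
    // below 2^17, after which a single conditional subtract of BASE
    // finishes the reduction.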
7700 // s1 = s1 % BASE
7701 __ lsr(temp0, s1, 16);
7702 __ lsl(temp1, temp0, 4);
7703 __ sub(temp1, temp1, temp0);
7704 __ add(temp1, temp1, s1, ext::uxth);
7705
7706 __ lsr(temp0, temp1, 16);
7707 __ lsl(s1, temp0, 4);
7708 __ sub(s1, s1, temp0);
7709 __ add(s1, s1, temp1, ext:: uxth);
7710
7711 __ subs(temp0, s1, base);
7712 __ csel(s1, temp0, s1, Assembler::HS);
7713
7714 // s2 = s2 % BASE
7715 __ lsr(temp0, s2, 16);
7716 __ lsl(temp1, temp0, 4);
7717 __ sub(temp1, temp1, temp0);
7718 __ add(temp1, temp1, s2, ext::uxth);
7719
7720 __ lsr(temp0, temp1, 16);
7721 __ lsl(s2, temp0, 4);
7722 __ sub(s2, s2, temp0);
7723 __ add(s2, s2, temp1, ext:: uxth);
7724
7725 __ subs(temp0, s2, base);
7726 __ csel(s2, temp0, s2, Assembler::HS);
7727
7728 __ subs(len, len, nmax);
7729 __ sub(count, nmax, 16);
7730 __ br(Assembler::HS, L_nmax_loop);
7731
7732 __ bind(L_by16);
7733 __ adds(len, len, count);
7734 __ br(Assembler::LO, L_by1);
7735
7736 __ bind(L_by16_loop);
7737
7738 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7739 vbytes, vs1acc, vs2acc, vtable);
7740
7741 __ subs(len, len, 16);
7742 __ br(Assembler::HS, L_by16_loop);
7743
7744 __ bind(L_by1);
7745 __ adds(len, len, 15);
7746 __ br(Assembler::LO, L_do_mod);
7747
7748 __ bind(L_by1_loop);
7749 __ ldrb(temp0, Address(__ post(buff, 1)));
7750 __ add(s1, temp0, s1);
7751 __ add(s2, s2, s1);
7752 __ subs(len, len, 1);
7753 __ br(Assembler::HS, L_by1_loop);
7754
7755 __ bind(L_do_mod);
7756 // s1 = s1 % BASE
7757 __ lsr(temp0, s1, 16);
7758 __ lsl(temp1, temp0, 4);
7759 __ sub(temp1, temp1, temp0);
7760 __ add(temp1, temp1, s1, ext::uxth);
7761
7762 __ lsr(temp0, temp1, 16);
7763 __ lsl(s1, temp0, 4);
7764 __ sub(s1, s1, temp0);
7765 __ add(s1, s1, temp1, ext:: uxth);
7766
7767 __ subs(temp0, s1, base);
7768 __ csel(s1, temp0, s1, Assembler::HS);
7769
7770 // s2 = s2 % BASE
7771 __ lsr(temp0, s2, 16);
7772 __ lsl(temp1, temp0, 4);
7773 __ sub(temp1, temp1, temp0);
7774 __ add(temp1, temp1, s2, ext::uxth);
7775
7776 __ lsr(temp0, temp1, 16);
7777 __ lsl(s2, temp0, 4);
7778 __ sub(s2, s2, temp0);
7779 __ add(s2, s2, temp1, ext:: uxth);
7780
7781 __ subs(temp0, s2, base);
7782 __ csel(s2, temp0, s2, Assembler::HS);
7783
7784 // Combine lower bits and higher bits
7785 __ bind(L_combine);
7786 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
7787
7788 __ ret(lr);
7789
7790 return start;
7791 }
7792
7793 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
7794 Register temp0, Register temp1, FloatRegister vbytes,
7795 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
7796 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
7797 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
7798 // In non-vectorized code, we update s1 and s2 as:
7799 // s1 <- s1 + b1
7800 // s2 <- s2 + s1
7801 // s1 <- s1 + b2
    //   s2 <- s2 + s1
7803 // ...
7804 // s1 <- s1 + b16
7805 // s2 <- s2 + s1
7806 // Putting above assignments together, we have:
7807 // s1_new = s1 + b1 + b2 + ... + b16
7808 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
7809 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
7810 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
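    //
    // A scalar sketch of what one 16-byte step computes (assuming b[] holds the 16
    // bytes loaded below; illustrative only, not the generated code):
    //
    //   uint32_t sum = 0, dot = 0;
    //   for (int i = 0; i < 16; i++) { sum += b[i]; dot += (16 - i) * b[i]; }
    //   s2 += 16 * s1 + dot;   // uses the old s1
    //   s1 += sum;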
7811 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
7812
7813 // s2 = s2 + s1 * 16
7814 __ add(s2, s2, s1, Assembler::LSL, 4);
7815
7816 // vs1acc = b1 + b2 + b3 + ... + b16
7817 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
7818 __ umullv(vs2acc, __ T8B, vtable, vbytes);
7819 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
7820 __ uaddlv(vs1acc, __ T16B, vbytes);
7821 __ uaddlv(vs2acc, __ T8H, vs2acc);
7822
7823 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
7824 __ fmovd(temp0, vs1acc);
7825 __ fmovd(temp1, vs2acc);
7826 __ add(s1, s1, temp0);
7827 __ add(s2, s2, temp1);
7828 }
7829
7830 /**
7831 * Arguments:
7832 *
7833 * Input:
7834 * c_rarg0 - x address
7835 * c_rarg1 - x length
7836 * c_rarg2 - y address
7837 * c_rarg3 - y length
7838 * c_rarg4 - z address
7839 */
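  // An illustrative scalar sketch of the operation this stub performs (mirroring
  // BigInteger::multiplyToLen semantics: x, y and z are arrays of 32-bit limbs with the
  // most significant limb first, and z has room for xlen + ylen limbs); the stub itself
  // simply delegates to MacroAssembler::multiply_to_len:
  //
  //   memset(z, 0, (xlen + ylen) * sizeof(uint32_t));
  //   for (int i = xlen - 1; i >= 0; i--) {
  //     uint64_t carry = 0;
  //     for (int j = ylen - 1; j >= 0; j--) {
  //       uint64_t p = (uint64_t)x[i] * y[j] + z[i + j + 1] + carry;
  //       z[i + j + 1] = (uint32_t)p;
  //       carry = p >> 32;
  //     }
  //     z[i] = (uint32_t)carry;
  //   }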
7840 address generate_multiplyToLen() {
7841 __ align(CodeEntryAlignment);
7842 StubId stub_id = StubId::stubgen_multiplyToLen_id;
7843 StubCodeMark mark(this, stub_id);
7844
7845 address start = __ pc();
7846 const Register x = r0;
7847 const Register xlen = r1;
7848 const Register y = r2;
7849 const Register ylen = r3;
7850 const Register z = r4;
7851
7852 const Register tmp0 = r5;
7853 const Register tmp1 = r10;
7854 const Register tmp2 = r11;
7855 const Register tmp3 = r12;
7856 const Register tmp4 = r13;
7857 const Register tmp5 = r14;
7858 const Register tmp6 = r15;
7859 const Register tmp7 = r16;
7860
7861 BLOCK_COMMENT("Entry:");
7862 __ enter(); // required for proper stackwalking of RuntimeStub frame
7863 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7864 __ leave(); // required for proper stackwalking of RuntimeStub frame
7865 __ ret(lr);
7866
7867 return start;
7868 }
7869
7870 address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127 described in the Java code runs
    // faster than multiply_to_len on some CPUs and slower on others, but
    // multiply_to_len gives slightly better results overall.
7874 __ align(CodeEntryAlignment);
7875 StubId stub_id = StubId::stubgen_squareToLen_id;
7876 StubCodeMark mark(this, stub_id);
7877 address start = __ pc();
7878
7879 const Register x = r0;
7880 const Register xlen = r1;
7881 const Register z = r2;
7882 const Register y = r4; // == x
7883 const Register ylen = r5; // == xlen
7884
7885 const Register tmp0 = r3;
7886 const Register tmp1 = r10;
7887 const Register tmp2 = r11;
7888 const Register tmp3 = r12;
7889 const Register tmp4 = r13;
7890 const Register tmp5 = r14;
7891 const Register tmp6 = r15;
7892 const Register tmp7 = r16;
7893
7894 RegSet spilled_regs = RegSet::of(y, ylen);
7895 BLOCK_COMMENT("Entry:");
7896 __ enter();
7897 __ push(spilled_regs, sp);
7898 __ mov(y, x);
7899 __ mov(ylen, xlen);
7900 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7901 __ pop(spilled_regs, sp);
7902 __ leave();
7903 __ ret(lr);
7904 return start;
7905 }
7906
7907 address generate_mulAdd() {
7908 __ align(CodeEntryAlignment);
7909 StubId stub_id = StubId::stubgen_mulAdd_id;
7910 StubCodeMark mark(this, stub_id);
7911
7912 address start = __ pc();
7913
7914 const Register out = r0;
7915 const Register in = r1;
7916 const Register offset = r2;
7917 const Register len = r3;
7918 const Register k = r4;
7919
7920 BLOCK_COMMENT("Entry:");
7921 __ enter();
7922 __ mul_add(out, in, offset, len, k);
7923 __ leave();
7924 __ ret(lr);
7925
7926 return start;
7927 }
7928
7929 // Arguments:
7930 //
7931 // Input:
7932 // c_rarg0 - newArr address
7933 // c_rarg1 - oldArr address
7934 // c_rarg2 - newIdx
7935 // c_rarg3 - shiftCount
7936 // c_rarg4 - numIter
7937 //
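  // Scalar sketch of the operation (mirrors BigInteger.shiftRightImplWorker; oldArr and
  // newArr are arrays of unsigned 32-bit ints, and the Java caller only uses
  // 0 < shiftCount < 32):
  //
  //   for (int i = numIter - 1; i >= 0; i--) {
  //     newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)
  //                        | (oldArr[i]     << (32 - shiftCount));
  //   }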
7938 address generate_bigIntegerRightShift() {
7939 __ align(CodeEntryAlignment);
7940 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
7941 StubCodeMark mark(this, stub_id);
7942 address start = __ pc();
7943
7944 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
7945
7946 Register newArr = c_rarg0;
7947 Register oldArr = c_rarg1;
7948 Register newIdx = c_rarg2;
7949 Register shiftCount = c_rarg3;
7950 Register numIter = c_rarg4;
7951 Register idx = numIter;
7952
7953 Register newArrCur = rscratch1;
7954 Register shiftRevCount = rscratch2;
7955 Register oldArrCur = r13;
7956 Register oldArrNext = r14;
7957
7958 FloatRegister oldElem0 = v0;
7959 FloatRegister oldElem1 = v1;
7960 FloatRegister newElem = v2;
7961 FloatRegister shiftVCount = v3;
7962 FloatRegister shiftVRevCount = v4;
7963
7964 __ cbz(idx, Exit);
7965
7966 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
7967
7968 // left shift count
7969 __ movw(shiftRevCount, 32);
7970 __ subw(shiftRevCount, shiftRevCount, shiftCount);
7971
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
7973 __ cmp(numIter, (u1)4);
7974 __ br(Assembler::LT, ShiftThree);
7975
7976 __ dup(shiftVCount, __ T4S, shiftCount);
7977 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
7978 __ negr(shiftVCount, __ T4S, shiftVCount);
7979
7980 __ BIND(ShiftSIMDLoop);
7981
7982 // Calculate the load addresses
7983 __ sub(idx, idx, 4);
7984 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
7985 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
7986 __ add(oldArrCur, oldArrNext, 4);
7987
7988 // Load 4 words and process
7989 __ ld1(oldElem0, __ T4S, Address(oldArrCur));
7990 __ ld1(oldElem1, __ T4S, Address(oldArrNext));
7991 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
7992 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
7993 __ orr(newElem, __ T16B, oldElem0, oldElem1);
7994 __ st1(newElem, __ T4S, Address(newArrCur));
7995
7996 __ cmp(idx, (u1)4);
7997 __ br(Assembler::LT, ShiftTwoLoop);
7998 __ b(ShiftSIMDLoop);
7999
8000 __ BIND(ShiftTwoLoop);
8001 __ cbz(idx, Exit);
8002 __ cmp(idx, (u1)1);
8003 __ br(Assembler::EQ, ShiftOne);
8004
8005 // Calculate the load addresses
8006 __ sub(idx, idx, 2);
8007 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
8008 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
8009 __ add(oldArrCur, oldArrNext, 4);
8010
8011 // Load 2 words and process
8012 __ ld1(oldElem0, __ T2S, Address(oldArrCur));
8013 __ ld1(oldElem1, __ T2S, Address(oldArrNext));
8014 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8015 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8016 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8017 __ st1(newElem, __ T2S, Address(newArrCur));
8018 __ b(ShiftTwoLoop);
8019
8020 __ BIND(ShiftThree);
8021 __ tbz(idx, 1, ShiftOne);
8022 __ tbz(idx, 0, ShiftTwo);
8023 __ ldrw(r10, Address(oldArr, 12));
8024 __ ldrw(r11, Address(oldArr, 8));
8025 __ lsrvw(r10, r10, shiftCount);
8026 __ lslvw(r11, r11, shiftRevCount);
8027 __ orrw(r12, r10, r11);
8028 __ strw(r12, Address(newArr, 8));
8029
8030 __ BIND(ShiftTwo);
8031 __ ldrw(r10, Address(oldArr, 8));
8032 __ ldrw(r11, Address(oldArr, 4));
8033 __ lsrvw(r10, r10, shiftCount);
8034 __ lslvw(r11, r11, shiftRevCount);
8035 __ orrw(r12, r10, r11);
8036 __ strw(r12, Address(newArr, 4));
8037
8038 __ BIND(ShiftOne);
8039 __ ldrw(r10, Address(oldArr, 4));
8040 __ ldrw(r11, Address(oldArr));
8041 __ lsrvw(r10, r10, shiftCount);
8042 __ lslvw(r11, r11, shiftRevCount);
8043 __ orrw(r12, r10, r11);
8044 __ strw(r12, Address(newArr));
8045
8046 __ BIND(Exit);
8047 __ ret(lr);
8048
8049 return start;
8050 }
8051
8052 // Arguments:
8053 //
8054 // Input:
8055 // c_rarg0 - newArr address
8056 // c_rarg1 - oldArr address
8057 // c_rarg2 - newIdx
8058 // c_rarg3 - shiftCount
8059 // c_rarg4 - numIter
8060 //
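  // Scalar sketch of the operation (mirrors BigInteger.shiftLeftImplWorker; oldArr and
  // newArr are arrays of unsigned 32-bit ints, and the Java caller only uses
  // 0 < shiftCount < 32):
  //
  //   for (int i = 0; i < numIter; i++) {
  //     newArr[newIdx + i] = (oldArr[i]     << shiftCount)
  //                        | (oldArr[i + 1] >> (32 - shiftCount));
  //   }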
8061 address generate_bigIntegerLeftShift() {
8062 __ align(CodeEntryAlignment);
8063 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
8064 StubCodeMark mark(this, stub_id);
8065 address start = __ pc();
8066
8067 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
8068
8069 Register newArr = c_rarg0;
8070 Register oldArr = c_rarg1;
8071 Register newIdx = c_rarg2;
8072 Register shiftCount = c_rarg3;
8073 Register numIter = c_rarg4;
8074
8075 Register shiftRevCount = rscratch1;
8076 Register oldArrNext = rscratch2;
8077
8078 FloatRegister oldElem0 = v0;
8079 FloatRegister oldElem1 = v1;
8080 FloatRegister newElem = v2;
8081 FloatRegister shiftVCount = v3;
8082 FloatRegister shiftVRevCount = v4;
8083
8084 __ cbz(numIter, Exit);
8085
8086 __ add(oldArrNext, oldArr, 4);
8087 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8088
8089 // right shift count
8090 __ movw(shiftRevCount, 32);
8091 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8092
    // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
8094 __ cmp(numIter, (u1)4);
8095 __ br(Assembler::LT, ShiftThree);
8096
8097 __ dup(shiftVCount, __ T4S, shiftCount);
8098 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8099 __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
8100
8101 __ BIND(ShiftSIMDLoop);
8102
8103 // load 4 words and process
8104 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
8105 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
8106 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8107 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8108 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8109 __ st1(newElem, __ T4S, __ post(newArr, 16));
8110 __ sub(numIter, numIter, 4);
8111
8112 __ cmp(numIter, (u1)4);
8113 __ br(Assembler::LT, ShiftTwoLoop);
8114 __ b(ShiftSIMDLoop);
8115
8116 __ BIND(ShiftTwoLoop);
8117 __ cbz(numIter, Exit);
8118 __ cmp(numIter, (u1)1);
8119 __ br(Assembler::EQ, ShiftOne);
8120
8121 // load 2 words and process
8122 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
8123 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
8124 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8125 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8126 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8127 __ st1(newElem, __ T2S, __ post(newArr, 8));
8128 __ sub(numIter, numIter, 2);
8129 __ b(ShiftTwoLoop);
8130
8131 __ BIND(ShiftThree);
8132 __ ldrw(r10, __ post(oldArr, 4));
8133 __ ldrw(r11, __ post(oldArrNext, 4));
8134 __ lslvw(r10, r10, shiftCount);
8135 __ lsrvw(r11, r11, shiftRevCount);
8136 __ orrw(r12, r10, r11);
8137 __ strw(r12, __ post(newArr, 4));
8138 __ tbz(numIter, 1, Exit);
8139 __ tbz(numIter, 0, ShiftOne);
8140
8141 __ BIND(ShiftTwo);
8142 __ ldrw(r10, __ post(oldArr, 4));
8143 __ ldrw(r11, __ post(oldArrNext, 4));
8144 __ lslvw(r10, r10, shiftCount);
8145 __ lsrvw(r11, r11, shiftRevCount);
8146 __ orrw(r12, r10, r11);
8147 __ strw(r12, __ post(newArr, 4));
8148
8149 __ BIND(ShiftOne);
8150 __ ldrw(r10, Address(oldArr));
8151 __ ldrw(r11, Address(oldArrNext));
8152 __ lslvw(r10, r10, shiftCount);
8153 __ lsrvw(r11, r11, shiftRevCount);
8154 __ orrw(r12, r10, r11);
8155 __ strw(r12, Address(newArr));
8156
8157 __ BIND(Exit);
8158 __ ret(lr);
8159
8160 return start;
8161 }
8162
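  // count_positives(ary1, len) returns len when the byte array contains no negative
  // bytes; otherwise it returns some k, 0 <= k <= index of the first negative byte,
  // such that ary1[0..k) are all non-negative (the chunked loops below may report a
  // conservative k). The UPPER_BIT_MASK test works because a byte is negative exactly
  // when its top bit (0x80) is set. Exact-count reference, for illustration only
  // (count_positives_exact is not a real helper in this file):
  //
  //   int count_positives_exact(const int8_t* ary, int len) {
  //     int i = 0;
  //     while (i < len && ary[i] >= 0) i++;
  //     return i;
  //   }
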
8163 address generate_count_positives(address &count_positives_long) {
8164 const u1 large_loop_size = 64;
8165 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
8166 int dcache_line = VM_Version::dcache_line_size();
8167
8168 Register ary1 = r1, len = r2, result = r0;
8169
8170 __ align(CodeEntryAlignment);
8171
8172 StubId stub_id = StubId::stubgen_count_positives_id;
8173 StubCodeMark mark(this, stub_id);
8174
8175 address entry = __ pc();
8176
8177 __ enter();
8178 // precondition: a copy of len is already in result
8179 // __ mov(result, len);
8180
8181 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
8182 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
8183
8184 __ cmp(len, (u1)15);
8185 __ br(Assembler::GT, LEN_OVER_15);
    // Execution only falls into this code when the pointer is near the end of a
    // memory page and we have to avoid reading past it into the next page.
8188 __ add(ary1, ary1, len);
8189 __ subs(len, len, 8);
8190 __ br(Assembler::GT, LEN_OVER_8);
8191 __ ldr(rscratch2, Address(ary1, -8));
8192 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
8193 __ lsrv(rscratch2, rscratch2, rscratch1);
8194 __ tst(rscratch2, UPPER_BIT_MASK);
8195 __ csel(result, zr, result, Assembler::NE);
8196 __ leave();
8197 __ ret(lr);
8198 __ bind(LEN_OVER_8);
8199 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
    __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
8201 __ tst(rscratch2, UPPER_BIT_MASK);
8202 __ br(Assembler::NE, RET_NO_POP);
8203 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
8204 __ lsrv(rscratch1, rscratch1, rscratch2);
8205 __ tst(rscratch1, UPPER_BIT_MASK);
8206 __ bind(RET_NO_POP);
8207 __ csel(result, zr, result, Assembler::NE);
8208 __ leave();
8209 __ ret(lr);
8210
8211 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
8212 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
8213
8214 count_positives_long = __ pc(); // 2nd entry point
8215
8216 __ enter();
8217
8218 __ bind(LEN_OVER_15);
8219 __ push(spilled_regs, sp);
8220 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
8221 __ cbz(rscratch2, ALIGNED);
8222 __ ldp(tmp6, tmp1, Address(ary1));
8223 __ mov(tmp5, 16);
8224 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
8225 __ add(ary1, ary1, rscratch1);
8226 __ orr(tmp6, tmp6, tmp1);
8227 __ tst(tmp6, UPPER_BIT_MASK);
8228 __ br(Assembler::NE, RET_ADJUST);
8229 __ sub(len, len, rscratch1);
8230
8231 __ bind(ALIGNED);
8232 __ cmp(len, large_loop_size);
8233 __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load in the pre-loop as an early-return check. This handles
    // the case where an initially aligned large array has negative values in its
    // first bytes; without it, LARGE_LOOP would do 4 reads instead of 1 in the worst
    // case, which is slower. Cases with negative bytes further ahead are barely
    // affected; in fact they get faster thanks to the early loads and the fewer
    // instructions and branches in LARGE_LOOP.
8240 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
8241 __ sub(len, len, 16);
8242 __ orr(tmp6, tmp6, tmp1);
8243 __ tst(tmp6, UPPER_BIT_MASK);
8244 __ br(Assembler::NE, RET_ADJUST_16);
8245 __ cmp(len, large_loop_size);
8246 __ br(Assembler::LT, CHECK_16);
8247
8248 if (SoftwarePrefetchHintDistance >= 0
8249 && SoftwarePrefetchHintDistance >= dcache_line) {
8250 // initial prefetch
8251 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
8252 }
8253 __ bind(LARGE_LOOP);
8254 if (SoftwarePrefetchHintDistance >= 0) {
8255 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
8256 }
    // Issue the load instructions first, since that can save a few CPU/memory cycles.
    // Also, instead of 4 "orr(...); andr(...); cbnz(...)" triples (one per ldp), it is
    // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
    // instructions and uses fewer branches. The downside is that early return is
    // disabled, so all 64 bytes are loaded and checked every time.
8262 __ ldp(tmp2, tmp3, Address(ary1));
8263 __ ldp(tmp4, tmp5, Address(ary1, 16));
8264 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
8265 __ ldp(tmp6, tmp1, Address(ary1, 48));
8266 __ add(ary1, ary1, large_loop_size);
8267 __ sub(len, len, large_loop_size);
8268 __ orr(tmp2, tmp2, tmp3);
8269 __ orr(tmp4, tmp4, tmp5);
8270 __ orr(rscratch1, rscratch1, rscratch2);
8271 __ orr(tmp6, tmp6, tmp1);
8272 __ orr(tmp2, tmp2, tmp4);
8273 __ orr(rscratch1, rscratch1, tmp6);
8274 __ orr(tmp2, tmp2, rscratch1);
8275 __ tst(tmp2, UPPER_BIT_MASK);
8276 __ br(Assembler::NE, RET_ADJUST_LONG);
8277 __ cmp(len, large_loop_size);
8278 __ br(Assembler::GE, LARGE_LOOP);
8279
8280 __ bind(CHECK_16); // small 16-byte load pre-loop
8281 __ cmp(len, (u1)16);
8282 __ br(Assembler::LT, POST_LOOP16);
8283
8284 __ bind(LOOP16); // small 16-byte load loop
8285 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
8286 __ sub(len, len, 16);
8287 __ orr(tmp2, tmp2, tmp3);
8288 __ tst(tmp2, UPPER_BIT_MASK);
8289 __ br(Assembler::NE, RET_ADJUST_16);
8290 __ cmp(len, (u1)16);
8291 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
8292
8293 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
8294 __ cmp(len, (u1)8);
8295 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
8296 __ ldr(tmp3, Address(__ post(ary1, 8)));
8297 __ tst(tmp3, UPPER_BIT_MASK);
8298 __ br(Assembler::NE, RET_ADJUST);
8299 __ sub(len, len, 8);
8300
8301 __ bind(POST_LOOP16_LOAD_TAIL);
8302 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
8303 __ ldr(tmp1, Address(ary1));
8304 __ mov(tmp2, 64);
8305 __ sub(tmp4, tmp2, len, __ LSL, 3);
8306 __ lslv(tmp1, tmp1, tmp4);
8307 __ tst(tmp1, UPPER_BIT_MASK);
8308 __ br(Assembler::NE, RET_ADJUST);
8309 // Fallthrough
8310
8311 __ bind(RET_LEN);
8312 __ pop(spilled_regs, sp);
8313 __ leave();
8314 __ ret(lr);
8315
    // The difference result - len is the count of bytes that are guaranteed
    // to be positive.
8318
8319 __ bind(RET_ADJUST_LONG);
8320 __ add(len, len, (u1)(large_loop_size - 16));
8321 __ bind(RET_ADJUST_16);
8322 __ add(len, len, 16);
8323 __ bind(RET_ADJUST);
8324 __ pop(spilled_regs, sp);
8325 __ leave();
8326 __ sub(result, result, len);
8327 __ ret(lr);
8328
8329 return entry;
8330 }
8331
8332 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
8333 bool usePrefetch, Label &NOT_EQUAL) {
8334 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8335 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8336 tmp7 = r12, tmp8 = r13;
8337 Label LOOP;
8338
8339 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8340 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8341 __ bind(LOOP);
8342 if (usePrefetch) {
8343 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8344 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8345 }
8346 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8347 __ eor(tmp1, tmp1, tmp2);
8348 __ eor(tmp3, tmp3, tmp4);
8349 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8350 __ orr(tmp1, tmp1, tmp3);
8351 __ cbnz(tmp1, NOT_EQUAL);
8352 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8353 __ eor(tmp5, tmp5, tmp6);
8354 __ eor(tmp7, tmp7, tmp8);
8355 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8356 __ orr(tmp5, tmp5, tmp7);
8357 __ cbnz(tmp5, NOT_EQUAL);
8358 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8359 __ eor(tmp1, tmp1, tmp2);
8360 __ eor(tmp3, tmp3, tmp4);
8361 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8362 __ orr(tmp1, tmp1, tmp3);
8363 __ cbnz(tmp1, NOT_EQUAL);
8364 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8365 __ eor(tmp5, tmp5, tmp6);
8366 __ sub(cnt1, cnt1, 8 * wordSize);
8367 __ eor(tmp7, tmp7, tmp8);
8368 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8369 // tmp6 is not used. MacroAssembler::subs is used here (rather than
8370 // cmp) because subs allows an unlimited range of immediate operand.
8371 __ subs(tmp6, cnt1, loopThreshold);
8372 __ orr(tmp5, tmp5, tmp7);
8373 __ cbnz(tmp5, NOT_EQUAL);
8374 __ br(__ GE, LOOP);
8375 // post-loop
8376 __ eor(tmp1, tmp1, tmp2);
8377 __ eor(tmp3, tmp3, tmp4);
8378 __ orr(tmp1, tmp1, tmp3);
8379 __ sub(cnt1, cnt1, 2 * wordSize);
8380 __ cbnz(tmp1, NOT_EQUAL);
8381 }
8382
8383 void generate_large_array_equals_loop_simd(int loopThreshold,
8384 bool usePrefetch, Label &NOT_EQUAL) {
8385 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8386 tmp2 = rscratch2;
8387 Label LOOP;
8388
8389 __ bind(LOOP);
8390 if (usePrefetch) {
8391 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8392 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8393 }
8394 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
8395 __ sub(cnt1, cnt1, 8 * wordSize);
8396 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
8397 __ subs(tmp1, cnt1, loopThreshold);
8398 __ eor(v0, __ T16B, v0, v4);
8399 __ eor(v1, __ T16B, v1, v5);
8400 __ eor(v2, __ T16B, v2, v6);
8401 __ eor(v3, __ T16B, v3, v7);
8402 __ orr(v0, __ T16B, v0, v1);
8403 __ orr(v1, __ T16B, v2, v3);
8404 __ orr(v0, __ T16B, v0, v1);
8405 __ umov(tmp1, v0, __ D, 0);
8406 __ umov(tmp2, v0, __ D, 1);
8407 __ orr(tmp1, tmp1, tmp2);
8408 __ cbnz(tmp1, NOT_EQUAL);
8409 __ br(__ GE, LOOP);
8410 }
8411
8412 // a1 = r1 - array1 address
8413 // a2 = r2 - array2 address
8414 // result = r0 - return value. Already contains "false"
  // cnt1 = r10 - number of elements left to check, reduced by wordSize
8416 // r3-r5 are reserved temporary registers
8417 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
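  //
  // The inline code that calls this stub handles short arrays and the first word;
  // together with that inline code the stub is conceptually just
  //   result = (memcmp(a1, a2, byte_length) == 0);
  // result arrives holding false and is only set to true on the all-equal path.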
8418 address generate_large_array_equals() {
8419 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8420 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8421 tmp7 = r12, tmp8 = r13;
8422 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
8423 SMALL_LOOP, POST_LOOP;
8424 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // loop threshold chosen so that at least 32 prefetched bytes get used
8426 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
8427 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
8428 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
8429 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
8430 tmp5, tmp6, tmp7, tmp8);
8431
8432 __ align(CodeEntryAlignment);
8433
8434 StubId stub_id = StubId::stubgen_large_array_equals_id;
8435 StubCodeMark mark(this, stub_id);
8436
8437 address entry = __ pc();
8438 __ enter();
8439 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
8440 // also advance pointers to use post-increment instead of pre-increment
8441 __ add(a1, a1, wordSize);
8442 __ add(a2, a2, wordSize);
8443 if (AvoidUnalignedAccesses) {
      // Both implementations (SIMD and non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution time)
      // on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so do an extra 8-byte load if needed,
      // at least for the first source address, to make it 16-byte aligned.
8449 Label ALIGNED16;
8450 __ tbz(a1, 3, ALIGNED16);
8451 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8452 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8453 __ sub(cnt1, cnt1, wordSize);
8454 __ eor(tmp1, tmp1, tmp2);
8455 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
8456 __ bind(ALIGNED16);
8457 }
8458 if (UseSIMDForArrayEquals) {
8459 if (SoftwarePrefetchHintDistance >= 0) {
8460 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8461 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8462 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
8463 /* prfm = */ true, NOT_EQUAL);
8464 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8465 __ br(__ LT, TAIL);
8466 }
8467 __ bind(NO_PREFETCH_LARGE_LOOP);
8468 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
8469 /* prfm = */ false, NOT_EQUAL);
8470 } else {
8471 __ push(spilled_regs, sp);
8472 if (SoftwarePrefetchHintDistance >= 0) {
8473 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8474 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8475 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
8476 /* prfm = */ true, NOT_EQUAL);
8477 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8478 __ br(__ LT, TAIL);
8479 }
8480 __ bind(NO_PREFETCH_LARGE_LOOP);
8481 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
8482 /* prfm = */ false, NOT_EQUAL);
8483 }
8484 __ bind(TAIL);
8485 __ cbz(cnt1, EQUAL);
8486 __ subs(cnt1, cnt1, wordSize);
8487 __ br(__ LE, POST_LOOP);
8488 __ bind(SMALL_LOOP);
8489 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8490 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8491 __ subs(cnt1, cnt1, wordSize);
8492 __ eor(tmp1, tmp1, tmp2);
8493 __ cbnz(tmp1, NOT_EQUAL);
8494 __ br(__ GT, SMALL_LOOP);
8495 __ bind(POST_LOOP);
8496 __ ldr(tmp1, Address(a1, cnt1));
8497 __ ldr(tmp2, Address(a2, cnt1));
8498 __ eor(tmp1, tmp1, tmp2);
8499 __ cbnz(tmp1, NOT_EQUAL);
8500 __ bind(EQUAL);
8501 __ mov(result, true);
8502 __ bind(NOT_EQUAL);
8503 if (!UseSIMDForArrayEquals) {
8504 __ pop(spilled_regs, sp);
8505 }
8506 __ bind(NOT_EQUAL_NO_POP);
8507 __ leave();
8508 __ ret(lr);
8509 return entry;
8510 }
8511
8512 // result = r0 - return value. Contains initial hashcode value on entry.
8513 // ary = r1 - array address
8514 // cnt = r2 - elements count
8515 // Clobbers: v0-v13, rscratch1, rscratch2
8516 address generate_large_arrays_hashcode(BasicType eltype) {
8517 const Register result = r0, ary = r1, cnt = r2;
8518 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
8519 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
8520 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
8521 const FloatRegister vpowm = v13;
8522
8523 ARRAYS_HASHCODE_REGISTERS;
8524
8525 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
8526
8527 unsigned int vf; // vectorization factor
8528 bool multiply_by_halves;
8529 Assembler::SIMD_Arrangement load_arrangement;
8530 switch (eltype) {
8531 case T_BOOLEAN:
8532 case T_BYTE:
8533 load_arrangement = Assembler::T8B;
8534 multiply_by_halves = true;
8535 vf = 8;
8536 break;
8537 case T_CHAR:
8538 case T_SHORT:
8539 load_arrangement = Assembler::T8H;
8540 multiply_by_halves = true;
8541 vf = 8;
8542 break;
8543 case T_INT:
8544 load_arrangement = Assembler::T4S;
8545 multiply_by_halves = false;
8546 vf = 4;
8547 break;
8548 default:
8549 ShouldNotReachHere();
8550 }
8551
8552 // Unroll factor
8553 const unsigned uf = 4;
8554
8555 // Effective vectorization factor
8556 const unsigned evf = vf * uf;
8557
8558 __ align(CodeEntryAlignment);
8559
8560 StubId stub_id;
8561 switch (eltype) {
8562 case T_BOOLEAN:
8563 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
8564 break;
8565 case T_BYTE:
8566 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
8567 break;
8568 case T_CHAR:
8569 stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
8570 break;
8571 case T_SHORT:
8572 stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
8573 break;
8574 case T_INT:
8575 stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
8576 break;
8577 default:
8578 stub_id = StubId::NO_STUBID;
8579 ShouldNotReachHere();
8580 };
8581
8582 StubCodeMark mark(this, stub_id);
8583
8584 address entry = __ pc();
8585 __ enter();
8586
    // Put the 0th-3rd powers of 31 together into a single SIMD register. The register is
    // used in the SMALL and LARGE LOOPs' epilogues. The initialization is hoisted here and
    // the register's value must not change throughout either loop.
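    //
    // Scalar reference for the whole stub, where h is the incoming result and elements
    // are sign- or zero-extended according to eltype:
    //   for (int i = 0; i < cnt; i++) h = 31 * h + ary[i];
    // With vf == 4, for example, the loop keeps one accumulator per lane, multiplies all
    // lanes by 31^vf (vpowm) each iteration and folds in 4 new elements, computing
    //   h*31^4 + a[i]*31^3 + a[i+1]*31^2 + a[i+2]*31 + a[i+3]
    // per step; the epilogue multiplies the lanes by <31^3, 31^2, 31, 1> (vpow) and sums
    // them to recover the scalar hash.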
8590 __ movw(rscratch1, intpow(31U, 3));
8591 __ mov(vpow, Assembler::S, 0, rscratch1);
8592 __ movw(rscratch1, intpow(31U, 2));
8593 __ mov(vpow, Assembler::S, 1, rscratch1);
8594 __ movw(rscratch1, intpow(31U, 1));
8595 __ mov(vpow, Assembler::S, 2, rscratch1);
8596 __ movw(rscratch1, intpow(31U, 0));
8597 __ mov(vpow, Assembler::S, 3, rscratch1);
8598
8599 __ mov(vmul0, Assembler::T16B, 0);
8600 __ mov(vmul0, Assembler::S, 3, result);
8601
8602 __ andr(rscratch2, cnt, (uf - 1) * vf);
8603 __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
8604
8605 __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
8606 __ mov(vpowm, Assembler::S, 0, rscratch1);
8607
8608 // SMALL LOOP
8609 __ bind(SMALL_LOOP);
8610
8611 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
8612 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8613 __ subsw(rscratch2, rscratch2, vf);
8614
8615 if (load_arrangement == Assembler::T8B) {
8616 // Extend 8B to 8H to be able to use vector multiply
8617 // instructions
8618 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8619 if (is_signed_subword_type(eltype)) {
8620 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8621 } else {
8622 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8623 }
8624 }
8625
8626 switch (load_arrangement) {
8627 case Assembler::T4S:
8628 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8629 break;
8630 case Assembler::T8B:
8631 case Assembler::T8H:
8632 assert(is_subword_type(eltype), "subword type expected");
8633 if (is_signed_subword_type(eltype)) {
8634 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8635 } else {
8636 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8637 }
8638 break;
8639 default:
8640 __ should_not_reach_here();
8641 }
8642
8643 // Process the upper half of a vector
8644 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8645 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8646 if (is_signed_subword_type(eltype)) {
8647 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8648 } else {
8649 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8650 }
8651 }
8652
8653 __ br(Assembler::HI, SMALL_LOOP);
8654
    // SMALL LOOP'S EPILOGUE
8656 __ lsr(rscratch2, cnt, exact_log2(evf));
8657 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
8658
8659 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8660 __ addv(vmul0, Assembler::T4S, vmul0);
8661 __ umov(result, vmul0, Assembler::S, 0);
8662
8663 // TAIL
8664 __ bind(TAIL);
8665
    // The andr computes cnt % vf. The subtract, with the count shifted left by 3 (4 on
    // Cortex-A53, see below), skips the first vf - 1 - (cnt % vf) load + madd pairs,
    // i.e. only the last cnt % vf load + madd pairs are executed.
8668 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
8669 __ andr(rscratch2, cnt, vf - 1);
8670 __ bind(TAIL_SHORTCUT);
8671 __ adr(rscratch1, BR_BASE);
8672 // For Cortex-A53 offset is 4 because 2 nops are generated.
8673 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
8674 __ movw(rscratch2, 0x1f);
8675 __ br(rscratch1);
8676
8677 for (size_t i = 0; i < vf - 1; ++i) {
8678 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
8679 eltype);
8680 __ maddw(result, result, rscratch2, rscratch1);
8681 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
8682 // Generate 2nd nop to have 4 instructions per iteration.
8683 if (VM_Version::supports_a53mac()) {
8684 __ nop();
8685 }
8686 }
8687 __ bind(BR_BASE);
8688
8689 __ leave();
8690 __ ret(lr);
8691
8692 // LARGE LOOP
8693 __ bind(LARGE_LOOP_PREHEADER);
8694
8695 __ lsr(rscratch2, cnt, exact_log2(evf));
8696
8697 if (multiply_by_halves) {
8698 // 31^4 - multiplier between lower and upper parts of a register
8699 __ movw(rscratch1, intpow(31U, vf / 2));
8700 __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
8702 __ movw(rscratch1, intpow(31U, evf - vf / 2));
8703 __ mov(vpowm, Assembler::S, 0, rscratch1);
8704 } else {
8705 // 31^16
8706 __ movw(rscratch1, intpow(31U, evf));
8707 __ mov(vpowm, Assembler::S, 0, rscratch1);
8708 }
8709
8710 __ mov(vmul3, Assembler::T16B, 0);
8711 __ mov(vmul2, Assembler::T16B, 0);
8712 __ mov(vmul1, Assembler::T16B, 0);
8713
8714 __ bind(LARGE_LOOP);
8715
8716 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
8717 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
8718 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
8719 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8720
8721 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
8722 Address(__ post(ary, evf * type2aelembytes(eltype))));
8723
8724 if (load_arrangement == Assembler::T8B) {
8725 // Extend 8B to 8H to be able to use vector multiply
8726 // instructions
8727 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8728 if (is_signed_subword_type(eltype)) {
8729 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8730 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8731 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8732 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8733 } else {
8734 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8735 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8736 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8737 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8738 }
8739 }
8740
8741 switch (load_arrangement) {
8742 case Assembler::T4S:
8743 __ addv(vmul3, load_arrangement, vmul3, vdata3);
8744 __ addv(vmul2, load_arrangement, vmul2, vdata2);
8745 __ addv(vmul1, load_arrangement, vmul1, vdata1);
8746 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8747 break;
8748 case Assembler::T8B:
8749 case Assembler::T8H:
8750 assert(is_subword_type(eltype), "subword type expected");
8751 if (is_signed_subword_type(eltype)) {
8752 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8753 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8754 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8755 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8756 } else {
8757 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8758 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8759 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8760 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8761 }
8762 break;
8763 default:
8764 __ should_not_reach_here();
8765 }
8766
8767 // Process the upper half of a vector
8768 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8769 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
8770 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
8771 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
8772 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
8773 if (is_signed_subword_type(eltype)) {
8774 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8775 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8776 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8777 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8778 } else {
8779 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8780 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8781 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8782 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8783 }
8784 }
8785
8786 __ subsw(rscratch2, rscratch2, 1);
8787 __ br(Assembler::HI, LARGE_LOOP);
8788
8789 __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
8790 __ addv(vmul3, Assembler::T4S, vmul3);
8791 __ umov(result, vmul3, Assembler::S, 0);
8792
8793 __ mov(rscratch2, intpow(31U, vf));
8794
8795 __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
8796 __ addv(vmul2, Assembler::T4S, vmul2);
8797 __ umov(rscratch1, vmul2, Assembler::S, 0);
8798 __ maddw(result, result, rscratch2, rscratch1);
8799
8800 __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
8801 __ addv(vmul1, Assembler::T4S, vmul1);
8802 __ umov(rscratch1, vmul1, Assembler::S, 0);
8803 __ maddw(result, result, rscratch2, rscratch1);
8804
8805 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8806 __ addv(vmul0, Assembler::T4S, vmul0);
8807 __ umov(rscratch1, vmul0, Assembler::S, 0);
8808 __ maddw(result, result, rscratch2, rscratch1);
8809
8810 __ andr(rscratch2, cnt, vf - 1);
8811 __ cbnz(rscratch2, TAIL_SHORTCUT);
8812
8813 __ leave();
8814 __ ret(lr);
8815
8816 return entry;
8817 }
8818
8819 address generate_dsin_dcos(bool isCos) {
8820 __ align(CodeEntryAlignment);
8821 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
8822 StubCodeMark mark(this, stub_id);
8823 address start = __ pc();
8824 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
8825 (address)StubRoutines::aarch64::_two_over_pi,
8826 (address)StubRoutines::aarch64::_pio2,
8827 (address)StubRoutines::aarch64::_dsin_coef,
8828 (address)StubRoutines::aarch64::_dcos_coef);
8829 return start;
8830 }
8831
  // Code for comparing 16 characters of a Latin1 string against a UTF-16 string.
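  // The Latin1 operand is widened on the fly: zip1/zip2 with the zero register interleave
  // each Latin1 byte with a zero byte, which is exactly its UTF-16 encoding, and the
  // result is compared 4 characters (8 bytes) at a time. Conceptually (sketch; latin1 and
  // utf16 are just illustrative names for the two inputs):
  //
  //   for (int i = 0; i < 16; i++) {
  //     if ((uint16_t)latin1[i] != utf16[i]) goto DIFF;
  //   }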
8833 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
8834 Label &DIFF2) {
8835 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
8836 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
8837
8838 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
8839 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8840 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
8841 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
8842
8843 __ fmovd(tmpL, vtmp3);
8844 __ eor(rscratch2, tmp3, tmpL);
8845 __ cbnz(rscratch2, DIFF2);
8846
8847 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8848 __ umov(tmpL, vtmp3, __ D, 1);
8849 __ eor(rscratch2, tmpU, tmpL);
8850 __ cbnz(rscratch2, DIFF1);
8851
8852 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
8853 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8854 __ fmovd(tmpL, vtmp);
8855 __ eor(rscratch2, tmp3, tmpL);
8856 __ cbnz(rscratch2, DIFF2);
8857
8858 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8859 __ umov(tmpL, vtmp, __ D, 1);
8860 __ eor(rscratch2, tmpU, tmpL);
8861 __ cbnz(rscratch2, DIFF1);
8862 }
8863
8864 // r0 = result
8865 // r1 = str1
8866 // r2 = cnt1
8867 // r3 = str2
8868 // r4 = cnt2
8869 // r10 = tmp1
8870 // r11 = tmp2
8871 address generate_compare_long_string_different_encoding(bool isLU) {
8872 __ align(CodeEntryAlignment);
8873 StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
8874 StubCodeMark mark(this, stub_id);
8875 address entry = __ pc();
8876 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
8877 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
8878 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
8879 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
8880 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
8881 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
8882 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
8883
8884 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
8885
8886 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
8889 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8890 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
8891 __ add(str2, str2, isLU ? wordSize : wordSize/2);
8892 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // 4 symbols already loaded. The last 4 are a special case.
8894 __ eor(rscratch2, tmp1, tmp2);
8895 __ mov(rscratch1, tmp2);
8896 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
8897 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
8898 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
8899 __ push(spilled_regs, sp);
8900 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
8901 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
8902
8903 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8904
8905 if (SoftwarePrefetchHintDistance >= 0) {
8906 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8907 __ br(__ LT, NO_PREFETCH);
8908 __ bind(LARGE_LOOP_PREFETCH);
8909 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
8910 __ mov(tmp4, 2);
8911 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8912 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
8913 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8914 __ subs(tmp4, tmp4, 1);
8915 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
8916 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8917 __ mov(tmp4, 2);
8918 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
8919 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8920 __ subs(tmp4, tmp4, 1);
8921 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
8922 __ sub(cnt2, cnt2, 64);
8923 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8924 __ br(__ GE, LARGE_LOOP_PREFETCH);
8925 }
8926 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
8927 __ bind(NO_PREFETCH);
8928 __ subs(cnt2, cnt2, 16);
8929 __ br(__ LT, TAIL);
8930 __ align(OptoLoopAlignment);
8931 __ bind(SMALL_LOOP); // smaller loop
8932 __ subs(cnt2, cnt2, 16);
8933 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8934 __ br(__ GE, SMALL_LOOP);
8935 __ cmn(cnt2, (u1)16);
8936 __ br(__ EQ, LOAD_LAST);
8937 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
8938 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
8939 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
8940 __ ldr(tmp3, Address(cnt1, -8));
8941 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
8942 __ b(LOAD_LAST);
8943 __ bind(DIFF2);
8944 __ mov(tmpU, tmp3);
8945 __ bind(DIFF1);
8946 __ pop(spilled_regs, sp);
8947 __ b(CALCULATE_DIFFERENCE);
8948 __ bind(LOAD_LAST);
8949 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
    // No need to load them again.
8951 __ mov(tmpU, tmp3);
8952 __ pop(spilled_regs, sp);
8953
8954 // tmp2 points to the address of the last 4 Latin1 characters right now
8955 __ ldrs(vtmp, Address(tmp2));
8956 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8957 __ fmovd(tmpL, vtmp);
8958
8959 __ eor(rscratch2, tmpU, tmpL);
8960 __ cbz(rscratch2, DONE);
8961
8962 // Find the first different characters in the longwords and
8963 // compute their difference.
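    // Sketch of the computation below, where x and y are the two 8-byte chunks that
    // differ and __builtin_bswap64/__builtin_clzll stand in for rev/clz:
    //   int bit = __builtin_clzll(__builtin_bswap64(x ^ y)) & ~15;  // first differing char
    //   result  = (int)((x >> bit) & 0xffff) - (int)((y >> bit) & 0xffff);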
8964 __ bind(CALCULATE_DIFFERENCE);
8965 __ rev(rscratch2, rscratch2);
8966 __ clz(rscratch2, rscratch2);
8967 __ andr(rscratch2, rscratch2, -16);
8968 __ lsrv(tmp1, tmp1, rscratch2);
8969 __ uxthw(tmp1, tmp1);
8970 __ lsrv(rscratch1, rscratch1, rscratch2);
8971 __ uxthw(rscratch1, rscratch1);
8972 __ subw(result, tmp1, rscratch1);
8973 __ bind(DONE);
8974 __ ret(lr);
8975 return entry;
8976 }
8977
8978 // r0 = input (float16)
8979 // v0 = result (float)
8980 // v1 = temporary float register
8981 address generate_float16ToFloat() {
8982 __ align(CodeEntryAlignment);
8983 StubId stub_id = StubId::stubgen_hf2f_id;
8984 StubCodeMark mark(this, stub_id);
8985 address entry = __ pc();
8986 BLOCK_COMMENT("Entry:");
8987 __ flt16_to_flt(v0, r0, v1);
8988 __ ret(lr);
8989 return entry;
8990 }
8991
8992 // v0 = input (float)
8993 // r0 = result (float16)
8994 // v1 = temporary float register
8995 address generate_floatToFloat16() {
8996 __ align(CodeEntryAlignment);
8997 StubId stub_id = StubId::stubgen_f2hf_id;
8998 StubCodeMark mark(this, stub_id);
8999 address entry = __ pc();
9000 BLOCK_COMMENT("Entry:");
9001 __ flt_to_flt16(r0, v0, v1);
9002 __ ret(lr);
9003 return entry;
9004 }
9005
9006 address generate_method_entry_barrier() {
9007 __ align(CodeEntryAlignment);
9008 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
9009 StubCodeMark mark(this, stub_id);
9010
9011 Label deoptimize_label;
9012
9013 address start = __ pc();
9014
9015 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
9016
9017 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
9018 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
9019 // We can get here despite the nmethod being good, if we have not
9020 // yet applied our cross modification fence (or data fence).
9021 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
9022 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
9023 __ ldrw(rscratch2, rscratch2);
9024 __ strw(rscratch2, thread_epoch_addr);
9025 __ isb();
9026 __ membar(__ LoadLoad);
9027 }
9028
9029 __ set_last_Java_frame(sp, rfp, lr, rscratch1);
9030
9031 __ enter();
9032 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
9033
9034 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
9035
9036 __ push_call_clobbered_registers();
9037
9038 __ mov(c_rarg0, rscratch2);
9039 __ call_VM_leaf
9040 (CAST_FROM_FN_PTR
9041 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
9042
9043 __ reset_last_Java_frame(true);
9044
9045 __ mov(rscratch1, r0);
9046
9047 __ pop_call_clobbered_registers();
9048
9049 __ cbnz(rscratch1, deoptimize_label);
9050
9051 __ leave();
9052 __ ret(lr);
9053
9054 __ BIND(deoptimize_label);
9055
9056 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
9057 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
9058
9059 __ mov(sp, rscratch1);
9060 __ br(rscratch2);
9061
9062 return start;
9063 }
9064
9065 // r0 = result
9066 // r1 = str1
9067 // r2 = cnt1
9068 // r3 = str2
9069 // r4 = cnt2
9070 // r10 = tmp1
9071 // r11 = tmp2
9072 address generate_compare_long_string_same_encoding(bool isLL) {
9073 __ align(CodeEntryAlignment);
9074 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
9075 StubCodeMark mark(this, stub_id);
9076 address entry = __ pc();
9077 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9078 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
9079
9080 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
9081
    // exit the large loop when fewer than 64 bytes are left to read, or when we are
    // about to prefetch memory beyond the end of the array
9084 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
9085
    // The caller pre-loads 8 bytes before jumping to the stub, so compare them directly.
9087 __ eor(rscratch2, tmp1, tmp2);
9088 __ cbnz(rscratch2, CAL_DIFFERENCE);
9089
9090 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
9091 // update pointers, because of previous read
9092 __ add(str1, str1, wordSize);
9093 __ add(str2, str2, wordSize);
9094 if (SoftwarePrefetchHintDistance >= 0) {
9095 __ align(OptoLoopAlignment);
9096 __ bind(LARGE_LOOP_PREFETCH);
9097 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
9098 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
9099
9100 for (int i = 0; i < 4; i++) {
9101 __ ldp(tmp1, tmp1h, Address(str1, i * 16));
9102 __ ldp(tmp2, tmp2h, Address(str2, i * 16));
9103 __ cmp(tmp1, tmp2);
9104 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9105 __ br(Assembler::NE, DIFF);
9106 }
9107 __ sub(cnt2, cnt2, isLL ? 64 : 32);
9108 __ add(str1, str1, 64);
9109 __ add(str2, str2, 64);
9110 __ subs(rscratch2, cnt2, largeLoopExitCondition);
9111 __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
9112 __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
9113 }
9114
9115 __ subs(rscratch1, cnt2, isLL ? 16 : 8);
9116 __ br(Assembler::LE, LESS16);
9117 __ align(OptoLoopAlignment);
9118 __ bind(LOOP_COMPARE16);
9119 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9120 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9121 __ cmp(tmp1, tmp2);
9122 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9123 __ br(Assembler::NE, DIFF);
9124 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9125 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9126 __ br(Assembler::LT, LESS16);
9127
9128 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9129 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9130 __ cmp(tmp1, tmp2);
9131 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9132 __ br(Assembler::NE, DIFF);
9133 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9134 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9135 __ br(Assembler::GE, LOOP_COMPARE16);
9136 __ cbz(cnt2, LENGTH_DIFF);
9137
9138 __ bind(LESS16);
    // compare 8 bytes at a time
9140 __ subs(cnt2, cnt2, isLL ? 8 : 4);
9141 __ br(Assembler::LE, LESS8);
9142 __ ldr(tmp1, Address(__ post(str1, 8)));
9143 __ ldr(tmp2, Address(__ post(str2, 8)));
9144 __ eor(rscratch2, tmp1, tmp2);
9145 __ cbnz(rscratch2, CAL_DIFFERENCE);
9146 __ sub(cnt2, cnt2, isLL ? 8 : 4);
9147
9148 __ bind(LESS8); // directly load last 8 bytes
9149 if (!isLL) {
9150 __ add(cnt2, cnt2, cnt2);
9151 }
9152 __ ldr(tmp1, Address(str1, cnt2));
9153 __ ldr(tmp2, Address(str2, cnt2));
9154 __ eor(rscratch2, tmp1, tmp2);
9155 __ cbz(rscratch2, LENGTH_DIFF);
9156 __ b(CAL_DIFFERENCE);
9157
9158 __ bind(DIFF);
9159 __ cmp(tmp1, tmp2);
9160 __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
9161 __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
9162 // reuse rscratch2 register for the result of eor instruction
9163 __ eor(rscratch2, tmp1, tmp2);
9164
9165 __ bind(CAL_DIFFERENCE);
9166 __ rev(rscratch2, rscratch2);
9167 __ clz(rscratch2, rscratch2);
9168 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
9169 __ lsrv(tmp1, tmp1, rscratch2);
9170 __ lsrv(tmp2, tmp2, rscratch2);
9171 if (isLL) {
9172 __ uxtbw(tmp1, tmp1);
9173 __ uxtbw(tmp2, tmp2);
9174 } else {
9175 __ uxthw(tmp1, tmp1);
9176 __ uxthw(tmp2, tmp2);
9177 }
9178 __ subw(result, tmp1, tmp2);
9179
9180 __ bind(LENGTH_DIFF);
9181 __ ret(lr);
9182 return entry;
9183 }
9184
9185 enum string_compare_mode {
9186 LL,
9187 LU,
9188 UL,
9189 UU,
9190 };
9191
9192 // The following registers are declared in aarch64.ad
9193 // r0 = result
9194 // r1 = str1
9195 // r2 = cnt1
9196 // r3 = str2
9197 // r4 = cnt2
9198 // r10 = tmp1
9199 // r11 = tmp2
9200 // z0 = ztmp1
9201 // z1 = ztmp2
9202 // p0 = pgtmp1
9203 // p1 = pgtmp2
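  // The loop below is fully predicated: sve_whilelt(idx, cnt) produces an all-true
  // predicate while a whole vector of characters remains and a partial predicate for the
  // final tail, so no scalar epilogue is needed. Rough shape (pseudo-code sketch only;
  // lane width is 1 or 2 bytes depending on mode):
  //
  //   idx = 0; pg = whilelt(idx, cnt);
  //   do {
  //     v1 = load(str1 + idx, pg);  v2 = load(str2 + idx, pg);  idx += vec_len;
  //     if (any(v1 != v2, pg)) goto MISMATCH;
  //   } while (idx < cnt - vec_len);
  //   pg = whilelt(idx, cnt);   // possibly partial last iteration, then DONE or MISMATCH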
9204 address generate_compare_long_string_sve(string_compare_mode mode) {
9205 StubId stub_id;
9206 switch (mode) {
9207 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
9208 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
9209 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
9210 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
9211 default: ShouldNotReachHere();
9212 }
9213
9214 __ align(CodeEntryAlignment);
9215 address entry = __ pc();
9216 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9217 tmp1 = r10, tmp2 = r11;
9218
9219 Label LOOP, DONE, MISMATCH;
9220 Register vec_len = tmp1;
9221 Register idx = tmp2;
9222 // The minimum of the string lengths has been stored in cnt2.
9223 Register cnt = cnt2;
9224 FloatRegister ztmp1 = z0, ztmp2 = z1;
9225 PRegister pgtmp1 = p0, pgtmp2 = p1;
9226
9227 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
9228 switch (mode) { \
9229 case LL: \
9230 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
9231 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
9232 break; \
9233 case LU: \
9234 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
9235 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9236 break; \
9237 case UL: \
9238 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9239 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
9240 break; \
9241 case UU: \
9242 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9243 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9244 break; \
9245 default: \
9246 ShouldNotReachHere(); \
9247 }
9248
9249 StubCodeMark mark(this, stub_id);
9250
9251 __ mov(idx, 0);
9252 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9253
9254 if (mode == LL) {
9255 __ sve_cntb(vec_len);
9256 } else {
9257 __ sve_cnth(vec_len);
9258 }
9259
9260 __ sub(rscratch1, cnt, vec_len);
9261
9262 __ bind(LOOP);
9263
9264 // main loop
9265 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9266 __ add(idx, idx, vec_len);
9267 // Compare strings.
9268 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9269 __ br(__ NE, MISMATCH);
9270 __ cmp(idx, rscratch1);
9271 __ br(__ LT, LOOP);
9272
9273 // post loop, last iteration
9274 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9275
9276 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9277 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9278 __ br(__ EQ, DONE);
9279
9280 __ bind(MISMATCH);
9281
9282 // Crop the vector to find its location.
9283 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
9284 // Extract the first different characters of each string.
9285 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
9286 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
9287
9288 // Compute the difference of the first different characters.
9289 __ sub(result, rscratch1, rscratch2);
9290
9291 __ bind(DONE);
9292 __ ret(lr);
9293 #undef LOAD_PAIR
9294 return entry;
9295 }
9296
9297 void generate_compare_long_strings() {
9298 if (UseSVE == 0) {
9299 StubRoutines::aarch64::_compare_long_string_LL
9300 = generate_compare_long_string_same_encoding(true);
9301 StubRoutines::aarch64::_compare_long_string_UU
9302 = generate_compare_long_string_same_encoding(false);
9303 StubRoutines::aarch64::_compare_long_string_LU
9304 = generate_compare_long_string_different_encoding(true);
9305 StubRoutines::aarch64::_compare_long_string_UL
9306 = generate_compare_long_string_different_encoding(false);
9307 } else {
9308 StubRoutines::aarch64::_compare_long_string_LL
9309 = generate_compare_long_string_sve(LL);
9310 StubRoutines::aarch64::_compare_long_string_UU
9311 = generate_compare_long_string_sve(UU);
9312 StubRoutines::aarch64::_compare_long_string_LU
9313 = generate_compare_long_string_sve(LU);
9314 StubRoutines::aarch64::_compare_long_string_UL
9315 = generate_compare_long_string_sve(UL);
9316 }
9317 }
9318
9319 // R0 = result
9320 // R1 = str2
9321 // R2 = cnt1
9322 // R3 = str1
9323 // R4 = cnt2
9324 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
9325 //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
  //    in order to skip the initial load (helps on systems with a single load pipeline)
  // 2) we can use a "fast" algorithm for finding the first character, with fewer
  //    branches (1 branch per loaded register instead of one branch per symbol);
  //    this is where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f and
  //    0x7fff7fff...7fff come from (see the SWAR sketch after this comment)
  // 3) after loading and analyzing the 1st register of the source string, it can be
  //    reused to search for every occurrence of the 1st character, saving a few loads
  //    compared to a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily re-uses,
  //    re-initializes and compresses register values, which makes the code larger and
  //    a bit less readable; however, most of the extra operations are issued during
  //    loads or branches, so the penalty is minimal
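  //
  // SWAR sketch for idea 2 above (Latin1 variant; the UTF-16 variant uses the
  // 0x0001...0001 and 0x7fff...7fff constants instead; chunk and first_char are
  // illustrative names):
  //   uint64_t x   = chunk ^ (first_char * 0x0101010101010101ull);
  //   uint64_t hit = (x - 0x0101010101010101ull) & ~x & 0x8080808080808080ull;
  // 'hit' has the top bit set in the lowest byte lane where chunk matches first_char;
  // higher lanes may contain spurious bits, which the exact comparison loops below
  // filter out. The code folds the ~x and 0x80...80 mask into one orr + bics.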
9340 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
9341 StubId stub_id;
9342 if (str1_isL) {
9343 if (str2_isL) {
9344 stub_id = StubId::stubgen_string_indexof_linear_ll_id;
9345 } else {
9346 stub_id = StubId::stubgen_string_indexof_linear_ul_id;
9347 }
9348 } else {
9349 if (str2_isL) {
9350 ShouldNotReachHere();
9351 } else {
9352 stub_id = StubId::stubgen_string_indexof_linear_uu_id;
9353 }
9354 }
9355 __ align(CodeEntryAlignment);
9356 StubCodeMark mark(this, stub_id);
9357 address entry = __ pc();
9358
9359 int str1_chr_size = str1_isL ? 1 : 2;
9360 int str2_chr_size = str2_isL ? 1 : 2;
9361 int str1_chr_shift = str1_isL ? 0 : 1;
9362 int str2_chr_shift = str2_isL ? 0 : 1;
9363 bool isL = str1_isL && str2_isL;
9364 // parameters
9365 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
9366 // temporary registers
9367 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
9368 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
9369 // redefinitions
9370 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
9371
9372 __ push(spilled_regs, sp);
9373 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
9374 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
9375 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
9376 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
9377 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
9378 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
9379 // Read whole register from str1. It is safe, because length >=8 here
9380 __ ldr(ch1, Address(str1));
9381 // Read whole register from str2. It is safe, because length >=8 here
9382 __ ldr(ch2, Address(str2));
9383 __ sub(cnt2, cnt2, cnt1);
9384 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
9385 if (str1_isL != str2_isL) {
9386 __ eor(v0, __ T16B, v0, v0);
9387 }
9388 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
9389 __ mul(first, first, tmp1);
    // check whether less than one register's worth of characters is left to check
9391 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
9392 if (str1_isL != str2_isL) {
9393 __ fmovd(v1, ch1);
9394 }
9395 __ br(__ LE, L_SMALL);
9396 __ eor(ch2, first, ch2);
9397 if (str1_isL != str2_isL) {
9398 __ zip1(v1, __ T16B, v1, v0);
9399 }
9400 __ sub(tmp2, ch2, tmp1);
9401 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9402 __ bics(tmp2, tmp2, ch2);
9403 if (str1_isL != str2_isL) {
9404 __ fmovd(ch1, v1);
9405 }
9406 __ br(__ NE, L_HAS_ZERO);
9407 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9408 __ add(result, result, wordSize/str2_chr_size);
9409 __ add(str2, str2, wordSize);
9410 __ br(__ LT, L_POST_LOOP);
9411 __ BIND(L_LOOP);
9412 __ ldr(ch2, Address(str2));
9413 __ eor(ch2, first, ch2);
9414 __ sub(tmp2, ch2, tmp1);
9415 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9416 __ bics(tmp2, tmp2, ch2);
9417 __ br(__ NE, L_HAS_ZERO);
9418 __ BIND(L_LOOP_PROCEED);
9419 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9420 __ add(str2, str2, wordSize);
9421 __ add(result, result, wordSize/str2_chr_size);
9422 __ br(__ GE, L_LOOP);
9423 __ BIND(L_POST_LOOP);
9424 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
9425 __ br(__ LE, NOMATCH);
9426 __ ldr(ch2, Address(str2));
9427 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9428 __ eor(ch2, first, ch2);
9429 __ sub(tmp2, ch2, tmp1);
9430 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9431 __ mov(tmp4, -1); // all bits set
9432 __ b(L_SMALL_PROCEED);
9433 __ align(OptoLoopAlignment);
9434 __ BIND(L_SMALL);
9435 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9436 __ eor(ch2, first, ch2);
9437 if (str1_isL != str2_isL) {
9438 __ zip1(v1, __ T16B, v1, v0);
9439 }
9440 __ sub(tmp2, ch2, tmp1);
9441 __ mov(tmp4, -1); // all bits set
9442 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9443 if (str1_isL != str2_isL) {
9444 __ fmovd(ch1, v1); // move converted 4 symbols
9445 }
9446 __ BIND(L_SMALL_PROCEED);
9447 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
9448 __ bic(tmp2, tmp2, ch2);
9449 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
9450 __ rbit(tmp2, tmp2);
9451 __ br(__ EQ, NOMATCH);
9452 __ BIND(L_SMALL_HAS_ZERO_LOOP);
    __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
9454 __ cmp(cnt1, u1(wordSize/str2_chr_size));
9455 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
9456 if (str2_isL) { // LL
9457 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9458 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9459 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9460 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9461 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9462 } else {
9463 __ mov(ch2, 0xE); // all bits in byte set except last one
9464 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9465 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9466 __ lslv(tmp2, tmp2, tmp4);
9467 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9468 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9469 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9470 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9471 }
9472 __ cmp(ch1, ch2);
9473 __ mov(tmp4, wordSize/str2_chr_size);
9474 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9475 __ BIND(L_SMALL_CMP_LOOP);
9476 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9477 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9478 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9479 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9480 __ add(tmp4, tmp4, 1);
9481 __ cmp(tmp4, cnt1);
9482 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
9483 __ cmp(first, ch2);
9484 __ br(__ EQ, L_SMALL_CMP_LOOP);
9485 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
9486 __ cbz(tmp2, NOMATCH); // no more matches. exit
9487 __ clz(tmp4, tmp2);
9488 __ add(result, result, 1); // advance index
9489 __ add(str2, str2, str2_chr_size); // advance pointer
9490 __ b(L_SMALL_HAS_ZERO_LOOP);
9491 __ align(OptoLoopAlignment);
9492 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
9493 __ cmp(first, ch2);
9494 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9495 __ b(DONE);
9496 __ align(OptoLoopAlignment);
9497 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
9498 if (str2_isL) { // LL
9499 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9500 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9501 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9502 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9503 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9504 } else {
9505 __ mov(ch2, 0xE); // all bits in byte set except last one
9506 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9507 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9508 __ lslv(tmp2, tmp2, tmp4);
9509 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9510 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9511 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9512 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9513 }
9514 __ cmp(ch1, ch2);
9515 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9516 __ b(DONE);
9517 __ align(OptoLoopAlignment);
9518 __ BIND(L_HAS_ZERO);
9519 __ rbit(tmp2, tmp2);
    __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
    // Now, compress the two counters (cnt2 and cnt1) into one register. This is
    // fine because both counters are 32-bit and are not changed in this loop;
    // they are restored on exit. As a result, cnt1 can be re-used in this loop.
9524 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
9525 __ sub(result, result, 1);
9526 __ BIND(L_HAS_ZERO_LOOP);
9527 __ mov(cnt1, wordSize/str2_chr_size);
9528 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
9529 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
9530 if (str2_isL) {
9531 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9532 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9533 __ lslv(tmp2, tmp2, tmp4);
9534 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9535 __ add(tmp4, tmp4, 1);
9536 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9537 __ lsl(tmp2, tmp2, 1);
9538 __ mov(tmp4, wordSize/str2_chr_size);
9539 } else {
9540 __ mov(ch2, 0xE);
9541 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9542 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9543 __ lslv(tmp2, tmp2, tmp4);
9544 __ add(tmp4, tmp4, 1);
9545 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9546 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9547 __ lsl(tmp2, tmp2, 1);
9548 __ mov(tmp4, wordSize/str2_chr_size);
9549 __ sub(str2, str2, str2_chr_size);
9550 }
9551 __ cmp(ch1, ch2);
9552 __ mov(tmp4, wordSize/str2_chr_size);
9553 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9554 __ BIND(L_CMP_LOOP);
9555 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9556 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9557 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9558 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9559 __ add(tmp4, tmp4, 1);
9560 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
9561 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
9562 __ cmp(cnt1, ch2);
9563 __ br(__ EQ, L_CMP_LOOP);
9564 __ BIND(L_CMP_LOOP_NOMATCH);
    // no match at this position
9566 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
9567 __ clz(tmp4, tmp2);
9568 __ add(str2, str2, str2_chr_size); // advance pointer
9569 __ b(L_HAS_ZERO_LOOP);
9570 __ align(OptoLoopAlignment);
9571 __ BIND(L_CMP_LOOP_LAST_CMP);
9572 __ cmp(cnt1, ch2);
9573 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9574 __ b(DONE);
9575 __ align(OptoLoopAlignment);
9576 __ BIND(L_CMP_LOOP_LAST_CMP2);
9577 if (str2_isL) {
9578 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9579 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9580 __ lslv(tmp2, tmp2, tmp4);
9581 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9582 __ add(tmp4, tmp4, 1);
9583 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9584 __ lsl(tmp2, tmp2, 1);
9585 } else {
9586 __ mov(ch2, 0xE);
9587 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9588 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9589 __ lslv(tmp2, tmp2, tmp4);
9590 __ add(tmp4, tmp4, 1);
9591 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9592 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9593 __ lsl(tmp2, tmp2, 1);
9594 __ sub(str2, str2, str2_chr_size);
9595 }
9596 __ cmp(ch1, ch2);
9597 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9598 __ b(DONE);
9599 __ align(OptoLoopAlignment);
9600 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
    // 1) Restore the "result" index. It was a multiple of wordSize/str2_chr_size
    //    until the L_HAS_ZERO block. A byte octet was analyzed in L_HAS_ZERO_LOOP,
    //    so result was increased by at most wordSize/str2_chr_size - 1 and the
    //    respective higher bits weren't changed. L_LOOP_PROCEED will increase
    //    result by the number of analyzed characters, so we can simply reset the
    //    lower bits of result here: the 2 lower bits for UU/UL and 3 bits for LL.
    // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
    // 3) Advance str2 to the next str2 octet: result & 7 (or & 3) is the index of
    //    the last analyzed substring inside the current octet, so str2 currently
    //    points at the respective start address and must be advanced to the next
    //    octet.
9611 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
9612 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
9613 __ bfm(result, zr, 0, 2 - str2_chr_shift);
9614 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
9615 __ movw(cnt2, cnt2);
9616 __ b(L_LOOP_PROCEED);
9617 __ align(OptoLoopAlignment);
9618 __ BIND(NOMATCH);
9619 __ mov(result, -1);
9620 __ BIND(DONE);
9621 __ pop(spilled_regs, sp);
9622 __ ret(lr);
9623 return entry;
9624 }
9625
9626 void generate_string_indexof_stubs() {
9627 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
9628 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
9629 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
9630 }
9631
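  // Helper for generate_large_byte_array_inflate below: inflates the 32
  // Latin-1 bytes held in src1/src2 to 64 bytes of UTF-16 and stores them at
  // dst (r1). The inflation relies on zip1/zip2 with the zeroed v0 register:
  // interleaving the source bytes with 0x00 produces the little-endian 16-bit
  // values, e.g. the bytes 61 62 ('a','b') become 61 00 62 00.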
9632 void inflate_and_store_2_fp_registers(bool generatePrfm,
9633 FloatRegister src1, FloatRegister src2) {
9634 Register dst = r1;
9635 __ zip1(v1, __ T16B, src1, v0);
9636 __ zip2(v2, __ T16B, src1, v0);
9637 if (generatePrfm) {
9638 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
9639 }
9640 __ zip1(v3, __ T16B, src2, v0);
9641 __ zip2(v4, __ T16B, src2, v0);
9642 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
9643 }
9644
9645 // R0 = src
9646 // R1 = dst
9647 // R2 = len
9648 // R3 = len >> 3
9649 // V0 = 0
9650 // v1 = loaded 8 bytes
9651 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
9652 address generate_large_byte_array_inflate() {
9653 __ align(CodeEntryAlignment);
9654 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
9655 StubCodeMark mark(this, stub_id);
9656 address entry = __ pc();
9657 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
9658 Register src = r0, dst = r1, len = r2, octetCounter = r3;
9659 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
9660
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction below.
9663 __ ldrd(v2, __ post(src, 8));
9664 __ sub(octetCounter, octetCounter, 2);
9665 __ zip1(v1, __ T16B, v1, v0);
9666 __ zip1(v2, __ T16B, v2, v0);
9667 __ st1(v1, v2, __ T16B, __ post(dst, 32));
9668 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9669 __ subs(rscratch1, octetCounter, large_loop_threshold);
9670 __ br(__ LE, LOOP_START);
9671 __ b(LOOP_PRFM_START);
9672 __ bind(LOOP_PRFM);
9673 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9674 __ bind(LOOP_PRFM_START);
9675 __ prfm(Address(src, SoftwarePrefetchHintDistance));
9676 __ sub(octetCounter, octetCounter, 8);
9677 __ subs(rscratch1, octetCounter, large_loop_threshold);
9678 inflate_and_store_2_fp_registers(true, v3, v4);
9679 inflate_and_store_2_fp_registers(true, v5, v6);
9680 __ br(__ GT, LOOP_PRFM);
9681 __ cmp(octetCounter, (u1)8);
9682 __ br(__ LT, DONE);
9683 __ bind(LOOP);
9684 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9685 __ bind(LOOP_START);
9686 __ sub(octetCounter, octetCounter, 8);
9687 __ cmp(octetCounter, (u1)8);
9688 inflate_and_store_2_fp_registers(false, v3, v4);
9689 inflate_and_store_2_fp_registers(false, v5, v6);
9690 __ br(__ GE, LOOP);
9691 __ bind(DONE);
9692 __ ret(lr);
9693 return entry;
9694 }
9695
9696 /**
9697 * Arguments:
9698 *
9699 * Input:
9700 * c_rarg0 - current state address
9701 * c_rarg1 - H key address
9702 * c_rarg2 - data address
9703 * c_rarg3 - number of blocks
9704 *
9705 * Output:
9706 * Updated state at c_rarg0
9707 */
9708 address generate_ghash_processBlocks() {
9709 // Bafflingly, GCM uses little-endian for the byte order, but
9710 // big-endian for the bit order. For example, the polynomial 1 is
9711 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
9712 //
9713 // So, we must either reverse the bytes in each word and do
9714 // everything big-endian or reverse the bits in each byte and do
9715 // it little-endian. On AArch64 it's more idiomatic to reverse
9716 // the bits in each byte (we have an instruction, RBIT, to do
9717 // that) and keep the data in little-endian bit order through the
9718 // calculation, bit-reversing the inputs and outputs.
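    //
    // A rough sketch of the identity this relies on (my gloss, not part of the
    // original comment): a 64x64 -> 128-bit carry-less product commutes with
    // bit reflection, i.e.
    //     clmul(rbit64(a), rbit64(b)) == rbit128(clmul(a, b)) >> 1
    // so once the state, subkeyH and each data block have been bit-reversed,
    // the whole multiply/reduce sequence can be carried out on the reflected
    // values and only the inputs and outputs need RBIT/REV64.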
9719
9720 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
9721 StubCodeMark mark(this, stub_id);
9722 Label polynomial; // local data generated at end of stub
9723 __ align(CodeEntryAlignment);
9724 address start = __ pc();
9725
9726 Register state = c_rarg0;
9727 Register subkeyH = c_rarg1;
9728 Register data = c_rarg2;
9729 Register blocks = c_rarg3;
9730
9731 FloatRegister vzr = v30;
9732 __ eor(vzr, __ T16B, vzr, vzr); // zero register
9733
9734 __ adr(rscratch1, polynomial);
9735 __ ldrq(v24, rscratch1); // The field polynomial
9736
9737 __ ldrq(v0, Address(state));
9738 __ ldrq(v1, Address(subkeyH));
9739
9740 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
9741 __ rbit(v0, __ T16B, v0);
9742 __ rev64(v1, __ T16B, v1);
9743 __ rbit(v1, __ T16B, v1);
9744
9745 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
9746 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
9747
9748 {
9749 Label L_ghash_loop;
9750 __ bind(L_ghash_loop);
9751
9752 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
9753 // reversing each byte
9754 __ rbit(v2, __ T16B, v2);
9755 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
9756
9757 // Multiply state in v2 by subkey in v1
9758 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
9759 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
9760 /*temps*/v6, v3, /*reuse/clobber b*/v2);
9761 // Reduce v7:v5 by the field polynomial
9762 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
9763
9764 __ sub(blocks, blocks, 1);
9765 __ cbnz(blocks, L_ghash_loop);
9766 }
9767
9768 // The bit-reversed result is at this point in v0
9769 __ rev64(v0, __ T16B, v0);
9770 __ rbit(v0, __ T16B, v0);
9771
9772 __ st1(v0, __ T16B, state);
9773 __ ret(lr);
9774
9775 // bind label and generate local polynomial data
9776 __ align(wordSize * 2);
9777 __ bind(polynomial);
9778 __ emit_int64(0x87); // The low-order bits of the field
9779 // polynomial (i.e. p = z^7+z^2+z+1)
9780 // repeated in the low and high parts of a
9781 // 128-bit vector
9782 __ emit_int64(0x87);
9783
9784 return start;
9785 }
9786
9787 address generate_ghash_processBlocks_wide() {
9788 address small = generate_ghash_processBlocks();
9789
9790 StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
9791 StubCodeMark mark(this, stub_id);
9792 Label polynomial; // local data generated after stub
9793 __ align(CodeEntryAlignment);
9794 address start = __ pc();
9795
9796 Register state = c_rarg0;
9797 Register subkeyH = c_rarg1;
9798 Register data = c_rarg2;
9799 Register blocks = c_rarg3;
9800
9801 const int unroll = 4;
9802
9803 __ cmp(blocks, (unsigned char)(unroll * 2));
9804 __ br(__ LT, small);
9805
9806 if (unroll > 1) {
9807 // Save state before entering routine
9808 __ sub(sp, sp, 4 * 16);
9809 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
9810 __ sub(sp, sp, 4 * 16);
9811 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
9812 }
9813
9814 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
9815
9816 if (unroll > 1) {
9817 // And restore state
9818 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
9819 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
9820 }
9821
9822 __ cmp(blocks, (unsigned char)0);
9823 __ br(__ GT, small);
9824
9825 __ ret(lr);
9826
9827 // bind label and generate polynomial data
9828 __ align(wordSize * 2);
9829 __ bind(polynomial);
9830 __ emit_int64(0x87); // The low-order bits of the field
9831 // polynomial (i.e. p = z^7+z^2+z+1)
9832 // repeated in the low and high parts of a
9833 // 128-bit vector
9834 __ emit_int64(0x87);
9835
9836 return start;
9837
9838 }
9839
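  // One SIMD round of the Base64 encoder: reads 3 * size input bytes with ld3,
  // splits each 3-byte group into four 6-bit indices
  //   ind0 = in0 >> 2
  //   ind1 = ((in0 & 0x3) << 4) | (in1 >> 4)
  //   ind2 = ((in1 & 0xf) << 2) | (in2 >> 6)
  //   ind3 = in2 & 0x3f
  // looks each index up in the 64-byte codec table with tbl, and stores the
  // resulting 4 * size characters with st4.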
9840 void generate_base64_encode_simdround(Register src, Register dst,
9841 FloatRegister codec, u8 size) {
9842
9843 FloatRegister in0 = v4, in1 = v5, in2 = v6;
9844 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
9845 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
9846
9847 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9848
9849 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
9850
9851 __ ushr(ind0, arrangement, in0, 2);
9852
9853 __ ushr(ind1, arrangement, in1, 2);
9854 __ shl(in0, arrangement, in0, 6);
9855 __ orr(ind1, arrangement, ind1, in0);
9856 __ ushr(ind1, arrangement, ind1, 2);
9857
9858 __ ushr(ind2, arrangement, in2, 4);
9859 __ shl(in1, arrangement, in1, 4);
9860 __ orr(ind2, arrangement, in1, ind2);
9861 __ ushr(ind2, arrangement, ind2, 2);
9862
9863 __ shl(ind3, arrangement, in2, 2);
9864 __ ushr(ind3, arrangement, ind3, 2);
9865
9866 __ tbl(out0, arrangement, codec, 4, ind0);
9867 __ tbl(out1, arrangement, codec, 4, ind1);
9868 __ tbl(out2, arrangement, codec, 4, ind2);
9869 __ tbl(out3, arrangement, codec, 4, ind3);
9870
9871 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
9872 }
9873
9874 /**
9875 * Arguments:
9876 *
9877 * Input:
9878 * c_rarg0 - src_start
9879 * c_rarg1 - src_offset
9880 * c_rarg2 - src_length
9881 * c_rarg3 - dest_start
9882 * c_rarg4 - dest_offset
9883 * c_rarg5 - isURL
9884 *
9885 */
9886 address generate_base64_encodeBlock() {
9887
9888 static const char toBase64[64] = {
9889 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9890 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9891 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9892 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9893 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
9894 };
9895
9896 static const char toBase64URL[64] = {
9897 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9898 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9899 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9900 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9901 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
9902 };
9903
9904 __ align(CodeEntryAlignment);
9905 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
9906 StubCodeMark mark(this, stub_id);
9907 address start = __ pc();
9908
9909 Register src = c_rarg0; // source array
9910 Register soff = c_rarg1; // source start offset
9911 Register send = c_rarg2; // source end offset
9912 Register dst = c_rarg3; // dest array
9913 Register doff = c_rarg4; // position for writing to dest array
9914 Register isURL = c_rarg5; // Base64 or URL character set
9915
9916 // c_rarg6 and c_rarg7 are free to use as temps
9917 Register codec = c_rarg6;
9918 Register length = c_rarg7;
9919
9920 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
9921
9922 __ add(src, src, soff);
9923 __ add(dst, dst, doff);
9924 __ sub(length, send, soff);
9925
9926 // load the codec base address
9927 __ lea(codec, ExternalAddress((address) toBase64));
9928 __ cbz(isURL, ProcessData);
9929 __ lea(codec, ExternalAddress((address) toBase64URL));
9930
9931 __ BIND(ProcessData);
9932
    // too short to form a SIMD loop; fall back to the scalar 3-byte loop
9934 __ cmp(length, (u1)24);
9935 __ br(Assembler::LT, Process3B);
9936
9937 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
9938
9939 __ BIND(Process48B);
9940 __ cmp(length, (u1)48);
9941 __ br(Assembler::LT, Process24B);
9942 generate_base64_encode_simdround(src, dst, v0, 16);
9943 __ sub(length, length, 48);
9944 __ b(Process48B);
9945
9946 __ BIND(Process24B);
9947 __ cmp(length, (u1)24);
9948 __ br(Assembler::LT, SIMDExit);
9949 generate_base64_encode_simdround(src, dst, v0, 8);
9950 __ sub(length, length, 24);
9951
9952 __ BIND(SIMDExit);
9953 __ cbz(length, Exit);
9954
9955 __ BIND(Process3B);
9956 // 3 src bytes, 24 bits
9957 __ ldrb(r10, __ post(src, 1));
9958 __ ldrb(r11, __ post(src, 1));
9959 __ ldrb(r12, __ post(src, 1));
9960 __ orrw(r11, r11, r10, Assembler::LSL, 8);
9961 __ orrw(r12, r12, r11, Assembler::LSL, 8);
9962 // codec index
9963 __ ubfmw(r15, r12, 18, 23);
9964 __ ubfmw(r14, r12, 12, 17);
9965 __ ubfmw(r13, r12, 6, 11);
9966 __ andw(r12, r12, 63);
9967 // get the code based on the codec
9968 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
9969 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
9970 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
9971 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
9972 __ strb(r15, __ post(dst, 1));
9973 __ strb(r14, __ post(dst, 1));
9974 __ strb(r13, __ post(dst, 1));
9975 __ strb(r12, __ post(dst, 1));
9976 __ sub(length, length, 3);
9977 __ cbnz(length, Process3B);
9978
9979 __ BIND(Exit);
9980 __ ret(lr);
9981
9982 return start;
9983 }
9984
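  // One SIMD round of the Base64 decoder: reads 4 * size characters with ld4,
  // maps each one to its 6-bit value via the two-table tbl/tbx lookup
  // (codecL/codecH, see the tables and the comment in generate_base64_decodeBlock
  // below), recombines the values into 3 * size output bytes
  //   out0 = (decL0 << 2) | (decL1 >> 4)
  //   out1 = (decL1 << 4) | (decL2 >> 2)
  //   out2 = (decL2 << 6) | decL3
  // and stores them with st3. Illegal characters decode to 255, are detected
  // by the unsigned compares against 63, and cause a branch to Exit after any
  // preceding legal bytes have been stored.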
9985 void generate_base64_decode_simdround(Register src, Register dst,
9986 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
9987
9988 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
9989 FloatRegister out0 = v20, out1 = v21, out2 = v22;
9990
9991 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
9992 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
9993
9994 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
9995
9996 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9997
9998 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
9999
    // We need an unsigned saturating subtract so that all input values in the
    // range [0, 63] produce a zero value from the higher-half lookup.
10002 __ uqsubv(decH0, __ T16B, in0, v27);
10003 __ uqsubv(decH1, __ T16B, in1, v27);
10004 __ uqsubv(decH2, __ T16B, in2, v27);
10005 __ uqsubv(decH3, __ T16B, in3, v27);
10006
10007 // lower half lookup
10008 __ tbl(decL0, arrangement, codecL, 4, in0);
10009 __ tbl(decL1, arrangement, codecL, 4, in1);
10010 __ tbl(decL2, arrangement, codecL, 4, in2);
10011 __ tbl(decL3, arrangement, codecL, 4, in3);
10012
10013 // higher half lookup
10014 __ tbx(decH0, arrangement, codecH, 4, decH0);
10015 __ tbx(decH1, arrangement, codecH, 4, decH1);
10016 __ tbx(decH2, arrangement, codecH, 4, decH2);
10017 __ tbx(decH3, arrangement, codecH, 4, decH3);
10018
10019 // combine lower and higher
10020 __ orr(decL0, arrangement, decL0, decH0);
10021 __ orr(decL1, arrangement, decL1, decH1);
10022 __ orr(decL2, arrangement, decL2, decH2);
10023 __ orr(decL3, arrangement, decL3, decH3);
10024
    // check for illegal inputs: values larger than 63 (the maximum of 6 bits)
10026 __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10027 __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10028 __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10029 __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10030 __ orr(in0, arrangement, decH0, decH1);
10031 __ orr(in1, arrangement, decH2, decH3);
10032 __ orr(in2, arrangement, in0, in1);
10033 __ umaxv(in3, arrangement, in2);
10034 __ umov(rscratch2, in3, __ B, 0);
10035
10036 // get the data to output
10037 __ shl(out0, arrangement, decL0, 2);
10038 __ ushr(out1, arrangement, decL1, 4);
10039 __ orr(out0, arrangement, out0, out1);
10040 __ shl(out1, arrangement, decL1, 4);
10041 __ ushr(out2, arrangement, decL2, 2);
10042 __ orr(out1, arrangement, out1, out2);
10043 __ shl(out2, arrangement, decL2, 6);
10044 __ orr(out2, arrangement, out2, decL3);
10045
10046 __ cbz(rscratch2, NoIllegalData);
10047
10048 // handle illegal input
10049 __ umov(r10, in2, __ D, 0);
10050 if (size == 16) {
10051 __ cbnz(r10, ErrorInLowerHalf);
10052
10053 // illegal input is in higher half, store the lower half now.
10054 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10055
10056 __ umov(r10, in2, __ D, 1);
10057 __ umov(r11, out0, __ D, 1);
10058 __ umov(r12, out1, __ D, 1);
10059 __ umov(r13, out2, __ D, 1);
10060 __ b(StoreLegalData);
10061
10062 __ BIND(ErrorInLowerHalf);
10063 }
10064 __ umov(r11, out0, __ D, 0);
10065 __ umov(r12, out1, __ D, 0);
10066 __ umov(r13, out2, __ D, 0);
10067
10068 __ BIND(StoreLegalData);
10069 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10070 __ strb(r11, __ post(dst, 1));
10071 __ strb(r12, __ post(dst, 1));
10072 __ strb(r13, __ post(dst, 1));
10073 __ lsr(r10, r10, 8);
10074 __ lsr(r11, r11, 8);
10075 __ lsr(r12, r12, 8);
10076 __ lsr(r13, r13, 8);
10077 __ b(StoreLegalData);
10078
10079 __ BIND(NoIllegalData);
10080 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10081 }
10082
10083
10084 /**
10085 * Arguments:
10086 *
10087 * Input:
10088 * c_rarg0 - src_start
10089 * c_rarg1 - src_offset
10090 * c_rarg2 - src_length
10091 * c_rarg3 - dest_start
10092 * c_rarg4 - dest_offset
10093 * c_rarg5 - isURL
10094 * c_rarg6 - isMIME
10095 *
10096 */
10097 address generate_base64_decodeBlock() {
10098
10099 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10100 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10101 // titled "Base64 decoding".
10102
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used
    // in java.util.Base64, except that the trailing character '=' is also treated
    // as an illegal value in this intrinsic. That is,
    // java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
10106 static const uint8_t fromBase64ForNoSIMD[256] = {
10107 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10108 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10109 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10110 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10111 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10112 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
10113 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10114 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10115 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10116 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10117 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10118 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10119 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10120 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10121 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10122 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10123 };
10124
10125 static const uint8_t fromBase64URLForNoSIMD[256] = {
10126 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10127 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10128 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10129 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10130 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10131 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
10132 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10133 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10134 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10135 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10136 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10137 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10138 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10139 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10140 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10141 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10142 };
10143
    // A legal Base64 code value is in the range [0, 127]. We need two lookups
    // with tbl/tbx and combine their results to get the decoded data. The first
    // table-vector lookup uses tbl, so out-of-range indices are set to 0 in the
    // destination. The second table-vector lookup uses tbx, so out-of-range
    // indices are left unchanged in the destination. Input [64..126] is mapped
    // to index [65, 127] in the second lookup. The value at index 64 is set to 0,
    // so inputs that the first lookup has already decoded are left undisturbed.
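    // For example (a sketch worked out from the tables below, not part of the
    // original comments): decoding 'a' (0x61 = 97), the first tbl lookup sees
    // index 97, which is out of range for the 64-byte codecL table and yields 0;
    // the saturating subtract gives 97 - 63 = 34, and tbx with codecH returns
    // entry 64 + 34 = 98 of the table, i.e. 26 -- the correct value for 'a'.
    // Decoding '+' (43), codecL[43] = 62, the subtract saturates to 0 and
    // codecH entry 64 gives 0, so the OR of the two halves is 62.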
10151 static const uint8_t fromBase64ForSIMD[128] = {
10152 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10153 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10154 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10155 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10156 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10157 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10158 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10159 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10160 };
10161
10162 static const uint8_t fromBase64URLForSIMD[128] = {
10163 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10164 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10165 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10166 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10167 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10168 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10169 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10170 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10171 };
10172
10173 __ align(CodeEntryAlignment);
10174 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10175 StubCodeMark mark(this, stub_id);
10176 address start = __ pc();
10177
10178 Register src = c_rarg0; // source array
10179 Register soff = c_rarg1; // source start offset
10180 Register send = c_rarg2; // source end offset
10181 Register dst = c_rarg3; // dest array
10182 Register doff = c_rarg4; // position for writing to dest array
10183 Register isURL = c_rarg5; // Base64 or URL character set
10184 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation
10185
10186 Register length = send; // reuse send as length of source data to process
10187
10188 Register simd_codec = c_rarg6;
10189 Register nosimd_codec = c_rarg7;
10190
10191 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10192
10193 __ enter();
10194
10195 __ add(src, src, soff);
10196 __ add(dst, dst, doff);
10197
10198 __ mov(doff, dst);
10199
10200 __ sub(length, send, soff);
10201 __ bfm(length, zr, 0, 1);
10202
10203 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10204 __ cbz(isURL, ProcessData);
10205 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10206
10207 __ BIND(ProcessData);
10208 __ mov(rscratch1, length);
10209 __ cmp(length, (u1)144); // 144 = 80 + 64
10210 __ br(Assembler::LT, Process4B);
10211
10212 // In the MIME case, the line length cannot be more than 76
10213 // bytes (see RFC 2045). This is too short a block for SIMD
10214 // to be worthwhile, so we use non-SIMD here.
10215 __ movw(rscratch1, 79);
10216
10217 __ BIND(Process4B);
10218 __ ldrw(r14, __ post(src, 4));
10219 __ ubfxw(r10, r14, 0, 8);
10220 __ ubfxw(r11, r14, 8, 8);
10221 __ ubfxw(r12, r14, 16, 8);
10222 __ ubfxw(r13, r14, 24, 8);
    // look up the decoded values
10224 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10225 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10226 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10227 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10228 // error detection, 255u indicates an illegal input
10229 __ orrw(r14, r10, r11);
10230 __ orrw(r15, r12, r13);
10231 __ orrw(r14, r14, r15);
10232 __ tbnz(r14, 7, Exit);
10233 // recover the data
10234 __ lslw(r14, r10, 10);
10235 __ bfiw(r14, r11, 4, 6);
10236 __ bfmw(r14, r12, 2, 5);
10237 __ rev16w(r14, r14);
10238 __ bfiw(r13, r12, 6, 2);
10239 __ strh(r14, __ post(dst, 2));
10240 __ strb(r13, __ post(dst, 1));
10241 // non-simd loop
10242 __ subsw(rscratch1, rscratch1, 4);
10243 __ br(Assembler::GT, Process4B);
10244
    // If we exit the loop after the 80-byte pre-processing above (rscratch1
    // started at 79), rscratch1 == -1; otherwise, rscratch1 == 0.
10247 __ cbzw(rscratch1, Exit);
10248 __ sub(length, length, 80);
10249
10250 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10251 __ cbz(isURL, SIMDEnter);
10252 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10253
10254 __ BIND(SIMDEnter);
10255 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10256 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10257 __ mov(rscratch1, 63);
10258 __ dup(v27, __ T16B, rscratch1);
10259
10260 __ BIND(Process64B);
10261 __ cmp(length, (u1)64);
10262 __ br(Assembler::LT, Process32B);
10263 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10264 __ sub(length, length, 64);
10265 __ b(Process64B);
10266
10267 __ BIND(Process32B);
10268 __ cmp(length, (u1)32);
10269 __ br(Assembler::LT, SIMDExit);
10270 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10271 __ sub(length, length, 32);
10272 __ b(Process32B);
10273
10274 __ BIND(SIMDExit);
10275 __ cbz(length, Exit);
10276 __ movw(rscratch1, length);
10277 __ b(Process4B);
10278
10279 __ BIND(Exit);
10280 __ sub(c_rarg0, dst, doff);
10281
10282 __ leave();
10283 __ ret(lr);
10284
10285 return start;
10286 }
10287
10288 // Support for spin waits.
10289 address generate_spin_wait() {
10290 __ align(CodeEntryAlignment);
10291 StubId stub_id = StubId::stubgen_spin_wait_id;
10292 StubCodeMark mark(this, stub_id);
10293 address start = __ pc();
10294
10295 __ spin_wait();
10296 __ ret(lr);
10297
10298 return start;
10299 }
10300
10301 void generate_lookup_secondary_supers_table_stub() {
10302 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10303 StubCodeMark mark(this, stub_id);
10304
10305 const Register
10306 r_super_klass = r0,
10307 r_array_base = r1,
10308 r_array_length = r2,
10309 r_array_index = r3,
10310 r_sub_klass = r4,
10311 r_bitmap = rscratch2,
10312 result = r5;
10313 const FloatRegister
10314 vtemp = v0;
10315
10316 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10317 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10318 Label L_success;
10319 __ enter();
10320 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10321 r_array_base, r_array_length, r_array_index,
10322 vtemp, result, slot,
10323 /*stub_is_near*/true);
10324 __ leave();
10325 __ ret(lr);
10326 }
10327 }
10328
10329 // Slow path implementation for UseSecondarySupersTable.
10330 address generate_lookup_secondary_supers_table_slow_path_stub() {
10331 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10332 StubCodeMark mark(this, stub_id);
10333
10334 address start = __ pc();
10335 const Register
10336 r_super_klass = r0, // argument
10337 r_array_base = r1, // argument
10338 temp1 = r2, // temp
10339 r_array_index = r3, // argument
10340 r_bitmap = rscratch2, // argument
10341 result = r5; // argument
10342
10343 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10344 __ ret(lr);
10345
10346 return start;
10347 }
10348
10349 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10350
10351 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10352 //
10353 // If LSE is in use, generate LSE versions of all the stubs. The
10354 // non-LSE versions are in atomic_aarch64.S.
10355
10356 // class AtomicStubMark records the entry point of a stub and the
10357 // stub pointer which will point to it. The stub pointer is set to
10358 // the entry point when ~AtomicStubMark() is called, which must be
10359 // after ICache::invalidate_range. This ensures safe publication of
10360 // the generated code.
10361 class AtomicStubMark {
10362 address _entry_point;
10363 aarch64_atomic_stub_t *_stub;
10364 MacroAssembler *_masm;
10365 public:
10366 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10367 _masm = masm;
10368 __ align(32);
10369 _entry_point = __ pc();
10370 _stub = stub;
10371 }
10372 ~AtomicStubMark() {
10373 *_stub = (aarch64_atomic_stub_t)_entry_point;
10374 }
10375 };
10376
10377 // NB: For memory_order_conservative we need a trailing membar after
10378 // LSE atomic operations but not a leading membar.
10379 //
10380 // We don't need a leading membar because a clause in the Arm ARM
10381 // says:
10382 //
10383 // Barrier-ordered-before
10384 //
10385 // Barrier instructions order prior Memory effects before subsequent
10386 // Memory effects generated by the same Observer. A read or a write
  // RW1 is Barrier-ordered-before a read or a write RW2 from the same
  // Observer if and only if RW1 appears in program order before RW2
  // and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10390 // instruction with both Acquire and Release semantics.
10391 //
10392 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10393 // and Release semantics, therefore we don't need a leading
10394 // barrier. However, there is no corresponding Barrier-ordered-after
10395 // relationship, therefore we need a trailing membar to prevent a
10396 // later store or load from being reordered with the store in an
10397 // atomic instruction.
10398 //
10399 // This was checked by using the herd7 consistency model simulator
10400 // (http://diy.inria.fr/) with this test case:
10401 //
10402 // AArch64 LseCas
10403 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10404 // P0 | P1;
10405 // LDR W4, [X2] | MOV W3, #0;
10406 // DMB LD | MOV W4, #1;
10407 // LDR W3, [X1] | CASAL W3, W4, [X1];
10408 // | DMB ISH;
10409 // | STR W4, [X2];
10410 // exists
10411 // (0:X3=0 /\ 0:X4=1)
10412 //
10413 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10414 // with the store to x in P1. Without the DMB in P1 this may happen.
10415 //
10416 // At the time of writing we don't know of any AArch64 hardware that
10417 // reorders stores in this way, but the Reference Manual permits it.
10418
10419 void gen_cas_entry(Assembler::operand_size size,
10420 atomic_memory_order order) {
10421 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10422 exchange_val = c_rarg2;
10423 bool acquire, release;
10424 switch (order) {
10425 case memory_order_relaxed:
10426 acquire = false;
10427 release = false;
10428 break;
10429 case memory_order_release:
10430 acquire = false;
10431 release = true;
10432 break;
10433 default:
10434 acquire = true;
10435 release = true;
10436 break;
10437 }
10438 __ mov(prev, compare_val);
10439 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10440 if (order == memory_order_conservative) {
10441 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10442 }
10443 if (size == Assembler::xword) {
10444 __ mov(r0, prev);
10445 } else {
10446 __ movw(r0, prev);
10447 }
10448 __ ret(lr);
10449 }
10450
10451 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10452 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10453 // If not relaxed, then default to conservative. Relaxed is the only
10454 // case we use enough to be worth specializing.
10455 if (order == memory_order_relaxed) {
10456 __ ldadd(size, incr, prev, addr);
10457 } else {
10458 __ ldaddal(size, incr, prev, addr);
10459 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10460 }
10461 if (size == Assembler::xword) {
10462 __ mov(r0, prev);
10463 } else {
10464 __ movw(r0, prev);
10465 }
10466 __ ret(lr);
10467 }
10468
10469 void gen_swpal_entry(Assembler::operand_size size) {
10470 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10471 __ swpal(size, incr, prev, addr);
10472 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10473 if (size == Assembler::xword) {
10474 __ mov(r0, prev);
10475 } else {
10476 __ movw(r0, prev);
10477 }
10478 __ ret(lr);
10479 }
10480
10481 void generate_atomic_entry_points() {
10482 if (! UseLSE) {
10483 return;
10484 }
10485 __ align(CodeEntryAlignment);
10486 StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10487 StubCodeMark mark(this, stub_id);
10488 address first_entry = __ pc();
10489
10490 // ADD, memory_order_conservative
10491 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10492 gen_ldadd_entry(Assembler::word, memory_order_conservative);
10493 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10494 gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10495
10496 // ADD, memory_order_relaxed
10497 AtomicStubMark mark_fetch_add_4_relaxed
10498 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10499 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10500 AtomicStubMark mark_fetch_add_8_relaxed
10501 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10502 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10503
10504 // XCHG, memory_order_conservative
10505 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10506 gen_swpal_entry(Assembler::word);
10507 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10508 gen_swpal_entry(Assembler::xword);
10509
10510 // CAS, memory_order_conservative
10511 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10512 gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10513 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10514 gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10515 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10516 gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10517
10518 // CAS, memory_order_relaxed
10519 AtomicStubMark mark_cmpxchg_1_relaxed
10520 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10521 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10522 AtomicStubMark mark_cmpxchg_4_relaxed
10523 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10524 gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10525 AtomicStubMark mark_cmpxchg_8_relaxed
10526 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10527 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10528
10529 AtomicStubMark mark_cmpxchg_4_release
10530 (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10531 gen_cas_entry(MacroAssembler::word, memory_order_release);
10532 AtomicStubMark mark_cmpxchg_8_release
10533 (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10534 gen_cas_entry(MacroAssembler::xword, memory_order_release);
10535
10536 AtomicStubMark mark_cmpxchg_4_seq_cst
10537 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10538 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10539 AtomicStubMark mark_cmpxchg_8_seq_cst
10540 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10541 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10542
10543 ICache::invalidate_range(first_entry, __ pc() - first_entry);
10544 }
10545 #endif // LINUX
10546
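  // Save/restore the Java return value around the runtime calls in the thaw
  // stubs below. With InlineTypeReturnedAsFields an inline type may be returned
  // in r0..r7 and v0..v7, so all of those are preserved; otherwise only r0 and
  // v0 can carry a return value.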
10547 static void save_return_registers(MacroAssembler* masm) {
10548 if (InlineTypeReturnedAsFields) {
10549 masm->push(RegSet::range(r0, r7), sp);
10550 masm->sub(sp, sp, 4 * wordSize);
10551 masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
10552 masm->sub(sp, sp, 4 * wordSize);
10553 masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
10554 } else {
10555 masm->fmovd(rscratch1, v0);
10556 masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
10557 }
10558 }
10559
10560 static void restore_return_registers(MacroAssembler* masm) {
10561 if (InlineTypeReturnedAsFields) {
10562 masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
10563 masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
10564 masm->pop(RegSet::range(r0, r7), sp);
10565 } else {
10566 masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
10567 masm->fmovd(v0, rscratch1);
10568 }
10569 }
10570
10571 address generate_cont_thaw(Continuation::thaw_kind kind) {
10572 bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10573 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10574
10575 address start = __ pc();
10576
10577 if (return_barrier) {
10578 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10579 __ mov(sp, rscratch1);
10580 }
10581 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10582
10583 if (return_barrier) {
10584 // preserve possible return value from a method returning to the return barrier
10585 save_return_registers(_masm);
10586 }
10587
10588 __ movw(c_rarg1, (return_barrier ? 1 : 0));
10589 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10590 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10591
10592 if (return_barrier) {
10593 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10594 restore_return_registers(_masm);
10595 }
10596 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10597
10598
10599 Label thaw_success;
10600 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10601 __ cbnz(rscratch2, thaw_success);
10602 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10603 __ br(rscratch1);
10604 __ bind(thaw_success);
10605
10606 // make room for the thawed frames
10607 __ sub(rscratch1, sp, rscratch2);
10608 __ andr(rscratch1, rscratch1, -16); // align
10609 __ mov(sp, rscratch1);
10610
10611 if (return_barrier) {
10612 // save original return value -- again
10613 save_return_registers(_masm);
10614 }
10615
10616 // If we want, we can templatize thaw by kind, and have three different entries
10617 __ movw(c_rarg1, (uint32_t)kind);
10618
10619 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10620 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10621
10622 if (return_barrier) {
10623 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10624 restore_return_registers(_masm);
10625 } else {
10626 __ mov(r0, zr); // return 0 (success) from doYield
10627 }
10628
    // we're now on the yield frame (which is at an address above us because sp has been pushed down)
10630 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10631 __ mov(rfp, sp);
10632
10633 if (return_barrier_exception) {
10634 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10635 __ authenticate_return_address(c_rarg1);
10636 __ verify_oop(r0);
10637 // save return value containing the exception oop in callee-saved R19
10638 __ mov(r19, r0);
10639
10640 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10641
10642 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10643 // __ reinitialize_ptrue();
10644
10645 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10646
10647 __ mov(r1, r0); // the exception handler
10648 __ mov(r0, r19); // restore return value containing the exception oop
10649 __ verify_oop(r0);
10650
10651 __ leave();
10652 __ mov(r3, lr);
10653 __ br(r1); // the exception handler
10654 } else {
10655 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10656 __ leave();
10657 __ ret(lr);
10658 }
10659
10660 return start;
10661 }
10662
10663 address generate_cont_thaw() {
10664 if (!Continuations::enabled()) return nullptr;
10665
10666 StubId stub_id = StubId::stubgen_cont_thaw_id;
10667 StubCodeMark mark(this, stub_id);
10668 address start = __ pc();
10669 generate_cont_thaw(Continuation::thaw_top);
10670 return start;
10671 }
10672
10673 address generate_cont_returnBarrier() {
10674 if (!Continuations::enabled()) return nullptr;
10675
10676 // TODO: will probably need multiple return barriers depending on return type
10677 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10678 StubCodeMark mark(this, stub_id);
10679 address start = __ pc();
10680
10681 generate_cont_thaw(Continuation::thaw_return_barrier);
10682
10683 return start;
10684 }
10685
10686 address generate_cont_returnBarrier_exception() {
10687 if (!Continuations::enabled()) return nullptr;
10688
10689 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10690 StubCodeMark mark(this, stub_id);
10691 address start = __ pc();
10692
10693 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10694
10695 return start;
10696 }
10697
10698 address generate_cont_preempt_stub() {
10699 if (!Continuations::enabled()) return nullptr;
10700 StubId stub_id = StubId::stubgen_cont_preempt_id;
10701 StubCodeMark mark(this, stub_id);
10702 address start = __ pc();
10703
10704 __ reset_last_Java_frame(true);
10705
10706 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10707 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10708 __ mov(sp, rscratch2);
10709
10710 Label preemption_cancelled;
10711 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10712 __ cbnz(rscratch1, preemption_cancelled);
10713
10714 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10715 SharedRuntime::continuation_enter_cleanup(_masm);
10716 __ leave();
10717 __ ret(lr);
10718
10719 // We acquired the monitor after freezing the frames so call thaw to continue execution.
10720 __ bind(preemption_cancelled);
10721 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10722 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10723 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10724 __ ldr(rscratch1, Address(rscratch1));
10725 __ br(rscratch1);
10726
10727 return start;
10728 }
10729
10730 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10731 // are represented as long[5], with BITS_PER_LIMB = 26.
10732 // Pack five 26-bit limbs into three 64-bit registers.
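  // Illustrative layout (derived from the code below): if the limbs are
  // L0..L4, the packed value L0 + L1*2^26 + L2*2^52 + L3*2^78 + L4*2^104 ends
  // up with bits 0..63 in dest0, bits 64..127 in dest1, and the remaining two
  // bits (128..129) in dest2.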
10733 void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10734 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
10735 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
10736 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10737 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
10738
10739 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
10740 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
10741 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10742 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
10743
10744 if (dest2->is_valid()) {
10745 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
10746 } else {
10747 #ifdef ASSERT
10748 Label OK;
10749 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
10750 __ br(__ EQ, OK);
10751 __ stop("high bits of Poly1305 integer should be zero");
10752 __ should_not_reach_here();
10753 __ bind(OK);
10754 #endif
10755 }
10756 }
10757
10758 // As above, but return only a 128-bit integer, packed into two
10759 // 64-bit registers.
10760 void pack_26(Register dest0, Register dest1, Register src) {
10761 pack_26(dest0, dest1, noreg, src);
10762 }
10763
10764 // Multiply and multiply-accumulate unsigned 64-bit registers.
10765 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10766 __ mul(prod_lo, n, m);
10767 __ umulh(prod_hi, n, m);
10768 }
10769 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10770 wide_mul(rscratch1, rscratch2, n, m);
10771 __ adds(sum_lo, sum_lo, rscratch1);
10772 __ adc(sum_hi, sum_hi, rscratch2);
10773 }
10774
10775 // Poly1305, RFC 7539
10776
10777 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10778 // description of the tricks used to simplify and accelerate this
10779 // computation.
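  // As a rough sketch (my reading of the code below, not an authoritative
  // specification): each 16-byte block is processed as
  //     acc = ((acc + block + 2^128) * r) mod (2^130 - 5)
  // The "S_2 += 1" below adds the 2^128 padding bit, and the precomputed
  // RR_n = (R_n >> 2) * 5 values exploit 2^130 == 5 (mod 2^130 - 5), together
  // with the key clamping that zeroes the low two bits of R_1 and the top bits
  // of both key halves, to fold the high partial products straight back into
  // the low limbs without extra carry propagation.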
10780
10781 address generate_poly1305_processBlocks() {
10782 __ align(CodeEntryAlignment);
10783 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10784 StubCodeMark mark(this, stub_id);
10785 address start = __ pc();
10786 Label here;
10787 __ enter();
10788 RegSet callee_saved = RegSet::range(r19, r28);
10789 __ push(callee_saved, sp);
10790
10791 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10792
10793 // Arguments
10794 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10795
10796 // R_n is the 128-bit randomly-generated key, packed into two
10797 // registers. The caller passes this key to us as long[5], with
10798 // BITS_PER_LIMB = 26.
10799 const Register R_0 = *++regs, R_1 = *++regs;
10800 pack_26(R_0, R_1, r_start);
10801
10802 // RR_n is (R_n >> 2) * 5
10803 const Register RR_0 = *++regs, RR_1 = *++regs;
10804 __ lsr(RR_0, R_0, 2);
10805 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10806 __ lsr(RR_1, R_1, 2);
10807 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10808
10809 // U_n is the current checksum
10810 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10811 pack_26(U_0, U_1, U_2, acc_start);
10812
10813 static constexpr int BLOCK_LENGTH = 16;
10814 Label DONE, LOOP;
10815
10816 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10817 __ br(Assembler::LT, DONE); {
10818 __ bind(LOOP);
10819
10820 // S_n is to be the sum of U_n and the next block of data
10821 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10822 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10823 __ adds(S_0, U_0, S_0);
10824 __ adcs(S_1, U_1, S_1);
10825 __ adc(S_2, U_2, zr);
10826 __ add(S_2, S_2, 1);
10827
10828 const Register U_0HI = *++regs, U_1HI = *++regs;
10829
10830 // NB: this logic depends on some of the special properties of
10831 // Poly1305 keys. In particular, because we know that the top
10832 // four bits of R_0 and R_1 are zero, we can add together
10833 // partial products without any risk of needing to propagate a
10834 // carry out.
10835 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10836 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
10837 __ andr(U_2, R_0, 3);
10838 __ mul(U_2, S_2, U_2);
10839
10840 // Recycle registers S_0, S_1, S_2
10841 regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10842
10843 // Partial reduction mod 2**130 - 5
10844 __ adds(U_1, U_0HI, U_1);
10845 __ adc(U_2, U_1HI, U_2);
10846 // Sum now in U_2:U_1:U_0.
10847 // Dead: U_0HI, U_1HI.
10848 regs = (regs.remaining() + U_0HI + U_1HI).begin();
10849
10850 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10851
10852 // First, U_2:U_1:U_0 += (U_2 >> 2)
10853 __ lsr(rscratch1, U_2, 2);
10854 __ andr(U_2, U_2, (u8)3);
10855 __ adds(U_0, U_0, rscratch1);
10856 __ adcs(U_1, U_1, zr);
10857 __ adc(U_2, U_2, zr);
10858 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10859 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10860 __ adcs(U_1, U_1, zr);
10861 __ adc(U_2, U_2, zr);
10862
10863 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10864 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10865 __ br(~ Assembler::LT, LOOP);
10866 }
10867
10868 // Further reduce modulo 2^130 - 5
10869 __ lsr(rscratch1, U_2, 2);
10870 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10871 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10872 __ adcs(U_1, U_1, zr);
10873 __ andr(U_2, U_2, (u1)3);
10874 __ adc(U_2, U_2, zr);
10875
10876 // Unpack the sum into five 26-bit limbs and write to memory.
10877 __ ubfiz(rscratch1, U_0, 0, 26);
10878 __ ubfx(rscratch2, U_0, 26, 26);
10879 __ stp(rscratch1, rscratch2, Address(acc_start));
10880 __ ubfx(rscratch1, U_0, 52, 12);
10881 __ bfi(rscratch1, U_1, 12, 14);
10882 __ ubfx(rscratch2, U_1, 14, 26);
10883 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10884 __ ubfx(rscratch1, U_1, 40, 24);
10885 __ bfi(rscratch1, U_2, 24, 3);
10886 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10887
10888 __ bind(DONE);
10889 __ pop(callee_saved, sp);
10890 __ leave();
10891 __ ret(lr);
10892
10893 return start;
10894 }
10895
10896 // exception handler for upcall stubs
10897 address generate_upcall_stub_exception_handler() {
10898 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10899 StubCodeMark mark(this, stub_id);
10900 address start = __ pc();
10901
    // The native caller has no idea how to handle exceptions, so we
    // just crash here. It is up to the Java callee to catch its own exceptions.
10904 __ verify_oop(r0);
10905 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10906 __ blr(rscratch1);
10907 __ should_not_reach_here();
10908
10909 return start;
10910 }
10911
10912 // load Method* target of MethodHandle
10913 // j_rarg0 = jobject receiver
10914 // rmethod = result
10915 address generate_upcall_stub_load_target() {
10916 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10917 StubCodeMark mark(this, stub_id);
10918 address start = __ pc();
10919
10920 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
    // Load the target Method*: receiver.form.vmentry.method.vmtarget
10922 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10923 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10924 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10925 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10926 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10927 noreg, noreg);
10928 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10929
10930 __ ret(lr);
10931
10932 return start;
10933 }
10934
10935 #undef __
10936 #define __ masm->
10937
10938 class MontgomeryMultiplyGenerator : public MacroAssembler {
10939
10940 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10941 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10942
10943 RegSet _toSave;
10944 bool _squaring;
10945
10946 public:
10947 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10948 : MacroAssembler(as->code()), _squaring(squaring) {
10949
10950 // Register allocation
10951
10952 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10953 Pa_base = *regs; // Argument registers
10954 if (squaring)
10955 Pb_base = Pa_base;
10956 else
10957 Pb_base = *++regs;
10958 Pn_base = *++regs;
10959 Rlen= *++regs;
10960 inv = *++regs;
10961 Pm_base = *++regs;
10962
10963 // Working registers:
10964 Ra = *++regs; // The current digit of a, b, n, and m.
10965 Rb = *++regs;
10966 Rm = *++regs;
10967 Rn = *++regs;
10968
10969 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
10970 Pb = *++regs;
10971 Pm = *++regs;
10972 Pn = *++regs;
10973
10974 t0 = *++regs; // Three registers which form a
    t1 = *++regs; // triple-precision accumulator.
10976 t2 = *++regs;
10977
10978 Ri = *++regs; // Inner and outer loop indexes.
10979 Rj = *++regs;
10980
10981 Rhi_ab = *++regs; // Product registers: low and high parts
10982 Rlo_ab = *++regs; // of a*b and m*n.
10983 Rhi_mn = *++regs;
10984 Rlo_mn = *++regs;
10985
10986 // r19 and up are callee-saved.
10987 _toSave = RegSet::range(r19, *regs) + Pm_base;
10988 }
10989
10990 private:
10991 void save_regs() {
10992 push(_toSave, sp);
10993 }
10994
10995 void restore_regs() {
10996 pop(_toSave, sp);
10997 }
10998
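  // Generate a loop that runs the code emitted by |block| count times,
  // unrolled by a factor of two; odd counts are handled by branching
  // into the middle of the unrolled body.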
10999 template <typename T>
11000 void unroll_2(Register count, T block) {
11001 Label loop, end, odd;
11002 tbnz(count, 0, odd);
11003 cbz(count, end);
11004 align(16);
11005 bind(loop);
11006 (this->*block)();
11007 bind(odd);
11008 (this->*block)();
11009 subs(count, count, 2);
11010 br(Assembler::GT, loop);
11011 bind(end);
11012 }
11013
11014 template <typename T>
11015 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
11016 Label loop, end, odd;
11017 tbnz(count, 0, odd);
11018 cbz(count, end);
11019 align(16);
11020 bind(loop);
11021 (this->*block)(d, s, tmp);
11022 bind(odd);
11023 (this->*block)(d, s, tmp);
11024 subs(count, count, 2);
11025 br(Assembler::GT, loop);
11026 bind(end);
11027 }
11028
11029 void pre1(RegisterOrConstant i) {
11030 block_comment("pre1");
11031 // Pa = Pa_base;
11032 // Pb = Pb_base + i;
11033 // Pm = Pm_base;
11034 // Pn = Pn_base + i;
11035 // Ra = *Pa;
11036 // Rb = *Pb;
11037 // Rm = *Pm;
11038 // Rn = *Pn;
11039 ldr(Ra, Address(Pa_base));
11040 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11041 ldr(Rm, Address(Pm_base));
11042 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11043 lea(Pa, Address(Pa_base));
11044 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11045 lea(Pm, Address(Pm_base));
11046 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11047
11048 // Zero the m*n result.
11049 mov(Rhi_mn, zr);
11050 mov(Rlo_mn, zr);
11051 }
11052
11053 // The core multiply-accumulate step of a Montgomery
11054 // multiplication. The idea is to schedule operations as a
11055 // pipeline so that instructions with long latencies (loads and
11056 // multiplies) have time to complete before their results are
  // used. This helps in-order implementations of the architecture
  // the most, but out-of-order ones benefit too.
11059 void step() {
11060 block_comment("step");
11061 // MACC(Ra, Rb, t0, t1, t2);
11062 // Ra = *++Pa;
11063 // Rb = *--Pb;
11064 umulh(Rhi_ab, Ra, Rb);
11065 mul(Rlo_ab, Ra, Rb);
11066 ldr(Ra, pre(Pa, wordSize));
11067 ldr(Rb, pre(Pb, -wordSize));
11068 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11069 // previous iteration.
11070 // MACC(Rm, Rn, t0, t1, t2);
11071 // Rm = *++Pm;
11072 // Rn = *--Pn;
11073 umulh(Rhi_mn, Rm, Rn);
11074 mul(Rlo_mn, Rm, Rn);
11075 ldr(Rm, pre(Pm, wordSize));
11076 ldr(Rn, pre(Pn, -wordSize));
11077 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11078 }
11079
11080 void post1() {
11081 block_comment("post1");
11082
11083 // MACC(Ra, Rb, t0, t1, t2);
11084 // Ra = *++Pa;
11085 // Rb = *--Pb;
11086 umulh(Rhi_ab, Ra, Rb);
11087 mul(Rlo_ab, Ra, Rb);
11088 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11089 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11090
11091 // *Pm = Rm = t0 * inv;
11092 mul(Rm, t0, inv);
11093 str(Rm, Address(Pm));
11094
11095 // MACC(Rm, Rn, t0, t1, t2);
11096 // t0 = t1; t1 = t2; t2 = 0;
11097 umulh(Rhi_mn, Rm, Rn);
11098
11099 #ifndef PRODUCT
11100 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11101 {
11102 mul(Rlo_mn, Rm, Rn);
11103 add(Rlo_mn, t0, Rlo_mn);
11104 Label ok;
11105 cbz(Rlo_mn, ok); {
11106 stop("broken Montgomery multiply");
11107 } bind(ok);
11108 }
11109 #endif
11110 // We have very carefully set things up so that
11111 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11112 // the lower half of Rm * Rn because we know the result already:
11113 // it must be -t0. t0 + (-t0) must generate a carry iff
11114 // t0 != 0. So, rather than do a mul and an adds we just set
11115 // the carry flag iff t0 is nonzero.
11116 //
11117 // mul(Rlo_mn, Rm, Rn);
11118 // adds(zr, t0, Rlo_mn);
11119 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11120 adcs(t0, t1, Rhi_mn);
11121 adc(t1, t2, zr);
11122 mov(t2, zr);
11123 }
11124
11125 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11126 block_comment("pre2");
11127 // Pa = Pa_base + i-len;
11128 // Pb = Pb_base + len;
11129 // Pm = Pm_base + i-len;
11130 // Pn = Pn_base + len;
11131
11132 if (i.is_register()) {
11133 sub(Rj, i.as_register(), len);
11134 } else {
11135 mov(Rj, i.as_constant());
11136 sub(Rj, Rj, len);
11137 }
11138 // Rj == i-len
11139
11140 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11141 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11142 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11143 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11144
11145 // Ra = *++Pa;
11146 // Rb = *--Pb;
11147 // Rm = *++Pm;
11148 // Rn = *--Pn;
11149 ldr(Ra, pre(Pa, wordSize));
11150 ldr(Rb, pre(Pb, -wordSize));
11151 ldr(Rm, pre(Pm, wordSize));
11152 ldr(Rn, pre(Pn, -wordSize));
11153
11154 mov(Rhi_mn, zr);
11155 mov(Rlo_mn, zr);
11156 }
11157
11158 void post2(RegisterOrConstant i, RegisterOrConstant len) {
11159 block_comment("post2");
11160 if (i.is_constant()) {
11161 mov(Rj, i.as_constant()-len.as_constant());
11162 } else {
11163 sub(Rj, i.as_register(), len);
11164 }
11165
11166 adds(t0, t0, Rlo_mn); // The pending m*n, low part
11167
11168 // As soon as we know the least significant digit of our result,
11169 // store it.
11170 // Pm_base[i-len] = t0;
11171 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11172
11173 // t0 = t1; t1 = t2; t2 = 0;
11174 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11175 adc(t1, t2, zr);
11176 mov(t2, zr);
11177 }
11178
11179 // A carry in t0 after Montgomery multiplication means that we
11180 // should subtract multiples of n from our result in m. We'll
11181 // keep doing that until there is no carry.
11182 void normalize(RegisterOrConstant len) {
11183 block_comment("normalize");
11184 // while (t0)
11185 // t0 = sub(Pm_base, Pn_base, t0, len);
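    // where sub() subtracts the multi-precision integer at Pn_base from
    // the one at Pm_base (propagating borrows) and returns t0 minus the
    // final borrow, which is what the loop below implements.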
11186 Label loop, post, again;
11187 Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11188 cbz(t0, post); {
11189 bind(again); {
11190 mov(i, zr);
11191 mov(cnt, len);
11192 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11193 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11194 subs(zr, zr, zr); // set carry flag, i.e. no borrow
11195 align(16);
11196 bind(loop); {
11197 sbcs(Rm, Rm, Rn);
11198 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11199 add(i, i, 1);
11200 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11201 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11202 sub(cnt, cnt, 1);
11203 } cbnz(cnt, loop);
11204 sbc(t0, t0, zr);
11205 } cbnz(t0, again);
11206 } bind(post);
11207 }
11208
11209 // Move memory at s to d, reversing words.
11210 // Increments d to end of copied memory
11211 // Destroys tmp1, tmp2
11212 // Preserves len
11213 // Leaves s pointing to the address which was in d at start
11214 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11215 assert(tmp1->encoding() < r19->encoding(), "register corruption");
11216 assert(tmp2->encoding() < r19->encoding(), "register corruption");
11217
11218 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11219 mov(tmp1, len);
11220 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11221 sub(s, d, len, ext::uxtw, LogBytesPerWord);
11222 }
  // where reverse1 moves one 64-bit word, swapping its 32-bit halves:
11224 void reverse1(Register d, Register s, Register tmp) {
11225 ldr(tmp, pre(s, -wordSize));
11226 ror(tmp, tmp, 32);
11227 str(tmp, post(d, wordSize));
11228 }
11229
11230 void step_squaring() {
11231 // An extra ACC
11232 step();
11233 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11234 }
11235
11236 void last_squaring(RegisterOrConstant i) {
11237 Label dont;
11238 // if ((i & 1) == 0) {
11239 tbnz(i.as_register(), 0, dont); {
11240 // MACC(Ra, Rb, t0, t1, t2);
11241 // Ra = *++Pa;
11242 // Rb = *--Pb;
11243 umulh(Rhi_ab, Ra, Rb);
11244 mul(Rlo_ab, Ra, Rb);
11245 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11246 } bind(dont);
11247 }
11248
11249 void extra_step_squaring() {
11250 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11251
11252 // MACC(Rm, Rn, t0, t1, t2);
11253 // Rm = *++Pm;
11254 // Rn = *--Pn;
11255 umulh(Rhi_mn, Rm, Rn);
11256 mul(Rlo_mn, Rm, Rn);
11257 ldr(Rm, pre(Pm, wordSize));
11258 ldr(Rn, pre(Pn, -wordSize));
11259 }
11260
11261 void post1_squaring() {
11262 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11263
11264 // *Pm = Rm = t0 * inv;
11265 mul(Rm, t0, inv);
11266 str(Rm, Address(Pm));
11267
11268 // MACC(Rm, Rn, t0, t1, t2);
11269 // t0 = t1; t1 = t2; t2 = 0;
11270 umulh(Rhi_mn, Rm, Rn);
11271
11272 #ifndef PRODUCT
11273 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11274 {
11275 mul(Rlo_mn, Rm, Rn);
11276 add(Rlo_mn, t0, Rlo_mn);
11277 Label ok;
11278 cbz(Rlo_mn, ok); {
11279 stop("broken Montgomery multiply");
11280 } bind(ok);
11281 }
11282 #endif
11283 // We have very carefully set things up so that
11284 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11285 // the lower half of Rm * Rn because we know the result already:
11286 // it must be -t0. t0 + (-t0) must generate a carry iff
11287 // t0 != 0. So, rather than do a mul and an adds we just set
11288 // the carry flag iff t0 is nonzero.
11289 //
11290 // mul(Rlo_mn, Rm, Rn);
11291 // adds(zr, t0, Rlo_mn);
11292 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11293 adcs(t0, t1, Rhi_mn);
11294 adc(t1, t2, zr);
11295 mov(t2, zr);
11296 }
11297
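  // t2:t1:t0 += Rhi:Rlo, i.e. add a 128-bit product into the
  // triple-precision accumulator (the accumulate half of MACC).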
11298 void acc(Register Rhi, Register Rlo,
11299 Register t0, Register t1, Register t2) {
11300 adds(t0, t0, Rlo);
11301 adcs(t1, t1, Rhi);
11302 adc(t2, t2, zr);
11303 }
11304
11305 public:
11306 /**
11307 * Fast Montgomery multiplication. The derivation of the
11308 * algorithm is in A Cryptographic Library for the Motorola
11309 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11310 *
11311 * Arguments:
11312 *
11313 * Inputs for multiplication:
11314 * c_rarg0 - int array elements a
11315 * c_rarg1 - int array elements b
11316 * c_rarg2 - int array elements n (the modulus)
11317 * c_rarg3 - int length
11318 * c_rarg4 - int inv
11319 * c_rarg5 - int array elements m (the result)
11320 *
11321 * Inputs for squaring:
11322 * c_rarg0 - int array elements a
11323 * c_rarg1 - int array elements n (the modulus)
11324 * c_rarg2 - int length
11325 * c_rarg3 - int inv
11326 * c_rarg4 - int array elements m (the result)
11327 *
11328 */
11329 address generate_multiply() {
11330 Label argh, nothing;
11331 bind(argh);
11332 stop("MontgomeryMultiply total_allocation must be <= 8192");
11333
11334 align(CodeEntryAlignment);
11335 address entry = pc();
11336
11337 cbzw(Rlen, nothing);
11338
11339 enter();
11340
11341 // Make room.
11342 cmpw(Rlen, 512);
11343 br(Assembler::HI, argh);
11344 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11345 andr(sp, Ra, -2 * wordSize);
11346
11347 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11348
11349 {
11350 // Copy input args, reversing as we go. We use Ra as a
11351 // temporary variable.
11352 reverse(Ra, Pa_base, Rlen, t0, t1);
11353 if (!_squaring)
11354 reverse(Ra, Pb_base, Rlen, t0, t1);
11355 reverse(Ra, Pn_base, Rlen, t0, t1);
11356 }
11357
    // Push all callee-saved registers and also Pm_base which we'll need
    // at the end.
11360 save_regs();
11361
11362 #ifndef PRODUCT
11363 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11364 {
11365 ldr(Rn, Address(Pn_base, 0));
11366 mul(Rlo_mn, Rn, inv);
11367 subs(zr, Rlo_mn, -1);
11368 Label ok;
11369 br(EQ, ok); {
11370 stop("broken inverse in Montgomery multiply");
11371 } bind(ok);
11372 }
11373 #endif
11374
11375 mov(Pm_base, Ra);
11376
11377 mov(t0, zr);
11378 mov(t1, zr);
11379 mov(t2, zr);
11380
11381 block_comment("for (int i = 0; i < len; i++) {");
11382 mov(Ri, zr); {
11383 Label loop, end;
11384 cmpw(Ri, Rlen);
11385 br(Assembler::GE, end);
11386
11387 bind(loop);
11388 pre1(Ri);
11389
11390 block_comment(" for (j = i; j; j--) {"); {
11391 movw(Rj, Ri);
11392 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11393 } block_comment(" } // j");
11394
11395 post1();
11396 addw(Ri, Ri, 1);
11397 cmpw(Ri, Rlen);
11398 br(Assembler::LT, loop);
11399 bind(end);
11400 block_comment("} // i");
11401 }
11402
11403 block_comment("for (int i = len; i < 2*len; i++) {");
11404 mov(Ri, Rlen); {
11405 Label loop, end;
11406 cmpw(Ri, Rlen, Assembler::LSL, 1);
11407 br(Assembler::GE, end);
11408
11409 bind(loop);
11410 pre2(Ri, Rlen);
11411
11412 block_comment(" for (j = len*2-i-1; j; j--) {"); {
11413 lslw(Rj, Rlen, 1);
11414 subw(Rj, Rj, Ri);
11415 subw(Rj, Rj, 1);
11416 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11417 } block_comment(" } // j");
11418
11419 post2(Ri, Rlen);
11420 addw(Ri, Ri, 1);
11421 cmpw(Ri, Rlen, Assembler::LSL, 1);
11422 br(Assembler::LT, loop);
11423 bind(end);
11424 }
11425 block_comment("} // i");
11426
11427 normalize(Rlen);
11428
11429 mov(Ra, Pm_base); // Save Pm_base in Ra
11430 restore_regs(); // Restore caller's Pm_base
11431
11432 // Copy our result into caller's Pm_base
11433 reverse(Pm_base, Ra, Rlen, t0, t1);
11434
11435 leave();
11436 bind(nothing);
11437 ret(lr);
11438
11439 return entry;
11440 }
11441 // In C, approximately:
11442
11443 // void
11444 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11445 // julong Pn_base[], julong Pm_base[],
11446 // julong inv, int len) {
11447 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11448 // julong *Pa, *Pb, *Pn, *Pm;
11449 // julong Ra, Rb, Rn, Rm;
11450
11451 // int i;
11452
11453 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11454
11455 // for (i = 0; i < len; i++) {
11456 // int j;
11457
11458 // Pa = Pa_base;
11459 // Pb = Pb_base + i;
11460 // Pm = Pm_base;
11461 // Pn = Pn_base + i;
11462
11463 // Ra = *Pa;
11464 // Rb = *Pb;
11465 // Rm = *Pm;
11466 // Rn = *Pn;
11467
11468 // int iters = i;
11469 // for (j = 0; iters--; j++) {
11470 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11471 // MACC(Ra, Rb, t0, t1, t2);
11472 // Ra = *++Pa;
11473 // Rb = *--Pb;
11474 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11475 // MACC(Rm, Rn, t0, t1, t2);
11476 // Rm = *++Pm;
11477 // Rn = *--Pn;
11478 // }
11479
11480 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11481 // MACC(Ra, Rb, t0, t1, t2);
11482 // *Pm = Rm = t0 * inv;
11483 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11484 // MACC(Rm, Rn, t0, t1, t2);
11485
11486 // assert(t0 == 0, "broken Montgomery multiply");
11487
11488 // t0 = t1; t1 = t2; t2 = 0;
11489 // }
11490
11491 // for (i = len; i < 2*len; i++) {
11492 // int j;
11493
11494 // Pa = Pa_base + i-len;
11495 // Pb = Pb_base + len;
11496 // Pm = Pm_base + i-len;
11497 // Pn = Pn_base + len;
11498
11499 // Ra = *++Pa;
11500 // Rb = *--Pb;
11501 // Rm = *++Pm;
11502 // Rn = *--Pn;
11503
11504 // int iters = len*2-i-1;
11505 // for (j = i-len+1; iters--; j++) {
11506 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11507 // MACC(Ra, Rb, t0, t1, t2);
11508 // Ra = *++Pa;
11509 // Rb = *--Pb;
11510 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11511 // MACC(Rm, Rn, t0, t1, t2);
11512 // Rm = *++Pm;
11513 // Rn = *--Pn;
11514 // }
11515
11516 // Pm_base[i-len] = t0;
11517 // t0 = t1; t1 = t2; t2 = 0;
11518 // }
11519
11520 // while (t0)
11521 // t0 = sub(Pm_base, Pn_base, t0, len);
11522 // }
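  // where MACC(A, B, t0, t1, t2) denotes a 64x64->128-bit multiply-
  // accumulate into the triple-precision accumulator t2:t1:t0, roughly:
  //
  //   unsigned __int128 p = (unsigned __int128)A * B;
  //   t2:t1:t0 += p;   // add lo(p) to t0, hi(p) to t1, carries into t2
  //
  // MACC2, used in the squaring code below, accumulates the product
  // twice: t2:t1:t0 += 2*A*B.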
11523
11524 /**
11525 * Fast Montgomery squaring. This uses asymptotically 25% fewer
11526 * multiplies than Montgomery multiplication so it should be up to
11527 * 25% faster. However, its loop control is more complex and it
11528 * may actually run slower on some machines.
11529 *
11530 * Arguments:
11531 *
11532 * Inputs:
11533 * c_rarg0 - int array elements a
11534 * c_rarg1 - int array elements n (the modulus)
11535 * c_rarg2 - int length
11536 * c_rarg3 - int inv
11537 * c_rarg4 - int array elements m (the result)
11538 *
11539 */
11540 address generate_square() {
11541 Label argh;
11542 bind(argh);
11543 stop("MontgomeryMultiply total_allocation must be <= 8192");
11544
11545 align(CodeEntryAlignment);
11546 address entry = pc();
11547
11548 enter();
11549
11550 // Make room.
11551 cmpw(Rlen, 512);
11552 br(Assembler::HI, argh);
11553 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11554 andr(sp, Ra, -2 * wordSize);
11555
11556 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11557
11558 {
11559 // Copy input args, reversing as we go. We use Ra as a
11560 // temporary variable.
11561 reverse(Ra, Pa_base, Rlen, t0, t1);
11562 reverse(Ra, Pn_base, Rlen, t0, t1);
11563 }
11564
    // Push all callee-saved registers and also Pm_base which we'll need
    // at the end.
11567 save_regs();
11568
11569 mov(Pm_base, Ra);
11570
11571 mov(t0, zr);
11572 mov(t1, zr);
11573 mov(t2, zr);
11574
11575 block_comment("for (int i = 0; i < len; i++) {");
11576 mov(Ri, zr); {
11577 Label loop, end;
11578 bind(loop);
11579 cmp(Ri, Rlen);
11580 br(Assembler::GE, end);
11581
11582 pre1(Ri);
11583
11584 block_comment("for (j = (i+1)/2; j; j--) {"); {
11585 add(Rj, Ri, 1);
11586 lsr(Rj, Rj, 1);
11587 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11588 } block_comment(" } // j");
11589
11590 last_squaring(Ri);
11591
11592 block_comment(" for (j = i/2; j; j--) {"); {
11593 lsr(Rj, Ri, 1);
11594 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11595 } block_comment(" } // j");
11596
11597 post1_squaring();
11598 add(Ri, Ri, 1);
11599 cmp(Ri, Rlen);
11600 br(Assembler::LT, loop);
11601
11602 bind(end);
11603 block_comment("} // i");
11604 }
11605
11606 block_comment("for (int i = len; i < 2*len; i++) {");
11607 mov(Ri, Rlen); {
11608 Label loop, end;
11609 bind(loop);
11610 cmp(Ri, Rlen, Assembler::LSL, 1);
11611 br(Assembler::GE, end);
11612
11613 pre2(Ri, Rlen);
11614
11615 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
11616 lsl(Rj, Rlen, 1);
11617 sub(Rj, Rj, Ri);
11618 sub(Rj, Rj, 1);
11619 lsr(Rj, Rj, 1);
11620 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11621 } block_comment(" } // j");
11622
11623 last_squaring(Ri);
11624
11625 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
11626 lsl(Rj, Rlen, 1);
11627 sub(Rj, Rj, Ri);
11628 lsr(Rj, Rj, 1);
11629 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11630 } block_comment(" } // j");
11631
11632 post2(Ri, Rlen);
11633 add(Ri, Ri, 1);
11634 cmp(Ri, Rlen, Assembler::LSL, 1);
11635
11636 br(Assembler::LT, loop);
11637 bind(end);
11638 block_comment("} // i");
11639 }
11640
11641 normalize(Rlen);
11642
11643 mov(Ra, Pm_base); // Save Pm_base in Ra
11644 restore_regs(); // Restore caller's Pm_base
11645
11646 // Copy our result into caller's Pm_base
11647 reverse(Pm_base, Ra, Rlen, t0, t1);
11648
11649 leave();
11650 ret(lr);
11651
11652 return entry;
11653 }
11654 // In C, approximately:
11655
11656 // void
11657 // montgomery_square(julong Pa_base[], julong Pn_base[],
11658 // julong Pm_base[], julong inv, int len) {
11659 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11660 // julong *Pa, *Pb, *Pn, *Pm;
11661 // julong Ra, Rb, Rn, Rm;
11662
11663 // int i;
11664
11665 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11666
11667 // for (i = 0; i < len; i++) {
11668 // int j;
11669
11670 // Pa = Pa_base;
11671 // Pb = Pa_base + i;
11672 // Pm = Pm_base;
11673 // Pn = Pn_base + i;
11674
11675 // Ra = *Pa;
11676 // Rb = *Pb;
11677 // Rm = *Pm;
11678 // Rn = *Pn;
11679
11680 // int iters = (i+1)/2;
11681 // for (j = 0; iters--; j++) {
11682 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11683 // MACC2(Ra, Rb, t0, t1, t2);
11684 // Ra = *++Pa;
11685 // Rb = *--Pb;
11686 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11687 // MACC(Rm, Rn, t0, t1, t2);
11688 // Rm = *++Pm;
11689 // Rn = *--Pn;
11690 // }
11691 // if ((i & 1) == 0) {
11692 // assert(Ra == Pa_base[j], "must be");
11693 // MACC(Ra, Ra, t0, t1, t2);
11694 // }
11695 // iters = i/2;
11696 // assert(iters == i-j, "must be");
11697 // for (; iters--; j++) {
11698 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11699 // MACC(Rm, Rn, t0, t1, t2);
11700 // Rm = *++Pm;
11701 // Rn = *--Pn;
11702 // }
11703
11704 // *Pm = Rm = t0 * inv;
11705 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11706 // MACC(Rm, Rn, t0, t1, t2);
11707
11708 // assert(t0 == 0, "broken Montgomery multiply");
11709
11710 // t0 = t1; t1 = t2; t2 = 0;
11711 // }
11712
11713 // for (i = len; i < 2*len; i++) {
11714 // int start = i-len+1;
11715 // int end = start + (len - start)/2;
11716 // int j;
11717
11718 // Pa = Pa_base + i-len;
11719 // Pb = Pa_base + len;
11720 // Pm = Pm_base + i-len;
11721 // Pn = Pn_base + len;
11722
11723 // Ra = *++Pa;
11724 // Rb = *--Pb;
11725 // Rm = *++Pm;
11726 // Rn = *--Pn;
11727
11728 // int iters = (2*len-i-1)/2;
11729 // assert(iters == end-start, "must be");
11730 // for (j = start; iters--; j++) {
11731 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11732 // MACC2(Ra, Rb, t0, t1, t2);
11733 // Ra = *++Pa;
11734 // Rb = *--Pb;
11735 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11736 // MACC(Rm, Rn, t0, t1, t2);
11737 // Rm = *++Pm;
11738 // Rn = *--Pn;
11739 // }
11740 // if ((i & 1) == 0) {
11741 // assert(Ra == Pa_base[j], "must be");
11742 // MACC(Ra, Ra, t0, t1, t2);
11743 // }
11744 // iters = (2*len-i)/2;
11745 // assert(iters == len-j, "must be");
11746 // for (; iters--; j++) {
11747 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11748 // MACC(Rm, Rn, t0, t1, t2);
11749 // Rm = *++Pm;
11750 // Rn = *--Pn;
11751 // }
11752 // Pm_base[i-len] = t0;
11753 // t0 = t1; t1 = t2; t2 = 0;
11754 // }
11755
11756 // while (t0)
11757 // t0 = sub(Pm_base, Pn_base, t0, len);
11758 // }
11759 };
11760
  // Called from the interpreter or compiled code either to load the
  // multiple returned values of an inline type instance into registers,
  // or to store returned values into a newly allocated inline type
  // instance.
11765 address generate_return_value_stub(address destination, const char* name, bool has_res) {
    // We need to save all registers that the calling convention may use
    // so that the runtime call can read or update them. This needs to
    // be in sync with SharedRuntime::java_return_convention().
11769 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
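    // Each 64-bit register takes two 32-bit VMReg stack slots, hence
    // the paired _off/_2 entries below.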
11770 enum layout {
11771 j_rarg7_off = 0, j_rarg7_2, // j_rarg7 is r0
11772 j_rarg6_off, j_rarg6_2,
11773 j_rarg5_off, j_rarg5_2,
11774 j_rarg4_off, j_rarg4_2,
11775 j_rarg3_off, j_rarg3_2,
11776 j_rarg2_off, j_rarg2_2,
11777 j_rarg1_off, j_rarg1_2,
11778 j_rarg0_off, j_rarg0_2,
11779
11780 j_farg7_off, j_farg7_2,
11781 j_farg6_off, j_farg6_2,
11782 j_farg5_off, j_farg5_2,
11783 j_farg4_off, j_farg4_2,
11784 j_farg3_off, j_farg3_2,
11785 j_farg2_off, j_farg2_2,
11786 j_farg1_off, j_farg1_2,
11787 j_farg0_off, j_farg0_2,
11788
11789 rfp_off, rfp_off2,
11790 return_off, return_off2,
11791
11792 framesize // inclusive of return address
11793 };
11794
11795 CodeBuffer code(name, 512, 64);
11796 MacroAssembler* masm = new MacroAssembler(&code);
11797
11798 int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
11799 assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
11800 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
11801 int frame_size_in_words = frame_size_in_bytes / wordSize;
11802
11803 OopMapSet* oop_maps = new OopMapSet();
11804 OopMap* map = new OopMap(frame_size_in_slots, 0);
11805
11806 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
11807 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
11808 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
11809 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
11810 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
11811 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
11812 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
11813 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
11814
11815 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
11816 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
11817 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
11818 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
11819 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
11820 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
11821 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
11822 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
11823
11824 address start = __ pc();
11825
11826 __ enter(); // Save FP and LR before call
11827
11828 __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
11829 __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
11830 __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
11831 __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
11832
11833 __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
11834 __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
11835 __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
11836 __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
11837
11838 int frame_complete = __ offset();
11839
11840 // Set up last_Java_sp and last_Java_fp
11841 address the_pc = __ pc();
11842 __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
11843
11844 // Call runtime
11845 __ mov(c_rarg1, r0);
11846 __ mov(c_rarg0, rthread);
11847
11848 __ mov(rscratch1, destination);
11849 __ blr(rscratch1);
11850
11851 oop_maps->add_gc_map(the_pc - start, map);
11852
11853 __ reset_last_Java_frame(false);
11854
11855 __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
11856 __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
11857 __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
11858 __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
11859
11860 __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
11861 __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
11862 __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
11863 __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
11864
11865 __ leave();
11866
11867 // check for pending exceptions
11868 Label pending;
11869 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
11870 __ cbnz(rscratch1, pending);
11871
11872 if (has_res) {
11873 __ get_vm_result_oop(r0, rthread);
11874 }
11875
11876 __ ret(lr);
11877
11878 __ bind(pending);
11879 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
11880
11881 // -------------
11882 // make sure all code is generated
11883 masm->flush();
11884
11885 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
11886 return stub->entry_point();
11887 }
11888
11889 // Initialization
11890 void generate_preuniverse_stubs() {
11891 // preuniverse stubs are not needed for aarch64
11892 }
11893
11894 void generate_initial_stubs() {
11895 // Generate initial stubs and initializes the entry points
11896
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems smaller than the disadvantage of having a much
    // more complicated generator structure. See also the comment in
    // stubRoutines.hpp.
11902
11903 StubRoutines::_forward_exception_entry = generate_forward_exception();
11904
11905 StubRoutines::_call_stub_entry =
11906 generate_call_stub(StubRoutines::_call_stub_return_address);
11907
11908 // is referenced by megamorphic call
11909 StubRoutines::_catch_exception_entry = generate_catch_exception();
11910
11911 // Initialize table for copy memory (arraycopy) check.
11912 if (UnsafeMemoryAccess::_table == nullptr) {
11913 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11914 }
11915
11916 if (UseCRC32Intrinsics) {
11917 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11918 }
11919
11920 if (UseCRC32CIntrinsics) {
11921 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11922 }
11923
11924 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11925 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11926 }
11927
11928 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11929 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11930 }
11931
11932 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11933 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11934 StubRoutines::_hf2f = generate_float16ToFloat();
11935 StubRoutines::_f2hf = generate_floatToFloat16();
11936 }
11937
11938 if (InlineTypeReturnedAsFields) {
11939 StubRoutines::_load_inline_type_fields_in_regs =
11940 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
11941 StubRoutines::_store_inline_type_fields_to_buf =
11942 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
11943 }
11944
11945 }
11946
11947 void generate_continuation_stubs() {
11948 // Continuation stubs:
11949 StubRoutines::_cont_thaw = generate_cont_thaw();
11950 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11951 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11952 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11953 }
11954
11955 void generate_final_stubs() {
11956 // support for verify_oop (must happen after universe_init)
11957 if (VerifyOops) {
11958 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
11959 }
11960
11961 // arraycopy stubs used by compilers
11962 generate_arraycopy_stubs();
11963
11964 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11965
11966 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11967
11968 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11969 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11970
11971 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11972
11973 generate_atomic_entry_points();
11974
11975 #endif // LINUX
11976
11977 #ifdef COMPILER2
11978 if (UseSecondarySupersTable) {
11979 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11980 if (! InlineSecondarySupersTest) {
11981 generate_lookup_secondary_supers_table_stub();
11982 }
11983 }
11984 #endif
11985
11986 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11987
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11989 }
11990
11991 void generate_compiler_stubs() {
11992 #if COMPILER2_OR_JVMCI
11993
11994 if (UseSVE == 0) {
11995 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11996 }
11997
11998 // array equals stub for large arrays.
11999 if (!UseSimpleArrayEquals) {
12000 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
12001 }
12002
    // arrays_hashcode stub for large arrays.
12004 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
12005 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
12006 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
12007 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
12008 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
12009
12010 // byte_array_inflate stub for large arrays.
12011 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
12012
12013 // countPositives stub for large arrays.
12014 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
12015
12016 generate_compare_long_strings();
12017
12018 generate_string_indexof_stubs();
12019
12020 #ifdef COMPILER2
12021 if (UseMultiplyToLenIntrinsic) {
12022 StubRoutines::_multiplyToLen = generate_multiplyToLen();
12023 }
12024
12025 if (UseSquareToLenIntrinsic) {
12026 StubRoutines::_squareToLen = generate_squareToLen();
12027 }
12028
12029 if (UseMulAddIntrinsic) {
12030 StubRoutines::_mulAdd = generate_mulAdd();
12031 }
12032
12033 if (UseSIMDForBigIntegerShiftIntrinsics) {
12034 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
12035 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
12036 }
12037
12038 if (UseMontgomeryMultiplyIntrinsic) {
12039 StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
12040 StubCodeMark mark(this, stub_id);
12041 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
12042 StubRoutines::_montgomeryMultiply = g.generate_multiply();
12043 }
12044
12045 if (UseMontgomerySquareIntrinsic) {
12046 StubId stub_id = StubId::stubgen_montgomerySquare_id;
12047 StubCodeMark mark(this, stub_id);
12048 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
12049 // We use generate_multiply() rather than generate_square()
12050 // because it's faster for the sizes of modulus we care about.
12051 StubRoutines::_montgomerySquare = g.generate_multiply();
12052 }
12053
12054 #endif // COMPILER2
12055
12056 if (UseChaCha20Intrinsics) {
12057 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
12058 }
12059
12060 if (UseKyberIntrinsics) {
12061 StubRoutines::_kyberNtt = generate_kyberNtt();
12062 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
12063 StubRoutines::_kyberNttMult = generate_kyberNttMult();
12064 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
12065 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
12066 StubRoutines::_kyber12To16 = generate_kyber12To16();
12067 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
12068 }
12069
12070 if (UseDilithiumIntrinsics) {
12071 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
12072 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
12073 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
12074 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
12075 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
12076 }
12077
12078 if (UseBASE64Intrinsics) {
12079 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
12080 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
12081 }
12082
12083 // data cache line writeback
12084 StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
12085 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
12086
12087 if (UseAESIntrinsics) {
12088 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
12089 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
12090 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
12091 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
12092 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
12093 }
12094 if (UseGHASHIntrinsics) {
12095 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
12096 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
12097 }
12098 if (UseAESIntrinsics && UseGHASHIntrinsics) {
12099 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
12100 }
12101
12102 if (UseMD5Intrinsics) {
12103 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
12104 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
12105 }
12106 if (UseSHA1Intrinsics) {
12107 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
12108 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
12109 }
12110 if (UseSHA256Intrinsics) {
12111 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
12112 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
12113 }
12114 if (UseSHA512Intrinsics) {
12115 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
12116 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
12117 }
12118 if (UseSHA3Intrinsics) {
12119
12120 StubRoutines::_double_keccak = generate_double_keccak();
12121 if (UseSIMDForSHA3Intrinsic) {
12122 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
12123 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
12124 } else {
12125 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
12126 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
12127 }
12128 }
12129
12130 if (UsePoly1305Intrinsics) {
12131 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
12132 }
12133
12134 // generate Adler32 intrinsics code
12135 if (UseAdler32Intrinsics) {
12136 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
12137 }
12138
12139 #endif // COMPILER2_OR_JVMCI
12140 }
12141
12142 public:
12143 StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
12144 switch(blob_id) {
12145 case BlobId::stubgen_preuniverse_id:
12146 generate_preuniverse_stubs();
12147 break;
12148 case BlobId::stubgen_initial_id:
12149 generate_initial_stubs();
12150 break;
12151 case BlobId::stubgen_continuation_id:
12152 generate_continuation_stubs();
12153 break;
12154 case BlobId::stubgen_compiler_id:
12155 generate_compiler_stubs();
12156 break;
12157 case BlobId::stubgen_final_id:
12158 generate_final_stubs();
12159 break;
12160 default:
12161 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12162 break;
12163 };
12164 }
}; // end class StubGenerator
12166
12167 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
12168 StubGenerator g(code, blob_id);
12169 }
12170
12171
12172 #if defined (LINUX)
12173
12174 // Define pointers to atomic stubs and initialize them to point to the
12175 // code in atomic_aarch64.S.
12176
12177 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \
12178 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12179 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
12180 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12181 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
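// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands, roughly, to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;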
12182
12183 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12184 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12185 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12186 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12187 DEFAULT_ATOMIC_OP(xchg, 4, )
12188 DEFAULT_ATOMIC_OP(xchg, 8, )
12189 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12190 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12191 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12192 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12193 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12194 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12195 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12196 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12197 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12198 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12199
12200 #undef DEFAULT_ATOMIC_OP
12201
12202 #endif // LINUX