1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "compiler/oopMap.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/interpreter.hpp"
36 #include "memory/universe.hpp"
37 #include "nativeInst_aarch64.hpp"
38 #include "oops/instanceOop.hpp"
39 #include "oops/method.hpp"
40 #include "oops/objArrayKlass.hpp"
41 #include "oops/oop.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "prims/upcallLinker.hpp"
44 #include "runtime/arguments.hpp"
45 #include "runtime/atomicAccess.hpp"
46 #include "runtime/continuation.hpp"
47 #include "runtime/continuationEntry.inline.hpp"
48 #include "runtime/frame.inline.hpp"
49 #include "runtime/handles.inline.hpp"
50 #include "runtime/javaThread.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubCodeGenerator.hpp"
53 #include "runtime/stubRoutines.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/checkedCast.hpp"
56 #include "utilities/debug.hpp"
57 #include "utilities/globalDefinitions.hpp"
58 #include "utilities/intpow.hpp"
59 #include "utilities/powerOfTwo.hpp"
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_ZGC
64 #include "gc/z/zThreadLocalData.hpp"
65 #endif
66
67 // Declaration and definition of StubGenerator (no .hpp file).
68 // For a more detailed description of the stub routine structure
69 // see the comment in stubRoutines.hpp
70
71 #undef __
72 #define __ _masm->
73
74 #ifdef PRODUCT
75 #define BLOCK_COMMENT(str) /* nothing */
76 #else
77 #define BLOCK_COMMENT(str) __ block_comment(str)
78 #endif
79
80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
81
82 // Stub Code definitions
83
84 class StubGenerator: public StubCodeGenerator {
85 private:
86
87 #ifdef PRODUCT
88 #define inc_counter_np(counter) ((void)0)
89 #else
90 void inc_counter_np_(uint& counter) {
91 __ incrementw(ExternalAddress((address)&counter));
92 }
93 #define inc_counter_np(counter) \
94 BLOCK_COMMENT("inc_counter " #counter); \
95 inc_counter_np_(counter);
96 #endif
97
98 // Call stubs are used to call Java from C
99 //
100 // Arguments:
101 // c_rarg0: call wrapper address address
102 // c_rarg1: result address
103 // c_rarg2: result type BasicType
104 // c_rarg3: method Method*
105 // c_rarg4: (interpreter) entry point address
106 // c_rarg5: parameters intptr_t*
107 // c_rarg6: parameter size (in words) int
108 // c_rarg7: thread Thread*
109 //
110 // There is no return from the stub itself as any Java result
111 // is written to result
112 //
113 // we save r30 (lr) as the return PC at the base of the frame and
114 // link r29 (fp) below it as the frame pointer installing sp (r31)
115 // into fp.
116 //
117 // we save r0-r7, which accounts for all the c arguments.
118 //
119 // TODO: strictly do we need to save them all? they are treated as
120 // volatile by C so could we omit saving the ones we are going to
121 // place in global registers (thread? method?) or those we only use
122 // during setup of the Java call?
123 //
124 // we don't need to save r8 which C uses as an indirect result location
125 // return register.
126 //
127 // we don't need to save r9-r15 which both C and Java treat as
128 // volatile
129 //
130 // we don't need to save r16-18 because Java does not use them
131 //
132 // we save r19-r28 which Java uses as scratch registers and C
133 // expects to be callee-save
134 //
135 // we save the bottom 64 bits of each value stored in v8-v15; it is
136 // the responsibility of the caller to preserve larger values.
137 //
138 // so the stub frame looks like this when we enter Java code
139 //
140 // [ return_from_Java ] <--- sp
141 // [ argument word n ]
142 // ...
143 // -29 [ argument word 1 ]
144 // -28 [ saved Floating-point Control Register ] <--- sp_after_call
145 // -26 [ saved v15 ]
146 // -25 [ saved v14 ]
147 // -24 [ saved v13 ]
148 // -23 [ saved v12 ]
149 // -22 [ saved v11 ]
150 // -21 [ saved v10 ]
151 // -20 [ saved v9 ]
152 // -19 [ saved v8 ]
153 // -18 [ saved r28 ]
154 // -17 [ saved r27 ]
155 // -16 [ saved r26 ]
156 // -15 [ saved r25 ]
157 // -14 [ saved r24 ]
158 // -13 [ saved r23 ]
159 // -12 [ saved r22 ]
160 // -11 [ saved r21 ]
161 // -10 [ saved r20 ]
162 // -9 [ saved r19 ]
163 // -8 [ call wrapper (r0) ]
164 // -7 [ result (r1) ]
165 // -6 [ result type (r2) ]
166 // -5 [ method (r3) ]
167 // -4 [ entry point (r4) ]
168 // -3 [ parameters (r5) ]
169 // -2 [ parameter size (r6) ]
170 // -1 [ thread (r7) ]
171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
172 // 1 [ saved lr (r30) ]
173
174 // Call stub stack layout word offsets from fp
175 enum call_stub_layout {
176 sp_after_call_off = -28,
177
178 fpcr_off = sp_after_call_off,
179 d15_off = -26,
180 d13_off = -24,
181 d11_off = -22,
182 d9_off = -20,
183
184 r28_off = -18,
185 r26_off = -16,
186 r24_off = -14,
187 r22_off = -12,
188 r20_off = -10,
189 call_wrapper_off = -8,
190 result_off = -7,
191 result_type_off = -6,
192 method_off = -5,
193 entry_point_off = -4,
194 parameter_size_off = -2,
195 thread_off = -1,
196 fp_f = 0,
197 retaddr_off = 1,
198 };
199
200 address generate_call_stub(address& return_address) {
201 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
202 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
203 "adjust this code");
204
205 StubId stub_id = StubId::stubgen_call_stub_id;
206 StubCodeMark mark(this, stub_id);
207 address start = __ pc();
208
209 const Address sp_after_call (rfp, sp_after_call_off * wordSize);
210
211 const Address fpcr_save (rfp, fpcr_off * wordSize);
212 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
213 const Address result (rfp, result_off * wordSize);
214 const Address result_type (rfp, result_type_off * wordSize);
215 const Address method (rfp, method_off * wordSize);
216 const Address entry_point (rfp, entry_point_off * wordSize);
217 const Address parameter_size(rfp, parameter_size_off * wordSize);
218
219 const Address thread (rfp, thread_off * wordSize);
220
221 const Address d15_save (rfp, d15_off * wordSize);
222 const Address d13_save (rfp, d13_off * wordSize);
223 const Address d11_save (rfp, d11_off * wordSize);
224 const Address d9_save (rfp, d9_off * wordSize);
225
226 const Address r28_save (rfp, r28_off * wordSize);
227 const Address r26_save (rfp, r26_off * wordSize);
228 const Address r24_save (rfp, r24_off * wordSize);
229 const Address r22_save (rfp, r22_off * wordSize);
230 const Address r20_save (rfp, r20_off * wordSize);
231
232 // stub code
233
234 address aarch64_entry = __ pc();
235
236 // set up frame and move sp to end of save area
237 __ enter();
238 __ sub(sp, rfp, -sp_after_call_off * wordSize);
239
240 // save register parameters and Java scratch/global registers
241 // n.b. we save thread even though it gets installed in
242 // rthread because we want to sanity check rthread later
243 __ str(c_rarg7, thread);
244 __ strw(c_rarg6, parameter_size);
245 __ stp(c_rarg4, c_rarg5, entry_point);
246 __ stp(c_rarg2, c_rarg3, result_type);
247 __ stp(c_rarg0, c_rarg1, call_wrapper);
248
249 __ stp(r20, r19, r20_save);
250 __ stp(r22, r21, r22_save);
251 __ stp(r24, r23, r24_save);
252 __ stp(r26, r25, r26_save);
253 __ stp(r28, r27, r28_save);
254
255 __ stpd(v9, v8, d9_save);
256 __ stpd(v11, v10, d11_save);
257 __ stpd(v13, v12, d13_save);
258 __ stpd(v15, v14, d15_save);
259
260 __ get_fpcr(rscratch1);
261 __ str(rscratch1, fpcr_save);
262 // Set FPCR to the state we need. We do want Round to Nearest. We
263 // don't want non-IEEE rounding modes or floating-point traps.
264 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
265 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
266 __ set_fpcr(rscratch1);
267
268 // install Java thread in global register now we have saved
269 // whatever value it held
270 __ mov(rthread, c_rarg7);
271 // And method
272 __ mov(rmethod, c_rarg3);
273
274 // set up the heapbase register
275 __ reinit_heapbase();
276
277 #ifdef ASSERT
278 // make sure we have no pending exceptions
279 {
280 Label L;
281 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
282 __ cmp(rscratch1, (u1)NULL_WORD);
283 __ br(Assembler::EQ, L);
284 __ stop("StubRoutines::call_stub: entered with pending exception");
285 __ BIND(L);
286 }
287 #endif
288 // pass parameters if any
289 __ mov(esp, sp);
290 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
291 __ andr(sp, rscratch1, -2 * wordSize);
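// n.b. the andr with -2 * wordSize rounds sp down to a 16-byte
// boundary, as the AArch64 ABI requires for the parameter pushes below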
292
293 BLOCK_COMMENT("pass parameters if any");
294 Label parameters_done;
295 // parameter count is still in c_rarg6
296 // and parameter pointer identifying param 1 is in c_rarg5
297 __ cbzw(c_rarg6, parameters_done);
298
299 address loop = __ pc();
300 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
301 __ subsw(c_rarg6, c_rarg6, 1);
302 __ push(rscratch1);
303 __ br(Assembler::GT, loop);
304
305 __ BIND(parameters_done);
306
307 // call Java entry -- passing Method* and current sp
308 // rmethod: Method*
309 // r19_sender_sp: sender sp
310 BLOCK_COMMENT("call Java function");
311 __ mov(r19_sender_sp, sp);
312 __ blr(c_rarg4);
313
314 // we do this here because the notify will already have been done
315 // if we get to the next instruction via an exception
316 //
317 // n.b. adding this instruction here affects the calculation of
318 // whether or not a routine returns to the call stub (used when
319 // doing stack walks) since the normal test is to check the return
320 // pc against the address saved below. so we may need to allow for
321 // this extra instruction in the check.
322
323 // save current address for use by exception handling code
324
325 return_address = __ pc();
326
327 // store result depending on type (everything that is not
328 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
329 // n.b. this assumes Java returns an integral result in r0
330 // and a floating result in j_farg0
331 // All of j_rargN may be used to return inline type fields so be careful
332 // not to clobber those.
333 // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
334 // assignment of Rresult below.
335 Register Rresult = r14, Rresult_type = r15;
336 __ ldr(Rresult, result);
337 Label is_long, is_float, is_double, check_prim, exit;
338 __ ldr(Rresult_type, result_type);
339 __ cmp(Rresult_type, (u1)T_OBJECT);
340 __ br(Assembler::EQ, check_prim);
341 __ cmp(Rresult_type, (u1)T_LONG);
342 __ br(Assembler::EQ, is_long);
343 __ cmp(Rresult_type, (u1)T_FLOAT);
344 __ br(Assembler::EQ, is_float);
345 __ cmp(Rresult_type, (u1)T_DOUBLE);
346 __ br(Assembler::EQ, is_double);
347
348 // handle T_INT case
349 __ strw(r0, Address(Rresult));
350
351 __ BIND(exit);
352
353 // pop parameters
354 __ sub(esp, rfp, -sp_after_call_off * wordSize);
355
356 #ifdef ASSERT
357 // verify that threads correspond
358 {
359 Label L, S;
360 __ ldr(rscratch1, thread);
361 __ cmp(rthread, rscratch1);
362 __ br(Assembler::NE, S);
363 __ get_thread(rscratch1);
364 __ cmp(rthread, rscratch1);
365 __ br(Assembler::EQ, L);
366 __ BIND(S);
367 __ stop("StubRoutines::call_stub: threads must correspond");
368 __ BIND(L);
369 }
370 #endif
371
372 __ pop_cont_fastpath(rthread);
373
374 // restore callee-save registers
375 __ ldpd(v15, v14, d15_save);
376 __ ldpd(v13, v12, d13_save);
377 __ ldpd(v11, v10, d11_save);
378 __ ldpd(v9, v8, d9_save);
379
380 __ ldp(r28, r27, r28_save);
381 __ ldp(r26, r25, r26_save);
382 __ ldp(r24, r23, r24_save);
383 __ ldp(r22, r21, r22_save);
384 __ ldp(r20, r19, r20_save);
385
386 // restore fpcr
387 __ ldr(rscratch1, fpcr_save);
388 __ set_fpcr(rscratch1);
389
390 __ ldp(c_rarg0, c_rarg1, call_wrapper);
391 __ ldrw(c_rarg2, result_type);
392 __ ldr(c_rarg3, method);
393 __ ldp(c_rarg4, c_rarg5, entry_point);
394 __ ldp(c_rarg6, c_rarg7, parameter_size);
395
396 // leave frame and return to caller
397 __ leave();
398 __ ret(lr);
399
400 // handle return types different from T_INT
401 __ BIND(check_prim);
402 if (InlineTypeReturnedAsFields) {
403 // Check for scalarized return value
404 __ tbz(r0, 0, is_long);
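// r0 carries the InlineKlass tagged with 1 in its low bit when fields
// are returned in registers; the andr below clears the tag before the
// klass is dereferenced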
405 // Load pack handler address
406 __ andr(rscratch1, r0, -2);
407 __ ldr(rscratch1, Address(rscratch1, InstanceKlass::adr_inlineklass_fixed_block_offset()));
408 __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
409 __ blr(rscratch1);
410 __ b(exit);
411 }
412
413 __ BIND(is_long);
414 __ str(r0, Address(Rresult, 0));
415 __ br(Assembler::AL, exit);
416
417 __ BIND(is_float);
418 __ strs(j_farg0, Address(Rresult, 0));
419 __ br(Assembler::AL, exit);
420
421 __ BIND(is_double);
422 __ strd(j_farg0, Address(Rresult, 0));
423 __ br(Assembler::AL, exit);
424
425 return start;
426 }
427
428 // Return point for a Java call if there's an exception thrown in
429 // Java code. The exception is caught and transformed into a
430 // pending exception stored in JavaThread that can be tested from
431 // within the VM.
432 //
433 // Note: Usually the parameters are removed by the callee. In case
434 // of an exception crossing an activation frame boundary, that is
435 // not the case if the callee is compiled code => need to setup the
436 // sp.
437 //
438 // r0: exception oop
439
440 address generate_catch_exception() {
441 StubId stub_id = StubId::stubgen_catch_exception_id;
442 StubCodeMark mark(this, stub_id);
443 address start = __ pc();
444
445 // same as in generate_call_stub():
446 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
447 const Address thread (rfp, thread_off * wordSize);
448
449 #ifdef ASSERT
450 // verify that threads correspond
451 {
452 Label L, S;
453 __ ldr(rscratch1, thread);
454 __ cmp(rthread, rscratch1);
455 __ br(Assembler::NE, S);
456 __ get_thread(rscratch1);
457 __ cmp(rthread, rscratch1);
458 __ br(Assembler::EQ, L);
459 __ bind(S);
460 __ stop("StubRoutines::catch_exception: threads must correspond");
461 __ bind(L);
462 }
463 #endif
464
465 // set pending exception
466 __ verify_oop(r0);
467
468 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
469 __ mov(rscratch1, (address)__FILE__);
470 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
471 __ movw(rscratch1, (int)__LINE__);
472 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
473
474 // complete return to VM
475 assert(StubRoutines::_call_stub_return_address != nullptr,
476 "_call_stub_return_address must have been generated before");
477 __ b(StubRoutines::_call_stub_return_address);
478
479 return start;
480 }
481
482 // Continuation point for runtime calls returning with a pending
483 // exception. The pending exception check happened in the runtime
484 // or native call stub. The pending exception in Thread is
485 // converted into a Java-level exception.
486 //
487 // Contract with Java-level exception handlers:
488 // r0: exception
489 // r3: throwing pc
490 //
491 // NOTE: At entry of this stub, exception-pc must be in LR !!
492
493 // NOTE: this is always used as a jump target within generated code
494 // so it just needs to be generated code with no prolog
495
496 address generate_forward_exception() {
497 StubId stub_id = StubId::stubgen_forward_exception_id;
498 StubCodeMark mark(this, stub_id);
499 address start = __ pc();
500
501 // Upon entry, LR points to the return address returning into
502 // Java (interpreted or compiled) code; i.e., the return address
503 // becomes the throwing pc.
504 //
505 // Arguments pushed before the runtime call are still on the stack
506 // but the exception handler will reset the stack pointer ->
507 // ignore them. A potential result in registers can be ignored as
508 // well.
509
510 #ifdef ASSERT
511 // make sure this code is only executed if there is a pending exception
512 {
513 Label L;
514 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
515 __ cbnz(rscratch1, L);
516 __ stop("StubRoutines::forward exception: no pending exception (1)");
517 __ bind(L);
518 }
519 #endif
520
521 // compute exception handler into r19
522
523 // call the VM to find the handler address associated with the
524 // caller address. pass thread in r0 and caller pc (ret address)
525 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
526 // the stack.
527 __ mov(c_rarg1, lr);
528 // lr will be trashed by the VM call so we move it to R19
529 // (callee-saved) because we also need to pass it to the handler
530 // returned by this call.
531 __ mov(r19, lr);
532 BLOCK_COMMENT("call exception_handler_for_return_address");
533 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
534 SharedRuntime::exception_handler_for_return_address),
535 rthread, c_rarg1);
536 // Reinitialize the ptrue predicate register, in case the external runtime
537 // call clobbers ptrue reg, as we may return to SVE compiled code.
538 __ reinitialize_ptrue();
539
540 // we should not really care that lr is no longer the callee
541 // address. we saved the value the handler needs in r19 so we can
542 // just copy it to r3. however, the C2 handler will push its own
543 // frame and then call into the VM, and the VM code asserts that
544 // the PC for the frame above the handler belongs to a compiled
545 // Java method. So, we restore lr here to satisfy that assert.
546 __ mov(lr, r19);
547 // setup r0 & r3 & clear pending exception
548 __ mov(r3, r19);
549 __ mov(r19, r0);
550 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
551 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
552
553 #ifdef ASSERT
554 // make sure exception is set
555 {
556 Label L;
557 __ cbnz(r0, L);
558 __ stop("StubRoutines::forward exception: no pending exception (2)");
559 __ bind(L);
560 }
561 #endif
562
563 // continue at exception handler
564 // r0: exception
565 // r3: throwing pc
566 // r19: exception handler
567 __ verify_oop(r0);
568 __ br(r19);
569
570 return start;
571 }
572
573 // Non-destructive plausibility checks for oops
574 //
575 // Arguments:
576 // r0: oop to verify
577 // rscratch1: error message
578 //
579 // Stack after saving c_rarg3:
580 // [tos + 0]: saved c_rarg3
581 // [tos + 1]: saved c_rarg2
582 // [tos + 2]: saved lr
583 // [tos + 3]: saved rscratch2
584 // [tos + 4]: saved r0
585 // [tos + 5]: saved rscratch1
586 address generate_verify_oop() {
587 StubId stub_id = StubId::stubgen_verify_oop_id;
588 StubCodeMark mark(this, stub_id);
589 address start = __ pc();
590
591 Label exit, error;
592
593 // save c_rarg2 and c_rarg3
594 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
595
596 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
597 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
598 __ ldr(c_rarg3, Address(c_rarg2));
599 __ add(c_rarg3, c_rarg3, 1);
600 __ str(c_rarg3, Address(c_rarg2));
601
602 // object is in r0
603 // make sure object is 'reasonable'
604 __ cbz(r0, exit); // if obj is null it is OK
605
606 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
607 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
608
609 // return if everything seems ok
610 __ bind(exit);
611
612 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
613 __ ret(lr);
614
615 // handle errors
616 __ bind(error);
617 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
618
619 __ push(RegSet::range(r0, r29), sp);
620 // debug(char* msg, int64_t pc, int64_t regs[])
621 __ mov(c_rarg0, rscratch1); // pass address of error message
622 __ mov(c_rarg1, lr); // pass return address
623 __ mov(c_rarg2, sp); // pass address of regs on stack
624 #ifndef PRODUCT
625 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
626 #endif
627 BLOCK_COMMENT("call MacroAssembler::debug");
628 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
629 __ blr(rscratch1);
630 __ hlt(0);
631
632 return start;
633 }
634
635 // Generate indices for iota vector.
636 address generate_iota_indices(StubId stub_id) {
637 __ align(CodeEntryAlignment);
638 StubCodeMark mark(this, stub_id);
639 address start = __ pc();
640 // B
641 __ emit_data64(0x0706050403020100, relocInfo::none);
642 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
643 // H
644 __ emit_data64(0x0003000200010000, relocInfo::none);
645 __ emit_data64(0x0007000600050004, relocInfo::none);
646 // S
647 __ emit_data64(0x0000000100000000, relocInfo::none);
648 __ emit_data64(0x0000000300000002, relocInfo::none);
649 // D
650 __ emit_data64(0x0000000000000000, relocInfo::none);
651 __ emit_data64(0x0000000000000001, relocInfo::none);
652 // S - FP
653 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
654 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
655 // D - FP
656 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
657 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
658 return start;
659 }
660
661 // The inner part of zero_words(). This is the bulk operation,
662 // zeroing words in blocks, possibly using DC ZVA to do it. The
663 // caller is responsible for zeroing the last few words.
664 //
665 // Inputs:
666 // r10: the HeapWord-aligned base address of an array to zero.
667 // r11: the count in HeapWords, r11 > 0.
668 //
669 // Returns r10 and r11, adjusted for the caller to clear.
670 // r10: the base address of the tail of words left to clear.
671 // r11: the number of words in the tail.
672 // r11 < MacroAssembler::zero_words_block_size.
673
674 address generate_zero_blocks() {
675 Label done;
676 Label base_aligned;
677
678 Register base = r10, cnt = r11;
679
680 __ align(CodeEntryAlignment);
681 StubId stub_id = StubId::stubgen_zero_blocks_id;
682 StubCodeMark mark(this, stub_id);
683 address start = __ pc();
684
685 if (UseBlockZeroing) {
686 int zva_length = VM_Version::zva_length();
687
688 // Ensure ZVA length can be divided by 16. This is required by
689 // the subsequent operations.
690 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
691
692 __ tbz(base, 3, base_aligned);
693 __ str(zr, Address(__ post(base, 8)));
694 __ sub(cnt, cnt, 1);
695 __ bind(base_aligned);
696
697 // Ensure count >= zva_length * 2 so that it still deserves a zva after
698 // alignment.
699 Label small;
700 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
701 __ subs(rscratch1, cnt, low_limit >> 3);
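// n.b. low_limit is in bytes; >> 3 converts it to a HeapWord count to
// match cnt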
702 __ br(Assembler::LT, small);
703 __ zero_dcache_blocks(base, cnt);
704 __ bind(small);
705 }
706
707 {
708 // Number of stp instructions we'll unroll
709 const int unroll =
710 MacroAssembler::zero_words_block_size / 2;
711 // Clear the remaining blocks.
712 Label loop;
713 __ subs(cnt, cnt, unroll * 2);
714 __ br(Assembler::LT, done);
715 __ bind(loop);
716 for (int i = 0; i < unroll; i++)
717 __ stp(zr, zr, __ post(base, 16));
718 __ subs(cnt, cnt, unroll * 2);
719 __ br(Assembler::GE, loop);
720 __ bind(done);
721 __ add(cnt, cnt, unroll * 2);
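// undo the final subtraction so cnt again holds the number of words
// still to clear (< zero_words_block_size), as promised to the caller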
722 }
723
724 __ ret(lr);
725
726 return start;
727 }
728
729
730 typedef enum {
731 copy_forwards = 1,
732 copy_backwards = -1
733 } copy_direction;
734
735 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
736 // for arraycopy stubs.
737 class ArrayCopyBarrierSetHelper : StackObj {
738 BarrierSetAssembler* _bs_asm;
739 MacroAssembler* _masm;
740 DecoratorSet _decorators;
741 BasicType _type;
742 Register _gct1;
743 Register _gct2;
744 Register _gct3;
745 FloatRegister _gcvt1;
746 FloatRegister _gcvt2;
747 FloatRegister _gcvt3;
748
749 public:
750 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
751 DecoratorSet decorators,
752 BasicType type,
753 Register gct1,
754 Register gct2,
755 Register gct3,
756 FloatRegister gcvt1,
757 FloatRegister gcvt2,
758 FloatRegister gcvt3)
759 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
760 _masm(masm),
761 _decorators(decorators),
762 _type(type),
763 _gct1(gct1),
764 _gct2(gct2),
765 _gct3(gct3),
766 _gcvt1(gcvt1),
767 _gcvt2(gcvt2),
768 _gcvt3(gcvt3) {
769 }
770
771 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
772 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
773 dst1, dst2, src,
774 _gct1, _gct2, _gcvt1);
775 }
776
777 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
778 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
779 dst, src1, src2,
780 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
781 }
782
783 void copy_load_at_16(Register dst1, Register dst2, Address src) {
784 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
785 dst1, dst2, src,
786 _gct1);
787 }
788
789 void copy_store_at_16(Address dst, Register src1, Register src2) {
790 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
791 dst, src1, src2,
792 _gct1, _gct2, _gct3);
793 }
794
795 void copy_load_at_8(Register dst, Address src) {
796 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
797 dst, noreg, src,
798 _gct1);
799 }
800
801 void copy_store_at_8(Address dst, Register src) {
802 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
803 dst, src, noreg,
804 _gct1, _gct2, _gct3);
805 }
806 };
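// Typical usage, as in generate_copy_longs and copy_memory below:
//
// ArrayCopyBarrierSetHelper bs(_masm, decorators, type,
// gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
// bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
// bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);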
807
808 // Bulk copy of blocks of 8 words.
809 //
810 // count is a count of words.
811 //
812 // Precondition: count >= 8
813 //
814 // Postconditions:
815 //
816 // The least significant bit of count contains the remaining count
817 // of words to copy. The rest of count is trash.
818 //
819 // s and d are adjusted to point to the remaining words to copy
820 //
821 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
822 BasicType type;
823 copy_direction direction;
824
825 switch (stub_id) {
826 case StubId::stubgen_copy_byte_f_id:
827 direction = copy_forwards;
828 type = T_BYTE;
829 break;
830 case StubId::stubgen_copy_byte_b_id:
831 direction = copy_backwards;
832 type = T_BYTE;
833 break;
834 case StubId::stubgen_copy_oop_f_id:
835 direction = copy_forwards;
836 type = T_OBJECT;
837 break;
838 case StubId::stubgen_copy_oop_b_id:
839 direction = copy_backwards;
840 type = T_OBJECT;
841 break;
842 case StubId::stubgen_copy_oop_uninit_f_id:
843 direction = copy_forwards;
844 type = T_OBJECT;
845 break;
846 case StubId::stubgen_copy_oop_uninit_b_id:
847 direction = copy_backwards;
848 type = T_OBJECT;
849 break;
850 default:
851 ShouldNotReachHere();
852 }
853
854 int unit = wordSize * direction;
855 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
856
857 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
858 t4 = r7, t5 = r11, t6 = r12, t7 = r13;
859 const Register stride = r14;
860 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
861 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
862 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
863
864 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
865 assert_different_registers(s, d, count, rscratch1, rscratch2);
866
867 Label again, drain;
868
869 __ align(CodeEntryAlignment);
870
871 StubCodeMark mark(this, stub_id);
872
873 address start = __ pc();
874
875 Label unaligned_copy_long;
876 if (AvoidUnalignedAccesses) {
877 __ tbnz(d, 3, unaligned_copy_long);
878 }
879
880 if (direction == copy_forwards) {
881 __ sub(s, s, bias);
882 __ sub(d, d, bias);
883 }
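// n.b. for a forwards copy s and d are biased down by one block so that
// the loop below can use the same offsets (2/4/6 * unit plus a
// pre-indexed 8 * unit update) for both copy directions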
884
885 #ifdef ASSERT
886 // Make sure we are never given < 8 words
887 {
888 Label L;
889 __ cmp(count, (u1)8);
890 __ br(Assembler::GE, L);
891 __ stop("generate_copy_longs called with < 8 words");
892 __ bind(L);
893 }
894 #endif
895
896 // Fill 8 registers
897 if (UseSIMDForMemoryOps) {
898 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
899 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
900 } else {
901 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
902 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
903 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
904 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
905 }
906
907 __ subs(count, count, 16);
908 __ br(Assembler::LO, drain);
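// subtract 16 rather than 8: 8 for the words just loaded and another 8
// for the words each loop iteration loads ahead; if this underflows
// there is no full iteration to run and we simply drain the loaded
// registers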
909
910 int prefetch = PrefetchCopyIntervalInBytes;
911 bool use_stride = false;
912 if (direction == copy_backwards) {
913 use_stride = prefetch > 256;
914 prefetch = -prefetch;
915 if (use_stride) __ mov(stride, prefetch);
916 }
917
918 __ bind(again);
919
920 if (PrefetchCopyIntervalInBytes > 0)
921 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
922
923 if (UseSIMDForMemoryOps) {
924 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
925 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
926 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
927 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
928 } else {
929 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
930 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
931 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
932 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
933 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
934 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
935 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
936 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
937 }
938
939 __ subs(count, count, 8);
940 __ br(Assembler::HS, again);
941
942 // Drain
943 __ bind(drain);
944 if (UseSIMDForMemoryOps) {
945 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
946 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
947 } else {
948 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
949 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
950 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
951 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
952 }
953
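// copy any remaining 4 word and/or 2 word subblocks; as in the
// unaligned case below, bits 2 and 1 of count tell us whether each
// subblock is present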
954 {
955 Label L1, L2;
956 __ tbz(count, exact_log2(4), L1);
957 if (UseSIMDForMemoryOps) {
958 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
959 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
960 } else {
961 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
962 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
963 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
964 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
965 }
966 __ bind(L1);
967
968 if (direction == copy_forwards) {
969 __ add(s, s, bias);
970 __ add(d, d, bias);
971 }
972
973 __ tbz(count, 1, L2);
974 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
975 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
976 __ bind(L2);
977 }
978
979 __ ret(lr);
980
981 if (AvoidUnalignedAccesses) {
982 Label drain, again;
983 // Register order for storing. Order is different for backward copy.
984
985 __ bind(unaligned_copy_long);
986
987 // source address is even aligned, target odd aligned
988 //
989 // when forward copying word pairs we read long pairs at offsets
990 // {0, 2, 4, 6} (in long words). when backwards copying we read
991 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
992 // address by -2 in the forwards case so we can compute the
993 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
994 // or -1.
995 //
996 // when forward copying we need to store 1 word, 3 pairs and
997 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
998 // zero offset we adjust the destination by -1 which means we
999 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
1000 //
1001 // When backwards copying we need to store 1 word, 3 pairs and
1002 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
1003 // offsets {1, 3, 5, 7, 8} * unit.
1004
1005 if (direction == copy_forwards) {
1006 __ sub(s, s, 16);
1007 __ sub(d, d, 8);
1008 }
1009
1010 // Fill 8 registers
1011 //
1012 // for forwards copy s was offset by -16 from the original input
1013 // value of s so the register contents are at these offsets
1014 // relative to the 64 byte block addressed by that original input
1015 // and so on for each successive 64 byte block when s is updated
1016 //
1017 // t0 at offset 0, t1 at offset 8
1018 // t2 at offset 16, t3 at offset 24
1019 // t4 at offset 32, t5 at offset 40
1020 // t6 at offset 48, t7 at offset 56
1021
1022 // for backwards copy s was not offset so the register contents
1023 // are at these offsets into the preceding 64 byte block
1024 // relative to that original input and so on for each successive
1025 // preceding 64 byte block when s is updated. this explains the
1026 // slightly counter-intuitive looking pattern of register usage
1027 // in the stp instructions for backwards copy.
1028 //
1029 // t0 at offset -16, t1 at offset -8
1030 // t2 at offset -32, t3 at offset -24
1031 // t4 at offset -48, t5 at offset -40
1032 // t6 at offset -64, t7 at offset -56
1033
1034 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1035 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1036 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1037 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1038
1039 __ subs(count, count, 16);
1040 __ br(Assembler::LO, drain);
1041
1042 int prefetch = PrefetchCopyIntervalInBytes;
1043 bool use_stride = false;
1044 if (direction == copy_backwards) {
1045 use_stride = prefetch > 256;
1046 prefetch = -prefetch;
1047 if (use_stride) __ mov(stride, prefetch);
1048 }
1049
1050 __ bind(again);
1051
1052 if (PrefetchCopyIntervalInBytes > 0)
1053 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1054
1055 if (direction == copy_forwards) {
1056 // allowing for the offset of -8 the store instructions place
1057 // registers into the target 64 byte block at the following
1058 // offsets
1059 //
1060 // t0 at offset 0
1061 // t1 at offset 8, t2 at offset 16
1062 // t3 at offset 24, t4 at offset 32
1063 // t5 at offset 40, t6 at offset 48
1064 // t7 at offset 56
1065
1066 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1067 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1068 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1069 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1070 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1071 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1072 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1073 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1074 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1075 } else {
1076 // d was not offset when we started so the registers are
1077 // written into the 64 byte block preceding d with the following
1078 // offsets
1079 //
1080 // t1 at offset -8
1081 // t3 at offset -24, t0 at offset -16
1082 // t5 at offset -40, t2 at offset -32
1083 // t7 at offset -56, t4 at offset -48
1084 // t6 at offset -64
1085 //
1086 // note that this matches the offsets previously noted for the
1087 // loads
1088
1089 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1090 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1091 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1092 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1093 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1094 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1095 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1096 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1097 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1098 }
1099
1100 __ subs(count, count, 8);
1101 __ br(Assembler::HS, again);
1102
1103 // Drain
1104 //
1105 // this uses the same pattern of offsets and register arguments
1106 // as above
1107 __ bind(drain);
1108 if (direction == copy_forwards) {
1109 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1110 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1111 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1112 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1113 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1114 } else {
1115 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1116 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1117 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1118 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1119 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1120 }
1121 // now we need to copy any remaining part block which may
1122 // include a 4 word subblock and/or a 2 word subblock.
1123 // bits 2 and 1 in the count are the tell-tale for whether we
1124 // have each such subblock
1125 {
1126 Label L1, L2;
1127 __ tbz(count, exact_log2(4), L1);
1128 // this is the same as above but copying only 4 longs hence
1129 // with only one intervening stp between the str instructions
1130 // but note that the offsets and registers still follow the
1131 // same pattern
1132 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1133 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1134 if (direction == copy_forwards) {
1135 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1136 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1137 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1138 } else {
1139 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1140 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1141 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1142 }
1143 __ bind(L1);
1144
1145 __ tbz(count, 1, L2);
1146 // this is the same as above but copying only 2 longs hence
1147 // there is no intervening stp between the str instructions
1148 // but note that the offset and register patterns are still
1149 // the same
1150 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1151 if (direction == copy_forwards) {
1152 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1153 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1154 } else {
1155 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1156 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1157 }
1158 __ bind(L2);
1159
1160 // for forwards copy we need to re-adjust the offsets we
1161 // applied so that s and d follow the last words written
1162
1163 if (direction == copy_forwards) {
1164 __ add(s, s, 16);
1165 __ add(d, d, 8);
1166 }
1167
1168 }
1169
1170 __ ret(lr);
1171 }
1172
1173 return start;
1174 }
1175
1176 // Small copy: less than 16 bytes.
1177 //
1178 // NB: Ignores all of the bits of count which represent more than 15
1179 // bytes, so a caller doesn't have to mask them.
1180
1181 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1182 bool is_backwards = step < 0;
1183 size_t granularity = g_uabs(step);
1184 int direction = is_backwards ? -1 : 1;
1185
1186 Label Lword, Lint, Lshort, Lbyte;
1187
1188 assert(granularity
1189 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1190
1191 const Register t0 = r3;
1192 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1193 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1194
1195 // ??? I don't know if this bit-test-and-branch is the right thing
1196 // to do. It does a lot of jumping, resulting in several
1197 // mispredicted branches. It might make more sense to do this
1198 // with something like Duff's device with a single computed branch.
1199
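// each tbz below tests the bit of count that selects an 8, 4, 2 or 1
// byte chunk (counted in units of granularity) and copies that chunk
// only if the bit is set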
1200 __ tbz(count, 3 - exact_log2(granularity), Lword);
1201 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1202 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1203 __ bind(Lword);
1204
1205 if (granularity <= sizeof (jint)) {
1206 __ tbz(count, 2 - exact_log2(granularity), Lint);
1207 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1208 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1209 __ bind(Lint);
1210 }
1211
1212 if (granularity <= sizeof (jshort)) {
1213 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1214 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1215 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1216 __ bind(Lshort);
1217 }
1218
1219 if (granularity <= sizeof (jbyte)) {
1220 __ tbz(count, 0, Lbyte);
1221 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1222 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1223 __ bind(Lbyte);
1224 }
1225 }
1226
1227 // All-singing all-dancing memory copy.
1228 //
1229 // Copy count units of memory from s to d. The size of a unit is
1230 // step, which can be positive or negative depending on the direction
1231 // of copy. If is_aligned is false, we align the source address.
1232 //
1233
1234 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1235 Register s, Register d, Register count, int step) {
1236 copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1237 bool is_backwards = step < 0;
1238 unsigned int granularity = g_uabs(step);
1239 const Register t0 = r3, t1 = r4;
1240
1241 // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't matter because we always
1242 // load all the data before writing anything
1243 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1244 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1245 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1246 const Register send = r17, dend = r16;
1247 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1248 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1249 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1250
1251 if (PrefetchCopyIntervalInBytes > 0)
1252 __ prfm(Address(s, 0), PLDL1KEEP);
1253 __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1254 __ br(Assembler::HI, copy_big);
1255
1256 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1257 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1258
1259 __ cmp(count, u1(16/granularity));
1260 __ br(Assembler::LS, copy16);
1261
1262 __ cmp(count, u1(64/granularity));
1263 __ br(Assembler::HI, copy80);
1264
1265 __ cmp(count, u1(32/granularity));
1266 __ br(Assembler::LS, copy32);
1267
1268 // 33..64 bytes
1269 if (UseSIMDForMemoryOps) {
1270 bs.copy_load_at_32(v0, v1, Address(s, 0));
1271 bs.copy_load_at_32(v2, v3, Address(send, -32));
1272 bs.copy_store_at_32(Address(d, 0), v0, v1);
1273 bs.copy_store_at_32(Address(dend, -32), v2, v3);
1274 } else {
1275 bs.copy_load_at_16(t0, t1, Address(s, 0));
1276 bs.copy_load_at_16(t2, t3, Address(s, 16));
1277 bs.copy_load_at_16(t4, t5, Address(send, -32));
1278 bs.copy_load_at_16(t6, t7, Address(send, -16));
1279
1280 bs.copy_store_at_16(Address(d, 0), t0, t1);
1281 bs.copy_store_at_16(Address(d, 16), t2, t3);
1282 bs.copy_store_at_16(Address(dend, -32), t4, t5);
1283 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1284 }
1285 __ b(finish);
1286
1287 // 17..32 bytes
1288 __ bind(copy32);
1289 bs.copy_load_at_16(t0, t1, Address(s, 0));
1290 bs.copy_load_at_16(t6, t7, Address(send, -16));
1291
1292 bs.copy_store_at_16(Address(d, 0), t0, t1);
1293 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1294 __ b(finish);
1295
1296 // 65..80/96 bytes
1297 // (96 bytes if SIMD because we do 32 bytes per instruction)
1298 __ bind(copy80);
1299 if (UseSIMDForMemoryOps) {
1300 bs.copy_load_at_32(v0, v1, Address(s, 0));
1301 bs.copy_load_at_32(v2, v3, Address(s, 32));
1302 // Unaligned pointers can be an issue for copying.
1303 // The issue is more likely when the granularity of the data is
1304 // less than 4 (sizeof(jint)): pointers for arrays of jint are at least
1305 // 4 byte aligned and pointers for arrays of jlong are 8 byte aligned.
1306 // The largest performance drop has been seen for the 65-80 byte range.
1307 // For such cases, using a pair of ldp/stp instead of the third pair of
1308 // ldpq/stpq fixes the performance issue.
1309 if (granularity < sizeof (jint)) {
1310 Label copy96;
1311 __ cmp(count, u1(80/granularity));
1312 __ br(Assembler::HI, copy96);
1313 bs.copy_load_at_16(t0, t1, Address(send, -16));
1314
1315 bs.copy_store_at_32(Address(d, 0), v0, v1);
1316 bs.copy_store_at_32(Address(d, 32), v2, v3);
1317
1318 bs.copy_store_at_16(Address(dend, -16), t0, t1);
1319 __ b(finish);
1320
1321 __ bind(copy96);
1322 }
1323 bs.copy_load_at_32(v4, v5, Address(send, -32));
1324
1325 bs.copy_store_at_32(Address(d, 0), v0, v1);
1326 bs.copy_store_at_32(Address(d, 32), v2, v3);
1327
1328 bs.copy_store_at_32(Address(dend, -32), v4, v5);
1329 } else {
1330 bs.copy_load_at_16(t0, t1, Address(s, 0));
1331 bs.copy_load_at_16(t2, t3, Address(s, 16));
1332 bs.copy_load_at_16(t4, t5, Address(s, 32));
1333 bs.copy_load_at_16(t6, t7, Address(s, 48));
1334 bs.copy_load_at_16(t8, t9, Address(send, -16));
1335
1336 bs.copy_store_at_16(Address(d, 0), t0, t1);
1337 bs.copy_store_at_16(Address(d, 16), t2, t3);
1338 bs.copy_store_at_16(Address(d, 32), t4, t5);
1339 bs.copy_store_at_16(Address(d, 48), t6, t7);
1340 bs.copy_store_at_16(Address(dend, -16), t8, t9);
1341 }
1342 __ b(finish);
1343
1344 // 0..16 bytes
1345 __ bind(copy16);
1346 __ cmp(count, u1(8/granularity));
1347 __ br(Assembler::LO, copy8);
1348
1349 // 8..16 bytes
1350 bs.copy_load_at_8(t0, Address(s, 0));
1351 bs.copy_load_at_8(t1, Address(send, -8));
1352 bs.copy_store_at_8(Address(d, 0), t0);
1353 bs.copy_store_at_8(Address(dend, -8), t1);
1354 __ b(finish);
1355
1356 if (granularity < 8) {
1357 // 4..7 bytes
1358 __ bind(copy8);
1359 __ tbz(count, 2 - exact_log2(granularity), copy4);
1360 __ ldrw(t0, Address(s, 0));
1361 __ ldrw(t1, Address(send, -4));
1362 __ strw(t0, Address(d, 0));
1363 __ strw(t1, Address(dend, -4));
1364 __ b(finish);
1365 if (granularity < 4) {
1366 // 0..3 bytes
1367 __ bind(copy4);
1368 __ cbz(count, finish); // get rid of 0 case
1369 if (granularity == 2) {
1370 __ ldrh(t0, Address(s, 0));
1371 __ strh(t0, Address(d, 0));
1372 } else { // granularity == 1
1373 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1374 // the first and last byte.
1375 // Handle the 3 byte case by loading and storing base + count/2
1376 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1377 // This does mean in the 1 byte case we load/store the same
1378 // byte 3 times.
1379 __ lsr(count, count, 1);
1380 __ ldrb(t0, Address(s, 0));
1381 __ ldrb(t1, Address(send, -1));
1382 __ ldrb(t2, Address(s, count));
1383 __ strb(t0, Address(d, 0));
1384 __ strb(t1, Address(dend, -1));
1385 __ strb(t2, Address(d, count));
1386 }
1387 __ b(finish);
1388 }
1389 }
1390
1391 __ bind(copy_big);
1392 if (is_backwards) {
1393 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1394 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1395 }
1396
1397 // Now we've got the small case out of the way we can align the
1398 // source address on a 2-word boundary.
1399
1400 // Here we will materialize a count in r15, which is used by copy_memory_small
1401 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1402 // Up until here, we have used t9, which aliases r15, but from here on, that register
1403 // can not be used as a temp register, as it contains the count.
1404
1405 Label aligned;
1406
1407 if (is_aligned) {
1408 // We may have to adjust by 1 word to get s 2-word-aligned.
1409 __ tbz(s, exact_log2(wordSize), aligned);
1410 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1411 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1412 __ sub(count, count, wordSize/granularity);
1413 } else {
1414 if (is_backwards) {
1415 __ andr(r15, s, 2 * wordSize - 1);
1416 } else {
1417 __ neg(r15, s);
1418 __ andr(r15, r15, 2 * wordSize - 1);
1419 }
1420 // r15 is the byte adjustment needed to align s.
1421 __ cbz(r15, aligned);
1422 int shift = exact_log2(granularity);
1423 if (shift > 0) {
1424 __ lsr(r15, r15, shift);
1425 }
1426 __ sub(count, count, r15);
1427
1428 #if 0
1429 // ?? This code is only correct for a disjoint copy. It may or
1430 // may not make sense to use it in that case.
1431
1432 // Copy the first pair; s and d may not be aligned.
1433 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1434 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1435
1436 // Align s and d, adjust count
1437 if (is_backwards) {
1438 __ sub(s, s, r15);
1439 __ sub(d, d, r15);
1440 } else {
1441 __ add(s, s, r15);
1442 __ add(d, d, r15);
1443 }
1444 #else
1445 copy_memory_small(decorators, type, s, d, r15, step);
1446 #endif
1447 }
1448
1449 __ bind(aligned);
1450
1451 // s is now 2-word-aligned.
1452
1453 // We have a count of units and some trailing bytes. Adjust the
1454 // count and do a bulk copy of words. If the shift is zero
1455 // perform a move instead to benefit from zero latency moves.
1456 int shift = exact_log2(wordSize/granularity);
1457 if (shift > 0) {
1458 __ lsr(r15, count, shift);
1459 } else {
1460 __ mov(r15, count);
1461 }
1462 if (direction == copy_forwards) {
1463 if (type != T_OBJECT) {
1464 __ bl(StubRoutines::aarch64::copy_byte_f());
1465 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1466 __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
1467 } else {
1468 __ bl(StubRoutines::aarch64::copy_oop_f());
1469 }
1470 } else {
1471 if (type != T_OBJECT) {
1472 __ bl(StubRoutines::aarch64::copy_byte_b());
1473 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1474 __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
1475 } else {
1476 __ bl(StubRoutines::aarch64::copy_oop_b());
1477 }
1478 }
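// the bulk copy stub has advanced s and d past the words it copied; the
// low bits of the original unit count still describe the tail, which
// copy_memory_small picks up below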
1479
1480 // And the tail.
1481 copy_memory_small(decorators, type, s, d, count, step);
1482
1483 if (granularity >= 8) __ bind(copy8);
1484 if (granularity >= 4) __ bind(copy4);
1485 __ bind(finish);
1486 }
1487
1488
1489 void clobber_registers() {
1490 #ifdef ASSERT
1491 RegSet clobbered
1492 = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1493 __ mov(rscratch1, (uint64_t)0xdeadbeef);
1494 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1495 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1496 __ mov(*it, rscratch1);
1497 }
1498 #endif
1499
1500 }
1501
1502 // Scan over array at a for count oops, verifying each one.
1503 // Preserves a and count, clobbers rscratch1 and rscratch2.
1504 void verify_oop_array (int size, Register a, Register count, Register temp) {
1505 Label loop, end;
1506 __ mov(rscratch1, a);
1507 __ mov(rscratch2, zr);
1508 __ bind(loop);
1509 __ cmp(rscratch2, count);
1510 __ br(Assembler::HS, end);
1511 if (size == wordSize) {
1512 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1513 __ verify_oop(temp);
1514 } else {
1515 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1516 __ decode_heap_oop(temp); // calls verify_oop
1517 }
1518 __ add(rscratch2, rscratch2, 1);
1519 __ b(loop);
1520 __ bind(end);
1521 }
1522
1523 // Arguments:
1524 // stub_id - is used to name the stub and identify all details of
1525 // how to perform the copy.
1526 //
1527 // nopush_entry - is assigned to the stub's post push entry point unless
1528 // it is null
1529 //
1530 // Inputs:
1531 // c_rarg0 - source array address
1532 // c_rarg1 - destination array address
1533 // c_rarg2 - element count, treated as ssize_t, can be zero
1534 //
1535 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1536 // the hardware handle it. The two dwords within qwords that span
1537 // cache line boundaries will still be loaded and stored atomically.
1538 //
1539 // Side Effects: nopush_entry is set to the (post push) entry point
1540 // so it can be used by the corresponding conjoint
1541 // copy method
1542 //
1543 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
1544 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1545 RegSet saved_reg = RegSet::of(s, d, count);
1546 int size;
1547 bool aligned;
1548 bool is_oop;
1549 bool dest_uninitialized;
1550 switch (stub_id) {
1551 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1552 size = sizeof(jbyte);
1553 aligned = false;
1554 is_oop = false;
1555 dest_uninitialized = false;
1556 break;
1557 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1558 size = sizeof(jbyte);
1559 aligned = true;
1560 is_oop = false;
1561 dest_uninitialized = false;
1562 break;
1563 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1564 size = sizeof(jshort);
1565 aligned = false;
1566 is_oop = false;
1567 dest_uninitialized = false;
1568 break;
1569 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1570 size = sizeof(jshort);
1571 aligned = true;
1572 is_oop = false;
1573 dest_uninitialized = false;
1574 break;
1575 case StubId::stubgen_jint_disjoint_arraycopy_id:
1576 size = sizeof(jint);
1577 aligned = false;
1578 is_oop = false;
1579 dest_uninitialized = false;
1580 break;
1581 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1582 size = sizeof(jint);
1583 aligned = true;
1584 is_oop = false;
1585 dest_uninitialized = false;
1586 break;
1587 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1588 // since this is always aligned we can (should!) use the same
1589 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
1590 ShouldNotReachHere();
1591 break;
1592 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1593 size = sizeof(jlong);
1594 aligned = true;
1595 is_oop = false;
1596 dest_uninitialized = false;
1597 break;
1598 case StubId::stubgen_oop_disjoint_arraycopy_id:
1599 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1600 aligned = !UseCompressedOops;
1601 is_oop = true;
1602 dest_uninitialized = false;
1603 break;
1604 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1605 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1606 aligned = !UseCompressedOops;
1607 is_oop = true;
1608 dest_uninitialized = false;
1609 break;
1610 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1611 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1612 aligned = !UseCompressedOops;
1613 is_oop = true;
1614 dest_uninitialized = true;
1615 break;
1616 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1617 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1618 aligned = !UseCompressedOops;
1619 is_oop = true;
1620 dest_uninitialized = true;
1621 break;
1622 default:
1623 ShouldNotReachHere();
1624 break;
1625 }
1626
1627 __ align(CodeEntryAlignment);
1628 StubCodeMark mark(this, stub_id);
1629 address start = __ pc();
1630 __ enter();
1631
1632 if (nopush_entry != nullptr) {
1633 *nopush_entry = __ pc();
1634 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1635 BLOCK_COMMENT("Entry:");
1636 }
1637
1638 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1639 if (dest_uninitialized) {
1640 decorators |= IS_DEST_UNINITIALIZED;
1641 }
1642 if (aligned) {
1643 decorators |= ARRAYCOPY_ALIGNED;
1644 }
1645
1646 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1647 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1648
1649 if (is_oop) {
1650 // save regs before copy_memory
1651 __ push(RegSet::of(d, count), sp);
1652 }
1653 {
1654 // UnsafeMemoryAccess page error: continue after unsafe access
1655 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1656 UnsafeMemoryAccessMark umam(this, add_entry, true);
1657 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1658 }
1659
1660 if (is_oop) {
1661 __ pop(RegSet::of(d, count), sp);
1662 if (VerifyOops)
1663 verify_oop_array(size, d, count, r16);
1664 }
1665
1666 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1667
1668 __ leave();
1669 __ mov(r0, zr); // return 0
1670 __ ret(lr);
1671 return start;
1672 }
1673
1674 // Arguments:
1675 // stub_id - is used to name the stub and identify all details of
1676 // how to perform the copy.
1677 //
  //   nooverlap_target - identifies the (post push) entry for the
1679 // corresponding disjoint copy routine which can be
1680 // jumped to if the ranges do not actually overlap
1681 //
  //   nopush_entry - is assigned the stub's post push entry point
  //                  unless it is null
1684 //
1685 //
1686 // Inputs:
1687 // c_rarg0 - source array address
1688 // c_rarg1 - destination array address
1689 // c_rarg2 - element count, treated as ssize_t, can be zero
1690 //
1691 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1692 // the hardware handle it. The two dwords within qwords that span
1693 // cache line boundaries will still be loaded and stored atomically.
1694 //
1695 // Side Effects:
  //   nopush_entry is set to the stub's post push entry point so it
  //   can be used by the generic and unsafe copy stubs
1698 //
1699 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1700 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1701 RegSet saved_regs = RegSet::of(s, d, count);
1702 int size;
1703 bool aligned;
1704 bool is_oop;
1705 bool dest_uninitialized;
1706 switch (stub_id) {
1707 case StubId::stubgen_jbyte_arraycopy_id:
1708 size = sizeof(jbyte);
1709 aligned = false;
1710 is_oop = false;
1711 dest_uninitialized = false;
1712 break;
1713 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1714 size = sizeof(jbyte);
1715 aligned = true;
1716 is_oop = false;
1717 dest_uninitialized = false;
1718 break;
1719 case StubId::stubgen_jshort_arraycopy_id:
1720 size = sizeof(jshort);
1721 aligned = false;
1722 is_oop = false;
1723 dest_uninitialized = false;
1724 break;
1725 case StubId::stubgen_arrayof_jshort_arraycopy_id:
1726 size = sizeof(jshort);
1727 aligned = true;
1728 is_oop = false;
1729 dest_uninitialized = false;
1730 break;
1731 case StubId::stubgen_jint_arraycopy_id:
1732 size = sizeof(jint);
1733 aligned = false;
1734 is_oop = false;
1735 dest_uninitialized = false;
1736 break;
1737 case StubId::stubgen_arrayof_jint_arraycopy_id:
1738 size = sizeof(jint);
1739 aligned = true;
1740 is_oop = false;
1741 dest_uninitialized = false;
1742 break;
1743 case StubId::stubgen_jlong_arraycopy_id:
1744 // since this is always aligned we can (should!) use the same
      // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
1746 ShouldNotReachHere();
1747 break;
1748 case StubId::stubgen_arrayof_jlong_arraycopy_id:
1749 size = sizeof(jlong);
1750 aligned = true;
1751 is_oop = false;
1752 dest_uninitialized = false;
1753 break;
1754 case StubId::stubgen_oop_arraycopy_id:
1755 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1756 aligned = !UseCompressedOops;
1757 is_oop = true;
1758 dest_uninitialized = false;
1759 break;
1760 case StubId::stubgen_arrayof_oop_arraycopy_id:
1761 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1762 aligned = !UseCompressedOops;
1763 is_oop = true;
1764 dest_uninitialized = false;
1765 break;
1766 case StubId::stubgen_oop_arraycopy_uninit_id:
1767 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1768 aligned = !UseCompressedOops;
1769 is_oop = true;
1770 dest_uninitialized = true;
1771 break;
1772 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1773 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1774 aligned = !UseCompressedOops;
1775 is_oop = true;
1776 dest_uninitialized = true;
1777 break;
1778 default:
1779 ShouldNotReachHere();
1780 }
1781
1782 StubCodeMark mark(this, stub_id);
1783 address start = __ pc();
1784 __ enter();
1785
1786 if (nopush_entry != nullptr) {
1787 *nopush_entry = __ pc();
1788 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1789 BLOCK_COMMENT("Entry:");
1790 }
1791
1792 // use fwd copy when (d-s) above_equal (count*size)
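    // Illustrative example: with size == 4, s == 0x1000, d == 0x1008
    // and count == 4, d - s == 8 is below count << 2 == 16, so we
    // skip the jump to the disjoint stub and copy backwards. If d
    // were below s the unsigned difference would wrap to a huge
    // value, so the forward (disjoint) stub is taken, which is safe
    // when the destination precedes the source.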
1793 Label L_overlapping;
1794 __ sub(rscratch1, d, s);
1795 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1796 __ br(Assembler::LO, L_overlapping);
1797 __ b(RuntimeAddress(nooverlap_target));
1798 __ bind(L_overlapping);
1799
1800 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1801 if (dest_uninitialized) {
1802 decorators |= IS_DEST_UNINITIALIZED;
1803 }
1804 if (aligned) {
1805 decorators |= ARRAYCOPY_ALIGNED;
1806 }
1807
1808 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1809 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1810
1811 if (is_oop) {
1812 // save regs before copy_memory
1813 __ push(RegSet::of(d, count), sp);
1814 }
1815 {
1816 // UnsafeMemoryAccess page error: continue after unsafe access
1817 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1818 UnsafeMemoryAccessMark umam(this, add_entry, true);
1819 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1820 }
1821 if (is_oop) {
1822 __ pop(RegSet::of(d, count), sp);
1823 if (VerifyOops)
1824 verify_oop_array(size, d, count, r16);
1825 }
1826 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1827 __ leave();
1828 __ mov(r0, zr); // return 0
1829 __ ret(lr);
1830 return start;
1831 }
1832
1833 // Helper for generating a dynamic type check.
1834 // Smashes rscratch1, rscratch2.
1835 void generate_type_check(Register sub_klass,
1836 Register super_check_offset,
1837 Register super_klass,
1838 Register temp1,
1839 Register temp2,
1840 Register result,
1841 Label& L_success) {
1842 assert_different_registers(sub_klass, super_check_offset, super_klass);
1843
1844 BLOCK_COMMENT("type_check:");
1845
1846 Label L_miss;
1847
1848 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
1849 super_check_offset);
1850 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
1851
1852 // Fall through on failure!
1853 __ BIND(L_miss);
1854 }
1855
1856 //
1857 // Generate checkcasting array copy stub
1858 //
1859 // Input:
1860 // c_rarg0 - source array address
1861 // c_rarg1 - destination array address
1862 // c_rarg2 - element count, treated as ssize_t, can be zero
1863 // c_rarg3 - size_t ckoff (super_check_offset)
1864 // c_rarg4 - oop ckval (super_klass)
1865 //
1866 // Output:
1867 // r0 == 0 - success
1868 // r0 == -1^K - failure, where K is partial transfer count
1869 //
1870 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
1871 bool dest_uninitialized;
1872 switch (stub_id) {
1873 case StubId::stubgen_checkcast_arraycopy_id:
1874 dest_uninitialized = false;
1875 break;
1876 case StubId::stubgen_checkcast_arraycopy_uninit_id:
1877 dest_uninitialized = true;
1878 break;
1879 default:
1880 ShouldNotReachHere();
1881 }
1882
1883 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1884
1885 // Input registers (after setup_arg_regs)
1886 const Register from = c_rarg0; // source array address
1887 const Register to = c_rarg1; // destination array address
    const Register count      = c_rarg2;   // elements count
1889 const Register ckoff = c_rarg3; // super_check_offset
1890 const Register ckval = c_rarg4; // super_klass
1891
1892 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1893 RegSet wb_post_saved_regs = RegSet::of(count);
1894
1895 // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1896 const Register copied_oop = r22; // actual oop copied
    const Register count_save = r21;       // orig elements count
1898 const Register start_to = r20; // destination array start address
1899 const Register r19_klass = r19; // oop._klass
1900
1901 // Registers used as gc temps (r5, r6, r7 are save-on-call)
1902 const Register gct1 = r5, gct2 = r6, gct3 = r7;
1903
1904 //---------------------------------------------------------------
1905 // Assembler stub will be used for this call to arraycopy
1906 // if the two arrays are subtypes of Object[] but the
1907 // destination array type is not equal to or a supertype
1908 // of the source type. Each element must be separately
1909 // checked.
1910
1911 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1912 copied_oop, r19_klass, count_save);
1913
1914 __ align(CodeEntryAlignment);
1915 StubCodeMark mark(this, stub_id);
1916 address start = __ pc();
1917
1918 __ enter(); // required for proper stackwalking of RuntimeStub frame
1919
1920 #ifdef ASSERT
1921 // caller guarantees that the arrays really are different
1922 // otherwise, we would have to make conjoint checks
1923 { Label L;
1924 __ b(L); // conjoint check not yet implemented
1925 __ stop("checkcast_copy within a single array");
1926 __ bind(L);
1927 }
1928 #endif //ASSERT
1929
1930 // Caller of this entry point must set up the argument registers.
1931 if (nopush_entry != nullptr) {
1932 *nopush_entry = __ pc();
1933 BLOCK_COMMENT("Entry:");
1934 }
1935
1936 // Empty array: Nothing to do.
1937 __ cbz(count, L_done);
1938 __ push(RegSet::of(r19, r20, r21, r22), sp);
1939
1940 #ifdef ASSERT
1941 BLOCK_COMMENT("assert consistent ckoff/ckval");
1942 // The ckoff and ckval must be mutually consistent,
1943 // even though caller generates both.
1944 { Label L;
1945 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1946 __ ldrw(start_to, Address(ckval, sco_offset));
1947 __ cmpw(ckoff, start_to);
1948 __ br(Assembler::EQ, L);
1949 __ stop("super_check_offset inconsistent");
1950 __ bind(L);
1951 }
1952 #endif //ASSERT
1953
1954 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1955 bool is_oop = true;
1956 int element_size = UseCompressedOops ? 4 : 8;
1957 if (dest_uninitialized) {
1958 decorators |= IS_DEST_UNINITIALIZED;
1959 }
1960
1961 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1962 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1963
1964 // save the original count
1965 __ mov(count_save, count);
1966
1967 // Copy from low to high addresses
1968 __ mov(start_to, to); // Save destination array start address
1969 __ b(L_load_element);
1970
1971 // ======== begin loop ========
1972 // (Loop is rotated; its entry is L_load_element.)
1973 // Loop control:
1974 // for (; count != 0; count--) {
1975 // copied_oop = load_heap_oop(from++);
1976 // ... generate_type_check ...;
1977 // store_heap_oop(to++, copied_oop);
1978 // }
1979 __ align(OptoLoopAlignment);
1980
1981 __ BIND(L_store_element);
1982 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1983 __ post(to, element_size), copied_oop, noreg,
1984 gct1, gct2, gct3);
1985 __ sub(count, count, 1);
1986 __ cbz(count, L_do_card_marks);
1987
1988 // ======== loop entry is here ========
1989 __ BIND(L_load_element);
1990 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1991 copied_oop, noreg, __ post(from, element_size),
1992 gct1);
1993 __ cbz(copied_oop, L_store_element);
1994
1995 __ load_klass(r19_klass, copied_oop);// query the object klass
1996
1997 BLOCK_COMMENT("type_check:");
1998 generate_type_check(/*sub_klass*/r19_klass,
1999 /*super_check_offset*/ckoff,
2000 /*super_klass*/ckval,
2001 /*r_array_base*/gct1,
2002 /*temp2*/gct2,
2003 /*result*/r10, L_store_element);
2004
2005 // Fall through on failure!
2006
2007 // ======== end loop ========
2008
2009 // It was a real error; we must depend on the caller to finish the job.
2010 // Register count = remaining oops, count_orig = total oops.
2011 // Emit GC store barriers for the oops we have copied and report
2012 // their number to the caller.
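    // Illustrative example: if 3 of 10 oops were copied before a type
    // check failed, the subs leaves K == 3 in count, and the eon with
    // zr turns it into ~3 == -4 == -1^3; the caller recovers the
    // partial transfer count as ~r0.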
2013
2014 __ subs(count, count_save, count); // K = partially copied oop count
2015 __ eon(count, count, zr); // report (-1^K) to caller
2016 __ br(Assembler::EQ, L_done_pop);
2017
2018 __ BIND(L_do_card_marks);
2019 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
2020
2021 __ bind(L_done_pop);
2022 __ pop(RegSet::of(r19, r20, r21, r22), sp);
2023 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2024
2025 __ bind(L_done);
2026 __ mov(r0, count);
2027 __ leave();
2028 __ ret(lr);
2029
2030 return start;
2031 }
2032
2033 // Perform range checks on the proposed arraycopy.
  // Kills temp and rscratch1, but nothing else.
  // Also, clears the upper 32 bits of src_pos and dst_pos.
2036 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2037 Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
2039 Register dst_pos, // destination position (c_rarg3)
2040 Register length,
2041 Register temp,
2042 Label& L_failed) {
2043 BLOCK_COMMENT("arraycopy_range_checks:");
2044
2045 assert_different_registers(rscratch1, temp);
2046
2047 // if (src_pos + length > arrayOop(src)->length()) FAIL;
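    // Illustrative example: with src.length == 10, src_pos == 7 and
    // length == 5, temp == 12 and the unsigned HI compare against 10
    // branches to L_failed. The callers have already verified that
    // src_pos and length are non-negative 32-bit values, so addw
    // cannot wrap here.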
2048 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2049 __ addw(temp, length, src_pos);
2050 __ cmpw(temp, rscratch1);
2051 __ br(Assembler::HI, L_failed);
2052
2053 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2054 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2055 __ addw(temp, length, dst_pos);
2056 __ cmpw(temp, rscratch1);
2057 __ br(Assembler::HI, L_failed);
2058
2059 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2060 __ movw(src_pos, src_pos);
2061 __ movw(dst_pos, dst_pos);
2062
2063 BLOCK_COMMENT("arraycopy_range_checks done");
2064 }
2065
2066 // These stubs get called from some dumb test routine.
2067 // I'll write them properly when they're called from
2068 // something that's actually doing something.
2069 static void fake_arraycopy_stub(address src, address dst, int count) {
2070 assert(count == 0, "huh?");
2071 }
2072
2073
2074 //
2075 // Generate 'unsafe' array copy stub
2076 // Though just as safe as the other stubs, it takes an unscaled
2077 // size_t argument instead of an element count.
2078 //
2079 // Input:
2080 // c_rarg0 - source array address
2081 // c_rarg1 - destination array address
2082 // c_rarg2 - byte count, treated as ssize_t, can be zero
2083 //
2084 // Examines the alignment of the operands and dispatches
2085 // to a long, int, short, or byte copy loop.
2086 //
2087 address generate_unsafe_copy(address byte_copy_entry,
2088 address short_copy_entry,
2089 address int_copy_entry,
2090 address long_copy_entry) {
2091 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2092
2093 Label L_long_aligned, L_int_aligned, L_short_aligned;
2094 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2095
2096 __ align(CodeEntryAlignment);
2097 StubCodeMark mark(this, stub_id);
2098 address start = __ pc();
2099 __ enter(); // required for proper stackwalking of RuntimeStub frame
2100
2101 // bump this on entry, not on exit:
2102 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2103
2104 __ orr(rscratch1, s, d);
2105 __ orr(rscratch1, rscratch1, count);
2106
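    // s | d | count has bit k set iff any of the three has bit k set,
    // so clear low bits mean that source, destination and byte count
    // are all aligned to that granularity. Illustrative example:
    // s == 0x1000, d == 0x2004, count == 16 leaves bit 2 set but bits
    // 0-1 clear, so we dispatch to the int copy loop below.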
2107 __ andr(rscratch1, rscratch1, BytesPerLong-1);
2108 __ cbz(rscratch1, L_long_aligned);
2109 __ andr(rscratch1, rscratch1, BytesPerInt-1);
2110 __ cbz(rscratch1, L_int_aligned);
2111 __ tbz(rscratch1, 0, L_short_aligned);
2112 __ b(RuntimeAddress(byte_copy_entry));
2113
2114 __ BIND(L_short_aligned);
2115 __ lsr(count, count, LogBytesPerShort); // size => short_count
2116 __ b(RuntimeAddress(short_copy_entry));
2117 __ BIND(L_int_aligned);
2118 __ lsr(count, count, LogBytesPerInt); // size => int_count
2119 __ b(RuntimeAddress(int_copy_entry));
2120 __ BIND(L_long_aligned);
2121 __ lsr(count, count, LogBytesPerLong); // size => long_count
2122 __ b(RuntimeAddress(long_copy_entry));
2123
2124 return start;
2125 }
2126
2127 //
2128 // Generate generic array copy stubs
2129 //
2130 // Input:
2131 // c_rarg0 - src oop
2132 // c_rarg1 - src_pos (32-bits)
2133 // c_rarg2 - dst oop
2134 // c_rarg3 - dst_pos (32-bits)
2135 // c_rarg4 - element count (32-bits)
2136 //
2137 // Output:
2138 // r0 == 0 - success
2139 // r0 == -1^K - failure, where K is partial transfer count
2140 //
2141 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2142 address int_copy_entry, address oop_copy_entry,
2143 address long_copy_entry, address checkcast_copy_entry) {
2144 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2145
2146 Label L_failed, L_objArray;
2147 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2148
2149 // Input registers
2150 const Register src = c_rarg0; // source array oop
2151 const Register src_pos = c_rarg1; // source position
2152 const Register dst = c_rarg2; // destination array oop
2153 const Register dst_pos = c_rarg3; // destination position
2154 const Register length = c_rarg4;
2155
2156
2157 // Registers used as temps
2158 const Register dst_klass = c_rarg5;
2159
2160 __ align(CodeEntryAlignment);
2161
2162 StubCodeMark mark(this, stub_id);
2163
2164 address start = __ pc();
2165
2166 __ enter(); // required for proper stackwalking of RuntimeStub frame
2167
2168 // bump this on entry, not on exit:
2169 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2170
2171 //-----------------------------------------------------------------------
2172 // Assembler stub will be used for this call to arraycopy
2173 // if the following conditions are met:
2174 //
2175 // (1) src and dst must not be null.
2176 // (2) src_pos must not be negative.
2177 // (3) dst_pos must not be negative.
2178 // (4) length must not be negative.
2179 // (5) src klass and dst klass should be the same and not null.
2180 // (6) src and dst should be arrays.
2181 // (7) src_pos + length must not exceed length of src.
2182 // (8) dst_pos + length must not exceed length of dst.
2183 //
2184
2185 // if (src == nullptr) return -1;
2186 __ cbz(src, L_failed);
2187
2188 // if (src_pos < 0) return -1;
2189 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2190
2191 // if (dst == nullptr) return -1;
2192 __ cbz(dst, L_failed);
2193
2194 // if (dst_pos < 0) return -1;
2195 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2196
2197 // registers used as temp
2198 const Register scratch_length = r16; // elements count to copy
2199 const Register scratch_src_klass = r17; // array klass
2200 const Register lh = r15; // layout helper
2201
2202 // if (length < 0) return -1;
2203 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2204 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2205
2206 __ load_klass(scratch_src_klass, src);
2207 #ifdef ASSERT
2208 // assert(src->klass() != nullptr);
2209 {
2210 BLOCK_COMMENT("assert klasses not null {");
2211 Label L1, L2;
2212 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2213 __ bind(L1);
2214 __ stop("broken null klass");
2215 __ bind(L2);
2216 __ load_klass(rscratch1, dst);
2217 __ cbz(rscratch1, L1); // this would be broken also
2218 BLOCK_COMMENT("} assert klasses not null done");
2219 }
2220 #endif
2221
2222 // Load layout helper (32-bits)
2223 //
2224 // |array_tag| | header_size | element_type | |log2_element_size|
2225 // 32 30 24 16 8 2 0
2226 //
2227 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2228 //
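    // Illustrative decoding (the header size depends on the object
    // header layout): a typeArray of jint with a 16-byte header has
    // lh == (0x3 << 30) | (16 << 16) | (T_INT << 8) | 2, i.e.
    // array_tag 0x3, header_size 16, element_type T_INT and
    // log2_element_size 2.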
2229
2230 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2231
2232 // Handle objArrays completely differently...
2233 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2234 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2235 __ movw(rscratch1, objArray_lh);
2236 __ eorw(rscratch2, lh, rscratch1);
2237 __ cbzw(rscratch2, L_objArray);
2238
2239 // if (src->klass() != dst->klass()) return -1;
2240 __ load_klass(rscratch2, dst);
2241 __ eor(rscratch2, rscratch2, scratch_src_klass);
2242 __ cbnz(rscratch2, L_failed);
2243
2244 // Check for flat inline type array -> return -1
2245 __ test_flat_array_oop(src, rscratch2, L_failed);
2246
2247 // Check for null-free (non-flat) inline type array -> handle as object array
2248 __ test_null_free_array_oop(src, rscratch2, L_objArray);
2249
2250 // if (!src->is_Array()) return -1;
2251 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2252
2253 // At this point, it is known to be a typeArray (array_tag 0x3).
2254 #ifdef ASSERT
2255 {
2256 BLOCK_COMMENT("assert primitive array {");
2257 Label L;
2258 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2259 __ cmpw(lh, rscratch2);
2260 __ br(Assembler::GE, L);
2261 __ stop("must be a primitive array");
2262 __ bind(L);
2263 BLOCK_COMMENT("} assert primitive array done");
2264 }
2265 #endif
2266
2267 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2268 rscratch2, L_failed);
2269
2270 // TypeArrayKlass
2271 //
2272 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2273 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2274 //
2275
2276 const Register rscratch1_offset = rscratch1; // array offset
2277 const Register r15_elsize = lh; // element size
2278
2279 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2280 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2281 __ add(src, src, rscratch1_offset); // src array offset
2282 __ add(dst, dst, rscratch1_offset); // dst array offset
2283 BLOCK_COMMENT("choose copy loop based on element size");
2284
2285 // next registers should be set before the jump to corresponding stub
2286 const Register from = c_rarg0; // source array address
2287 const Register to = c_rarg1; // destination array address
2288 const Register count = c_rarg2; // elements count
2289
2290 // 'from', 'to', 'count' registers should be set in such order
2291 // since they are the same as 'src', 'src_pos', 'dst'.
2292
2293 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2294
2295 // The possible values of elsize are 0-3, i.e. exact_log2(element
2296 // size in bytes). We do a simple bitwise binary search.
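    // elsize bits (b1, b0) select the loop: 00 -> byte copy,
    // 01 -> short copy, 10 -> int copy, 11 -> long copy; tbnz tests
    // bit 1 first and then bit 0.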
2297 __ BIND(L_copy_bytes);
2298 __ tbnz(r15_elsize, 1, L_copy_ints);
2299 __ tbnz(r15_elsize, 0, L_copy_shorts);
2300 __ lea(from, Address(src, src_pos));// src_addr
2301 __ lea(to, Address(dst, dst_pos));// dst_addr
2302 __ movw(count, scratch_length); // length
2303 __ b(RuntimeAddress(byte_copy_entry));
2304
2305 __ BIND(L_copy_shorts);
2306 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2307 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2308 __ movw(count, scratch_length); // length
2309 __ b(RuntimeAddress(short_copy_entry));
2310
2311 __ BIND(L_copy_ints);
2312 __ tbnz(r15_elsize, 0, L_copy_longs);
2313 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2314 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2315 __ movw(count, scratch_length); // length
2316 __ b(RuntimeAddress(int_copy_entry));
2317
2318 __ BIND(L_copy_longs);
2319 #ifdef ASSERT
2320 {
2321 BLOCK_COMMENT("assert long copy {");
2322 Label L;
2323 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2324 __ cmpw(r15_elsize, LogBytesPerLong);
2325 __ br(Assembler::EQ, L);
2326 __ stop("must be long copy, but elsize is wrong");
2327 __ bind(L);
2328 BLOCK_COMMENT("} assert long copy done");
2329 }
2330 #endif
2331 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2332 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2333 __ movw(count, scratch_length); // length
2334 __ b(RuntimeAddress(long_copy_entry));
2335
2336 // ObjArrayKlass
2337 __ BIND(L_objArray);
2338 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2339
2340 Label L_plain_copy, L_checkcast_copy;
2341 // test array classes for subtyping
2342 __ load_klass(r15, dst);
2343 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2344 __ br(Assembler::NE, L_checkcast_copy);
2345
2346 // Identically typed arrays can be copied without element-wise checks.
2347 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2348 rscratch2, L_failed);
2349
2350 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2351 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2352 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2353 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2354 __ movw(count, scratch_length); // length
2355 __ BIND(L_plain_copy);
2356 __ b(RuntimeAddress(oop_copy_entry));
2357
2358 __ BIND(L_checkcast_copy);
2359 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2360 {
2361 // Before looking at dst.length, make sure dst is also an objArray.
2362 __ ldrw(rscratch1, Address(r15, lh_offset));
2363 __ movw(rscratch2, objArray_lh);
2364 __ eorw(rscratch1, rscratch1, rscratch2);
2365 __ cbnzw(rscratch1, L_failed);
2366
2367 // It is safe to examine both src.length and dst.length.
2368 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2369 r15, L_failed);
2370
2371 __ load_klass(dst_klass, dst); // reload
2372
2373 // Marshal the base address arguments now, freeing registers.
2374 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2375 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2376 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2377 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2378 __ movw(count, length); // length (reloaded)
2379 Register sco_temp = c_rarg3; // this register is free now
2380 assert_different_registers(from, to, count, sco_temp,
2381 dst_klass, scratch_src_klass);
2382 // assert_clean_int(count, sco_temp);
2383
2384 // Generate the type check.
2385 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2386 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2387
2388 // Smashes rscratch1, rscratch2
2389 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2390 L_plain_copy);
2391
2392 // Fetch destination element klass from the ObjArrayKlass header.
2393 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2394 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2395 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2396
2397 // the checkcast_copy loop needs two extra arguments:
2398 assert(c_rarg3 == sco_temp, "#3 already in place");
2399 // Set up arguments for checkcast_copy_entry.
2400 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2401 __ b(RuntimeAddress(checkcast_copy_entry));
2402 }
2403
2404 __ BIND(L_failed);
2405 __ mov(r0, -1);
2406 __ leave(); // required for proper stackwalking of RuntimeStub frame
2407 __ ret(lr);
2408
2409 return start;
2410 }
2411
2412 //
2413 // Generate stub for array fill. If "aligned" is true, the
2414 // "to" address is assumed to be heapword aligned.
2415 //
2416 // Arguments for generated stub:
2417 // to: c_rarg0
2418 // value: c_rarg1
2419 // count: c_rarg2 treated as signed
2420 //
2421 address generate_fill(StubId stub_id) {
2422 BasicType t;
2423 bool aligned;
2424
2425 switch (stub_id) {
2426 case StubId::stubgen_jbyte_fill_id:
2427 t = T_BYTE;
2428 aligned = false;
2429 break;
2430 case StubId::stubgen_jshort_fill_id:
2431 t = T_SHORT;
2432 aligned = false;
2433 break;
2434 case StubId::stubgen_jint_fill_id:
2435 t = T_INT;
2436 aligned = false;
2437 break;
2438 case StubId::stubgen_arrayof_jbyte_fill_id:
2439 t = T_BYTE;
2440 aligned = true;
2441 break;
2442 case StubId::stubgen_arrayof_jshort_fill_id:
2443 t = T_SHORT;
2444 aligned = true;
2445 break;
2446 case StubId::stubgen_arrayof_jint_fill_id:
2447 t = T_INT;
2448 aligned = true;
2449 break;
2450 default:
2451 ShouldNotReachHere();
    }
2453
2454 __ align(CodeEntryAlignment);
2455 StubCodeMark mark(this, stub_id);
2456 address start = __ pc();
2457
2458 BLOCK_COMMENT("Entry:");
2459
    const Register to        = c_rarg0;  // destination array address
2461 const Register value = c_rarg1; // value
2462 const Register count = c_rarg2; // elements count
2463
2464 const Register bz_base = r10; // base for block_zero routine
2465 const Register cnt_words = r11; // temp register
2466
2467 __ enter();
2468
2469 Label L_fill_elements, L_exit1;
2470
2471 int shift = -1;
2472 switch (t) {
2473 case T_BYTE:
2474 shift = 0;
2475 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2476 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2477 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2478 __ br(Assembler::LO, L_fill_elements);
2479 break;
2480 case T_SHORT:
2481 shift = 1;
2482 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2483 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2484 __ br(Assembler::LO, L_fill_elements);
2485 break;
2486 case T_INT:
2487 shift = 2;
2488 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2489 __ br(Assembler::LO, L_fill_elements);
2490 break;
2491 default: ShouldNotReachHere();
2492 }
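    // Illustrative example: a byte fill value of 0xAB has been widened
    // by the bfi sequence above to 0xABABABAB; the bfi(value, value,
    // 32, 32) further down widens it to 0xABABABABABABABAB so that a
    // single 64-bit store fills eight byte elements at once.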
2493
2494 // Align source address at 8 bytes address boundary.
2495 Label L_skip_align1, L_skip_align2, L_skip_align4;
2496 if (!aligned) {
2497 switch (t) {
2498 case T_BYTE:
2499 // One byte misalignment happens only for byte arrays.
2500 __ tbz(to, 0, L_skip_align1);
2501 __ strb(value, Address(__ post(to, 1)));
2502 __ subw(count, count, 1);
2503 __ bind(L_skip_align1);
2504 // Fallthrough
2505 case T_SHORT:
2506 // Two bytes misalignment happens only for byte and short (char) arrays.
2507 __ tbz(to, 1, L_skip_align2);
2508 __ strh(value, Address(__ post(to, 2)));
2509 __ subw(count, count, 2 >> shift);
2510 __ bind(L_skip_align2);
2511 // Fallthrough
2512 case T_INT:
2513 // Align to 8 bytes, we know we are 4 byte aligned to start.
2514 __ tbz(to, 2, L_skip_align4);
2515 __ strw(value, Address(__ post(to, 4)));
2516 __ subw(count, count, 4 >> shift);
2517 __ bind(L_skip_align4);
2518 break;
2519 default: ShouldNotReachHere();
2520 }
2521 }
2522
2523 //
2524 // Fill large chunks
2525 //
2526 __ lsrw(cnt_words, count, 3 - shift); // number of words
2527 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2528 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2529 if (UseBlockZeroing) {
2530 Label non_block_zeroing, rest;
2531 // If the fill value is zero we can use the fast zero_words().
2532 __ cbnz(value, non_block_zeroing);
2533 __ mov(bz_base, to);
2534 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2535 address tpc = __ zero_words(bz_base, cnt_words);
2536 if (tpc == nullptr) {
2537 fatal("CodeCache is full at generate_fill");
2538 }
2539 __ b(rest);
2540 __ bind(non_block_zeroing);
2541 __ fill_words(to, cnt_words, value);
2542 __ bind(rest);
2543 } else {
2544 __ fill_words(to, cnt_words, value);
2545 }
2546
2547 // Remaining count is less than 8 bytes. Fill it by a single store.
2548 // Note that the total length is no less than 8 bytes.
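    // Illustrative example: for a byte fill with 3 bytes left over,
    // 'to' is advanced to the end of the region and the str at
    // Address(to, -8) rewrites the final 8 bytes - the 3 remaining
    // bytes plus 5 bytes that already hold the fill pattern.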
2549 if (t == T_BYTE || t == T_SHORT) {
2550 Label L_exit1;
2551 __ cbzw(count, L_exit1);
2552 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2553 __ str(value, Address(to, -8)); // overwrite some elements
2554 __ bind(L_exit1);
2555 __ leave();
2556 __ ret(lr);
2557 }
2558
    // Handle fills of less than 8 bytes.
2560 Label L_fill_2, L_fill_4, L_exit2;
2561 __ bind(L_fill_elements);
2562 switch (t) {
2563 case T_BYTE:
2564 __ tbz(count, 0, L_fill_2);
2565 __ strb(value, Address(__ post(to, 1)));
2566 __ bind(L_fill_2);
2567 __ tbz(count, 1, L_fill_4);
2568 __ strh(value, Address(__ post(to, 2)));
2569 __ bind(L_fill_4);
2570 __ tbz(count, 2, L_exit2);
2571 __ strw(value, Address(to));
2572 break;
2573 case T_SHORT:
2574 __ tbz(count, 0, L_fill_4);
2575 __ strh(value, Address(__ post(to, 2)));
2576 __ bind(L_fill_4);
2577 __ tbz(count, 1, L_exit2);
2578 __ strw(value, Address(to));
2579 break;
2580 case T_INT:
2581 __ cbzw(count, L_exit2);
2582 __ strw(value, Address(to));
2583 break;
2584 default: ShouldNotReachHere();
2585 }
2586 __ bind(L_exit2);
2587 __ leave();
2588 __ ret(lr);
2589 return start;
2590 }
2591
2592 address generate_unsafecopy_common_error_exit() {
2593 address start_pc = __ pc();
2594 __ leave();
2595 __ mov(r0, 0);
2596 __ ret(lr);
2597 return start_pc;
2598 }
2599
2600 //
2601 // Generate 'unsafe' set memory stub
2602 // Though just as safe as the other stubs, it takes an unscaled
2603 // size_t (# bytes) argument instead of an element count.
2604 //
2605 // This fill operation is atomicity preserving: as long as the
2606 // address supplied is sufficiently aligned, all writes of up to 64
2607 // bits in size are single-copy atomic.
2608 //
2609 // Input:
2610 // c_rarg0 - destination array address
2611 // c_rarg1 - byte count (size_t)
2612 // c_rarg2 - byte value
2613 //
2614 address generate_unsafe_setmemory() {
2615 __ align(CodeEntryAlignment);
2616 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
2617 address start = __ pc();
2618
2619 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
2620 Label tail;
2621
2622 UnsafeMemoryAccessMark umam(this, true, false);
2623
2624 __ enter(); // required for proper stackwalking of RuntimeStub frame
2625
2626 __ dup(v0, __ T16B, value);
2627
2628 if (AvoidUnalignedAccesses) {
2629 __ cmp(count, (u1)16);
2630 __ br(__ LO, tail);
2631
2632 __ mov(rscratch1, 16);
2633 __ andr(rscratch2, dest, 15);
2634 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
2635 __ strq(v0, Address(dest));
2636 __ sub(count, count, rscratch1);
2637 __ add(dest, dest, rscratch1);
2638 }
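    // Illustrative example: if dest % 16 == 5 then rscratch1 == 11;
    // the unaligned strq above fills dest .. dest+15, dest advances by
    // 11 to the next 16-byte boundary, and any later stores that touch
    // the 5 overlapping bytes just rewrite them with the same value.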
2639
2640 __ subs(count, count, (u1)64);
2641 __ br(__ LO, tail);
2642 {
2643 Label again;
2644 __ bind(again);
2645 __ stpq(v0, v0, Address(dest));
2646 __ stpq(v0, v0, Address(dest, 32));
2647
2648 __ subs(count, count, 64);
2649 __ add(dest, dest, 64);
2650 __ br(__ HS, again);
2651 }
2652
2653 __ bind(tail);
2654 // The count of bytes is off by 64, but we don't need to correct
2655 // it because we're only going to use the least-significant few
2656 // count bits from here on.
2657 // __ add(count, count, 64);
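    // Illustrative example: with 23 bytes remaining, count holds
    // 23 - 64 == -41, whose low six bits are still 0b010111, so the
    // tbz/tst tests on bits 5..0 below behave exactly as if count
    // were 23.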
2658
2659 {
2660 Label dont;
2661 __ tbz(count, exact_log2(32), dont);
2662 __ stpq(v0, v0, __ post(dest, 32));
2663 __ bind(dont);
2664 }
2665 {
2666 Label dont;
2667 __ tbz(count, exact_log2(16), dont);
2668 __ strq(v0, __ post(dest, 16));
2669 __ bind(dont);
2670 }
2671 {
2672 Label dont;
2673 __ tbz(count, exact_log2(8), dont);
2674 __ strd(v0, __ post(dest, 8));
2675 __ bind(dont);
2676 }
2677
2678 Label finished;
2679 __ tst(count, 7);
2680 __ br(__ EQ, finished);
2681
2682 {
2683 Label dont;
2684 __ tbz(count, exact_log2(4), dont);
2685 __ strs(v0, __ post(dest, 4));
2686 __ bind(dont);
2687 }
2688 {
2689 Label dont;
2690 __ tbz(count, exact_log2(2), dont);
2691 __ bfi(value, value, 8, 8);
2692 __ strh(value, __ post(dest, 2));
2693 __ bind(dont);
2694 }
2695 {
2696 Label dont;
2697 __ tbz(count, exact_log2(1), dont);
2698 __ strb(value, Address(dest));
2699 __ bind(dont);
2700 }
2701
2702 __ bind(finished);
2703 __ leave();
2704 __ ret(lr);
2705
2706 return start;
2707 }
2708
2709 address generate_data_cache_writeback() {
2710 const Register line = c_rarg0; // address of line to write back
2711
2712 __ align(CodeEntryAlignment);
2713
2714 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
2715 StubCodeMark mark(this, stub_id);
2716
2717 address start = __ pc();
2718 __ enter();
2719 __ cache_wb(Address(line, 0));
2720 __ leave();
2721 __ ret(lr);
2722
2723 return start;
2724 }
2725
2726 address generate_data_cache_writeback_sync() {
2727 const Register is_pre = c_rarg0; // pre or post sync
2728
2729 __ align(CodeEntryAlignment);
2730
2731 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
2732 StubCodeMark mark(this, stub_id);
2733
2734 // pre wbsync is a no-op
    // post wbsync translates to a memory barrier
2736
2737 Label skip;
2738 address start = __ pc();
2739 __ enter();
2740 __ cbnz(is_pre, skip);
2741 __ cache_wbsync(false);
2742 __ bind(skip);
2743 __ leave();
2744 __ ret(lr);
2745
2746 return start;
2747 }
2748
2749 void generate_arraycopy_stubs() {
2750 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
2751 // entry immediately following their stack push. This can be used
2752 // as a post-push branch target for compatible stubs when they
2753 // identify a special case that can be handled by the fallback
    // stub, e.g. a disjoint copy stub may be used as a special case
2755 // fallback for its compatible conjoint copy stub.
2756 //
    // A nopush entry is always returned in the following local and
2758 // then published by assigning to the appropriate entry field in
2759 // class StubRoutines. The entry value is then passed to the
2760 // generator for the compatible stub. That means the entry must be
2761 // listed when saving to/restoring from the AOT cache, ensuring
2762 // that the inter-stub jumps are noted at AOT-cache save and
2763 // relocated at AOT cache load.
2764 address nopush_entry;
2765
2766 // generate the common exit first so later stubs can rely on it if
2767 // they want an UnsafeMemoryAccess exit non-local to the stub
2768 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2769 // register the stub as the default exit with class UnsafeMemoryAccess
2770 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2771
    // generate and publish AArch64-specific bulk copy routines first
2773 // so we can call them from other copy stubs
2774 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2775 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2776
2777 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2778 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2779
2780 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2781 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2782
2783 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2784
2785 //*** jbyte
2786 // Always need aligned and unaligned versions
2787 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2788 // disjoint nopush entry is needed by conjoint copy
2789 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2790 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2791 // conjoint nopush entry is needed by generic/unsafe copy
2792 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2793 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2794 // disjoint arrayof nopush entry is needed by conjoint copy
2795 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2796 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2797
2798 //*** jshort
2799 // Always need aligned and unaligned versions
2800 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2801 // disjoint nopush entry is needed by conjoint copy
2802 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
2803 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
2804 // conjoint nopush entry is used by generic/unsafe copy
2805 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
2806 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
2807 // disjoint arrayof nopush entry is needed by conjoint copy
2808 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
2809 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
2810
2811 //*** jint
2812 // Aligned versions
2813 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
2814 // disjoint arrayof nopush entry is needed by conjoint copy
2815 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
2816 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
2817 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2818 // jint_arraycopy_nopush always points to the unaligned version
2819 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
2820 // disjoint nopush entry is needed by conjoint copy
2821 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
2822 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
2823 // conjoint nopush entry is needed by generic/unsafe copy
2824 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
2825
2826 //*** jlong
2827 // It is always aligned
2828 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
2829 // disjoint arrayof nopush entry is needed by conjoint copy
2830 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
2831 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
2832 // conjoint nopush entry is needed by generic/unsafe copy
2833 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
2834 // disjoint normal/nopush and conjoint normal entries are not
2835 // generated since the arrayof versions are the same
2836 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2837 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
2838 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
2839
2840 //*** oops
2841 {
2842 StubRoutines::_arrayof_oop_disjoint_arraycopy
2843 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
2844 // disjoint arrayof nopush entry is needed by conjoint copy
2845 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
2846 StubRoutines::_arrayof_oop_arraycopy
2847 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
2848 // conjoint arrayof nopush entry is needed by generic/unsafe copy
2849 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
2850 // Aligned versions without pre-barriers
2851 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2852 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
2853 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
2854 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
2855 // note that we don't need a returned nopush entry because the
2856 // generic/unsafe copy does not cater for uninit arrays.
2857 StubRoutines::_arrayof_oop_arraycopy_uninit
2858 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
2859 }
2860
2861 // for oop copies reuse arrayof entries for non-arrayof cases
2862 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2863 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
2864 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
2865 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2866 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
2867 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
2868
2869 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
2870 // checkcast nopush entry is needed by generic copy
2871 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
2872 // note that we don't need a returned nopush entry because the
2873 // generic copy does not cater for uninit arrays.
2874 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
2875
2876 // unsafe arraycopy may fallback on conjoint stubs
2877 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
2878 StubRoutines::_jshort_arraycopy_nopush,
2879 StubRoutines::_jint_arraycopy_nopush,
2880 StubRoutines::_jlong_arraycopy_nopush);
2881
2882 // generic arraycopy may fallback on conjoint stubs
2883 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
2884 StubRoutines::_jshort_arraycopy_nopush,
2885 StubRoutines::_jint_arraycopy_nopush,
2886 StubRoutines::_oop_arraycopy_nopush,
2887 StubRoutines::_jlong_arraycopy_nopush,
2888 StubRoutines::_checkcast_arraycopy_nopush);
2889
2890 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
2891 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
2892 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
2893 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
2894 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
2895 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
2896 }
2897
2898 void generate_math_stubs() { Unimplemented(); }
2899
2900 // Arguments:
2901 //
2902 // Inputs:
2903 // c_rarg0 - source byte array address
2904 // c_rarg1 - destination byte array address
2905 // c_rarg2 - K (key) in little endian int array
2906 //
2907 address generate_aescrypt_encryptBlock() {
2908 __ align(CodeEntryAlignment);
2909 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2910 StubCodeMark mark(this, stub_id);
2911
2912 const Register from = c_rarg0; // source array address
2913 const Register to = c_rarg1; // destination array address
2914 const Register key = c_rarg2; // key array address
2915 const Register keylen = rscratch1;
2916
2917 address start = __ pc();
2918 __ enter();
2919
2920 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2921
2922 __ aesenc_loadkeys(key, keylen);
2923 __ aesecb_encrypt(from, to, keylen);
2924
2925 __ mov(r0, 0);
2926
2927 __ leave();
2928 __ ret(lr);
2929
2930 return start;
2931 }
2932
2933 // Arguments:
2934 //
2935 // Inputs:
2936 // c_rarg0 - source byte array address
2937 // c_rarg1 - destination byte array address
2938 // c_rarg2 - K (key) in little endian int array
2939 //
2940 address generate_aescrypt_decryptBlock() {
2941 assert(UseAES, "need AES cryptographic extension support");
2942 __ align(CodeEntryAlignment);
2943 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2944 StubCodeMark mark(this, stub_id);
2945 Label L_doLast;
2946
2947 const Register from = c_rarg0; // source array address
2948 const Register to = c_rarg1; // destination array address
2949 const Register key = c_rarg2; // key array address
2950 const Register keylen = rscratch1;
2951
2952 address start = __ pc();
2953 __ enter(); // required for proper stackwalking of RuntimeStub frame
2954
2955 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2956
2957 __ aesecb_decrypt(from, to, key, keylen);
2958
2959 __ mov(r0, 0);
2960
2961 __ leave();
2962 __ ret(lr);
2963
2964 return start;
2965 }
2966
2967 // Arguments:
2968 //
2969 // Inputs:
2970 // c_rarg0 - source byte array address
2971 // c_rarg1 - destination byte array address
2972 // c_rarg2 - K (key) in little endian int array
2973 // c_rarg3 - r vector byte array address
2974 // c_rarg4 - input length
2975 //
2976 // Output:
  //   r0 - input length
2978 //
2979 address generate_cipherBlockChaining_encryptAESCrypt() {
2980 assert(UseAES, "need AES cryptographic extension support");
2981 __ align(CodeEntryAlignment);
2982 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
2983 StubCodeMark mark(this, stub_id);
2984
2985 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2986
2987 const Register from = c_rarg0; // source array address
2988 const Register to = c_rarg1; // destination array address
2989 const Register key = c_rarg2; // key array address
2990 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
2991 // and left with the results of the last encryption block
2992 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
2993 const Register keylen = rscratch1;
2994
2995 address start = __ pc();
2996
2997 __ enter();
2998
2999 __ movw(rscratch2, len_reg);
3000
3001 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3002
3003 __ ld1(v0, __ T16B, rvec);
3004
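    // The expanded key holds 44, 52 or 60 ints for AES-128, -192 and
    // -256 respectively, so keylen below 52 selects the shortest
    // schedule, keylen == 52 the middle one, and anything larger the
    // full 14-round schedule.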
3005 __ cmpw(keylen, 52);
3006 __ br(Assembler::CC, L_loadkeys_44);
3007 __ br(Assembler::EQ, L_loadkeys_52);
3008
3009 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3010 __ rev32(v17, __ T16B, v17);
3011 __ rev32(v18, __ T16B, v18);
3012 __ BIND(L_loadkeys_52);
3013 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3014 __ rev32(v19, __ T16B, v19);
3015 __ rev32(v20, __ T16B, v20);
3016 __ BIND(L_loadkeys_44);
3017 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3018 __ rev32(v21, __ T16B, v21);
3019 __ rev32(v22, __ T16B, v22);
3020 __ rev32(v23, __ T16B, v23);
3021 __ rev32(v24, __ T16B, v24);
3022 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3023 __ rev32(v25, __ T16B, v25);
3024 __ rev32(v26, __ T16B, v26);
3025 __ rev32(v27, __ T16B, v27);
3026 __ rev32(v28, __ T16B, v28);
3027 __ ld1(v29, v30, v31, __ T16B, key);
3028 __ rev32(v29, __ T16B, v29);
3029 __ rev32(v30, __ T16B, v30);
3030 __ rev32(v31, __ T16B, v31);
3031
3032 __ BIND(L_aes_loop);
3033 __ ld1(v1, __ T16B, __ post(from, 16));
3034 __ eor(v0, __ T16B, v0, v1);
3035
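    // The CC/EQ branches below reuse the condition flags set by the
    // cmpw(keylen, 52) before the loop: none of the instructions in
    // this loop (vector loads/stores, eor, aese/aesmc, subw, cbnzw)
    // modify NZCV, so no re-compare is needed.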
3036 __ br(Assembler::CC, L_rounds_44);
3037 __ br(Assembler::EQ, L_rounds_52);
3038
3039 __ aese(v0, v17); __ aesmc(v0, v0);
3040 __ aese(v0, v18); __ aesmc(v0, v0);
3041 __ BIND(L_rounds_52);
3042 __ aese(v0, v19); __ aesmc(v0, v0);
3043 __ aese(v0, v20); __ aesmc(v0, v0);
3044 __ BIND(L_rounds_44);
3045 __ aese(v0, v21); __ aesmc(v0, v0);
3046 __ aese(v0, v22); __ aesmc(v0, v0);
3047 __ aese(v0, v23); __ aesmc(v0, v0);
3048 __ aese(v0, v24); __ aesmc(v0, v0);
3049 __ aese(v0, v25); __ aesmc(v0, v0);
3050 __ aese(v0, v26); __ aesmc(v0, v0);
3051 __ aese(v0, v27); __ aesmc(v0, v0);
3052 __ aese(v0, v28); __ aesmc(v0, v0);
3053 __ aese(v0, v29); __ aesmc(v0, v0);
3054 __ aese(v0, v30);
3055 __ eor(v0, __ T16B, v0, v31);
3056
3057 __ st1(v0, __ T16B, __ post(to, 16));
3058
3059 __ subw(len_reg, len_reg, 16);
3060 __ cbnzw(len_reg, L_aes_loop);
3061
3062 __ st1(v0, __ T16B, rvec);
3063
3064 __ mov(r0, rscratch2);
3065
3066 __ leave();
3067 __ ret(lr);
3068
3069 return start;
3070 }
3071
3072 // Arguments:
3073 //
3074 // Inputs:
3075 // c_rarg0 - source byte array address
3076 // c_rarg1 - destination byte array address
3077 // c_rarg2 - K (key) in little endian int array
3078 // c_rarg3 - r vector byte array address
3079 // c_rarg4 - input length
3080 //
3081 // Output:
3082 // r0 - input length
3083 //
3084 address generate_cipherBlockChaining_decryptAESCrypt() {
3085 assert(UseAES, "need AES cryptographic extension support");
3086 __ align(CodeEntryAlignment);
3087 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
3088 StubCodeMark mark(this, stub_id);
3089
3090 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3091
3092 const Register from = c_rarg0; // source array address
3093 const Register to = c_rarg1; // destination array address
3094 const Register key = c_rarg2; // key array address
3095 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
                                           // and left with the last input (cipher text) block (the chaining value)
3097 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3098 const Register keylen = rscratch1;
3099
3100 address start = __ pc();
3101
3102 __ enter();
3103
3104 __ movw(rscratch2, len_reg);
3105
3106 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3107
3108 __ ld1(v2, __ T16B, rvec);
3109
3110 __ ld1(v31, __ T16B, __ post(key, 16));
3111 __ rev32(v31, __ T16B, v31);
3112
3113 __ cmpw(keylen, 52);
3114 __ br(Assembler::CC, L_loadkeys_44);
3115 __ br(Assembler::EQ, L_loadkeys_52);
3116
3117 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3118 __ rev32(v17, __ T16B, v17);
3119 __ rev32(v18, __ T16B, v18);
3120 __ BIND(L_loadkeys_52);
3121 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3122 __ rev32(v19, __ T16B, v19);
3123 __ rev32(v20, __ T16B, v20);
3124 __ BIND(L_loadkeys_44);
3125 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3126 __ rev32(v21, __ T16B, v21);
3127 __ rev32(v22, __ T16B, v22);
3128 __ rev32(v23, __ T16B, v23);
3129 __ rev32(v24, __ T16B, v24);
3130 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3131 __ rev32(v25, __ T16B, v25);
3132 __ rev32(v26, __ T16B, v26);
3133 __ rev32(v27, __ T16B, v27);
3134 __ rev32(v28, __ T16B, v28);
3135 __ ld1(v29, v30, __ T16B, key);
3136 __ rev32(v29, __ T16B, v29);
3137 __ rev32(v30, __ T16B, v30);
3138
3139 __ BIND(L_aes_loop);
3140 __ ld1(v0, __ T16B, __ post(from, 16));
3141 __ orr(v1, __ T16B, v0, v0);
3142
3143 __ br(Assembler::CC, L_rounds_44);
3144 __ br(Assembler::EQ, L_rounds_52);
3145
3146 __ aesd(v0, v17); __ aesimc(v0, v0);
3147 __ aesd(v0, v18); __ aesimc(v0, v0);
3148 __ BIND(L_rounds_52);
3149 __ aesd(v0, v19); __ aesimc(v0, v0);
3150 __ aesd(v0, v20); __ aesimc(v0, v0);
3151 __ BIND(L_rounds_44);
3152 __ aesd(v0, v21); __ aesimc(v0, v0);
3153 __ aesd(v0, v22); __ aesimc(v0, v0);
3154 __ aesd(v0, v23); __ aesimc(v0, v0);
3155 __ aesd(v0, v24); __ aesimc(v0, v0);
3156 __ aesd(v0, v25); __ aesimc(v0, v0);
3157 __ aesd(v0, v26); __ aesimc(v0, v0);
3158 __ aesd(v0, v27); __ aesimc(v0, v0);
3159 __ aesd(v0, v28); __ aesimc(v0, v0);
3160 __ aesd(v0, v29); __ aesimc(v0, v0);
3161 __ aesd(v0, v30);
3162 __ eor(v0, __ T16B, v0, v31);
3163 __ eor(v0, __ T16B, v0, v2);
3164
3165 __ st1(v0, __ T16B, __ post(to, 16));
3166 __ orr(v2, __ T16B, v1, v1);
3167
3168 __ subw(len_reg, len_reg, 16);
3169 __ cbnzw(len_reg, L_aes_loop);
3170
3171 __ st1(v2, __ T16B, rvec);
3172
3173 __ mov(r0, rscratch2);
3174
3175 __ leave();
3176 __ ret(lr);
3177
3178 return start;
3179 }
3180
  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: in (the 128-bit value) and inc (the 64-bit increment);
  // both are preserved. The least-significant 64-bit word sits in the
  // upper dword of each vector, and the lower dword of inc must be
  // zero.
  // Output: result = in + inc.
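  // Illustrative example: if the low dword of 'in' is all ones and
  // 'inc' is 1, the addv wraps that lane to zero, the unsigned HI
  // compare marks the lane as overflowed (all ones), the ext swaps
  // that mask across to the lane holding the most-significant word,
  // and the subv then subtracts -1 from it, i.e. propagates the carry.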
3186 void be_add_128_64(FloatRegister result, FloatRegister in,
3187 FloatRegister inc, FloatRegister tmp) {
3188 assert_different_registers(result, tmp, inc);
3189
3190 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
3191 // input
3192 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
3193 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
3194 // MSD == 0 (must be!) to LSD
3195 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
3196 }
3197
3198 // CTR AES crypt.
3199 // Arguments:
3200 //
3201 // Inputs:
3202 // c_rarg0 - source byte array address
3203 // c_rarg1 - destination byte array address
3204 // c_rarg2 - K (key) in little endian int array
3205 // c_rarg3 - counter vector byte array address
3206 // c_rarg4 - input length
3207 // c_rarg5 - saved encryptedCounter start
3208 // c_rarg6 - saved used length
3209 //
3210 // Output:
3211 // r0 - input length
3212 //
3213 address generate_counterMode_AESCrypt() {
3214 const Register in = c_rarg0;
3215 const Register out = c_rarg1;
3216 const Register key = c_rarg2;
3217 const Register counter = c_rarg3;
3218 const Register saved_len = c_rarg4, len = r10;
3219 const Register saved_encrypted_ctr = c_rarg5;
3220 const Register used_ptr = c_rarg6, used = r12;
3221
3222 const Register offset = r7;
3223 const Register keylen = r11;
3224
3225 const unsigned char block_size = 16;
3226 const int bulk_width = 4;
3227 // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3228 // performance with larger data sizes, but it also means that the
3229 // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be processed on the slow path. For
3231 // that reason, and also so as not to blow away too much icache, 4
3232 // blocks seems like a sensible compromise.
3233
3234 // Algorithm:
3235 //
3236 // if (len == 0) {
3237 // goto DONE;
3238 // }
3239 // int result = len;
3240 // do {
3241 // if (used >= blockSize) {
3242 // if (len >= bulk_width * blockSize) {
3243 // CTR_large_block();
3244 // if (len == 0)
3245 // goto DONE;
3246 // }
3247 // for (;;) {
3248 // 16ByteVector v0 = counter;
3249 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3250 // used = 0;
3251 // if (len < blockSize)
3252 // break; /* goto NEXT */
3253 // 16ByteVector v1 = load16Bytes(in, offset);
3254 // v1 = v1 ^ encryptedCounter;
    //        store16Bytes(v1, out, offset);
3256 // used = blockSize;
3257 // offset += blockSize;
3258 // len -= blockSize;
3259 // if (len == 0)
3260 // goto DONE;
3261 // }
3262 // }
3263 // NEXT:
3264 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3265 // len--;
3266 // } while (len != 0);
3267 // DONE:
3268 // return result;
3269 //
3270 // CTR_large_block()
3271 // Wide bulk encryption of whole blocks.
3272
3273 __ align(CodeEntryAlignment);
3274 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
3275 StubCodeMark mark(this, stub_id);
3276 const address start = __ pc();
3277 __ enter();
3278
3279 Label DONE, CTR_large_block, large_block_return;
3280 __ ldrw(used, Address(used_ptr));
3281 __ cbzw(saved_len, DONE);
3282
3283 __ mov(len, saved_len);
3284 __ mov(offset, 0);
3285
3286 // Compute #rounds for AES based on the length of the key array
3287 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3288
3289 __ aesenc_loadkeys(key, keylen);
3290
3291 {
3292 Label L_CTR_loop, NEXT;
3293
3294 __ bind(L_CTR_loop);
3295
3296 __ cmp(used, block_size);
3297 __ br(__ LO, NEXT);
3298
3299 // Maybe we have a lot of data
3300 __ subsw(rscratch1, len, bulk_width * block_size);
3301 __ br(__ HS, CTR_large_block);
3302 __ BIND(large_block_return);
3303 __ cbzw(len, DONE);
3304
3305 // Setup the counter
3306 __ movi(v4, __ T4S, 0);
3307 __ movi(v5, __ T4S, 1);
3308 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3309
3310 // 128-bit big-endian increment
3311 __ ld1(v0, __ T16B, counter);
3312 __ rev64(v16, __ T16B, v0);
3313 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3314 __ rev64(v16, __ T16B, v16);
3315 __ st1(v16, __ T16B, counter);
3316 // Previous counter value is in v0
3317 // v4 contains { 0, 1 }
3318
3319 {
3320 // We have fewer than bulk_width blocks of data left. Encrypt
3321 // them one by one until there is less than a full block
3322 // remaining, being careful to save both the encrypted counter
3323 // and the counter.
3324
3325 Label inner_loop;
3326 __ bind(inner_loop);
3327 // Counter to encrypt is in v0
3328 __ aesecb_encrypt(noreg, noreg, keylen);
3329 __ st1(v0, __ T16B, saved_encrypted_ctr);
3330
3331 // Do we have a remaining full block?
3332
3333 __ mov(used, 0);
3334 __ cmp(len, block_size);
3335 __ br(__ LO, NEXT);
3336
3337 // Yes, we have a full block
3338 __ ldrq(v1, Address(in, offset));
3339 __ eor(v1, __ T16B, v1, v0);
3340 __ strq(v1, Address(out, offset));
3341 __ mov(used, block_size);
3342 __ add(offset, offset, block_size);
3343
3344 __ subw(len, len, block_size);
3345 __ cbzw(len, DONE);
3346
3347 // Increment the counter, store it back
3348 __ orr(v0, __ T16B, v16, v16);
3349 __ rev64(v16, __ T16B, v16);
3350 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3351 __ rev64(v16, __ T16B, v16);
3352 __ st1(v16, __ T16B, counter); // Save the incremented counter back
3353
3354 __ b(inner_loop);
3355 }
3356
3357 __ BIND(NEXT);
3358
3359 // Encrypt a single byte, and loop.
3360 // We expect this to be a rare event.
3361 __ ldrb(rscratch1, Address(in, offset));
3362 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3363 __ eor(rscratch1, rscratch1, rscratch2);
3364 __ strb(rscratch1, Address(out, offset));
3365 __ add(offset, offset, 1);
3366 __ add(used, used, 1);
      __ subw(len, len, 1);
3368 __ cbnzw(len, L_CTR_loop);
3369 }
3370
3371 __ bind(DONE);
3372 __ strw(used, Address(used_ptr));
3373 __ mov(r0, saved_len);
3374
3375 __ leave(); // required for proper stackwalking of RuntimeStub frame
3376 __ ret(lr);
3377
3378 // Bulk encryption
3379
    __ BIND(CTR_large_block);
3381 assert(bulk_width == 4 || bulk_width == 8, "must be");
3382
3383 if (bulk_width == 8) {
3384 __ sub(sp, sp, 4 * 16);
3385 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3386 }
3387 __ sub(sp, sp, 4 * 16);
3388 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3389 RegSet saved_regs = (RegSet::of(in, out, offset)
3390 + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3391 __ push(saved_regs, sp);
3392 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption
3393 __ add(in, in, offset);
3394 __ add(out, out, offset);
3395
3396 // Keys should already be loaded into the correct registers
3397
3398 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3399 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3400
3401 // AES/CTR loop
3402 {
3403 Label L_CTR_loop;
3404 __ BIND(L_CTR_loop);
3405
3406 // Setup the counters
3407 __ movi(v8, __ T4S, 0);
3408 __ movi(v9, __ T4S, 1);
3409 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3410
3411 for (int i = 0; i < bulk_width; i++) {
3412 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3413 __ rev64(v0_ofs, __ T16B, v16);
3414 be_add_128_64(v16, v16, v8, /*tmp*/v9);
3415 }
3416
3417 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3418
3419 // Encrypt the counters
3420 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3421
3422 if (bulk_width == 8) {
3423 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3424 }
3425
3426 // XOR the encrypted counters with the inputs
3427 for (int i = 0; i < bulk_width; i++) {
3428 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3429 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3430 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3431 }
3432
3433 // Write the encrypted data
3434 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3435 if (bulk_width == 8) {
3436 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3437 }
3438
3439 __ subw(len, len, 16 * bulk_width);
3440 __ cbnzw(len, L_CTR_loop);
3441 }
3442
3443 // Save the counter back where it goes
3444 __ rev64(v16, __ T16B, v16);
3445 __ st1(v16, __ T16B, counter);
3446
3447 __ pop(saved_regs, sp);
3448
3449 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3450 if (bulk_width == 8) {
3451 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3452 }
3453
3454 __ andr(rscratch1, len, -16 * bulk_width);
3455 __ sub(len, len, rscratch1);
3456 __ add(offset, offset, rscratch1);
3457 __ mov(used, 16);
3458 __ strw(used, Address(used_ptr));
3459 __ b(large_block_return);
3460
3461 return start;
3462 }
3463
3464 // Vector AES Galois Counter Mode implementation. Parameters:
3465 //
3466 // in = c_rarg0
3467 // len = c_rarg1
3468 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3469 // out = c_rarg3
3470 // key = c_rarg4
3471 // state = c_rarg5 - GHASH.state
3472 // subkeyHtbl = c_rarg6 - powers of H
3473 // counter = c_rarg7 - 16 bytes of CTR
3474 // return - number of processed bytes
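  //
  // Flow (reference sketch of the code below): the byte count is first
  // rounded down to a multiple of 8 * 16 bytes. Each 128-byte chunk is
  // encrypted in CTR mode -- eight successive counter values are encrypted
  // with AES and XORed with the input -- and GHASH is then run over the
  // resulting ciphertext (ct) using the precomputed powers of H in
  // subkeyHtbl, four blocks at a time.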
3475 address generate_galoisCounterMode_AESCrypt() {
3476 Label ghash_polynomial; // local data generated after code
3477
3478 __ align(CodeEntryAlignment);
3479 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
3480 StubCodeMark mark(this, stub_id);
3481 address start = __ pc();
3482 __ enter();
3483
3484 const Register in = c_rarg0;
3485 const Register len = c_rarg1;
3486 const Register ct = c_rarg2;
3487 const Register out = c_rarg3;
3489
3490 const Register key = c_rarg4;
3491 const Register state = c_rarg5;
3492
3493 const Register subkeyHtbl = c_rarg6;
3494
    const Register counter = c_rarg7;  // 16 bytes of CTR, updated with the incremented counter at the end
3496
3497 const Register keylen = r10;
3498 // Save state before entering routine
3499 __ sub(sp, sp, 4 * 16);
3500 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3501 __ sub(sp, sp, 4 * 16);
3502 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3503
3505 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
3506 __ str(len, __ pre(sp, -2 * wordSize));
3507
3508 Label DONE;
3509 __ cbz(len, DONE);
3510
3511 // Compute #rounds for AES based on the length of the key array
3512 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3513
3514 __ aesenc_loadkeys(key, keylen);
3515 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3516 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3517
3518 // AES/CTR loop
3519 {
3520 Label L_CTR_loop;
3521 __ BIND(L_CTR_loop);
3522
3523 // Setup the counters
3524 __ movi(v8, __ T4S, 0);
3525 __ movi(v9, __ T4S, 1);
3526 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3527
      assert(v0->encoding() < v8->encoding(), "counter registers must precede input registers");
3529 for (int i = v0->encoding(); i < v8->encoding(); i++) {
3530 FloatRegister f = as_FloatRegister(i);
3531 __ rev32(f, __ T16B, v16);
3532 __ addv(v16, __ T4S, v16, v8);
3533 }
3534
3535 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3536
3537 // Encrypt the counters
3538 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3539
3540 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3541
3542 // XOR the encrypted counters with the inputs
3543 for (int i = 0; i < 8; i++) {
3544 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3545 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3546 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3547 }
3548 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3549 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3550
3551 __ subw(len, len, 16 * 8);
3552 __ cbnzw(len, L_CTR_loop);
3553 }
3554
3555 __ rev32(v16, __ T16B, v16);
3556 __ st1(v16, __ T16B, counter);
3557
3558 __ ldr(len, Address(sp));
3559 __ lsr(len, len, exact_log2(16)); // We want the count of blocks
3560
3561 // GHASH/CTR loop
3562 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3563 len, /*unrolls*/4);
3564
3565 #ifdef ASSERT
3566 { Label L;
3567 __ cmp(len, (unsigned char)0);
3568 __ br(Assembler::EQ, L);
3569 __ stop("stubGenerator: abort");
3570 __ bind(L);
3571 }
3572 #endif
3573
3574 __ bind(DONE);
3575 // Return the number of bytes processed
3576 __ ldr(r0, __ post(sp, 2 * wordSize));
3577
3578 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3579 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3580
3581 __ leave(); // required for proper stackwalking of RuntimeStub frame
3582 __ ret(lr);
3583
3584 // bind label and generate polynomial data
3585 __ align(wordSize * 2);
3586 __ bind(ghash_polynomial);
3587 __ emit_int64(0x87); // The low-order bits of the field
3588 // polynomial (i.e. p = z^7+z^2+z+1)
3589 // repeated in the low and high parts of a
3590 // 128-bit vector
3591 __ emit_int64(0x87);
3592
3593 return start;
3594 }
3595
3596 class Cached64Bytes {
3597 private:
3598 MacroAssembler *_masm;
3599 Register _regs[8];
3600
3601 public:
3602 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
3604 auto it = rs.begin();
3605 for (auto &r: _regs) {
3606 r = *it;
3607 ++it;
3608 }
3609 }
3610
3611 void gen_loads(Register base) {
3612 for (int i = 0; i < 8; i += 2) {
3613 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3614 }
3615 }
3616
3617 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3618 void extract_u32(Register dest, int i) {
3619 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3620 }
3621 };
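
  // Illustration (assuming the register order produced by RegSet iteration):
  // gen_loads(base) fills _regs[0..7] with the 64-byte block at base, so
  // _regs[0] holds bytes 0..7, _regs[1] bytes 8..15, and so on. A call such
  // as extract_u32(dest, 5) then emits ubfx(dest, _regs[2], 32, 32), i.e. it
  // extracts the upper word of _regs[2], which is bytes 20..23 of the block.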
3622
  // Utility routines for MD5.
3624 // Clobbers r10 and r11.
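  // For reference, these helpers implement the standard MD5 step
  //   a = b + rol(a + f(b, c, d) + x[k] + t, s)
  // with the usual round functions (RFC 1321):
  //   FF: F(b, c, d) = (b & c) | (~b & d), computed here as d ^ (b & (c ^ d))
  //   GG: G(b, c, d) = (b & d) | (c & ~d); the two terms have no bits in
  //       common, so they are combined with adds rather than an orr
  //   HH: H(b, c, d) = b ^ c ^ d
  //   II: I(b, c, d) = c ^ (b | ~d)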
3625 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3626 int k, int s, int t) {
3627 Register rscratch3 = r10;
3628 Register rscratch4 = r11;
3629
3630 __ eorw(rscratch3, r3, r4);
3631 __ movw(rscratch2, t);
3632 __ andw(rscratch3, rscratch3, r2);
3633 __ addw(rscratch4, r1, rscratch2);
3634 reg_cache.extract_u32(rscratch1, k);
3635 __ eorw(rscratch3, rscratch3, r4);
3636 __ addw(rscratch4, rscratch4, rscratch1);
3637 __ addw(rscratch3, rscratch3, rscratch4);
3638 __ rorw(rscratch2, rscratch3, 32 - s);
3639 __ addw(r1, rscratch2, r2);
3640 }
3641
3642 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3643 int k, int s, int t) {
3644 Register rscratch3 = r10;
3645 Register rscratch4 = r11;
3646
3647 reg_cache.extract_u32(rscratch1, k);
3648 __ movw(rscratch2, t);
3649 __ addw(rscratch4, r1, rscratch2);
3650 __ addw(rscratch4, rscratch4, rscratch1);
3651 __ bicw(rscratch2, r3, r4);
3652 __ andw(rscratch3, r2, r4);
3653 __ addw(rscratch2, rscratch2, rscratch4);
3654 __ addw(rscratch2, rscratch2, rscratch3);
3655 __ rorw(rscratch2, rscratch2, 32 - s);
3656 __ addw(r1, rscratch2, r2);
3657 }
3658
3659 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3660 int k, int s, int t) {
3661 Register rscratch3 = r10;
3662 Register rscratch4 = r11;
3663
3664 __ eorw(rscratch3, r3, r4);
3665 __ movw(rscratch2, t);
3666 __ addw(rscratch4, r1, rscratch2);
3667 reg_cache.extract_u32(rscratch1, k);
3668 __ eorw(rscratch3, rscratch3, r2);
3669 __ addw(rscratch4, rscratch4, rscratch1);
3670 __ addw(rscratch3, rscratch3, rscratch4);
3671 __ rorw(rscratch2, rscratch3, 32 - s);
3672 __ addw(r1, rscratch2, r2);
3673 }
3674
3675 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3676 int k, int s, int t) {
3677 Register rscratch3 = r10;
3678 Register rscratch4 = r11;
3679
3680 __ movw(rscratch3, t);
3681 __ ornw(rscratch2, r2, r4);
3682 __ addw(rscratch4, r1, rscratch3);
3683 reg_cache.extract_u32(rscratch1, k);
3684 __ eorw(rscratch3, rscratch2, r3);
3685 __ addw(rscratch4, rscratch4, rscratch1);
3686 __ addw(rscratch3, rscratch3, rscratch4);
3687 __ rorw(rscratch2, rscratch3, 32 - s);
3688 __ addw(r1, rscratch2, r2);
3689 }
3690
3691 // Arguments:
3692 //
3693 // Inputs:
3694 // c_rarg0 - byte[] source+offset
  //   c_rarg1   - int[]   MD5.state
3696 // c_rarg2 - int offset
3697 // c_rarg3 - int limit
3698 //
3699 address generate_md5_implCompress(StubId stub_id) {
3700 bool multi_block;
3701 switch (stub_id) {
3702 case StubId::stubgen_md5_implCompress_id:
3703 multi_block = false;
3704 break;
3705 case StubId::stubgen_md5_implCompressMB_id:
3706 multi_block = true;
3707 break;
3708 default:
3709 ShouldNotReachHere();
3710 }
3711 __ align(CodeEntryAlignment);
3712
3713 StubCodeMark mark(this, stub_id);
3714 address start = __ pc();
3715
3716 Register buf = c_rarg0;
3717 Register state = c_rarg1;
3718 Register ofs = c_rarg2;
3719 Register limit = c_rarg3;
3720 Register a = r4;
3721 Register b = r5;
3722 Register c = r6;
3723 Register d = r7;
3724 Register rscratch3 = r10;
3725 Register rscratch4 = r11;
3726
3727 Register state_regs[2] = { r12, r13 };
3728 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3729 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
3730
3731 __ push(saved_regs, sp);
3732
3733 __ ldp(state_regs[0], state_regs[1], Address(state));
3734 __ ubfx(a, state_regs[0], 0, 32);
3735 __ ubfx(b, state_regs[0], 32, 32);
3736 __ ubfx(c, state_regs[1], 0, 32);
3737 __ ubfx(d, state_regs[1], 32, 32);
3738
3739 Label md5_loop;
3740 __ BIND(md5_loop);
3741
3742 reg_cache.gen_loads(buf);
3743
3744 // Round 1
3745 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
3746 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
3747 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
3748 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
3749 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
3750 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
3751 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
3752 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
3753 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
3754 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
3755 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3756 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3757 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
3758 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3759 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3760 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3761
3762 // Round 2
3763 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
3764 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
3765 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3766 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
3767 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
3768 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
3769 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3770 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
3771 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
3772 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
3773 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
3774 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
3775 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
3776 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
3777 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
3778 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3779
3780 // Round 3
3781 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
3782 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
3783 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3784 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3785 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
3786 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
3787 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
3788 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3789 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
3790 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
3791 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
3792 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
3793 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
3794 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3795 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3796 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
3797
3798 // Round 4
3799 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
3800 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
3801 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3802 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
3803 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
3804 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
3805 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3806 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
3807 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
3808 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3809 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
3810 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3811 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
3812 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3813 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
3814 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
3815
3816 __ addw(a, state_regs[0], a);
3817 __ ubfx(rscratch2, state_regs[0], 32, 32);
3818 __ addw(b, rscratch2, b);
3819 __ addw(c, state_regs[1], c);
3820 __ ubfx(rscratch4, state_regs[1], 32, 32);
3821 __ addw(d, rscratch4, d);
3822
3823 __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3824 __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3825
3826 if (multi_block) {
3827 __ add(buf, buf, 64);
3828 __ add(ofs, ofs, 64);
3829 __ cmp(ofs, limit);
3830 __ br(Assembler::LE, md5_loop);
3831 __ mov(c_rarg0, ofs); // return ofs
3832 }
3833
3834 // write hash values back in the correct order
3835 __ stp(state_regs[0], state_regs[1], Address(state));
3836
3837 __ pop(saved_regs, sp);
3838
3839 __ ret(lr);
3840
3841 return start;
3842 }
3843
3844 // Arguments:
3845 //
3846 // Inputs:
3847 // c_rarg0 - byte[] source+offset
3848 // c_rarg1 - int[] SHA.state
3849 // c_rarg2 - int offset
3850 // c_rarg3 - int limit
3851 //
3852 address generate_sha1_implCompress(StubId stub_id) {
3853 bool multi_block;
3854 switch (stub_id) {
3855 case StubId::stubgen_sha1_implCompress_id:
3856 multi_block = false;
3857 break;
3858 case StubId::stubgen_sha1_implCompressMB_id:
3859 multi_block = true;
3860 break;
3861 default:
3862 ShouldNotReachHere();
3863 }
3864
3865 __ align(CodeEntryAlignment);
3866
3867 StubCodeMark mark(this, stub_id);
3868 address start = __ pc();
3869
3870 Register buf = c_rarg0;
3871 Register state = c_rarg1;
3872 Register ofs = c_rarg2;
3873 Register limit = c_rarg3;
3874
3875 Label keys;
3876 Label sha1_loop;
3877
3878 // load the keys into v0..v3
3879 __ adr(rscratch1, keys);
3880 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word state into v6, v7
3882 __ ldrq(v6, Address(state, 0));
3883 __ ldrs(v7, Address(state, 16));
3884
3885
3886 __ BIND(sha1_loop);
3887 // load 64 bytes of data into v16..v19
3888 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3889 __ rev32(v16, __ T16B, v16);
3890 __ rev32(v17, __ T16B, v17);
3891 __ rev32(v18, __ T16B, v18);
3892 __ rev32(v19, __ T16B, v19);
3893
3894 // do the sha1
3895 __ addv(v4, __ T4S, v16, v0);
3896 __ orr(v20, __ T16B, v6, v6);
3897
3898 FloatRegister d0 = v16;
3899 FloatRegister d1 = v17;
3900 FloatRegister d2 = v18;
3901 FloatRegister d3 = v19;
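
    // Note: each of the 20 iterations below performs four SHA-1 rounds
    // (80 in total). Iterations 0-4 use sha1c (the Ch function), 5-9 and
    // 15-19 use sha1p (Parity), and 10-14 use sha1m (Maj), matching the
    // standard SHA-1 round schedule.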
3902
3903 for (int round = 0; round < 20; round++) {
3904 FloatRegister tmp1 = (round & 1) ? v4 : v5;
3905 FloatRegister tmp2 = (round & 1) ? v21 : v22;
3906 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3907 FloatRegister tmp4 = (round & 1) ? v5 : v4;
3908 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3909
3910 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3911 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3912 __ sha1h(tmp2, __ T4S, v20);
3913 if (round < 5)
3914 __ sha1c(v20, __ T4S, tmp3, tmp4);
3915 else if (round < 10 || round >= 15)
3916 __ sha1p(v20, __ T4S, tmp3, tmp4);
3917 else
3918 __ sha1m(v20, __ T4S, tmp3, tmp4);
3919 if (round < 16) __ sha1su1(d0, __ T4S, d3);
3920
3921 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3922 }
3923
3924 __ addv(v7, __ T2S, v7, v21);
3925 __ addv(v6, __ T4S, v6, v20);
3926
3927 if (multi_block) {
3928 __ add(ofs, ofs, 64);
3929 __ cmp(ofs, limit);
3930 __ br(Assembler::LE, sha1_loop);
3931 __ mov(c_rarg0, ofs); // return ofs
3932 }
3933
3934 __ strq(v6, Address(state, 0));
3935 __ strs(v7, Address(state, 16));
3936
3937 __ ret(lr);
3938
3939 __ bind(keys);
3940 __ emit_int32(0x5a827999);
3941 __ emit_int32(0x6ed9eba1);
3942 __ emit_int32(0x8f1bbcdc);
3943 __ emit_int32(0xca62c1d6);
3944
3945 return start;
3946 }
3947
3948
3949 // Arguments:
3950 //
3951 // Inputs:
3952 // c_rarg0 - byte[] source+offset
3953 // c_rarg1 - int[] SHA.state
3954 // c_rarg2 - int offset
3955 // c_rarg3 - int limit
3956 //
3957 address generate_sha256_implCompress(StubId stub_id) {
3958 bool multi_block;
3959 switch (stub_id) {
3960 case StubId::stubgen_sha256_implCompress_id:
3961 multi_block = false;
3962 break;
3963 case StubId::stubgen_sha256_implCompressMB_id:
3964 multi_block = true;
3965 break;
3966 default:
3967 ShouldNotReachHere();
3968 }
3969
3970 static const uint32_t round_consts[64] = {
3971 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3972 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3973 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3974 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3975 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3976 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3977 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3978 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3979 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3980 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3981 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3982 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3983 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3984 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3985 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3986 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3987 };
3988
3989 __ align(CodeEntryAlignment);
3990
3991 StubCodeMark mark(this, stub_id);
3992 address start = __ pc();
3993
3994 Register buf = c_rarg0;
3995 Register state = c_rarg1;
3996 Register ofs = c_rarg2;
3997 Register limit = c_rarg3;
3998
    Label sha256_loop;
4000
4001 __ stpd(v8, v9, __ pre(sp, -32));
4002 __ stpd(v10, v11, Address(sp, 16));
4003
4004 // dga == v0
4005 // dgb == v1
4006 // dg0 == v2
4007 // dg1 == v3
4008 // dg2 == v4
4009 // t0 == v6
4010 // t1 == v7
4011
4012 // load 16 keys to v16..v31
4013 __ lea(rscratch1, ExternalAddress((address)round_consts));
4014 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
4015 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
4016 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
4017 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
4018
4019 // load 8 words (256 bits) state
4020 __ ldpq(v0, v1, state);
4021
    __ BIND(sha256_loop);
4023 // load 64 bytes of data into v8..v11
4024 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
4025 __ rev32(v8, __ T16B, v8);
4026 __ rev32(v9, __ T16B, v9);
4027 __ rev32(v10, __ T16B, v10);
4028 __ rev32(v11, __ T16B, v11);
4029
4030 __ addv(v6, __ T4S, v8, v16);
4031 __ orr(v2, __ T16B, v0, v0);
4032 __ orr(v3, __ T16B, v1, v1);
4033
4034 FloatRegister d0 = v8;
4035 FloatRegister d1 = v9;
4036 FloatRegister d2 = v10;
4037 FloatRegister d3 = v11;
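
    // Note: each of the 16 iterations below performs four SHA-256 rounds
    // (64 in total) via sha256h/sha256h2; the message schedule is advanced
    // with sha256su0/sha256su1 during the first 12 iterations (rounds 0-47).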
4038
4039
4040 for (int round = 0; round < 16; round++) {
4041 FloatRegister tmp1 = (round & 1) ? v6 : v7;
4042 FloatRegister tmp2 = (round & 1) ? v7 : v6;
4043 FloatRegister tmp3 = (round & 1) ? v2 : v4;
4044 FloatRegister tmp4 = (round & 1) ? v4 : v2;
4045
4046 if (round < 12) __ sha256su0(d0, __ T4S, d1);
4047 __ orr(v4, __ T16B, v2, v2);
4048 if (round < 15)
4049 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
4050 __ sha256h(v2, __ T4S, v3, tmp2);
4051 __ sha256h2(v3, __ T4S, v4, tmp2);
4052 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
4053
4054 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4055 }
4056
4057 __ addv(v0, __ T4S, v0, v2);
4058 __ addv(v1, __ T4S, v1, v3);
4059
4060 if (multi_block) {
4061 __ add(ofs, ofs, 64);
4062 __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
4064 __ mov(c_rarg0, ofs); // return ofs
4065 }
4066
4067 __ ldpd(v10, v11, Address(sp, 16));
4068 __ ldpd(v8, v9, __ post(sp, 32));
4069
4070 __ stpq(v0, v1, state);
4071
4072 __ ret(lr);
4073
4074 return start;
4075 }
4076
4077 // Double rounds for sha512.
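  // Each call performs two SHA-512 rounds using sha512h/sha512h2. For
  // rounds < 36 the next pair of round constants is prefetched through
  // rscratch2; for rounds < 32 the message schedule is also advanced with
  // sha512su0/sha512su1.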
4078 void sha512_dround(int dr,
4079 FloatRegister vi0, FloatRegister vi1,
4080 FloatRegister vi2, FloatRegister vi3,
4081 FloatRegister vi4, FloatRegister vrc0,
4082 FloatRegister vrc1, FloatRegister vin0,
4083 FloatRegister vin1, FloatRegister vin2,
4084 FloatRegister vin3, FloatRegister vin4) {
4085 if (dr < 36) {
4086 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
4087 }
4088 __ addv(v5, __ T2D, vrc0, vin0);
4089 __ ext(v6, __ T16B, vi2, vi3, 8);
4090 __ ext(v5, __ T16B, v5, v5, 8);
4091 __ ext(v7, __ T16B, vi1, vi2, 8);
4092 __ addv(vi3, __ T2D, vi3, v5);
4093 if (dr < 32) {
4094 __ ext(v5, __ T16B, vin3, vin4, 8);
4095 __ sha512su0(vin0, __ T2D, vin1);
4096 }
4097 __ sha512h(vi3, __ T2D, v6, v7);
4098 if (dr < 32) {
4099 __ sha512su1(vin0, __ T2D, vin2, v5);
4100 }
4101 __ addv(vi4, __ T2D, vi1, vi3);
4102 __ sha512h2(vi3, __ T2D, vi1, vi0);
4103 }
4104
4105 // Arguments:
4106 //
4107 // Inputs:
4108 // c_rarg0 - byte[] source+offset
4109 // c_rarg1 - int[] SHA.state
4110 // c_rarg2 - int offset
4111 // c_rarg3 - int limit
4112 //
4113 address generate_sha512_implCompress(StubId stub_id) {
4114 bool multi_block;
4115 switch (stub_id) {
4116 case StubId::stubgen_sha512_implCompress_id:
4117 multi_block = false;
4118 break;
4119 case StubId::stubgen_sha512_implCompressMB_id:
4120 multi_block = true;
4121 break;
4122 default:
4123 ShouldNotReachHere();
4124 }
4125
4126 static const uint64_t round_consts[80] = {
4127 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
4128 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
4129 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
4130 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
4131 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
4132 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
4133 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
4134 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
4135 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
4136 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
4137 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
4138 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
4139 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
4140 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
4141 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
4142 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
4143 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
4144 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
4145 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
4146 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
4147 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
4148 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
4149 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
4150 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
4151 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
4152 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
4153 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
4154 };
4155
4156 __ align(CodeEntryAlignment);
4157
4158 StubCodeMark mark(this, stub_id);
4159 address start = __ pc();
4160
4161 Register buf = c_rarg0;
4162 Register state = c_rarg1;
4163 Register ofs = c_rarg2;
4164 Register limit = c_rarg3;
4165
4166 __ stpd(v8, v9, __ pre(sp, -64));
4167 __ stpd(v10, v11, Address(sp, 16));
4168 __ stpd(v12, v13, Address(sp, 32));
4169 __ stpd(v14, v15, Address(sp, 48));
4170
4171 Label sha512_loop;
4172
4173 // load state
4174 __ ld1(v8, v9, v10, v11, __ T2D, state);
4175
4176 // load first 4 round constants
4177 __ lea(rscratch1, ExternalAddress((address)round_consts));
4178 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
4179
4180 __ BIND(sha512_loop);
4181 // load 128B of data into v12..v19
4182 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
4183 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
4184 __ rev64(v12, __ T16B, v12);
4185 __ rev64(v13, __ T16B, v13);
4186 __ rev64(v14, __ T16B, v14);
4187 __ rev64(v15, __ T16B, v15);
4188 __ rev64(v16, __ T16B, v16);
4189 __ rev64(v17, __ T16B, v17);
4190 __ rev64(v18, __ T16B, v18);
4191 __ rev64(v19, __ T16B, v19);
4192
4193 __ mov(rscratch2, rscratch1);
4194
4195 __ mov(v0, __ T16B, v8);
4196 __ mov(v1, __ T16B, v9);
4197 __ mov(v2, __ T16B, v10);
4198 __ mov(v3, __ T16B, v11);
4199
4200 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4201 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4202 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4203 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4204 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4205 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4206 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4207 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4208 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4209 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4210 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4211 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4212 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4213 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4214 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4215 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4216 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4217 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4218 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4219 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4220 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4221 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4222 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4223 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4224 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4225 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4226 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4227 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4228 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4229 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4230 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4231 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4232 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
4233 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4234 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4235 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4236 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4237 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4238 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4239 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4240
4241 __ addv(v8, __ T2D, v8, v0);
4242 __ addv(v9, __ T2D, v9, v1);
4243 __ addv(v10, __ T2D, v10, v2);
4244 __ addv(v11, __ T2D, v11, v3);
4245
4246 if (multi_block) {
4247 __ add(ofs, ofs, 128);
4248 __ cmp(ofs, limit);
4249 __ br(Assembler::LE, sha512_loop);
4250 __ mov(c_rarg0, ofs); // return ofs
4251 }
4252
4253 __ st1(v8, v9, v10, v11, __ T2D, state);
4254
4255 __ ldpd(v14, v15, Address(sp, 48));
4256 __ ldpd(v12, v13, Address(sp, 32));
4257 __ ldpd(v10, v11, Address(sp, 16));
4258 __ ldpd(v8, v9, __ post(sp, 64));
4259
4260 __ ret(lr);
4261
4262 return start;
4263 }
4264
4265 // Execute one round of keccak of two computations in parallel.
4266 // One of the states should be loaded into the lower halves of
4267 // the vector registers v0-v24, the other should be loaded into
4268 // the upper halves of those registers. The ld1r instruction loads
4269 // the round constant into both halves of register v31.
4270 // Intermediate results c0...c5 and d0...d5 are computed
4271 // in registers v25...v30.
4272 // All vector instructions that are used operate on both register
4273 // halves in parallel.
  // If only a single computation is needed, it suffices to load only the lower halves.
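  //
  // For reference, one round of keccak-f[1600] as computed below
  // (illustrative summary, using the usual 5x5 lane indexing):
  //   theta: c[x] = a[x,0] ^ a[x,1] ^ ... ^ a[x,4] (eor3), then
  //          d[x] = c[x-1] ^ rol(c[x+1], 1) (rax1), and a[x,y] ^= d[x]
  //   rho+pi: each lane is rotated and moved to its new position; the d[x]
  //          xor, rotation and permutation are fused into the xar steps
  //   chi:   a[x,y] ^= ~a[x+1,y] & a[x+2,y] (the bcax instructions)
  //   iota:  a[0,0] ^= round constant (loaded with ld1r into v31)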
4275 void keccak_round(Register rscratch1) {
4276 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
    __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
    __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4279 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4280 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4281 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4282 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4283 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4284 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4285 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4286
4287 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4288 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4289 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4290 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4291 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4292
4293 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4294 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
    __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4296 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4297 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4298 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4299 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4300 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4301 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4302 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4303 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4304 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4305 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4306 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4307 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4308 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4309 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4310 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4311 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4312 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4313 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4314 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4315 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4316 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4317 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4318
    __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21' & a22)
4320 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4321 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4322 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4323 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4324
4325 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4326
4327 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4328 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4329 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4330 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4331 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
4332
4333 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
4334 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
4335 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
4336 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
4337 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
4338
4339 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
4340 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
4341 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
4342 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7)
4343 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8')
4344
4345 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
4346 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
4347 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
4348 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3)
4349 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4')
4350
4351 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
4352 }
4353
4354 // Arguments:
4355 //
4356 // Inputs:
4357 // c_rarg0 - byte[] source+offset
4358 // c_rarg1 - byte[] SHA.state
4359 // c_rarg2 - int block_size
4360 // c_rarg3 - int offset
4361 // c_rarg4 - int limit
4362 //
4363 address generate_sha3_implCompress(StubId stub_id) {
4364 bool multi_block;
4365 switch (stub_id) {
4366 case StubId::stubgen_sha3_implCompress_id:
4367 multi_block = false;
4368 break;
4369 case StubId::stubgen_sha3_implCompressMB_id:
4370 multi_block = true;
4371 break;
4372 default:
4373 ShouldNotReachHere();
4374 }
4375
4376 static const uint64_t round_consts[24] = {
4377 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4378 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4379 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4380 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4381 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4382 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4383 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4384 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4385 };
4386
4387 __ align(CodeEntryAlignment);
4388
4389 StubCodeMark mark(this, stub_id);
4390 address start = __ pc();
4391
4392 Register buf = c_rarg0;
4393 Register state = c_rarg1;
4394 Register block_size = c_rarg2;
4395 Register ofs = c_rarg3;
4396 Register limit = c_rarg4;
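
    // The SHA-3 rate (block_size, in bytes) selects the variant handled by
    // the dispatch below: 72 -> SHA3-512, 104 -> SHA3-384,
    // 136 -> SHA3-256 or SHAKE256, 144 -> SHA3-224, 168 -> SHAKE128.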
4397
4398 Label sha3_loop, rounds24_loop;
4399 Label sha3_512_or_sha3_384, shake128;
4400
4401 __ stpd(v8, v9, __ pre(sp, -64));
4402 __ stpd(v10, v11, Address(sp, 16));
4403 __ stpd(v12, v13, Address(sp, 32));
4404 __ stpd(v14, v15, Address(sp, 48));
4405
4406 // load state
4407 __ add(rscratch1, state, 32);
4408 __ ld1(v0, v1, v2, v3, __ T1D, state);
4409 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4410 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4411 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4412 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4413 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4414 __ ld1(v24, __ T1D, rscratch1);
4415
4416 __ BIND(sha3_loop);
4417
4418 // 24 keccak rounds
4419 __ movw(rscratch2, 24);
4420
4421 // load round_constants base
4422 __ lea(rscratch1, ExternalAddress((address) round_consts));
4423
4424 // load input
4425 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4426 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4427 __ eor(v0, __ T8B, v0, v25);
4428 __ eor(v1, __ T8B, v1, v26);
4429 __ eor(v2, __ T8B, v2, v27);
4430 __ eor(v3, __ T8B, v3, v28);
4431 __ eor(v4, __ T8B, v4, v29);
4432 __ eor(v5, __ T8B, v5, v30);
4433 __ eor(v6, __ T8B, v6, v31);
4434
4435 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4436 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4437
4438 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4439 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4440 __ eor(v7, __ T8B, v7, v25);
4441 __ eor(v8, __ T8B, v8, v26);
4442 __ eor(v9, __ T8B, v9, v27);
4443 __ eor(v10, __ T8B, v10, v28);
4444 __ eor(v11, __ T8B, v11, v29);
4445 __ eor(v12, __ T8B, v12, v30);
4446 __ eor(v13, __ T8B, v13, v31);
4447
4448 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4449 __ eor(v14, __ T8B, v14, v25);
4450 __ eor(v15, __ T8B, v15, v26);
4451 __ eor(v16, __ T8B, v16, v27);
4452
4453 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4454 __ andw(c_rarg5, block_size, 48);
4455 __ cbzw(c_rarg5, rounds24_loop);
4456
4457 __ tbnz(block_size, 5, shake128);
4458 // block_size == 144, bit5 == 0, SHA3-224
4459 __ ldrd(v28, __ post(buf, 8));
4460 __ eor(v17, __ T8B, v17, v28);
4461 __ b(rounds24_loop);
4462
4463 __ BIND(shake128);
4464 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4465 __ eor(v17, __ T8B, v17, v28);
4466 __ eor(v18, __ T8B, v18, v29);
4467 __ eor(v19, __ T8B, v19, v30);
4468 __ eor(v20, __ T8B, v20, v31);
4469 __ b(rounds24_loop); // block_size == 168, SHAKE128
4470
4471 __ BIND(sha3_512_or_sha3_384);
4472 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4473 __ eor(v7, __ T8B, v7, v25);
4474 __ eor(v8, __ T8B, v8, v26);
4475 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4476
4477 // SHA3-384
4478 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4479 __ eor(v9, __ T8B, v9, v27);
4480 __ eor(v10, __ T8B, v10, v28);
4481 __ eor(v11, __ T8B, v11, v29);
4482 __ eor(v12, __ T8B, v12, v30);
4483
4484 __ BIND(rounds24_loop);
4485 __ subw(rscratch2, rscratch2, 1);
4486
4487 keccak_round(rscratch1);
4488
4489 __ cbnzw(rscratch2, rounds24_loop);
4490
4491 if (multi_block) {
4492 __ add(ofs, ofs, block_size);
4493 __ cmp(ofs, limit);
4494 __ br(Assembler::LE, sha3_loop);
4495 __ mov(c_rarg0, ofs); // return ofs
4496 }
4497
4498 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
4499 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4500 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4501 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4502 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4503 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4504 __ st1(v24, __ T1D, state);
4505
4506 // restore callee-saved registers
4507 __ ldpd(v14, v15, Address(sp, 48));
4508 __ ldpd(v12, v13, Address(sp, 32));
4509 __ ldpd(v10, v11, Address(sp, 16));
4510 __ ldpd(v8, v9, __ post(sp, 64));
4511
4512 __ ret(lr);
4513
4514 return start;
4515 }
4516
4517 // Inputs:
4518 // c_rarg0 - long[] state0
4519 // c_rarg1 - long[] state1
4520 address generate_double_keccak() {
4521 static const uint64_t round_consts[24] = {
4522 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4523 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4524 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4525 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4526 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4527 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4528 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4529 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4530 };
4531
4532 // Implements the double_keccak() method of the
  // sun.security.provider.SHA3Parallel class
4534 __ align(CodeEntryAlignment);
4535 StubCodeMark mark(this, "StubRoutines", "double_keccak");
4536 address start = __ pc();
4537 __ enter();
4538
4539 Register state0 = c_rarg0;
4540 Register state1 = c_rarg1;
4541
4542 Label rounds24_loop;
4543
4544 // save callee-saved registers
4545 __ stpd(v8, v9, __ pre(sp, -64));
4546 __ stpd(v10, v11, Address(sp, 16));
4547 __ stpd(v12, v13, Address(sp, 32));
4548 __ stpd(v14, v15, Address(sp, 48));
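
    // The two inputs are interleaved lane-wise: state0 is loaded into the
    // lower (index 0) doublewords of v0..v24 and state1 into the upper
    // (index 1) doublewords, so each keccak_round() invocation advances both
    // computations in parallel.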
4549
4550 // load states
4551 __ add(rscratch1, state0, 32);
4552 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
4553 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
4554 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
4555 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
4556 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
4557 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
4558 __ ld1(v24, __ D, 0, rscratch1);
4559 __ add(rscratch1, state1, 32);
4560 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
4561 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
4562 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
4563 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
4564 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
4565 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
4566 __ ld1(v24, __ D, 1, rscratch1);
4567
4568 // 24 keccak rounds
4569 __ movw(rscratch2, 24);
4570
4571 // load round_constants base
4572 __ lea(rscratch1, ExternalAddress((address) round_consts));
4573
4574 __ BIND(rounds24_loop);
4575 __ subw(rscratch2, rscratch2, 1);
4576 keccak_round(rscratch1);
4577 __ cbnzw(rscratch2, rounds24_loop);
4578
4579 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
4580 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
4581 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
4582 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
4583 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
4584 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
4585 __ st1(v24, __ D, 0, state0);
4586 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
4587 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
4588 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
4589 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
4590 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
4591 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
4592 __ st1(v24, __ D, 1, state1);
4593
4594 // restore callee-saved vector registers
4595 __ ldpd(v14, v15, Address(sp, 48));
4596 __ ldpd(v12, v13, Address(sp, 32));
4597 __ ldpd(v10, v11, Address(sp, 16));
4598 __ ldpd(v8, v9, __ post(sp, 64));
4599
4600 __ leave(); // required for proper stackwalking of RuntimeStub frame
4601 __ mov(r0, zr); // return 0
4602 __ ret(lr);
4603
4604 return start;
4605 }
4606
4607 // ChaCha20 block function. This version parallelizes the 32-bit
4608 // state elements on each of 16 vectors, producing 4 blocks of
4609 // keystream at a time.
4610 //
4611 // state (int[16]) = c_rarg0
4612 // keystream (byte[256]) = c_rarg1
4613 // return - number of bytes of produced keystream (always 256)
4614 //
4615 // This implementation takes each 32-bit integer from the state
4616 // array and broadcasts it across all 4 32-bit lanes of a vector register
4617 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
4618 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
4619 // the quarter round schedule is implemented as outlined in RFC 7539 section
4620 // 2.3. However, instead of sequentially processing the 3 quarter round
4621 // operations represented by one QUARTERROUND function, we instead stack all
4622 // the adds, xors and left-rotations from the first 4 quarter rounds together
4623 // and then do the same for the second set of 4 quarter rounds. This removes
4624 // some latency that would otherwise be incurred by waiting for an add to
4625 // complete before performing an xor (which depends on the result of the
4626 // add), etc. An adjustment happens between the first and second groups of 4
4627 // quarter rounds, but this is done only in the inputs to the macro functions
4628 // that generate the assembly instructions - these adjustments themselves are
4629 // not part of the resulting assembly.
4630 // The 4 registers v0-v3 are used during the quarter round operations as
4631 // scratch registers. Once the 20 rounds are complete, these 4 scratch
4632 // registers become the vectors involved in adding the start state back onto
4633 // the post-QR working state. After the adds are complete, each of the 16
4634 // vectors write their first lane back to the keystream buffer, followed
4635 // by the second lane from all vectors and so on.
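  //
  // For reference, one quarter round QR(a, b, c, d) as defined in RFC 7539:
  //   a += b;  d ^= a;  d <<<= 16;
  //   c += d;  b ^= c;  b <<<= 12;
  //   a += b;  d ^= a;  d <<<= 8;
  //   c += d;  b ^= c;  b <<<= 7;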
4636 address generate_chacha20Block_blockpar() {
4637 Label L_twoRounds, L_cc20_const;
4638 __ align(CodeEntryAlignment);
4639 StubId stub_id = StubId::stubgen_chacha20Block_id;
4640 StubCodeMark mark(this, stub_id);
4641 address start = __ pc();
4642 __ enter();
4643
4644 int i, j;
4645 const Register state = c_rarg0;
4646 const Register keystream = c_rarg1;
4647 const Register loopCtr = r10;
4648 const Register tmpAddr = r11;
4649 const FloatRegister ctrAddOverlay = v28;
4650 const FloatRegister lrot8Tbl = v29;
4651
4652 // Organize SIMD registers in an array that facilitates
4653 // putting repetitive opcodes into loop structures. It is
4654 // important that each grouping of 4 registers is monotonically
4655 // increasing to support the requirements of multi-register
4656 // instructions (e.g. ld4r, st4, etc.)
4657 const FloatRegister workSt[16] = {
4658 v4, v5, v6, v7, v16, v17, v18, v19,
4659 v20, v21, v22, v23, v24, v25, v26, v27
4660 };
4661
4662 // Pull in constant data. The first 16 bytes are the add overlay
4663 // which is applied to the vector holding the counter (state[12]).
4664 // The second 16 bytes is the index register for the 8-bit left
4665 // rotation tbl instruction.
4666 __ adr(tmpAddr, L_cc20_const);
4667 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
4668
4669 // Load from memory and interlace across 16 SIMD registers,
4670 // With each word from memory being broadcast to all lanes of
4671 // each successive SIMD register.
4672 // Addr(0) -> All lanes in workSt[i]
    // Addr(4) -> All lanes in workSt[i + 1], etc.
4674 __ mov(tmpAddr, state);
4675 for (i = 0; i < 16; i += 4) {
4676 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4677 __ post(tmpAddr, 16));
4678 }
4679 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4680
4681 // Before entering the loop, create 5 4-register arrays. These
4682 // will hold the 4 registers that represent the a/b/c/d fields
4683 // in the quarter round operation. For instance the "b" field
4684 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
4685 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
4686 // since it is part of a diagonal organization. The aSet and scratch
4687 // register sets are defined at declaration time because they do not change
4688 // organization at any point during the 20-round processing.
4689 FloatRegister aSet[4] = { v4, v5, v6, v7 };
4690 FloatRegister bSet[4];
4691 FloatRegister cSet[4];
4692 FloatRegister dSet[4];
4693 FloatRegister scratch[4] = { v0, v1, v2, v3 };
4694
4695 // Set up the 10 iteration loop and perform all 8 quarter round ops
4696 __ mov(loopCtr, 10);
4697 __ BIND(L_twoRounds);
4698
4699 // Set to columnar organization and do the following 4 quarter-rounds:
4700 // QUARTERROUND(0, 4, 8, 12)
4701 // QUARTERROUND(1, 5, 9, 13)
4702 // QUARTERROUND(2, 6, 10, 14)
4703 // QUARTERROUND(3, 7, 11, 15)
4704 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
4705 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
4706 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
4707
4708 __ cc20_qr_add4(aSet, bSet); // a += b
4709 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4710 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4711
4712 __ cc20_qr_add4(cSet, dSet); // c += d
4713 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4714 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4715
4716 __ cc20_qr_add4(aSet, bSet); // a += b
4717 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4718 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4719
4720 __ cc20_qr_add4(cSet, dSet); // c += d
4721 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
4723
4724 // Set to diagonal organization and do the next 4 quarter-rounds:
4725 // QUARTERROUND(0, 5, 10, 15)
4726 // QUARTERROUND(1, 6, 11, 12)
4727 // QUARTERROUND(2, 7, 8, 13)
4728 // QUARTERROUND(3, 4, 9, 14)
4729 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
4730 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
4731 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
4732
4733 __ cc20_qr_add4(aSet, bSet); // a += b
4734 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4735 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4736
4737 __ cc20_qr_add4(cSet, dSet); // c += d
4738 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4739 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4740
4741 __ cc20_qr_add4(aSet, bSet); // a += b
4742 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4743 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4744
4745 __ cc20_qr_add4(cSet, dSet); // c += d
4746 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7
4748
4749 // Decrement and iterate
4750 __ sub(loopCtr, loopCtr, 1);
4751 __ cbnz(loopCtr, L_twoRounds);
4752
4753 __ mov(tmpAddr, state);
4754
4755 // Add the starting state back to the post-loop keystream
4756 // state. We read/interlace the state array from memory into
4757 // 4 registers similar to what we did in the beginning. Then
4758 // add the counter overlay onto workSt[12] at the end.
4759 for (i = 0; i < 16; i += 4) {
4760 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
4761 __ addv(workSt[i], __ T4S, workSt[i], v0);
4762 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
4763 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
4764 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
4765 }
4766 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4767
4768 // Write working state into the keystream buffer. This is accomplished
4769 // by taking the lane "i" from each of the four vectors and writing
4770 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
4771 // repeating with the next 4 vectors until all 16 vectors have been used.
4772 // Then move to the next lane and repeat the process until all lanes have
4773 // been written.
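    // The net effect is that lane i of the 16 vectors forms the 64-byte
    // keystream block for counter value state[12] + i, written at offset
    // 64 * i of the keystream buffer.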
4774 for (i = 0; i < 4; i++) {
4775 for (j = 0; j < 16; j += 4) {
4776 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4777 __ post(keystream, 16));
4778 }
4779 }
4780
4781 __ mov(r0, 256); // Return length of output keystream
4782 __ leave();
4783 __ ret(lr);
4784
4785 // bind label and generate local constant data used by this stub
4786 // The constant data is broken into two 128-bit segments to be loaded
4787 // onto FloatRegisters. The first 128 bits are a counter add overlay
4788 // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
    // The second 128 bits are a table constant used for 8-bit left rotations.
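    // For each 32-bit lane the tbl byte permutation selects bytes
    // {3, 0, 1, 2} of the little-endian word, i.e. it computes
    // (x << 8) | (x >> 24), an 8-bit left rotation.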
4790 __ BIND(L_cc20_const);
4791 __ emit_int64(0x0000000100000000UL);
4792 __ emit_int64(0x0000000300000002UL);
4793 __ emit_int64(0x0605040702010003UL);
4794 __ emit_int64(0x0E0D0C0F0A09080BUL);
4795
4796 return start;
4797 }
4798
4799 // Helpers to schedule parallel operation bundles across vector
4800 // register sequences of size 2, 4 or 8.
4801
4802 // Implement various primitive computations across vector sequences
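  // Each of the following vs_<op> arithmetic helpers unrolls into N
  // independent SIMD instructions, one per register of the sequence, so
  // that the N operations can issue and execute in parallel.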
4803
4804 template<int N>
4805 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4806 const VSeq<N>& v1, const VSeq<N>& v2) {
4807 // output must not be constant
4808 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4809 // output cannot overwrite pending inputs
4810 assert(!vs_write_before_read(v, v1), "output overwrites input");
4811 assert(!vs_write_before_read(v, v2), "output overwrites input");
4812 for (int i = 0; i < N; i++) {
4813 __ addv(v[i], T, v1[i], v2[i]);
4814 }
4815 }
4816
4817 template<int N>
4818 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4819 const VSeq<N>& v1, const VSeq<N>& v2) {
4820 // output must not be constant
4821 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4822 // output cannot overwrite pending inputs
4823 assert(!vs_write_before_read(v, v1), "output overwrites input");
4824 assert(!vs_write_before_read(v, v2), "output overwrites input");
4825 for (int i = 0; i < N; i++) {
4826 __ subv(v[i], T, v1[i], v2[i]);
4827 }
4828 }
4829
4830 template<int N>
4831 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4832 const VSeq<N>& v1, const VSeq<N>& v2) {
4833 // output must not be constant
4834 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4835 // output cannot overwrite pending inputs
4836 assert(!vs_write_before_read(v, v1), "output overwrites input");
4837 assert(!vs_write_before_read(v, v2), "output overwrites input");
4838 for (int i = 0; i < N; i++) {
4839 __ mulv(v[i], T, v1[i], v2[i]);
4840 }
4841 }
4842
4843 template<int N>
4844 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
4845 // output must not be constant
4846 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4847 // output cannot overwrite pending inputs
4848 assert(!vs_write_before_read(v, v1), "output overwrites input");
4849 for (int i = 0; i < N; i++) {
4850 __ negr(v[i], T, v1[i]);
4851 }
4852 }
4853
4854 template<int N>
4855 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4856 const VSeq<N>& v1, int shift) {
4857 // output must not be constant
4858 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4859 // output cannot overwrite pending inputs
4860 assert(!vs_write_before_read(v, v1), "output overwrites input");
4861 for (int i = 0; i < N; i++) {
4862 __ sshr(v[i], T, v1[i], shift);
4863 }
4864 }
4865
4866 template<int N>
4867 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4868 // output must not be constant
4869 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4870 // output cannot overwrite pending inputs
4871 assert(!vs_write_before_read(v, v1), "output overwrites input");
4872 assert(!vs_write_before_read(v, v2), "output overwrites input");
4873 for (int i = 0; i < N; i++) {
4874 __ andr(v[i], __ T16B, v1[i], v2[i]);
4875 }
4876 }
4877
4878 template<int N>
4879 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4880 // output must not be constant
4881 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4882 // output cannot overwrite pending inputs
4883 assert(!vs_write_before_read(v, v1), "output overwrites input");
4884 assert(!vs_write_before_read(v, v2), "output overwrites input");
4885 for (int i = 0; i < N; i++) {
4886 __ orr(v[i], __ T16B, v1[i], v2[i]);
4887 }
4888 }
4889
4890 template<int N>
4891 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
4892 // output must not be constant
4893 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4894 // output cannot overwrite pending inputs
4895 assert(!vs_write_before_read(v, v1), "output overwrites input");
4896 for (int i = 0; i < N; i++) {
4897 __ notr(v[i], __ T16B, v1[i]);
4898 }
4899 }
4900
4901 template<int N>
4902 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
4903 // output must not be constant
4904 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4905 // output cannot overwrite pending inputs
4906 assert(!vs_write_before_read(v, v1), "output overwrites input");
4907 assert(!vs_write_before_read(v, v2), "output overwrites input");
4908 for (int i = 0; i < N; i++) {
4909 __ sqdmulh(v[i], T, v1[i], v2[i]);
4910 }
4911 }
4912
4913 template<int N>
  void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
4915 // output must not be constant
4916 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4917 // output cannot overwrite pending inputs
4918 assert(!vs_write_before_read(v, v1), "output overwrites input");
4919 assert(!vs_write_before_read(v, v2), "output overwrites input");
4920 for (int i = 0; i < N; i++) {
4921 __ mlsv(v[i], T, v1[i], v2[i]);
4922 }
4923 }
4924
4925 // load N/2 successive pairs of quadword values from memory in order
4926 // into N successive vector registers of the sequence via the
4927 // address supplied in base.
4928 template<int N>
4929 void vs_ldpq(const VSeq<N>& v, Register base) {
4930 for (int i = 0; i < N; i += 2) {
4931 __ ldpq(v[i], v[i+1], Address(base, 32 * i));
4932 }
4933 }
4934
4935 // load N/2 successive pairs of quadword values from memory in order
4936 // into N vector registers of the sequence via the address supplied
4937 // in base using post-increment addressing
4938 template<int N>
4939 void vs_ldpq_post(const VSeq<N>& v, Register base) {
    static_assert(N % 2 == 0, "sequence length must be even");
4941 for (int i = 0; i < N; i += 2) {
4942 __ ldpq(v[i], v[i+1], __ post(base, 32));
4943 }
4944 }
4945
4946 // store N successive vector registers of the sequence into N/2
4947 // successive pairs of quadword memory locations via the address
4948 // supplied in base using post-increment addressing
4949 template<int N>
4950 void vs_stpq_post(const VSeq<N>& v, Register base) {
    static_assert(N % 2 == 0, "sequence length must be even");
4952 for (int i = 0; i < N; i += 2) {
4953 __ stpq(v[i], v[i+1], __ post(base, 32));
4954 }
4955 }
4956
4957 // load N/2 pairs of quadword values from memory de-interleaved into
4958 // N vector registers 2 at a time via the address supplied in base
4959 // using post-increment addressing.
4960 template<int N>
4961 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert(N % 2 == 0, "sequence length must be even");
4963 for (int i = 0; i < N; i += 2) {
4964 __ ld2(v[i], v[i+1], T, __ post(base, 32));
4965 }
4966 }
4967
4968 // store N vector registers interleaved into N/2 pairs of quadword
4969 // memory locations via the address supplied in base using
4970 // post-increment addressing.
4971 template<int N>
4972 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
    static_assert(N % 2 == 0, "sequence length must be even");
4974 for (int i = 0; i < N; i += 2) {
4975 __ st2(v[i], v[i+1], T, __ post(base, 32));
4976 }
4977 }
4978
4979 // load N quadword values from memory de-interleaved into N vector
4980 // registers 3 elements at a time via the address supplied in base.
4981 template<int N>
4982 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4983 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4984 for (int i = 0; i < N; i += 3) {
4985 __ ld3(v[i], v[i+1], v[i+2], T, base);
4986 }
4987 }
4988
4989 // load N quadword values from memory de-interleaved into N vector
4990 // registers 3 elements at a time via the address supplied in base
4991 // using post-increment addressing.
4992 template<int N>
4993 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4994 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4995 for (int i = 0; i < N; i += 3) {
4996 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
4997 }
4998 }
4999
5000 // load N/2 pairs of quadword values from memory into N vector
5001 // registers via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
5003 // offsets array
5004 template<int N>
5005 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
5006 for (int i = 0; i < N/2; i++) {
5007 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5008 }
5009 }
5010
5011 // store N vector registers into N/2 pairs of quadword memory
5012 // locations via the address supplied in base with each pair indexed
  // using the start offset plus the corresponding entry in the
5014 // offsets array
5015 template<int N>
  void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
5017 for (int i = 0; i < N/2; i++) {
5018 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5019 }
5020 }
5021
5022 // load N single quadword values from memory into N vector registers
5023 // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
5025 // array
5026 template<int N>
5027 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5028 int start, int (&offsets)[N]) {
5029 for (int i = 0; i < N; i++) {
5030 __ ldr(v[i], T, Address(base, start + offsets[i]));
5031 }
5032 }
5033
5034 // store N vector registers into N single quadword memory locations
5035 // via the address supplied in base with each value indexed using
  // the start offset plus the corresponding entry in the offsets
5037 // array
5038 template<int N>
5039 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5040 int start, int (&offsets)[N]) {
5041 for (int i = 0; i < N; i++) {
5042 __ str(v[i], T, Address(base, start + offsets[i]));
5043 }
5044 }
5045
5046 // load N/2 pairs of quadword values from memory de-interleaved into
5047 // N vector registers 2 at a time via the address supplied in base
  // with each pair indexed using the start offset plus the
5049 // corresponding entry in the offsets array
5050 template<int N>
5051 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5052 Register tmp, int start, int (&offsets)[N/2]) {
5053 for (int i = 0; i < N/2; i++) {
5054 __ add(tmp, base, start + offsets[i]);
5055 __ ld2(v[2*i], v[2*i+1], T, tmp);
5056 }
5057 }
5058
5059 // store N vector registers 2 at a time interleaved into N/2 pairs
5060 // of quadword memory locations via the address supplied in base
  // with each pair indexed using the start offset plus the
5062 // corresponding entry in the offsets array
5063 template<int N>
5064 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5065 Register tmp, int start, int (&offsets)[N/2]) {
5066 for (int i = 0; i < N/2; i++) {
5067 __ add(tmp, base, start + offsets[i]);
5068 __ st2(v[2*i], v[2*i+1], T, tmp);
5069 }
5070 }
5071
5072 // Helper routines for various flavours of Montgomery multiply
5073
5074 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
5075 // multiplications in parallel
5076 //
5077
5078 // See the montMul() method of the sun.security.provider.ML_DSA
5079 // class.
5080 //
  // Computes 4x4S results or 4x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
5083 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5084 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5085 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5086 // Outputs: va - 4x4S or 4x8H vector register sequences
5087 // vb, vc, vtmp and vq must all be disjoint
5088 // va must be disjoint from all other inputs/temps or must equal vc
5089 // va must have a non-zero delta i.e. it must not be a constant vseq.
5090 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
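  // As a rough scalar model (not generated code), each 16-bit lane of the
  // T8H case computes the following, ignoring sqdmulh saturation, where q
  // and qinv denote the two constants held in vq:
  //   int32_t prod  = (int32_t)b * c;
  //   int16_t aHigh = (int16_t)(prod >> 15);             // sqdmulh: hi half of 2*b*c
  //   int16_t aLow  = (int16_t)prod;                     // mulv: lo half of b*c
  //   int16_t m     = (int16_t)(aLow * qinv);            // lo half of aLow * qinv
  //   int16_t n     = (int16_t)(((int32_t)m * q) >> 15); // sqdmulh: hi half of 2*m*q
  //   int16_t a     = (int16_t)((aHigh - n) >> 1);       // shsubv: (aHigh - n) / 2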
5091 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5092 Assembler::SIMD_Arrangement T,
5093 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5094 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5095 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5096 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5097 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5098
5099 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5100 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5101
5102 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5103
5104 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5105 assert(vs_disjoint(va, vb), "va and vb overlap");
5106 assert(vs_disjoint(va, vq), "va and vq overlap");
5107 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5108 assert(!va.is_constant(), "output vector must identify 4 different registers");
5109
5110 // schedule 4 streams of instructions across the vector sequences
5111 for (int i = 0; i < 4; i++) {
5112 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5113 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5114 }
5115
5116 for (int i = 0; i < 4; i++) {
5117 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5118 }
5119
5120 for (int i = 0; i < 4; i++) {
5121 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5122 }
5123
5124 for (int i = 0; i < 4; i++) {
5125 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5126 }
5127 }
5128
  // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
5130 // multiplications in parallel
5131 //
5132
5133 // See the montMul() method of the sun.security.provider.ML_DSA
5134 // class.
5135 //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^-MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
5142 // vb, vc, vtmp and vq must all be disjoint
5143 // va must be disjoint from all other inputs/temps or must equal vc
5144 // va must have a non-zero delta i.e. it must not be a constant vseq.
5145 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5146 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5147 Assembler::SIMD_Arrangement T,
5148 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5149 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5150 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5151 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5152 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5153
5154 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5155 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5156
5157 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5158
5159 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5160 assert(vs_disjoint(va, vb), "va and vb overlap");
5161 assert(vs_disjoint(va, vq), "va and vq overlap");
5162 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5163 assert(!va.is_constant(), "output vector must identify 2 different registers");
5164
5165 // schedule 2 streams of instructions across the vector sequences
5166 for (int i = 0; i < 2; i++) {
5167 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5168 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5169 }
5170
5171 for (int i = 0; i < 2; i++) {
5172 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5173 }
5174
5175 for (int i = 0; i < 2; i++) {
5176 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5177 }
5178
5179 for (int i = 0; i < 2; i++) {
5180 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5181 }
5182 }
5183
5184 // Perform 16 16-bit Montgomery multiplications in parallel.
5185 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5186 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5187 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5188 // It will assert that the register use is valid
5189 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5190 }
5191
5192 // Perform 32 16-bit Montgomery multiplications in parallel.
5193 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5194 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5195 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5196 // It will assert that the register use is valid
5197 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5198 }
5199
5200 // Perform 64 16-bit Montgomery multiplications in parallel.
5201 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
5202 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5203 // Schedule two successive 4x8H multiplies via the montmul helper
5204 // on the front and back halves of va, vb and vc. The helper will
5205 // assert that the register use has no overlap conflicts on each
5206 // individual call but we also need to ensure that the necessary
5207 // disjoint/equality constraints are met across both calls.
5208
5209 // vb, vc, vtmp and vq must be disjoint. va must either be
5210 // disjoint from all other registers or equal vc
5211
5212 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5213 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5214 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5215
5216 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5217 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5218
5219 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5220
5221 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5222 assert(vs_disjoint(va, vb), "va and vb overlap");
5223 assert(vs_disjoint(va, vq), "va and vq overlap");
5224 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5225
5226 // we multiply the front and back halves of each sequence 4 at a
5227 // time because
5228 //
5229 // 1) we are currently only able to get 4-way instruction
5230 // parallelism at best
5231 //
5232 // 2) we need registers for the constants in vq and temporary
5233 // scratch registers to hold intermediate results so vtmp can only
5234 // be a VSeq<4> which means we only have 4 scratch slots
5235
5236 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5237 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5238 }
5239
5240 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5241 const VSeq<4>& vc,
5242 const VSeq<4>& vtmp,
5243 const VSeq<2>& vq) {
5244 // compute a = montmul(a1, c)
5245 kyber_montmul32(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
5247 vs_subv(va1, __ T8H, va0, vc);
5248 // and a0 = a0 + a
5249 vs_addv(va0, __ T8H, va0, vc);
5250 }
5251
5252 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5253 const VSeq<4>& vb,
5254 const VSeq<4>& vtmp1,
5255 const VSeq<4>& vtmp2,
5256 const VSeq<2>& vq) {
5257 // compute c = a0 - a1
5258 vs_subv(vtmp1, __ T8H, va0, va1);
5259 // output a0 = a0 + a1
5260 vs_addv(va0, __ T8H, va0, va1);
5261 // output a1 = b montmul c
5262 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5263 }
5264
5265 void load64shorts(const VSeq<8>& v, Register shorts) {
5266 vs_ldpq_post(v, shorts);
5267 }
5268
5269 void load32shorts(const VSeq<4>& v, Register shorts) {
5270 vs_ldpq_post(v, shorts);
5271 }
5272
5273 void store64shorts(VSeq<8> v, Register tmpAddr) {
5274 vs_stpq_post(v, tmpAddr);
5275 }
5276
5277 // Kyber NTT function.
5278 // Implements
5279 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5280 //
5281 // coeffs (short[256]) = c_rarg0
5282 // ntt_zetas (short[256]) = c_rarg1
5283 address generate_kyberNtt() {
5284
5285 __ align(CodeEntryAlignment);
5286 StubId stub_id = StubId::stubgen_kyberNtt_id;
5287 StubCodeMark mark(this, stub_id);
5288 address start = __ pc();
5289 __ enter();
5290
5291 const Register coeffs = c_rarg0;
5292 const Register zetas = c_rarg1;
5293
5294 const Register kyberConsts = r10;
5295 const Register tmpAddr = r11;
5296
5297 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5298 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5299 VSeq<2> vq(30); // n.b. constants overlap vs3
5300
5301 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5302 // load the montmul constants
5303 vs_ldpq(vq, kyberConsts);
5304
5305 // Each level corresponds to an iteration of the outermost loop of the
5306 // Java method seilerNTT(int[] coeffs). There are some differences
5307 // from what is done in the seilerNTT() method, though:
    // 1. The computation uses 16-bit signed values; we do not convert them
    // to ints here.
    // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
    // this array for each level, which makes it easier to fill up the vector
    // registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    // multiplications (that way there should not be any overflow during the
    // inverse NTT computation), whereas here we use R = 2^16 so that we can
    // use 16-bit arithmetic in the vector unit.
5317 //
    // On each level, we fill up the vector registers in such a way that the
    // array elements that need to be multiplied by the zetas go into one
    // set of vector registers while the corresponding ones that don't need
    // to be multiplied go into another set.
    // We can do 32 Montgomery multiplications in parallel, using 12 vector
    // registers and interleaving the steps of 4 identical computations,
    // each done on 8 16-bit values per register.

    // At levels 0-3 the coefficients that get multiplied by zetas, and the
    // corresponding ones they are added to or subtracted from, occur in
    // discrete blocks whose size is some multiple of 32 bytes.
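    // Each level applies a Cooley-Tukey butterfly to pairs of coefficient
    // blocks (a, b) and a matching block of zetas:
    //   t  = montmul(b, zeta)
    //   b' = a - t
    //   a' = a + t
    // which is what the montmul64 / subv / addv triples below compute.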
5329
5330 // level 0
5331 __ add(tmpAddr, coeffs, 256);
5332 load64shorts(vs1, tmpAddr);
5333 load64shorts(vs2, zetas);
5334 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5335 __ add(tmpAddr, coeffs, 0);
5336 load64shorts(vs1, tmpAddr);
5337 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5338 vs_addv(vs1, __ T8H, vs1, vs2);
5339 __ add(tmpAddr, coeffs, 0);
5340 vs_stpq_post(vs1, tmpAddr);
5341 __ add(tmpAddr, coeffs, 256);
5342 vs_stpq_post(vs3, tmpAddr);
5343 // restore montmul constants
5344 vs_ldpq(vq, kyberConsts);
5345 load64shorts(vs1, tmpAddr);
5346 load64shorts(vs2, zetas);
5347 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5348 __ add(tmpAddr, coeffs, 128);
5349 load64shorts(vs1, tmpAddr);
5350 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5351 vs_addv(vs1, __ T8H, vs1, vs2);
5352 __ add(tmpAddr, coeffs, 128);
5353 store64shorts(vs1, tmpAddr);
5354 __ add(tmpAddr, coeffs, 384);
5355 store64shorts(vs3, tmpAddr);
5356
5357 // level 1
5358 // restore montmul constants
5359 vs_ldpq(vq, kyberConsts);
5360 __ add(tmpAddr, coeffs, 128);
5361 load64shorts(vs1, tmpAddr);
5362 load64shorts(vs2, zetas);
5363 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5364 __ add(tmpAddr, coeffs, 0);
5365 load64shorts(vs1, tmpAddr);
5366 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5367 vs_addv(vs1, __ T8H, vs1, vs2);
5368 __ add(tmpAddr, coeffs, 0);
5369 store64shorts(vs1, tmpAddr);
5370 store64shorts(vs3, tmpAddr);
5371 vs_ldpq(vq, kyberConsts);
5372 __ add(tmpAddr, coeffs, 384);
5373 load64shorts(vs1, tmpAddr);
5374 load64shorts(vs2, zetas);
5375 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5376 __ add(tmpAddr, coeffs, 256);
5377 load64shorts(vs1, tmpAddr);
5378 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5379 vs_addv(vs1, __ T8H, vs1, vs2);
5380 __ add(tmpAddr, coeffs, 256);
5381 store64shorts(vs1, tmpAddr);
5382 store64shorts(vs3, tmpAddr);
5383
5384 // level 2
5385 vs_ldpq(vq, kyberConsts);
5386 int offsets1[4] = { 0, 32, 128, 160 };
5387 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5388 load64shorts(vs2, zetas);
5389 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5390 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5392 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5393 vs_addv(vs1, __ T8H, vs1, vs2);
5394 __ add(tmpAddr, coeffs, 0);
5395 vs_stpq_post(vs_front(vs1), tmpAddr);
5396 vs_stpq_post(vs_front(vs3), tmpAddr);
5397 vs_stpq_post(vs_back(vs1), tmpAddr);
5398 vs_stpq_post(vs_back(vs3), tmpAddr);
5399 vs_ldpq(vq, kyberConsts);
5400 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5401 load64shorts(vs2, zetas);
5402 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5403 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5405 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5406 vs_addv(vs1, __ T8H, vs1, vs2);
5407 __ add(tmpAddr, coeffs, 256);
5408 vs_stpq_post(vs_front(vs1), tmpAddr);
5409 vs_stpq_post(vs_front(vs3), tmpAddr);
5410 vs_stpq_post(vs_back(vs1), tmpAddr);
5411 vs_stpq_post(vs_back(vs3), tmpAddr);
5412
5413 // level 3
5414 vs_ldpq(vq, kyberConsts);
5415 int offsets2[4] = { 0, 64, 128, 192 };
5416 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5417 load64shorts(vs2, zetas);
5418 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5419 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5420 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5421 vs_addv(vs1, __ T8H, vs1, vs2);
5422 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5423 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5424
5425 vs_ldpq(vq, kyberConsts);
5426 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5427 load64shorts(vs2, zetas);
5428 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5429 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5430 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5431 vs_addv(vs1, __ T8H, vs1, vs2);
5432 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5433 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5434
5435 // level 4
    // At level 4 coefficients occur in 8 discrete blocks of 16 bytes,
    // so they are loaded using an ldr at 8 distinct offsets.
5438
5439 vs_ldpq(vq, kyberConsts);
5440 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5441 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5442 load64shorts(vs2, zetas);
5443 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5444 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5445 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5446 vs_addv(vs1, __ T8H, vs1, vs2);
5447 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5448 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5449
5450 vs_ldpq(vq, kyberConsts);
5451 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5452 load64shorts(vs2, zetas);
5453 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5454 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5455 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5456 vs_addv(vs1, __ T8H, vs1, vs2);
5457 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5458 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5459
5460 // level 5
    // At level 5 related coefficients occur in discrete blocks of 8 bytes,
    // so they need to be loaded interleaved using an ld2 operation with
    // arrangement 2D.
5463
5464 vs_ldpq(vq, kyberConsts);
5465 int offsets4[4] = { 0, 32, 64, 96 };
5466 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5467 load32shorts(vs_front(vs2), zetas);
5468 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5469 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5470 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5471 load32shorts(vs_front(vs2), zetas);
5472 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5473 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5474 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5475 load32shorts(vs_front(vs2), zetas);
5476 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5477 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5478
5479 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5480 load32shorts(vs_front(vs2), zetas);
5481 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5482 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5483
5484 // level 6
    // At level 6 related coefficients occur in discrete blocks of 4 bytes,
    // so they need to be loaded interleaved using an ld2 operation with
    // arrangement 4S.
5487
5488 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5489 load32shorts(vs_front(vs2), zetas);
5490 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5491 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5492 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5494 load32shorts(vs_front(vs2), zetas);
5495 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5496 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5497
5498 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5499 load32shorts(vs_front(vs2), zetas);
5500 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5501 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5502
5503 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5504 load32shorts(vs_front(vs2), zetas);
5505 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5506 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5507
5508 __ leave(); // required for proper stackwalking of RuntimeStub frame
5509 __ mov(r0, zr); // return 0
5510 __ ret(lr);
5511
5512 return start;
5513 }
5514
5515 // Kyber Inverse NTT function
5516 // Implements
5517 // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
5518 //
5519 // coeffs (short[256]) = c_rarg0
5520 // ntt_zetas (short[256]) = c_rarg1
5521 address generate_kyberInverseNtt() {
5522
5523 __ align(CodeEntryAlignment);
5524 StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
5525 StubCodeMark mark(this, stub_id);
5526 address start = __ pc();
5527 __ enter();
5528
5529 const Register coeffs = c_rarg0;
5530 const Register zetas = c_rarg1;
5531
5532 const Register kyberConsts = r10;
5533 const Register tmpAddr = r11;
5534 const Register tmpAddr2 = c_rarg2;
5535
5536 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5537 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5538 VSeq<2> vq(30); // n.b. constants overlap vs3
5539
5540 __ lea(kyberConsts,
5541 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5542
5543 // level 0
    // At level 0 related coefficients occur in discrete blocks of 4 bytes,
    // so they need to be loaded interleaved using an ld2 operation with
    // arrangement 4S.
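    // Each inverse NTT level applies a Gentleman-Sande butterfly to pairs of
    // coefficient blocks (a, b) and a matching block of zetas:
    //   t  = a - b
    //   a' = a + b
    //   b' = montmul(t, zeta)
    // At levels 0 and 1 the pairs are interleaved in memory, so the three
    // steps are performed on de-interleaved ld2 data by the helper
    // kyber_sub_add_montmul32.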
5546
5547 vs_ldpq(vq, kyberConsts);
5548 int offsets4[4] = { 0, 32, 64, 96 };
5549 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5550 load32shorts(vs_front(vs2), zetas);
5551 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5552 vs_front(vs2), vs_back(vs2), vtmp, vq);
5553 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5554 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5555 load32shorts(vs_front(vs2), zetas);
5556 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5557 vs_front(vs2), vs_back(vs2), vtmp, vq);
5558 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5559 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5560 load32shorts(vs_front(vs2), zetas);
5561 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5562 vs_front(vs2), vs_back(vs2), vtmp, vq);
5563 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5564 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5565 load32shorts(vs_front(vs2), zetas);
5566 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5567 vs_front(vs2), vs_back(vs2), vtmp, vq);
5568 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5569
5570 // level 1
    // At level 1 related coefficients occur in discrete blocks of 8 bytes,
    // so they need to be loaded interleaved using an ld2 operation with
    // arrangement 2D.
5573
5574 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5575 load32shorts(vs_front(vs2), zetas);
5576 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5577 vs_front(vs2), vs_back(vs2), vtmp, vq);
5578 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5579 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5580 load32shorts(vs_front(vs2), zetas);
5581 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5582 vs_front(vs2), vs_back(vs2), vtmp, vq);
5583 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5584
5585 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5586 load32shorts(vs_front(vs2), zetas);
5587 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5588 vs_front(vs2), vs_back(vs2), vtmp, vq);
5589 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5590 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5591 load32shorts(vs_front(vs2), zetas);
5592 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5593 vs_front(vs2), vs_back(vs2), vtmp, vq);
5594 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5595
5596 // level 2
    // At level 2 coefficients occur in 8 discrete blocks of 16 bytes,
    // so they are loaded using an ldr at 8 distinct offsets.
5599
5600 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5601 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5602 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
5603 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5604 vs_subv(vs1, __ T8H, vs1, vs2);
5605 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
5606 load64shorts(vs2, zetas);
5607 vs_ldpq(vq, kyberConsts);
5608 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5609 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
5610
5611 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5612 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5613 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5614 vs_subv(vs1, __ T8H, vs1, vs2);
5615 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
5616 load64shorts(vs2, zetas);
5617 vs_ldpq(vq, kyberConsts);
5618 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5619 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5620
5621 // Barrett reduction at indexes where overflow may happen
5622
5623 // load q and the multiplier for the Barrett reduction
5624 __ add(tmpAddr, kyberConsts, 16);
5625 vs_ldpq(vq, tmpAddr);
5626
5627 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
5628 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
5629 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
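    // Each lane below computes a -= q * ((a * multiplier) >> 26): sqdmulh
    // yields (a * multiplier) >> 15, the sshr adds a further >> 11, and
    // mlsv subtracts q times that quotient estimate.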
5630 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5631 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5632 vs_sshr(vs2, __ T8H, vs2, 11);
5633 vs_mlsv(vs1, __ T8H, vs2, vq1);
5634 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5635 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5636 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5637 vs_sshr(vs2, __ T8H, vs2, 11);
5638 vs_mlsv(vs1, __ T8H, vs2, vq1);
5639 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5640
5641 // level 3
    // From level 3 upwards coefficients occur in discrete blocks whose size
    // is some multiple of 32 bytes, so they can be loaded using ldpq and
    // suitable indexes.
5644
5645 int offsets2[4] = { 0, 64, 128, 192 };
5646 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5647 vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
5648 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5649 vs_subv(vs1, __ T8H, vs1, vs2);
5650 vs_stpq_indexed(vs3, coeffs, 0, offsets2);
5651 load64shorts(vs2, zetas);
5652 vs_ldpq(vq, kyberConsts);
5653 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5654 vs_stpq_indexed(vs2, coeffs, 32, offsets2);
5655
5656 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5657 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5658 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5659 vs_subv(vs1, __ T8H, vs1, vs2);
5660 vs_stpq_indexed(vs3, coeffs, 256, offsets2);
5661 load64shorts(vs2, zetas);
5662 vs_ldpq(vq, kyberConsts);
5663 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5664 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5665
5666 // level 4
5667
5668 int offsets1[4] = { 0, 32, 128, 160 };
5669 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5670 vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
5671 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5672 vs_subv(vs1, __ T8H, vs1, vs2);
5673 vs_stpq_indexed(vs3, coeffs, 0, offsets1);
5674 load64shorts(vs2, zetas);
5675 vs_ldpq(vq, kyberConsts);
5676 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5677 vs_stpq_indexed(vs2, coeffs, 64, offsets1);
5678
5679 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5680 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5681 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5682 vs_subv(vs1, __ T8H, vs1, vs2);
5683 vs_stpq_indexed(vs3, coeffs, 256, offsets1);
5684 load64shorts(vs2, zetas);
5685 vs_ldpq(vq, kyberConsts);
5686 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5687 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5688
5689 // level 5
5690
5691 __ add(tmpAddr, coeffs, 0);
5692 load64shorts(vs1, tmpAddr);
5693 __ add(tmpAddr, coeffs, 128);
5694 load64shorts(vs2, tmpAddr);
5695 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5696 vs_subv(vs1, __ T8H, vs1, vs2);
5697 __ add(tmpAddr, coeffs, 0);
5698 store64shorts(vs3, tmpAddr);
5699 load64shorts(vs2, zetas);
5700 vs_ldpq(vq, kyberConsts);
5701 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5702 __ add(tmpAddr, coeffs, 128);
5703 store64shorts(vs2, tmpAddr);
5704
5705 load64shorts(vs1, tmpAddr);
5706 __ add(tmpAddr, coeffs, 384);
5707 load64shorts(vs2, tmpAddr);
5708 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5709 vs_subv(vs1, __ T8H, vs1, vs2);
5710 __ add(tmpAddr, coeffs, 256);
5711 store64shorts(vs3, tmpAddr);
5712 load64shorts(vs2, zetas);
5713 vs_ldpq(vq, kyberConsts);
5714 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5715 __ add(tmpAddr, coeffs, 384);
5716 store64shorts(vs2, tmpAddr);
5717
5718 // Barrett reduction at indexes where overflow may happen
5719
5720 // load q and the multiplier for the Barrett reduction
5721 __ add(tmpAddr, kyberConsts, 16);
5722 vs_ldpq(vq, tmpAddr);
5723
5724 int offsets0[2] = { 0, 256 };
5725 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5726 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5727 vs_sshr(vs2, __ T8H, vs2, 11);
5728 vs_mlsv(vs1, __ T8H, vs2, vq1);
5729 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5730
5731 // level 6
5732
5733 __ add(tmpAddr, coeffs, 0);
5734 load64shorts(vs1, tmpAddr);
5735 __ add(tmpAddr, coeffs, 256);
5736 load64shorts(vs2, tmpAddr);
5737 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5738 vs_subv(vs1, __ T8H, vs1, vs2);
5739 __ add(tmpAddr, coeffs, 0);
5740 store64shorts(vs3, tmpAddr);
5741 load64shorts(vs2, zetas);
5742 vs_ldpq(vq, kyberConsts);
5743 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5744 __ add(tmpAddr, coeffs, 256);
5745 store64shorts(vs2, tmpAddr);
5746
5747 __ add(tmpAddr, coeffs, 128);
5748 load64shorts(vs1, tmpAddr);
5749 __ add(tmpAddr, coeffs, 384);
5750 load64shorts(vs2, tmpAddr);
5751 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5752 vs_subv(vs1, __ T8H, vs1, vs2);
5753 __ add(tmpAddr, coeffs, 128);
5754 store64shorts(vs3, tmpAddr);
5755 load64shorts(vs2, zetas);
5756 vs_ldpq(vq, kyberConsts);
5757 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5758 __ add(tmpAddr, coeffs, 384);
5759 store64shorts(vs2, tmpAddr);
5760
5761 // multiply by 2^-n
5762
5763 // load toMont(2^-n mod q)
5764 __ add(tmpAddr, kyberConsts, 48);
5765 __ ldr(v29, __ Q, tmpAddr);
5766
5767 vs_ldpq(vq, kyberConsts);
5768 __ add(tmpAddr, coeffs, 0);
5769 load64shorts(vs1, tmpAddr);
5770 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5771 __ add(tmpAddr, coeffs, 0);
5772 store64shorts(vs2, tmpAddr);
5773
    // tmpAddr now contains coeffs + 128 because store64shorts post-incremented it
5775 load64shorts(vs1, tmpAddr);
5776 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5777 __ add(tmpAddr, coeffs, 128);
5778 store64shorts(vs2, tmpAddr);
5779
5780 // now tmpAddr contains coeffs + 256
5781 load64shorts(vs1, tmpAddr);
5782 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5783 __ add(tmpAddr, coeffs, 256);
5784 store64shorts(vs2, tmpAddr);
5785
5786 // now tmpAddr contains coeffs + 384
5787 load64shorts(vs1, tmpAddr);
5788 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5789 __ add(tmpAddr, coeffs, 384);
5790 store64shorts(vs2, tmpAddr);
5791
5792 __ leave(); // required for proper stackwalking of RuntimeStub frame
5793 __ mov(r0, zr); // return 0
5794 __ ret(lr);
5795
5796 return start;
5797 }
5798
5799 // Kyber multiply polynomials in the NTT domain.
5800 // Implements
5801 // static int implKyberNttMult(
5802 // short[] result, short[] ntta, short[] nttb, short[] zetas) {}
5803 //
5804 // result (short[256]) = c_rarg0
5805 // ntta (short[256]) = c_rarg1
5806 // nttb (short[256]) = c_rarg2
5807 // zetas (short[128]) = c_rarg3
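  // For each pair of adjacent NTT-domain coefficients the product is a
  // multiplication of degree-1 polynomials modulo (X^2 - zeta), where zeta
  // is the per-pair constant taken from the zetas array:
  //   (a0 + a1*X) * (b0 + b1*X) = (a0*b0 + a1*b1*zeta) + (a0*b1 + a1*b0)*X
  // which is what the cross-product montmuls in the loop below compute.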
5808 address generate_kyberNttMult() {
5809
5810 __ align(CodeEntryAlignment);
5811 StubId stub_id = StubId::stubgen_kyberNttMult_id;
5812 StubCodeMark mark(this, stub_id);
5813 address start = __ pc();
5814 __ enter();
5815
5816 const Register result = c_rarg0;
5817 const Register ntta = c_rarg1;
5818 const Register nttb = c_rarg2;
5819 const Register zetas = c_rarg3;
5820
5821 const Register kyberConsts = r10;
5822 const Register limit = r11;
5823
5824 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
5825 VSeq<4> vs3(16), vs4(20);
5826 VSeq<2> vq(30); // pair of constants for montmul: q, qinv
5827 VSeq<2> vz(28); // pair of zetas
5828 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
5829
5830 __ lea(kyberConsts,
5831 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5832
5833 Label kyberNttMult_loop;
5834
5835 __ add(limit, result, 512);
5836
5837 // load q and qinv
5838 vs_ldpq(vq, kyberConsts);
5839
5840 // load R^2 mod q (to convert back from Montgomery representation)
5841 __ add(kyberConsts, kyberConsts, 64);
5842 __ ldr(v27, __ Q, kyberConsts);
5843
5844 __ BIND(kyberNttMult_loop);
5845
5846 // load 16 zetas
5847 vs_ldpq_post(vz, zetas);
5848
5849 // load 2 sets of 32 coefficients from the two input arrays
5850 // interleaved as shorts. i.e. pairs of shorts adjacent in memory
5851 // are striped across pairs of vector registers
5852 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
5853 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
5854 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
5855 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
5856
5857 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
5858 // i.e. montmul the first and second halves of vs1 in order and
5859 // then with one sequence reversed storing the two results in vs3
5860 //
5861 // vs3[0] <- montmul(a0, b0)
5862 // vs3[1] <- montmul(a1, b1)
5863 // vs3[2] <- montmul(a0, b1)
5864 // vs3[3] <- montmul(a1, b0)
5865 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
5866 kyber_montmul16(vs_back(vs3),
5867 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
5868
5869 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
5870 // i.e. montmul the first and second halves of vs4 in order and
5871 // then with one sequence reversed storing the two results in vs1
5872 //
5873 // vs1[0] <- montmul(a2, b2)
5874 // vs1[1] <- montmul(a3, b3)
5875 // vs1[2] <- montmul(a2, b3)
5876 // vs1[3] <- montmul(a3, b2)
5877 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
5878 kyber_montmul16(vs_back(vs1),
5879 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
5880
    // montmul the second result of each cross-product, i.e. (a1*b1, a3*b3),
    // by a zeta. We can schedule two montmuls at a time if we use a
    // suitable vector sequence <vs3[1], vs1[1]>.
5884 int delta = vs1[1]->encoding() - vs3[1]->encoding();
5885 VSeq<2> vs5(vs3[1], delta);
5886
5887 // vs3[1] <- montmul(montmul(a1, b1), z0)
5888 // vs1[1] <- montmul(montmul(a3, b3), z1)
5889 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
5890
5891 // add results in pairs storing in vs3
5892 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
5893 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
5894 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
5895
5896 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
5897 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
5898 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
5899
5900 // vs1 <- montmul(vs3, montRSquareModQ)
5901 kyber_montmul32(vs1, vs3, vc, vs2, vq);
5902
    // store back the two pairs of result vectors re-interleaved as 8H elements,
    // i.e. each pair of shorts striped across a register pair is written to
    // adjacent locations in memory
5906 vs_st2_post(vs1, __ T8H, result);
5907
5908 __ cmp(result, limit);
5909 __ br(Assembler::NE, kyberNttMult_loop);
5910
5911 __ leave(); // required for proper stackwalking of RuntimeStub frame
5912 __ mov(r0, zr); // return 0
5913 __ ret(lr);
5914
5915 return start;
5916 }
5917
5918 // Kyber add 2 polynomials.
5919 // Implements
5920 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
5921 //
5922 // result (short[256]) = c_rarg0
5923 // a (short[256]) = c_rarg1
5924 // b (short[256]) = c_rarg2
5925 address generate_kyberAddPoly_2() {
5926
5927 __ align(CodeEntryAlignment);
5928 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
5929 StubCodeMark mark(this, stub_id);
5930 address start = __ pc();
5931 __ enter();
5932
5933 const Register result = c_rarg0;
5934 const Register a = c_rarg1;
5935 const Register b = c_rarg2;
5936
5937 const Register kyberConsts = r11;
5938
5939 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
5940 // So, we can load, add and store the data in 3 groups of 11,
5941 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
5942 // registers. A further constraint is that the mapping needs
5943 // to skip callee saves. So, we allocate the register
5944 // sequences using two 8 sequences, two 2 sequences and two
5945 // single registers.
5946 VSeq<8> vs1_1(0);
5947 VSeq<2> vs1_2(16);
5948 FloatRegister vs1_3 = v28;
5949 VSeq<8> vs2_1(18);
5950 VSeq<2> vs2_2(26);
5951 FloatRegister vs2_3 = v29;
5952
5953 // two constant vector sequences
5954 VSeq<8> vc_1(31, 0);
5955 VSeq<2> vc_2(31, 0);
5956
5957 FloatRegister vc_3 = v31;
5958 __ lea(kyberConsts,
5959 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5960
5961 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
5962 for (int i = 0; i < 3; i++) {
5963 // load 80 or 88 values from a into vs1_1/2/3
5964 vs_ldpq_post(vs1_1, a);
5965 vs_ldpq_post(vs1_2, a);
5966 if (i < 2) {
5967 __ ldr(vs1_3, __ Q, __ post(a, 16));
5968 }
5969 // load 80 or 88 values from b into vs2_1/2/3
5970 vs_ldpq_post(vs2_1, b);
5971 vs_ldpq_post(vs2_2, b);
5972 if (i < 2) {
5973 __ ldr(vs2_3, __ Q, __ post(b, 16));
5974 }
5975 // sum 80 or 88 values across vs1 and vs2 into vs1
5976 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
5977 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
5978 if (i < 2) {
5979 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
5980 }
5981 // add constant to all 80 or 88 results
5982 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
5983 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
5984 if (i < 2) {
5985 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
5986 }
5987 // store 80 or 88 values
5988 vs_stpq_post(vs1_1, result);
5989 vs_stpq_post(vs1_2, result);
5990 if (i < 2) {
5991 __ str(vs1_3, __ Q, __ post(result, 16));
5992 }
5993 }
5994
5995 __ leave(); // required for proper stackwalking of RuntimeStub frame
5996 __ mov(r0, zr); // return 0
5997 __ ret(lr);
5998
5999 return start;
6000 }
6001
6002 // Kyber add 3 polynomials.
6003 // Implements
6004 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
6005 //
6006 // result (short[256]) = c_rarg0
6007 // a (short[256]) = c_rarg1
6008 // b (short[256]) = c_rarg2
6009 // c (short[256]) = c_rarg3
6010 address generate_kyberAddPoly_3() {
6011
6012 __ align(CodeEntryAlignment);
6013 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
6014 StubCodeMark mark(this, stub_id);
6015 address start = __ pc();
6016 __ enter();
6017
6018 const Register result = c_rarg0;
6019 const Register a = c_rarg1;
6020 const Register b = c_rarg2;
6021 const Register c = c_rarg3;
6022
6023 const Register kyberConsts = r11;
6024
6025 // As above we sum 256 sets of values in total i.e. 32 x 8H
6026 // quadwords. So, we can load, add and store the data in 3
6027 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6028 // of 10 or 11 registers. A further constraint is that the
6029 // mapping needs to skip callee saves. So, we allocate the
6030 // register sequences using two 8 sequences, two 2 sequences
6031 // and two single registers.
6032 VSeq<8> vs1_1(0);
6033 VSeq<2> vs1_2(16);
6034 FloatRegister vs1_3 = v28;
6035 VSeq<8> vs2_1(18);
6036 VSeq<2> vs2_2(26);
6037 FloatRegister vs2_3 = v29;
6038
6039 // two constant vector sequences
6040 VSeq<8> vc_1(31, 0);
6041 VSeq<2> vc_2(31, 0);
6042
6043 FloatRegister vc_3 = v31;
6044
6045 __ lea(kyberConsts,
6046 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6047
6048 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6049 for (int i = 0; i < 3; i++) {
6050 // load 80 or 88 values from a into vs1_1/2/3
6051 vs_ldpq_post(vs1_1, a);
6052 vs_ldpq_post(vs1_2, a);
6053 if (i < 2) {
6054 __ ldr(vs1_3, __ Q, __ post(a, 16));
6055 }
6056 // load 80 or 88 values from b into vs2_1/2/3
6057 vs_ldpq_post(vs2_1, b);
6058 vs_ldpq_post(vs2_2, b);
6059 if (i < 2) {
6060 __ ldr(vs2_3, __ Q, __ post(b, 16));
6061 }
6062 // sum 80 or 88 values across vs1 and vs2 into vs1
6063 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6064 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6065 if (i < 2) {
6066 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6067 }
6068 // load 80 or 88 values from c into vs2_1/2/3
6069 vs_ldpq_post(vs2_1, c);
6070 vs_ldpq_post(vs2_2, c);
6071 if (i < 2) {
6072 __ ldr(vs2_3, __ Q, __ post(c, 16));
6073 }
6074 // sum 80 or 88 values across vs1 and vs2 into vs1
6075 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6076 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6077 if (i < 2) {
6078 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6079 }
6080 // add constant to all 80 or 88 results
6081 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6082 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6083 if (i < 2) {
6084 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6085 }
6086 // store 80 or 88 values
6087 vs_stpq_post(vs1_1, result);
6088 vs_stpq_post(vs1_2, result);
6089 if (i < 2) {
6090 __ str(vs1_3, __ Q, __ post(result, 16));
6091 }
6092 }
6093
6094 __ leave(); // required for proper stackwalking of RuntimeStub frame
6095 __ mov(r0, zr); // return 0
6096 __ ret(lr);
6097
6098 return start;
6099 }
6100
6101 // Kyber parse XOF output to polynomial coefficient candidates
6102 // or decodePoly(12, ...).
6103 // Implements
6104 // static int implKyber12To16(
6105 // byte[] condensed, int index, short[] parsed, int parsedLength) {}
6106 //
6107 // (parsedLength or (parsedLength - 48) must be divisible by 64.)
6108 //
6109 // condensed (byte[]) = c_rarg0
6110 // condensedIndex = c_rarg1
6111 // parsed (short[112 or 256]) = c_rarg2
6112 // parsedLength (112 or 256) = c_rarg3
6113 address generate_kyber12To16() {
6114 Label L_F00, L_loop, L_end;
6115
6116 __ align(CodeEntryAlignment);
6117 StubId stub_id = StubId::stubgen_kyber12To16_id;
6118 StubCodeMark mark(this, stub_id);
6119 address start = __ pc();
6120 __ enter();
6121
6122 const Register condensed = c_rarg0;
6123 const Register condensedOffs = c_rarg1;
6124 const Register parsed = c_rarg2;
6125 const Register parsedLength = c_rarg3;
6126
6127 const Register tmpAddr = r11;
6128
6129 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
6130 // quadwords so we need a 6 vector sequence for the inputs.
6131 // Parsing produces 64 shorts, employing two 8 vector
6132 // sequences to store and combine the intermediate data.
6133 VSeq<6> vin(24);
6134 VSeq<8> va(0), vb(16);
6135
6136 __ adr(tmpAddr, L_F00);
6137 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6138 __ add(condensed, condensed, condensedOffs);
6139
6140 __ BIND(L_loop);
6141 // load 96 (6 x 16B) byte values
6142 vs_ld3_post(vin, __ T16B, condensed);
6143
6144 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6145 // holds 48 (16x3) contiguous bytes from memory striped
6146 // horizontally across each of the 16 byte lanes. Equivalently,
6147 // that is 16 pairs of 12-bit integers. Likewise the back half
6148 // holds the next 48 bytes in the same arrangement.
6149
6150 // Each vector in the front half can also be viewed as a vertical
6151 // strip across the 16 pairs of 12 bit integers. Each byte in
6152 // vin[0] stores the low 8 bits of the first int in a pair. Each
6153 // byte in vin[1] stores the high 4 bits of the first int and the
6154 // low 4 bits of the second int. Each byte in vin[2] stores the
6155 // high 8 bits of the second int. Likewise the vectors in second
6156 // half.
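    // In other words, a pair of 12-bit values (x, y) is packed into three
    // consecutive bytes b0, b1, b2 as:
    //   b0 = x & 0xff
    //   b1 = ((x >> 8) & 0xf) | ((y & 0xf) << 4)
    //   b2 = (y >> 4) & 0xff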
6157
6158 // Converting the data to 16-bit shorts requires first of all
6159 // expanding each of the 6 x 16B vectors into 6 corresponding
6160 // pairs of 8H vectors. Mask, shift and add operations on the
6161 // resulting vector pairs can be used to combine 4 and 8 bit
6162 // parts of related 8H vector elements.
6163 //
    // The middle vectors of each stripe (vin[1] and vin[4]) are actually
    // expanded twice: one copy is manipulated to provide the high 4 bits
    // of the first short in a pair (the low nibble of the middle byte)
    // and another copy is manipulated to provide the low 4 bits of the
    // second short (the high nibble). This is why the vector sequences va
    // and vb used to hold the expanded 8H elements are of length 8.
6170
6171 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6172 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6173 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6174 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6175 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6176 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6177 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6178 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6179
6180 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6181 // and vb[4:5]
6182 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6183 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6184 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6185 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6186 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6187 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6188
6189 // shift lo byte of copy 1 of the middle stripe into the high byte
6190 __ shl(va[2], __ T8H, va[2], 8);
6191 __ shl(va[3], __ T8H, va[3], 8);
6192 __ shl(vb[2], __ T8H, vb[2], 8);
6193 __ shl(vb[3], __ T8H, vb[3], 8);
6194
6195 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6196 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6197 // are in bit positions [4..11].
6198 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6199 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6200 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6201 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
6202
6203 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
6204 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
6205 // copy2
6206 __ andr(va[2], __ T16B, va[2], v31);
6207 __ andr(va[3], __ T16B, va[3], v31);
6208 __ ushr(va[4], __ T8H, va[4], 4);
6209 __ ushr(va[5], __ T8H, va[5], 4);
6210 __ andr(vb[2], __ T16B, vb[2], v31);
6211 __ andr(vb[3], __ T16B, vb[3], v31);
6212 __ ushr(vb[4], __ T8H, vb[4], 4);
6213 __ ushr(vb[5], __ T8H, vb[5], 4);
6214
6215 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
6216 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
6217 // n.b. the ordering ensures: i) inputs are consumed before they
6218 // are overwritten ii) the order of 16-bit results across successive
6219 // pairs of vectors in va and then vb reflects the order of the
6220 // corresponding 12-bit inputs
6221 __ addv(va[0], __ T8H, va[0], va[2]);
6222 __ addv(va[2], __ T8H, va[1], va[3]);
6223 __ addv(va[1], __ T8H, va[4], va[6]);
6224 __ addv(va[3], __ T8H, va[5], va[7]);
6225 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6226 __ addv(vb[2], __ T8H, vb[1], vb[3]);
6227 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6228 __ addv(vb[3], __ T8H, vb[5], vb[7]);
6229
6230 // store 64 results interleaved as shorts
6231 vs_st2_post(vs_front(va), __ T8H, parsed);
6232 vs_st2_post(vs_front(vb), __ T8H, parsed);
6233
6234 __ sub(parsedLength, parsedLength, 64);
6235 __ cmp(parsedLength, (u1)64);
6236 __ br(Assembler::GE, L_loop);
6237 __ cbz(parsedLength, L_end);
6238
// if anything is left it should be a final 72 bytes of input
// i.e. a final 48 12-bit values. So we handle this by loading
// 48 bytes into all 16B lanes of front(vin) and only 24
// bytes into the lower 8B lanes of back(vin).
6243 vs_ld3_post(vs_front(vin), __ T16B, condensed);
6244 vs_ld3(vs_back(vin), __ T8B, condensed);
6245
6246 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6247 // n.b. target elements 2 and 3 of va duplicate elements 4 and
6248 // 5 and target element 2 of vb duplicates element 4.
6249 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6250 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6251 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6252 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6253 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6254 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6255
6256 // This time expand just the lower 8 lanes
6257 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6258 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6259 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6260
6261 // shift lo byte of copy 1 of the middle stripe into the high byte
6262 __ shl(va[2], __ T8H, va[2], 8);
6263 __ shl(va[3], __ T8H, va[3], 8);
6264 __ shl(vb[2], __ T8H, vb[2], 8);
6265
6266 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
6267 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
6268 // int are in bit positions [4..11].
6269 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6270 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6271 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6272
6273 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
6274 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
6275 // copy2
6276 __ andr(va[2], __ T16B, va[2], v31);
6277 __ andr(va[3], __ T16B, va[3], v31);
6278 __ ushr(va[4], __ T8H, va[4], 4);
6279 __ ushr(va[5], __ T8H, va[5], 4);
6280 __ andr(vb[2], __ T16B, vb[2], v31);
6281 __ ushr(vb[4], __ T8H, vb[4], 4);

// sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
// hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
// n.b. ordering ensures: i) inputs are consumed before they are
// overwritten ii) order of 16-bit results across successive
// pairs of vectors in va and then lower half of vb reflects order
// of corresponding 12-bit inputs
6292 __ addv(va[0], __ T8H, va[0], va[2]);
6293 __ addv(va[2], __ T8H, va[1], va[3]);
6294 __ addv(va[1], __ T8H, va[4], va[6]);
6295 __ addv(va[3], __ T8H, va[5], va[7]);
6296 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6297 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6298
6299 // store 48 results interleaved as shorts
6300 vs_st2_post(vs_front(va), __ T8H, parsed);
6301 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
6302
6303 __ BIND(L_end);
6304
6305 __ leave(); // required for proper stackwalking of RuntimeStub frame
6306 __ mov(r0, zr); // return 0
6307 __ ret(lr);
6308
6309 // bind label and generate constant data used by this stub
6310 __ BIND(L_F00);
6311 __ emit_int64(0x0f000f000f000f00);
6312 __ emit_int64(0x0f000f000f000f00);
6313
6314 return start;
6315 }
6316
6317 // Kyber Barrett reduce function.
6318 // Implements
6319 // static int implKyberBarrettReduce(short[] coeffs) {}
6320 //
6321 // coeffs (short[256]) = c_rarg0
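//
// For reference, the per-coefficient operation is the usual signed
// Barrett reduction modulo kyber_q (a sketch with illustrative
// constant names; the stub applies it 8 lanes at a time per vector
// register):
//
//   short barrettReduce(short a) {
//     int t = (a * KYBER_BARRETT_MULTIPLIER) >> 26;  // ~ a / kyber_q
//     return (short) (a - t * KYBER_Q);              // a mod kyber_q
//   }
//
// The vector code obtains the >> 26 as sqdmulh (a doubling multiply
// returning the high 16 bits, i.e. >> 15 after doubling) followed by
// an explicit sshr by 11.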
6322 address generate_kyberBarrettReduce() {
6323
6324 __ align(CodeEntryAlignment);
6325 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6326 StubCodeMark mark(this, stub_id);
6327 address start = __ pc();
6328 __ enter();
6329
6330 const Register coeffs = c_rarg0;
6331
6332 const Register kyberConsts = r10;
6333 const Register result = r11;
6334
// As above we process a total of 256 coefficients i.e. 32 x 8H
// quadwords. So, we can load, reduce and store the data in 3
// groups of 11, 11 and 10 quadwords at a time i.e. we need to map
// sets of 10 or 11 registers. A further constraint is that the
// mapping needs to skip callee saves. So, we allocate the
// register sequences using two 8 sequences, two 2 sequences
// and two single registers.
6342 VSeq<8> vs1_1(0);
6343 VSeq<2> vs1_2(16);
6344 FloatRegister vs1_3 = v28;
6345 VSeq<8> vs2_1(18);
6346 VSeq<2> vs2_2(26);
6347 FloatRegister vs2_3 = v29;
6348
6349 // we also need a pair of corresponding constant sequences
6350
6351 VSeq<8> vc1_1(30, 0);
6352 VSeq<2> vc1_2(30, 0);
6353 FloatRegister vc1_3 = v30; // for kyber_q
6354
6355 VSeq<8> vc2_1(31, 0);
6356 VSeq<2> vc2_2(31, 0);
6357 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
6358
6359 __ add(result, coeffs, 0);
6360 __ lea(kyberConsts,
6361 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6362
6363 // load q and the multiplier for the Barrett reduction
6364 __ add(kyberConsts, kyberConsts, 16);
6365 __ ldpq(vc1_3, vc2_3, kyberConsts);
6366
6367 for (int i = 0; i < 3; i++) {
6368 // load 80 or 88 coefficients
6369 vs_ldpq_post(vs1_1, coeffs);
6370 vs_ldpq_post(vs1_2, coeffs);
6371 if (i < 2) {
6372 __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
6373 }
6374
6375 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
6376 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
6377 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
6378 if (i < 2) {
6379 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
6380 }
6381
6382 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
6383 vs_sshr(vs2_1, __ T8H, vs2_1, 11);
6384 vs_sshr(vs2_2, __ T8H, vs2_2, 11);
6385 if (i < 2) {
6386 __ sshr(vs2_3, __ T8H, vs2_3, 11);
6387 }
6388
6389 // vs1 <- vs1 - vs2 * kyber_q
6390 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
6391 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
6392 if (i < 2) {
6393 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
6394 }
6395
6396 vs_stpq_post(vs1_1, result);
6397 vs_stpq_post(vs1_2, result);
6398 if (i < 2) {
6399 __ str(vs1_3, __ Q, __ post(result, 16));
6400 }
6401 }
6402
6403 __ leave(); // required for proper stackwalking of RuntimeStub frame
6404 __ mov(r0, zr); // return 0
6405 __ ret(lr);
6406
6407 return start;
6408 }
6409
6410
6411 // Dilithium-specific montmul helper routines that generate parallel
6412 // code for, respectively, a single 4x4s vector sequence montmul or
6413 // two such multiplies in a row.
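//
// For reference, each lane computes a standard 32-bit Montgomery
// product with R = 2^32 (a sketch with illustrative constant names,
// not the exact Java code; its sign conventions may differ):
//
//   int montMul(int a, int b) {
//     long ab = (long) a * b;                        // full 64-bit product
//     int m = (int) ab * MONT_Q_INV_MOD_R;           // m = ab * -q^-1 mod R
//     return (int) ((ab + (long) m * MONT_Q) >> 32); // (ab + m * q) / R
//   }
//
// so the result is congruent to a * b * R^-1 mod q.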
6414
6415 // Perform 16 32-bit Montgomery multiplications in parallel
6416 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
6417 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6418 // Use the helper routine to schedule a 4x4S Montgomery multiply.
6419 // It will assert that the register use is valid
6420 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
6421 }
6422
6423 // Perform 2x16 32-bit Montgomery multiplications in parallel
6424 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
6425 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6426 // Schedule two successive 4x4S multiplies via the montmul helper
6427 // on the front and back halves of va, vb and vc. The helper will
6428 // assert that the register use has no overlap conflicts on each
6429 // individual call but we also need to ensure that the necessary
6430 // disjoint/equality constraints are met across both calls.
6431
6432 // vb, vc, vtmp and vq must be disjoint. va must either be
6433 // disjoint from all other registers or equal vc
6434
6435 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6436 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6437 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6438
6439 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6440 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6441
6442 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6443
6444 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6445 assert(vs_disjoint(va, vb), "va and vb overlap");
6446 assert(vs_disjoint(va, vq), "va and vq overlap");
6447 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6448
6449 // We multiply the front and back halves of each sequence 4 at a
6450 // time because
6451 //
6452 // 1) we are currently only able to get 4-way instruction
6453 // parallelism at best
6454 //
6455 // 2) we need registers for the constants in vq and temporary
6456 // scratch registers to hold intermediate results so vtmp can only
6457 // be a VSeq<4> which means we only have 4 scratch slots.
6458
6459 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6460 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6461 }
6462
6463 // Perform combined montmul then add/sub on 4x4S vectors.
6464 void dilithium_montmul16_sub_add(
6465 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6466 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6467 // compute a = montmul(a1, c)
6468 dilithium_montmul16(vc, va1, vc, vtmp, vq);
// output a1 = a0 - a
6470 vs_subv(va1, __ T4S, va0, vc);
6471 // and a0 = a0 + a
6472 vs_addv(va0, __ T4S, va0, vc);
6473 }
6474
// Perform combined add/sub then montmul on 4x4S vectors.
6476 void dilithium_sub_add_montmul16(
6477 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6478 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6479 // compute c = a0 - a1
6480 vs_subv(vtmp1, __ T4S, va0, va1);
6481 // output a0 = a0 + a1
6482 vs_addv(va0, __ T4S, va0, va1);
6483 // output a1 = b montmul c
6484 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6485 }
6486
6487 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6488 // in the Java implementation come in sequences of at least 8, so we
6489 // can use ldpq to collect the corresponding data into pairs of vector
6490 // registers.
6491 // We collect the coefficients corresponding to the 'j+l' indexes into
6492 // the vector registers v0-v7, the zetas into the vector registers v16-v23
6493 // then we do the (Montgomery) multiplications by the zetas in parallel
6494 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6495 // v0-v7, then do the additions into v24-v31 and the subtractions into
6496 // v0-v7 and finally save the results back to the coeffs array.
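//
// For reference, the Java inner loop these levels implement is the
// usual Cooley-Tukey butterfly (a sketch, not the exact source):
//
//   for (int j = s; j < s + l; j++) {
//     int t = montMul(zeta, coeffs[j + l]);
//     coeffs[j + l] = coeffs[j] - t;
//     coeffs[j] = coeffs[j] + t;
//   }
//
// i.e. the 'j + l' coefficients are the montmul inputs and the 'j'
// coefficients are the add/sub inputs.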
6497 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6498 const Register coeffs, const Register zetas) {
6499 int c1 = 0;
6500 int c2 = 512;
6501 int startIncr;
6502 // don't use callee save registers v8 - v15
6503 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6504 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6505 VSeq<2> vq(30); // n.b. constants overlap vs3
6506 int offsets[4] = { 0, 32, 64, 96 };
6507
6508 for (int level = 0; level < 5; level++) {
6509 int c1Start = c1;
6510 int c2Start = c2;
6511 if (level == 3) {
6512 offsets[1] = 32;
6513 offsets[2] = 128;
6514 offsets[3] = 160;
6515 } else if (level == 4) {
6516 offsets[1] = 64;
6517 offsets[2] = 128;
6518 offsets[3] = 192;
6519 }
6520
// For levels 0 - 4 we simply load 2 x 4 adjacent values at a
6522 // time at 4 different offsets and multiply them in order by the
6523 // next set of input values. So we employ indexed load and store
6524 // pair instructions with arrangement 4S.
6525 for (int i = 0; i < 4; i++) {
6526 // reload q and qinv
6527 vs_ldpq(vq, dilithiumConsts); // qInv, q
6528 // load 8x4S coefficients via second start pos == c2
6529 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
6530 // load next 8x4S inputs == b
6531 vs_ldpq_post(vs2, zetas);
6532 // compute a == c2 * b mod MONT_Q
6533 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6534 // load 8x4s coefficients via first start pos == c1
6535 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6536 // compute a1 = c1 + a
6537 vs_addv(vs3, __ T4S, vs1, vs2);
6538 // compute a2 = c1 - a
6539 vs_subv(vs1, __ T4S, vs1, vs2);
6540 // output a1 and a2
6541 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6542 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
6543
6544 int k = 4 * level + i;
6545
6546 if (k > 7) {
6547 startIncr = 256;
6548 } else if (k == 5) {
6549 startIncr = 384;
6550 } else {
6551 startIncr = 128;
6552 }
6553
6554 c1Start += startIncr;
6555 c2Start += startIncr;
6556 }
6557
6558 c2 /= 2;
6559 }
6560 }
6561
6562 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
6563 // Implements the method
6564 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
// of the sun.security.provider.ML_DSA class.
6566 //
6567 // coeffs (int[256]) = c_rarg0
6568 // zetas (int[256]) = c_rarg1
6569 address generate_dilithiumAlmostNtt() {
6570
6571 __ align(CodeEntryAlignment);
6572 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
6573 StubCodeMark mark(this, stub_id);
6574 address start = __ pc();
6575 __ enter();
6576
6577 const Register coeffs = c_rarg0;
6578 const Register zetas = c_rarg1;
6579
6580 const Register tmpAddr = r9;
6581 const Register dilithiumConsts = r10;
6582 const Register result = r11;
6583 // don't use callee save registers v8 - v15
6584 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6585 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6586 VSeq<2> vq(30); // n.b. constants overlap vs3
6587 int offsets[4] = { 0, 32, 64, 96};
6588 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6589 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6590 __ add(result, coeffs, 0);
6591 __ lea(dilithiumConsts,
6592 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6593
6594 // Each level represents one iteration of the outer for loop of the Java version.
6595
6596 // level 0-4
6597 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
6598
6599 // level 5
6600
6601 // At level 5 the coefficients we need to combine with the zetas
6602 // are grouped in memory in blocks of size 4. So, for both sets of
6603 // coefficients we load 4 adjacent values at 8 different offsets
6604 // using an indexed ldr with register variant Q and multiply them
6605 // in sequence order by the next set of inputs. Likewise we store
// the results using an indexed str with register variant Q.
6607 for (int i = 0; i < 1024; i += 256) {
6608 // reload constants q, qinv each iteration as they get clobbered later
6609 vs_ldpq(vq, dilithiumConsts); // qInv, q
6610 // load 32 (8x4S) coefficients via first offsets = c1
6611 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6612 // load next 32 (8x4S) inputs = b
6613 vs_ldpq_post(vs2, zetas);
// a = b montmul c1
6615 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6616 // load 32 (8x4S) coefficients via second offsets = c2
6617 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
6618 // add/sub with result of multiply
vs_addv(vs3, __ T4S, vs1, vs2); // a0 = c2 + a
vs_subv(vs1, __ T4S, vs1, vs2); // a1 = c2 - a
6621 // write back new coefficients using same offsets
6622 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
6623 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
6624 }
6625
6626 // level 6
6627 // At level 6 the coefficients we need to combine with the zetas
// are grouped in memory in pairs, one member of each pair being a
// montmul input and the other an add/sub input. We can still implement
6630 // the montmul+sub+add using 4-way parallelism but only if we
6631 // combine the coefficients with the zetas 16 at a time. We load 8
6632 // adjacent values at 4 different offsets using an ld2 load with
6633 // arrangement 2D. That interleaves the lower and upper halves of
6634 // each pair of quadwords into successive vector registers. We
6635 // then need to montmul the 4 even elements of the coefficients
6636 // register sequence by the zetas in order and then add/sub the 4
6637 // odd elements of the coefficients register sequence. We use an
6638 // equivalent st2 operation to store the results back into memory
6639 // de-interleaved.
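//
// For example, for one register pair an ld2 with arrangement 2D
// de-interleaves four doublewords d0, d1, d2, d3 from memory as
//
//   ld2 {v0.2d, v1.2d}, [x]   // v0 <- {d0, d2}, v1 <- {d1, d3}
//
// i.e. the even-numbered register collects the first member of each
// pair and the odd-numbered register the second; st2 re-interleaves
// them on the way back to memory.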
6640 for (int i = 0; i < 1024; i += 128) {
6641 // reload constants q, qinv each iteration as they get clobbered later
6642 vs_ldpq(vq, dilithiumConsts); // qInv, q
6643 // load interleaved 16 (4x2D) coefficients via offsets
6644 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6645 // load next 16 (4x4S) inputs
6646 vs_ldpq_post(vs_front(vs2), zetas);
6647 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6648 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6649 vs_front(vs2), vtmp, vq);
6650 // store interleaved 16 (4x2D) coefficients via offsets
6651 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6652 }
6653
6654 // level 7
6655 // At level 7 the coefficients we need to combine with the zetas
// occur singly with montmul inputs alternating with add/sub
6657 // inputs. Once again we can use 4-way parallelism to combine 16
6658 // zetas at a time. However, we have to load 8 adjacent values at
6659 // 4 different offsets using an ld2 load with arrangement 4S. That
// interleaves the odd words of each pair into one
6661 // coefficients vector register and the even words of the pair
6662 // into the next register. We then need to montmul the 4 even
6663 // elements of the coefficients register sequence by the zetas in
6664 // order and then add/sub the 4 odd elements of the coefficients
6665 // register sequence. We use an equivalent st2 operation to store
6666 // the results back into memory de-interleaved.
6667
6668 for (int i = 0; i < 1024; i += 128) {
6669 // reload constants q, qinv each iteration as they get clobbered later
6670 vs_ldpq(vq, dilithiumConsts); // qInv, q
6671 // load interleaved 16 (4x4S) coefficients via offsets
6672 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6673 // load next 16 (4x4S) inputs
6674 vs_ldpq_post(vs_front(vs2), zetas);
6675 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6676 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6677 vs_front(vs2), vtmp, vq);
6678 // store interleaved 16 (4x4S) coefficients via offsets
6679 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6680 }
6681 __ leave(); // required for proper stackwalking of RuntimeStub frame
6682 __ mov(r0, zr); // return 0
6683 __ ret(lr);
6684
6685 return start;
6686 }
6687
6688 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6689 // in the Java implementation come in sequences of at least 8, so we
6690 // can use ldpq to collect the corresponding data into pairs of vector
6691 // registers
// We collect the coefficients that correspond to the 'j's into vs1,
// the coefficients that correspond to the 'j+l's into vs2, then
// do the additions into vs3 and the subtractions into vs1, then
6695 // save the result of the additions, load the zetas into vs2
6696 // do the (Montgomery) multiplications by zeta in parallel into vs2
6697 // finally save the results back to the coeffs array
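//
// For reference, the Java inner loop these levels implement is the
// usual Gentleman-Sande butterfly (a sketch, not the exact source):
//
//   for (int j = s; j < s + l; j++) {
//     int t = coeffs[j];
//     coeffs[j] = t + coeffs[j + l];
//     coeffs[j + l] = montMul(t - coeffs[j + l], zeta);
//   }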
6698 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
6699 const Register coeffs, const Register zetas) {
6700 int c1 = 0;
6701 int c2 = 32;
6702 int startIncr;
6703 int offsets[4];
6704 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6705 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6706 VSeq<2> vq(30); // n.b. constants overlap vs3
6707
6708 offsets[0] = 0;
6709
6710 for (int level = 3; level < 8; level++) {
6711 int c1Start = c1;
6712 int c2Start = c2;
6713 if (level == 3) {
6714 offsets[1] = 64;
6715 offsets[2] = 128;
6716 offsets[3] = 192;
6717 } else if (level == 4) {
6718 offsets[1] = 32;
6719 offsets[2] = 128;
6720 offsets[3] = 160;
6721 } else {
6722 offsets[1] = 32;
6723 offsets[2] = 64;
6724 offsets[3] = 96;
6725 }
6726
6727 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
6728 // time at 4 different offsets and multiply them in order by the
6729 // next set of input values. So we employ indexed load and store
6730 // pair instructions with arrangement 4S.
6731 for (int i = 0; i < 4; i++) {
6732 // load v1 32 (8x4S) coefficients relative to first start index
6733 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6734 // load v2 32 (8x4S) coefficients relative to second start index
6735 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
// a0 = v1 + v2 -- n.b. clobbers vq
6737 vs_addv(vs3, __ T4S, vs1, vs2);
6738 // a1 = v1 - v2
6739 vs_subv(vs1, __ T4S, vs1, vs2);
// save a0 relative to first start index
6741 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6742 // load constants q, qinv each iteration as they get clobbered above
6743 vs_ldpq(vq, dilithiumConsts); // qInv, q
6744 // load b next 32 (8x4S) inputs
6745 vs_ldpq_post(vs2, zetas);
6746 // a = a1 montmul b
6747 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6748 // save a relative to second start index
6749 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
6750
6751 int k = 4 * level + i;
6752
6753 if (k < 24) {
6754 startIncr = 256;
6755 } else if (k == 25) {
6756 startIncr = 384;
6757 } else {
6758 startIncr = 128;
6759 }
6760
6761 c1Start += startIncr;
6762 c2Start += startIncr;
6763 }
6764
6765 c2 *= 2;
6766 }
6767 }
6768
6769 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
6770 // Implements the method
6771 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
6772 // the sun.security.provider.ML_DSA class.
6773 //
6774 // coeffs (int[256]) = c_rarg0
6775 // zetas (int[256]) = c_rarg1
6776 address generate_dilithiumAlmostInverseNtt() {
6777
6778 __ align(CodeEntryAlignment);
6779 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
6780 StubCodeMark mark(this, stub_id);
6781 address start = __ pc();
6782 __ enter();
6783
6784 const Register coeffs = c_rarg0;
6785 const Register zetas = c_rarg1;
6786
6787 const Register tmpAddr = r9;
6788 const Register dilithiumConsts = r10;
6789 const Register result = r11;
6790 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6791 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6792 VSeq<2> vq(30); // n.b. constants overlap vs3
6793 int offsets[4] = { 0, 32, 64, 96 };
6794 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6795 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6796
6797 __ add(result, coeffs, 0);
6798 __ lea(dilithiumConsts,
6799 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6800
6801 // Each level represents one iteration of the outer for loop of the Java version
6802
6803 // level 0
6804 // At level 0 we need to interleave adjacent quartets of
6805 // coefficients before we multiply and add/sub by the next 16
6806 // zetas just as we did for level 7 in the multiply code. So we
6807 // load and store the values using an ld2/st2 with arrangement 4S.
6808 for (int i = 0; i < 1024; i += 128) {
6809 // load constants q, qinv
6810 // n.b. this can be moved out of the loop as they do not get
6811 // clobbered by first two loops
6812 vs_ldpq(vq, dilithiumConsts); // qInv, q
6813 // a0/a1 load interleaved 32 (8x4S) coefficients
6814 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
// b load next 16 (4x4S) inputs
6816 vs_ldpq_post(vs_front(vs2), zetas);
6817 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6818 // n.b. second half of vs2 provides temporary register storage
6819 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6820 vs_front(vs2), vs_back(vs2), vtmp, vq);
6821 // a0/a1 store interleaved 32 (8x4S) coefficients
6822 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6823 }
6824
6825 // level 1
6826 // At level 1 we need to interleave pairs of adjacent pairs of
6827 // coefficients before we multiply by the next 16 zetas just as we
6828 // did for level 6 in the multiply code. So we load and store the
6829 // values an ld2/st2 with arrangement 2D.
6830 for (int i = 0; i < 1024; i += 128) {
6831 // a0/a1 load interleaved 32 (8x2D) coefficients
6832 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6833 // b load next 16 (4x4S) inputs
6834 vs_ldpq_post(vs_front(vs2), zetas);
6835 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6836 // n.b. second half of vs2 provides temporary register storage
6837 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6838 vs_front(vs2), vs_back(vs2), vtmp, vq);
6839 // a0/a1 store interleaved 32 (8x2D) coefficients
6840 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6841 }
6842
6843 // level 2
6844 // At level 2 coefficients come in blocks of 4. So, we load 4
6845 // adjacent coefficients at 8 distinct offsets for both the first
6846 // and second coefficient sequences, using an ldr with register
6847 // variant Q then combine them with next set of 32 zetas. Likewise
6848 // we store the results using an str with register variant Q.
6849 for (int i = 0; i < 1024; i += 256) {
6850 // c0 load 32 (8x4S) coefficients via first offsets
6851 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6852 // c1 load 32 (8x4S) coefficients via second offsets
vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
6854 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
6855 vs_addv(vs3, __ T4S, vs1, vs2);
6856 // c = c0 - c1
6857 vs_subv(vs1, __ T4S, vs1, vs2);
6858 // store a0 32 (8x4S) coefficients via first offsets
6859 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
6860 // b load 32 (8x4S) next inputs
6861 vs_ldpq_post(vs2, zetas);
6862 // reload constants q, qinv -- they were clobbered earlier
6863 vs_ldpq(vq, dilithiumConsts); // qInv, q
6864 // compute a1 = b montmul c
6865 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6866 // store a1 32 (8x4S) coefficients via second offsets
6867 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
6868 }
6869
6870 // level 3-7
6871 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
6872
6873 __ leave(); // required for proper stackwalking of RuntimeStub frame
6874 __ mov(r0, zr); // return 0
6875 __ ret(lr);
6876
6877 return start;
6878 }
6879
6880 // Dilithium multiply polynomials in the NTT domain.
6881 // Straightforward implementation of the method
6882 // static int implDilithiumNttMult(
// int[] result, int[] ntta, int[] nttb) {} of
6884 // the sun.security.provider.ML_DSA class.
6885 //
6886 // result (int[256]) = c_rarg0
6887 // poly1 (int[256]) = c_rarg1
6888 // poly2 (int[256]) = c_rarg2
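//
// For reference, per coefficient this computes (a sketch, with
// montMul as above and RSQUARE assumed to be the Montgomery constant
// R^2 mod q, R = 2^32):
//
//   result[i] = montMul(montMul(poly1[i], poly2[i]), RSQUARE);
//
// The extra montmul by RSQUARE cancels the R^-1 factors introduced by
// the Montgomery multiplications, leaving poly1[i] * poly2[i] mod q.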
6889 address generate_dilithiumNttMult() {
6890
6891 __ align(CodeEntryAlignment);
6892 StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
6893 StubCodeMark mark(this, stub_id);
6894 address start = __ pc();
6895 __ enter();
6896
6897 Label L_loop;
6898
6899 const Register result = c_rarg0;
6900 const Register poly1 = c_rarg1;
6901 const Register poly2 = c_rarg2;
6902
6903 const Register dilithiumConsts = r10;
6904 const Register len = r11;
6905
6906 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6907 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6908 VSeq<2> vq(30); // n.b. constants overlap vs3
6909 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
6910
6911 __ lea(dilithiumConsts,
6912 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6913
6914 // load constants q, qinv
6915 vs_ldpq(vq, dilithiumConsts); // qInv, q
6916 // load constant rSquare into v29
6917 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
6918
6919 __ mov(len, zr);
6920 __ add(len, len, 1024);
6921
6922 __ BIND(L_loop);
6923
6924 // b load 32 (8x4S) next inputs from poly1
6925 vs_ldpq_post(vs1, poly1);
6926 // c load 32 (8x4S) next inputs from poly2
6927 vs_ldpq_post(vs2, poly2);
6928 // compute a = b montmul c
6929 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6930 // compute a = rsquare montmul a
6931 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
6932 // save a 32 (8x4S) results
6933 vs_stpq_post(vs2, result);
6934
6935 __ sub(len, len, 128);
6936 __ cmp(len, (u1)128);
6937 __ br(Assembler::GE, L_loop);
6938
6939 __ leave(); // required for proper stackwalking of RuntimeStub frame
6940 __ mov(r0, zr); // return 0
6941 __ ret(lr);
6942
6943 return start;
6944 }
6945
// Dilithium Montgomery multiply an array by a constant.
6947 // A straightforward implementation of the method
6948 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
// of the sun.security.provider.ML_DSA class
6950 //
6951 // coeffs (int[256]) = c_rarg0
6952 // constant (int) = c_rarg1
6953 address generate_dilithiumMontMulByConstant() {
6954
6955 __ align(CodeEntryAlignment);
6956 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
6957 StubCodeMark mark(this, stub_id);
6958 address start = __ pc();
6959 __ enter();
6960
6961 Label L_loop;
6962
6963 const Register coeffs = c_rarg0;
6964 const Register constant = c_rarg1;
6965
6966 const Register dilithiumConsts = r10;
6967 const Register result = r11;
6968 const Register len = r12;
6969
6970 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6971 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6972 VSeq<2> vq(30); // n.b. constants overlap vs3
6973 VSeq<8> vconst(29, 0); // for montmul by constant
6974
6975 // results track inputs
6976 __ add(result, coeffs, 0);
6977 __ lea(dilithiumConsts,
6978 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6979
6980 // load constants q, qinv -- they do not get clobbered by first two loops
6981 vs_ldpq(vq, dilithiumConsts); // qInv, q
6982 // copy caller supplied constant across vconst
6983 __ dup(vconst[0], __ T4S, constant);
6984 __ mov(len, zr);
6985 __ add(len, len, 1024);
6986
6987 __ BIND(L_loop);
6988
6989 // load next 32 inputs
6990 vs_ldpq_post(vs2, coeffs);
6991 // mont mul by constant
6992 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
6993 // write next 32 results
6994 vs_stpq_post(vs2, result);
6995
6996 __ sub(len, len, 128);
6997 __ cmp(len, (u1)128);
6998 __ br(Assembler::GE, L_loop);
6999
7000 __ leave(); // required for proper stackwalking of RuntimeStub frame
7001 __ mov(r0, zr); // return 0
7002 __ ret(lr);
7003
7004 return start;
7005 }
7006
7007 // Dilithium decompose poly.
7008 // Implements the method
// static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
//     int[] highPart, int twoGamma2, int multiplier) {}
7010 // of the sun.security.provider.ML_DSA class
7011 //
7012 // input (int[256]) = c_rarg0
7013 // lowPart (int[256]) = c_rarg1
7014 // highPart (int[256]) = c_rarg2
7015 // twoGamma2 (int) = c_rarg3
7016 // multiplier (int) = c_rarg4
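//
// In effect, for each coefficient r (first reduced to 0 <= r < q) the
// loop below computes a high/low split r = r1 * twoGamma2 + r0 with
// -gamma2 < r0 <= gamma2, except that when r - r0 == q - 1 the result
// is adjusted to r1 = 0 and r0 = r0 - 1, matching the ML-DSA
// decompose rule. (Summary only; the per-step Java statements appear
// as comments in the loop body.)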
7017 address generate_dilithiumDecomposePoly() {
7018
7019 __ align(CodeEntryAlignment);
7020 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
7021 StubCodeMark mark(this, stub_id);
7022 address start = __ pc();
7023 Label L_loop;
7024
7025 const Register input = c_rarg0;
7026 const Register lowPart = c_rarg1;
7027 const Register highPart = c_rarg2;
7028 const Register twoGamma2 = c_rarg3;
7029 const Register multiplier = c_rarg4;
7030
7031 const Register len = r9;
7032 const Register dilithiumConsts = r10;
7033 const Register tmp = r11;
7034
7035 // 6 independent sets of 4x4s values
7036 VSeq<4> vs1(0), vs2(4), vs3(8);
7037 VSeq<4> vs4(12), vs5(16), vtmp(20);
7038
7039 // 7 constants for cross-multiplying
7040 VSeq<4> one(25, 0);
7041 VSeq<4> qminus1(26, 0);
7042 VSeq<4> g2(27, 0);
7043 VSeq<4> twog2(28, 0);
7044 VSeq<4> mult(29, 0);
7045 VSeq<4> q(30, 0);
7046 VSeq<4> qadd(31, 0);
7047
7048 __ enter();
7049
7050 __ lea(dilithiumConsts,
7051 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7052
7053 // save callee-saved registers
7054 __ stpd(v8, v9, __ pre(sp, -64));
7055 __ stpd(v10, v11, Address(sp, 16));
7056 __ stpd(v12, v13, Address(sp, 32));
7057 __ stpd(v14, v15, Address(sp, 48));
7058
7059 // populate constant registers
7060 __ mov(tmp, zr);
7061 __ add(tmp, tmp, 1);
7062 __ dup(one[0], __ T4S, tmp); // 1
7063 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
7064 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
7065 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
7066 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
7067 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
7068 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
7069
7070 __ mov(len, zr);
7071 __ add(len, len, 1024);
7072
7073 __ BIND(L_loop);
7074
7075 // load next 4x4S inputs interleaved: rplus --> vs1
7076 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
7077
7078 // rplus = rplus - ((rplus + qadd) >> 23) * q
7079 vs_addv(vtmp, __ T4S, vs1, qadd);
7080 vs_sshr(vtmp, __ T4S, vtmp, 23);
7081 vs_mulv(vtmp, __ T4S, vtmp, q);
7082 vs_subv(vs1, __ T4S, vs1, vtmp);
7083
7084 // rplus = rplus + ((rplus >> 31) & dilithium_q);
7085 vs_sshr(vtmp, __ T4S, vs1, 31);
7086 vs_andr(vtmp, vtmp, q);
7087 vs_addv(vs1, __ T4S, vs1, vtmp);
7088
7089 // quotient --> vs2
7090 // int quotient = (rplus * multiplier) >> 22;
7091 vs_mulv(vtmp, __ T4S, vs1, mult);
7092 vs_sshr(vs2, __ T4S, vtmp, 22);
7093
7094 // r0 --> vs3
7095 // int r0 = rplus - quotient * twoGamma2;
7096 vs_mulv(vtmp, __ T4S, vs2, twog2);
7097 vs_subv(vs3, __ T4S, vs1, vtmp);
7098
7099 // mask --> vs4
7100 // int mask = (twoGamma2 - r0) >> 22;
7101 vs_subv(vtmp, __ T4S, twog2, vs3);
7102 vs_sshr(vs4, __ T4S, vtmp, 22);
7103
7104 // r0 -= (mask & twoGamma2);
7105 vs_andr(vtmp, vs4, twog2);
7106 vs_subv(vs3, __ T4S, vs3, vtmp);
7107
7108 // quotient += (mask & 1);
7109 vs_andr(vtmp, vs4, one);
7110 vs_addv(vs2, __ T4S, vs2, vtmp);
7111
7112 // mask = (twoGamma2 / 2 - r0) >> 31;
7113 vs_subv(vtmp, __ T4S, g2, vs3);
7114 vs_sshr(vs4, __ T4S, vtmp, 31);
7115
7116 // r0 -= (mask & twoGamma2);
7117 vs_andr(vtmp, vs4, twog2);
7118 vs_subv(vs3, __ T4S, vs3, vtmp);
7119
7120 // quotient += (mask & 1);
7121 vs_andr(vtmp, vs4, one);
7122 vs_addv(vs2, __ T4S, vs2, vtmp);
7123
7124 // r1 --> vs5
7125 // int r1 = rplus - r0 - (dilithium_q - 1);
7126 vs_subv(vtmp, __ T4S, vs1, vs3);
7127 vs_subv(vs5, __ T4S, vtmp, qminus1);
7128
7129 // r1 --> vs1 (overwriting rplus)
7130 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
7131 vs_negr(vtmp, __ T4S, vs5);
7132 vs_orr(vtmp, vs5, vtmp);
7133 vs_sshr(vs1, __ T4S, vtmp, 31);
7134
7135 // r0 += ~r1;
7136 vs_notr(vtmp, vs1);
7137 vs_addv(vs3, __ T4S, vs3, vtmp);
7138
7139 // r1 = r1 & quotient;
7140 vs_andr(vs1, vs2, vs1);
7141
// store results interleaved
7143 // lowPart[m] = r0;
7144 // highPart[m] = r1;
7145 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
7146 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
7147
7148 __ sub(len, len, 64);
7149 __ cmp(len, (u1)64);
7150 __ br(Assembler::GE, L_loop);
7151
7152 // restore callee-saved vector registers
7153 __ ldpd(v14, v15, Address(sp, 48));
7154 __ ldpd(v12, v13, Address(sp, 32));
7155 __ ldpd(v10, v11, Address(sp, 16));
7156 __ ldpd(v8, v9, __ post(sp, 64));
7157
7158 __ leave(); // required for proper stackwalking of RuntimeStub frame
7159 __ mov(r0, zr); // return 0
7160 __ ret(lr);
7161
7162 return start;
7163 }
7164
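// Performs the Keccak chi step on one 5-lane row held in a0..a4,
// i.e. a[i] ^= ~a[i+1] & a[i+2] (indices mod 5), using bic to form
// the ~x & y terms. The inputs are consumed before being overwritten.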
7165 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
7166 Register tmp0, Register tmp1, Register tmp2) {
7167 __ bic(tmp0, a2, a1); // for a0
7168 __ bic(tmp1, a3, a2); // for a1
7169 __ bic(tmp2, a4, a3); // for a2
7170 __ eor(a2, a2, tmp2);
7171 __ bic(tmp2, a0, a4); // for a3
7172 __ eor(a3, a3, tmp2);
7173 __ bic(tmp2, a1, a0); // for a4
7174 __ eor(a0, a0, tmp0);
7175 __ eor(a1, a1, tmp1);
7176 __ eor(a4, a4, tmp2);
7177 }
7178
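// One round of Keccak-f[1600] on the 25 state lanes held in GPRs
// a0..a24: theta (the eor3/rax1 block), rho and pi (the rotate
// sequence), chi (the bcax5 calls) and iota (the final eor with the
// round constant loaded from rc).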
7179 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
7180 Register a0, Register a1, Register a2, Register a3, Register a4,
7181 Register a5, Register a6, Register a7, Register a8, Register a9,
7182 Register a10, Register a11, Register a12, Register a13, Register a14,
7183 Register a15, Register a16, Register a17, Register a18, Register a19,
7184 Register a20, Register a21, Register a22, Register a23, Register a24,
7185 Register tmp0, Register tmp1, Register tmp2) {
7186 __ eor3(tmp1, a4, a9, a14);
7187 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
7188 __ eor3(tmp2, a1, a6, a11);
7189 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
7190 __ rax1(tmp2, tmp0, tmp1); // d0
7191 {
7192
7193 Register tmp3, tmp4;
7194 if (can_use_fp && can_use_r18) {
7195 tmp3 = rfp;
7196 tmp4 = r18_tls;
7197 } else {
7198 tmp3 = a4;
7199 tmp4 = a9;
7200 __ stp(tmp3, tmp4, __ pre(sp, -16));
7201 }
7202
7203 __ eor3(tmp3, a0, a5, a10);
7204 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
7205 __ eor(a0, a0, tmp2);
7206 __ eor(a5, a5, tmp2);
7207 __ eor(a10, a10, tmp2);
7208 __ eor(a15, a15, tmp2);
7209 __ eor(a20, a20, tmp2); // d0(tmp2)
7210 __ eor3(tmp3, a2, a7, a12);
7211 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
7212 __ rax1(tmp3, tmp4, tmp2); // d1
7213 __ eor(a1, a1, tmp3);
7214 __ eor(a6, a6, tmp3);
7215 __ eor(a11, a11, tmp3);
7216 __ eor(a16, a16, tmp3);
7217 __ eor(a21, a21, tmp3); // d1(tmp3)
7218 __ rax1(tmp3, tmp2, tmp0); // d3
7219 __ eor3(tmp2, a3, a8, a13);
7220 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
7221 __ eor(a3, a3, tmp3);
7222 __ eor(a8, a8, tmp3);
7223 __ eor(a13, a13, tmp3);
7224 __ eor(a18, a18, tmp3);
7225 __ eor(a23, a23, tmp3);
7226 __ rax1(tmp2, tmp1, tmp0); // d2
7227 __ eor(a2, a2, tmp2);
7228 __ eor(a7, a7, tmp2);
7229 __ eor(a12, a12, tmp2);
7230 __ rax1(tmp0, tmp0, tmp4); // d4
7231 if (!can_use_fp || !can_use_r18) {
7232 __ ldp(tmp3, tmp4, __ post(sp, 16));
7233 }
7234 __ eor(a17, a17, tmp2);
7235 __ eor(a22, a22, tmp2);
7236 __ eor(a4, a4, tmp0);
7237 __ eor(a9, a9, tmp0);
7238 __ eor(a14, a14, tmp0);
7239 __ eor(a19, a19, tmp0);
7240 __ eor(a24, a24, tmp0);
7241 }
7242
7243 __ rol(tmp0, a10, 3);
7244 __ rol(a10, a1, 1);
7245 __ rol(a1, a6, 44);
7246 __ rol(a6, a9, 20);
7247 __ rol(a9, a22, 61);
7248 __ rol(a22, a14, 39);
7249 __ rol(a14, a20, 18);
7250 __ rol(a20, a2, 62);
7251 __ rol(a2, a12, 43);
7252 __ rol(a12, a13, 25);
7253 __ rol(a13, a19, 8) ;
7254 __ rol(a19, a23, 56);
7255 __ rol(a23, a15, 41);
7256 __ rol(a15, a4, 27);
7257 __ rol(a4, a24, 14);
7258 __ rol(a24, a21, 2);
7259 __ rol(a21, a8, 55);
7260 __ rol(a8, a16, 45);
7261 __ rol(a16, a5, 36);
7262 __ rol(a5, a3, 28);
7263 __ rol(a3, a18, 21);
7264 __ rol(a18, a17, 15);
7265 __ rol(a17, a11, 10);
7266 __ rol(a11, a7, 6);
7267 __ mov(a7, tmp0);
7268
7269 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
7270 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
7271 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
7272 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
7273 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
7274
7275 __ ldr(tmp1, __ post(rc, 8));
7276 __ eor(a0, a0, tmp1);
7277
7278 }
7279
7280 // Arguments:
7281 //
7282 // Inputs:
7283 // c_rarg0 - byte[] source+offset
7284 // c_rarg1 - byte[] SHA.state
7285 // c_rarg2 - int block_size
7286 // c_rarg3 - int offset
7287 // c_rarg4 - int limit
7288 //
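// block_size is the sponge rate in bytes and selects the variant:
// 72 -> SHA3-512, 104 -> SHA3-384, 136 -> SHA3-256 or SHAKE256,
// 144 -> SHA3-224, 168 -> SHAKE128 (see the bit tests on block_size
// below).
//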
7289 address generate_sha3_implCompress_gpr(StubId stub_id) {
7290 bool multi_block;
7291 switch (stub_id) {
7292 case StubId::stubgen_sha3_implCompress_id:
7293 multi_block = false;
7294 break;
7295 case StubId::stubgen_sha3_implCompressMB_id:
7296 multi_block = true;
7297 break;
7298 default:
7299 ShouldNotReachHere();
7300 }
7301
7302 static const uint64_t round_consts[24] = {
7303 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
7304 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
7305 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
7306 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
7307 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
7308 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
7309 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
7310 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
7311 };
7312
7313 __ align(CodeEntryAlignment);
7314 StubCodeMark mark(this, stub_id);
7315 address start = __ pc();
7316
7317 Register buf = c_rarg0;
7318 Register state = c_rarg1;
7319 Register block_size = c_rarg2;
7320 Register ofs = c_rarg3;
7321 Register limit = c_rarg4;
7322
// use r3..r17, r19..r28 to keep a0..a24.
7324 // a0..a24 are respective locals from SHA3.java
7325 Register a0 = r25,
7326 a1 = r26,
7327 a2 = r27,
7328 a3 = r3,
7329 a4 = r4,
7330 a5 = r5,
7331 a6 = r6,
7332 a7 = r7,
7333 a8 = rscratch1, // r8
7334 a9 = rscratch2, // r9
7335 a10 = r10,
7336 a11 = r11,
7337 a12 = r12,
7338 a13 = r13,
7339 a14 = r14,
7340 a15 = r15,
7341 a16 = r16,
7342 a17 = r17,
7343 a18 = r28,
7344 a19 = r19,
7345 a20 = r20,
7346 a21 = r21,
7347 a22 = r22,
7348 a23 = r23,
7349 a24 = r24;
7350
7351 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
7352
7353 Label sha3_loop, rounds24_preloop, loop_body;
7354 Label sha3_512_or_sha3_384, shake128;
7355
7356 bool can_use_r18 = false;
7357 #ifndef R18_RESERVED
7358 can_use_r18 = true;
7359 #endif
7360 bool can_use_fp = !PreserveFramePointer;
7361
7362 __ enter();
7363
// save almost all of the as yet unsaved gpr registers on the stack
7365 __ str(block_size, __ pre(sp, -128));
7366 if (multi_block) {
7367 __ stpw(ofs, limit, Address(sp, 8));
7368 }
7369 // 8 bytes at sp+16 will be used to keep buf
7370 __ stp(r19, r20, Address(sp, 32));
7371 __ stp(r21, r22, Address(sp, 48));
7372 __ stp(r23, r24, Address(sp, 64));
7373 __ stp(r25, r26, Address(sp, 80));
7374 __ stp(r27, r28, Address(sp, 96));
7375 if (can_use_r18 && can_use_fp) {
7376 __ stp(r18_tls, state, Address(sp, 112));
7377 } else {
7378 __ str(state, Address(sp, 112));
7379 }
7380
// begin sha3 calculations: load a0..a24 from the state array
7382 __ ldp(a0, a1, state);
7383 __ ldp(a2, a3, Address(state, 16));
7384 __ ldp(a4, a5, Address(state, 32));
7385 __ ldp(a6, a7, Address(state, 48));
7386 __ ldp(a8, a9, Address(state, 64));
7387 __ ldp(a10, a11, Address(state, 80));
7388 __ ldp(a12, a13, Address(state, 96));
7389 __ ldp(a14, a15, Address(state, 112));
7390 __ ldp(a16, a17, Address(state, 128));
7391 __ ldp(a18, a19, Address(state, 144));
7392 __ ldp(a20, a21, Address(state, 160));
7393 __ ldp(a22, a23, Address(state, 176));
7394 __ ldr(a24, Address(state, 192));
7395
7396 __ BIND(sha3_loop);
7397
7398 // load input
7399 __ ldp(tmp3, tmp2, __ post(buf, 16));
7400 __ eor(a0, a0, tmp3);
7401 __ eor(a1, a1, tmp2);
7402 __ ldp(tmp3, tmp2, __ post(buf, 16));
7403 __ eor(a2, a2, tmp3);
7404 __ eor(a3, a3, tmp2);
7405 __ ldp(tmp3, tmp2, __ post(buf, 16));
7406 __ eor(a4, a4, tmp3);
7407 __ eor(a5, a5, tmp2);
7408 __ ldr(tmp3, __ post(buf, 8));
7409 __ eor(a6, a6, tmp3);
7410
7411 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
7412 __ tbz(block_size, 7, sha3_512_or_sha3_384);
7413
7414 __ ldp(tmp3, tmp2, __ post(buf, 16));
7415 __ eor(a7, a7, tmp3);
7416 __ eor(a8, a8, tmp2);
7417 __ ldp(tmp3, tmp2, __ post(buf, 16));
7418 __ eor(a9, a9, tmp3);
7419 __ eor(a10, a10, tmp2);
7420 __ ldp(tmp3, tmp2, __ post(buf, 16));
7421 __ eor(a11, a11, tmp3);
7422 __ eor(a12, a12, tmp2);
7423 __ ldp(tmp3, tmp2, __ post(buf, 16));
7424 __ eor(a13, a13, tmp3);
7425 __ eor(a14, a14, tmp2);
7426 __ ldp(tmp3, tmp2, __ post(buf, 16));
7427 __ eor(a15, a15, tmp3);
7428 __ eor(a16, a16, tmp2);
7429
7430 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
7431 __ andw(tmp2, block_size, 48);
7432 __ cbzw(tmp2, rounds24_preloop);
7433 __ tbnz(block_size, 5, shake128);
// block_size == 144, bit5 == 0, SHA3-224
7435 __ ldr(tmp3, __ post(buf, 8));
7436 __ eor(a17, a17, tmp3);
7437 __ b(rounds24_preloop);
7438
7439 __ BIND(shake128);
7440 __ ldp(tmp3, tmp2, __ post(buf, 16));
7441 __ eor(a17, a17, tmp3);
7442 __ eor(a18, a18, tmp2);
7443 __ ldp(tmp3, tmp2, __ post(buf, 16));
7444 __ eor(a19, a19, tmp3);
7445 __ eor(a20, a20, tmp2);
7446 __ b(rounds24_preloop); // block_size == 168, SHAKE128
7447
7448 __ BIND(sha3_512_or_sha3_384);
7449 __ ldp(tmp3, tmp2, __ post(buf, 16));
7450 __ eor(a7, a7, tmp3);
7451 __ eor(a8, a8, tmp2);
7452 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
7453
7454 // SHA3-384
7455 __ ldp(tmp3, tmp2, __ post(buf, 16));
7456 __ eor(a9, a9, tmp3);
7457 __ eor(a10, a10, tmp2);
7458 __ ldp(tmp3, tmp2, __ post(buf, 16));
7459 __ eor(a11, a11, tmp3);
7460 __ eor(a12, a12, tmp2);
7461
7462 __ BIND(rounds24_preloop);
7463 __ fmovs(v0, 24.0); // float loop counter,
7464 __ fmovs(v1, 1.0); // exact representation
7465
7466 __ str(buf, Address(sp, 16));
7467 __ lea(tmp3, ExternalAddress((address) round_consts));
7468
7469 __ BIND(loop_body);
7470 keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
7471 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
7472 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
7473 tmp0, tmp1, tmp2);
7474 __ fsubs(v0, v0, v1);
7475 __ fcmps(v0, 0.0);
7476 __ br(__ NE, loop_body);
7477
7478 if (multi_block) {
7479 __ ldrw(block_size, sp); // block_size
7480 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
7481 __ addw(tmp2, tmp2, block_size);
7482 __ cmpw(tmp2, tmp1);
7483 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
7484 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
7485 __ br(Assembler::LE, sha3_loop);
7486 __ movw(c_rarg0, tmp2); // return offset
7487 }
7488 if (can_use_fp && can_use_r18) {
7489 __ ldp(r18_tls, state, Address(sp, 112));
7490 } else {
7491 __ ldr(state, Address(sp, 112));
7492 }
7493 // save calculated sha3 state
7494 __ stp(a0, a1, Address(state));
7495 __ stp(a2, a3, Address(state, 16));
7496 __ stp(a4, a5, Address(state, 32));
7497 __ stp(a6, a7, Address(state, 48));
7498 __ stp(a8, a9, Address(state, 64));
7499 __ stp(a10, a11, Address(state, 80));
7500 __ stp(a12, a13, Address(state, 96));
7501 __ stp(a14, a15, Address(state, 112));
7502 __ stp(a16, a17, Address(state, 128));
7503 __ stp(a18, a19, Address(state, 144));
7504 __ stp(a20, a21, Address(state, 160));
7505 __ stp(a22, a23, Address(state, 176));
7506 __ str(a24, Address(state, 192));
7507
7508 // restore required registers from stack
7509 __ ldp(r19, r20, Address(sp, 32));
7510 __ ldp(r21, r22, Address(sp, 48));
7511 __ ldp(r23, r24, Address(sp, 64));
7512 __ ldp(r25, r26, Address(sp, 80));
7513 __ ldp(r27, r28, Address(sp, 96));
7514 if (can_use_fp && can_use_r18) {
7515 __ add(rfp, sp, 128); // leave() will copy rfp to sp below
7516 } // else no need to recalculate rfp, since it wasn't changed
7517
7518 __ leave();
7519
7520 __ ret(lr);
7521
7522 return start;
7523 }
7524
7525 /**
7526 * Arguments:
7527 *
7528 * Inputs:
7529 * c_rarg0 - int crc
7530 * c_rarg1 - byte* buf
7531 * c_rarg2 - int length
7532 *
7533 * Output:
* r0 - int crc result
7535 */
7536 address generate_updateBytesCRC32() {
7537 assert(UseCRC32Intrinsics, "what are we doing here?");
7538
7539 __ align(CodeEntryAlignment);
7540 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7541 StubCodeMark mark(this, stub_id);
7542
7543 address start = __ pc();
7544
7545 const Register crc = c_rarg0; // crc
7546 const Register buf = c_rarg1; // source java byte array address
7547 const Register len = c_rarg2; // length
7548 const Register table0 = c_rarg3; // crc_table address
7549 const Register table1 = c_rarg4;
7550 const Register table2 = c_rarg5;
7551 const Register table3 = c_rarg6;
7552 const Register tmp3 = c_rarg7;
7553
7554 BLOCK_COMMENT("Entry:");
7555 __ enter(); // required for proper stackwalking of RuntimeStub frame
7556
7557 __ kernel_crc32(crc, buf, len,
7558 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7559
7560 __ leave(); // required for proper stackwalking of RuntimeStub frame
7561 __ ret(lr);
7562
7563 return start;
7564 }
7565
7566 /**
7567 * Arguments:
7568 *
7569 * Inputs:
7570 * c_rarg0 - int crc
7571 * c_rarg1 - byte* buf
7572 * c_rarg2 - int length
7573 * c_rarg3 - int* table
7574 *
7575 * Output:
7576 * r0 - int crc result
7577 */
7578 address generate_updateBytesCRC32C() {
7579 assert(UseCRC32CIntrinsics, "what are we doing here?");
7580
7581 __ align(CodeEntryAlignment);
7582 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
7583 StubCodeMark mark(this, stub_id);
7584
7585 address start = __ pc();
7586
7587 const Register crc = c_rarg0; // crc
7588 const Register buf = c_rarg1; // source java byte array address
7589 const Register len = c_rarg2; // length
7590 const Register table0 = c_rarg3; // crc_table address
7591 const Register table1 = c_rarg4;
7592 const Register table2 = c_rarg5;
7593 const Register table3 = c_rarg6;
7594 const Register tmp3 = c_rarg7;
7595
7596 BLOCK_COMMENT("Entry:");
7597 __ enter(); // required for proper stackwalking of RuntimeStub frame
7598
7599 __ kernel_crc32c(crc, buf, len,
7600 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7601
7602 __ leave(); // required for proper stackwalking of RuntimeStub frame
7603 __ ret(lr);
7604
7605 return start;
7606 }
7607
7608 /***
7609 * Arguments:
7610 *
7611 * Inputs:
7612 * c_rarg0 - int adler
7613 * c_rarg1 - byte* buff
7614 * c_rarg2 - int len
7615 *
7616 * Output:
7617 * c_rarg0 - int adler result
7618 */
7619 address generate_updateBytesAdler32() {
7620 __ align(CodeEntryAlignment);
7621 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
7622 StubCodeMark mark(this, stub_id);
7623 address start = __ pc();
7624
7625 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
7626
7627 // Aliases
7628 Register adler = c_rarg0;
7629 Register s1 = c_rarg0;
7630 Register s2 = c_rarg3;
7631 Register buff = c_rarg1;
7632 Register len = c_rarg2;
7633 Register nmax = r4;
7634 Register base = r5;
7635 Register count = r6;
7636 Register temp0 = rscratch1;
7637 Register temp1 = rscratch2;
7638 FloatRegister vbytes = v0;
7639 FloatRegister vs1acc = v1;
7640 FloatRegister vs2acc = v2;
7641 FloatRegister vtable = v3;
7642
7643 // Max number of bytes we can process before having to take the mod
7644 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
7645 uint64_t BASE = 0xfff1;
7646 uint64_t NMAX = 0x15B0;
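
// Scalar reference for what follows (a sketch): for each input byte b
//   s1 = (s1 + b) % BASE;
//   s2 = (s2 + s1) % BASE;
// and the final checksum is (s2 << 16) | s1. The code below defers
// the expensive % BASE to once every NMAX bytes, which is safe because
// the accumulators cannot overflow 32 bits within NMAX iterations.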
7647
7648 __ mov(base, BASE);
7649 __ mov(nmax, NMAX);
7650
7651 // Load accumulation coefficients for the upper 16 bits
7652 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
7653 __ ld1(vtable, __ T16B, Address(temp0));
7654
7655 // s1 is initialized to the lower 16 bits of adler
7656 // s2 is initialized to the upper 16 bits of adler
7657 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
7658 __ uxth(s1, adler); // s1 = (adler & 0xffff)
7659
// The pipelined loop needs at least 16 elements for one iteration.
// It would check this itself, but it is more efficient to branch
// straight to the cleanup loop for short inputs.
7662 __ cmp(len, (u1)16);
7663 __ br(Assembler::HS, L_nmax);
7664 __ cbz(len, L_combine);
7665
7666 __ bind(L_simple_by1_loop);
7667 __ ldrb(temp0, Address(__ post(buff, 1)));
7668 __ add(s1, s1, temp0);
7669 __ add(s2, s2, s1);
7670 __ subs(len, len, 1);
7671 __ br(Assembler::HI, L_simple_by1_loop);
7672
7673 // s1 = s1 % BASE
7674 __ subs(temp0, s1, base);
7675 __ csel(s1, temp0, s1, Assembler::HS);
7676
7677 // s2 = s2 % BASE
7678 __ lsr(temp0, s2, 16);
7679 __ lsl(temp1, temp0, 4);
7680 __ sub(temp1, temp1, temp0);
7681 __ add(s2, temp1, s2, ext::uxth);
7682
7683 __ subs(temp0, s2, base);
7684 __ csel(s2, temp0, s2, Assembler::HS);
7685
7686 __ b(L_combine);
7687
7688 __ bind(L_nmax);
7689 __ subs(len, len, nmax);
7690 __ sub(count, nmax, 16);
7691 __ br(Assembler::LO, L_by16);
7692
7693 __ bind(L_nmax_loop);
7694
7695 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7696 vbytes, vs1acc, vs2acc, vtable);
7697
7698 __ subs(count, count, 16);
7699 __ br(Assembler::HS, L_nmax_loop);
7700
7701 // s1 = s1 % BASE
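// The folding below uses 2^16 congruent to 15 (mod BASE = 65521), so
// x mod BASE can be narrowed as (x >> 16) * 15 + (x & 0xffff); two
// rounds plus a final conditional subtract fully reduce the value.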
7702 __ lsr(temp0, s1, 16);
7703 __ lsl(temp1, temp0, 4);
7704 __ sub(temp1, temp1, temp0);
7705 __ add(temp1, temp1, s1, ext::uxth);
7706
7707 __ lsr(temp0, temp1, 16);
7708 __ lsl(s1, temp0, 4);
7709 __ sub(s1, s1, temp0);
7710 __ add(s1, s1, temp1, ext:: uxth);
7711
7712 __ subs(temp0, s1, base);
7713 __ csel(s1, temp0, s1, Assembler::HS);
7714
7715 // s2 = s2 % BASE
7716 __ lsr(temp0, s2, 16);
7717 __ lsl(temp1, temp0, 4);
7718 __ sub(temp1, temp1, temp0);
7719 __ add(temp1, temp1, s2, ext::uxth);
7720
7721 __ lsr(temp0, temp1, 16);
7722 __ lsl(s2, temp0, 4);
7723 __ sub(s2, s2, temp0);
7724 __ add(s2, s2, temp1, ext:: uxth);
7725
7726 __ subs(temp0, s2, base);
7727 __ csel(s2, temp0, s2, Assembler::HS);
7728
7729 __ subs(len, len, nmax);
7730 __ sub(count, nmax, 16);
7731 __ br(Assembler::HS, L_nmax_loop);
7732
7733 __ bind(L_by16);
7734 __ adds(len, len, count);
7735 __ br(Assembler::LO, L_by1);
7736
7737 __ bind(L_by16_loop);
7738
7739 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7740 vbytes, vs1acc, vs2acc, vtable);
7741
7742 __ subs(len, len, 16);
7743 __ br(Assembler::HS, L_by16_loop);
7744
7745 __ bind(L_by1);
7746 __ adds(len, len, 15);
7747 __ br(Assembler::LO, L_do_mod);
7748
7749 __ bind(L_by1_loop);
7750 __ ldrb(temp0, Address(__ post(buff, 1)));
7751 __ add(s1, temp0, s1);
7752 __ add(s2, s2, s1);
7753 __ subs(len, len, 1);
7754 __ br(Assembler::HS, L_by1_loop);
7755
7756 __ bind(L_do_mod);
7757 // s1 = s1 % BASE
7758 __ lsr(temp0, s1, 16);
7759 __ lsl(temp1, temp0, 4);
7760 __ sub(temp1, temp1, temp0);
7761 __ add(temp1, temp1, s1, ext::uxth);
7762
7763 __ lsr(temp0, temp1, 16);
7764 __ lsl(s1, temp0, 4);
7765 __ sub(s1, s1, temp0);
7766 __ add(s1, s1, temp1, ext:: uxth);
7767
7768 __ subs(temp0, s1, base);
7769 __ csel(s1, temp0, s1, Assembler::HS);
7770
7771 // s2 = s2 % BASE
7772 __ lsr(temp0, s2, 16);
7773 __ lsl(temp1, temp0, 4);
7774 __ sub(temp1, temp1, temp0);
7775 __ add(temp1, temp1, s2, ext::uxth);
7776
7777 __ lsr(temp0, temp1, 16);
7778 __ lsl(s2, temp0, 4);
7779 __ sub(s2, s2, temp0);
7780 __ add(s2, s2, temp1, ext:: uxth);
7781
7782 __ subs(temp0, s2, base);
7783 __ csel(s2, temp0, s2, Assembler::HS);
7784
7785 // Combine lower bits and higher bits
7786 __ bind(L_combine);
7787 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
7788
7789 __ ret(lr);
7790
7791 return start;
7792 }
7793
7794 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
7795 Register temp0, Register temp1, FloatRegister vbytes,
7796 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
7797 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
7798 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
7799 // In non-vectorized code, we update s1 and s2 as:
7800 // s1 <- s1 + b1
7801 // s2 <- s2 + s1
7802 // s1 <- s1 + b2
    //   s2 <- s2 + s1
7804 // ...
7805 // s1 <- s1 + b16
7806 // s2 <- s2 + s1
7807 // Putting above assignments together, we have:
7808 // s1_new = s1 + b1 + b2 + ... + b16
7809 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
7810 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
7811 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
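    //
    // A scalar sketch of the same 16-byte update, for illustration only
    // ('b' points at the 16 freshly loaded bytes):
    //
    //   uint32_t sum = 0, dot = 0;
    //   for (int i = 0; i < 16; i++) {
    //     sum += b[i];
    //     dot += b[i] * (16 - i);
    //   }
    //   s2 += s1 * 16 + dot;   // uses the old s1
    //   s1 += sum;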
7812 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
7813
7814 // s2 = s2 + s1 * 16
7815 __ add(s2, s2, s1, Assembler::LSL, 4);
7816
7817 // vs1acc = b1 + b2 + b3 + ... + b16
7818 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
7819 __ umullv(vs2acc, __ T8B, vtable, vbytes);
7820 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
7821 __ uaddlv(vs1acc, __ T16B, vbytes);
7822 __ uaddlv(vs2acc, __ T8H, vs2acc);
7823
7824 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
7825 __ fmovd(temp0, vs1acc);
7826 __ fmovd(temp1, vs2acc);
7827 __ add(s1, s1, temp0);
7828 __ add(s2, s2, temp1);
7829 }
7830
7831 /**
7832 * Arguments:
7833 *
7834 * Input:
7835 * c_rarg0 - x address
7836 * c_rarg1 - x length
7837 * c_rarg2 - y address
7838 * c_rarg3 - y length
7839 * c_rarg4 - z address
7840 */
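  // The stub computes z = x * y for int-array magnitudes. Conceptually it is
  // a word-by-word schoolbook multiplication of the following shape (word
  // order and carry handling simplified; this is an illustrative sketch, not
  // the exact algorithm used by multiply_to_len):
  //
  //   void multiply_to_len_sketch(const uint32_t* x, int xlen,
  //                               const uint32_t* y, int ylen, uint32_t* z) {
  //     memset(z, 0, (xlen + ylen) * sizeof(uint32_t));
  //     for (int i = 0; i < xlen; i++) {
  //       uint64_t carry = 0;
  //       for (int j = 0; j < ylen; j++) {
  //         uint64_t t = (uint64_t)x[i] * y[j] + z[i + j] + carry;
  //         z[i + j] = (uint32_t)t;
  //         carry = t >> 32;
  //       }
  //       z[i + ylen] = (uint32_t)carry;
  //     }
  //   }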
7841 address generate_multiplyToLen() {
7842 __ align(CodeEntryAlignment);
7843 StubId stub_id = StubId::stubgen_multiplyToLen_id;
7844 StubCodeMark mark(this, stub_id);
7845
7846 address start = __ pc();
7847 const Register x = r0;
7848 const Register xlen = r1;
7849 const Register y = r2;
7850 const Register ylen = r3;
7851 const Register z = r4;
7852
7853 const Register tmp0 = r5;
7854 const Register tmp1 = r10;
7855 const Register tmp2 = r11;
7856 const Register tmp3 = r12;
7857 const Register tmp4 = r13;
7858 const Register tmp5 = r14;
7859 const Register tmp6 = r15;
7860 const Register tmp7 = r16;
7861
7862 BLOCK_COMMENT("Entry:");
7863 __ enter(); // required for proper stackwalking of RuntimeStub frame
7864 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7865 __ leave(); // required for proper stackwalking of RuntimeStub frame
7866 __ ret(lr);
7867
7868 return start;
7869 }
7870
7871 address generate_squareToLen() {
    // The squareToLen algorithm for sizes 1..127, as described in the Java
    // code, is faster than multiply_to_len on some CPUs and slower on others,
    // but multiply_to_len gives slightly better results overall.
7875 __ align(CodeEntryAlignment);
7876 StubId stub_id = StubId::stubgen_squareToLen_id;
7877 StubCodeMark mark(this, stub_id);
7878 address start = __ pc();
7879
7880 const Register x = r0;
7881 const Register xlen = r1;
7882 const Register z = r2;
7883 const Register y = r4; // == x
7884 const Register ylen = r5; // == xlen
7885
7886 const Register tmp0 = r3;
7887 const Register tmp1 = r10;
7888 const Register tmp2 = r11;
7889 const Register tmp3 = r12;
7890 const Register tmp4 = r13;
7891 const Register tmp5 = r14;
7892 const Register tmp6 = r15;
7893 const Register tmp7 = r16;
7894
7895 RegSet spilled_regs = RegSet::of(y, ylen);
7896 BLOCK_COMMENT("Entry:");
7897 __ enter();
7898 __ push(spilled_regs, sp);
7899 __ mov(y, x);
7900 __ mov(ylen, xlen);
7901 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7902 __ pop(spilled_regs, sp);
7903 __ leave();
7904 __ ret(lr);
7905 return start;
7906 }
7907
7908 address generate_mulAdd() {
7909 __ align(CodeEntryAlignment);
7910 StubId stub_id = StubId::stubgen_mulAdd_id;
7911 StubCodeMark mark(this, stub_id);
7912
7913 address start = __ pc();
7914
7915 const Register out = r0;
7916 const Register in = r1;
7917 const Register offset = r2;
7918 const Register len = r3;
7919 const Register k = r4;
7920
7921 BLOCK_COMMENT("Entry:");
7922 __ enter();
7923 __ mul_add(out, in, offset, len, k);
7924 __ leave();
7925 __ ret(lr);
7926
7927 return start;
7928 }
7929
7930 // Arguments:
7931 //
7932 // Input:
7933 // c_rarg0 - newArr address
7934 // c_rarg1 - oldArr address
7935 // c_rarg2 - newIdx
7936 // c_rarg3 - shiftCount
7937 // c_rarg4 - numIter
7938 //
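  // The stub is roughly equivalent to the following scalar loop (illustrative
  // sketch only; assumes 0 < shiftCount < 32 and that oldArr has numIter + 1
  // words available):
  //
  //   void right_shift_sketch(uint32_t* newArr, const uint32_t* oldArr,
  //                           int newIdx, int shiftCount, int numIter) {
  //     for (int i = 0; i < numIter; i++) {
  //       newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount) |
  //                            (oldArr[i] << (32 - shiftCount));
  //     }
  //   }
  //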
7939 address generate_bigIntegerRightShift() {
7940 __ align(CodeEntryAlignment);
7941 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
7942 StubCodeMark mark(this, stub_id);
7943 address start = __ pc();
7944
7945 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
7946
7947 Register newArr = c_rarg0;
7948 Register oldArr = c_rarg1;
7949 Register newIdx = c_rarg2;
7950 Register shiftCount = c_rarg3;
7951 Register numIter = c_rarg4;
7952 Register idx = numIter;
7953
7954 Register newArrCur = rscratch1;
7955 Register shiftRevCount = rscratch2;
7956 Register oldArrCur = r13;
7957 Register oldArrNext = r14;
7958
7959 FloatRegister oldElem0 = v0;
7960 FloatRegister oldElem1 = v1;
7961 FloatRegister newElem = v2;
7962 FloatRegister shiftVCount = v3;
7963 FloatRegister shiftVRevCount = v4;
7964
7965 __ cbz(idx, Exit);
7966
7967 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
7968
7969 // left shift count
7970 __ movw(shiftRevCount, 32);
7971 __ subw(shiftRevCount, shiftRevCount, shiftCount);
7972
    // numIter is too small for a 4-word SIMD loop; fall back to scalar code
7974 __ cmp(numIter, (u1)4);
7975 __ br(Assembler::LT, ShiftThree);
7976
7977 __ dup(shiftVCount, __ T4S, shiftCount);
7978 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
7979 __ negr(shiftVCount, __ T4S, shiftVCount);
7980
7981 __ BIND(ShiftSIMDLoop);
7982
7983 // Calculate the load addresses
7984 __ sub(idx, idx, 4);
7985 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
7986 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
7987 __ add(oldArrCur, oldArrNext, 4);
7988
7989 // Load 4 words and process
7990 __ ld1(oldElem0, __ T4S, Address(oldArrCur));
7991 __ ld1(oldElem1, __ T4S, Address(oldArrNext));
7992 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
7993 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
7994 __ orr(newElem, __ T16B, oldElem0, oldElem1);
7995 __ st1(newElem, __ T4S, Address(newArrCur));
7996
7997 __ cmp(idx, (u1)4);
7998 __ br(Assembler::LT, ShiftTwoLoop);
7999 __ b(ShiftSIMDLoop);
8000
8001 __ BIND(ShiftTwoLoop);
8002 __ cbz(idx, Exit);
8003 __ cmp(idx, (u1)1);
8004 __ br(Assembler::EQ, ShiftOne);
8005
8006 // Calculate the load addresses
8007 __ sub(idx, idx, 2);
8008 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
8009 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
8010 __ add(oldArrCur, oldArrNext, 4);
8011
8012 // Load 2 words and process
8013 __ ld1(oldElem0, __ T2S, Address(oldArrCur));
8014 __ ld1(oldElem1, __ T2S, Address(oldArrNext));
8015 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8016 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8017 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8018 __ st1(newElem, __ T2S, Address(newArrCur));
8019 __ b(ShiftTwoLoop);
8020
8021 __ BIND(ShiftThree);
8022 __ tbz(idx, 1, ShiftOne);
8023 __ tbz(idx, 0, ShiftTwo);
8024 __ ldrw(r10, Address(oldArr, 12));
8025 __ ldrw(r11, Address(oldArr, 8));
8026 __ lsrvw(r10, r10, shiftCount);
8027 __ lslvw(r11, r11, shiftRevCount);
8028 __ orrw(r12, r10, r11);
8029 __ strw(r12, Address(newArr, 8));
8030
8031 __ BIND(ShiftTwo);
8032 __ ldrw(r10, Address(oldArr, 8));
8033 __ ldrw(r11, Address(oldArr, 4));
8034 __ lsrvw(r10, r10, shiftCount);
8035 __ lslvw(r11, r11, shiftRevCount);
8036 __ orrw(r12, r10, r11);
8037 __ strw(r12, Address(newArr, 4));
8038
8039 __ BIND(ShiftOne);
8040 __ ldrw(r10, Address(oldArr, 4));
8041 __ ldrw(r11, Address(oldArr));
8042 __ lsrvw(r10, r10, shiftCount);
8043 __ lslvw(r11, r11, shiftRevCount);
8044 __ orrw(r12, r10, r11);
8045 __ strw(r12, Address(newArr));
8046
8047 __ BIND(Exit);
8048 __ ret(lr);
8049
8050 return start;
8051 }
8052
8053 // Arguments:
8054 //
8055 // Input:
8056 // c_rarg0 - newArr address
8057 // c_rarg1 - oldArr address
8058 // c_rarg2 - newIdx
8059 // c_rarg3 - shiftCount
8060 // c_rarg4 - numIter
8061 //
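  // The stub is roughly equivalent to the following scalar loop (illustrative
  // sketch only; assumes 0 < shiftCount < 32 and that oldArr has numIter + 1
  // words available):
  //
  //   void left_shift_sketch(uint32_t* newArr, const uint32_t* oldArr,
  //                          int newIdx, int shiftCount, int numIter) {
  //     for (int i = 0; i < numIter; i++) {
  //       newArr[newIdx + i] = (oldArr[i] << shiftCount) |
  //                            (oldArr[i + 1] >> (32 - shiftCount));
  //     }
  //   }
  //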
8062 address generate_bigIntegerLeftShift() {
8063 __ align(CodeEntryAlignment);
8064 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
8065 StubCodeMark mark(this, stub_id);
8066 address start = __ pc();
8067
8068 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
8069
8070 Register newArr = c_rarg0;
8071 Register oldArr = c_rarg1;
8072 Register newIdx = c_rarg2;
8073 Register shiftCount = c_rarg3;
8074 Register numIter = c_rarg4;
8075
8076 Register shiftRevCount = rscratch1;
8077 Register oldArrNext = rscratch2;
8078
8079 FloatRegister oldElem0 = v0;
8080 FloatRegister oldElem1 = v1;
8081 FloatRegister newElem = v2;
8082 FloatRegister shiftVCount = v3;
8083 FloatRegister shiftVRevCount = v4;
8084
8085 __ cbz(numIter, Exit);
8086
8087 __ add(oldArrNext, oldArr, 4);
8088 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8089
8090 // right shift count
8091 __ movw(shiftRevCount, 32);
8092 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8093
    // numIter is too small for a 4-word SIMD loop; fall back to scalar code
8095 __ cmp(numIter, (u1)4);
8096 __ br(Assembler::LT, ShiftThree);
8097
8098 __ dup(shiftVCount, __ T4S, shiftCount);
8099 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8100 __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
8101
8102 __ BIND(ShiftSIMDLoop);
8103
8104 // load 4 words and process
8105 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
8106 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
8107 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8108 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8109 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8110 __ st1(newElem, __ T4S, __ post(newArr, 16));
8111 __ sub(numIter, numIter, 4);
8112
8113 __ cmp(numIter, (u1)4);
8114 __ br(Assembler::LT, ShiftTwoLoop);
8115 __ b(ShiftSIMDLoop);
8116
8117 __ BIND(ShiftTwoLoop);
8118 __ cbz(numIter, Exit);
8119 __ cmp(numIter, (u1)1);
8120 __ br(Assembler::EQ, ShiftOne);
8121
8122 // load 2 words and process
8123 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
8124 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
8125 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8126 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8127 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8128 __ st1(newElem, __ T2S, __ post(newArr, 8));
8129 __ sub(numIter, numIter, 2);
8130 __ b(ShiftTwoLoop);
8131
8132 __ BIND(ShiftThree);
8133 __ ldrw(r10, __ post(oldArr, 4));
8134 __ ldrw(r11, __ post(oldArrNext, 4));
8135 __ lslvw(r10, r10, shiftCount);
8136 __ lsrvw(r11, r11, shiftRevCount);
8137 __ orrw(r12, r10, r11);
8138 __ strw(r12, __ post(newArr, 4));
8139 __ tbz(numIter, 1, Exit);
8140 __ tbz(numIter, 0, ShiftOne);
8141
8142 __ BIND(ShiftTwo);
8143 __ ldrw(r10, __ post(oldArr, 4));
8144 __ ldrw(r11, __ post(oldArrNext, 4));
8145 __ lslvw(r10, r10, shiftCount);
8146 __ lsrvw(r11, r11, shiftRevCount);
8147 __ orrw(r12, r10, r11);
8148 __ strw(r12, __ post(newArr, 4));
8149
8150 __ BIND(ShiftOne);
8151 __ ldrw(r10, Address(oldArr));
8152 __ ldrw(r11, Address(oldArrNext));
8153 __ lslvw(r10, r10, shiftCount);
8154 __ lsrvw(r11, r11, shiftRevCount);
8155 __ orrw(r12, r10, r11);
8156 __ strw(r12, Address(newArr));
8157
8158 __ BIND(Exit);
8159 __ ret(lr);
8160
8161 return start;
8162 }
8163
8164 address generate_count_positives(address &count_positives_long) {
8165 const u1 large_loop_size = 64;
8166 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
8167 int dcache_line = VM_Version::dcache_line_size();
8168
8169 Register ary1 = r1, len = r2, result = r0;
8170
8171 __ align(CodeEntryAlignment);
8172
8173 StubId stub_id = StubId::stubgen_count_positives_id;
8174 StubCodeMark mark(this, stub_id);
8175
8176 address entry = __ pc();
8177
8178 __ enter();
8179 // precondition: a copy of len is already in result
8180 // __ mov(result, len);
8181
8182 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
8183 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
8184
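    // A byte is "negative" (i.e. non-ASCII) iff its top bit is set, so OR-ing
    // bytes together and testing the result against UPPER_BIT_MASK detects a
    // negative byte in a whole 8-byte word at once. A scalar sketch of the
    // per-word check, for illustration only (the helper name is hypothetical):
    //
    //   static bool has_negative_byte(uint64_t w) {
    //     return (w & 0x8080808080808080ULL) != 0;
    //   }
    //
    // When a negative byte is found the stub returns result - len, i.e. some
    // count no larger than the index of the first negative byte; otherwise it
    // returns the original length.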
8185 __ cmp(len, (u1)15);
8186 __ br(Assembler::GT, LEN_OVER_15);
    // The only case in which execution falls into this code is when the pointer
    // is near the end of a memory page and we must avoid reading past it.
8189 __ add(ary1, ary1, len);
8190 __ subs(len, len, 8);
8191 __ br(Assembler::GT, LEN_OVER_8);
8192 __ ldr(rscratch2, Address(ary1, -8));
8193 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
8194 __ lsrv(rscratch2, rscratch2, rscratch1);
8195 __ tst(rscratch2, UPPER_BIT_MASK);
8196 __ csel(result, zr, result, Assembler::NE);
8197 __ leave();
8198 __ ret(lr);
8199 __ bind(LEN_OVER_8);
8200 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
    __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
8202 __ tst(rscratch2, UPPER_BIT_MASK);
8203 __ br(Assembler::NE, RET_NO_POP);
8204 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
8205 __ lsrv(rscratch1, rscratch1, rscratch2);
8206 __ tst(rscratch1, UPPER_BIT_MASK);
8207 __ bind(RET_NO_POP);
8208 __ csel(result, zr, result, Assembler::NE);
8209 __ leave();
8210 __ ret(lr);
8211
8212 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
8213 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
8214
8215 count_positives_long = __ pc(); // 2nd entry point
8216
8217 __ enter();
8218
8219 __ bind(LEN_OVER_15);
8220 __ push(spilled_regs, sp);
8221 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
8222 __ cbz(rscratch2, ALIGNED);
8223 __ ldp(tmp6, tmp1, Address(ary1));
8224 __ mov(tmp5, 16);
8225 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
8226 __ add(ary1, ary1, rscratch1);
8227 __ orr(tmp6, tmp6, tmp1);
8228 __ tst(tmp6, UPPER_BIT_MASK);
8229 __ br(Assembler::NE, RET_ADJUST);
8230 __ sub(len, len, rscratch1);
8231
8232 __ bind(ALIGNED);
8233 __ cmp(len, large_loop_size);
8234 __ br(Assembler::LT, CHECK_16);
    // Perform a 16-byte load in the pre-loop as an early return, to handle the
    // case where an initially aligned large array has negative values in its
    // first bytes; otherwise LARGE_LOOP would do up to 4 reads instead of 1 in
    // the worst case, which is slower. Cases with negative bytes further ahead
    // are barely affected; in fact they become faster thanks to the early
    // loads and the fewer instructions and branches in LARGE_LOOP.
8241 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
8242 __ sub(len, len, 16);
8243 __ orr(tmp6, tmp6, tmp1);
8244 __ tst(tmp6, UPPER_BIT_MASK);
8245 __ br(Assembler::NE, RET_ADJUST_16);
8246 __ cmp(len, large_loop_size);
8247 __ br(Assembler::LT, CHECK_16);
8248
8249 if (SoftwarePrefetchHintDistance >= 0
8250 && SoftwarePrefetchHintDistance >= dcache_line) {
8251 // initial prefetch
8252 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
8253 }
8254 __ bind(LARGE_LOOP);
8255 if (SoftwarePrefetchHintDistance >= 0) {
8256 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
8257 }
    // Issue the load instructions first, since that can save a few CPU/memory
    // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);"
    // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
    // 1 cbnz(...), which saves 3 instructions and has fewer branches. This
    // approach disables the early return, though, so all 64 bytes are loaded
    // and checked every time.
8263 __ ldp(tmp2, tmp3, Address(ary1));
8264 __ ldp(tmp4, tmp5, Address(ary1, 16));
8265 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
8266 __ ldp(tmp6, tmp1, Address(ary1, 48));
8267 __ add(ary1, ary1, large_loop_size);
8268 __ sub(len, len, large_loop_size);
8269 __ orr(tmp2, tmp2, tmp3);
8270 __ orr(tmp4, tmp4, tmp5);
8271 __ orr(rscratch1, rscratch1, rscratch2);
8272 __ orr(tmp6, tmp6, tmp1);
8273 __ orr(tmp2, tmp2, tmp4);
8274 __ orr(rscratch1, rscratch1, tmp6);
8275 __ orr(tmp2, tmp2, rscratch1);
8276 __ tst(tmp2, UPPER_BIT_MASK);
8277 __ br(Assembler::NE, RET_ADJUST_LONG);
8278 __ cmp(len, large_loop_size);
8279 __ br(Assembler::GE, LARGE_LOOP);
8280
8281 __ bind(CHECK_16); // small 16-byte load pre-loop
8282 __ cmp(len, (u1)16);
8283 __ br(Assembler::LT, POST_LOOP16);
8284
8285 __ bind(LOOP16); // small 16-byte load loop
8286 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
8287 __ sub(len, len, 16);
8288 __ orr(tmp2, tmp2, tmp3);
8289 __ tst(tmp2, UPPER_BIT_MASK);
8290 __ br(Assembler::NE, RET_ADJUST_16);
8291 __ cmp(len, (u1)16);
8292 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
8293
8294 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
8295 __ cmp(len, (u1)8);
8296 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
8297 __ ldr(tmp3, Address(__ post(ary1, 8)));
8298 __ tst(tmp3, UPPER_BIT_MASK);
8299 __ br(Assembler::NE, RET_ADJUST);
8300 __ sub(len, len, 8);
8301
8302 __ bind(POST_LOOP16_LOAD_TAIL);
8303 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
8304 __ ldr(tmp1, Address(ary1));
8305 __ mov(tmp2, 64);
8306 __ sub(tmp4, tmp2, len, __ LSL, 3);
8307 __ lslv(tmp1, tmp1, tmp4);
8308 __ tst(tmp1, UPPER_BIT_MASK);
8309 __ br(Assembler::NE, RET_ADJUST);
8310 // Fallthrough
8311
8312 __ bind(RET_LEN);
8313 __ pop(spilled_regs, sp);
8314 __ leave();
8315 __ ret(lr);
8316
    // The difference result - len is the count of bytes that are guaranteed
    // to be positive.
8319
8320 __ bind(RET_ADJUST_LONG);
8321 __ add(len, len, (u1)(large_loop_size - 16));
8322 __ bind(RET_ADJUST_16);
8323 __ add(len, len, 16);
8324 __ bind(RET_ADJUST);
8325 __ pop(spilled_regs, sp);
8326 __ leave();
8327 __ sub(result, result, len);
8328 __ ret(lr);
8329
8330 return entry;
8331 }
8332
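  // The two loop helpers below compare the arrays by XOR-ing corresponding
  // words and OR-ing the differences together: the accumulated value is zero
  // iff every compared word pair was equal, so a single cbnz per group is
  // enough. A scalar sketch of one 2-word step, for illustration only (the
  // helper name is hypothetical):
  //
  //   static bool words_differ(const uint64_t* a1, const uint64_t* a2) {
  //     return ((a1[0] ^ a2[0]) | (a1[1] ^ a2[1])) != 0;
  //   }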
8333 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
8334 bool usePrefetch, Label &NOT_EQUAL) {
8335 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8336 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8337 tmp7 = r12, tmp8 = r13;
8338 Label LOOP;
8339
8340 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8341 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8342 __ bind(LOOP);
8343 if (usePrefetch) {
8344 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8345 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8346 }
8347 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8348 __ eor(tmp1, tmp1, tmp2);
8349 __ eor(tmp3, tmp3, tmp4);
8350 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8351 __ orr(tmp1, tmp1, tmp3);
8352 __ cbnz(tmp1, NOT_EQUAL);
8353 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8354 __ eor(tmp5, tmp5, tmp6);
8355 __ eor(tmp7, tmp7, tmp8);
8356 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8357 __ orr(tmp5, tmp5, tmp7);
8358 __ cbnz(tmp5, NOT_EQUAL);
8359 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8360 __ eor(tmp1, tmp1, tmp2);
8361 __ eor(tmp3, tmp3, tmp4);
8362 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8363 __ orr(tmp1, tmp1, tmp3);
8364 __ cbnz(tmp1, NOT_EQUAL);
8365 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8366 __ eor(tmp5, tmp5, tmp6);
8367 __ sub(cnt1, cnt1, 8 * wordSize);
8368 __ eor(tmp7, tmp7, tmp8);
8369 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operands.
8372 __ subs(tmp6, cnt1, loopThreshold);
8373 __ orr(tmp5, tmp5, tmp7);
8374 __ cbnz(tmp5, NOT_EQUAL);
8375 __ br(__ GE, LOOP);
8376 // post-loop
8377 __ eor(tmp1, tmp1, tmp2);
8378 __ eor(tmp3, tmp3, tmp4);
8379 __ orr(tmp1, tmp1, tmp3);
8380 __ sub(cnt1, cnt1, 2 * wordSize);
8381 __ cbnz(tmp1, NOT_EQUAL);
8382 }
8383
8384 void generate_large_array_equals_loop_simd(int loopThreshold,
8385 bool usePrefetch, Label &NOT_EQUAL) {
8386 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8387 tmp2 = rscratch2;
8388 Label LOOP;
8389
8390 __ bind(LOOP);
8391 if (usePrefetch) {
8392 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8393 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8394 }
8395 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
8396 __ sub(cnt1, cnt1, 8 * wordSize);
8397 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
8398 __ subs(tmp1, cnt1, loopThreshold);
8399 __ eor(v0, __ T16B, v0, v4);
8400 __ eor(v1, __ T16B, v1, v5);
8401 __ eor(v2, __ T16B, v2, v6);
8402 __ eor(v3, __ T16B, v3, v7);
8403 __ orr(v0, __ T16B, v0, v1);
8404 __ orr(v1, __ T16B, v2, v3);
8405 __ orr(v0, __ T16B, v0, v1);
8406 __ umov(tmp1, v0, __ D, 0);
8407 __ umov(tmp2, v0, __ D, 1);
8408 __ orr(tmp1, tmp1, tmp2);
8409 __ cbnz(tmp1, NOT_EQUAL);
8410 __ br(__ GE, LOOP);
8411 }
8412
8413 // a1 = r1 - array1 address
8414 // a2 = r2 - array2 address
8415 // result = r0 - return value. Already contains "false"
8416 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
8417 // r3-r5 are reserved temporary registers
8418 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
8419 address generate_large_array_equals() {
8420 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8421 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8422 tmp7 = r12, tmp8 = r13;
8423 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
8424 SMALL_LOOP, POST_LOOP;
8425 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
8426 // calculate if at least 32 prefetched bytes are used
8427 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
8428 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
8429 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
8430 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
8431 tmp5, tmp6, tmp7, tmp8);
8432
8433 __ align(CodeEntryAlignment);
8434
8435 StubId stub_id = StubId::stubgen_large_array_equals_id;
8436 StubCodeMark mark(this, stub_id);
8437
8438 address entry = __ pc();
8439 __ enter();
8440 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
8441 // also advance pointers to use post-increment instead of pre-increment
8442 __ add(a1, a1, wordSize);
8443 __ add(a2, a2, wordSize);
      // Both implementations (SIMD/non-SIMD) use relatively large load
      // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
      // time) on some CPUs when the address is not at least 16-byte aligned.
      // Arrays are currently 8-byte aligned, so, if needed, we can do an
      // additional 8-byte load for the first address to make it 16-byte
      // aligned.
8450 Label ALIGNED16;
8451 __ tbz(a1, 3, ALIGNED16);
8452 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8453 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8454 __ sub(cnt1, cnt1, wordSize);
8455 __ eor(tmp1, tmp1, tmp2);
8456 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
8457 __ bind(ALIGNED16);
8458 }
8459 if (UseSIMDForArrayEquals) {
8460 if (SoftwarePrefetchHintDistance >= 0) {
8461 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8462 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8463 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
8464 /* prfm = */ true, NOT_EQUAL);
8465 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8466 __ br(__ LT, TAIL);
8467 }
8468 __ bind(NO_PREFETCH_LARGE_LOOP);
8469 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
8470 /* prfm = */ false, NOT_EQUAL);
8471 } else {
8472 __ push(spilled_regs, sp);
8473 if (SoftwarePrefetchHintDistance >= 0) {
8474 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8475 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8476 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
8477 /* prfm = */ true, NOT_EQUAL);
8478 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8479 __ br(__ LT, TAIL);
8480 }
8481 __ bind(NO_PREFETCH_LARGE_LOOP);
8482 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
8483 /* prfm = */ false, NOT_EQUAL);
8484 }
8485 __ bind(TAIL);
8486 __ cbz(cnt1, EQUAL);
8487 __ subs(cnt1, cnt1, wordSize);
8488 __ br(__ LE, POST_LOOP);
8489 __ bind(SMALL_LOOP);
8490 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8491 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8492 __ subs(cnt1, cnt1, wordSize);
8493 __ eor(tmp1, tmp1, tmp2);
8494 __ cbnz(tmp1, NOT_EQUAL);
8495 __ br(__ GT, SMALL_LOOP);
8496 __ bind(POST_LOOP);
8497 __ ldr(tmp1, Address(a1, cnt1));
8498 __ ldr(tmp2, Address(a2, cnt1));
8499 __ eor(tmp1, tmp1, tmp2);
8500 __ cbnz(tmp1, NOT_EQUAL);
8501 __ bind(EQUAL);
8502 __ mov(result, true);
8503 __ bind(NOT_EQUAL);
8504 if (!UseSIMDForArrayEquals) {
8505 __ pop(spilled_regs, sp);
8506 }
8507 __ bind(NOT_EQUAL_NO_POP);
8508 __ leave();
8509 __ ret(lr);
8510 return entry;
8511 }
8512
8513 // result = r0 - return value. Contains initial hashcode value on entry.
8514 // ary = r1 - array address
8515 // cnt = r2 - elements count
8516 // Clobbers: v0-v13, rscratch1, rscratch2
8517 address generate_large_arrays_hashcode(BasicType eltype) {
8518 const Register result = r0, ary = r1, cnt = r2;
8519 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
8520 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
8521 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
8522 const FloatRegister vpowm = v13;
8523
8524 ARRAYS_HASHCODE_REGISTERS;
8525
8526 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
8527
8528 unsigned int vf; // vectorization factor
8529 bool multiply_by_halves;
8530 Assembler::SIMD_Arrangement load_arrangement;
8531 switch (eltype) {
8532 case T_BOOLEAN:
8533 case T_BYTE:
8534 load_arrangement = Assembler::T8B;
8535 multiply_by_halves = true;
8536 vf = 8;
8537 break;
8538 case T_CHAR:
8539 case T_SHORT:
8540 load_arrangement = Assembler::T8H;
8541 multiply_by_halves = true;
8542 vf = 8;
8543 break;
8544 case T_INT:
8545 load_arrangement = Assembler::T4S;
8546 multiply_by_halves = false;
8547 vf = 4;
8548 break;
8549 default:
8550 ShouldNotReachHere();
8551 }
8552
8553 // Unroll factor
8554 const unsigned uf = 4;
8555
8556 // Effective vectorization factor
8557 const unsigned evf = vf * uf;
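
    // The scalar hash is h = 31 * h + a[i] for every element, i.e.
    //   h = h0 * 31^n + a[0] * 31^(n-1) + ... + a[n-1] * 31^0.
    // The vector code below splits the elements into vf interleaved streams:
    // each lane keeps its own partial sum, scaled by 31^vf per step, and the
    // lanes are recombined at the end using the powers <31^(vf-1) ... 31^0>
    // held in vpow. A 4-lane scalar sketch of one step and of the final
    // combine, for illustration only:
    //
    //   // one step over 4 ints starting at a[i]
    //   for (int lane = 0; lane < 4; lane++) {
    //     acc[lane] = acc[lane] * (31 * 31 * 31 * 31) + a[i + lane];
    //   }
    //   // final combine
    //   int h = acc[0] * 31 * 31 * 31 + acc[1] * 31 * 31 + acc[2] * 31 + acc[3];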
8558
8559 __ align(CodeEntryAlignment);
8560
8561 StubId stub_id;
8562 switch (eltype) {
8563 case T_BOOLEAN:
8564 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
8565 break;
8566 case T_BYTE:
8567 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
8568 break;
8569 case T_CHAR:
8570 stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
8571 break;
8572 case T_SHORT:
8573 stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
8574 break;
8575 case T_INT:
8576 stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
8577 break;
8578 default:
8579 stub_id = StubId::NO_STUBID;
8580 ShouldNotReachHere();
8581 };
8582
8583 StubCodeMark mark(this, stub_id);
8584
8585 address entry = __ pc();
8586 __ enter();
8587
    // Put the 0th-3rd powers of 31 together into a single SIMD register. The
    // register will be used in the SMALL and LARGE LOOPS' epilogues. The
    // initialization is hoisted here and the register's value shouldn't change
    // throughout both loops.
8591 __ movw(rscratch1, intpow(31U, 3));
8592 __ mov(vpow, Assembler::S, 0, rscratch1);
8593 __ movw(rscratch1, intpow(31U, 2));
8594 __ mov(vpow, Assembler::S, 1, rscratch1);
8595 __ movw(rscratch1, intpow(31U, 1));
8596 __ mov(vpow, Assembler::S, 2, rscratch1);
8597 __ movw(rscratch1, intpow(31U, 0));
8598 __ mov(vpow, Assembler::S, 3, rscratch1);
8599
8600 __ mov(vmul0, Assembler::T16B, 0);
8601 __ mov(vmul0, Assembler::S, 3, result);
8602
8603 __ andr(rscratch2, cnt, (uf - 1) * vf);
8604 __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
8605
8606 __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
8607 __ mov(vpowm, Assembler::S, 0, rscratch1);
8608
8609 // SMALL LOOP
8610 __ bind(SMALL_LOOP);
8611
8612 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
8613 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8614 __ subsw(rscratch2, rscratch2, vf);
8615
8616 if (load_arrangement == Assembler::T8B) {
8617 // Extend 8B to 8H to be able to use vector multiply
8618 // instructions
8619 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8620 if (is_signed_subword_type(eltype)) {
8621 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8622 } else {
8623 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8624 }
8625 }
8626
8627 switch (load_arrangement) {
8628 case Assembler::T4S:
8629 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8630 break;
8631 case Assembler::T8B:
8632 case Assembler::T8H:
8633 assert(is_subword_type(eltype), "subword type expected");
8634 if (is_signed_subword_type(eltype)) {
8635 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8636 } else {
8637 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8638 }
8639 break;
8640 default:
8641 __ should_not_reach_here();
8642 }
8643
8644 // Process the upper half of a vector
8645 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8646 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8647 if (is_signed_subword_type(eltype)) {
8648 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8649 } else {
8650 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8651 }
8652 }
8653
8654 __ br(Assembler::HI, SMALL_LOOP);
8655
    // SMALL LOOP'S EPILOGUE
8657 __ lsr(rscratch2, cnt, exact_log2(evf));
8658 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
8659
8660 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8661 __ addv(vmul0, Assembler::T4S, vmul0);
8662 __ umov(result, vmul0, Assembler::S, 0);
8663
8664 // TAIL
8665 __ bind(TAIL);
8666
    // The andr computes cnt % vf. The shifted subtract offsets the branch
    // target past vf - 1 - (cnt % vf) load + madd pairs, i.e. only cnt % vf
    // load + madd pairs are executed.
8669 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
8670 __ andr(rscratch2, cnt, vf - 1);
8671 __ bind(TAIL_SHORTCUT);
8672 __ adr(rscratch1, BR_BASE);
8673 // For Cortex-A53 offset is 4 because 2 nops are generated.
8674 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
8675 __ movw(rscratch2, 0x1f);
8676 __ br(rscratch1);
8677
8678 for (size_t i = 0; i < vf - 1; ++i) {
8679 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
8680 eltype);
8681 __ maddw(result, result, rscratch2, rscratch1);
8682 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
8683 // Generate 2nd nop to have 4 instructions per iteration.
8684 if (VM_Version::supports_a53mac()) {
8685 __ nop();
8686 }
8687 }
8688 __ bind(BR_BASE);
8689
8690 __ leave();
8691 __ ret(lr);
8692
8693 // LARGE LOOP
8694 __ bind(LARGE_LOOP_PREHEADER);
8695
8696 __ lsr(rscratch2, cnt, exact_log2(evf));
8697
8698 if (multiply_by_halves) {
8699 // 31^4 - multiplier between lower and upper parts of a register
8700 __ movw(rscratch1, intpow(31U, vf / 2));
8701 __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
8703 __ movw(rscratch1, intpow(31U, evf - vf / 2));
8704 __ mov(vpowm, Assembler::S, 0, rscratch1);
8705 } else {
8706 // 31^16
8707 __ movw(rscratch1, intpow(31U, evf));
8708 __ mov(vpowm, Assembler::S, 0, rscratch1);
8709 }
8710
8711 __ mov(vmul3, Assembler::T16B, 0);
8712 __ mov(vmul2, Assembler::T16B, 0);
8713 __ mov(vmul1, Assembler::T16B, 0);
8714
8715 __ bind(LARGE_LOOP);
8716
8717 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
8718 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
8719 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
8720 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8721
8722 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
8723 Address(__ post(ary, evf * type2aelembytes(eltype))));
8724
8725 if (load_arrangement == Assembler::T8B) {
8726 // Extend 8B to 8H to be able to use vector multiply
8727 // instructions
8728 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8729 if (is_signed_subword_type(eltype)) {
8730 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8731 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8732 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8733 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8734 } else {
8735 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8736 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8737 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8738 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8739 }
8740 }
8741
8742 switch (load_arrangement) {
8743 case Assembler::T4S:
8744 __ addv(vmul3, load_arrangement, vmul3, vdata3);
8745 __ addv(vmul2, load_arrangement, vmul2, vdata2);
8746 __ addv(vmul1, load_arrangement, vmul1, vdata1);
8747 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8748 break;
8749 case Assembler::T8B:
8750 case Assembler::T8H:
8751 assert(is_subword_type(eltype), "subword type expected");
8752 if (is_signed_subword_type(eltype)) {
8753 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8754 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8755 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8756 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8757 } else {
8758 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8759 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8760 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8761 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8762 }
8763 break;
8764 default:
8765 __ should_not_reach_here();
8766 }
8767
8768 // Process the upper half of a vector
8769 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8770 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
8771 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
8772 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
8773 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
8774 if (is_signed_subword_type(eltype)) {
8775 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8776 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8777 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8778 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8779 } else {
8780 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8781 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8782 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8783 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8784 }
8785 }
8786
8787 __ subsw(rscratch2, rscratch2, 1);
8788 __ br(Assembler::HI, LARGE_LOOP);
8789
8790 __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
8791 __ addv(vmul3, Assembler::T4S, vmul3);
8792 __ umov(result, vmul3, Assembler::S, 0);
8793
8794 __ mov(rscratch2, intpow(31U, vf));
8795
8796 __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
8797 __ addv(vmul2, Assembler::T4S, vmul2);
8798 __ umov(rscratch1, vmul2, Assembler::S, 0);
8799 __ maddw(result, result, rscratch2, rscratch1);
8800
8801 __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
8802 __ addv(vmul1, Assembler::T4S, vmul1);
8803 __ umov(rscratch1, vmul1, Assembler::S, 0);
8804 __ maddw(result, result, rscratch2, rscratch1);
8805
8806 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8807 __ addv(vmul0, Assembler::T4S, vmul0);
8808 __ umov(rscratch1, vmul0, Assembler::S, 0);
8809 __ maddw(result, result, rscratch2, rscratch1);
8810
8811 __ andr(rscratch2, cnt, vf - 1);
8812 __ cbnz(rscratch2, TAIL_SHORTCUT);
8813
8814 __ leave();
8815 __ ret(lr);
8816
8817 return entry;
8818 }
8819
8820 address generate_dsin_dcos(bool isCos) {
8821 __ align(CodeEntryAlignment);
8822 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
8823 StubCodeMark mark(this, stub_id);
8824 address start = __ pc();
8825 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
8826 (address)StubRoutines::aarch64::_two_over_pi,
8827 (address)StubRoutines::aarch64::_pio2,
8828 (address)StubRoutines::aarch64::_dsin_coef,
8829 (address)StubRoutines::aarch64::_dcos_coef);
8830 return start;
8831 }
8832
  // code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
8834 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
8835 Label &DIFF2) {
8836 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
8837 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
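
    // zip1/zip2 with the zero register interleave each Latin1 byte with a zero
    // byte, which is exactly the little-endian UTF-16 encoding of that
    // character: bytes b0 b1 b2 ... become b0 00 b1 00 b2 00 ..., so the
    // widened Latin1 data can be compared directly against the UTF-16 string.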
8838
8839 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
8840 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8841 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
8842 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
8843
8844 __ fmovd(tmpL, vtmp3);
8845 __ eor(rscratch2, tmp3, tmpL);
8846 __ cbnz(rscratch2, DIFF2);
8847
8848 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8849 __ umov(tmpL, vtmp3, __ D, 1);
8850 __ eor(rscratch2, tmpU, tmpL);
8851 __ cbnz(rscratch2, DIFF1);
8852
8853 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
8854 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8855 __ fmovd(tmpL, vtmp);
8856 __ eor(rscratch2, tmp3, tmpL);
8857 __ cbnz(rscratch2, DIFF2);
8858
8859 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8860 __ umov(tmpL, vtmp, __ D, 1);
8861 __ eor(rscratch2, tmpU, tmpL);
8862 __ cbnz(rscratch2, DIFF1);
8863 }
8864
8865 // r0 = result
8866 // r1 = str1
8867 // r2 = cnt1
8868 // r3 = str2
8869 // r4 = cnt2
8870 // r10 = tmp1
8871 // r11 = tmp2
8872 address generate_compare_long_string_different_encoding(bool isLU) {
8873 __ align(CodeEntryAlignment);
8874 StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
8875 StubCodeMark mark(this, stub_id);
8876 address entry = __ pc();
8877 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
8878 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
8879 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
8880 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
8881 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
8882 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
8883 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
8884
8885 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
8886
8887 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
    // cnt2 == number of characters left to compare
    // Check the first 4 characters, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
8890 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8891 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
8892 __ add(str2, str2, isLU ? wordSize : wordSize/2);
8893 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // Already loaded 4 characters; the last 4 are a special case.
8895 __ eor(rscratch2, tmp1, tmp2);
8896 __ mov(rscratch1, tmp2);
8897 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
8898 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
8899 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
8900 __ push(spilled_regs, sp);
8901 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
8902 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
8903
8904 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8905
8906 if (SoftwarePrefetchHintDistance >= 0) {
8907 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8908 __ br(__ LT, NO_PREFETCH);
8909 __ bind(LARGE_LOOP_PREFETCH);
8910 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
8911 __ mov(tmp4, 2);
8912 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8913 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
8914 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8915 __ subs(tmp4, tmp4, 1);
8916 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
8917 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8918 __ mov(tmp4, 2);
8919 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
8920 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8921 __ subs(tmp4, tmp4, 1);
8922 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
8923 __ sub(cnt2, cnt2, 64);
8924 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8925 __ br(__ GE, LARGE_LOOP_PREFETCH);
8926 }
8927 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
8928 __ bind(NO_PREFETCH);
8929 __ subs(cnt2, cnt2, 16);
8930 __ br(__ LT, TAIL);
8931 __ align(OptoLoopAlignment);
8932 __ bind(SMALL_LOOP); // smaller loop
8933 __ subs(cnt2, cnt2, 16);
8934 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8935 __ br(__ GE, SMALL_LOOP);
8936 __ cmn(cnt2, (u1)16);
8937 __ br(__ EQ, LOAD_LAST);
8938 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
8939 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
8940 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
8941 __ ldr(tmp3, Address(cnt1, -8));
8942 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
8943 __ b(LOAD_LAST);
8944 __ bind(DIFF2);
8945 __ mov(tmpU, tmp3);
8946 __ bind(DIFF1);
8947 __ pop(spilled_regs, sp);
8948 __ b(CALCULATE_DIFFERENCE);
8949 __ bind(LOAD_LAST);
    // The last 4 UTF-16 characters are already pre-loaded into tmp3 by
    // compare_string_16_x_LU, so there is no need to load them again.
8952 __ mov(tmpU, tmp3);
8953 __ pop(spilled_regs, sp);
8954
8955 // tmp2 points to the address of the last 4 Latin1 characters right now
8956 __ ldrs(vtmp, Address(tmp2));
8957 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8958 __ fmovd(tmpL, vtmp);
8959
8960 __ eor(rscratch2, tmpU, tmpL);
8961 __ cbz(rscratch2, DONE);
8962
8963 // Find the first different characters in the longwords and
8964 // compute their difference.
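    // rev reverses the bytes of the XOR so that clz yields the bit position of
    // the lowest (first-in-memory) differing byte, which is then rounded down
    // to a 16-bit character boundary. A scalar sketch, for illustration only
    // (__builtin_bswap64/__builtin_clzll stand in for rev/clz):
    //
    //   uint64_t diff = w1 ^ w2;                            // known non-zero
    //   int bit = __builtin_clzll(__builtin_bswap64(diff)) & ~15;
    //   int c1 = (int)((w1 >> bit) & 0xffff);               // first differing chars
    //   int c2 = (int)((w2 >> bit) & 0xffff);
    //   int result = c1 - c2;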
8965 __ bind(CALCULATE_DIFFERENCE);
8966 __ rev(rscratch2, rscratch2);
8967 __ clz(rscratch2, rscratch2);
8968 __ andr(rscratch2, rscratch2, -16);
8969 __ lsrv(tmp1, tmp1, rscratch2);
8970 __ uxthw(tmp1, tmp1);
8971 __ lsrv(rscratch1, rscratch1, rscratch2);
8972 __ uxthw(rscratch1, rscratch1);
8973 __ subw(result, tmp1, rscratch1);
8974 __ bind(DONE);
8975 __ ret(lr);
8976 return entry;
8977 }
8978
8979 // r0 = input (float16)
8980 // v0 = result (float)
8981 // v1 = temporary float register
8982 address generate_float16ToFloat() {
8983 __ align(CodeEntryAlignment);
8984 StubId stub_id = StubId::stubgen_hf2f_id;
8985 StubCodeMark mark(this, stub_id);
8986 address entry = __ pc();
8987 BLOCK_COMMENT("Entry:");
8988 __ flt16_to_flt(v0, r0, v1);
8989 __ ret(lr);
8990 return entry;
8991 }
8992
8993 // v0 = input (float)
8994 // r0 = result (float16)
8995 // v1 = temporary float register
8996 address generate_floatToFloat16() {
8997 __ align(CodeEntryAlignment);
8998 StubId stub_id = StubId::stubgen_f2hf_id;
8999 StubCodeMark mark(this, stub_id);
9000 address entry = __ pc();
9001 BLOCK_COMMENT("Entry:");
9002 __ flt_to_flt16(r0, v0, v1);
9003 __ ret(lr);
9004 return entry;
9005 }
9006
9007 address generate_method_entry_barrier() {
9008 __ align(CodeEntryAlignment);
9009 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
9010 StubCodeMark mark(this, stub_id);
9011
9012 Label deoptimize_label;
9013
9014 address start = __ pc();
9015
9016 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
9017
9018 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
9019 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
9020 // We can get here despite the nmethod being good, if we have not
9021 // yet applied our cross modification fence (or data fence).
9022 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
9023 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
9024 __ ldrw(rscratch2, rscratch2);
9025 __ strw(rscratch2, thread_epoch_addr);
9026 __ isb();
9027 __ membar(__ LoadLoad);
9028 }
9029
9030 __ set_last_Java_frame(sp, rfp, lr, rscratch1);
9031
9032 __ enter();
9033 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
9034
9035 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
9036
9037 __ push_call_clobbered_registers();
9038
9039 __ mov(c_rarg0, rscratch2);
9040 __ call_VM_leaf
9041 (CAST_FROM_FN_PTR
9042 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
9043
9044 __ reset_last_Java_frame(true);
9045
9046 __ mov(rscratch1, r0);
9047
9048 __ pop_call_clobbered_registers();
9049
9050 __ cbnz(rscratch1, deoptimize_label);
9051
9052 __ leave();
9053 __ ret(lr);
9054
9055 __ BIND(deoptimize_label);
9056
9057 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
9058 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
9059
9060 __ mov(sp, rscratch1);
9061 __ br(rscratch2);
9062
9063 return start;
9064 }
9065
9066 // r0 = result
9067 // r1 = str1
9068 // r2 = cnt1
9069 // r3 = str2
9070 // r4 = cnt2
9071 // r10 = tmp1
9072 // r11 = tmp2
9073 address generate_compare_long_string_same_encoding(bool isLL) {
9074 __ align(CodeEntryAlignment);
9075 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
9076 StubCodeMark mark(this, stub_id);
9077 address entry = __ pc();
9078 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9079 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
9080
9081 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
9082
    // Exit from the large loop when fewer than 64 bytes are left to read or we
    // are about to prefetch memory past the end of the array.
9085 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
9086
    // 8 bytes are already pre-loaded before jumping to the stub, so compare them directly
9088 __ eor(rscratch2, tmp1, tmp2);
9089 __ cbnz(rscratch2, CAL_DIFFERENCE);
9090
9091 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
9092 // update pointers, because of previous read
9093 __ add(str1, str1, wordSize);
9094 __ add(str2, str2, wordSize);
9095 if (SoftwarePrefetchHintDistance >= 0) {
9096 __ align(OptoLoopAlignment);
9097 __ bind(LARGE_LOOP_PREFETCH);
9098 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
9099 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
9100
9101 for (int i = 0; i < 4; i++) {
9102 __ ldp(tmp1, tmp1h, Address(str1, i * 16));
9103 __ ldp(tmp2, tmp2h, Address(str2, i * 16));
9104 __ cmp(tmp1, tmp2);
9105 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9106 __ br(Assembler::NE, DIFF);
9107 }
9108 __ sub(cnt2, cnt2, isLL ? 64 : 32);
9109 __ add(str1, str1, 64);
9110 __ add(str2, str2, 64);
9111 __ subs(rscratch2, cnt2, largeLoopExitCondition);
9112 __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
9113 __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
9114 }
9115
9116 __ subs(rscratch1, cnt2, isLL ? 16 : 8);
9117 __ br(Assembler::LE, LESS16);
9118 __ align(OptoLoopAlignment);
9119 __ bind(LOOP_COMPARE16);
9120 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9121 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9122 __ cmp(tmp1, tmp2);
9123 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9124 __ br(Assembler::NE, DIFF);
9125 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9126 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9127 __ br(Assembler::LT, LESS16);
9128
9129 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9130 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9131 __ cmp(tmp1, tmp2);
9132 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9133 __ br(Assembler::NE, DIFF);
9134 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9135 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9136 __ br(Assembler::GE, LOOP_COMPARE16);
9137 __ cbz(cnt2, LENGTH_DIFF);
9138
9139 __ bind(LESS16);
    // compare 8 bytes at a time
9141 __ subs(cnt2, cnt2, isLL ? 8 : 4);
9142 __ br(Assembler::LE, LESS8);
9143 __ ldr(tmp1, Address(__ post(str1, 8)));
9144 __ ldr(tmp2, Address(__ post(str2, 8)));
9145 __ eor(rscratch2, tmp1, tmp2);
9146 __ cbnz(rscratch2, CAL_DIFFERENCE);
9147 __ sub(cnt2, cnt2, isLL ? 8 : 4);
9148
9149 __ bind(LESS8); // directly load last 8 bytes
9150 if (!isLL) {
9151 __ add(cnt2, cnt2, cnt2);
9152 }
9153 __ ldr(tmp1, Address(str1, cnt2));
9154 __ ldr(tmp2, Address(str2, cnt2));
9155 __ eor(rscratch2, tmp1, tmp2);
9156 __ cbz(rscratch2, LENGTH_DIFF);
9157 __ b(CAL_DIFFERENCE);
9158
9159 __ bind(DIFF);
9160 __ cmp(tmp1, tmp2);
9161 __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
9162 __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
9163 // reuse rscratch2 register for the result of eor instruction
9164 __ eor(rscratch2, tmp1, tmp2);
9165
9166 __ bind(CAL_DIFFERENCE);
9167 __ rev(rscratch2, rscratch2);
9168 __ clz(rscratch2, rscratch2);
9169 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
9170 __ lsrv(tmp1, tmp1, rscratch2);
9171 __ lsrv(tmp2, tmp2, rscratch2);
9172 if (isLL) {
9173 __ uxtbw(tmp1, tmp1);
9174 __ uxtbw(tmp2, tmp2);
9175 } else {
9176 __ uxthw(tmp1, tmp1);
9177 __ uxthw(tmp2, tmp2);
9178 }
9179 __ subw(result, tmp1, tmp2);
9180
9181 __ bind(LENGTH_DIFF);
9182 __ ret(lr);
9183 return entry;
9184 }
9185
9186 enum string_compare_mode {
9187 LL,
9188 LU,
9189 UL,
9190 UU,
9191 };
9192
9193 // The following registers are declared in aarch64.ad
9194 // r0 = result
9195 // r1 = str1
9196 // r2 = cnt1
9197 // r3 = str2
9198 // r4 = cnt2
9199 // r10 = tmp1
9200 // r11 = tmp2
9201 // z0 = ztmp1
9202 // z1 = ztmp2
9203 // p0 = pgtmp1
9204 // p1 = pgtmp2
9205 address generate_compare_long_string_sve(string_compare_mode mode) {
9206 StubId stub_id;
9207 switch (mode) {
9208 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
9209 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
9210 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
9211 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
9212 default: ShouldNotReachHere();
9213 }
9214
9215 __ align(CodeEntryAlignment);
9216 address entry = __ pc();
9217 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9218 tmp1 = r10, tmp2 = r11;
9219
9220 Label LOOP, DONE, MISMATCH;
9221 Register vec_len = tmp1;
9222 Register idx = tmp2;
9223 // The minimum of the string lengths has been stored in cnt2.
9224 Register cnt = cnt2;
9225 FloatRegister ztmp1 = z0, ztmp2 = z1;
9226 PRegister pgtmp1 = p0, pgtmp2 = p1;
9227
9228 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
9229 switch (mode) { \
9230 case LL: \
9231 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
9232 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
9233 break; \
9234 case LU: \
9235 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
9236 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9237 break; \
9238 case UL: \
9239 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9240 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
9241 break; \
9242 case UU: \
9243 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9244 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9245 break; \
9246 default: \
9247 ShouldNotReachHere(); \
9248 }
9249
9250 StubCodeMark mark(this, stub_id);
9251
9252 __ mov(idx, 0);
9253 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9254
9255 if (mode == LL) {
9256 __ sve_cntb(vec_len);
9257 } else {
9258 __ sve_cnth(vec_len);
9259 }
9260
9261 __ sub(rscratch1, cnt, vec_len);
9262
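    // The loop below is a standard SVE predicated strip-mined loop: whilelt
    // builds a predicate covering only the lanes that are still in range, so
    // the final partial vector needs no scalar tail loop. Conceptually, for
    // element arrays a and b (illustrative sketch only):
    //
    //   size_t i = 0;
    //   while (i < cnt) {
    //     size_t active = (cnt - i < lanes) ? (cnt - i) : lanes;  // whilelt
    //     for (size_t k = 0; k < active; k++) {
    //       if (a[i + k] != b[i + k]) return (int)a[i + k] - (int)b[i + k];
    //     }
    //     i += lanes;
    //   }
    //   return 0;
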
9263 __ bind(LOOP);
9264
9265 // main loop
9266 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9267 __ add(idx, idx, vec_len);
9268 // Compare strings.
9269 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9270 __ br(__ NE, MISMATCH);
9271 __ cmp(idx, rscratch1);
9272 __ br(__ LT, LOOP);
9273
9274 // post loop, last iteration
9275 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9276
9277 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9278 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9279 __ br(__ EQ, DONE);
9280
9281 __ bind(MISMATCH);
9282
9283 // Crop the vector to find its location.
9284 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
9285 // Extract the first different characters of each string.
9286 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
9287 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
9288
9289 // Compute the difference of the first different characters.
9290 __ sub(result, rscratch1, rscratch2);
9291
9292 __ bind(DONE);
9293 __ ret(lr);
9294 #undef LOAD_PAIR
9295 return entry;
9296 }
9297
9298 void generate_compare_long_strings() {
9299 if (UseSVE == 0) {
9300 StubRoutines::aarch64::_compare_long_string_LL
9301 = generate_compare_long_string_same_encoding(true);
9302 StubRoutines::aarch64::_compare_long_string_UU
9303 = generate_compare_long_string_same_encoding(false);
9304 StubRoutines::aarch64::_compare_long_string_LU
9305 = generate_compare_long_string_different_encoding(true);
9306 StubRoutines::aarch64::_compare_long_string_UL
9307 = generate_compare_long_string_different_encoding(false);
9308 } else {
9309 StubRoutines::aarch64::_compare_long_string_LL
9310 = generate_compare_long_string_sve(LL);
9311 StubRoutines::aarch64::_compare_long_string_UU
9312 = generate_compare_long_string_sve(UU);
9313 StubRoutines::aarch64::_compare_long_string_LU
9314 = generate_compare_long_string_sve(LU);
9315 StubRoutines::aarch64::_compare_long_string_UL
9316 = generate_compare_long_string_sve(UL);
9317 }
9318 }
9319
9320 // R0 = result
9321 // R1 = str2
9322 // R2 = cnt1
9323 // R3 = str1
9324 // R4 = cnt2
9325 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
9326 //
  // This generic linear code uses a few additional ideas that make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since its
  //    length is >= 8) in order to skip the initial load (helps on systems
  //    with a single load pipeline)
  // 2) we can use a "fast" algorithm for finding the first character, which
  //    searches for the first symbol with fewer branches (1 branch per loaded
  //    register instead of 1 branch per symbol); this is where constants like
  //    0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // 3) after loading and analyzing the 1st register of the source string, it
  //    can be reused to search for every occurrence of the 1st character,
  //    saving a few loads compared to a simpler-but-slower implementation
  // 4) in order to avoid lots of push/pop operations, the code below heavily
  //    re-uses/re-initializes/compresses register values, which makes the code
  //    larger and a bit less readable; however, most of the extra operations
  //    are issued during loads or branches, so the penalty is minimal
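  //
  // The "fast" first-character search in 2) is the classic SWAR zero-byte
  // test: XOR-ing a loaded word with the first pattern character replicated
  // into every byte (or halfword) turns each matching position into a zero
  // byte, and zero bytes are then detected branchlessly. A scalar sketch for
  // the byte (Latin1) case, for illustration only (the helper name is
  // hypothetical):
  //
  //   // non-zero iff some byte of 'w' equals the byte 'c'
  //   static uint64_t match_mask(uint64_t w, uint8_t c) {
  //     uint64_t x = w ^ (0x0101010101010101ULL * c);   // match -> zero byte
  //     return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
  //   }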
9341 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
9342 StubId stub_id;
9343 if (str1_isL) {
9344 if (str2_isL) {
9345 stub_id = StubId::stubgen_string_indexof_linear_ll_id;
9346 } else {
9347 stub_id = StubId::stubgen_string_indexof_linear_ul_id;
9348 }
9349 } else {
9350 if (str2_isL) {
9351 ShouldNotReachHere();
9352 } else {
9353 stub_id = StubId::stubgen_string_indexof_linear_uu_id;
9354 }
9355 }
9356 __ align(CodeEntryAlignment);
9357 StubCodeMark mark(this, stub_id);
9358 address entry = __ pc();
9359
9360 int str1_chr_size = str1_isL ? 1 : 2;
9361 int str2_chr_size = str2_isL ? 1 : 2;
9362 int str1_chr_shift = str1_isL ? 0 : 1;
9363 int str2_chr_shift = str2_isL ? 0 : 1;
9364 bool isL = str1_isL && str2_isL;
9365 // parameters
9366 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
9367 // temporary registers
9368 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
9369 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
9370 // redefinitions
9371 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
9372
9373 __ push(spilled_regs, sp);
9374 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
9375 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
9376 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
9377 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
9378 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
9379 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
9380 // Read whole register from str1. It is safe, because length >=8 here
9381 __ ldr(ch1, Address(str1));
9382 // Read whole register from str2. It is safe, because length >=8 here
9383 __ ldr(ch2, Address(str2));
9384 __ sub(cnt2, cnt2, cnt1);
9385 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
9386 if (str1_isL != str2_isL) {
9387 __ eor(v0, __ T16B, v0, v0);
9388 }
9389 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
9390 __ mul(first, first, tmp1);
9391 // check if we have less than 1 register to check
9392 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
9393 if (str1_isL != str2_isL) {
9394 __ fmovd(v1, ch1);
9395 }
9396 __ br(__ LE, L_SMALL);
9397 __ eor(ch2, first, ch2);
9398 if (str1_isL != str2_isL) {
9399 __ zip1(v1, __ T16B, v1, v0);
9400 }
9401 __ sub(tmp2, ch2, tmp1);
9402 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9403 __ bics(tmp2, tmp2, ch2);
9404 if (str1_isL != str2_isL) {
9405 __ fmovd(ch1, v1);
9406 }
9407 __ br(__ NE, L_HAS_ZERO);
9408 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9409 __ add(result, result, wordSize/str2_chr_size);
9410 __ add(str2, str2, wordSize);
9411 __ br(__ LT, L_POST_LOOP);
9412 __ BIND(L_LOOP);
9413 __ ldr(ch2, Address(str2));
9414 __ eor(ch2, first, ch2);
9415 __ sub(tmp2, ch2, tmp1);
9416 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9417 __ bics(tmp2, tmp2, ch2);
9418 __ br(__ NE, L_HAS_ZERO);
9419 __ BIND(L_LOOP_PROCEED);
9420 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9421 __ add(str2, str2, wordSize);
9422 __ add(result, result, wordSize/str2_chr_size);
9423 __ br(__ GE, L_LOOP);
9424 __ BIND(L_POST_LOOP);
9425 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
9426 __ br(__ LE, NOMATCH);
9427 __ ldr(ch2, Address(str2));
9428 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9429 __ eor(ch2, first, ch2);
9430 __ sub(tmp2, ch2, tmp1);
9431 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9432 __ mov(tmp4, -1); // all bits set
9433 __ b(L_SMALL_PROCEED);
9434 __ align(OptoLoopAlignment);
9435 __ BIND(L_SMALL);
9436 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9437 __ eor(ch2, first, ch2);
9438 if (str1_isL != str2_isL) {
9439 __ zip1(v1, __ T16B, v1, v0);
9440 }
9441 __ sub(tmp2, ch2, tmp1);
9442 __ mov(tmp4, -1); // all bits set
9443 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9444 if (str1_isL != str2_isL) {
9445 __ fmovd(ch1, v1); // move converted 4 symbols
9446 }
9447 __ BIND(L_SMALL_PROCEED);
    __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes at the unused bit positions
9449 __ bic(tmp2, tmp2, ch2);
9450 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
9451 __ rbit(tmp2, tmp2);
9452 __ br(__ EQ, NOMATCH);
9453 __ BIND(L_SMALL_HAS_ZERO_LOOP);
    __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
9455 __ cmp(cnt1, u1(wordSize/str2_chr_size));
9456 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
9457 if (str2_isL) { // LL
9458 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9459 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9460 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9461 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9462 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9463 } else {
9464 __ mov(ch2, 0xE); // all bits in byte set except last one
9465 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9466 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9467 __ lslv(tmp2, tmp2, tmp4);
9468 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9469 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9470 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9471 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9472 }
9473 __ cmp(ch1, ch2);
9474 __ mov(tmp4, wordSize/str2_chr_size);
9475 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9476 __ BIND(L_SMALL_CMP_LOOP);
9477 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9478 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9479 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9480 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9481 __ add(tmp4, tmp4, 1);
9482 __ cmp(tmp4, cnt1);
9483 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
9484 __ cmp(first, ch2);
9485 __ br(__ EQ, L_SMALL_CMP_LOOP);
9486 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
9487 __ cbz(tmp2, NOMATCH); // no more matches. exit
9488 __ clz(tmp4, tmp2);
9489 __ add(result, result, 1); // advance index
9490 __ add(str2, str2, str2_chr_size); // advance pointer
9491 __ b(L_SMALL_HAS_ZERO_LOOP);
9492 __ align(OptoLoopAlignment);
9493 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
9494 __ cmp(first, ch2);
9495 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9496 __ b(DONE);
9497 __ align(OptoLoopAlignment);
9498 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
9499 if (str2_isL) { // LL
9500 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9501 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9502 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9503 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9504 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9505 } else {
9506 __ mov(ch2, 0xE); // all bits in byte set except last one
9507 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9508 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9509 __ lslv(tmp2, tmp2, tmp4);
9510 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9511 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9512 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9513 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9514 }
9515 __ cmp(ch1, ch2);
9516 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9517 __ b(DONE);
9518 __ align(OptoLoopAlignment);
9519 __ BIND(L_HAS_ZERO);
9520 __ rbit(tmp2, tmp2);
    __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
    // Now compress the counters (cnt2 and cnt1) into one register. This is
    // fine because both counters are 32-bit and are not changed in this
    // loop; they are restored on exit, so cnt1 can be reused in this loop.
9525 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
9526 __ sub(result, result, 1);
9527 __ BIND(L_HAS_ZERO_LOOP);
9528 __ mov(cnt1, wordSize/str2_chr_size);
9529 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
9530 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
9531 if (str2_isL) {
9532 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9533 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9534 __ lslv(tmp2, tmp2, tmp4);
9535 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9536 __ add(tmp4, tmp4, 1);
9537 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9538 __ lsl(tmp2, tmp2, 1);
9539 __ mov(tmp4, wordSize/str2_chr_size);
9540 } else {
9541 __ mov(ch2, 0xE);
9542 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9543 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9544 __ lslv(tmp2, tmp2, tmp4);
9545 __ add(tmp4, tmp4, 1);
9546 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9547 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9548 __ lsl(tmp2, tmp2, 1);
9549 __ mov(tmp4, wordSize/str2_chr_size);
9550 __ sub(str2, str2, str2_chr_size);
9551 }
9552 __ cmp(ch1, ch2);
9553 __ mov(tmp4, wordSize/str2_chr_size);
9554 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9555 __ BIND(L_CMP_LOOP);
9556 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9557 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9558 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9559 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9560 __ add(tmp4, tmp4, 1);
9561 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
9562 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
9563 __ cmp(cnt1, ch2);
9564 __ br(__ EQ, L_CMP_LOOP);
9565 __ BIND(L_CMP_LOOP_NOMATCH);
    // no match at this candidate position
9567 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
9568 __ clz(tmp4, tmp2);
9569 __ add(str2, str2, str2_chr_size); // advance pointer
9570 __ b(L_HAS_ZERO_LOOP);
9571 __ align(OptoLoopAlignment);
9572 __ BIND(L_CMP_LOOP_LAST_CMP);
9573 __ cmp(cnt1, ch2);
9574 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9575 __ b(DONE);
9576 __ align(OptoLoopAlignment);
9577 __ BIND(L_CMP_LOOP_LAST_CMP2);
9578 if (str2_isL) {
9579 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9580 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9581 __ lslv(tmp2, tmp2, tmp4);
9582 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9583 __ add(tmp4, tmp4, 1);
9584 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9585 __ lsl(tmp2, tmp2, 1);
9586 } else {
9587 __ mov(ch2, 0xE);
9588 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9589 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9590 __ lslv(tmp2, tmp2, tmp4);
9591 __ add(tmp4, tmp4, 1);
9592 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9593 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9594 __ lsl(tmp2, tmp2, 1);
9595 __ sub(str2, str2, str2_chr_size);
9596 }
9597 __ cmp(ch1, ch2);
9598 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9599 __ b(DONE);
9600 __ align(OptoLoopAlignment);
9601 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
    // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
    //   until the L_HAS_ZERO block. The byte octet was analyzed in
    //   L_HAS_ZERO_LOOP, so result was increased by at most
    //   wordSize/str2_chr_size - 1 and the respective high bits were not
    //   changed. L_LOOP_PROCEED will increase result by the number of analyzed
    //   characters, so we can just reset the lower bits of result here: clear
    //   the 2 lower bits for UU/UL and the 3 lower bits for LL.
    // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
    // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the index
    //   of the last analyzed substring inside the current octet, so str2 is at
    //   the respective start address; we need to advance it to the next octet.
9612 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
9613 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
9614 __ bfm(result, zr, 0, 2 - str2_chr_shift);
9615 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
9616 __ movw(cnt2, cnt2);
9617 __ b(L_LOOP_PROCEED);
9618 __ align(OptoLoopAlignment);
9619 __ BIND(NOMATCH);
9620 __ mov(result, -1);
9621 __ BIND(DONE);
9622 __ pop(spilled_regs, sp);
9623 __ ret(lr);
9624 return entry;
9625 }
9626
9627 void generate_string_indexof_stubs() {
9628 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
9629 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
9630 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
9631 }
9632
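  // Inflate 16 Latin-1 bytes in each of src1 and src2 to UTF-16 chars by
  // interleaving them with the zero register v0 (zip1/zip2), then store the
  // resulting 64 bytes to the destination (r1), post-incrementing it.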
9633 void inflate_and_store_2_fp_registers(bool generatePrfm,
9634 FloatRegister src1, FloatRegister src2) {
9635 Register dst = r1;
9636 __ zip1(v1, __ T16B, src1, v0);
9637 __ zip2(v2, __ T16B, src1, v0);
9638 if (generatePrfm) {
9639 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
9640 }
9641 __ zip1(v3, __ T16B, src2, v0);
9642 __ zip2(v4, __ T16B, src2, v0);
9643 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
9644 }
9645
9646 // R0 = src
9647 // R1 = dst
9648 // R2 = len
9649 // R3 = len >> 3
  // v0 = 0
9651 // v1 = loaded 8 bytes
9652 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
9653 address generate_large_byte_array_inflate() {
9654 __ align(CodeEntryAlignment);
9655 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
9656 StubCodeMark mark(this, stub_id);
9657 address entry = __ pc();
9658 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
9659 Register src = r0, dst = r1, len = r2, octetCounter = r3;
9660 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
9661
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction for the pair.
9664 __ ldrd(v2, __ post(src, 8));
9665 __ sub(octetCounter, octetCounter, 2);
9666 __ zip1(v1, __ T16B, v1, v0);
9667 __ zip1(v2, __ T16B, v2, v0);
9668 __ st1(v1, v2, __ T16B, __ post(dst, 32));
9669 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9670 __ subs(rscratch1, octetCounter, large_loop_threshold);
9671 __ br(__ LE, LOOP_START);
9672 __ b(LOOP_PRFM_START);
9673 __ bind(LOOP_PRFM);
9674 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9675 __ bind(LOOP_PRFM_START);
9676 __ prfm(Address(src, SoftwarePrefetchHintDistance));
9677 __ sub(octetCounter, octetCounter, 8);
9678 __ subs(rscratch1, octetCounter, large_loop_threshold);
9679 inflate_and_store_2_fp_registers(true, v3, v4);
9680 inflate_and_store_2_fp_registers(true, v5, v6);
9681 __ br(__ GT, LOOP_PRFM);
9682 __ cmp(octetCounter, (u1)8);
9683 __ br(__ LT, DONE);
9684 __ bind(LOOP);
9685 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9686 __ bind(LOOP_START);
9687 __ sub(octetCounter, octetCounter, 8);
9688 __ cmp(octetCounter, (u1)8);
9689 inflate_and_store_2_fp_registers(false, v3, v4);
9690 inflate_and_store_2_fp_registers(false, v5, v6);
9691 __ br(__ GE, LOOP);
9692 __ bind(DONE);
9693 __ ret(lr);
9694 return entry;
9695 }
9696
9697 /**
9698 * Arguments:
9699 *
9700 * Input:
9701 * c_rarg0 - current state address
9702 * c_rarg1 - H key address
9703 * c_rarg2 - data address
9704 * c_rarg3 - number of blocks
9705 *
9706 * Output:
9707 * Updated state at c_rarg0
9708 */
9709 address generate_ghash_processBlocks() {
9710 // Bafflingly, GCM uses little-endian for the byte order, but
9711 // big-endian for the bit order. For example, the polynomial 1 is
9712 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
9713 //
9714 // So, we must either reverse the bytes in each word and do
9715 // everything big-endian or reverse the bits in each byte and do
9716 // it little-endian. On AArch64 it's more idiomatic to reverse
9717 // the bits in each byte (we have an instruction, RBIT, to do
9718 // that) and keep the data in little-endian bit order through the
9719 // calculation, bit-reversing the inputs and outputs.
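    //
    // For reference, the per-block update computed here is the standard GHASH
    // recurrence over GF(2^128):
    //
    //   state <- (state ^ data_block) * H
    //
    // where '*' is a carry-less multiplication followed by reduction modulo
    // the field polynomial x^128 + x^7 + x^2 + x + 1 (whose low-order bits are
    // the 0x87 constant emitted at the end of this stub). A minimal scalar
    // sketch of that multiply, ignoring GCM's reflected bit order (which the
    // code below handles with RBIT) and assuming a 128-bit integer type:
    //
    //   unsigned __int128 gf128_mul(unsigned __int128 a, unsigned __int128 b) {
    //     unsigned __int128 z = 0;
    //     for (int i = 0; i < 128; i++) {
    //       if (b & 1) z ^= a;             // add a for each set bit of b
    //       bool overflow = (a >> 127) != 0;
    //       a <<= 1;
    //       if (overflow) a ^= 0x87;       // reduce by the field polynomial
    //       b >>= 1;
    //     }
    //     return z;
    //   }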
9720
9721 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
9722 StubCodeMark mark(this, stub_id);
9723 Label polynomial; // local data generated at end of stub
9724 __ align(CodeEntryAlignment);
9725 address start = __ pc();
9726
9727 Register state = c_rarg0;
9728 Register subkeyH = c_rarg1;
9729 Register data = c_rarg2;
9730 Register blocks = c_rarg3;
9731
9732 FloatRegister vzr = v30;
9733 __ eor(vzr, __ T16B, vzr, vzr); // zero register
9734
9735 __ adr(rscratch1, polynomial);
9736 __ ldrq(v24, rscratch1); // The field polynomial
9737
9738 __ ldrq(v0, Address(state));
9739 __ ldrq(v1, Address(subkeyH));
9740
9741 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
9742 __ rbit(v0, __ T16B, v0);
9743 __ rev64(v1, __ T16B, v1);
9744 __ rbit(v1, __ T16B, v1);
9745
9746 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
9747 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
9748
9749 {
9750 Label L_ghash_loop;
9751 __ bind(L_ghash_loop);
9752
9753 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
9754 // reversing each byte
9755 __ rbit(v2, __ T16B, v2);
9756 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
9757
9758 // Multiply state in v2 by subkey in v1
9759 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
9760 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
9761 /*temps*/v6, v3, /*reuse/clobber b*/v2);
9762 // Reduce v7:v5 by the field polynomial
9763 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
9764
9765 __ sub(blocks, blocks, 1);
9766 __ cbnz(blocks, L_ghash_loop);
9767 }
9768
9769 // The bit-reversed result is at this point in v0
9770 __ rev64(v0, __ T16B, v0);
9771 __ rbit(v0, __ T16B, v0);
9772
9773 __ st1(v0, __ T16B, state);
9774 __ ret(lr);
9775
9776 // bind label and generate local polynomial data
9777 __ align(wordSize * 2);
9778 __ bind(polynomial);
9779 __ emit_int64(0x87); // The low-order bits of the field
9780 // polynomial (i.e. p = z^7+z^2+z+1)
9781 // repeated in the low and high parts of a
9782 // 128-bit vector
9783 __ emit_int64(0x87);
9784
9785 return start;
9786 }
9787
9788 address generate_ghash_processBlocks_wide() {
9789 address small = generate_ghash_processBlocks();
9790
9791 StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
9792 StubCodeMark mark(this, stub_id);
9793 Label polynomial; // local data generated after stub
9794 __ align(CodeEntryAlignment);
9795 address start = __ pc();
9796
9797 Register state = c_rarg0;
9798 Register subkeyH = c_rarg1;
9799 Register data = c_rarg2;
9800 Register blocks = c_rarg3;
9801
9802 const int unroll = 4;
9803
9804 __ cmp(blocks, (unsigned char)(unroll * 2));
9805 __ br(__ LT, small);
9806
9807 if (unroll > 1) {
9808 // Save state before entering routine
9809 __ sub(sp, sp, 4 * 16);
9810 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
9811 __ sub(sp, sp, 4 * 16);
9812 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
9813 }
9814
9815 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
9816
9817 if (unroll > 1) {
9818 // And restore state
9819 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
9820 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
9821 }
9822
9823 __ cmp(blocks, (unsigned char)0);
9824 __ br(__ GT, small);
9825
9826 __ ret(lr);
9827
9828 // bind label and generate polynomial data
9829 __ align(wordSize * 2);
9830 __ bind(polynomial);
9831 __ emit_int64(0x87); // The low-order bits of the field
9832 // polynomial (i.e. p = z^7+z^2+z+1)
9833 // repeated in the low and high parts of a
9834 // 128-bit vector
9835 __ emit_int64(0x87);
9836
9837 return start;
9838
9839 }
9840
9841 void generate_base64_encode_simdround(Register src, Register dst,
9842 FloatRegister codec, u8 size) {
9843
9844 FloatRegister in0 = v4, in1 = v5, in2 = v6;
9845 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
9846 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
9847
9848 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9849
9850 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
9851
9852 __ ushr(ind0, arrangement, in0, 2);
9853
9854 __ ushr(ind1, arrangement, in1, 2);
9855 __ shl(in0, arrangement, in0, 6);
9856 __ orr(ind1, arrangement, ind1, in0);
9857 __ ushr(ind1, arrangement, ind1, 2);
9858
9859 __ ushr(ind2, arrangement, in2, 4);
9860 __ shl(in1, arrangement, in1, 4);
9861 __ orr(ind2, arrangement, in1, ind2);
9862 __ ushr(ind2, arrangement, ind2, 2);
9863
9864 __ shl(ind3, arrangement, in2, 2);
9865 __ ushr(ind3, arrangement, ind3, 2);
9866
9867 __ tbl(out0, arrangement, codec, 4, ind0);
9868 __ tbl(out1, arrangement, codec, 4, ind1);
9869 __ tbl(out2, arrangement, codec, 4, ind2);
9870 __ tbl(out3, arrangement, codec, 4, ind3);
9871
9872 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
9873 }
9874
9875 /**
9876 * Arguments:
9877 *
9878 * Input:
9879 * c_rarg0 - src_start
9880 * c_rarg1 - src_offset
9881 * c_rarg2 - src_length
9882 * c_rarg3 - dest_start
9883 * c_rarg4 - dest_offset
9884 * c_rarg5 - isURL
9885 *
9886 */
9887 address generate_base64_encodeBlock() {
9888
9889 static const char toBase64[64] = {
9890 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9891 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9892 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9893 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9894 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
9895 };
9896
9897 static const char toBase64URL[64] = {
9898 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9899 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9900 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9901 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9902 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
9903 };
9904
9905 __ align(CodeEntryAlignment);
9906 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
9907 StubCodeMark mark(this, stub_id);
9908 address start = __ pc();
9909
9910 Register src = c_rarg0; // source array
9911 Register soff = c_rarg1; // source start offset
9912 Register send = c_rarg2; // source end offset
9913 Register dst = c_rarg3; // dest array
9914 Register doff = c_rarg4; // position for writing to dest array
9915 Register isURL = c_rarg5; // Base64 or URL character set
9916
9917 // c_rarg6 and c_rarg7 are free to use as temps
9918 Register codec = c_rarg6;
9919 Register length = c_rarg7;
9920
9921 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
9922
9923 __ add(src, src, soff);
9924 __ add(dst, dst, doff);
9925 __ sub(length, send, soff);
9926
9927 // load the codec base address
9928 __ lea(codec, ExternalAddress((address) toBase64));
9929 __ cbz(isURL, ProcessData);
9930 __ lea(codec, ExternalAddress((address) toBase64URL));
9931
9932 __ BIND(ProcessData);
9933
    // too short to form up a SIMD loop; fall back to the 3-byte scalar loop
9935 __ cmp(length, (u1)24);
9936 __ br(Assembler::LT, Process3B);
9937
9938 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
9939
9940 __ BIND(Process48B);
9941 __ cmp(length, (u1)48);
9942 __ br(Assembler::LT, Process24B);
9943 generate_base64_encode_simdround(src, dst, v0, 16);
9944 __ sub(length, length, 48);
9945 __ b(Process48B);
9946
9947 __ BIND(Process24B);
9948 __ cmp(length, (u1)24);
9949 __ br(Assembler::LT, SIMDExit);
9950 generate_base64_encode_simdround(src, dst, v0, 8);
9951 __ sub(length, length, 24);
9952
9953 __ BIND(SIMDExit);
9954 __ cbz(length, Exit);
9955
9956 __ BIND(Process3B);
9957 // 3 src bytes, 24 bits
9958 __ ldrb(r10, __ post(src, 1));
9959 __ ldrb(r11, __ post(src, 1));
9960 __ ldrb(r12, __ post(src, 1));
9961 __ orrw(r11, r11, r10, Assembler::LSL, 8);
9962 __ orrw(r12, r12, r11, Assembler::LSL, 8);
9963 // codec index
9964 __ ubfmw(r15, r12, 18, 23);
9965 __ ubfmw(r14, r12, 12, 17);
9966 __ ubfmw(r13, r12, 6, 11);
9967 __ andw(r12, r12, 63);
9968 // get the code based on the codec
9969 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
9970 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
9971 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
9972 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
9973 __ strb(r15, __ post(dst, 1));
9974 __ strb(r14, __ post(dst, 1));
9975 __ strb(r13, __ post(dst, 1));
9976 __ strb(r12, __ post(dst, 1));
9977 __ sub(length, length, 3);
9978 __ cbnz(length, Process3B);
9979
9980 __ BIND(Exit);
9981 __ ret(lr);
9982
9983 return start;
9984 }
9985
9986 void generate_base64_decode_simdround(Register src, Register dst,
9987 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
9988
9989 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
9990 FloatRegister out0 = v20, out1 = v21, out2 = v22;
9991
9992 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
9993 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
9994
9995 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
9996
9997 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9998
9999 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
10000
    // We need an unsigned saturating subtract to make sure all input values
    // in the range [0, 63] produce 0 as the index for the higher-half lookup.
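    // For example, with v27 holding 63 (set up by the caller before these
    // rounds): an input byte of 70 ('F') yields 70 - 63 = 7, a valid
    // higher-half index, while any byte <= 63 saturates to 0 and selects the
    // all-zero entry, leaving the lower-half result unchanged after the OR.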
10003 __ uqsubv(decH0, __ T16B, in0, v27);
10004 __ uqsubv(decH1, __ T16B, in1, v27);
10005 __ uqsubv(decH2, __ T16B, in2, v27);
10006 __ uqsubv(decH3, __ T16B, in3, v27);
10007
10008 // lower half lookup
10009 __ tbl(decL0, arrangement, codecL, 4, in0);
10010 __ tbl(decL1, arrangement, codecL, 4, in1);
10011 __ tbl(decL2, arrangement, codecL, 4, in2);
10012 __ tbl(decL3, arrangement, codecL, 4, in3);
10013
10014 // higher half lookup
10015 __ tbx(decH0, arrangement, codecH, 4, decH0);
10016 __ tbx(decH1, arrangement, codecH, 4, decH1);
10017 __ tbx(decH2, arrangement, codecH, 4, decH2);
10018 __ tbx(decH3, arrangement, codecH, 4, decH3);
10019
10020 // combine lower and higher
10021 __ orr(decL0, arrangement, decL0, decH0);
10022 __ orr(decL1, arrangement, decL1, decH1);
10023 __ orr(decL2, arrangement, decL2, decH2);
10024 __ orr(decL3, arrangement, decL3, decH3);
10025
    // check for illegal inputs: a decoded value larger than 63
    // (the maximum of 6 bits) marks an illegal input byte
10027 __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10028 __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10029 __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10030 __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10031 __ orr(in0, arrangement, decH0, decH1);
10032 __ orr(in1, arrangement, decH2, decH3);
10033 __ orr(in2, arrangement, in0, in1);
10034 __ umaxv(in3, arrangement, in2);
10035 __ umov(rscratch2, in3, __ B, 0);
10036
10037 // get the data to output
10038 __ shl(out0, arrangement, decL0, 2);
10039 __ ushr(out1, arrangement, decL1, 4);
10040 __ orr(out0, arrangement, out0, out1);
10041 __ shl(out1, arrangement, decL1, 4);
10042 __ ushr(out2, arrangement, decL2, 2);
10043 __ orr(out1, arrangement, out1, out2);
10044 __ shl(out2, arrangement, decL2, 6);
10045 __ orr(out2, arrangement, out2, decL3);
10046
10047 __ cbz(rscratch2, NoIllegalData);
10048
10049 // handle illegal input
10050 __ umov(r10, in2, __ D, 0);
10051 if (size == 16) {
10052 __ cbnz(r10, ErrorInLowerHalf);
10053
10054 // illegal input is in higher half, store the lower half now.
10055 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10056
10057 __ umov(r10, in2, __ D, 1);
10058 __ umov(r11, out0, __ D, 1);
10059 __ umov(r12, out1, __ D, 1);
10060 __ umov(r13, out2, __ D, 1);
10061 __ b(StoreLegalData);
10062
10063 __ BIND(ErrorInLowerHalf);
10064 }
10065 __ umov(r11, out0, __ D, 0);
10066 __ umov(r12, out1, __ D, 0);
10067 __ umov(r13, out2, __ D, 0);
10068
10069 __ BIND(StoreLegalData);
10070 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10071 __ strb(r11, __ post(dst, 1));
10072 __ strb(r12, __ post(dst, 1));
10073 __ strb(r13, __ post(dst, 1));
10074 __ lsr(r10, r10, 8);
10075 __ lsr(r11, r11, 8);
10076 __ lsr(r12, r12, 8);
10077 __ lsr(r13, r13, 8);
10078 __ b(StoreLegalData);
10079
10080 __ BIND(NoIllegalData);
10081 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10082 }
10083
10084
10085 /**
10086 * Arguments:
10087 *
10088 * Input:
10089 * c_rarg0 - src_start
10090 * c_rarg1 - src_offset
10091 * c_rarg2 - src_length
10092 * c_rarg3 - dest_start
10093 * c_rarg4 - dest_offset
10094 * c_rarg5 - isURL
10095 * c_rarg6 - isMIME
10096 *
10097 */
10098 address generate_base64_decodeBlock() {
10099
10100 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10101 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10102 // titled "Base64 decoding".
10103
    // The non-SIMD lookup tables are mostly dumped from the fromBase64 array
    // used in java.util.Base64, except that the trailing character '=' is also
    // treated as an illegal value in this intrinsic. That is,
    // java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
10107 static const uint8_t fromBase64ForNoSIMD[256] = {
10108 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10109 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10110 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10111 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10112 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10113 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
10114 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10115 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10116 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10117 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10118 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10119 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10120 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10121 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10122 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10123 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10124 };
10125
10126 static const uint8_t fromBase64URLForNoSIMD[256] = {
10127 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10128 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10129 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10130 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10131 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10132 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
10133 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10134 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10135 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10136 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10137 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10138 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10139 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10140 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10141 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10142 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10143 };
10144
    // A legal base64 value is in the range [0, 127]. We need two lookups with
    // tbl/tbx and combine the results to get the decoded data. The 1st table
    // vector lookup uses tbl: out-of-range indices are set to 0 in the
    // destination. The 2nd table vector lookup uses tbx: out-of-range indices
    // leave the destination unchanged. Inputs [64..126] are mapped to table
    // indices [65, 127] in the second lookup. The entry at index 64 is 0, so
    // inputs that were already decoded by the 1st lookup contribute nothing
    // in the second one.
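    // In scalar terms, the two-lookup scheme computes, for each input byte b
    // (illustrative pseudocode only, not the exact vector sequence):
    //
    //   lo     = (b <= 63) ? tableLow[b] : 0;          // tbl: OOR index -> 0
    //   hi_idx = saturating_sub(b, 63);                // 0 for any b <= 63
    //   hi     = (hi_idx <= 63) ? tableHigh[hi_idx]    // tbx: OOR index keeps
    //                           : hi_idx;              //      the old value
    //   decoded = lo | hi;                             // > 63 means illegal
    //
    // where tableLow/tableHigh are the first/second 64 bytes of the 128-byte
    // tables below.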
10152 static const uint8_t fromBase64ForSIMD[128] = {
10153 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10154 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10155 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10156 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10157 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10158 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10159 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10160 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10161 };
10162
10163 static const uint8_t fromBase64URLForSIMD[128] = {
10164 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10165 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10166 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10167 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10168 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10169 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10170 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10171 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10172 };
10173
10174 __ align(CodeEntryAlignment);
10175 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10176 StubCodeMark mark(this, stub_id);
10177 address start = __ pc();
10178
10179 Register src = c_rarg0; // source array
10180 Register soff = c_rarg1; // source start offset
10181 Register send = c_rarg2; // source end offset
10182 Register dst = c_rarg3; // dest array
10183 Register doff = c_rarg4; // position for writing to dest array
10184 Register isURL = c_rarg5; // Base64 or URL character set
10185 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation
10186
10187 Register length = send; // reuse send as length of source data to process
10188
10189 Register simd_codec = c_rarg6;
10190 Register nosimd_codec = c_rarg7;
10191
10192 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10193
10194 __ enter();
10195
10196 __ add(src, src, soff);
10197 __ add(dst, dst, doff);
10198
10199 __ mov(doff, dst);
10200
10201 __ sub(length, send, soff);
10202 __ bfm(length, zr, 0, 1);
10203
10204 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10205 __ cbz(isURL, ProcessData);
10206 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10207
10208 __ BIND(ProcessData);
10209 __ mov(rscratch1, length);
10210 __ cmp(length, (u1)144); // 144 = 80 + 64
10211 __ br(Assembler::LT, Process4B);
10212
10213 // In the MIME case, the line length cannot be more than 76
10214 // bytes (see RFC 2045). This is too short a block for SIMD
10215 // to be worthwhile, so we use non-SIMD here.
10216 __ movw(rscratch1, 79);
10217
10218 __ BIND(Process4B);
10219 __ ldrw(r14, __ post(src, 4));
10220 __ ubfxw(r10, r14, 0, 8);
10221 __ ubfxw(r11, r14, 8, 8);
10222 __ ubfxw(r12, r14, 16, 8);
10223 __ ubfxw(r13, r14, 24, 8);
    // get the decoded values from the table
10225 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10226 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10227 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10228 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10229 // error detection, 255u indicates an illegal input
10230 __ orrw(r14, r10, r11);
10231 __ orrw(r15, r12, r13);
10232 __ orrw(r14, r14, r15);
10233 __ tbnz(r14, 7, Exit);
10234 // recover the data
10235 __ lslw(r14, r10, 10);
10236 __ bfiw(r14, r11, 4, 6);
10237 __ bfmw(r14, r12, 2, 5);
10238 __ rev16w(r14, r14);
10239 __ bfiw(r13, r12, 6, 2);
10240 __ strh(r14, __ post(dst, 2));
10241 __ strb(r13, __ post(dst, 1));
10242 // non-simd loop
10243 __ subsw(rscratch1, rscratch1, 4);
10244 __ br(Assembler::GT, Process4B);
10245
    // If we came through the 80-byte pre-processing loop above (rscratch1
    // started at 79), rscratch1 == -1 here; otherwise, rscratch1 == 0.
10248 __ cbzw(rscratch1, Exit);
10249 __ sub(length, length, 80);
10250
10251 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10252 __ cbz(isURL, SIMDEnter);
10253 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10254
10255 __ BIND(SIMDEnter);
10256 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10257 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10258 __ mov(rscratch1, 63);
10259 __ dup(v27, __ T16B, rscratch1);
10260
10261 __ BIND(Process64B);
10262 __ cmp(length, (u1)64);
10263 __ br(Assembler::LT, Process32B);
10264 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10265 __ sub(length, length, 64);
10266 __ b(Process64B);
10267
10268 __ BIND(Process32B);
10269 __ cmp(length, (u1)32);
10270 __ br(Assembler::LT, SIMDExit);
10271 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10272 __ sub(length, length, 32);
10273 __ b(Process32B);
10274
10275 __ BIND(SIMDExit);
10276 __ cbz(length, Exit);
10277 __ movw(rscratch1, length);
10278 __ b(Process4B);
10279
10280 __ BIND(Exit);
10281 __ sub(c_rarg0, dst, doff);
10282
10283 __ leave();
10284 __ ret(lr);
10285
10286 return start;
10287 }
10288
10289 // Support for spin waits.
10290 address generate_spin_wait() {
10291 __ align(CodeEntryAlignment);
10292 StubId stub_id = StubId::stubgen_spin_wait_id;
10293 StubCodeMark mark(this, stub_id);
10294 address start = __ pc();
10295
10296 __ spin_wait();
10297 __ ret(lr);
10298
10299 return start;
10300 }
10301
10302 void generate_lookup_secondary_supers_table_stub() {
10303 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10304 StubCodeMark mark(this, stub_id);
10305
10306 const Register
10307 r_super_klass = r0,
10308 r_array_base = r1,
10309 r_array_length = r2,
10310 r_array_index = r3,
10311 r_sub_klass = r4,
10312 r_bitmap = rscratch2,
10313 result = r5;
10314 const FloatRegister
10315 vtemp = v0;
10316
10317 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10318 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10319 Label L_success;
10320 __ enter();
10321 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10322 r_array_base, r_array_length, r_array_index,
10323 vtemp, result, slot,
10324 /*stub_is_near*/true);
10325 __ leave();
10326 __ ret(lr);
10327 }
10328 }
10329
10330 // Slow path implementation for UseSecondarySupersTable.
10331 address generate_lookup_secondary_supers_table_slow_path_stub() {
10332 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10333 StubCodeMark mark(this, stub_id);
10334
10335 address start = __ pc();
10336 const Register
10337 r_super_klass = r0, // argument
10338 r_array_base = r1, // argument
10339 temp1 = r2, // temp
10340 r_array_index = r3, // argument
10341 r_bitmap = rscratch2, // argument
10342 result = r5; // argument
10343
10344 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10345 __ ret(lr);
10346
10347 return start;
10348 }
10349
10350 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10351
10352 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10353 //
10354 // If LSE is in use, generate LSE versions of all the stubs. The
10355 // non-LSE versions are in atomic_aarch64.S.
10356
10357 // class AtomicStubMark records the entry point of a stub and the
10358 // stub pointer which will point to it. The stub pointer is set to
10359 // the entry point when ~AtomicStubMark() is called, which must be
10360 // after ICache::invalidate_range. This ensures safe publication of
10361 // the generated code.
10362 class AtomicStubMark {
10363 address _entry_point;
10364 aarch64_atomic_stub_t *_stub;
10365 MacroAssembler *_masm;
10366 public:
10367 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10368 _masm = masm;
10369 __ align(32);
10370 _entry_point = __ pc();
10371 _stub = stub;
10372 }
10373 ~AtomicStubMark() {
10374 *_stub = (aarch64_atomic_stub_t)_entry_point;
10375 }
10376 };
10377
10378 // NB: For memory_order_conservative we need a trailing membar after
10379 // LSE atomic operations but not a leading membar.
10380 //
10381 // We don't need a leading membar because a clause in the Arm ARM
10382 // says:
10383 //
10384 // Barrier-ordered-before
10385 //
10386 // Barrier instructions order prior Memory effects before subsequent
10387 // Memory effects generated by the same Observer. A read or a write
  // RW1 is Barrier-ordered-before a read or a write RW2 from the same
  // Observer if and only if RW1 appears in program order before RW2
  // and [ ... ] at least one of RW1 and RW2 is generated by an atomic
  // instruction with both Acquire and Release semantics.
10392 //
10393 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10394 // and Release semantics, therefore we don't need a leading
10395 // barrier. However, there is no corresponding Barrier-ordered-after
10396 // relationship, therefore we need a trailing membar to prevent a
10397 // later store or load from being reordered with the store in an
10398 // atomic instruction.
10399 //
10400 // This was checked by using the herd7 consistency model simulator
10401 // (http://diy.inria.fr/) with this test case:
10402 //
10403 // AArch64 LseCas
10404 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10405 // P0 | P1;
10406 // LDR W4, [X2] | MOV W3, #0;
10407 // DMB LD | MOV W4, #1;
10408 // LDR W3, [X1] | CASAL W3, W4, [X1];
10409 // | DMB ISH;
10410 // | STR W4, [X2];
10411 // exists
10412 // (0:X3=0 /\ 0:X4=1)
10413 //
10414 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10415 // with the store to x in P1. Without the DMB in P1 this may happen.
10416 //
10417 // At the time of writing we don't know of any AArch64 hardware that
10418 // reorders stores in this way, but the Reference Manual permits it.
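  //
  // In C++ terms, the conservative entry points are intended to behave
  // roughly like a sequentially consistent RMW followed by a full fence
  // (an illustrative sketch, not what these stubs are compiled from):
  //
  //   // long old = __atomic_fetch_add(ptr, inc, __ATOMIC_SEQ_CST); // ldaddal
  //   // __atomic_thread_fence(__ATOMIC_SEQ_CST);                   // trailing DMB
  //
  // Only the trailing barrier is needed, for the reason quoted above.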
10419
10420 void gen_cas_entry(Assembler::operand_size size,
10421 atomic_memory_order order) {
10422 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10423 exchange_val = c_rarg2;
10424 bool acquire, release;
10425 switch (order) {
10426 case memory_order_relaxed:
10427 acquire = false;
10428 release = false;
10429 break;
10430 case memory_order_release:
10431 acquire = false;
10432 release = true;
10433 break;
10434 default:
10435 acquire = true;
10436 release = true;
10437 break;
10438 }
10439 __ mov(prev, compare_val);
10440 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10441 if (order == memory_order_conservative) {
10442 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10443 }
10444 if (size == Assembler::xword) {
10445 __ mov(r0, prev);
10446 } else {
10447 __ movw(r0, prev);
10448 }
10449 __ ret(lr);
10450 }
10451
10452 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10453 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10454 // If not relaxed, then default to conservative. Relaxed is the only
10455 // case we use enough to be worth specializing.
10456 if (order == memory_order_relaxed) {
10457 __ ldadd(size, incr, prev, addr);
10458 } else {
10459 __ ldaddal(size, incr, prev, addr);
10460 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10461 }
10462 if (size == Assembler::xword) {
10463 __ mov(r0, prev);
10464 } else {
10465 __ movw(r0, prev);
10466 }
10467 __ ret(lr);
10468 }
10469
10470 void gen_swpal_entry(Assembler::operand_size size) {
10471 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10472 __ swpal(size, incr, prev, addr);
10473 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10474 if (size == Assembler::xword) {
10475 __ mov(r0, prev);
10476 } else {
10477 __ movw(r0, prev);
10478 }
10479 __ ret(lr);
10480 }
10481
10482 void generate_atomic_entry_points() {
10483 if (! UseLSE) {
10484 return;
10485 }
10486 __ align(CodeEntryAlignment);
10487 StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10488 StubCodeMark mark(this, stub_id);
10489 address first_entry = __ pc();
10490
10491 // ADD, memory_order_conservative
10492 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10493 gen_ldadd_entry(Assembler::word, memory_order_conservative);
10494 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10495 gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10496
10497 // ADD, memory_order_relaxed
10498 AtomicStubMark mark_fetch_add_4_relaxed
10499 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10500 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10501 AtomicStubMark mark_fetch_add_8_relaxed
10502 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10503 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10504
10505 // XCHG, memory_order_conservative
10506 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10507 gen_swpal_entry(Assembler::word);
10508 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10509 gen_swpal_entry(Assembler::xword);
10510
10511 // CAS, memory_order_conservative
10512 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10513 gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10514 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10515 gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10516 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10517 gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10518
10519 // CAS, memory_order_relaxed
10520 AtomicStubMark mark_cmpxchg_1_relaxed
10521 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10522 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10523 AtomicStubMark mark_cmpxchg_4_relaxed
10524 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10525 gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10526 AtomicStubMark mark_cmpxchg_8_relaxed
10527 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10528 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10529
10530 AtomicStubMark mark_cmpxchg_4_release
10531 (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10532 gen_cas_entry(MacroAssembler::word, memory_order_release);
10533 AtomicStubMark mark_cmpxchg_8_release
10534 (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10535 gen_cas_entry(MacroAssembler::xword, memory_order_release);
10536
10537 AtomicStubMark mark_cmpxchg_4_seq_cst
10538 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10539 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10540 AtomicStubMark mark_cmpxchg_8_seq_cst
10541 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10542 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10543
10544 ICache::invalidate_range(first_entry, __ pc() - first_entry);
10545 }
#endif // LINUX && !__ARM_FEATURE_ATOMICS
10547
10548 static void save_return_registers(MacroAssembler* masm) {
10549 if (InlineTypeReturnedAsFields) {
10550 masm->push(RegSet::range(r0, r7), sp);
10551 masm->sub(sp, sp, 4 * wordSize);
10552 masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
10553 masm->sub(sp, sp, 4 * wordSize);
10554 masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
10555 } else {
10556 masm->fmovd(rscratch1, v0);
10557 masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
10558 }
10559 }
10560
10561 static void restore_return_registers(MacroAssembler* masm) {
10562 if (InlineTypeReturnedAsFields) {
10563 masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
10564 masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
10565 masm->pop(RegSet::range(r0, r7), sp);
10566 } else {
10567 masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
10568 masm->fmovd(v0, rscratch1);
10569 }
10570 }
10571
10572 address generate_cont_thaw(Continuation::thaw_kind kind) {
10573 bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10574 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10575
10576 address start = __ pc();
10577
10578 if (return_barrier) {
10579 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10580 __ mov(sp, rscratch1);
10581 }
10582 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10583
10584 if (return_barrier) {
10585 // preserve possible return value from a method returning to the return barrier
10586 save_return_registers(_masm);
10587 }
10588
10589 __ movw(c_rarg1, (return_barrier ? 1 : 0));
10590 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10591 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10592
10593 if (return_barrier) {
10594 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10595 restore_return_registers(_masm);
10596 }
10597 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10598
10599
10600 Label thaw_success;
10601 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10602 __ cbnz(rscratch2, thaw_success);
10603 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10604 __ br(rscratch1);
10605 __ bind(thaw_success);
10606
10607 // make room for the thawed frames
10608 __ sub(rscratch1, sp, rscratch2);
10609 __ andr(rscratch1, rscratch1, -16); // align
10610 __ mov(sp, rscratch1);
10611
10612 if (return_barrier) {
10613 // save original return value -- again
10614 save_return_registers(_masm);
10615 }
10616
10617 // If we want, we can templatize thaw by kind, and have three different entries
10618 __ movw(c_rarg1, (uint32_t)kind);
10619
10620 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10621 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10622
10623 if (return_barrier) {
10624 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10625 restore_return_registers(_masm);
10626 } else {
10627 __ mov(r0, zr); // return 0 (success) from doYield
10628 }
10629
    // we're now on the yield frame (which is at a higher address, because sp has been pushed down)
10631 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10632 __ mov(rfp, sp);
10633
10634 if (return_barrier_exception) {
10635 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10636 __ authenticate_return_address(c_rarg1);
10637 __ verify_oop(r0);
10638 // save return value containing the exception oop in callee-saved R19
10639 __ mov(r19, r0);
10640
10641 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10642
10643 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10644 // __ reinitialize_ptrue();
10645
10646 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10647
10648 __ mov(r1, r0); // the exception handler
10649 __ mov(r0, r19); // restore return value containing the exception oop
10650 __ verify_oop(r0);
10651
10652 __ leave();
10653 __ mov(r3, lr);
10654 __ br(r1); // the exception handler
10655 } else {
10656 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10657 __ leave();
10658 __ ret(lr);
10659 }
10660
10661 return start;
10662 }
10663
10664 address generate_cont_thaw() {
10665 if (!Continuations::enabled()) return nullptr;
10666
10667 StubId stub_id = StubId::stubgen_cont_thaw_id;
10668 StubCodeMark mark(this, stub_id);
10669 address start = __ pc();
10670 generate_cont_thaw(Continuation::thaw_top);
10671 return start;
10672 }
10673
10674 address generate_cont_returnBarrier() {
10675 if (!Continuations::enabled()) return nullptr;
10676
10677 // TODO: will probably need multiple return barriers depending on return type
10678 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10679 StubCodeMark mark(this, stub_id);
10680 address start = __ pc();
10681
10682 generate_cont_thaw(Continuation::thaw_return_barrier);
10683
10684 return start;
10685 }
10686
10687 address generate_cont_returnBarrier_exception() {
10688 if (!Continuations::enabled()) return nullptr;
10689
10690 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10691 StubCodeMark mark(this, stub_id);
10692 address start = __ pc();
10693
10694 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10695
10696 return start;
10697 }
10698
10699 address generate_cont_preempt_stub() {
10700 if (!Continuations::enabled()) return nullptr;
10701 StubId stub_id = StubId::stubgen_cont_preempt_id;
10702 StubCodeMark mark(this, stub_id);
10703 address start = __ pc();
10704
10705 __ reset_last_Java_frame(true);
10706
10707 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10708 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10709 __ mov(sp, rscratch2);
10710
10711 Label preemption_cancelled;
10712 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10713 __ cbnz(rscratch1, preemption_cancelled);
10714
10715 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10716 SharedRuntime::continuation_enter_cleanup(_masm);
10717 __ leave();
10718 __ ret(lr);
10719
10720 // We acquired the monitor after freezing the frames so call thaw to continue execution.
10721 __ bind(preemption_cancelled);
10722 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10723 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10724 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10725 __ ldr(rscratch1, Address(rscratch1));
10726 __ br(rscratch1);
10727
10728 return start;
10729 }
10730
10731 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10732 // are represented as long[5], with BITS_PER_LIMB = 26.
10733 // Pack five 26-bit limbs into three 64-bit registers.
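  //
  // Layout produced (limbs L0..L4 of 26 bits each, derived from the code
  // below):
  //
  //   dest0 = L0 | L1 << 26 | (low 12 bits of L2) << 52
  //   dest1 = (high 14 bits of L2) | L3 << 14 | (low 24 bits of L4) << 40
  //   dest2 = high 2 bits of L4
  //
  // i.e. the 130-bit value L0 + 2^26*L1 + 2^52*L2 + 2^78*L3 + 2^104*L4 packed
  // contiguously into dest2:dest1:dest0.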
10734 void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10735 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
10736 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
10737 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10738 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
10739
10740 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
10741 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
10742 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10743 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
10744
10745 if (dest2->is_valid()) {
10746 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
10747 } else {
10748 #ifdef ASSERT
10749 Label OK;
10750 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
10751 __ br(__ EQ, OK);
10752 __ stop("high bits of Poly1305 integer should be zero");
10753 __ should_not_reach_here();
10754 __ bind(OK);
10755 #endif
10756 }
10757 }
10758
10759 // As above, but return only a 128-bit integer, packed into two
10760 // 64-bit registers.
10761 void pack_26(Register dest0, Register dest1, Register src) {
10762 pack_26(dest0, dest1, noreg, src);
10763 }
10764
10765 // Multiply and multiply-accumulate unsigned 64-bit registers.
10766 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10767 __ mul(prod_lo, n, m);
10768 __ umulh(prod_hi, n, m);
10769 }
10770 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10771 wide_mul(rscratch1, rscratch2, n, m);
10772 __ adds(sum_lo, sum_lo, rscratch1);
10773 __ adc(sum_hi, sum_hi, rscratch2);
10774 }
10775
10776 // Poly1305, RFC 7539
10777
10778 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10779 // description of the tricks used to simplify and accelerate this
10780 // computation.
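  //
  // The per-block recurrence implemented below is the usual Poly1305 update,
  // with all arithmetic modulo 2^130 - 5:
  //
  //   acc <- ((acc + block + 2^128) * r) mod (2^130 - 5)
  //
  // The "+ 2^128" is the padding bit appended to every full 16-byte block
  // (the add(S_2, S_2, 1) below). The precomputed RR_n = (R_n >> 2) * 5
  // values exploit the identity 2^130 == 5 (mod 2^130 - 5) so that high
  // partial products can be folded back into the low limbs cheaply.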
10781
10782 address generate_poly1305_processBlocks() {
10783 __ align(CodeEntryAlignment);
10784 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10785 StubCodeMark mark(this, stub_id);
10786 address start = __ pc();
10787 Label here;
10788 __ enter();
10789 RegSet callee_saved = RegSet::range(r19, r28);
10790 __ push(callee_saved, sp);
10791
10792 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10793
10794 // Arguments
10795 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10796
10797 // R_n is the 128-bit randomly-generated key, packed into two
10798 // registers. The caller passes this key to us as long[5], with
10799 // BITS_PER_LIMB = 26.
10800 const Register R_0 = *++regs, R_1 = *++regs;
10801 pack_26(R_0, R_1, r_start);
10802
10803 // RR_n is (R_n >> 2) * 5
10804 const Register RR_0 = *++regs, RR_1 = *++regs;
10805 __ lsr(RR_0, R_0, 2);
10806 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10807 __ lsr(RR_1, R_1, 2);
10808 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10809
10810 // U_n is the current checksum
10811 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10812 pack_26(U_0, U_1, U_2, acc_start);
10813
10814 static constexpr int BLOCK_LENGTH = 16;
10815 Label DONE, LOOP;
10816
10817 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10818 __ br(Assembler::LT, DONE); {
10819 __ bind(LOOP);
10820
10821 // S_n is to be the sum of U_n and the next block of data
10822 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10823 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10824 __ adds(S_0, U_0, S_0);
10825 __ adcs(S_1, U_1, S_1);
10826 __ adc(S_2, U_2, zr);
10827 __ add(S_2, S_2, 1);
10828
10829 const Register U_0HI = *++regs, U_1HI = *++regs;
10830
10831 // NB: this logic depends on some of the special properties of
10832 // Poly1305 keys. In particular, because we know that the top
10833 // four bits of R_0 and R_1 are zero, we can add together
10834 // partial products without any risk of needing to propagate a
10835 // carry out.
10836 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10837 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
10838 __ andr(U_2, R_0, 3);
10839 __ mul(U_2, S_2, U_2);
10840
10841 // Recycle registers S_0, S_1, S_2
10842 regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10843
10844 // Partial reduction mod 2**130 - 5
10845 __ adds(U_1, U_0HI, U_1);
10846 __ adc(U_2, U_1HI, U_2);
10847 // Sum now in U_2:U_1:U_0.
10848 // Dead: U_0HI, U_1HI.
10849 regs = (regs.remaining() + U_0HI + U_1HI).begin();
10850
10851 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
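      // (Why this reduces: split the value as
      //    x == (U_2 >> 2) * 2^130 + ((U_2 & 3) * 2^128 + U_1 * 2^64 + U_0)
      //  and use 2^130 == 5 (mod 2^130 - 5); adding (U_2 >> 2) and then
      //  (U_2 >> 2) << 2 supplies exactly the required (U_2 >> 2) * 5.)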
10852
10853 // First, U_2:U_1:U_0 += (U_2 >> 2)
10854 __ lsr(rscratch1, U_2, 2);
10855 __ andr(U_2, U_2, (u8)3);
10856 __ adds(U_0, U_0, rscratch1);
10857 __ adcs(U_1, U_1, zr);
10858 __ adc(U_2, U_2, zr);
10859 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10860 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10861 __ adcs(U_1, U_1, zr);
10862 __ adc(U_2, U_2, zr);
10863
10864 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10865 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10866 __ br(~ Assembler::LT, LOOP);
10867 }
10868
10869 // Further reduce modulo 2^130 - 5
10870 __ lsr(rscratch1, U_2, 2);
10871 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10872 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10873 __ adcs(U_1, U_1, zr);
10874 __ andr(U_2, U_2, (u1)3);
10875 __ adc(U_2, U_2, zr);
10876
10877 // Unpack the sum into five 26-bit limbs and write to memory.
10878 __ ubfiz(rscratch1, U_0, 0, 26);
10879 __ ubfx(rscratch2, U_0, 26, 26);
10880 __ stp(rscratch1, rscratch2, Address(acc_start));
10881 __ ubfx(rscratch1, U_0, 52, 12);
10882 __ bfi(rscratch1, U_1, 12, 14);
10883 __ ubfx(rscratch2, U_1, 14, 26);
10884 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10885 __ ubfx(rscratch1, U_1, 40, 24);
10886 __ bfi(rscratch1, U_2, 24, 3);
10887 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
10888
10889 __ bind(DONE);
10890 __ pop(callee_saved, sp);
10891 __ leave();
10892 __ ret(lr);
10893
10894 return start;
10895 }
10896
10897 // exception handler for upcall stubs
10898 address generate_upcall_stub_exception_handler() {
10899 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10900 StubCodeMark mark(this, stub_id);
10901 address start = __ pc();
10902
10903 // Native caller has no idea how to handle exceptions,
10904 // so we just crash here. It is up to the callee to catch exceptions.
10905 __ verify_oop(r0);
10906 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10907 __ blr(rscratch1);
10908 __ should_not_reach_here();
10909
10910 return start;
10911 }
10912
10913 // load Method* target of MethodHandle
10914 // j_rarg0 = jobject receiver
10915 // rmethod = result
10916 address generate_upcall_stub_load_target() {
10917 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10918 StubCodeMark mark(this, stub_id);
10919 address start = __ pc();
10920
10921 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10922 // Load target method from receiver
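    // In effect (illustrative pseudocode, not generated code):
    //   rmethod = receiver.form.vmentry.method.vmtarget   // a Method*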
10923 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10924 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10925 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10926 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10927 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10928 noreg, noreg);
10929 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10930
10931 __ ret(lr);
10932
10933 return start;
10934 }
10935
10936 #undef __
10937 #define __ masm->
10938
10939 class MontgomeryMultiplyGenerator : public MacroAssembler {
10940
10941 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10942 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10943
10944 RegSet _toSave;
10945 bool _squaring;
10946
10947 public:
10948 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10949 : MacroAssembler(as->code()), _squaring(squaring) {
10950
10951 // Register allocation
10952
10953 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10954 Pa_base = *regs; // Argument registers
10955 if (squaring)
10956 Pb_base = Pa_base;
10957 else
10958 Pb_base = *++regs;
10959 Pn_base = *++regs;
10960 Rlen = *++regs;
10961 inv = *++regs;
10962 Pm_base = *++regs;
10963
10964 // Working registers:
10965 Ra = *++regs; // The current digit of a, b, n, and m.
10966 Rb = *++regs;
10967 Rm = *++regs;
10968 Rn = *++regs;
10969
10970 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
10971 Pb = *++regs;
10972 Pm = *++regs;
10973 Pn = *++regs;
10974
10975 t0 = *++regs; // Three registers which form a
10976 t1 = *++regs; // triple-precision accumulator.
10977 t2 = *++regs;
10978
10979 Ri = *++regs; // Inner and outer loop indexes.
10980 Rj = *++regs;
10981
10982 Rhi_ab = *++regs; // Product registers: low and high parts
10983 Rlo_ab = *++regs; // of a*b and m*n.
10984 Rhi_mn = *++regs;
10985 Rlo_mn = *++regs;
10986
10987 // r19 and up are callee-saved.
10988 _toSave = RegSet::range(r19, *regs) + Pm_base;
10989 }
10990
10991 private:
10992 void save_regs() {
10993 push(_toSave, sp);
10994 }
10995
10996 void restore_regs() {
10997 pop(_toSave, sp);
10998 }
10999
11000 template <typename T>
11001 void unroll_2(Register count, T block) {
11002 Label loop, end, odd;
11003 tbnz(count, 0, odd);
11004 cbz(count, end);
11005 align(16);
11006 bind(loop);
11007 (this->*block)();
11008 bind(odd);
11009 (this->*block)();
11010 subs(count, count, 2);
11011 br(Assembler::GT, loop);
11012 bind(end);
11013 }
11014
11015 template <typename T>
11016 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
11017 Label loop, end, odd;
11018 tbnz(count, 0, odd);
11019 cbz(count, end);
11020 align(16);
11021 bind(loop);
11022 (this->*block)(d, s, tmp);
11023 bind(odd);
11024 (this->*block)(d, s, tmp);
11025 subs(count, count, 2);
11026 br(Assembler::GT, loop);
11027 bind(end);
11028 }
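  // Note: both unroll_2 variants execute `block` exactly `count` times
  // (count >= 0). The loop body holds two copies of the block; an odd
  // count enters at the second copy, and a zero count skips the loop.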
11029
11030 void pre1(RegisterOrConstant i) {
11031 block_comment("pre1");
11032 // Pa = Pa_base;
11033 // Pb = Pb_base + i;
11034 // Pm = Pm_base;
11035 // Pn = Pn_base + i;
11036 // Ra = *Pa;
11037 // Rb = *Pb;
11038 // Rm = *Pm;
11039 // Rn = *Pn;
11040 ldr(Ra, Address(Pa_base));
11041 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11042 ldr(Rm, Address(Pm_base));
11043 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11044 lea(Pa, Address(Pa_base));
11045 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11046 lea(Pm, Address(Pm_base));
11047 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11048
11049 // Zero the m*n result.
11050 mov(Rhi_mn, zr);
11051 mov(Rlo_mn, zr);
11052 }
11053
11054 // The core multiply-accumulate step of a Montgomery
11055 // multiplication. The idea is to schedule operations as a
11056 // pipeline so that instructions with long latencies (loads and
11057 // multiplies) have time to complete before their results are
11058 // used. This helps in-order implementations of the architecture
11059 // the most, but out-of-order ones benefit as well.
11060 void step() {
11061 block_comment("step");
11062 // MACC(Ra, Rb, t0, t1, t2);
11063 // Ra = *++Pa;
11064 // Rb = *--Pb;
11065 umulh(Rhi_ab, Ra, Rb);
11066 mul(Rlo_ab, Ra, Rb);
11067 ldr(Ra, pre(Pa, wordSize));
11068 ldr(Rb, pre(Pb, -wordSize));
11069 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11070 // previous iteration.
11071 // MACC(Rm, Rn, t0, t1, t2);
11072 // Rm = *++Pm;
11073 // Rn = *--Pn;
11074 umulh(Rhi_mn, Rm, Rn);
11075 mul(Rlo_mn, Rm, Rn);
11076 ldr(Rm, pre(Pm, wordSize));
11077 ldr(Rn, pre(Pn, -wordSize));
11078 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11079 }
11080
11081 void post1() {
11082 block_comment("post1");
11083
11084 // MACC(Ra, Rb, t0, t1, t2);
11085 // Ra = *++Pa;
11086 // Rb = *--Pb;
11087 umulh(Rhi_ab, Ra, Rb);
11088 mul(Rlo_ab, Ra, Rb);
11089 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11090 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11091
11092 // *Pm = Rm = t0 * inv;
11093 mul(Rm, t0, inv);
11094 str(Rm, Address(Pm));
11095
11096 // MACC(Rm, Rn, t0, t1, t2);
11097 // t0 = t1; t1 = t2; t2 = 0;
11098 umulh(Rhi_mn, Rm, Rn);
11099
11100 #ifndef PRODUCT
11101 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11102 {
11103 mul(Rlo_mn, Rm, Rn);
11104 add(Rlo_mn, t0, Rlo_mn);
11105 Label ok;
11106 cbz(Rlo_mn, ok); {
11107 stop("broken Montgomery multiply");
11108 } bind(ok);
11109 }
11110 #endif
11111 // We have very carefully set things up so that
11112 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11113 // the lower half of Rm * Rn because we know the result already:
11114 // it must be -t0. t0 + (-t0) must generate a carry iff
11115 // t0 != 0. So, rather than do a mul and an adds we just set
11116 // the carry flag iff t0 is nonzero.
11117 //
11118 // mul(Rlo_mn, Rm, Rn);
11119 // adds(zr, t0, Rlo_mn);
11120 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11121 adcs(t0, t1, Rhi_mn);
11122 adc(t1, t2, zr);
11123 mov(t2, zr);
11124 }
11125
11126 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11127 block_comment("pre2");
11128 // Pa = Pa_base + i-len;
11129 // Pb = Pb_base + len;
11130 // Pm = Pm_base + i-len;
11131 // Pn = Pn_base + len;
11132
11133 if (i.is_register()) {
11134 sub(Rj, i.as_register(), len);
11135 } else {
11136 mov(Rj, i.as_constant());
11137 sub(Rj, Rj, len);
11138 }
11139 // Rj == i-len
11140
11141 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11142 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11143 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11144 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11145
11146 // Ra = *++Pa;
11147 // Rb = *--Pb;
11148 // Rm = *++Pm;
11149 // Rn = *--Pn;
11150 ldr(Ra, pre(Pa, wordSize));
11151 ldr(Rb, pre(Pb, -wordSize));
11152 ldr(Rm, pre(Pm, wordSize));
11153 ldr(Rn, pre(Pn, -wordSize));
11154
11155 mov(Rhi_mn, zr);
11156 mov(Rlo_mn, zr);
11157 }
11158
11159 void post2(RegisterOrConstant i, RegisterOrConstant len) {
11160 block_comment("post2");
11161 if (i.is_constant()) {
11162 mov(Rj, i.as_constant()-len.as_constant());
11163 } else {
11164 sub(Rj, i.as_register(), len);
11165 }
11166
11167 adds(t0, t0, Rlo_mn); // The pending m*n, low part
11168
11169 // As soon as we know the least significant digit of our result,
11170 // store it.
11171 // Pm_base[i-len] = t0;
11172 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11173
11174 // t0 = t1; t1 = t2; t2 = 0;
11175 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11176 adc(t1, t2, zr);
11177 mov(t2, zr);
11178 }
11179
11180 // A carry in t0 after Montgomery multiplication means that we
11181 // should subtract multiples of n from our result in m. We'll
11182 // keep doing that until there is no carry.
11183 void normalize(RegisterOrConstant len) {
11184 block_comment("normalize");
11185 // while (t0)
11186 // t0 = sub(Pm_base, Pn_base, t0, len);
11187 Label loop, post, again;
11188 Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11189 cbz(t0, post); {
11190 bind(again); {
11191 mov(i, zr);
11192 mov(cnt, len);
11193 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11194 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11195 subs(zr, zr, zr); // set carry flag, i.e. no borrow
11196 align(16);
11197 bind(loop); {
11198 sbcs(Rm, Rm, Rn);
11199 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11200 add(i, i, 1);
11201 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11202 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11203 sub(cnt, cnt, 1);
11204 } cbnz(cnt, loop);
11205 sbc(t0, t0, zr);
11206 } cbnz(t0, again);
11207 } bind(post);
11208 }
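  // In C, approximately (a sketch of the `sub` helper that the commented-out
  // C models further below refer to; it is not real code in this file):
  //
  //   julong sub(julong Pm[], julong Pn[], julong t0, int len) {
  //     julong borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       unsigned __int128 d = (unsigned __int128)Pm[i] - Pn[i] - borrow;
  //       Pm[i]  = (julong)d;
  //       borrow = (julong)(d >> 64) & 1;     // 1 iff the subtraction wrapped
  //     }
  //     return t0 - borrow;
  //   }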
11209
11210 // Move memory at s to d, reversing words.
11211 // Increments d to end of copied memory
11212 // Destroys tmp1, tmp2
11213 // Preserves len
11214 // Leaves s pointing to the address which was in d at start
11215 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11216 assert(tmp1->encoding() < r19->encoding(), "register corruption");
11217 assert(tmp2->encoding() < r19->encoding(), "register corruption");
11218
11219 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11220 mov(tmp1, len);
11221 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11222 sub(s, d, len, ext::uxtw, LogBytesPerWord);
11223 }
11224 // where
11225 void reverse1(Register d, Register s, Register tmp) {
11226 ldr(tmp, pre(s, -wordSize));
11227 ror(tmp, tmp, 32);
11228 str(tmp, post(d, wordSize));
11229 }
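  // In C, approximately (sketch; d and s denote their values on entry).
  // This converts between the caller's int-array layout and the longword
  // layout used by the multiply loops:
  //
  //   for (int i = 0; i < len; i++) {
  //     julong w = s[len - 1 - i];
  //     d[i] = (w >> 32) | (w << 32);         // reverse words, swap halves
  //   }
  //   // afterwards d points just past d[len - 1] and s points to the old d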
11230
11231 void step_squaring() {
11232 // An extra ACC
11233 step();
11234 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11235 }
11236
11237 void last_squaring(RegisterOrConstant i) {
11238 Label dont;
11239 // if ((i & 1) == 0) {
11240 tbnz(i.as_register(), 0, dont); {
11241 // MACC(Ra, Rb, t0, t1, t2);
11242 // Ra = *++Pa;
11243 // Rb = *--Pb;
11244 umulh(Rhi_ab, Ra, Rb);
11245 mul(Rlo_ab, Ra, Rb);
11246 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11247 } bind(dont);
11248 }
11249
11250 void extra_step_squaring() {
11251 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11252
11253 // MACC(Rm, Rn, t0, t1, t2);
11254 // Rm = *++Pm;
11255 // Rn = *--Pn;
11256 umulh(Rhi_mn, Rm, Rn);
11257 mul(Rlo_mn, Rm, Rn);
11258 ldr(Rm, pre(Pm, wordSize));
11259 ldr(Rn, pre(Pn, -wordSize));
11260 }
11261
11262 void post1_squaring() {
11263 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11264
11265 // *Pm = Rm = t0 * inv;
11266 mul(Rm, t0, inv);
11267 str(Rm, Address(Pm));
11268
11269 // MACC(Rm, Rn, t0, t1, t2);
11270 // t0 = t1; t1 = t2; t2 = 0;
11271 umulh(Rhi_mn, Rm, Rn);
11272
11273 #ifndef PRODUCT
11274 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11275 {
11276 mul(Rlo_mn, Rm, Rn);
11277 add(Rlo_mn, t0, Rlo_mn);
11278 Label ok;
11279 cbz(Rlo_mn, ok); {
11280 stop("broken Montgomery multiply");
11281 } bind(ok);
11282 }
11283 #endif
11284 // We have very carefully set things up so that
11285 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11286 // the lower half of Rm * Rn because we know the result already:
11287 // it must be -t0. t0 + (-t0) must generate a carry iff
11288 // t0 != 0. So, rather than do a mul and an adds we just set
11289 // the carry flag iff t0 is nonzero.
11290 //
11291 // mul(Rlo_mn, Rm, Rn);
11292 // adds(zr, t0, Rlo_mn);
11293 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11294 adcs(t0, t1, Rhi_mn);
11295 adc(t1, t2, zr);
11296 mov(t2, zr);
11297 }
11298
11299 void acc(Register Rhi, Register Rlo,
11300 Register t0, Register t1, Register t2) {
11301 adds(t0, t0, Rlo);
11302 adcs(t1, t1, Rhi);
11303 adc(t2, t2, zr);
11304 }
11305
11306 public:
11307 /**
11308 * Fast Montgomery multiplication. The derivation of the
11309 * algorithm is in A Cryptographic Library for the Motorola
11310 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11311 *
11312 * Arguments:
11313 *
11314 * Inputs for multiplication:
11315 * c_rarg0 - int array elements a
11316 * c_rarg1 - int array elements b
11317 * c_rarg2 - int array elements n (the modulus)
11318 * c_rarg3 - int length
11319 * c_rarg4 - int inv
11320 * c_rarg5 - int array elements m (the result)
11321 *
11322 * Inputs for squaring:
11323 * c_rarg0 - int array elements a
11324 * c_rarg1 - int array elements n (the modulus)
11325 * c_rarg2 - int length
11326 * c_rarg3 - int inv
11327 * c_rarg4 - int array elements m (the result)
11328 *
11329 */
11330 address generate_multiply() {
11331 Label argh, nothing;
11332 bind(argh);
11333 stop("MontgomeryMultiply total_allocation must be <= 8192");
11334
11335 align(CodeEntryAlignment);
11336 address entry = pc();
11337
11338 cbzw(Rlen, nothing);
11339
11340 enter();
11341
11342 // Make room.
11343 cmpw(Rlen, 512);
11344 br(Assembler::HI, argh);
11345 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11346 andr(sp, Ra, -2 * wordSize);
11347
11348 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11349
11350 {
11351 // Copy input args, reversing as we go. We use Ra as a
11352 // temporary variable.
11353 reverse(Ra, Pa_base, Rlen, t0, t1);
11354 if (!_squaring)
11355 reverse(Ra, Pb_base, Rlen, t0, t1);
11356 reverse(Ra, Pn_base, Rlen, t0, t1);
11357 }
11358
11359 // Push all callee-saved registers and also Pm_base which we'll need
11360 // at the end.
11361 save_regs();
11362
11363 #ifndef PRODUCT
11364 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11365 {
11366 ldr(Rn, Address(Pn_base, 0));
11367 mul(Rlo_mn, Rn, inv);
11368 subs(zr, Rlo_mn, -1);
11369 Label ok;
11370 br(EQ, ok); {
11371 stop("broken inverse in Montgomery multiply");
11372 } bind(ok);
11373 }
11374 #endif
11375
11376 mov(Pm_base, Ra);
11377
11378 mov(t0, zr);
11379 mov(t1, zr);
11380 mov(t2, zr);
11381
11382 block_comment("for (int i = 0; i < len; i++) {");
11383 mov(Ri, zr); {
11384 Label loop, end;
11385 cmpw(Ri, Rlen);
11386 br(Assembler::GE, end);
11387
11388 bind(loop);
11389 pre1(Ri);
11390
11391 block_comment(" for (j = i; j; j--) {"); {
11392 movw(Rj, Ri);
11393 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11394 } block_comment(" } // j");
11395
11396 post1();
11397 addw(Ri, Ri, 1);
11398 cmpw(Ri, Rlen);
11399 br(Assembler::LT, loop);
11400 bind(end);
11401 block_comment("} // i");
11402 }
11403
11404 block_comment("for (int i = len; i < 2*len; i++) {");
11405 mov(Ri, Rlen); {
11406 Label loop, end;
11407 cmpw(Ri, Rlen, Assembler::LSL, 1);
11408 br(Assembler::GE, end);
11409
11410 bind(loop);
11411 pre2(Ri, Rlen);
11412
11413 block_comment(" for (j = len*2-i-1; j; j--) {"); {
11414 lslw(Rj, Rlen, 1);
11415 subw(Rj, Rj, Ri);
11416 subw(Rj, Rj, 1);
11417 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11418 } block_comment(" } // j");
11419
11420 post2(Ri, Rlen);
11421 addw(Ri, Ri, 1);
11422 cmpw(Ri, Rlen, Assembler::LSL, 1);
11423 br(Assembler::LT, loop);
11424 bind(end);
11425 }
11426 block_comment("} // i");
11427
11428 normalize(Rlen);
11429
11430 mov(Ra, Pm_base); // Save Pm_base in Ra
11431 restore_regs(); // Restore caller's Pm_base
11432
11433 // Copy our result into caller's Pm_base
11434 reverse(Pm_base, Ra, Rlen, t0, t1);
11435
11436 leave();
11437 bind(nothing);
11438 ret(lr);
11439
11440 return entry;
11441 }
11442 // In C, approximately:
11443
11444 // void
11445 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11446 // julong Pn_base[], julong Pm_base[],
11447 // julong inv, int len) {
11448 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11449 // julong *Pa, *Pb, *Pn, *Pm;
11450 // julong Ra, Rb, Rn, Rm;
11451
11452 // int i;
11453
11454 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11455
11456 // for (i = 0; i < len; i++) {
11457 // int j;
11458
11459 // Pa = Pa_base;
11460 // Pb = Pb_base + i;
11461 // Pm = Pm_base;
11462 // Pn = Pn_base + i;
11463
11464 // Ra = *Pa;
11465 // Rb = *Pb;
11466 // Rm = *Pm;
11467 // Rn = *Pn;
11468
11469 // int iters = i;
11470 // for (j = 0; iters--; j++) {
11471 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11472 // MACC(Ra, Rb, t0, t1, t2);
11473 // Ra = *++Pa;
11474 // Rb = *--Pb;
11475 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11476 // MACC(Rm, Rn, t0, t1, t2);
11477 // Rm = *++Pm;
11478 // Rn = *--Pn;
11479 // }
11480
11481 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11482 // MACC(Ra, Rb, t0, t1, t2);
11483 // *Pm = Rm = t0 * inv;
11484 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11485 // MACC(Rm, Rn, t0, t1, t2);
11486
11487 // assert(t0 == 0, "broken Montgomery multiply");
11488
11489 // t0 = t1; t1 = t2; t2 = 0;
11490 // }
11491
11492 // for (i = len; i < 2*len; i++) {
11493 // int j;
11494
11495 // Pa = Pa_base + i-len;
11496 // Pb = Pb_base + len;
11497 // Pm = Pm_base + i-len;
11498 // Pn = Pn_base + len;
11499
11500 // Ra = *++Pa;
11501 // Rb = *--Pb;
11502 // Rm = *++Pm;
11503 // Rn = *--Pn;
11504
11505 // int iters = len*2-i-1;
11506 // for (j = i-len+1; iters--; j++) {
11507 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11508 // MACC(Ra, Rb, t0, t1, t2);
11509 // Ra = *++Pa;
11510 // Rb = *--Pb;
11511 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11512 // MACC(Rm, Rn, t0, t1, t2);
11513 // Rm = *++Pm;
11514 // Rn = *--Pn;
11515 // }
11516
11517 // Pm_base[i-len] = t0;
11518 // t0 = t1; t1 = t2; t2 = 0;
11519 // }
11520
11521 // while (t0)
11522 // t0 = sub(Pm_base, Pn_base, t0, len);
11523 // }
11524
11525 /**
11526 * Fast Montgomery squaring. This uses asymptotically 25% fewer
11527 * multiplies than Montgomery multiplication so it should be up to
11528 * 25% faster. However, its loop control is more complex and it
11529 * may actually run slower on some machines.
11530 *
11531 * Arguments:
11532 *
11533 * Inputs:
11534 * c_rarg0 - int array elements a
11535 * c_rarg1 - int array elements n (the modulus)
11536 * c_rarg2 - int length
11537 * c_rarg3 - int inv
11538 * c_rarg4 - int array elements m (the result)
11539 *
11540 */
11541 address generate_square() {
11542 Label argh;
11543 bind(argh);
11544 stop("MontgomeryMultiply total_allocation must be <= 8192");
11545
11546 align(CodeEntryAlignment);
11547 address entry = pc();
11548
11549 enter();
11550
11551 // Make room.
11552 cmpw(Rlen, 512);
11553 br(Assembler::HI, argh);
11554 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11555 andr(sp, Ra, -2 * wordSize);
11556
11557 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11558
11559 {
11560 // Copy input args, reversing as we go. We use Ra as a
11561 // temporary variable.
11562 reverse(Ra, Pa_base, Rlen, t0, t1);
11563 reverse(Ra, Pn_base, Rlen, t0, t1);
11564 }
11565
11566 // Push all callee-saved registers and also Pm_base which we'll need
11567 // at the end.
11568 save_regs();
11569
11570 mov(Pm_base, Ra);
11571
11572 mov(t0, zr);
11573 mov(t1, zr);
11574 mov(t2, zr);
11575
11576 block_comment("for (int i = 0; i < len; i++) {");
11577 mov(Ri, zr); {
11578 Label loop, end;
11579 bind(loop);
11580 cmp(Ri, Rlen);
11581 br(Assembler::GE, end);
11582
11583 pre1(Ri);
11584
11585 block_comment("for (j = (i+1)/2; j; j--) {"); {
11586 add(Rj, Ri, 1);
11587 lsr(Rj, Rj, 1);
11588 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11589 } block_comment(" } // j");
11590
11591 last_squaring(Ri);
11592
11593 block_comment(" for (j = i/2; j; j--) {"); {
11594 lsr(Rj, Ri, 1);
11595 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11596 } block_comment(" } // j");
11597
11598 post1_squaring();
11599 add(Ri, Ri, 1);
11600 cmp(Ri, Rlen);
11601 br(Assembler::LT, loop);
11602
11603 bind(end);
11604 block_comment("} // i");
11605 }
11606
11607 block_comment("for (int i = len; i < 2*len; i++) {");
11608 mov(Ri, Rlen); {
11609 Label loop, end;
11610 bind(loop);
11611 cmp(Ri, Rlen, Assembler::LSL, 1);
11612 br(Assembler::GE, end);
11613
11614 pre2(Ri, Rlen);
11615
11616 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
11617 lsl(Rj, Rlen, 1);
11618 sub(Rj, Rj, Ri);
11619 sub(Rj, Rj, 1);
11620 lsr(Rj, Rj, 1);
11621 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11622 } block_comment(" } // j");
11623
11624 last_squaring(Ri);
11625
11626 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
11627 lsl(Rj, Rlen, 1);
11628 sub(Rj, Rj, Ri);
11629 lsr(Rj, Rj, 1);
11630 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11631 } block_comment(" } // j");
11632
11633 post2(Ri, Rlen);
11634 add(Ri, Ri, 1);
11635 cmp(Ri, Rlen, Assembler::LSL, 1);
11636
11637 br(Assembler::LT, loop);
11638 bind(end);
11639 block_comment("} // i");
11640 }
11641
11642 normalize(Rlen);
11643
11644 mov(Ra, Pm_base); // Save Pm_base in Ra
11645 restore_regs(); // Restore caller's Pm_base
11646
11647 // Copy our result into caller's Pm_base
11648 reverse(Pm_base, Ra, Rlen, t0, t1);
11649
11650 leave();
11651 ret(lr);
11652
11653 return entry;
11654 }
11655 // In C, approximately:
11656
11657 // void
11658 // montgomery_square(julong Pa_base[], julong Pn_base[],
11659 // julong Pm_base[], julong inv, int len) {
11660 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11661 // julong *Pa, *Pb, *Pn, *Pm;
11662 // julong Ra, Rb, Rn, Rm;
11663
11664 // int i;
11665
11666 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11667
11668 // for (i = 0; i < len; i++) {
11669 // int j;
11670
11671 // Pa = Pa_base;
11672 // Pb = Pa_base + i;
11673 // Pm = Pm_base;
11674 // Pn = Pn_base + i;
11675
11676 // Ra = *Pa;
11677 // Rb = *Pb;
11678 // Rm = *Pm;
11679 // Rn = *Pn;
11680
11681 // int iters = (i+1)/2;
11682 // for (j = 0; iters--; j++) {
11683 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11684 // MACC2(Ra, Rb, t0, t1, t2);
11685 // Ra = *++Pa;
11686 // Rb = *--Pb;
11687 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11688 // MACC(Rm, Rn, t0, t1, t2);
11689 // Rm = *++Pm;
11690 // Rn = *--Pn;
11691 // }
11692 // if ((i & 1) == 0) {
11693 // assert(Ra == Pa_base[j], "must be");
11694 // MACC(Ra, Ra, t0, t1, t2);
11695 // }
11696 // iters = i/2;
11697 // assert(iters == i-j, "must be");
11698 // for (; iters--; j++) {
11699 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11700 // MACC(Rm, Rn, t0, t1, t2);
11701 // Rm = *++Pm;
11702 // Rn = *--Pn;
11703 // }
11704
11705 // *Pm = Rm = t0 * inv;
11706 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11707 // MACC(Rm, Rn, t0, t1, t2);
11708
11709 // assert(t0 == 0, "broken Montgomery multiply");
11710
11711 // t0 = t1; t1 = t2; t2 = 0;
11712 // }
11713
11714 // for (i = len; i < 2*len; i++) {
11715 // int start = i-len+1;
11716 // int end = start + (len - start)/2;
11717 // int j;
11718
11719 // Pa = Pa_base + i-len;
11720 // Pb = Pa_base + len;
11721 // Pm = Pm_base + i-len;
11722 // Pn = Pn_base + len;
11723
11724 // Ra = *++Pa;
11725 // Rb = *--Pb;
11726 // Rm = *++Pm;
11727 // Rn = *--Pn;
11728
11729 // int iters = (2*len-i-1)/2;
11730 // assert(iters == end-start, "must be");
11731 // for (j = start; iters--; j++) {
11732 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11733 // MACC2(Ra, Rb, t0, t1, t2);
11734 // Ra = *++Pa;
11735 // Rb = *--Pb;
11736 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11737 // MACC(Rm, Rn, t0, t1, t2);
11738 // Rm = *++Pm;
11739 // Rn = *--Pn;
11740 // }
11741 // if ((i & 1) == 0) {
11742 // assert(Ra == Pa_base[j], "must be");
11743 // MACC(Ra, Ra, t0, t1, t2);
11744 // }
11745 // iters = (2*len-i)/2;
11746 // assert(iters == len-j, "must be");
11747 // for (; iters--; j++) {
11748 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11749 // MACC(Rm, Rn, t0, t1, t2);
11750 // Rm = *++Pm;
11751 // Rn = *--Pn;
11752 // }
11753 // Pm_base[i-len] = t0;
11754 // t0 = t1; t1 = t2; t2 = 0;
11755 // }
11756
11757 // while (t0)
11758 // t0 = sub(Pm_base, Pn_base, t0, len);
11759 // }
11760 };
11761
11762 // Call here from the interpreter or compiled code either to load
11763 // the multiple returned values from the inline type instance being
11764 // returned into registers, or to store the returned values into a
11765 // newly allocated inline type instance.
11766 address generate_return_value_stub(address destination, const char* name, bool has_res) {
11767 // We need to save all registers the calling convention may use so
11768 // that the runtime call can read or update those registers. This
11769 // needs to be kept in sync with SharedRuntime::java_return_convention().
11770 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
11771 enum layout {
11772 j_rarg7_off = 0, j_rarg7_2, // j_rarg7 is r0
11773 j_rarg6_off, j_rarg6_2,
11774 j_rarg5_off, j_rarg5_2,
11775 j_rarg4_off, j_rarg4_2,
11776 j_rarg3_off, j_rarg3_2,
11777 j_rarg2_off, j_rarg2_2,
11778 j_rarg1_off, j_rarg1_2,
11779 j_rarg0_off, j_rarg0_2,
11780
11781 j_farg7_off, j_farg7_2,
11782 j_farg6_off, j_farg6_2,
11783 j_farg5_off, j_farg5_2,
11784 j_farg4_off, j_farg4_2,
11785 j_farg3_off, j_farg3_2,
11786 j_farg2_off, j_farg2_2,
11787 j_farg1_off, j_farg1_2,
11788 j_farg0_off, j_farg0_2,
11789
11790 rfp_off, rfp_off2,
11791 return_off, return_off2,
11792
11793 framesize // inclusive of return address
11794 };
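    // Each *_off/*_2 pair above is two 32-bit stack slots, i.e. one 64-bit
    // spill slot per register, so framesize is counted in 32-bit slots
    // (hence frame_size_in_bytes = framesize * BytesPerInt below).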
11795
11796 CodeBuffer code(name, 512, 64);
11797 MacroAssembler* masm = new MacroAssembler(&code);
11798
11799 int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
11800 assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
11801 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
11802 int frame_size_in_words = frame_size_in_bytes / wordSize;
11803
11804 OopMapSet* oop_maps = new OopMapSet();
11805 OopMap* map = new OopMap(frame_size_in_slots, 0);
11806
11807 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
11808 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
11809 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
11810 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
11811 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
11812 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
11813 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
11814 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
11815
11816 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
11817 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
11818 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
11819 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
11820 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
11821 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
11822 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
11823 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
11824
11825 address start = __ pc();
11826
11827 __ enter(); // Save FP and LR before call
11828
11829 __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
11830 __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
11831 __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
11832 __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
11833
11834 __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
11835 __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
11836 __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
11837 __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
11838
11839 int frame_complete = __ offset();
11840
11841 // Set up last_Java_sp and last_Java_fp
11842 address the_pc = __ pc();
11843 __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
11844
11845 // Call runtime
11846 __ mov(c_rarg1, r0);
11847 __ mov(c_rarg0, rthread);
11848
11849 __ mov(rscratch1, destination);
11850 __ blr(rscratch1);
11851
11852 oop_maps->add_gc_map(the_pc - start, map);
11853
11854 __ reset_last_Java_frame(false);
11855
11856 __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
11857 __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
11858 __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
11859 __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
11860
11861 __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
11862 __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
11863 __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
11864 __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
11865
11866 __ leave();
11867
11868 // check for pending exceptions
11869 Label pending;
11870 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
11871 __ cbnz(rscratch1, pending);
11872
11873 if (has_res) {
11874 __ get_vm_result_oop(r0, rthread);
11875 }
11876
11877 __ ret(lr);
11878
11879 __ bind(pending);
11880 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
11881
11882 // -------------
11883 // make sure all code is generated
11884 masm->flush();
11885
11886 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
11887 return stub->entry_point();
11888 }
11889
11890 // Initialization
11891 void generate_preuniverse_stubs() {
11892 // preuniverse stubs are not needed for aarch64
11893 }
11894
11895 void generate_initial_stubs() {
11896 // Generates initial stubs and initializes the entry points
11897
11898 // Entry points that exist on all platforms. Note: this is code
11899 // that could be shared among different platforms - however, the
11900 // benefit seems to be smaller than the disadvantage of having a
11901 // much more complicated generator structure. See also the comment
11902 // in stubRoutines.hpp.
11903
11904 StubRoutines::_forward_exception_entry = generate_forward_exception();
11905
11906 StubRoutines::_call_stub_entry =
11907 generate_call_stub(StubRoutines::_call_stub_return_address);
11908
11909 // is referenced by megamorphic call
11910 StubRoutines::_catch_exception_entry = generate_catch_exception();
11911
11912 // Initialize table for copy memory (arraycopy) check.
11913 if (UnsafeMemoryAccess::_table == nullptr) {
11914 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11915 }
11916
11917 if (UseCRC32Intrinsics) {
11918 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11919 }
11920
11921 if (UseCRC32CIntrinsics) {
11922 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11923 }
11924
11925 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11926 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11927 }
11928
11929 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11930 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11931 }
11932
11933 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11934 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11935 StubRoutines::_hf2f = generate_float16ToFloat();
11936 StubRoutines::_f2hf = generate_floatToFloat16();
11937 }
11938
11939 if (InlineTypeReturnedAsFields) {
11940 StubRoutines::_load_inline_type_fields_in_regs =
11941 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
11942 StubRoutines::_store_inline_type_fields_to_buf =
11943 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
11944 }
11945
11946 }
11947
11948 void generate_continuation_stubs() {
11949 // Continuation stubs:
11950 StubRoutines::_cont_thaw = generate_cont_thaw();
11951 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11952 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11953 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11954 }
11955
11956 void generate_final_stubs() {
11957 // support for verify_oop (must happen after universe_init)
11958 if (VerifyOops) {
11959 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
11960 }
11961
11962 // arraycopy stubs used by compilers
11963 generate_arraycopy_stubs();
11964
11965 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11966
11967 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11968
11969 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11970 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11971
11972 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11973
11974 generate_atomic_entry_points();
11975
11976 #endif // LINUX
11977
11978 #ifdef COMPILER2
11979 if (UseSecondarySupersTable) {
11980 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11981 if (! InlineSecondarySupersTest) {
11982 generate_lookup_secondary_supers_table_stub();
11983 }
11984 }
11985 #endif
11986
11987 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11988
11989 StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11990 }
11991
11992 void generate_compiler_stubs() {
11993 #if COMPILER2_OR_JVMCI
11994
11995 if (UseSVE == 0) {
11996 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11997 }
11998
11999 // array equals stub for large arrays.
12000 if (!UseSimpleArrayEquals) {
12001 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
12002 }
12003
12004 // arrays_hashcode stubs for large arrays.
12005 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
12006 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
12007 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
12008 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
12009 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
12010
12011 // byte_array_inflate stub for large arrays.
12012 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
12013
12014 // countPositives stub for large arrays.
12015 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
12016
12017 generate_compare_long_strings();
12018
12019 generate_string_indexof_stubs();
12020
12021 #ifdef COMPILER2
12022 if (UseMultiplyToLenIntrinsic) {
12023 StubRoutines::_multiplyToLen = generate_multiplyToLen();
12024 }
12025
12026 if (UseSquareToLenIntrinsic) {
12027 StubRoutines::_squareToLen = generate_squareToLen();
12028 }
12029
12030 if (UseMulAddIntrinsic) {
12031 StubRoutines::_mulAdd = generate_mulAdd();
12032 }
12033
12034 if (UseSIMDForBigIntegerShiftIntrinsics) {
12035 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
12036 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
12037 }
12038
12039 if (UseMontgomeryMultiplyIntrinsic) {
12040 StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
12041 StubCodeMark mark(this, stub_id);
12042 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
12043 StubRoutines::_montgomeryMultiply = g.generate_multiply();
12044 }
12045
12046 if (UseMontgomerySquareIntrinsic) {
12047 StubId stub_id = StubId::stubgen_montgomerySquare_id;
12048 StubCodeMark mark(this, stub_id);
12049 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
12050 // We use generate_multiply() rather than generate_square()
12051 // because it's faster for the sizes of modulus we care about.
12052 StubRoutines::_montgomerySquare = g.generate_multiply();
12053 }
12054
12055 #endif // COMPILER2
12056
12057 if (UseChaCha20Intrinsics) {
12058 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
12059 }
12060
12061 if (UseKyberIntrinsics) {
12062 StubRoutines::_kyberNtt = generate_kyberNtt();
12063 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
12064 StubRoutines::_kyberNttMult = generate_kyberNttMult();
12065 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
12066 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
12067 StubRoutines::_kyber12To16 = generate_kyber12To16();
12068 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
12069 }
12070
12071 if (UseDilithiumIntrinsics) {
12072 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
12073 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
12074 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
12075 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
12076 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
12077 }
12078
12079 if (UseBASE64Intrinsics) {
12080 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
12081 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
12082 }
12083
12084 // data cache line writeback
12085 StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
12086 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
12087
12088 if (UseAESIntrinsics) {
12089 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
12090 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
12091 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
12092 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
12093 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
12094 }
12095 if (UseGHASHIntrinsics) {
12096 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
12097 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
12098 }
12099 if (UseAESIntrinsics && UseGHASHIntrinsics) {
12100 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
12101 }
12102
12103 if (UseMD5Intrinsics) {
12104 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
12105 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
12106 }
12107 if (UseSHA1Intrinsics) {
12108 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
12109 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
12110 }
12111 if (UseSHA256Intrinsics) {
12112 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
12113 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
12114 }
12115 if (UseSHA512Intrinsics) {
12116 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
12117 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
12118 }
12119 if (UseSHA3Intrinsics) {
12120
12121 StubRoutines::_double_keccak = generate_double_keccak();
12122 if (UseSIMDForSHA3Intrinsic) {
12123 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
12124 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
12125 } else {
12126 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
12127 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
12128 }
12129 }
12130
12131 if (UsePoly1305Intrinsics) {
12132 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
12133 }
12134
12135 // generate Adler32 intrinsics code
12136 if (UseAdler32Intrinsics) {
12137 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
12138 }
12139
12140 #endif // COMPILER2_OR_JVMCI
12141 }
12142
12143 public:
12144 StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
12145 switch(blob_id) {
12146 case BlobId::stubgen_preuniverse_id:
12147 generate_preuniverse_stubs();
12148 break;
12149 case BlobId::stubgen_initial_id:
12150 generate_initial_stubs();
12151 break;
12152 case BlobId::stubgen_continuation_id:
12153 generate_continuation_stubs();
12154 break;
12155 case BlobId::stubgen_compiler_id:
12156 generate_compiler_stubs();
12157 break;
12158 case BlobId::stubgen_final_id:
12159 generate_final_stubs();
12160 break;
12161 default:
12162 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12163 break;
12164 };
12165 }
12166 }; // end class declaration
12167
12168 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
12169 StubGenerator g(code, blob_id);
12170 }
12171
12172
12173 #if defined (LINUX)
12174
12175 // Define pointers to atomic stubs and initialize them to point to the
12176 // code in atomic_aarch64.S.
12177
12178 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \
12179 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12180 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
12181 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12182 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
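// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands (roughly) to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_impl
//     = aarch64_atomic_fetch_add_4_default_impl;
//
// The _relaxed/_release/_seq_cst variants append that suffix to both names.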
12183
12184 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12185 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12186 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12187 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12188 DEFAULT_ATOMIC_OP(xchg, 4, )
12189 DEFAULT_ATOMIC_OP(xchg, 8, )
12190 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12191 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12192 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12193 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12194 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12195 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12196 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12197 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12198 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12199 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12200
12201 #undef DEFAULT_ATOMIC_OP
12202
12203 #endif // LINUX