1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "compiler/oopMap.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/interpreter.hpp"
36 #include "memory/universe.hpp"
37 #include "nativeInst_aarch64.hpp"
38 #include "oops/instanceOop.hpp"
39 #include "oops/method.hpp"
40 #include "oops/objArrayKlass.hpp"
41 #include "oops/oop.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "prims/upcallLinker.hpp"
44 #include "runtime/arguments.hpp"
45 #include "runtime/atomicAccess.hpp"
46 #include "runtime/continuation.hpp"
47 #include "runtime/continuationEntry.inline.hpp"
48 #include "runtime/frame.inline.hpp"
49 #include "runtime/handles.inline.hpp"
50 #include "runtime/javaThread.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubCodeGenerator.hpp"
53 #include "runtime/stubRoutines.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/checkedCast.hpp"
56 #include "utilities/debug.hpp"
57 #include "utilities/globalDefinitions.hpp"
58 #include "utilities/intpow.hpp"
59 #include "utilities/powerOfTwo.hpp"
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_ZGC
64 #include "gc/z/zThreadLocalData.hpp"
65 #endif
66
67 // Declaration and definition of StubGenerator (no .hpp file).
68 // For a more detailed description of the stub routine structure
69 // see the comment in stubRoutines.hpp
70
71 #undef __
72 #define __ _masm->
73
74 #ifdef PRODUCT
75 #define BLOCK_COMMENT(str) /* nothing */
76 #else
77 #define BLOCK_COMMENT(str) __ block_comment(str)
78 #endif
79
80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
81
82 // Stub Code definitions
83
84 class StubGenerator: public StubCodeGenerator {
85 private:
86
#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  // Bump a 32-bit stub-statistics counter in place. Non-product builds
  // only; in PRODUCT the macro above compiles away to nothing.
  void inc_counter_np_(uint& counter) {
    __ incrementw(ExternalAddress((address)&counter));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif
97
98 // Call stubs are used to call Java from C
99 //
100 // Arguments:
101 // c_rarg0: call wrapper address address
102 // c_rarg1: result address
103 // c_rarg2: result type BasicType
104 // c_rarg3: method Method*
105 // c_rarg4: (interpreter) entry point address
106 // c_rarg5: parameters intptr_t*
107 // c_rarg6: parameter size (in words) int
108 // c_rarg7: thread Thread*
109 //
110 // There is no return from the stub itself as any Java result
111 // is written to result
112 //
113 // we save r30 (lr) as the return PC at the base of the frame and
114 // link r29 (fp) below it as the frame pointer installing sp (r31)
115 // into fp.
116 //
117 // we save r0-r7, which accounts for all the c arguments.
118 //
119 // TODO: strictly do we need to save them all? they are treated as
120 // volatile by C so could we omit saving the ones we are going to
121 // place in global registers (thread? method?) or those we only use
122 // during setup of the Java call?
123 //
124 // we don't need to save r8 which C uses as an indirect result location
125 // return register.
126 //
127 // we don't need to save r9-r15 which both C and Java treat as
128 // volatile
129 //
130 // we don't need to save r16-18 because Java does not use them
131 //
132 // we save r19-r28 which Java uses as scratch registers and C
133 // expects to be callee-save
134 //
135 // we save the bottom 64 bits of each value stored in v8-v15; it is
136 // the responsibility of the caller to preserve larger values.
137 //
138 // so the stub frame looks like this when we enter Java code
139 //
140 // [ return_from_Java ] <--- sp
141 // [ argument word n ]
142 // ...
143 // -29 [ argument word 1 ]
144 // -28 [ saved Floating-point Control Register ]
145 // -26 [ saved v15 ] <--- sp_after_call
146 // -25 [ saved v14 ]
147 // -24 [ saved v13 ]
148 // -23 [ saved v12 ]
149 // -22 [ saved v11 ]
150 // -21 [ saved v10 ]
151 // -20 [ saved v9 ]
152 // -19 [ saved v8 ]
153 // -18 [ saved r28 ]
154 // -17 [ saved r27 ]
155 // -16 [ saved r26 ]
156 // -15 [ saved r25 ]
157 // -14 [ saved r24 ]
158 // -13 [ saved r23 ]
159 // -12 [ saved r22 ]
160 // -11 [ saved r21 ]
161 // -10 [ saved r20 ]
162 // -9 [ saved r19 ]
163 // -8 [ call wrapper (r0) ]
164 // -7 [ result (r1) ]
165 // -6 [ result type (r2) ]
166 // -5 [ method (r3) ]
167 // -4 [ entry point (r4) ]
168 // -3 [ parameters (r5) ]
169 // -2 [ parameter size (r6) ]
170 // -1 [ thread (r7) ]
171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
172 // 1 [ saved lr (r30) ]
173
174 // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -28,

    // the FPCR save slot sits immediately below the v8-v15 save area
    fpcr_off = sp_after_call_off,
    // v8-v15 (low 64 bits) are saved in pairs with stpd, so only every
    // other slot needs a name; e.g. d15_off covers d15 and d14
    d15_off = -26,
    d13_off = -24,
    d11_off = -22,
    d9_off = -20,

    // r19-r28 are likewise saved in pairs with stp
    r28_off = -18,
    r26_off = -16,
    r24_off = -14,
    r22_off = -12,
    r20_off = -10,
    call_wrapper_off = -8,
    result_off = -7,
    result_type_off = -6,
    method_off = -5,
    // entry point (c_rarg4) and parameters (c_rarg5) are stored as a
    // pair at slots -4/-3, so only the lower offset is named
    entry_point_off = -4,
    parameter_size_off = -2,
    thread_off = -1,
    fp_f = 0,
    retaddr_off = 1,
  };
199
  // Generate the stub used to enter Java from C (layout described in the
  // comment above). Publishes the pc immediately after the Java call via
  // return_address for use by exception handling and stack walking.
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubId stub_id = StubId::stubgen_call_stub_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // fp-relative addresses of the frame slots named in call_stub_layout
    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address fpcr_save     (rfp, fpcr_off           * wordSize);
    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7, thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    // callee-saved general registers, stored in pairs; the first stp
    // operand goes to the lower address, hence the reversed order
    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    // bottom 64 bits of the callee-saved SIMD registers
    __ stpd(v9, v8, d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // save the caller's FPCR, then switch to the state Java requires
    __ get_fpcr(rscratch1);
    __ str(rscratch1, fpcr_save);
    // Set FPCR to the state we need. We do want Round to Nearest. We
    // don't want non-IEEE rounding modes or floating-point traps.
    __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
    __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
    __ set_fpcr(rscratch1);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize); // keep sp 16-byte aligned

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    // copy the parameter words one at a time onto the Java stack
    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method*, and current sp
    // rmethod: Method*
    // r19_sender_sp: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r19_sender_sp, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    // All of j_rargN may be used to return inline type fields so be careful
    // not to clobber those.
    // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
    // assignment of Rresult below.
    Register Rresult = r14, Rresult_type = r15;
    __ ldr(Rresult, result);
    Label is_long, is_float, is_double, check_prim, exit;
    __ ldr(Rresult_type, result_type);
    __ cmp(Rresult_type, (u1)T_OBJECT);
    __ br(Assembler::EQ, check_prim);
    __ cmp(Rresult_type, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(Rresult_type, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(Rresult_type, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(Rresult));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    __ pop_cont_fastpath(rthread);

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9, v8, d9_save);

    __ ldp(r28, r27, r28_save);
    __ ldp(r26, r25, r26_save);
    __ ldp(r24, r23, r24_save);
    __ ldp(r22, r21, r22_save);
    __ ldp(r20, r19, r20_save);

    // restore fpcr
    __ ldr(rscratch1, fpcr_save);
    __ set_fpcr(rscratch1);

    // reload the saved register arguments
    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3, method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT
    __ BIND(check_prim);
    if (InlineTypeReturnedAsFields) {
      // Check for scalarized return value: the tbz/andr pair below
      // treats a set low bit in r0 as a tagged InlineKlass* rather
      // than an ordinary oop (clear bit -> store as T_LONG/oop)
      __ tbz(r0, 0, is_long);
      // Load pack handler address
      __ andr(rscratch1, r0, -2); // strip the tag bit
      __ ldr(rscratch1, Address(rscratch1, InlineKlass::adr_members_offset()));
      __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
      __ blr(rscratch1); // buffer the scalarized fields into an oop
      __ b(exit);
    }

    __ BIND(is_long);
    __ str(r0, Address(Rresult, 0));
    __ br(Assembler::AL, exit); // AL = unconditional

    __ BIND(is_float);
    __ strs(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(Rresult, 0));
    __ br(Assembler::AL, exit);

    return start;
  }
427
428 // Return point for a Java call if there's an exception thrown in
429 // Java code. The exception is caught and transformed into a
430 // pending exception stored in JavaThread that can be tested from
431 // within the VM.
432 //
433 // Note: Usually the parameters are removed by the callee. In case
434 // of an exception crossing an activation frame boundary, that is
435 // not the case if the callee is compiled code => need to setup the
436 // rsp.
437 //
438 // r0: exception oop
439
  address generate_catch_exception() {
    StubId stub_id = StubId::stubgen_catch_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread (rfp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    // record the exception oop as the thread's pending exception,
    // along with this file/line as a debugging aid
    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != nullptr,
           "_call_stub_return_address must have been generated before");
    // rejoin the call stub at its return address so it unwinds normally
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }
481
482 // Continuation point for runtime calls returning with a pending
483 // exception. The pending exception check happened in the runtime
484 // or native call stub. The pending exception in Thread is
485 // converted into a Java-level exception.
486 //
487 // Contract with Java-level exception handlers:
488 // r0: exception
489 // r3: throwing pc
490 //
491 // NOTE: At entry of this stub, exception-pc must be in LR !!
492
493 // NOTE: this is always used as a jump target within generated code
494 // so it just needs to be generated code with no x86 prolog
495
  address generate_forward_exception() {
    StubId stub_id = StubId::stubgen_forward_exception_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them. A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // Reinitialize the ptrue predicate register, in case the external runtime
    // call clobbers ptrue reg, as we may return to SVE compiled code.
    __ reinitialize_ptrue();

    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19); // r3 <- throwing pc (saved lr)
    __ mov(r19, r0); // r19 <- handler address returned by the VM call
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }
572
573 // Non-destructive plausibility checks for oops
574 //
575 // Arguments:
576 // r0: oop to verify
577 // rscratch1: error message
578 //
579 // Stack after saving c_rarg3:
580 // [tos + 0]: saved c_rarg3
581 // [tos + 1]: saved c_rarg2
582 // [tos + 2]: saved lr
583 // [tos + 3]: saved rscratch2
584 // [tos + 4]: saved r0
585 // [tos + 5]: saved rscratch1
  address generate_verify_oop() {
    StubId stub_id = StubId::stubgen_verify_oop_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    // bump the verify_oop invocation counter by hand (load/add/store)
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is null it is OK

    // delegate the GC-specific plausibility checks to the barrier set;
    // branches to 'error' if the oop looks bad
    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
    bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    // save all general registers so debug64 can display them
    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1); // pass address of error message
    __ mov(c_rarg1, lr); // pass return address
    __ mov(c_rarg2, sp); // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0); // debug64 does not return; trap if it ever does
    return start;
  }
634
635 // Generate indices for iota vector.
636 address generate_iota_indices(StubId stub_id) {
637 __ align(CodeEntryAlignment);
638 StubCodeMark mark(this, stub_id);
639 address start = __ pc();
640 // B
641 __ emit_data64(0x0706050403020100, relocInfo::none);
642 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
643 // H
644 __ emit_data64(0x0003000200010000, relocInfo::none);
645 __ emit_data64(0x0007000600050004, relocInfo::none);
646 // S
647 __ emit_data64(0x0000000100000000, relocInfo::none);
648 __ emit_data64(0x0000000300000002, relocInfo::none);
649 // D
650 __ emit_data64(0x0000000000000000, relocInfo::none);
651 __ emit_data64(0x0000000000000001, relocInfo::none);
652 // S - FP
653 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
654 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
655 // D - FP
656 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
657 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
658 return start;
659 }
660
661 // The inner part of zero_words(). This is the bulk operation,
662 // zeroing words in blocks, possibly using DC ZVA to do it. The
663 // caller is responsible for zeroing the last few words.
664 //
665 // Inputs:
666 // r10: the HeapWord-aligned base address of an array to zero.
667 // r11: the count in HeapWords, r11 > 0.
668 //
669 // Returns r10 and r11, adjusted for the caller to clear.
670 // r10: the base address of the tail of words left to clear.
671 // r11: the number of words in the tail.
672 // r11 < MacroAssembler::zero_words_block_size.
673
  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_zero_blocks_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      // align base to 16 bytes: if bit 3 is set, zero one leading word
      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      // low_limit is in bytes, cnt is in words, hence the >> 3
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      // undo the last subtraction so cnt holds the true residual count
      // (guaranteed < zero_words_block_size, per the contract above)
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }
728
729
  // Direction multiplier for the bulk copy stubs: +1 steps addresses
  // forwards, -1 steps them backwards (used for overlapping copies).
  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;
734
735 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
736 // for arraycopy stubs.
737 class ArrayCopyBarrierSetHelper : StackObj {
738 BarrierSetAssembler* _bs_asm;
739 MacroAssembler* _masm;
740 DecoratorSet _decorators;
741 BasicType _type;
742 Register _gct1;
743 Register _gct2;
744 Register _gct3;
745 FloatRegister _gcvt1;
746 FloatRegister _gcvt2;
747 FloatRegister _gcvt3;
748
749 public:
750 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
751 DecoratorSet decorators,
752 BasicType type,
753 Register gct1,
754 Register gct2,
755 Register gct3,
756 FloatRegister gcvt1,
757 FloatRegister gcvt2,
758 FloatRegister gcvt3)
759 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
760 _masm(masm),
761 _decorators(decorators),
762 _type(type),
763 _gct1(gct1),
764 _gct2(gct2),
765 _gct3(gct3),
766 _gcvt1(gcvt1),
767 _gcvt2(gcvt2),
768 _gcvt3(gcvt3) {
769 }
770
771 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
772 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
773 dst1, dst2, src,
774 _gct1, _gct2, _gcvt1);
775 }
776
777 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
778 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
779 dst, src1, src2,
780 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
781 }
782
783 void copy_load_at_16(Register dst1, Register dst2, Address src) {
784 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
785 dst1, dst2, src,
786 _gct1);
787 }
788
789 void copy_store_at_16(Address dst, Register src1, Register src2) {
790 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
791 dst, src1, src2,
792 _gct1, _gct2, _gct3);
793 }
794
795 void copy_load_at_8(Register dst, Address src) {
796 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
797 dst, noreg, src,
798 _gct1);
799 }
800
801 void copy_store_at_8(Address dst, Register src) {
802 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
803 dst, src, noreg,
804 _gct1, _gct2, _gct3);
805 }
806 };
807
808 // Bulk copy of blocks of 8 words.
809 //
810 // count is a count of words.
811 //
812 // Precondition: count >= 8
813 //
814 // Postconditions:
815 //
816 // The least significant bit of count contains the remaining count
817 // of words to copy. The rest of count is trash.
818 //
819 // s and d are adjusted to point to the remaining words to copy
820 //
821 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
822 BasicType type;
823 copy_direction direction;
824
825 switch (stub_id) {
826 case StubId::stubgen_copy_byte_f_id:
827 direction = copy_forwards;
828 type = T_BYTE;
829 break;
830 case StubId::stubgen_copy_byte_b_id:
831 direction = copy_backwards;
832 type = T_BYTE;
833 break;
834 case StubId::stubgen_copy_oop_f_id:
835 direction = copy_forwards;
836 type = T_OBJECT;
837 break;
838 case StubId::stubgen_copy_oop_b_id:
839 direction = copy_backwards;
840 type = T_OBJECT;
841 break;
842 case StubId::stubgen_copy_oop_uninit_f_id:
843 direction = copy_forwards;
844 type = T_OBJECT;
845 break;
846 case StubId::stubgen_copy_oop_uninit_b_id:
847 direction = copy_backwards;
848 type = T_OBJECT;
849 break;
850 default:
851 ShouldNotReachHere();
852 }
853
854 int unit = wordSize * direction;
855 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
856
857 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
858 t4 = r7, t5 = r11, t6 = r12, t7 = r13;
859 const Register stride = r14;
860 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
861 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
862 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
863
864 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
865 assert_different_registers(s, d, count, rscratch1, rscratch2);
866
867 Label again, drain;
868
869 __ align(CodeEntryAlignment);
870
871 StubCodeMark mark(this, stub_id);
872
873 address start = __ pc();
874
875 Label unaligned_copy_long;
876 if (AvoidUnalignedAccesses) {
877 __ tbnz(d, 3, unaligned_copy_long);
878 }
879
880 if (direction == copy_forwards) {
881 __ sub(s, s, bias);
882 __ sub(d, d, bias);
883 }
884
885 #ifdef ASSERT
886 // Make sure we are never given < 8 words
887 {
888 Label L;
889 __ cmp(count, (u1)8);
890 __ br(Assembler::GE, L);
891 __ stop("genrate_copy_longs called with < 8 words");
892 __ bind(L);
893 }
894 #endif
895
896 // Fill 8 registers
897 if (UseSIMDForMemoryOps) {
898 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
899 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
900 } else {
901 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
902 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
903 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
904 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
905 }
906
907 __ subs(count, count, 16);
908 __ br(Assembler::LO, drain);
909
910 int prefetch = PrefetchCopyIntervalInBytes;
911 bool use_stride = false;
912 if (direction == copy_backwards) {
913 use_stride = prefetch > 256;
914 prefetch = -prefetch;
915 if (use_stride) __ mov(stride, prefetch);
916 }
917
918 __ bind(again);
919
920 if (PrefetchCopyIntervalInBytes > 0)
921 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
922
923 if (UseSIMDForMemoryOps) {
924 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
925 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
926 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
927 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
928 } else {
929 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
930 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
931 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
932 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
933 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
934 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
935 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
936 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
937 }
938
939 __ subs(count, count, 8);
940 __ br(Assembler::HS, again);
941
942 // Drain
943 __ bind(drain);
944 if (UseSIMDForMemoryOps) {
945 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
946 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
947 } else {
948 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
949 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
950 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
951 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
952 }
953
954 {
955 Label L1, L2;
956 __ tbz(count, exact_log2(4), L1);
957 if (UseSIMDForMemoryOps) {
958 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
959 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
960 } else {
961 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
962 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
963 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
964 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
965 }
966 __ bind(L1);
967
968 if (direction == copy_forwards) {
969 __ add(s, s, bias);
970 __ add(d, d, bias);
971 }
972
973 __ tbz(count, 1, L2);
974 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
975 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
976 __ bind(L2);
977 }
978
979 __ ret(lr);
980
981 if (AvoidUnalignedAccesses) {
982 Label drain, again;
983 // Register order for storing. Order is different for backward copy.
984
985 __ bind(unaligned_copy_long);
986
987 // source address is even aligned, target odd aligned
988 //
989 // when forward copying word pairs we read long pairs at offsets
990 // {0, 2, 4, 6} (in long words). when backwards copying we read
991 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
992 // address by -2 in the forwards case so we can compute the
993 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
994 // or -1.
995 //
996 // when forward copying we need to store 1 word, 3 pairs and
997 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
998 // zero offset We adjust the destination by -1 which means we
999 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
1000 //
      // When backwards copying we need to store 1 word, 3 pairs and
1002 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
1003 // offsets {1, 3, 5, 7, 8} * unit.
1004
1005 if (direction == copy_forwards) {
1006 __ sub(s, s, 16);
1007 __ sub(d, d, 8);
1008 }
1009
1010 // Fill 8 registers
1011 //
1012 // for forwards copy s was offset by -16 from the original input
1013 // value of s so the register contents are at these offsets
1014 // relative to the 64 bit block addressed by that original input
1015 // and so on for each successive 64 byte block when s is updated
1016 //
1017 // t0 at offset 0, t1 at offset 8
1018 // t2 at offset 16, t3 at offset 24
1019 // t4 at offset 32, t5 at offset 40
1020 // t6 at offset 48, t7 at offset 56
1021
1022 // for backwards copy s was not offset so the register contents
1023 // are at these offsets into the preceding 64 byte block
1024 // relative to that original input and so on for each successive
1025 // preceding 64 byte block when s is updated. this explains the
1026 // slightly counter-intuitive looking pattern of register usage
1027 // in the stp instructions for backwards copy.
1028 //
1029 // t0 at offset -16, t1 at offset -8
1030 // t2 at offset -32, t3 at offset -24
1031 // t4 at offset -48, t5 at offset -40
1032 // t6 at offset -64, t7 at offset -56
1033
1034 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1035 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1036 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1037 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1038
1039 __ subs(count, count, 16);
1040 __ br(Assembler::LO, drain);
1041
1042 int prefetch = PrefetchCopyIntervalInBytes;
1043 bool use_stride = false;
1044 if (direction == copy_backwards) {
1045 use_stride = prefetch > 256;
1046 prefetch = -prefetch;
1047 if (use_stride) __ mov(stride, prefetch);
1048 }
1049
1050 __ bind(again);
1051
1052 if (PrefetchCopyIntervalInBytes > 0)
1053 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1054
1055 if (direction == copy_forwards) {
1056 // allowing for the offset of -8 the store instructions place
1057 // registers into the target 64 bit block at the following
1058 // offsets
1059 //
1060 // t0 at offset 0
1061 // t1 at offset 8, t2 at offset 16
1062 // t3 at offset 24, t4 at offset 32
1063 // t5 at offset 40, t6 at offset 48
1064 // t7 at offset 56
1065
1066 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1067 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1068 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1069 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1070 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1071 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1072 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1073 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1074 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1075 } else {
1076 // d was not offset when we started so the registers are
1077 // written into the 64 bit block preceding d with the following
1078 // offsets
1079 //
1080 // t1 at offset -8
1081 // t3 at offset -24, t0 at offset -16
1082 // t5 at offset -48, t2 at offset -32
1083 // t7 at offset -56, t4 at offset -48
1084 // t6 at offset -64
1085 //
1086 // note that this matches the offsets previously noted for the
1087 // loads
1088
1089 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1090 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1091 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1092 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1093 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1094 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1095 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1096 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1097 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1098 }
1099
1100 __ subs(count, count, 8);
1101 __ br(Assembler::HS, again);
1102
1103 // Drain
1104 //
1105 // this uses the same pattern of offsets and register arguments
1106 // as above
1107 __ bind(drain);
1108 if (direction == copy_forwards) {
1109 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1110 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1111 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1112 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1113 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1114 } else {
1115 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1116 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1117 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1118 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1119 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1120 }
1121 // now we need to copy any remaining part block which may
1122 // include a 4 word block subblock and/or a 2 word subblock.
1123 // bits 2 and 1 in the count are the tell-tale for whether we
1124 // have each such subblock
1125 {
1126 Label L1, L2;
1127 __ tbz(count, exact_log2(4), L1);
1128 // this is the same as above but copying only 4 longs hence
1129 // with only one intervening stp between the str instructions
1130 // but note that the offsets and registers still follow the
1131 // same pattern
1132 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1133 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1134 if (direction == copy_forwards) {
1135 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1136 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1137 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1138 } else {
1139 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1140 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1141 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1142 }
1143 __ bind(L1);
1144
1145 __ tbz(count, 1, L2);
1146 // this is the same as above but copying only 2 longs hence
1147 // there is no intervening stp between the str instructions
1148 // but note that the offset and register patterns are still
1149 // the same
1150 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1151 if (direction == copy_forwards) {
1152 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1153 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1154 } else {
1155 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1156 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1157 }
1158 __ bind(L2);
1159
      // for forwards copy we need to re-adjust the offsets we
      // applied so that s and d follow the last words written
1162
1163 if (direction == copy_forwards) {
1164 __ add(s, s, 16);
1165 __ add(d, d, 8);
1166 }
1167
1168 }
1169
1170 __ ret(lr);
1171 }
1172
1173 return start;
1174 }
1175
1176 // Small copy: less than 16 bytes.
1177 //
1178 // NB: Ignores all of the bits of count which represent more than 15
1179 // bytes, so a caller doesn't have to mask them.
1180
  // Emit code to copy `count` elements of g_uabs(step) bytes each from s to
  // d, where the total size is less than 16 bytes.  A negative step selects
  // a backwards copy, in which case s and d point just past the end of the
  // data.  Bits of count representing 16 bytes or more are ignored, so the
  // caller does not have to mask them.
  //
  //  s, d  - source / destination addresses; updated as the copy proceeds
  //  count - element count (only the low, in-range bits are examined)
  //  step  - signed element size; the sign selects the copy direction
  //
  // Clobbers r3 and the GC temps rscratch1, rscratch2 and r10.
  void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
    bool is_backwards = step < 0;
    size_t granularity = g_uabs(step);   // element size in bytes (1, 2, 4 or 8)
    int direction = is_backwards ? -1 : 1;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3;
    // GC barrier helper; no vector registers are needed for these small copies.
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    // Each tbz below tests the bit of count corresponding to a chunk of 8,
    // 4, 2 and finally 1 byte(s).  The bit index is shifted down by
    // exact_log2(granularity) because count is in elements, not bytes.
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
    bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }
1226
1227 // All-singing all-dancing memory copy.
1228 //
1229 // Copy count units of memory from s to d. The size of a unit is
1230 // step, which can be positive or negative depending on the direction
1231 // of copy. If is_aligned is false, we align the source address.
1232 //
1233
  // Emit code to copy `count` elements of g_uabs(step) bytes each from s to
  // d.  A negative step selects a backwards copy.  If is_aligned is false
  // the source address is aligned before the bulk copy.  Small/medium sizes
  // (<= 80 bytes, or 96 with SIMD) are copied inline by loading everything
  // before storing anything, so the direction is irrelevant for them; large
  // sizes call out to the pre-generated copy-longs stubs.
  void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
                   Register s, Register d, Register count, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    unsigned int granularity = g_uabs(step);   // element size in bytes
    const Register t0 = r3, t1 = r4;

    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
    const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
    const Register send = r17, dend = r16;   // one-past-the-end source/dest
    const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
    const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
    ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    // Dispatch on total byte size; count is in elements, hence /granularity.
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    // Copy the first 32 and the last 32 bytes; for sizes below 64 the two
    // spans overlap, which is safe because all loads precede all stores.
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(send, -32));
      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(dend, -32), v2, v3);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(send, -32));
      bs.copy_load_at_16(t6, t7, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(dend, -32), t4, t5);
      bs.copy_store_at_16(Address(dend, -16), t6, t7);
    }
    __ b(finish);

    // 17..32 bytes
    // First and last 16 bytes, again with a possible harmless overlap.
    __ bind(copy32);
    bs.copy_load_at_16(t0, t1, Address(s, 0));
    bs.copy_load_at_16(t6, t7, Address(send, -16));

    bs.copy_store_at_16(Address(d, 0), t0, t1);
    bs.copy_store_at_16(Address(dend, -16), t6, t7);
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      bs.copy_load_at_32(v0, v1, Address(s, 0));
      bs.copy_load_at_32(v2, v3, Address(s, 32));
      // Unaligned pointers can be an issue for copying.
      // The issue has more chances to happen when granularity of data is
      // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
      // The most performance drop has been seen for the range 65-80 bytes.
      // For such cases using the pair of ldp/stp instead of the third pair of
      // ldpq/stpq fixes the performance issue.
      if (granularity < sizeof (jint)) {
        Label copy96;
        __ cmp(count, u1(80/granularity));
        __ br(Assembler::HI, copy96);
        bs.copy_load_at_16(t0, t1, Address(send, -16));

        bs.copy_store_at_32(Address(d, 0), v0, v1);
        bs.copy_store_at_32(Address(d, 32), v2, v3);

        bs.copy_store_at_16(Address(dend, -16), t0, t1);
        __ b(finish);

        __ bind(copy96);
      }
      bs.copy_load_at_32(v4, v5, Address(send, -32));

      bs.copy_store_at_32(Address(d, 0), v0, v1);
      bs.copy_store_at_32(Address(d, 32), v2, v3);

      bs.copy_store_at_32(Address(dend, -32), v4, v5);
    } else {
      bs.copy_load_at_16(t0, t1, Address(s, 0));
      bs.copy_load_at_16(t2, t3, Address(s, 16));
      bs.copy_load_at_16(t4, t5, Address(s, 32));
      bs.copy_load_at_16(t6, t7, Address(s, 48));
      bs.copy_load_at_16(t8, t9, Address(send, -16));

      bs.copy_store_at_16(Address(d, 0), t0, t1);
      bs.copy_store_at_16(Address(d, 16), t2, t3);
      bs.copy_store_at_16(Address(d, 32), t4, t5);
      bs.copy_store_at_16(Address(d, 48), t6, t7);
      bs.copy_store_at_16(Address(dend, -16), t8, t9);
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    bs.copy_load_at_8(t0, Address(s, 0));
    bs.copy_load_at_8(t1, Address(send, -8));
    bs.copy_store_at_8(Address(d, 0), t0);
    bs.copy_store_at_8(Address(dend, -8), t1);
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does means in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    // Large copy: call out to the pre-generated copy-longs stubs.
    __ bind(copy_big);
    if (is_backwards) {
      // For a backwards copy point s and d one past the end of the data.
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    // Here we will materialize a count in r15, which is used by copy_memory_small
    // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
    // Up until here, we have used t9, which aliases r15, but from here on, that register
    // can not be used as a temp register, as it contains the count.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
      bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
      __ sub(count, count, wordSize/granularity);
    } else {
      // Compute in r15 the number of bytes needed to bring s up (forwards)
      // or down (backwards) to a 2-word boundary.
      if (is_backwards) {
        __ andr(r15, s, 2 * wordSize - 1);
      } else {
        __ neg(r15, s);
        __ andr(r15, r15, 2 * wordSize - 1);
      }
      // r15 is the byte adjustment needed to align s.
      __ cbz(r15, aligned);
      int shift = exact_log2(granularity);
      if (shift > 0) {
        __ lsr(r15, r15, shift);
      }
      __ sub(count, count, r15);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, r15);
        __ sub(d, d, r15);
      } else {
        __ add(s, s, r15);
        __ add(d, d, r15);
      }
#else
      copy_memory_small(decorators, type, s, d, r15, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words. If the shift is zero
    // perform a move instead to benefit from zero latency moves.
    int shift = exact_log2(wordSize/granularity);
    if (shift > 0) {
      __ lsr(r15, count, shift);
    } else {
      __ mov(r15, count);
    }
    // Pick the copy-longs stub matching direction, element type and
    // (for oops) whether the destination is uninitialized.
    if (direction == copy_forwards) {
      if (type != T_OBJECT) {
        __ bl(StubRoutines::aarch64::copy_byte_f());
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
      } else {
        __ bl(StubRoutines::aarch64::copy_oop_f());
      }
    } else {
      if (type != T_OBJECT) {
        __ bl(StubRoutines::aarch64::copy_byte_b());
      } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
        __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
      } else {
        __ bl(StubRoutines::aarch64::copy_oop_b());
      }
    }

    // And the tail.
    copy_memory_small(decorators, type, s, d, count, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }
1487
1488
1489 void clobber_registers() {
1490 #ifdef ASSERT
1491 RegSet clobbered
1492 = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1493 __ mov(rscratch1, (uint64_t)0xdeadbeef);
1494 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1495 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1496 __ mov(*it, rscratch1);
1497 }
1498 #endif
1499
1500 }
1501
1502 // Scan over array at a for count oops, verifying each one.
1503 // Preserves a and count, clobbers rscratch1 and rscratch2.
1504 void verify_oop_array (int size, Register a, Register count, Register temp) {
1505 Label loop, end;
1506 __ mov(rscratch1, a);
1507 __ mov(rscratch2, zr);
1508 __ bind(loop);
1509 __ cmp(rscratch2, count);
1510 __ br(Assembler::HS, end);
1511 if (size == wordSize) {
1512 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1513 __ verify_oop(temp);
1514 } else {
1515 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1516 __ decode_heap_oop(temp); // calls verify_oop
1517 }
1518 __ add(rscratch2, rscratch2, 1);
1519 __ b(loop);
1520 __ bind(end);
1521 }
1522
1523 // Arguments:
1524 // stub_id - is used to name the stub and identify all details of
1525 // how to perform the copy.
1526 //
1527 // entry - is assigned to the stub's post push entry point unless
1528 // it is null
1529 //
1530 // Inputs:
1531 // c_rarg0 - source array address
1532 // c_rarg1 - destination array address
1533 // c_rarg2 - element count, treated as ssize_t, can be zero
1534 //
1535 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1536 // the hardware handle it. The two dwords within qwords that span
1537 // cache line boundaries will still be loaded and stored atomically.
1538 //
1539 // Side Effects: nopush_entry is set to the (post push) entry point
1540 // so it can be used by the corresponding conjoint
1541 // copy method
1542 //
  // Generate a disjoint (non-overlapping) arraycopy stub.  All copy
  // parameters (element size, alignment guarantee, oop-ness, destination
  // initialization state) are derived from the stub id.  Returns the stub's
  // code start; the post-push entry is stored through nopush_entry when the
  // caller supplies one.
  address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    int size;
    bool aligned;
    bool is_oop;
    bool dest_uninitialized;
    switch (stub_id) {
    case StubId::stubgen_jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
      size = sizeof(jbyte);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
      size = sizeof(jshort);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = false;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
      size = sizeof(jint);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_jlong_disjoint_arraycopy_id:
      // since this is always aligned we can (should!) use the same
      // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
      ShouldNotReachHere();
      break;
    case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
      size = sizeof(jlong);
      aligned = true;
      is_oop = false;
      dest_uninitialized = false;
      break;
    // Oop element size depends on compressed oops; an uncompressed oop is
    // jlong-sized and therefore always aligned.
    case StubId::stubgen_oop_disjoint_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = false;
      break;
    case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
      size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
      aligned = !UseCompressedOops;
      is_oop = true;
      dest_uninitialized = true;
      break;
    default:
      ShouldNotReachHere();
      break;
    }

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    if (nopush_entry != nullptr) {
      *nopush_entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      // (d and count are needed again below for verification and the
      // arraycopy epilogue)
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeMemoryAccess page error: continue after unsafe access
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeMemoryAccessMark umam(this, add_entry, true);
      copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }
1673
1674 // Arguments:
1675 // stub_id - is used to name the stub and identify all details of
1676 // how to perform the copy.
1677 //
  // nooverlap_target - identifies the (post push) entry for the
1679 // corresponding disjoint copy routine which can be
1680 // jumped to if the ranges do not actually overlap
1681 //
1682 // entry - is assigned to the stub's post push entry point unless
1683 // it is null
1684 //
1685 //
1686 // Inputs:
1687 // c_rarg0 - source array address
1688 // c_rarg1 - destination array address
1689 // c_rarg2 - element count, treated as ssize_t, can be zero
1690 //
1691 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1692 // the hardware handle it. The two dwords within qwords that span
1693 // cache line boundaries will still be loaded and stored atomically.
1694 //
1695 // Side Effects:
1696 // nopush_entry is set to the no-overlap entry point so it can be
1697 // used by some other conjoint copy method
1698 //
1699 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1700 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1701 RegSet saved_regs = RegSet::of(s, d, count);
1702 int size;
1703 bool aligned;
1704 bool is_oop;
1705 bool dest_uninitialized;
1706 switch (stub_id) {
1707 case StubId::stubgen_jbyte_arraycopy_id:
1708 size = sizeof(jbyte);
1709 aligned = false;
1710 is_oop = false;
1711 dest_uninitialized = false;
1712 break;
1713 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1714 size = sizeof(jbyte);
1715 aligned = true;
1716 is_oop = false;
1717 dest_uninitialized = false;
1718 break;
1719 case StubId::stubgen_jshort_arraycopy_id:
1720 size = sizeof(jshort);
1721 aligned = false;
1722 is_oop = false;
1723 dest_uninitialized = false;
1724 break;
1725 case StubId::stubgen_arrayof_jshort_arraycopy_id:
1726 size = sizeof(jshort);
1727 aligned = true;
1728 is_oop = false;
1729 dest_uninitialized = false;
1730 break;
1731 case StubId::stubgen_jint_arraycopy_id:
1732 size = sizeof(jint);
1733 aligned = false;
1734 is_oop = false;
1735 dest_uninitialized = false;
1736 break;
1737 case StubId::stubgen_arrayof_jint_arraycopy_id:
1738 size = sizeof(jint);
1739 aligned = true;
1740 is_oop = false;
1741 dest_uninitialized = false;
1742 break;
1743 case StubId::stubgen_jlong_arraycopy_id:
1744 // since this is always aligned we can (should!) use the same
1745 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
1746 ShouldNotReachHere();
1747 break;
1748 case StubId::stubgen_arrayof_jlong_arraycopy_id:
1749 size = sizeof(jlong);
1750 aligned = true;
1751 is_oop = false;
1752 dest_uninitialized = false;
1753 break;
1754 case StubId::stubgen_oop_arraycopy_id:
1755 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1756 aligned = !UseCompressedOops;
1757 is_oop = true;
1758 dest_uninitialized = false;
1759 break;
1760 case StubId::stubgen_arrayof_oop_arraycopy_id:
1761 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1762 aligned = !UseCompressedOops;
1763 is_oop = true;
1764 dest_uninitialized = false;
1765 break;
1766 case StubId::stubgen_oop_arraycopy_uninit_id:
1767 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1768 aligned = !UseCompressedOops;
1769 is_oop = true;
1770 dest_uninitialized = true;
1771 break;
1772 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1773 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1774 aligned = !UseCompressedOops;
1775 is_oop = true;
1776 dest_uninitialized = true;
1777 break;
1778 default:
1779 ShouldNotReachHere();
1780 }
1781
1782 StubCodeMark mark(this, stub_id);
1783 address start = __ pc();
1784 __ enter();
1785
1786 if (nopush_entry != nullptr) {
1787 *nopush_entry = __ pc();
1788 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1789 BLOCK_COMMENT("Entry:");
1790 }
1791
1792 // use fwd copy when (d-s) above_equal (count*size)
1793 Label L_overlapping;
1794 __ sub(rscratch1, d, s);
1795 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1796 __ br(Assembler::LO, L_overlapping);
1797 __ b(RuntimeAddress(nooverlap_target));
1798 __ bind(L_overlapping);
1799
1800 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1801 if (dest_uninitialized) {
1802 decorators |= IS_DEST_UNINITIALIZED;
1803 }
1804 if (aligned) {
1805 decorators |= ARRAYCOPY_ALIGNED;
1806 }
1807
1808 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1809 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1810
1811 if (is_oop) {
1812 // save regs before copy_memory
1813 __ push(RegSet::of(d, count), sp);
1814 }
1815 {
1816 // UnsafeMemoryAccess page error: continue after unsafe access
1817 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1818 UnsafeMemoryAccessMark umam(this, add_entry, true);
1819 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1820 }
1821 if (is_oop) {
1822 __ pop(RegSet::of(d, count), sp);
1823 if (VerifyOops)
1824 verify_oop_array(size, d, count, r16);
1825 }
1826 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1827 __ leave();
1828 __ mov(r0, zr); // return 0
1829 __ ret(lr);
1830 return start;
1831 }
1832
1833 // Helper for generating a dynamic type check.
1834 // Smashes rscratch1, rscratch2.
  // Emit a dynamic subtype check of sub_klass against super_klass.
  // Branches to L_success when sub_klass is a subtype; falls through on
  // failure.  temp1/temp2 are scratch for the slow path.
  // NOTE(review): the `result` parameter is not referenced in this body —
  // confirm whether callers expect it to be written.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Register temp1,
                           Register temp2,
                           Register result,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    // Fast path: cache/offset based check.  Inconclusive cases fall
    // through to the slow path below.
    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
                                     super_check_offset);
    // Slow path: full supertype walk.
    __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);

    // Fall through on failure!
    __ BIND(L_miss);
  }
1855
1856 //
1857 // Generate checkcasting array copy stub
1858 //
1859 // Input:
1860 // c_rarg0 - source array address
1861 // c_rarg1 - destination array address
1862 // c_rarg2 - element count, treated as ssize_t, can be zero
1863 // c_rarg3 - size_t ckoff (super_check_offset)
1864 // c_rarg4 - oop ckval (super_klass)
1865 //
1866 // Output:
1867 // r0 == 0 - success
1868 // r0 == -1^K - failure, where K is partial transfer count
1869 //
1870 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
1871 bool dest_uninitialized;
1872 switch (stub_id) {
1873 case StubId::stubgen_checkcast_arraycopy_id:
1874 dest_uninitialized = false;
1875 break;
1876 case StubId::stubgen_checkcast_arraycopy_uninit_id:
1877 dest_uninitialized = true;
1878 break;
1879 default:
1880 ShouldNotReachHere();
1881 }
1882
1883 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1884
1885 // Input registers (after setup_arg_regs)
1886 const Register from = c_rarg0; // source array address
1887 const Register to = c_rarg1; // destination array address
1888 const Register count = c_rarg2; // elementscount
1889 const Register ckoff = c_rarg3; // super_check_offset
1890 const Register ckval = c_rarg4; // super_klass
1891
1892 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1893
1894 // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1895 const Register copied_oop = r22; // actual oop copied
1896 const Register count_save = r21; // orig elementscount
1897 const Register start_to = r20; // destination array start address
1898 const Register r19_klass = r19; // oop._klass
1899
1900 // Registers used as gc temps (r5, r6, r7 are save-on-call)
1901 const Register gct1 = r5, gct2 = r6, gct3 = r7;
1902
1903 //---------------------------------------------------------------
1904 // Assembler stub will be used for this call to arraycopy
1905 // if the two arrays are subtypes of Object[] but the
1906 // destination array type is not equal to or a supertype
1907 // of the source type. Each element must be separately
1908 // checked.
1909
1910 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1911 copied_oop, r19_klass, count_save);
1912
1913 __ align(CodeEntryAlignment);
1914 StubCodeMark mark(this, stub_id);
1915 address start = __ pc();
1916
1917 __ enter(); // required for proper stackwalking of RuntimeStub frame
1918
1919 #ifdef ASSERT
1920 // caller guarantees that the arrays really are different
1921 // otherwise, we would have to make conjoint checks
1922 { Label L;
1923 __ b(L); // conjoint check not yet implemented
1924 __ stop("checkcast_copy within a single array");
1925 __ bind(L);
1926 }
1927 #endif //ASSERT
1928
1929 // Caller of this entry point must set up the argument registers.
1930 if (nopush_entry != nullptr) {
1931 *nopush_entry = __ pc();
1932 BLOCK_COMMENT("Entry:");
1933 }
1934
1935 // Empty array: Nothing to do.
1936 __ cbz(count, L_done);
1937 __ push(RegSet::of(r19, r20, r21, r22), sp);
1938
1939 #ifdef ASSERT
1940 BLOCK_COMMENT("assert consistent ckoff/ckval");
1941 // The ckoff and ckval must be mutually consistent,
1942 // even though caller generates both.
1943 { Label L;
1944 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1945 __ ldrw(start_to, Address(ckval, sco_offset));
1946 __ cmpw(ckoff, start_to);
1947 __ br(Assembler::EQ, L);
1948 __ stop("super_check_offset inconsistent");
1949 __ bind(L);
1950 }
1951 #endif //ASSERT
1952
1953 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1954 bool is_oop = true;
1955 int element_size = UseCompressedOops ? 4 : 8;
1956 if (dest_uninitialized) {
1957 decorators |= IS_DEST_UNINITIALIZED;
1958 }
1959
1960 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1961 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1962
1963 // save the original count
1964 __ mov(count_save, count);
1965
1966 // Copy from low to high addresses
1967 __ mov(start_to, to); // Save destination array start address
1968 __ b(L_load_element);
1969
1970 // ======== begin loop ========
1971 // (Loop is rotated; its entry is L_load_element.)
1972 // Loop control:
1973 // for (; count != 0; count--) {
1974 // copied_oop = load_heap_oop(from++);
1975 // ... generate_type_check ...;
1976 // store_heap_oop(to++, copied_oop);
1977 // }
1978 __ align(OptoLoopAlignment);
1979
1980 __ BIND(L_store_element);
1981 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1982 __ post(to, element_size), copied_oop, noreg,
1983 gct1, gct2, gct3);
1984 __ sub(count, count, 1);
1985 __ cbz(count, L_do_card_marks);
1986
1987 // ======== loop entry is here ========
1988 __ BIND(L_load_element);
1989 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1990 copied_oop, noreg, __ post(from, element_size),
1991 gct1);
1992 __ cbz(copied_oop, L_store_element);
1993
1994 __ load_klass(r19_klass, copied_oop);// query the object klass
1995
1996 BLOCK_COMMENT("type_check:");
1997 generate_type_check(/*sub_klass*/r19_klass,
1998 /*super_check_offset*/ckoff,
1999 /*super_klass*/ckval,
2000 /*r_array_base*/gct1,
2001 /*temp2*/gct2,
2002 /*result*/r10, L_store_element);
2003
2004 // Fall through on failure!
2005
2006 // ======== end loop ========
2007
2008 // It was a real error; we must depend on the caller to finish the job.
2009 // Register count = remaining oops, count_orig = total oops.
2010 // Emit GC store barriers for the oops we have copied and report
2011 // their number to the caller.
2012
2013 __ subs(count, count_save, count); // K = partially copied oop count
2014 __ eon(count, count, zr); // report (-1^K) to caller
2015 __ br(Assembler::EQ, L_done_pop);
2016
2017 __ BIND(L_do_card_marks);
2018 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
2019
2020 __ bind(L_done_pop);
2021 __ pop(RegSet::of(r19, r20, r21, r22), sp);
2022 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2023
2024 __ bind(L_done);
2025 __ mov(r0, count);
2026 __ leave();
2027 __ ret(lr);
2028
2029 return start;
2030 }
2031
  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop  (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    // Unsigned compare (HI) also rejects a wrapped 32-bit sum.
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    // movw zero-extends, so the values are safe for 64-bit addressing.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }
2064
  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  // Placeholder: only legal to call with count == 0.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }
2071
2072
2073 //
2074 // Generate 'unsafe' array copy stub
2075 // Though just as safe as the other stubs, it takes an unscaled
2076 // size_t argument instead of an element count.
2077 //
2078 // Input:
2079 // c_rarg0 - source array address
2080 // c_rarg1 - destination array address
2081 // c_rarg2 - byte count, treated as ssize_t, can be zero
2082 //
2083 // Examines the alignment of the operands and dispatches
2084 // to a long, int, short, or byte copy loop.
2085 //
2086 address generate_unsafe_copy(address byte_copy_entry,
2087 address short_copy_entry,
2088 address int_copy_entry,
2089 address long_copy_entry) {
2090 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2091
2092 Label L_long_aligned, L_int_aligned, L_short_aligned;
2093 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2094
2095 __ align(CodeEntryAlignment);
2096 StubCodeMark mark(this, stub_id);
2097 address start = __ pc();
2098 __ enter(); // required for proper stackwalking of RuntimeStub frame
2099
2100 // bump this on entry, not on exit:
2101 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2102
2103 __ orr(rscratch1, s, d);
2104 __ orr(rscratch1, rscratch1, count);
2105
2106 __ andr(rscratch1, rscratch1, BytesPerLong-1);
2107 __ cbz(rscratch1, L_long_aligned);
2108 __ andr(rscratch1, rscratch1, BytesPerInt-1);
2109 __ cbz(rscratch1, L_int_aligned);
2110 __ tbz(rscratch1, 0, L_short_aligned);
2111 __ b(RuntimeAddress(byte_copy_entry));
2112
2113 __ BIND(L_short_aligned);
2114 __ lsr(count, count, LogBytesPerShort); // size => short_count
2115 __ b(RuntimeAddress(short_copy_entry));
2116 __ BIND(L_int_aligned);
2117 __ lsr(count, count, LogBytesPerInt); // size => int_count
2118 __ b(RuntimeAddress(int_copy_entry));
2119 __ BIND(L_long_aligned);
2120 __ lsr(count, count, LogBytesPerLong); // size => long_count
2121 __ b(RuntimeAddress(long_copy_entry));
2122
2123 return start;
2124 }
2125
2126 //
2127 // Generate generic array copy stubs
2128 //
2129 // Input:
2130 // c_rarg0 - src oop
2131 // c_rarg1 - src_pos (32-bits)
2132 // c_rarg2 - dst oop
2133 // c_rarg3 - dst_pos (32-bits)
2134 // c_rarg4 - element count (32-bits)
2135 //
2136 // Output:
2137 // r0 == 0 - success
2138 // r0 == -1^K - failure, where K is partial transfer count
2139 //
2140 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2141 address int_copy_entry, address oop_copy_entry,
2142 address long_copy_entry, address checkcast_copy_entry) {
2143 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2144
2145 Label L_failed, L_objArray;
2146 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2147
2148 // Input registers
2149 const Register src = c_rarg0; // source array oop
2150 const Register src_pos = c_rarg1; // source position
2151 const Register dst = c_rarg2; // destination array oop
2152 const Register dst_pos = c_rarg3; // destination position
2153 const Register length = c_rarg4;
2154
2155
2156 // Registers used as temps
2157 const Register dst_klass = c_rarg5;
2158
2159 __ align(CodeEntryAlignment);
2160
2161 StubCodeMark mark(this, stub_id);
2162
2163 address start = __ pc();
2164
2165 __ enter(); // required for proper stackwalking of RuntimeStub frame
2166
2167 // bump this on entry, not on exit:
2168 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2169
2170 //-----------------------------------------------------------------------
2171 // Assembler stub will be used for this call to arraycopy
2172 // if the following conditions are met:
2173 //
2174 // (1) src and dst must not be null.
2175 // (2) src_pos must not be negative.
2176 // (3) dst_pos must not be negative.
2177 // (4) length must not be negative.
2178 // (5) src klass and dst klass should be the same and not null.
2179 // (6) src and dst should be arrays.
2180 // (7) src_pos + length must not exceed length of src.
2181 // (8) dst_pos + length must not exceed length of dst.
2182 //
2183
2184 // if (src == nullptr) return -1;
2185 __ cbz(src, L_failed);
2186
2187 // if (src_pos < 0) return -1;
2188 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2189
2190 // if (dst == nullptr) return -1;
2191 __ cbz(dst, L_failed);
2192
2193 // if (dst_pos < 0) return -1;
2194 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2195
2196 // registers used as temp
2197 const Register scratch_length = r16; // elements count to copy
2198 const Register scratch_src_klass = r17; // array klass
2199 const Register lh = r15; // layout helper
2200
2201 // if (length < 0) return -1;
2202 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2203 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2204
2205 __ load_klass(scratch_src_klass, src);
2206 #ifdef ASSERT
2207 // assert(src->klass() != nullptr);
2208 {
2209 BLOCK_COMMENT("assert klasses not null {");
2210 Label L1, L2;
2211 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2212 __ bind(L1);
2213 __ stop("broken null klass");
2214 __ bind(L2);
2215 __ load_klass(rscratch1, dst);
2216 __ cbz(rscratch1, L1); // this would be broken also
2217 BLOCK_COMMENT("} assert klasses not null done");
2218 }
2219 #endif
2220
2221 // Load layout helper (32-bits)
2222 //
2223 // |array_tag| | header_size | element_type | |log2_element_size|
2224 // 32 30 24 16 8 2 0
2225 //
2226 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2227 //
2228
2229 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2230
2231 // Handle objArrays completely differently...
2232 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2233 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2234 __ movw(rscratch1, objArray_lh);
2235 __ eorw(rscratch2, lh, rscratch1);
2236 __ cbzw(rscratch2, L_objArray);
2237
2238 // if (src->klass() != dst->klass()) return -1;
2239 __ load_klass(rscratch2, dst);
2240 __ eor(rscratch2, rscratch2, scratch_src_klass);
2241 __ cbnz(rscratch2, L_failed);
2242
2243 // Check for flat inline type array -> return -1
2244 __ test_flat_array_oop(src, rscratch2, L_failed);
2245
2246 // Check for null-free (non-flat) inline type array -> handle as object array
2247 __ test_null_free_array_oop(src, rscratch2, L_objArray);
2248
2249 // if (!src->is_Array()) return -1;
2250 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2251
2252 // At this point, it is known to be a typeArray (array_tag 0x3).
2253 #ifdef ASSERT
2254 {
2255 BLOCK_COMMENT("assert primitive array {");
2256 Label L;
2257 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2258 __ cmpw(lh, rscratch2);
2259 __ br(Assembler::GE, L);
2260 __ stop("must be a primitive array");
2261 __ bind(L);
2262 BLOCK_COMMENT("} assert primitive array done");
2263 }
2264 #endif
2265
2266 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2267 rscratch2, L_failed);
2268
2269 // TypeArrayKlass
2270 //
2271 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2272 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2273 //
2274
2275 const Register rscratch1_offset = rscratch1; // array offset
2276 const Register r15_elsize = lh; // element size
2277
2278 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2279 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2280 __ add(src, src, rscratch1_offset); // src array offset
2281 __ add(dst, dst, rscratch1_offset); // dst array offset
2282 BLOCK_COMMENT("choose copy loop based on element size");
2283
2284 // next registers should be set before the jump to corresponding stub
2285 const Register from = c_rarg0; // source array address
2286 const Register to = c_rarg1; // destination array address
2287 const Register count = c_rarg2; // elements count
2288
2289 // 'from', 'to', 'count' registers should be set in such order
2290 // since they are the same as 'src', 'src_pos', 'dst'.
2291
2292 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2293
2294 // The possible values of elsize are 0-3, i.e. exact_log2(element
2295 // size in bytes). We do a simple bitwise binary search.
2296 __ BIND(L_copy_bytes);
2297 __ tbnz(r15_elsize, 1, L_copy_ints);
2298 __ tbnz(r15_elsize, 0, L_copy_shorts);
2299 __ lea(from, Address(src, src_pos));// src_addr
2300 __ lea(to, Address(dst, dst_pos));// dst_addr
2301 __ movw(count, scratch_length); // length
2302 __ b(RuntimeAddress(byte_copy_entry));
2303
2304 __ BIND(L_copy_shorts);
2305 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2306 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2307 __ movw(count, scratch_length); // length
2308 __ b(RuntimeAddress(short_copy_entry));
2309
2310 __ BIND(L_copy_ints);
2311 __ tbnz(r15_elsize, 0, L_copy_longs);
2312 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2313 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2314 __ movw(count, scratch_length); // length
2315 __ b(RuntimeAddress(int_copy_entry));
2316
2317 __ BIND(L_copy_longs);
2318 #ifdef ASSERT
2319 {
2320 BLOCK_COMMENT("assert long copy {");
2321 Label L;
2322 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2323 __ cmpw(r15_elsize, LogBytesPerLong);
2324 __ br(Assembler::EQ, L);
2325 __ stop("must be long copy, but elsize is wrong");
2326 __ bind(L);
2327 BLOCK_COMMENT("} assert long copy done");
2328 }
2329 #endif
2330 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2331 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2332 __ movw(count, scratch_length); // length
2333 __ b(RuntimeAddress(long_copy_entry));
2334
2335 // ObjArrayKlass
2336 __ BIND(L_objArray);
2337 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2338
2339 Label L_plain_copy, L_checkcast_copy;
2340 // test array classes for subtyping
2341 __ load_klass(r15, dst);
2342 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2343 __ br(Assembler::NE, L_checkcast_copy);
2344
2345 // Identically typed arrays can be copied without element-wise checks.
2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2347 rscratch2, L_failed);
2348
2349 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2350 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2351 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2352 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2353 __ movw(count, scratch_length); // length
2354 __ BIND(L_plain_copy);
2355 __ b(RuntimeAddress(oop_copy_entry));
2356
2357 __ BIND(L_checkcast_copy);
2358 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2359 {
2360 // Before looking at dst.length, make sure dst is also an objArray.
2361 __ ldrw(rscratch1, Address(r15, lh_offset));
2362 __ movw(rscratch2, objArray_lh);
2363 __ eorw(rscratch1, rscratch1, rscratch2);
2364 __ cbnzw(rscratch1, L_failed);
2365
2366 // It is safe to examine both src.length and dst.length.
2367 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2368 r15, L_failed);
2369
2370 __ load_klass(dst_klass, dst); // reload
2371
2372 // Marshal the base address arguments now, freeing registers.
2373 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2374 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2375 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2376 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2377 __ movw(count, length); // length (reloaded)
2378 Register sco_temp = c_rarg3; // this register is free now
2379 assert_different_registers(from, to, count, sco_temp,
2380 dst_klass, scratch_src_klass);
2381 // assert_clean_int(count, sco_temp);
2382
2383 // Generate the type check.
2384 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2385 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2386
2387 // Smashes rscratch1, rscratch2
2388 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2389 L_plain_copy);
2390
2391 // Fetch destination element klass from the ObjArrayKlass header.
2392 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2393 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2394 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2395
2396 // the checkcast_copy loop needs two extra arguments:
2397 assert(c_rarg3 == sco_temp, "#3 already in place");
2398 // Set up arguments for checkcast_copy_entry.
2399 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2400 __ b(RuntimeAddress(checkcast_copy_entry));
2401 }
2402
2403 __ BIND(L_failed);
2404 __ mov(r0, -1);
2405 __ leave(); // required for proper stackwalking of RuntimeStub frame
2406 __ ret(lr);
2407
2408 return start;
2409 }
2410
2411 //
2412 // Generate stub for array fill. If "aligned" is true, the
2413 // "to" address is assumed to be heapword aligned.
2414 //
2415 // Arguments for generated stub:
2416 // to: c_rarg0
2417 // value: c_rarg1
2418 // count: c_rarg2 treated as signed
2419 //
2420 address generate_fill(StubId stub_id) {
2421 BasicType t;
2422 bool aligned;
2423
2424 switch (stub_id) {
2425 case StubId::stubgen_jbyte_fill_id:
2426 t = T_BYTE;
2427 aligned = false;
2428 break;
2429 case StubId::stubgen_jshort_fill_id:
2430 t = T_SHORT;
2431 aligned = false;
2432 break;
2433 case StubId::stubgen_jint_fill_id:
2434 t = T_INT;
2435 aligned = false;
2436 break;
2437 case StubId::stubgen_arrayof_jbyte_fill_id:
2438 t = T_BYTE;
2439 aligned = true;
2440 break;
2441 case StubId::stubgen_arrayof_jshort_fill_id:
2442 t = T_SHORT;
2443 aligned = true;
2444 break;
2445 case StubId::stubgen_arrayof_jint_fill_id:
2446 t = T_INT;
2447 aligned = true;
2448 break;
2449 default:
2450 ShouldNotReachHere();
2451 };
2452
2453 __ align(CodeEntryAlignment);
2454 StubCodeMark mark(this, stub_id);
2455 address start = __ pc();
2456
2457 BLOCK_COMMENT("Entry:");
2458
2459 const Register to = c_rarg0; // source array address
2460 const Register value = c_rarg1; // value
2461 const Register count = c_rarg2; // elements count
2462
2463 const Register bz_base = r10; // base for block_zero routine
2464 const Register cnt_words = r11; // temp register
2465
2466 __ enter();
2467
2468 Label L_fill_elements, L_exit1;
2469
2470 int shift = -1;
2471 switch (t) {
2472 case T_BYTE:
2473 shift = 0;
2474 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2475 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2476 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2477 __ br(Assembler::LO, L_fill_elements);
2478 break;
2479 case T_SHORT:
2480 shift = 1;
2481 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2482 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2483 __ br(Assembler::LO, L_fill_elements);
2484 break;
2485 case T_INT:
2486 shift = 2;
2487 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2488 __ br(Assembler::LO, L_fill_elements);
2489 break;
2490 default: ShouldNotReachHere();
2491 }
2492
2493 // Align source address at 8 bytes address boundary.
2494 Label L_skip_align1, L_skip_align2, L_skip_align4;
2495 if (!aligned) {
2496 switch (t) {
2497 case T_BYTE:
2498 // One byte misalignment happens only for byte arrays.
2499 __ tbz(to, 0, L_skip_align1);
2500 __ strb(value, Address(__ post(to, 1)));
2501 __ subw(count, count, 1);
2502 __ bind(L_skip_align1);
2503 // Fallthrough
2504 case T_SHORT:
2505 // Two bytes misalignment happens only for byte and short (char) arrays.
2506 __ tbz(to, 1, L_skip_align2);
2507 __ strh(value, Address(__ post(to, 2)));
2508 __ subw(count, count, 2 >> shift);
2509 __ bind(L_skip_align2);
2510 // Fallthrough
2511 case T_INT:
2512 // Align to 8 bytes, we know we are 4 byte aligned to start.
2513 __ tbz(to, 2, L_skip_align4);
2514 __ strw(value, Address(__ post(to, 4)));
2515 __ subw(count, count, 4 >> shift);
2516 __ bind(L_skip_align4);
2517 break;
2518 default: ShouldNotReachHere();
2519 }
2520 }
2521
2522 //
2523 // Fill large chunks
2524 //
2525 __ lsrw(cnt_words, count, 3 - shift); // number of words
2526 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2527 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2528 if (UseBlockZeroing) {
2529 Label non_block_zeroing, rest;
2530 // If the fill value is zero we can use the fast zero_words().
2531 __ cbnz(value, non_block_zeroing);
2532 __ mov(bz_base, to);
2533 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2534 address tpc = __ zero_words(bz_base, cnt_words);
2535 if (tpc == nullptr) {
2536 fatal("CodeCache is full at generate_fill");
2537 }
2538 __ b(rest);
2539 __ bind(non_block_zeroing);
2540 __ fill_words(to, cnt_words, value);
2541 __ bind(rest);
2542 } else {
2543 __ fill_words(to, cnt_words, value);
2544 }
2545
2546 // Remaining count is less than 8 bytes. Fill it by a single store.
2547 // Note that the total length is no less than 8 bytes.
2548 if (t == T_BYTE || t == T_SHORT) {
2549 Label L_exit1;
2550 __ cbzw(count, L_exit1);
2551 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2552 __ str(value, Address(to, -8)); // overwrite some elements
2553 __ bind(L_exit1);
2554 __ leave();
2555 __ ret(lr);
2556 }
2557
2558 // Handle copies less than 8 bytes.
2559 Label L_fill_2, L_fill_4, L_exit2;
2560 __ bind(L_fill_elements);
2561 switch (t) {
2562 case T_BYTE:
2563 __ tbz(count, 0, L_fill_2);
2564 __ strb(value, Address(__ post(to, 1)));
2565 __ bind(L_fill_2);
2566 __ tbz(count, 1, L_fill_4);
2567 __ strh(value, Address(__ post(to, 2)));
2568 __ bind(L_fill_4);
2569 __ tbz(count, 2, L_exit2);
2570 __ strw(value, Address(to));
2571 break;
2572 case T_SHORT:
2573 __ tbz(count, 0, L_fill_4);
2574 __ strh(value, Address(__ post(to, 2)));
2575 __ bind(L_fill_4);
2576 __ tbz(count, 1, L_exit2);
2577 __ strw(value, Address(to));
2578 break;
2579 case T_INT:
2580 __ cbzw(count, L_exit2);
2581 __ strw(value, Address(to));
2582 break;
2583 default: ShouldNotReachHere();
2584 }
2585 __ bind(L_exit2);
2586 __ leave();
2587 __ ret(lr);
2588 return start;
2589 }
2590
  // Shared error exit for unsafe copy stubs: tears down the RuntimeStub
  // frame and returns 0 to the caller. Registered with UnsafeMemoryAccess
  // as the common exit taken when a fault occurs mid-copy.
  address generate_unsafecopy_common_error_exit() {
    address start_pc = __ pc();
    __ leave();
    __ mov(r0, 0);
    __ ret(lr);
    return start_pc;
  }
2598
2599 //
2600 // Generate 'unsafe' set memory stub
2601 // Though just as safe as the other stubs, it takes an unscaled
2602 // size_t (# bytes) argument instead of an element count.
2603 //
2604 // This fill operation is atomicity preserving: as long as the
2605 // address supplied is sufficiently aligned, all writes of up to 64
2606 // bits in size are single-copy atomic.
2607 //
2608 // Input:
2609 // c_rarg0 - destination array address
2610 // c_rarg1 - byte count (size_t)
2611 // c_rarg2 - byte value
2612 //
2613 address generate_unsafe_setmemory() {
2614 __ align(CodeEntryAlignment);
2615 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
2616 address start = __ pc();
2617
2618 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
2619 Label tail;
2620
2621 UnsafeMemoryAccessMark umam(this, true, false);
2622
2623 __ enter(); // required for proper stackwalking of RuntimeStub frame
2624
2625 __ dup(v0, __ T16B, value);
2626
2627 if (AvoidUnalignedAccesses) {
2628 __ cmp(count, (u1)16);
2629 __ br(__ LO, tail);
2630
2631 __ mov(rscratch1, 16);
2632 __ andr(rscratch2, dest, 15);
2633 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
2634 __ strq(v0, Address(dest));
2635 __ sub(count, count, rscratch1);
2636 __ add(dest, dest, rscratch1);
2637 }
2638
2639 __ subs(count, count, (u1)64);
2640 __ br(__ LO, tail);
2641 {
2642 Label again;
2643 __ bind(again);
2644 __ stpq(v0, v0, Address(dest));
2645 __ stpq(v0, v0, Address(dest, 32));
2646
2647 __ subs(count, count, 64);
2648 __ add(dest, dest, 64);
2649 __ br(__ HS, again);
2650 }
2651
2652 __ bind(tail);
2653 // The count of bytes is off by 64, but we don't need to correct
2654 // it because we're only going to use the least-significant few
2655 // count bits from here on.
2656 // __ add(count, count, 64);
2657
2658 {
2659 Label dont;
2660 __ tbz(count, exact_log2(32), dont);
2661 __ stpq(v0, v0, __ post(dest, 32));
2662 __ bind(dont);
2663 }
2664 {
2665 Label dont;
2666 __ tbz(count, exact_log2(16), dont);
2667 __ strq(v0, __ post(dest, 16));
2668 __ bind(dont);
2669 }
2670 {
2671 Label dont;
2672 __ tbz(count, exact_log2(8), dont);
2673 __ strd(v0, __ post(dest, 8));
2674 __ bind(dont);
2675 }
2676
2677 Label finished;
2678 __ tst(count, 7);
2679 __ br(__ EQ, finished);
2680
2681 {
2682 Label dont;
2683 __ tbz(count, exact_log2(4), dont);
2684 __ strs(v0, __ post(dest, 4));
2685 __ bind(dont);
2686 }
2687 {
2688 Label dont;
2689 __ tbz(count, exact_log2(2), dont);
2690 __ bfi(value, value, 8, 8);
2691 __ strh(value, __ post(dest, 2));
2692 __ bind(dont);
2693 }
2694 {
2695 Label dont;
2696 __ tbz(count, exact_log2(1), dont);
2697 __ strb(value, Address(dest));
2698 __ bind(dont);
2699 }
2700
2701 __ bind(finished);
2702 __ leave();
2703 __ ret(lr);
2704
2705 return start;
2706 }
2707
  // Generate stub that writes back a single data cache line.
  //
  // Input:
  //   c_rarg0 - address of the cache line to write back
  address generate_data_cache_writeback() {
    const Register line        = c_rarg0;  // address of line to write back

    __ align(CodeEntryAlignment);

    StubId stub_id = StubId::stubgen_data_cache_writeback_id;
    StubCodeMark mark(this, stub_id);

    address start = __ pc();
    __ enter();
    __ cache_wb(Address(line, 0));
    __ leave();
    __ ret(lr);

    return start;
  }
2724
  // Generate stub that orders preceding cache-line writebacks.
  //
  // Input:
  //   c_rarg0 - nonzero for a pre-writeback sync, zero for post
  address generate_data_cache_writeback_sync() {
    const Register is_pre     = c_rarg0;  // pre or post sync

    __ align(CodeEntryAlignment);

    StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
    StubCodeMark mark(this, stub_id);

    // pre wbsync is a no-op
    // post wbsync translates to an sfence

    Label skip;
    address start = __ pc();
    __ enter();
    // Only the post sync (is_pre == 0) emits a barrier.
    __ cbnz(is_pre, skip);
    __ cache_wbsync(false);
    __ bind(skip);
    __ leave();
    __ ret(lr);

    return start;
  }
2747
2748 void generate_arraycopy_stubs() {
2749 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
2750 // entry immediately following their stack push. This can be used
2751 // as a post-push branch target for compatible stubs when they
2752 // identify a special case that can be handled by the fallback
    // stub e.g. a disjoint copy stub may be used as a special case
2754 // fallback for its compatible conjoint copy stub.
2755 //
2756 // A no push entry is always returned in the following local and
2757 // then published by assigning to the appropriate entry field in
2758 // class StubRoutines. The entry value is then passed to the
2759 // generator for the compatible stub. That means the entry must be
2760 // listed when saving to/restoring from the AOT cache, ensuring
2761 // that the inter-stub jumps are noted at AOT-cache save and
2762 // relocated at AOT cache load.
2763 address nopush_entry;
2764
2765 // generate the common exit first so later stubs can rely on it if
2766 // they want an UnsafeMemoryAccess exit non-local to the stub
2767 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2768 // register the stub as the default exit with class UnsafeMemoryAccess
2769 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2770
    // generate and publish aarch64-specific bulk copy routines first
2772 // so we can call them from other copy stubs
2773 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2774 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2775
2776 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2777 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2778
2779 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2780 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2781
2782 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2783
2784 //*** jbyte
2785 // Always need aligned and unaligned versions
2786 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2787 // disjoint nopush entry is needed by conjoint copy
2788 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2789 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2790 // conjoint nopush entry is needed by generic/unsafe copy
2791 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2792 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2793 // disjoint arrayof nopush entry is needed by conjoint copy
2794 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2795 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2796
2797 //*** jshort
2798 // Always need aligned and unaligned versions
2799 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2800 // disjoint nopush entry is needed by conjoint copy
2801 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
2802 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
2803 // conjoint nopush entry is used by generic/unsafe copy
2804 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
2805 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
2806 // disjoint arrayof nopush entry is needed by conjoint copy
2807 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
2808 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
2809
2810 //*** jint
2811 // Aligned versions
2812 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
2813 // disjoint arrayof nopush entry is needed by conjoint copy
2814 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
2815 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
2816 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2817 // jint_arraycopy_nopush always points to the unaligned version
2818 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
2819 // disjoint nopush entry is needed by conjoint copy
2820 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
2821 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
2822 // conjoint nopush entry is needed by generic/unsafe copy
2823 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
2824
2825 //*** jlong
2826 // It is always aligned
2827 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
2828 // disjoint arrayof nopush entry is needed by conjoint copy
2829 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
2830 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
2831 // conjoint nopush entry is needed by generic/unsafe copy
2832 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
2833 // disjoint normal/nopush and conjoint normal entries are not
2834 // generated since the arrayof versions are the same
2835 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2836 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
2837 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
2838
2839 //*** oops
2840 {
2841 StubRoutines::_arrayof_oop_disjoint_arraycopy
2842 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
2843 // disjoint arrayof nopush entry is needed by conjoint copy
2844 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
2845 StubRoutines::_arrayof_oop_arraycopy
2846 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
2847 // conjoint arrayof nopush entry is needed by generic/unsafe copy
2848 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
2849 // Aligned versions without pre-barriers
2850 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2851 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
2852 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
2853 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
2854 // note that we don't need a returned nopush entry because the
2855 // generic/unsafe copy does not cater for uninit arrays.
2856 StubRoutines::_arrayof_oop_arraycopy_uninit
2857 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
2858 }
2859
2860 // for oop copies reuse arrayof entries for non-arrayof cases
2861 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2862 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
2863 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
2864 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2865 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
2866 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
2867
2868 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
2869 // checkcast nopush entry is needed by generic copy
2870 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
2871 // note that we don't need a returned nopush entry because the
2872 // generic copy does not cater for uninit arrays.
2873 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
2874
2875 // unsafe arraycopy may fallback on conjoint stubs
2876 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
2877 StubRoutines::_jshort_arraycopy_nopush,
2878 StubRoutines::_jint_arraycopy_nopush,
2879 StubRoutines::_jlong_arraycopy_nopush);
2880
2881 // generic arraycopy may fallback on conjoint stubs
2882 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
2883 StubRoutines::_jshort_arraycopy_nopush,
2884 StubRoutines::_jint_arraycopy_nopush,
2885 StubRoutines::_oop_arraycopy_nopush,
2886 StubRoutines::_jlong_arraycopy_nopush,
2887 StubRoutines::_checkcast_arraycopy_nopush);
2888
2889 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
2890 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
2891 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
2892 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
2893 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
2894 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
2895 }
2896
2897 void generate_math_stubs() { Unimplemented(); }
2898
2899 // Arguments:
2900 //
2901 // Inputs:
2902 // c_rarg0 - source byte array address
2903 // c_rarg1 - destination byte array address
2904 // c_rarg2 - sessionKe (key) in little endian int array
2905 //
2906 address generate_aescrypt_encryptBlock() {
2907 __ align(CodeEntryAlignment);
2908 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2909 StubCodeMark mark(this, stub_id);
2910
2911 const Register from = c_rarg0; // source array address
2912 const Register to = c_rarg1; // destination array address
2913 const Register key = c_rarg2; // key array address
2914 const Register keylen = rscratch1;
2915
2916 address start = __ pc();
2917 __ enter();
2918
2919 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2920
2921 __ aesenc_loadkeys(key, keylen);
2922 __ aesecb_encrypt(from, to, keylen);
2923
2924 __ mov(r0, 0);
2925
2926 __ leave();
2927 __ ret(lr);
2928
2929 return start;
2930 }
2931
2932 // Arguments:
2933 //
2934 // Inputs:
2935 // c_rarg0 - source byte array address
2936 // c_rarg1 - destination byte array address
2937 // c_rarg2 - sessionKd (key) in little endian int array
2938 //
2939 address generate_aescrypt_decryptBlock() {
2940 assert(UseAES, "need AES cryptographic extension support");
2941 __ align(CodeEntryAlignment);
2942 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2943 StubCodeMark mark(this, stub_id);
2944 Label L_doLast;
2945
2946 const Register from = c_rarg0; // source array address
2947 const Register to = c_rarg1; // destination array address
2948 const Register key = c_rarg2; // key array address
2949 const Register keylen = rscratch1;
2950
2951 address start = __ pc();
2952 __ enter(); // required for proper stackwalking of RuntimeStub frame
2953
2954 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2955
2956 __ aesecb_decrypt(from, to, key, keylen);
2957
2958 __ mov(r0, 0);
2959
2960 __ leave();
2961 __ ret(lr);
2962
2963 return start;
2964 }
2965
  // AES encryption in CBC mode: each plaintext block is XORed with the
  // previous ciphertext block (initially the IV in the r vector) before
  // being encrypted.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - source byte array address
  //   c_rarg1 - destination byte array address
  //   c_rarg2 - sessionKe (key) in little endian int array
  //   c_rarg3 - r vector byte array address
  //   c_rarg4 - input length
  //
  // Output:
  //   x0 - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
    StubCodeMark mark(this, stub_id);

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from = c_rarg0; // source array address
    const Register to = c_rarg1; // destination array address
    const Register key = c_rarg2; // key array address
    const Register rvec = c_rarg3; // r byte array initialized from initvector array address
    // and left with the results of the last encryption block
    const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
    const Register keylen = rscratch1;

    address start = __ pc();

    __ enter();

    // Preserve the original length so it can be returned in r0.
    __ movw(rscratch2, len_reg);

    // Key array length in ints: 44/52/60 for AES-128/192/256.
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, rvec); // v0 = chaining value (IV, then last ciphertext)

    // Load only the round keys this key size needs into v17..v31. The
    // condition flags set by this cmpw are deliberately left live: no
    // later instruction writes NZCV, so the CC/EQ branches inside
    // L_aes_loop reuse them to skip rounds for shorter keys.
    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, v31, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);
    __ rev32(v31, __ T16B, v31);

    __ BIND(L_aes_loop);
    // CBC: XOR the next plaintext block into the chaining value, then encrypt.
    __ ld1(v1, __ T16B, __ post(from, 16));
    __ eor(v0, __ T16B, v0, v1);

    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aese(v0, v17); __ aesmc(v0, v0);
    __ aese(v0, v18); __ aesmc(v0, v0);
    __ BIND(L_rounds_52);
    __ aese(v0, v19); __ aesmc(v0, v0);
    __ aese(v0, v20); __ aesmc(v0, v0);
    __ BIND(L_rounds_44);
    __ aese(v0, v21); __ aesmc(v0, v0);
    __ aese(v0, v22); __ aesmc(v0, v0);
    __ aese(v0, v23); __ aesmc(v0, v0);
    __ aese(v0, v24); __ aesmc(v0, v0);
    __ aese(v0, v25); __ aesmc(v0, v0);
    __ aese(v0, v26); __ aesmc(v0, v0);
    __ aese(v0, v27); __ aesmc(v0, v0);
    __ aese(v0, v28); __ aesmc(v0, v0);
    __ aese(v0, v29); __ aesmc(v0, v0);
    __ aese(v0, v30); // last round: no MixColumns
    __ eor(v0, __ T16B, v0, v31); // final AddRoundKey

    __ st1(v0, __ T16B, __ post(to, 16));

    __ subw(len_reg, len_reg, 16);
    __ cbnzw(len_reg, L_aes_loop);

    // Save the last ciphertext block as the chaining value for the next call.
    __ st1(v0, __ T16B, rvec);

    __ mov(r0, rscratch2); // return the input length

    __ leave();
    __ ret(lr);

    return start;
  }
3070
  // AES decryption in CBC mode: each block is decrypted and then XORed
  // with the previous ciphertext block (initially the IV in the r vector).
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - source byte array address
  //   c_rarg1 - destination byte array address
  //   c_rarg2 - sessionKd (key) in little endian int array
  //   c_rarg3 - r vector byte array address
  //   c_rarg4 - input length
  //
  // Output:
  //   r0 - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES cryptographic extension support");
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
    StubCodeMark mark(this, stub_id);

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from = c_rarg0; // source array address
    const Register to = c_rarg1; // destination array address
    const Register key = c_rarg2; // key array address
    const Register rvec = c_rarg3; // r byte array initialized from initvector array address
    // and left with the results of the last encryption block
    const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
    const Register keylen = rscratch1;

    address start = __ pc();

    __ enter();

    // Preserve the original length so it can be returned in r0.
    __ movw(rscratch2, len_reg);

    // Key array length in ints: 44/52/60 for AES-128/192/256.
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v2, __ T16B, rvec); // v2 = chaining value (IV, then previous ciphertext)

    // First round key from the array; it is XORed in last (see the final
    // eor with v31 in the loop below).
    __ ld1(v31, __ T16B, __ post(key, 16));
    __ rev32(v31, __ T16B, v31);

    // Load only the round keys this key size needs. The condition flags
    // set by this cmpw stay valid (no later NZCV writes) and are reused
    // by the CC/EQ branches inside L_aes_loop.
    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);

    __ BIND(L_aes_loop);
    __ ld1(v0, __ T16B, __ post(from, 16));
    __ orr(v1, __ T16B, v0, v0); // keep a copy of the ciphertext for chaining

    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aesd(v0, v17); __ aesimc(v0, v0);
    __ aesd(v0, v18); __ aesimc(v0, v0);
    __ BIND(L_rounds_52);
    __ aesd(v0, v19); __ aesimc(v0, v0);
    __ aesd(v0, v20); __ aesimc(v0, v0);
    __ BIND(L_rounds_44);
    __ aesd(v0, v21); __ aesimc(v0, v0);
    __ aesd(v0, v22); __ aesimc(v0, v0);
    __ aesd(v0, v23); __ aesimc(v0, v0);
    __ aesd(v0, v24); __ aesimc(v0, v0);
    __ aesd(v0, v25); __ aesimc(v0, v0);
    __ aesd(v0, v26); __ aesimc(v0, v0);
    __ aesd(v0, v27); __ aesimc(v0, v0);
    __ aesd(v0, v28); __ aesimc(v0, v0);
    __ aesd(v0, v29); __ aesimc(v0, v0);
    __ aesd(v0, v30); // last round: no InvMixColumns
    __ eor(v0, __ T16B, v0, v31); // final AddRoundKey
    __ eor(v0, __ T16B, v0, v2);  // CBC: XOR with previous ciphertext (or IV)

    __ st1(v0, __ T16B, __ post(to, 16));
    __ orr(v2, __ T16B, v1, v1); // this block's ciphertext chains into the next

    __ subw(len_reg, len_reg, 16);
    __ cbnzw(len_reg, L_aes_loop);

    // Save the last ciphertext block back to the r vector for the next call.
    __ st1(v2, __ T16B, rvec);

    __ mov(r0, rscratch2); // return the input length

    __ leave();
    __ ret(lr);

    return start;
  }
3179
  // Big-endian 128-bit + 64-bit -> 128-bit addition.
  // Inputs: 128-bits. in is preserved.
  // The least-significant 64-bit word is in the upper dword of each vector.
  // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
  // Output: result
  void be_add_128_64(FloatRegister result, FloatRegister in,
                     FloatRegister inc, FloatRegister tmp) {
    // in may alias result; tmp must be distinct from result and inc.
    assert_different_registers(result, tmp, inc);

    __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
                                           // input
    // Unsigned compare-higher: each tmp lane is set to all-ones (-1)
    // where inc > result, i.e. where the low-dword addition carried out.
    __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
    __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
                                           // MSD == 0 (must be!) to LSD
    __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
  }
3196
  // CTR AES crypt.
  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - source byte array address
  //   c_rarg1 - destination byte array address
  //   c_rarg2 - sessionKe (key) in little endian int array
  //   c_rarg3 - counter vector byte array address
  //   c_rarg4 - input length
  //   c_rarg5 - saved encryptedCounter start
  //   c_rarg6 - saved used length
  //
  // Output:
  //   r0 - input length
  //
  address generate_counterMode_AESCrypt() {
    const Register in = c_rarg0;
    const Register out = c_rarg1;
    const Register key = c_rarg2;
    const Register counter = c_rarg3;
    const Register saved_len = c_rarg4, len = r10;
    const Register saved_encrypted_ctr = c_rarg5;
    const Register used_ptr = c_rarg6, used = r12;

    const Register offset = r7;
    const Register keylen = r11;

    const unsigned char block_size = 16;
    const int bulk_width = 4;
    // NB: bulk_width can be 4 or 8. 8 gives slightly faster
    // performance with larger data sizes, but it also means that the
    // fast path isn't used until you have at least 8 blocks, and up
    // to 127 bytes of data will be executed on the slow path. For
    // that reason, and also so as not to blow away too much icache, 4
    // blocks seems like a sensible compromise.

    // Algorithm:
    //
    //    if (len == 0) {
    //        goto DONE;
    //    }
    //    int result = len;
    //    do {
    //        if (used >= blockSize) {
    //            if (len >= bulk_width * blockSize) {
    //                CTR_large_block();
    //                if (len == 0)
    //                    goto DONE;
    //            }
    //            for (;;) {
    //                16ByteVector v0 = counter;
    //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
    //                used = 0;
    //                if (len < blockSize)
    //                    break;    /* goto NEXT */
    //                16ByteVector v1 = load16Bytes(in, offset);
    //                v1 = v1 ^ encryptedCounter;
    //                store16Bytes(out, offset);
    //                used = blockSize;
    //                offset += blockSize;
    //                len -= blockSize;
    //                if (len == 0)
    //                    goto DONE;
    //            }
    //        }
    //      NEXT:
    //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
    //        len--;
    //    } while (len != 0);
    //  DONE:
    //    return result;
    //
    // CTR_large_block()
    //    Wide bulk encryption of whole blocks.

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
    StubCodeMark mark(this, stub_id);
    const address start = __ pc();
    __ enter();

    Label DONE, CTR_large_block, large_block_return;
    __ ldrw(used, Address(used_ptr));
    __ cbzw(saved_len, DONE);

    __ mov(len, saved_len);
    __ mov(offset, 0);

    // Compute #rounds for AES based on the length of the key array
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);

    {
      Label L_CTR_loop, NEXT;

      __ bind(L_CTR_loop);

      // While unconsumed bytes remain in the saved encrypted counter,
      // consume them one at a time at NEXT below.
      __ cmp(used, block_size);
      __ br(__ LO, NEXT);

      // Maybe we have a lot of data
      __ subsw(rscratch1, len, bulk_width * block_size);
      __ br(__ HS, CTR_large_block);
      __ BIND(large_block_return);
      __ cbzw(len, DONE);

      // Setup the counter increment
      __ movi(v4, __ T4S, 0);
      __ movi(v5, __ T4S, 1);
      __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }

      // 128-bit big-endian increment
      __ ld1(v0, __ T16B, counter);
      __ rev64(v16, __ T16B, v0);
      be_add_128_64(v16, v16, v4, /*tmp*/v5);
      __ rev64(v16, __ T16B, v16);
      __ st1(v16, __ T16B, counter);
      // Previous counter value is in v0
      // v4 contains { 0, 1 }

      {
        // We have fewer than bulk_width blocks of data left. Encrypt
        // them one by one until there is less than a full block
        // remaining, being careful to save both the encrypted counter
        // and the counter.

        Label inner_loop;
        __ bind(inner_loop);
        // Counter to encrypt is in v0
        __ aesecb_encrypt(noreg, noreg, keylen);
        __ st1(v0, __ T16B, saved_encrypted_ctr);

        // Do we have a remaining full block?

        __ mov(used, 0);
        __ cmp(len, block_size);
        __ br(__ LO, NEXT);

        // Yes, we have a full block: XOR it with the encrypted counter.
        __ ldrq(v1, Address(in, offset));
        __ eor(v1, __ T16B, v1, v0);
        __ strq(v1, Address(out, offset));
        __ mov(used, block_size);
        __ add(offset, offset, block_size);

        __ subw(len, len, block_size);
        __ cbzw(len, DONE);

        // Increment the counter, store it back
        __ orr(v0, __ T16B, v16, v16);
        __ rev64(v16, __ T16B, v16);
        be_add_128_64(v16, v16, v4, /*tmp*/v5);
        __ rev64(v16, __ T16B, v16);
        __ st1(v16, __ T16B, counter); // Save the incremented counter back

        __ b(inner_loop);
      }

      __ BIND(NEXT);

      // Encrypt a single byte, and loop.
      // We expect this to be a rare event.
      __ ldrb(rscratch1, Address(in, offset));
      __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
      __ eor(rscratch1, rscratch1, rscratch2);
      __ strb(rscratch1, Address(out, offset));
      __ add(offset, offset, 1);
      __ add(used, used, 1);
      __ subw(len, len,1);
      __ cbnzw(len, L_CTR_loop);
    }

    __ bind(DONE);
    __ strw(used, Address(used_ptr));
    __ mov(r0, saved_len);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    // Bulk encryption

    __ BIND (CTR_large_block);
    assert(bulk_width == 4 || bulk_width == 8, "must be");

    // Save SIMD registers v8..v11 (and v12..v15 for bulk_width == 8)
    // before clobbering them below; restored after the bulk loop.
    if (bulk_width == 8) {
      __ sub(sp, sp, 4 * 16);
      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    }
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
    RegSet saved_regs = (RegSet::of(in, out, offset)
                         + RegSet::of(saved_encrypted_ctr, used_ptr, len));
    __ push(saved_regs, sp);
    __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
    __ add(in, in, offset);
    __ add(out, out, offset);

    // Keys should already be loaded into the correct registers

    __ ld1(v0, __ T16B, counter); // v0 contains the first counter
    __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter

    // AES/CTR loop
    {
      Label L_CTR_loop;
      __ BIND(L_CTR_loop);

      // Setup the counters
      __ movi(v8, __ T4S, 0);
      __ movi(v9, __ T4S, 1);
      __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }

      // Materialize bulk_width consecutive counter values in v0..v(bulk_width-1).
      for (int i = 0; i < bulk_width; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        __ rev64(v0_ofs, __ T16B, v16);
        be_add_128_64(v16, v16, v8, /*tmp*/v9);
      }

      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

      // Encrypt the counters
      __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);

      if (bulk_width == 8) {
        __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
      }

      // XOR the encrypted counters with the inputs
      for (int i = 0; i < bulk_width; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
        __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
      }

      // Write the encrypted data
      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
      if (bulk_width == 8) {
        __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
      }

      __ subw(len, len, 16 * bulk_width);
      __ cbnzw(len, L_CTR_loop);
    }

    // Save the counter back where it goes
    __ rev64(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);

    __ pop(saved_regs, sp);

    // Restore the saved SIMD registers.
    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    if (bulk_width == 8) {
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
    }

    // Account for the bytes processed by the bulk loop: len was restored
    // by the pop above, so subtract the rounded-down byte count and
    // advance offset by the same amount.
    __ andr(rscratch1, len, -16 * bulk_width);
    __ sub(len, len, rscratch1);
    __ add(offset, offset, rscratch1);
    __ mov(used, 16);
    __ strw(used, Address(used_ptr));
    __ b(large_block_return);

    return start;
  }
3462
  // Vector AES Galois Counter Mode implementation. Parameters:
  //
  // in = c_rarg0
  // len = c_rarg1
  // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
  // out = c_rarg3
  // key = c_rarg4
  // state = c_rarg5 - GHASH.state
  // subkeyHtbl = c_rarg6 - powers of H
  // counter = c_rarg7 - 16 bytes of CTR
  // return - number of processed bytes
  address generate_galoisCounterMode_AESCrypt() {
    Label ghash_polynomial;     // local data generated after code

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register in = c_rarg0;
    const Register len = c_rarg1;
    const Register ct = c_rarg2;
    const Register out = c_rarg3;
    // and updated with the incremented counter in the end

    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    const Register counter = c_rarg7;

    const Register keylen = r10;
    // Save state before entering routine
    __ sub(sp, sp, 4 * 16);
    __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));

    // __ andr(len, len, -512);
    __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
    // Save the rounded-down length; reloaded below for the GHASH pass and
    // at DONE as the return value.
    __ str(len, __ pre(sp, -2 * wordSize));

    Label DONE;
    __ cbz(len, DONE);

    // Compute #rounds for AES based on the length of the key array
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ aesenc_loadkeys(key, keylen);
    __ ld1(v0, __ T16B, counter); // v0 contains the first counter
    __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter

    // AES/CTR loop
    {
      Label L_CTR_loop;
      __ BIND(L_CTR_loop);

      // Setup the counters
      __ movi(v8, __ T4S, 0);
      __ movi(v9, __ T4S, 1);
      __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }

      // Materialize 8 consecutive counter values in v0..v7; only one
      // 32-bit lane of the counter is incremented per step.
      assert(v0->encoding() < v8->encoding(), "");
      for (int i = v0->encoding(); i < v8->encoding(); i++) {
        FloatRegister f = as_FloatRegister(i);
        __ rev32(f, __ T16B, v16);
        __ addv(v16, __ T4S, v16, v8);
      }

      __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

      // Encrypt the counters
      __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);

      __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));

      // XOR the encrypted counters with the inputs
      for (int i = 0; i < 8; i++) {
        FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
        FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
        __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
      }
      __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
      __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));

      __ subw(len, len, 16 * 8);
      __ cbnzw(len, L_CTR_loop);
    }

    // Write the incremented counter back for the caller.
    __ rev32(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);

    __ ldr(len, Address(sp));
    __ lsr(len, len, exact_log2(16)); // We want the count of blocks

    // GHASH/CTR loop
    __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
                                len, /*unrolls*/4);

#ifdef ASSERT
    { Label L;
      __ cmp(len, (unsigned char)0);
      __ br(Assembler::EQ, L);
      __ stop("stubGenerator: abort");
      __ bind(L);
    }
#endif

    __ bind(DONE);
    // Return the number of bytes processed
    __ ldr(r0, __ post(sp, 2 * wordSize));

    // Restore the SIMD registers saved on entry.
    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    // bind label and generate polynomial data
    __ align(wordSize * 2);
    __ bind(ghash_polynomial);
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    return start;
  }
3594
3595 class Cached64Bytes {
3596 private:
3597 MacroAssembler *_masm;
3598 Register _regs[8];
3599
3600 public:
3601 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
3602 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
3603 auto it = rs.begin();
3604 for (auto &r: _regs) {
3605 r = *it;
3606 ++it;
3607 }
3608 }
3609
3610 void gen_loads(Register base) {
3611 for (int i = 0; i < 8; i += 2) {
3612 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3613 }
3614 }
3615
3616 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3617 void extract_u32(Register dest, int i) {
3618 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3619 }
3620 };
3621
  // Utility routines for md5.
  // Clobbers r10 and r11.
  //
  // Each md5_XX helper emits one MD5 round operation:
  //   r1 = r2 + rotl(r1 + XX(r2, r3, r4) + x[k] + t, s)
  // where x[k] is the k-th 32-bit word of the cached 64-byte input block.
  // Instructions are interleaved to shorten dependency chains.
  //
  // Round 1: F(b, c, d) = (b & c) | (~b & d), computed as ((c ^ d) & b) ^ d.
  void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ eorw(rscratch3, r3, r4);
    __ movw(rscratch2, t);
    __ andw(rscratch3, rscratch3, r2);
    __ addw(rscratch4, r1, rscratch2);       // rscratch4 = r1 + t
    reg_cache.extract_u32(rscratch1, k);     // rscratch1 = x[k]
    __ eorw(rscratch3, rscratch3, r4);       // rscratch3 = F(r2, r3, r4)
    __ addw(rscratch4, rscratch4, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);   // ror by (32 - s) == rotl by s
    __ addw(r1, rscratch2, r2);
  }
3640
  // MD5 round 2: r1 = r2 + rotl(r1 + G(r2, r3, r4) + x[k] + t, s) with
  // G(b, c, d) = (b & d) | (c & ~d). The two terms are bitwise disjoint,
  // so ADD is used in place of OR. Clobbers r10 and r11.
  void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    reg_cache.extract_u32(rscratch1, k);     // rscratch1 = x[k]
    __ movw(rscratch2, t);
    __ addw(rscratch4, r1, rscratch2);       // rscratch4 = r1 + t
    __ addw(rscratch4, rscratch4, rscratch1);
    __ bicw(rscratch2, r3, r4);              // r3 & ~r4
    __ andw(rscratch3, r2, r4);              // r2 & r4
    __ addw(rscratch2, rscratch2, rscratch4);
    __ addw(rscratch2, rscratch2, rscratch3);
    __ rorw(rscratch2, rscratch2, 32 - s);   // ror by (32 - s) == rotl by s
    __ addw(r1, rscratch2, r2);
  }
3657
  // MD5 round 3: r1 = r2 + rotl(r1 + H(r2, r3, r4) + x[k] + t, s) with
  // H(b, c, d) = b ^ c ^ d. Clobbers r10 and r11.
  void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ eorw(rscratch3, r3, r4);
    __ movw(rscratch2, t);
    __ addw(rscratch4, r1, rscratch2);       // rscratch4 = r1 + t
    reg_cache.extract_u32(rscratch1, k);     // rscratch1 = x[k]
    __ eorw(rscratch3, rscratch3, r2);       // rscratch3 = r2 ^ r3 ^ r4
    __ addw(rscratch4, rscratch4, rscratch1);
    __ addw(rscratch3, rscratch3, rscratch4);
    __ rorw(rscratch2, rscratch3, 32 - s);   // ror by (32 - s) == rotl by s
    __ addw(r1, rscratch2, r2);
  }
3673
  // One MD5 round-4 step (clobbers r10 and r11):
  //   r1 = r2 + rol(r1 + I(r2, r3, r4) + x[k] + t, s)
  // where I(x, y, z) = y ^ (x | ~z), computed here via orn (r2 | ~r4).
  void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
              int k, int s, int t) {
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    __ movw(rscratch3, t);               // round constant t
    __ ornw(rscratch2, r2, r4);          // r2 | ~r4
    __ addw(rscratch4, r1, rscratch3);   // r1 + t
    reg_cache.extract_u32(rscratch1, k); // x[k]
    __ eorw(rscratch3, rscratch2, r3);   // I(r2, r3, r4)
    __ addw(rscratch4, rscratch4, rscratch1); // r1 + t + x[k]
    __ addw(rscratch3, rscratch3, rscratch4); // I + r1 + t + x[k]
    __ rorw(rscratch2, rscratch3, 32 - s);    // rotate left by s
    __ addw(r1, rscratch2, r2);
  }
3689
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  // Generates the MD5 block-compress stub. The multi-block (MB) variant
  // loops over successive 64-byte blocks until ofs > limit and returns
  // the updated offset in c_rarg0.
  address generate_md5_implCompress(StubId stub_id) {
    // Decide single-block vs. multi-block behavior from the stub id.
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_md5_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_md5_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }
    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register buf       = c_rarg0;
    Register state     = c_rarg1;
    Register ofs       = c_rarg2;
    Register limit     = c_rarg3;
    Register a         = r4;     // MD5 working variables a, b, c, d
    Register b         = r5;
    Register c         = r6;
    Register d         = r7;
    Register rscratch3 = r10;
    Register rscratch4 = r11;

    // state_regs hold the packed 128-bit state: {a|b<<32, c|d<<32}.
    Register state_regs[2] = { r12, r13 };
    // r18 is the platform/TLS register on aarch64 and must not be used.
    RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
    Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers

    __ push(saved_regs, sp);

    // Unpack the four 32-bit state words from two 64-bit loads.
    __ ldp(state_regs[0], state_regs[1], Address(state));
    __ ubfx(a, state_regs[0],  0, 32);
    __ ubfx(b, state_regs[0], 32, 32);
    __ ubfx(c, state_regs[1],  0, 32);
    __ ubfx(d, state_regs[1], 32, 32);

    Label md5_loop;
    __ BIND(md5_loop);

    // Cache the current 64-byte input block in registers.
    reg_cache.gen_loads(buf);

    // Round 1
    md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
    md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
    md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
    md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
    md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
    md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
    md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
    md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
    md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
    md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
    md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
    md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
    md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
    md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
    md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
    md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);

    // Round 2
    md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
    md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
    md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
    md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
    md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
    md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
    md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
    md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
    md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
    md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
    md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
    md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
    md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
    md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
    md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
    md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);

    // Round 3
    md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
    md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
    md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
    md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
    md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
    md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
    md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
    md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
    md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
    md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
    md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
    md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
    md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
    md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
    md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
    md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);

    // Round 4
    md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
    md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
    md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
    md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
    md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
    md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
    md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
    md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
    md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
    md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
    md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
    md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
    md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
    md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
    md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
    md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);

    // Add this block's result into the saved state words; the high
    // halves of the packed state registers are re-extracted first.
    __ addw(a, state_regs[0], a);
    __ ubfx(rscratch2, state_regs[0], 32, 32);
    __ addw(b, rscratch2, b);
    __ addw(c, state_regs[1], c);
    __ ubfx(rscratch4, state_regs[1], 32, 32);
    __ addw(d, rscratch4, d);

    // Re-pack a,b,c,d into two 64-bit registers for the next iteration
    // (and for the final store).
    __ orr(state_regs[0], a, b, Assembler::LSL, 32);
    __ orr(state_regs[1], c, d, Assembler::LSL, 32);

    if (multi_block) {
      __ add(buf, buf, 64);
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, md5_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // write hash values back in the correct order
    __ stp(state_regs[0], state_regs[1], Address(state));

    __ pop(saved_regs, sp);

    __ ret(lr);

    return start;
  }
3842
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  // Generates the SHA-1 block-compress stub using the ARMv8 SHA1
  // crypto extension instructions (sha1c/sha1p/sha1m/sha1h/sha1su0/sha1su1).
  address generate_sha1_implCompress(StubId stub_id) {
    // Decide single-block vs. multi-block behavior from the stub id.
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha1_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha1_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label keys;
    Label sha1_loop;

    // load the keys into v0..v3
    // (the four SHA-1 round constants are emitted after the code, at `keys`;
    // ld4r replicates each one across all four lanes of its register)
    __ adr(rscratch1, keys);
    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load 5 words state into v6, v7
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));


    __ BIND(sha1_loop);
    // load 64 bytes of data into v16..v19
    // (multi-block mode post-increments buf past the consumed block)
    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
    // byte-swap each 32-bit word: input is big-endian
    __ rev32(v16, __ T16B, v16);
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ rev32(v19, __ T16B, v19);

    // do the sha1
    __ addv(v4, __ T4S, v16, v0);     // first message-schedule + constant
    __ orr(v20, __ T16B, v6, v6);     // working copy of a..d in v20

    FloatRegister d0 = v16;
    FloatRegister d1 = v17;
    FloatRegister d2 = v18;
    FloatRegister d3 = v19;

    // 20 iterations, each covering 4 of the 80 SHA-1 rounds. Temp/key
    // registers alternate between even and odd iterations so that the
    // schedule update for the next iteration overlaps the current one.
    for (int round = 0; round < 20; round++) {
      FloatRegister tmp1 = (round & 1) ? v4 : v5;
      FloatRegister tmp2 = (round & 1) ? v21 : v22;
      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
      FloatRegister tmp4 = (round & 1) ? v5 : v4;
      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));

      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);     // schedule expansion (part 1)
      if (round < 19) __ addv(tmp1, __ T4S, d1, key);     // precompute w + k for a later iteration
      __ sha1h(tmp2, __ T4S, v20);                        // rotate e
      // choose the round function: Ch for rounds 0..19, Parity for
      // 20..39 and 60..79, Maj for 40..59 (in 4-round groups).
      if (round < 5)
        __ sha1c(v20, __ T4S, tmp3, tmp4);
      else if (round < 10 || round >= 15)
        __ sha1p(v20, __ T4S, tmp3, tmp4);
      else
        __ sha1m(v20, __ T4S, tmp3, tmp4);
      if (round < 16) __ sha1su1(d0, __ T4S, d3);         // schedule expansion (part 2)

      // rotate the schedule registers
      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    // accumulate into the saved state: e in v7, a..d in v6
    __ addv(v7, __ T2S, v7, v21);
    __ addv(v6, __ T4S, v6, v20);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ strq(v6, Address(state, 0));
    __ strs(v7, Address(state, 16));

    __ ret(lr);

    // The four SHA-1 round constants, loaded via ld4r above.
    __ bind(keys);
    __ emit_int32(0x5a827999);
    __ emit_int32(0x6ed9eba1);
    __ emit_int32(0x8f1bbcdc);
    __ emit_int32(0xca62c1d6);

    return start;
  }
3946
3947
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  // Generates the SHA-256 block-compress stub using the ARMv8 SHA2
  // crypto extension instructions (sha256h/sha256h2/sha256su0/sha256su1).
  address generate_sha256_implCompress(StubId stub_id) {
    // Decide single-block vs. multi-block behavior from the stub id.
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha256_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha256_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }

    // The 64 SHA-256 round constants (FIPS 180-4, K0..K63).
    static const uint32_t round_consts[64] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    // NOTE: the label is (historically) named sha1_loop but it is the
    // per-64-byte-block loop of this SHA-256 stub.
    Label sha1_loop;

    // v8..v11 are callee-saved (bottom 64 bits); save them on the stack.
    __ stpd(v8, v9, __ pre(sp, -32));
    __ stpd(v10, v11, Address(sp, 16));

    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7

    // load 16 keys to v16..v31
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);

    // load 8 words (256 bits) state
    __ ldpq(v0, v1, state);

    __ BIND(sha1_loop);
    // load 64 bytes of data into v8..v11
    // (multi-block mode post-increments buf past the consumed block)
    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
    // byte-swap each 32-bit word: input is big-endian
    __ rev32(v8, __ T16B, v8);
    __ rev32(v9, __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    __ addv(v6, __ T4S, v8, v16);    // first message-schedule + constants
    __ orr(v2, __ T16B, v0, v0);     // working copies of the state
    __ orr(v3, __ T16B, v1, v1);

    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;


    // 16 iterations, each covering 4 of the 64 SHA-256 rounds. Temps
    // alternate between even and odd iterations so the next iteration's
    // (w + k) precompute overlaps the current hash update.
    for (int round = 0; round < 16; round++) {
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);        // schedule expansion (part 1)
      __ orr(v4, __ T16B, v2, v2);                         // save half-state for sha256h2
      if (round < 15)
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); // next (w + k); keys live in v17..v31
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);    // schedule expansion (part 2)

      // rotate the schedule registers
      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    // accumulate this block's result into the saved state
    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // restore callee-saved registers
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    __ stpq(v0, v1, state);

    __ ret(lr);

    return start;
  }
4075
  // Double rounds for sha512.
  // One "double round" processes two of the 80 SHA-512 rounds using the
  // ARMv8 SHA512 extension (sha512h/sha512h2/sha512su0/sha512su1).
  //   dr    - double-round index, 0..39
  //   vi0..vi4 - rotating views of the working state
  //   vrc0  - the round constant pair for this double round
  //   vrc1  - register into which the next constant pair is preloaded
  //           (skipped for dr >= 36, when no more constants remain)
  //   vin0..vin4 - message-schedule registers; schedule expansion stops
  //           at dr >= 32 since no further message words are needed
  // Clobbers v5, v6, v7; reads the constant stream via rscratch2.
  void sha512_dround(int dr,
                     FloatRegister vi0, FloatRegister vi1,
                     FloatRegister vi2, FloatRegister vi3,
                     FloatRegister vi4, FloatRegister vrc0,
                     FloatRegister vrc1, FloatRegister vin0,
                     FloatRegister vin1, FloatRegister vin2,
                     FloatRegister vin3, FloatRegister vin4) {
    if (dr < 36) {
      __ ld1(vrc1, __ T2D, __ post(rscratch2, 16)); // preload next constant pair
    }
    __ addv(v5, __ T2D, vrc0, vin0);                // w + k
    __ ext(v6, __ T16B, vi2, vi3, 8);
    __ ext(v5, __ T16B, v5, v5, 8);
    __ ext(v7, __ T16B, vi1, vi2, 8);
    __ addv(vi3, __ T2D, vi3, v5);
    if (dr < 32) {
      __ ext(v5, __ T16B, vin3, vin4, 8);
      __ sha512su0(vin0, __ T2D, vin1);             // schedule expansion (part 1)
    }
    __ sha512h(vi3, __ T2D, v6, v7);
    if (dr < 32) {
      __ sha512su1(vin0, __ T2D, vin2, v5);         // schedule expansion (part 2)
    }
    __ addv(vi4, __ T2D, vi1, vi3);
    __ sha512h2(vi3, __ T2D, vi1, vi0);
  }
4103
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  // Generates the SHA-512 block-compress stub (128-byte blocks) using the
  // ARMv8 SHA512 crypto extension via sha512_dround().
  address generate_sha512_implCompress(StubId stub_id) {
    // Decide single-block vs. multi-block behavior from the stub id.
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha512_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha512_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }

    // The 80 SHA-512 round constants (FIPS 180-4, K0..K79).
    static const uint64_t round_consts[80] = {
      0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
      0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
      0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
      0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
      0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
      0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
      0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
      0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
      0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
      0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
      0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
      0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
      0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
      0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
      0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
      0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
      0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
      0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
      0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
      0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
      0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
      0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
      0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
      0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
      0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
      0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
      0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
    };

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    // v8..v15 are callee-saved (bottom 64 bits); save them on the stack.
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    Label sha512_loop;

    // load state
    __ ld1(v8, v9, v10, v11, __ T2D, state);

    // load first 4 round constants
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));

    __ BIND(sha512_loop);
    // load 128B of data into v12..v19
    __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
    __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
    // byte-swap each 64-bit word: input is big-endian
    __ rev64(v12, __ T16B, v12);
    __ rev64(v13, __ T16B, v13);
    __ rev64(v14, __ T16B, v14);
    __ rev64(v15, __ T16B, v15);
    __ rev64(v16, __ T16B, v16);
    __ rev64(v17, __ T16B, v17);
    __ rev64(v18, __ T16B, v18);
    __ rev64(v19, __ T16B, v19);

    // rscratch2 walks the remaining round constants (dround preloads them)
    __ mov(rscratch2, rscratch1);

    // working copy of the state in v0..v3
    __ mov(v0, __ T16B, v8);
    __ mov(v1, __ T16B, v9);
    __ mov(v2, __ T16B, v10);
    __ mov(v3, __ T16B, v11);

    // 40 double rounds == 80 SHA-512 rounds. The state views rotate
    // with period 5 and the schedule registers v12..v19 with period 8;
    // constants cycle through v24..v31. From double-round 32 on, no
    // further schedule expansion is needed, so the last four register
    // arguments are dont-cares (v0).
    sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
    sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
    sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
    sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
    sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
    sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
    sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
    sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
    sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
    sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
    sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
    sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
    sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
    sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
    sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
    sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
    sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
    sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
    sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
    sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);

    // accumulate this block's result into the saved state
    __ addv(v8, __ T2D, v8, v0);
    __ addv(v9, __ T2D, v9, v1);
    __ addv(v10, __ T2D, v10, v2);
    __ addv(v11, __ T2D, v11, v3);

    if (multi_block) {
      __ add(ofs, ofs, 128);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha512_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ st1(v8, v9, v10, v11, __ T2D, state);

    // restore callee-saved registers
    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    return start;
  }
4263
  // Execute one round of keccak of two computations in parallel.
  // One of the states should be loaded into the lower halves of
  // the vector registers v0-v24, the other should be loaded into
  // the upper halves of those registers. The ld1r instruction loads
  // the round constant into both halves of register v31.
  // Intermediate results c0...c5 and d0...d5 are computed
  // in registers v25...v30.
  // All vector instructions that are used operate on both register
  // halves in parallel.
  // If only a single computation is needed, one can only load the lower halves.
  void keccak_round(Register rscratch1) {
    // theta step: column parities c0..c4
    __ eor3(v29, __ T16B, v4, v9, v14);       // c4 = a4 ^ a9 ^ a14
    __ eor3(v26, __ T16B, v1, v6, v11);       // c1 = a1 ^ a6 ^ a11
    __ eor3(v28, __ T16B, v3, v8, v13);       // c3 = a3 ^ a8 ^ a13
    __ eor3(v25, __ T16B, v0, v5, v10);       // c0 = a0 ^ a5 ^ a10
    __ eor3(v27, __ T16B, v2, v7, v12);       // c2 = a2 ^ a7 ^ a12
    __ eor3(v29, __ T16B, v29, v19, v24);     // c4 ^= a19 ^ a24
    __ eor3(v26, __ T16B, v26, v16, v21);     // c1 ^= a16 ^ a21
    __ eor3(v28, __ T16B, v28, v18, v23);     // c3 ^= a18 ^ a23
    __ eor3(v25, __ T16B, v25, v15, v20);     // c0 ^= a15 ^ a20
    __ eor3(v27, __ T16B, v27, v17, v22);     // c2 ^= a17 ^ a22

    // theta step: d values
    __ rax1(v30, __ T2D, v29, v26);           // d0 = c4 ^ rol(c1, 1)
    __ rax1(v26, __ T2D, v26, v28);           // d2 = c1 ^ rol(c3, 1)
    __ rax1(v28, __ T2D, v28, v25);           // d4 = c3 ^ rol(c0, 1)
    __ rax1(v25, __ T2D, v25, v27);           // d1 = c0 ^ rol(c2, 1)
    __ rax1(v27, __ T2D, v27, v29);           // d3 = c2 ^ rol(c4, 1)

    // theta + rho + pi: xar combines the d-xor with the lane rotation.
    // Primed names (a10' etc.) denote lanes parked in a temporary
    // register until the chi step below.
    __ eor(v0, __ T16B, v0, v30);             // a0 = a0 ^ d0
    __ xar(v29, __ T2D, v1, v25, (64 - 1));   // a10' = rol((a1^d1), 1)
    __ xar(v1, __ T2D, v6, v25, (64 - 44));   // a1 = rol(a6^d1), 44)
    __ xar(v6, __ T2D, v9, v28, (64 - 20));   // a6 = rol((a9^d4), 20)
    __ xar(v9, __ T2D, v22, v26, (64 - 61));  // a9 = rol((a22^d2), 61)
    __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
    __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
    __ xar(v31, __ T2D, v2, v26, (64 - 62));  // a20' = rol((a2^d2), 62)
    __ xar(v2, __ T2D, v12, v26, (64 - 43));  // a2 = rol((a12^d2), 43)
    __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
    __ xar(v13, __ T2D, v19, v28, (64 - 8));  // a13 = rol((a19^d4), 8)
    __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
    __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
    __ xar(v15, __ T2D, v4, v28, (64 - 27));  // a15 = rol((a4^d4), 27)
    __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
    __ xar(v24, __ T2D, v21, v25, (64 - 2));  // a24 = rol((a21^d1), 2)
    __ xar(v8, __ T2D, v8, v27, (64 - 55));   // a21' = rol((a8^d3), 55)
    __ xar(v4, __ T2D, v16, v25, (64 - 45));  // a8' = rol((a16^d1), 45)
    __ xar(v16, __ T2D, v5, v30, (64 - 36));  // a16 = rol((a5^d0), 36)
    __ xar(v5, __ T2D, v3, v27, (64 - 28));   // a5 = rol((a3^d3), 28)
    __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
    __ xar(v3, __ T2D, v17, v26, (64 - 15));  // a18' = rol((a17^d2), 15)
    __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
    __ xar(v26, __ T2D, v7, v26, (64 - 6));   // a11' = rol((a7^d2), 6)
    __ xar(v30, __ T2D, v10, v30, (64 - 3));  // a7' = rol((a10^d0), 3)

    // chi step, row by row: bcax(d, n, m, a) computes n ^ (m & ~a)
    __ bcax(v20, __ T16B, v31, v22, v8);      // a20 = a20' ^ (~a21' & a22)
    __ bcax(v21, __ T16B, v8, v23, v22);      // a21 = a21' ^ (~a22 & a23)
    __ bcax(v22, __ T16B, v22, v24, v23);     // a22 = a22 ^ (~a23 & a24)
    __ bcax(v23, __ T16B, v23, v31, v24);     // a23 = a23 ^ (~a24 & a20')
    __ bcax(v24, __ T16B, v24, v8, v31);      // a24 = a24 ^ (~a20' & a21')

    __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]

    __ bcax(v17, __ T16B, v25, v19, v3);      // a17 = a17' ^ (~a18' & a19)
    __ bcax(v18, __ T16B, v3, v15, v19);      // a18 = a18' ^ (~a19 & a15')
    __ bcax(v19, __ T16B, v19, v16, v15);     // a19 = a19 ^ (~a15 & a16)
    __ bcax(v15, __ T16B, v15, v25, v16);     // a15 = a15 ^ (~a16 & a17')
    __ bcax(v16, __ T16B, v16, v3, v25);      // a16 = a16 ^ (~a17' & a18')

    __ bcax(v10, __ T16B, v29, v12, v26);     // a10 = a10' ^ (~a11' & a12)
    __ bcax(v11, __ T16B, v26, v13, v12);     // a11 = a11' ^ (~a12 & a13)
    __ bcax(v12, __ T16B, v12, v14, v13);     // a12 = a12 ^ (~a13 & a14)
    __ bcax(v13, __ T16B, v13, v29, v14);     // a13 = a13 ^ (~a14 & a10')
    __ bcax(v14, __ T16B, v14, v26, v29);     // a14 = a14 ^ (~a10' & a11')

    __ bcax(v7, __ T16B, v30, v9, v4);        // a7 = a7' ^ (~a8' & a9)
    __ bcax(v8, __ T16B, v4, v5, v9);         // a8 = a8' ^ (~a9 & a5)
    __ bcax(v9, __ T16B, v9, v6, v5);         // a9 = a9 ^ (~a5 & a6)
    __ bcax(v5, __ T16B, v5, v30, v6);        // a5 = a5 ^ (~a6 & a7)
    __ bcax(v6, __ T16B, v6, v4, v30);        // a6 = a6 ^ (~a7 & a8')

    __ bcax(v3, __ T16B, v27, v0, v28);       // a3 = a3' ^ (~a4' & a0)
    __ bcax(v4, __ T16B, v28, v1, v0);        // a4 = a4' ^ (~a0 & a1)
    __ bcax(v0, __ T16B, v0, v2, v1);         // a0 = a0 ^ (~a1 & a2)
    __ bcax(v1, __ T16B, v1, v27, v2);        // a1 = a1 ^ (~a2 & a3)
    __ bcax(v2, __ T16B, v2, v28, v27);       // a2 = a2 ^ (~a3 & a4')

    // iota step
    __ eor(v0, __ T16B, v0, v31);             // a0 = a0 ^ rc
  }
4352
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - byte[]  SHA.state
  //   c_rarg2   - int     block_size
  //   c_rarg3   - int     offset
  //   c_rarg4   - int     limit
  //
  // Generates the SHA-3/SHAKE absorb stub. The rate (block_size) selects
  // the variant: 72 = SHA3-512, 104 = SHA3-384, 136 = SHA3-256/SHAKE256,
  // 144 = SHA3-224, 168 = SHAKE128. The branchy absorb phase XORs exactly
  // block_size bytes of input into the state before running 24 Keccak rounds.
  address generate_sha3_implCompress(StubId stub_id) {
    // Decide single-block vs. multi-block behavior from the stub id.
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha3_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha3_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }

    // The 24 Keccak-f[1600] round constants (FIPS 202).
    static const uint64_t round_consts[24] = {
      0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
      0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
      0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
      0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
      0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
      0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
      0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
      0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
    };

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register buf        = c_rarg0;
    Register state      = c_rarg1;
    Register block_size = c_rarg2;
    Register ofs        = c_rarg3;
    Register limit      = c_rarg4;

    Label sha3_loop, rounds24_loop;
    Label sha3_512_or_sha3_384, shake128;

    // v8..v15 are callee-saved (bottom 64 bits); save them on the stack.
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    // load state: the 25 64-bit lanes go into the low halves of v0..v24
    __ add(rscratch1, state, 32);
    __ ld1(v0, v1, v2, v3, __ T1D, state);
    __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
    __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
    __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
    __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
    __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
    __ ld1(v24, __ T1D, rscratch1);

    __ BIND(sha3_loop);

    // 24 keccak rounds
    __ movw(rscratch2, 24);

    // load round_constants base
    __ lea(rscratch1, ExternalAddress((address) round_consts));

    // load input: the first 56 bytes are common to all variants
    // (the smallest rate is 72)
    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
    __ eor(v0, __ T8B, v0, v25);
    __ eor(v1, __ T8B, v1, v26);
    __ eor(v2, __ T8B, v2, v27);
    __ eor(v3, __ T8B, v3, v28);
    __ eor(v4, __ T8B, v4, v29);
    __ eor(v5, __ T8B, v5, v30);
    __ eor(v6, __ T8B, v6, v31);

    // block_size == 72, SHA3-512; block_size == 104, SHA3-384
    // (these are the only rates with bit 7 clear)
    __ tbz(block_size, 7, sha3_512_or_sha3_384);

    // rate >= 136: absorb bytes 56..111
    __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
    __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);
    __ eor(v9, __ T8B, v9, v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);
    __ eor(v13, __ T8B, v13, v31);

    // absorb bytes 112..135
    __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
    __ eor(v14, __ T8B, v14, v25);
    __ eor(v15, __ T8B, v15, v26);
    __ eor(v16, __ T8B, v16, v27);

    // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
    __ andw(c_rarg5, block_size, 48);
    __ cbzw(c_rarg5, rounds24_loop);

    __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
    __ ldrd(v28, __ post(buf, 8));
    __ eor(v17, __ T8B, v17, v28);
    __ b(rounds24_loop);

    __ BIND(shake128);
    // absorb bytes 136..167
    __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
    __ eor(v17, __ T8B, v17, v28);
    __ eor(v18, __ T8B, v18, v29);
    __ eor(v19, __ T8B, v19, v30);
    __ eor(v20, __ T8B, v20, v31);
    __ b(rounds24_loop); // block_size == 168, SHAKE128

    __ BIND(sha3_512_or_sha3_384);
    // absorb bytes 56..71
    __ ld1(v25, v26, __ T8B, __ post(buf, 16));
    __ eor(v7, __ T8B, v7, v25);
    __ eor(v8, __ T8B, v8, v26);
    __ tbz(block_size, 5, rounds24_loop); // SHA3-512

    // SHA3-384: absorb bytes 72..103
    __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
    __ eor(v9, __ T8B, v9, v27);
    __ eor(v10, __ T8B, v10, v28);
    __ eor(v11, __ T8B, v11, v29);
    __ eor(v12, __ T8B, v12, v30);

    __ BIND(rounds24_loop);
    __ subw(rscratch2, rscratch2, 1);

    keccak_round(rscratch1);

    __ cbnzw(rscratch2, rounds24_loop);

    if (multi_block) {
      __ add(ofs, ofs, block_size);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha3_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // store the updated state back
    __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
    __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
    __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
    __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
    __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
    __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
    __ st1(v24, __ T1D, state);

    // restore callee-saved registers
    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ ret(lr);

    return start;
  }
4515
  // Keccak-f[1600] permutation applied to two states in parallel.
  //
  // Inputs:
  //   c_rarg0 - long[] state0
  //   c_rarg1 - long[] state1
  address generate_double_keccak() {
    // The 24 round constants of Keccak-f[1600], one per round; the base
    // address is handed to keccak_round() via rscratch1.
    static const uint64_t round_consts[24] = {
      0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
      0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
      0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
      0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
      0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
      0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
      0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
      0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
    };

    // Implements the double_keccak() method of the
    // sun.security.provider.SHA3Parallel class
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "double_keccak");
    address start = __ pc();
    __ enter();

    Register state0 = c_rarg0;
    Register state1 = c_rarg1;

    Label rounds24_loop;

    // save callee-saved registers
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    // load states: D-lane 0 of v0..v24 receives the 25 lanes of state0,
    // D-lane 1 the lanes of state1, so each keccak_round() call permutes
    // both states at once.
    __ add(rscratch1, state0, 32);
    __ ld4(v0, v1, v2, v3, __ D, 0, state0);
    __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
    __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
    __ ld1(v24, __ D, 0, rscratch1);
    __ add(rscratch1, state1, 32);
    __ ld4(v0, v1, v2, v3, __ D, 1, state1);
    __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
    __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
    __ ld1(v24, __ D, 1, rscratch1);

    // 24 keccak rounds
    __ movw(rscratch2, 24);

    // load round_constants base
    __ lea(rscratch1, ExternalAddress((address) round_consts));

    __ BIND(rounds24_loop);
    __ subw(rscratch2, rscratch2, 1);
    keccak_round(rscratch1);
    __ cbnzw(rscratch2, rounds24_loop);

    // write both permuted states back, de-interleaving the D-lanes
    __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
    __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
    __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
    __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
    __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
    __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
    __ st1(v24, __ D, 0, state0);
    __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
    __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
    __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
    __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
    __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
    __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
    __ st1(v24, __ D, 1, state1);

    // restore callee-saved vector registers
    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
4605
  // ChaCha20 block function. This version parallelizes the 32-bit
  // state elements on each of 16 vectors, producing 4 blocks of
  // keystream at a time.
  //
  // state (int[16]) = c_rarg0
  // keystream (byte[256]) = c_rarg1
  // return - number of bytes of produced keystream (always 256)
  //
  // This implementation takes each 32-bit integer from the state
  // array and broadcasts it across all 4 32-bit lanes of a vector register
  // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
  // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
  // the quarter round schedule is implemented as outlined in RFC 7539 section
  // 2.3. However, instead of sequentially processing the 3 quarter round
  // operations represented by one QUARTERROUND function, we instead stack all
  // the adds, xors and left-rotations from the first 4 quarter rounds together
  // and then do the same for the second set of 4 quarter rounds. This removes
  // some latency that would otherwise be incurred by waiting for an add to
  // complete before performing an xor (which depends on the result of the
  // add), etc. An adjustment happens between the first and second groups of 4
  // quarter rounds, but this is done only in the inputs to the macro functions
  // that generate the assembly instructions - these adjustments themselves are
  // not part of the resulting assembly.
  // The 4 registers v0-v3 are used during the quarter round operations as
  // scratch registers. Once the 20 rounds are complete, these 4 scratch
  // registers become the vectors involved in adding the start state back onto
  // the post-QR working state. After the adds are complete, each of the 16
  // vectors write their first lane back to the keystream buffer, followed
  // by the second lane from all vectors and so on.
  address generate_chacha20Block_blockpar() {
    Label L_twoRounds, L_cc20_const;
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_chacha20Block_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    int i, j;
    const Register state = c_rarg0;
    const Register keystream = c_rarg1;
    const Register loopCtr = r10;
    const Register tmpAddr = r11;
    const FloatRegister ctrAddOverlay = v28;
    const FloatRegister lrot8Tbl = v29;

    // Organize SIMD registers in an array that facilitates
    // putting repetitive opcodes into loop structures. It is
    // important that each grouping of 4 registers is monotonically
    // increasing to support the requirements of multi-register
    // instructions (e.g. ld4r, st4, etc.)
    const FloatRegister workSt[16] = {
         v4,  v5,  v6,  v7, v16, v17, v18, v19,
        v20, v21, v22, v23, v24, v25, v26, v27
    };

    // Pull in constant data. The first 16 bytes are the add overlay
    // which is applied to the vector holding the counter (state[12]).
    // The second 16 bytes is the index register for the 8-bit left
    // rotation tbl instruction.
    __ adr(tmpAddr, L_cc20_const);
    __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));

    // Load from memory and interlace across 16 SIMD registers,
    // With each word from memory being broadcast to all lanes of
    // each successive SIMD register.
    //      Addr(0) -> All lanes in workSt[i]
    //      Addr(4) -> All lanes workSt[i + 1], etc.
    __ mov(tmpAddr, state);
    for (i = 0; i < 16; i += 4) {
      __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
          __ post(tmpAddr, 16));
    }
    __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay

    // Before entering the loop, create 5 4-register arrays. These
    // will hold the 4 registers that represent the a/b/c/d fields
    // in the quarter round operation. For instance the "b" field
    // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
    // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
    // since it is part of a diagonal organization. The aSet and scratch
    // register sets are defined at declaration time because they do not change
    // organization at any point during the 20-round processing.
    FloatRegister aSet[4] = { v4, v5, v6, v7 };
    FloatRegister bSet[4];
    FloatRegister cSet[4];
    FloatRegister dSet[4];
    FloatRegister scratch[4] = { v0, v1, v2, v3 };

    // Set up the 10 iteration loop and perform all 8 quarter round ops
    __ mov(loopCtr, 10);
    __ BIND(L_twoRounds);

    // Set to columnar organization and do the following 4 quarter-rounds:
    // QUARTERROUND(0, 4, 8, 12)
    // QUARTERROUND(1, 5, 9, 13)
    // QUARTERROUND(2, 6, 10, 14)
    // QUARTERROUND(3, 7, 11, 15)
    __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
    __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
    __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7

    // Set to diagonal organization and do the next 4 quarter-rounds:
    // QUARTERROUND(0, 5, 10, 15)
    // QUARTERROUND(1, 6, 11, 12)
    // QUARTERROUND(2, 7, 8, 13)
    // QUARTERROUND(3, 4, 9, 14)
    __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
    __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
    __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl);     // d <<<= 16

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl);  // b <<<= 12

    __ cc20_qr_add4(aSet, bSet);                    // a += b
    __ cc20_qr_xor4(dSet, aSet, dSet);              // d ^= a
    __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl);      // d <<<= 8

    __ cc20_qr_add4(cSet, dSet);                    // c += d
    __ cc20_qr_xor4(bSet, cSet, scratch);           // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);   // b <<<= 7

    // Decrement and iterate
    __ sub(loopCtr, loopCtr, 1);
    __ cbnz(loopCtr, L_twoRounds);

    __ mov(tmpAddr, state);

    // Add the starting state back to the post-loop keystream
    // state. We read/interlace the state array from memory into
    // 4 registers similar to what we did in the beginning. Then
    // add the counter overlay onto workSt[12] at the end.
    for (i = 0; i < 16; i += 4) {
      __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
      __ addv(workSt[i], __ T4S, workSt[i], v0);
      __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
      __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
      __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
    }
    __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay

    // Write working state into the keystream buffer. This is accomplished
    // by taking the lane "i" from each of the four vectors and writing
    // it to consecutive 4-byte offsets, then post-incrementing by 16 and
    // repeating with the next 4 vectors until all 16 vectors have been used.
    // Then move to the next lane and repeat the process until all lanes have
    // been written.
    for (i = 0; i < 4; i++) {
      for (j = 0; j < 16; j += 4) {
        __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
            __ post(keystream, 16));
      }
    }

    __ mov(r0, 256);             // Return length of output keystream
    __ leave();
    __ ret(lr);

    // bind label and generate local constant data used by this stub
    // The constant data is broken into two 128-bit segments to be loaded
    // onto FloatRegisters. The first 128 bits are a counter add overlay
    // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
    // The second 128-bits is a table constant used for 8-bit left rotations.
    __ BIND(L_cc20_const);
    __ emit_int64(0x0000000100000000UL);
    __ emit_int64(0x0000000300000002UL);
    __ emit_int64(0x0605040702010003UL);
    __ emit_int64(0x0E0D0C0F0A09080BUL);

    return start;
  }
4797
4798 // Helpers to schedule parallel operation bundles across vector
4799 // register sequences of size 2, 4 or 8.
4800
4801 // Implement various primitive computations across vector sequences
4802
4803 template<int N>
4804 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4805 const VSeq<N>& v1, const VSeq<N>& v2) {
4806 // output must not be constant
4807 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4808 // output cannot overwrite pending inputs
4809 assert(!vs_write_before_read(v, v1), "output overwrites input");
4810 assert(!vs_write_before_read(v, v2), "output overwrites input");
4811 for (int i = 0; i < N; i++) {
4812 __ addv(v[i], T, v1[i], v2[i]);
4813 }
4814 }
4815
4816 template<int N>
4817 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4818 const VSeq<N>& v1, const VSeq<N>& v2) {
4819 // output must not be constant
4820 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4821 // output cannot overwrite pending inputs
4822 assert(!vs_write_before_read(v, v1), "output overwrites input");
4823 assert(!vs_write_before_read(v, v2), "output overwrites input");
4824 for (int i = 0; i < N; i++) {
4825 __ subv(v[i], T, v1[i], v2[i]);
4826 }
4827 }
4828
4829 template<int N>
4830 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4831 const VSeq<N>& v1, const VSeq<N>& v2) {
4832 // output must not be constant
4833 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4834 // output cannot overwrite pending inputs
4835 assert(!vs_write_before_read(v, v1), "output overwrites input");
4836 assert(!vs_write_before_read(v, v2), "output overwrites input");
4837 for (int i = 0; i < N; i++) {
4838 __ mulv(v[i], T, v1[i], v2[i]);
4839 }
4840 }
4841
4842 template<int N>
4843 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
4844 // output must not be constant
4845 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4846 // output cannot overwrite pending inputs
4847 assert(!vs_write_before_read(v, v1), "output overwrites input");
4848 for (int i = 0; i < N; i++) {
4849 __ negr(v[i], T, v1[i]);
4850 }
4851 }
4852
4853 template<int N>
4854 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4855 const VSeq<N>& v1, int shift) {
4856 // output must not be constant
4857 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4858 // output cannot overwrite pending inputs
4859 assert(!vs_write_before_read(v, v1), "output overwrites input");
4860 for (int i = 0; i < N; i++) {
4861 __ sshr(v[i], T, v1[i], shift);
4862 }
4863 }
4864
4865 template<int N>
4866 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4867 // output must not be constant
4868 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4869 // output cannot overwrite pending inputs
4870 assert(!vs_write_before_read(v, v1), "output overwrites input");
4871 assert(!vs_write_before_read(v, v2), "output overwrites input");
4872 for (int i = 0; i < N; i++) {
4873 __ andr(v[i], __ T16B, v1[i], v2[i]);
4874 }
4875 }
4876
4877 template<int N>
4878 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4879 // output must not be constant
4880 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4881 // output cannot overwrite pending inputs
4882 assert(!vs_write_before_read(v, v1), "output overwrites input");
4883 assert(!vs_write_before_read(v, v2), "output overwrites input");
4884 for (int i = 0; i < N; i++) {
4885 __ orr(v[i], __ T16B, v1[i], v2[i]);
4886 }
4887 }
4888
4889 template<int N>
4890 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
4891 // output must not be constant
4892 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4893 // output cannot overwrite pending inputs
4894 assert(!vs_write_before_read(v, v1), "output overwrites input");
4895 for (int i = 0; i < N; i++) {
4896 __ notr(v[i], __ T16B, v1[i]);
4897 }
4898 }
4899
4900 template<int N>
4901 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
4902 // output must not be constant
4903 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4904 // output cannot overwrite pending inputs
4905 assert(!vs_write_before_read(v, v1), "output overwrites input");
4906 assert(!vs_write_before_read(v, v2), "output overwrites input");
4907 for (int i = 0; i < N; i++) {
4908 __ sqdmulh(v[i], T, v1[i], v2[i]);
4909 }
4910 }
4911
4912 template<int N>
4913 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
4914 // output must not be constant
4915 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4916 // output cannot overwrite pending inputs
4917 assert(!vs_write_before_read(v, v1), "output overwrites input");
4918 assert(!vs_write_before_read(v, v2), "output overwrites input");
4919 for (int i = 0; i < N; i++) {
4920 __ mlsv(v[i], T, v1[i], v2[i]);
4921 }
4922 }
4923
4924 // load N/2 successive pairs of quadword values from memory in order
4925 // into N successive vector registers of the sequence via the
4926 // address supplied in base.
4927 template<int N>
4928 void vs_ldpq(const VSeq<N>& v, Register base) {
4929 for (int i = 0; i < N; i += 2) {
4930 __ ldpq(v[i], v[i+1], Address(base, 32 * i));
4931 }
4932 }
4933
4934 // load N/2 successive pairs of quadword values from memory in order
4935 // into N vector registers of the sequence via the address supplied
4936 // in base using post-increment addressing
4937 template<int N>
4938 void vs_ldpq_post(const VSeq<N>& v, Register base) {
4939 static_assert((N & (N - 1)) == 0, "sequence length must be even");
4940 for (int i = 0; i < N; i += 2) {
4941 __ ldpq(v[i], v[i+1], __ post(base, 32));
4942 }
4943 }
4944
4945 // store N successive vector registers of the sequence into N/2
4946 // successive pairs of quadword memory locations via the address
4947 // supplied in base using post-increment addressing
4948 template<int N>
4949 void vs_stpq_post(const VSeq<N>& v, Register base) {
4950 static_assert((N & (N - 1)) == 0, "sequence length must be even");
4951 for (int i = 0; i < N; i += 2) {
4952 __ stpq(v[i], v[i+1], __ post(base, 32));
4953 }
4954 }
4955
4956 // load N/2 pairs of quadword values from memory de-interleaved into
4957 // N vector registers 2 at a time via the address supplied in base
4958 // using post-increment addressing.
4959 template<int N>
4960 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4961 static_assert((N & (N - 1)) == 0, "sequence length must be even");
4962 for (int i = 0; i < N; i += 2) {
4963 __ ld2(v[i], v[i+1], T, __ post(base, 32));
4964 }
4965 }
4966
4967 // store N vector registers interleaved into N/2 pairs of quadword
4968 // memory locations via the address supplied in base using
4969 // post-increment addressing.
4970 template<int N>
4971 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4972 static_assert((N & (N - 1)) == 0, "sequence length must be even");
4973 for (int i = 0; i < N; i += 2) {
4974 __ st2(v[i], v[i+1], T, __ post(base, 32));
4975 }
4976 }
4977
4978 // load N quadword values from memory de-interleaved into N vector
4979 // registers 3 elements at a time via the address supplied in base.
4980 template<int N>
4981 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4982 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4983 for (int i = 0; i < N; i += 3) {
4984 __ ld3(v[i], v[i+1], v[i+2], T, base);
4985 }
4986 }
4987
4988 // load N quadword values from memory de-interleaved into N vector
4989 // registers 3 elements at a time via the address supplied in base
4990 // using post-increment addressing.
4991 template<int N>
4992 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4993 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4994 for (int i = 0; i < N; i += 3) {
4995 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
4996 }
4997 }
4998
4999 // load N/2 pairs of quadword values from memory into N vector
5000 // registers via the address supplied in base with each pair indexed
5001 // using the the start offset plus the corresponding entry in the
5002 // offsets array
5003 template<int N>
5004 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
5005 for (int i = 0; i < N/2; i++) {
5006 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5007 }
5008 }
5009
5010 // store N vector registers into N/2 pairs of quadword memory
5011 // locations via the address supplied in base with each pair indexed
5012 // using the the start offset plus the corresponding entry in the
5013 // offsets array
5014 template<int N>
5015 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
5016 for (int i = 0; i < N/2; i++) {
5017 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5018 }
5019 }
5020
5021 // load N single quadword values from memory into N vector registers
5022 // via the address supplied in base with each value indexed using
5023 // the the start offset plus the corresponding entry in the offsets
5024 // array
5025 template<int N>
5026 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5027 int start, int (&offsets)[N]) {
5028 for (int i = 0; i < N; i++) {
5029 __ ldr(v[i], T, Address(base, start + offsets[i]));
5030 }
5031 }
5032
5033 // store N vector registers into N single quadword memory locations
5034 // via the address supplied in base with each value indexed using
5035 // the the start offset plus the corresponding entry in the offsets
5036 // array
5037 template<int N>
5038 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5039 int start, int (&offsets)[N]) {
5040 for (int i = 0; i < N; i++) {
5041 __ str(v[i], T, Address(base, start + offsets[i]));
5042 }
5043 }
5044
5045 // load N/2 pairs of quadword values from memory de-interleaved into
5046 // N vector registers 2 at a time via the address supplied in base
5047 // with each pair indexed using the the start offset plus the
5048 // corresponding entry in the offsets array
5049 template<int N>
5050 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5051 Register tmp, int start, int (&offsets)[N/2]) {
5052 for (int i = 0; i < N/2; i++) {
5053 __ add(tmp, base, start + offsets[i]);
5054 __ ld2(v[2*i], v[2*i+1], T, tmp);
5055 }
5056 }
5057
5058 // store N vector registers 2 at a time interleaved into N/2 pairs
5059 // of quadword memory locations via the address supplied in base
5060 // with each pair indexed using the the start offset plus the
5061 // corresponding entry in the offsets array
5062 template<int N>
5063 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5064 Register tmp, int start, int (&offsets)[N/2]) {
5065 for (int i = 0; i < N/2; i++) {
5066 __ add(tmp, base, start + offsets[i]);
5067 __ st2(v[2*i], v[2*i+1], T, tmp);
5068 }
5069 }
5070
  // Helper routines for various flavours of Montgomery multiply

  // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
  // multiplications in parallel
  //

  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 4x4S results or 8x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 4x4S or 4x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 4x4S or 4x8H vector sequence trashed after call
  // Outputs: va - 4x4S or 4x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
  // NOTE(review): the header above lists vq as <MONT_Q, MONT_Q_INV_MOD_R>
  // but the per-instruction comments below use vq[0] as qinv and vq[1]
  // as q -- confirm the constant ordering against the kyber/dilithium
  // constant tables.
  void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                   Assembler::SIMD_Arrangement T,
                   const VSeq<4>& vtmp, const VSeq<2>& vq) {
    assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
    assert(!va.is_constant(), "output vector must identify 4 different registers");

    // schedule 4 streams of instructions across the vector sequences
    for (int i = 0; i < 4; i++) {
      __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
      __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
    }

    for (int i = 0; i < 4; i++) {
      __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
    }

    for (int i = 0; i < 4; i++) {
      __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
    }

    for (int i = 0; i < 4; i++) {
      __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
    }
  }
5127
  // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
  // multiplications in parallel
  //

  // See the montMul() method of the sun.security.provider.ML_DSA
  // class.
  //
  // Computes 2x4S results or 2x8H results
  //    a = b * c * 2^MONT_R_BITS mod MONT_Q
  // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
  //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
  // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
  // Outputs: va - 2x4S or 2x8H vector register sequences
  // vb, vc, vtmp and vq must all be disjoint
  // va must be disjoint from all other inputs/temps or must equal vc
  // va must have a non-zero delta i.e. it must not be a constant vseq.
  // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
  // (The original header said "8 32-bit (4x4S) ... 4x4S results"; this
  // 2-way variant operates on 2-register sequences throughout.)
  void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
                   Assembler::SIMD_Arrangement T,
                   const VSeq<2>& vtmp, const VSeq<2>& vq) {
    assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
    assert(!va.is_constant(), "output vector must identify 2 different registers");

    // schedule 2 streams of instructions across the vector sequences
    for (int i = 0; i < 2; i++) {
      __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
      __ mulv(va[i], T, vb[i], vc[i]);    // aLow = lo32(b * c)
    }

    for (int i = 0; i < 2; i++) {
      __ mulv(va[i], T, va[i], vq[0]);     // m = aLow * qinv
    }

    for (int i = 0; i < 2; i++) {
      __ sqdmulh(va[i], T, va[i], vq[1]);  // n = hi32(2 * m * q)
    }

    for (int i = 0; i < 2; i++) {
      __ shsubv(va[i], T, vtmp[i], va[i]);   // a = (aHigh - n) / 2
    }
  }
5182
5183 // Perform 16 16-bit Montgomery multiplications in parallel.
5184 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5185 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5186 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5187 // It will assert that the register use is valid
5188 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5189 }
5190
5191 // Perform 32 16-bit Montgomery multiplications in parallel.
5192 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5193 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5194 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5195 // It will assert that the register use is valid
5196 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5197 }
5198
  // Perform 64 16-bit Montgomery multiplications in parallel.
  void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
                       const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // Schedule two successive 4x8H multiplies via the montmul helper
    // on the front and back halves of va, vb and vc. The helper will
    // assert that the register use has no overlap conflicts on each
    // individual call but we also need to ensure that the necessary
    // disjoint/equality constraints are met across both calls.

    // vb, vc, vtmp and vq must be disjoint. va must either be
    // disjoint from all other registers or equal vc

    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");

    // we multiply the front and back halves of each sequence 4 at a
    // time because
    //
    // 1) we are currently only able to get 4-way instruction
    // parallelism at best
    //
    // 2) we need registers for the constants in vq and temporary
    // scratch registers to hold intermediate results so vtmp can only
    // be a VSeq<4> which means we only have 4 scratch slots

    vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
    vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
  }
5238
  // Butterfly step used by the Kyber NTT: montmul then add/sub.
  // On return va0 = va0 + montmul(va1, vc) and va1 = va0 - montmul(va1, vc);
  // vc is trashed (it receives the montmul product), vtmp is scratch.
  void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
                               const VSeq<4>& vc,
                               const VSeq<4>& vtmp,
                               const VSeq<2>& vq) {
    // compute a = montmul(a1, c)
    kyber_montmul32(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
    vs_subv(va1, __ T8H, va0, vc);
    // and a0 = a0 + a
    vs_addv(va0, __ T8H, va0, vc);
  }
5250
  // Butterfly va0 against va1 and then Montgomery-multiply the
  // difference by vb, i.e. compute
  //   c = a0 - a1; a0' = a0 + a1; a1' = montmul(b, c)
  // vtmp1 receives the intermediate difference, vtmp2 is scratch for
  // the montmul.
  void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
                               const VSeq<4>& vb,
                               const VSeq<4>& vtmp1,
                               const VSeq<4>& vtmp2,
                               const VSeq<2>& vq) {
    // compute c = a0 - a1
    vs_subv(vtmp1, __ T8H, va0, va1);
    // output a0 = a0 + a1
    vs_addv(va0, __ T8H, va0, va1);
    // output a1 = b montmul c
    kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
  }
5263
  // Load 64 shorts (8 x 8H quadwords) from the address in shorts into
  // the sequence v, post-incrementing shorts past the loaded data.
  void load64shorts(const VSeq<8>& v, Register shorts) {
    vs_ldpq_post(v, shorts);
  }
5267
  // Load 32 shorts (4 x 8H quadwords) from the address in shorts into
  // the sequence v, post-incrementing shorts past the loaded data.
  void load32shorts(const VSeq<4>& v, Register shorts) {
    vs_ldpq_post(v, shorts);
  }
5271
5272 void store64shorts(VSeq<8> v, Register tmpAddr) {
5273 vs_stpq_post(v, tmpAddr);
5274 }
5275
  // Kyber NTT function.
  // Implements
  // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
  //
  // coeffs (short[256]) = c_rarg0
  // ntt_zetas (short[256]) = c_rarg1
  //
  // Returns 0 in r0 (the Java intrinsic is declared to return an int).
  address generate_kyberNtt() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyberNtt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register coeffs = c_rarg0;
    const Register zetas = c_rarg1;

    const Register kyberConsts = r10;
    const Register tmpAddr = r11;

    VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
    VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
    VSeq<2> vq(30); // n.b. constants overlap vs3

    __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
    // load the montmul constants
    vs_ldpq(vq, kyberConsts);

    // Each level corresponds to an iteration of the outermost loop of the
    // Java method seilerNTT(int[] coeffs). There are some differences
    // from what is done in the seilerNTT() method, though:
    // 1. The computation is using 16-bit signed values, we do not convert them
    // to ints here.
    // 2. The zetas are delivered in a bigger array, 128 zetas are stored in
    // this array for each level, it is easier that way to fill up the vector
    // registers.
    // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
    // multiplications (this is because that way there should not be any
    // overflow during the inverse NTT computation), here we use R = 2^16 so
    // that we can use the 16-bit arithmetic in the vector unit.
    //
    // On each level, we fill up the vector registers in such a way that the
    // array elements that need to be multiplied by the zetas go into one
    // set of vector registers while the corresponding ones that don't need to
    // be multiplied, go into another set.
    // We can do 32 Montgomery multiplications in parallel, using 12 vector
    // registers interleaving the steps of 4 identical computations,
    // each done on 8 16-bit values per register.

    // At levels 0-3 the coefficients multiplied by or added/subtracted
    // to the zetas occur in discrete blocks whose size is some multiple
    // of 32.

    // level 0
    __ add(tmpAddr, coeffs, 256);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 0);
    vs_stpq_post(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 256);
    vs_stpq_post(vs3, tmpAddr);
    // restore montmul constants
    vs_ldpq(vq, kyberConsts);
    // n.b. the post-incrementing store above left tmpAddr at coeffs + 384
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 128);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 128);
    store64shorts(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 384);
    store64shorts(vs3, tmpAddr);

    // level 1
    // restore montmul constants
    vs_ldpq(vq, kyberConsts);
    __ add(tmpAddr, coeffs, 128);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 0);
    store64shorts(vs1, tmpAddr);
    // n.b. tmpAddr has been post-incremented to coeffs + 128, which is
    // exactly where vs3 belongs
    store64shorts(vs3, tmpAddr);
    vs_ldpq(vq, kyberConsts);
    __ add(tmpAddr, coeffs, 384);
    load64shorts(vs1, tmpAddr);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 256);
    load64shorts(vs1, tmpAddr);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 256);
    store64shorts(vs1, tmpAddr);
    // n.b. as above tmpAddr is now coeffs + 384, where vs3 belongs
    store64shorts(vs3, tmpAddr);

    // level 2
    vs_ldpq(vq, kyberConsts);
    int offsets1[4] = { 0, 32, 128, 160 };
    vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 0);
    // interleave the sum and difference quadword groups on store
    vs_stpq_post(vs_front(vs1), tmpAddr);
    vs_stpq_post(vs_front(vs3), tmpAddr);
    vs_stpq_post(vs_back(vs1), tmpAddr);
    vs_stpq_post(vs_back(vs3), tmpAddr);
    vs_ldpq(vq, kyberConsts);
    // n.b. the four post-incrementing stores left tmpAddr at coeffs + 256
    vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 256);
    vs_stpq_post(vs_front(vs1), tmpAddr);
    vs_stpq_post(vs_front(vs3), tmpAddr);
    vs_stpq_post(vs_back(vs1), tmpAddr);
    vs_stpq_post(vs_back(vs3), tmpAddr);

    // level 3
    vs_ldpq(vq, kyberConsts);
    int offsets2[4] = { 0, 64, 128, 192 };
    vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    vs_stpq_indexed(vs1, coeffs, 0, offsets2);
    vs_stpq_indexed(vs3, coeffs, 32, offsets2);

    vs_ldpq(vq, kyberConsts);
    vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    vs_stpq_indexed(vs1, coeffs, 256, offsets2);
    vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);

    // level 4
    // At level 4 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded employing an ldr at 8 distinct offsets.

    vs_ldpq(vq, kyberConsts);
    int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
    vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
    vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);

    vs_ldpq(vq, kyberConsts);
    vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
    load64shorts(vs2, zetas);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
    vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_addv(vs1, __ T8H, vs1, vs2);
    vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
    vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);

    // level 5
    // At level 5 related coefficients occur in discrete blocks of size 8 so
    // need to be loaded interleaved using an ld2 operation with arrangement 2D.

    vs_ldpq(vq, kyberConsts);
    int offsets4[4] = { 0, 32, 64, 96 };
    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);

    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);

    // level 6
    // At level 6 related coefficients occur in discrete blocks of size 4 so
    // need to be loaded interleaved using an ld2 operation with arrangement 4S.

    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);

    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);

    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
5513
  // Kyber Inverse NTT function
  // Implements
  // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
  //
  // coeffs (short[256]) = c_rarg0
  // ntt_zetas (short[256]) = c_rarg1
  //
  // Returns 0 in r0 (the Java intrinsic is declared to return an int).
  address generate_kyberInverseNtt() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register coeffs = c_rarg0;
    const Register zetas = c_rarg1;

    const Register kyberConsts = r10;
    const Register tmpAddr = r11;
    // NOTE(review): tmpAddr2 is declared but not referenced in the code
    // below — confirm whether it can be removed
    const Register tmpAddr2 = c_rarg2;

    VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
    VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
    VSeq<2> vq(30); // n.b. constants overlap vs3

    __ lea(kyberConsts,
           ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));

    // level 0
    // At level 0 related coefficients occur in discrete blocks of size 4 so
    // need to be loaded interleaved using an ld2 operation with arrangement 4S.

    vs_ldpq(vq, kyberConsts);
    int offsets4[4] = { 0, 32, 64, 96 };
    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
    vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);

    // level 1
    // At level 1 related coefficients occur in discrete blocks of size 8 so
    // need to be loaded interleaved using an ld2 operation with arrangement 2D.

    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);

    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
    vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
    load32shorts(vs_front(vs2), zetas);
    kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
                            vs_front(vs2), vs_back(vs2), vtmp, vq);
    vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);

    // level 2
    // At level 2 coefficients occur in 8 discrete blocks of size 16
    // so they are loaded employing an ldr at 8 distinct offsets.

    int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
    vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
    vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);

    vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
    vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);

    // Barrett reduction at indexes where overflow may happen

    // load q and the multiplier for the Barrett reduction
    __ add(tmpAddr, kyberConsts, 16);
    vs_ldpq(vq, tmpAddr);

    VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
    VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
    VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
    vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
    vs_sqdmulh(vs2, __ T8H, vs1, vq2);
    vs_sshr(vs2, __ T8H, vs2, 11);
    vs_mlsv(vs1, __ T8H, vs2, vq1);
    vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
    vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
    vs_sqdmulh(vs2, __ T8H, vs1, vq2);
    vs_sshr(vs2, __ T8H, vs2, 11);
    vs_mlsv(vs1, __ T8H, vs2, vq1);
    vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);

    // level 3
    // From level 3 upwards coefficients occur in discrete blocks whose size is
    // some multiple of 32 so can be loaded using ldpq and suitable indexes.

    int offsets2[4] = { 0, 64, 128, 192 };
    vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
    vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    vs_stpq_indexed(vs3, coeffs, 0, offsets2);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_stpq_indexed(vs2, coeffs, 32, offsets2);

    vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
    vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    vs_stpq_indexed(vs3, coeffs, 256, offsets2);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);

    // level 4

    int offsets1[4] = { 0, 32, 128, 160 };
    vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
    vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    vs_stpq_indexed(vs3, coeffs, 0, offsets1);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_stpq_indexed(vs2, coeffs, 64, offsets1);

    vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
    vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    vs_stpq_indexed(vs3, coeffs, 256, offsets1);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);

    // level 5

    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 128);
    load64shorts(vs2, tmpAddr);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 0);
    store64shorts(vs3, tmpAddr);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 128);
    store64shorts(vs2, tmpAddr);

    // n.b. tmpAddr is now coeffs + 256 after the post-incrementing store
    load64shorts(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 384);
    load64shorts(vs2, tmpAddr);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 256);
    store64shorts(vs3, tmpAddr);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 384);
    store64shorts(vs2, tmpAddr);

    // Barrett reduction at indexes where overflow may happen

    // load q and the multiplier for the Barrett reduction
    __ add(tmpAddr, kyberConsts, 16);
    vs_ldpq(vq, tmpAddr);

    int offsets0[2] = { 0, 256 };
    // n.b. only the front half of vs1 is loaded and stored here; the
    // back-half lanes of vs1/vs2 are computed but their results discarded
    vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
    vs_sqdmulh(vs2, __ T8H, vs1, vq2);
    vs_sshr(vs2, __ T8H, vs2, 11);
    vs_mlsv(vs1, __ T8H, vs2, vq1);
    vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);

    // level 6

    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 256);
    load64shorts(vs2, tmpAddr);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 0);
    store64shorts(vs3, tmpAddr);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 256);
    store64shorts(vs2, tmpAddr);

    __ add(tmpAddr, coeffs, 128);
    load64shorts(vs1, tmpAddr);
    __ add(tmpAddr, coeffs, 384);
    load64shorts(vs2, tmpAddr);
    vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
    vs_subv(vs1, __ T8H, vs1, vs2);
    __ add(tmpAddr, coeffs, 128);
    store64shorts(vs3, tmpAddr);
    load64shorts(vs2, zetas);
    vs_ldpq(vq, kyberConsts);
    kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
    __ add(tmpAddr, coeffs, 384);
    store64shorts(vs2, tmpAddr);

    // multiply by 2^-n

    // load toMont(2^-n mod q) into v29, which backs the constant
    // sequence vq3 declared above
    __ add(tmpAddr, kyberConsts, 48);
    __ ldr(v29, __ Q, tmpAddr);

    vs_ldpq(vq, kyberConsts);
    __ add(tmpAddr, coeffs, 0);
    load64shorts(vs1, tmpAddr);
    kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
    __ add(tmpAddr, coeffs, 0);
    store64shorts(vs2, tmpAddr);

    // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so
    load64shorts(vs1, tmpAddr);
    kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
    __ add(tmpAddr, coeffs, 128);
    store64shorts(vs2, tmpAddr);

    // now tmpAddr contains coeffs + 256
    load64shorts(vs1, tmpAddr);
    kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
    __ add(tmpAddr, coeffs, 256);
    store64shorts(vs2, tmpAddr);

    // now tmpAddr contains coeffs + 384
    load64shorts(vs1, tmpAddr);
    kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
    __ add(tmpAddr, coeffs, 384);
    store64shorts(vs2, tmpAddr);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
5797
  // Kyber multiply polynomials in the NTT domain.
  // Implements
  // static int implKyberNttMult(
  //         short[] result, short[] ntta, short[] nttb, short[] zetas) {}
  //
  // result (short[256]) = c_rarg0
  // ntta (short[256]) = c_rarg1
  // nttb (short[256]) = c_rarg2
  // zetas (short[128]) = c_rarg3
  //
  // Returns 0 in r0 (the Java intrinsic is declared to return an int).
  address generate_kyberNttMult() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyberNttMult_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register result = c_rarg0;
    const Register ntta = c_rarg1;
    const Register nttb = c_rarg2;
    const Register zetas = c_rarg3;

    const Register kyberConsts = r10;
    const Register limit = r11;

    VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
    VSeq<4> vs3(16), vs4(20);
    VSeq<2> vq(30); // pair of constants for montmul: q, qinv
    VSeq<2> vz(28); // pair of zetas
    VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ

    __ lea(kyberConsts,
           ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));

    Label kyberNttMult_loop;

    // limit marks the end of the 512 byte (256 short) result array;
    // each loop pass below advances result by 64 bytes, so the loop
    // executes 8 times
    __ add(limit, result, 512);

    // load q and qinv
    vs_ldpq(vq, kyberConsts);

    // load R^2 mod q (to convert back from Montgomery representation)
    __ add(kyberConsts, kyberConsts, 64);
    __ ldr(v27, __ Q, kyberConsts);

    __ BIND(kyberNttMult_loop);

    // load 16 zetas
    vs_ldpq_post(vz, zetas);

    // load 2 sets of 32 coefficients from the two input arrays
    // interleaved as shorts. i.e. pairs of shorts adjacent in memory
    // are striped across pairs of vector registers
    vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
    vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
    vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
    vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H

    // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
    // i.e. montmul the first and second halves of vs1 in order and
    // then with one sequence reversed storing the two results in vs3
    //
    // vs3[0] <- montmul(a0, b0)
    // vs3[1] <- montmul(a1, b1)
    // vs3[2] <- montmul(a0, b1)
    // vs3[3] <- montmul(a1, b0)
    kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
    kyber_montmul16(vs_back(vs3),
                    vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);

    // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
    // i.e. montmul the first and second halves of vs4 in order and
    // then with one sequence reversed storing the two results in vs1
    //
    // vs1[0] <- montmul(a2, b2)
    // vs1[1] <- montmul(a3, b3)
    // vs1[2] <- montmul(a2, b3)
    // vs1[3] <- montmul(a3, b2)
    kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
    kyber_montmul16(vs_back(vs1),
                    vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);

    // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
    // We can schedule two montmuls at a time if we use a suitable vector
    // sequence <vs3[1], vs1[1]>.
    int delta = vs1[1]->encoding() - vs3[1]->encoding();
    VSeq<2> vs5(vs3[1], delta);

    // vs3[1] <- montmul(montmul(a1, b1), z0)
    // vs1[1] <- montmul(montmul(a3, b3), z1)
    kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);

    // add results in pairs storing in vs3
    // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
    // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
    vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));

    // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
    // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
    vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));

    // vs1 <- montmul(vs3, montRSquareModQ)
    kyber_montmul32(vs1, vs3, vc, vs2, vq);

    // store back the two pairs of result vectors de-interleaved as 8H elements
    // i.e. storing each pairs of shorts striped across a register pair adjacent
    // in memory
    vs_st2_post(vs1, __ T8H, result);

    __ cmp(result, limit);
    __ br(Assembler::NE, kyberNttMult_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
5916
  // Kyber add 2 polynomials.
  // Implements
  // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
  //
  // result (short[256]) = c_rarg0
  // a (short[256]) = c_rarg1
  // b (short[256]) = c_rarg2
  //
  // Returns 0 in r0 (the Java intrinsic is declared to return an int).
  address generate_kyberAddPoly_2() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register result = c_rarg0;
    const Register a = c_rarg1;
    const Register b = c_rarg2;

    const Register kyberConsts = r11;

    // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
    // So, we can load, add and store the data in 3 groups of 11,
    // 11 and 10 at a time i.e. we need to map sets of 10 or 11
    // registers. A further constraint is that the mapping needs
    // to skip callee saves. So, we allocate the register
    // sequences using two 8 sequences, two 2 sequences and two
    // single registers.
    VSeq<8> vs1_1(0);
    VSeq<2> vs1_2(16);
    FloatRegister vs1_3 = v28;
    VSeq<8> vs2_1(18);
    VSeq<2> vs2_2(26);
    FloatRegister vs2_3 = v29;

    // two constant vector sequences
    // n.b. both sequences have delta 0, so every slot denotes v31
    VSeq<8> vc_1(31, 0);
    VSeq<2> vc_2(31, 0);

    FloatRegister vc_3 = v31;
    __ lea(kyberConsts,
           ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));

    __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
    for (int i = 0; i < 3; i++) {
      // load 80 or 88 values from a into vs1_1/2/3
      vs_ldpq_post(vs1_1, a);
      vs_ldpq_post(vs1_2, a);
      if (i < 2) {
        __ ldr(vs1_3, __ Q, __ post(a, 16));
      }
      // load 80 or 88 values from b into vs2_1/2/3
      vs_ldpq_post(vs2_1, b);
      vs_ldpq_post(vs2_2, b);
      if (i < 2) {
        __ ldr(vs2_3, __ Q, __ post(b, 16));
      }
      // sum 80 or 88 values across vs1 and vs2 into vs1
      vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
      vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
      if (i < 2) {
        __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
      }
      // add constant to all 80 or 88 results
      vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
      vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
      if (i < 2) {
        __ addv(vs1_3, __ T8H, vs1_3, vc_3);
      }
      // store 80 or 88 values
      vs_stpq_post(vs1_1, result);
      vs_stpq_post(vs1_2, result);
      if (i < 2) {
        __ str(vs1_3, __ Q, __ post(result, 16));
      }
    }

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
6000
  // Kyber add 3 polynomials.
  // Implements
  // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
  //
  // result (short[256]) = c_rarg0
  // a (short[256]) = c_rarg1
  // b (short[256]) = c_rarg2
  // c (short[256]) = c_rarg3
  //
  // Returns 0 in r0 (the Java intrinsic is declared to return an int).
  address generate_kyberAddPoly_3() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register result = c_rarg0;
    const Register a = c_rarg1;
    const Register b = c_rarg2;
    const Register c = c_rarg3;

    const Register kyberConsts = r11;

    // As above we sum 256 sets of values in total i.e. 32 x 8H
    // quadwords. So, we can load, add and store the data in 3
    // groups of 11, 11 and 10 at a time i.e. we need to map sets
    // of 10 or 11 registers. A further constraint is that the
    // mapping needs to skip callee saves. So, we allocate the
    // register sequences using two 8 sequences, two 2 sequences
    // and two single registers.
    VSeq<8> vs1_1(0);
    VSeq<2> vs1_2(16);
    FloatRegister vs1_3 = v28;
    VSeq<8> vs2_1(18);
    VSeq<2> vs2_2(26);
    FloatRegister vs2_3 = v29;

    // two constant vector sequences
    // n.b. both sequences have delta 0, so every slot denotes v31
    VSeq<8> vc_1(31, 0);
    VSeq<2> vc_2(31, 0);

    FloatRegister vc_3 = v31;

    __ lea(kyberConsts,
           ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));

    __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
    for (int i = 0; i < 3; i++) {
      // load 80 or 88 values from a into vs1_1/2/3
      vs_ldpq_post(vs1_1, a);
      vs_ldpq_post(vs1_2, a);
      if (i < 2) {
        __ ldr(vs1_3, __ Q, __ post(a, 16));
      }
      // load 80 or 88 values from b into vs2_1/2/3
      vs_ldpq_post(vs2_1, b);
      vs_ldpq_post(vs2_2, b);
      if (i < 2) {
        __ ldr(vs2_3, __ Q, __ post(b, 16));
      }
      // sum 80 or 88 values across vs1 and vs2 into vs1
      vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
      vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
      if (i < 2) {
        __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
      }
      // load 80 or 88 values from c into vs2_1/2/3
      vs_ldpq_post(vs2_1, c);
      vs_ldpq_post(vs2_2, c);
      if (i < 2) {
        __ ldr(vs2_3, __ Q, __ post(c, 16));
      }
      // sum 80 or 88 values across vs1 and vs2 into vs1
      vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
      vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
      if (i < 2) {
        __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
      }
      // add constant to all 80 or 88 results
      vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
      vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
      if (i < 2) {
        __ addv(vs1_3, __ T8H, vs1_3, vc_3);
      }
      // store 80 or 88 values
      vs_stpq_post(vs1_1, result);
      vs_stpq_post(vs1_2, result);
      if (i < 2) {
        __ str(vs1_3, __ Q, __ post(result, 16));
      }
    }

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
6099
6100 // Kyber parse XOF output to polynomial coefficient candidates
6101 // or decodePoly(12, ...).
6102 // Implements
6103 // static int implKyber12To16(
6104 // byte[] condensed, int index, short[] parsed, int parsedLength) {}
6105 //
6106 // we assume that parsed and condensed are allocated such that for
6107 // n = (parsedLength + 63) / 64
6108 // n blocks of 96 bytes of input can be processed, i.e.
6109 // index + n * 96 <= condensed.length and
6110 // n * 64 <= parsed.length
6111 //
6112 // condensed (byte[]) = c_rarg0
6113 // condensedIndex = c_rarg1
6114 // parsed (short[]) = c_rarg2
6115 // parsedLength = c_rarg3
  address generate_kyber12To16() {
    Label L_F00, L_loop;

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyber12To16_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    // incoming Java arguments (see header comment above for the
    // assumed sizing constraints on condensed and parsed)
    const Register condensed = c_rarg0;
    const Register condensedOffs = c_rarg1;
    const Register parsed = c_rarg2;
    const Register parsedLength = c_rarg3;

    const Register tmpAddr = r11;

    // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
    // quadwords so we need a 6 vector sequence for the inputs.
    // Parsing produces 64 shorts, employing two 8 vector
    // sequences to store and combine the intermediate data.
    VSeq<6> vin(24);
    VSeq<8> va(0), vb(16);

    // preload v31 with the mask constant 0x0f00 replicated across all
    // 8 halfword lanes (the data words are emitted at L_F00 below)
    __ adr(tmpAddr, L_F00);
    __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
    __ add(condensed, condensed, condensedOffs);

    __ BIND(L_loop);
    // load 96 (6 x 16B) byte values
    vs_ld3_post(vin, __ T16B, condensed);

    // The front half of sequence vin (vin[0], vin[1] and vin[2])
    // holds 48 (16x3) contiguous bytes from memory striped
    // horizontally across each of the 16 byte lanes. Equivalently,
    // that is 16 pairs of 12-bit integers. Likewise the back half
    // holds the next 48 bytes in the same arrangement.

    // Each vector in the front half can also be viewed as a vertical
    // strip across the 16 pairs of 12 bit integers. Each byte in
    // vin[0] stores the low 8 bits of the first int in a pair. Each
    // byte in vin[1] stores the high 4 bits of the first int and the
    // low 4 bits of the second int. Each byte in vin[2] stores the
    // high 8 bits of the second int. Likewise the vectors in second
    // half.

    // Converting the data to 16-bit shorts requires first of all
    // expanding each of the 6 x 16B vectors into 6 corresponding
    // pairs of 8H vectors. Mask, shift and add operations on the
    // resulting vector pairs can be used to combine 4 and 8 bit
    // parts of related 8H vector elements.
    //
    // The middle vectors (vin[2] and vin[5]) are actually expanded
    // twice, one copy manipulated to provide the lower 4 bits
    // belonging to the first short in a pair and another copy
    // manipulated to provide the higher 4 bits belonging to the
    // second short in a pair. This is why the vector sequences va
    // and vb used to hold the expanded 8H elements are of length 8.

    // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
    // n.b. target elements 2 and 3 duplicate elements 4 and 5
    __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
    __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
    __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
    __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
    __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
    __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);

    // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
    // and vb[4:5]
    __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
    __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
    __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
    __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
    __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
    __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);

    // shift lo byte of copy 1 of the middle stripe into the high byte
    __ shl(va[2], __ T8H, va[2], 8);
    __ shl(va[3], __ T8H, va[3], 8);
    __ shl(vb[2], __ T8H, vb[2], 8);
    __ shl(vb[3], __ T8H, vb[3], 8);

    // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
    // time pre-shifted by 4 to ensure top bits of input 12-bit int
    // are in bit positions [4..11].
    __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
    __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
    __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
    __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);

    // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
    // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
    // copy2 (the mask in v31 is 0x0f00 per 8H lane, loaded above)
    __ andr(va[2], __ T16B, va[2], v31);
    __ andr(va[3], __ T16B, va[3], v31);
    __ ushr(va[4], __ T8H, va[4], 4);
    __ ushr(va[5], __ T8H, va[5], 4);
    __ andr(vb[2], __ T16B, vb[2], v31);
    __ andr(vb[3], __ T16B, vb[3], v31);
    __ ushr(vb[4], __ T8H, vb[4], 4);
    __ ushr(vb[5], __ T8H, vb[5], 4);

    // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
    // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
    // n.b. the ordering ensures: i) inputs are consumed before they
    // are overwritten ii) the order of 16-bit results across successive
    // pairs of vectors in va and then vb reflects the order of the
    // corresponding 12-bit inputs
    __ addv(va[0], __ T8H, va[0], va[2]);
    __ addv(va[2], __ T8H, va[1], va[3]);
    __ addv(va[1], __ T8H, va[4], va[6]);
    __ addv(va[3], __ T8H, va[5], va[7]);
    __ addv(vb[0], __ T8H, vb[0], vb[2]);
    __ addv(vb[2], __ T8H, vb[1], vb[3]);
    __ addv(vb[1], __ T8H, vb[4], vb[6]);
    __ addv(vb[3], __ T8H, vb[5], vb[7]);

    // store 64 results interleaved as shorts
    vs_st2_post(vs_front(va), __ T8H, parsed);
    vs_st2_post(vs_front(vb), __ T8H, parsed);

    // each iteration produces 64 parsed shorts; loop while more remain
    __ sub(parsedLength, parsedLength, 64);
    __ cmp(parsedLength, (u1)0);
    __ br(Assembler::GT, L_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    // bind label and generate constant data used by this stub
    __ BIND(L_F00);
    __ emit_int64(0x0f000f000f000f00);
    __ emit_int64(0x0f000f000f000f00);

    return start;
  }
6252
6253 // Kyber Barrett reduce function.
6254 // Implements
6255 // static int implKyberBarrettReduce(short[] coeffs) {}
6256 //
6257 // coeffs (short[256]) = c_rarg0
  address generate_kyberBarrettReduce() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register coeffs = c_rarg0;

    const Register kyberConsts = r10;
    const Register result = r11;

    // As above we process 256 sets of values in total i.e. 32 x
    // 8H quadwords. So, we can load, add and store the data in 3
    // groups of 11, 11 and 10 at a time i.e. we need to map sets
    // of 10 or 11 registers. A further constraint is that the
    // mapping needs to skip callee saves. So, we allocate the
    // register sequences using two 8 sequences, two 2 sequences
    // and two single registers.
    VSeq<8> vs1_1(0);
    VSeq<2> vs1_2(16);
    FloatRegister vs1_3 = v28;
    VSeq<8> vs2_1(18);
    VSeq<2> vs2_2(26);
    FloatRegister vs2_3 = v29;

    // we also need a pair of corresponding constant sequences
    // (each is the same single register repeated, stride 0)

    VSeq<8> vc1_1(30, 0);
    VSeq<2> vc1_2(30, 0);
    FloatRegister vc1_3 = v30; // for kyber_q

    VSeq<8> vc2_1(31, 0);
    VSeq<2> vc2_2(31, 0);
    FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier

    // results are written back over the inputs
    __ add(result, coeffs, 0);
    __ lea(kyberConsts,
        ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));

    // load q and the multiplier for the Barrett reduction
    __ add(kyberConsts, kyberConsts, 16);
    __ ldpq(vc1_3, vc2_3, kyberConsts);

    // Barrett reduction per 8H lane:
    //   vs1 <- vs1 - ((vs1 * multiplier) >> 26) * q
    // computed below in groups of 11, 11 and 10 quadwords
    for (int i = 0; i < 3; i++) {
      // load 80 or 88 coefficients
      vs_ldpq_post(vs1_1, coeffs);
      vs_ldpq_post(vs1_2, coeffs);
      if (i < 2) {
        __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
      }

      // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
      vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
      vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
      if (i < 2) {
        __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
      }

      // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
      vs_sshr(vs2_1, __ T8H, vs2_1, 11);
      vs_sshr(vs2_2, __ T8H, vs2_2, 11);
      if (i < 2) {
        __ sshr(vs2_3, __ T8H, vs2_3, 11);
      }

      // vs1 <- vs1 - vs2 * kyber_q
      vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
      vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
      if (i < 2) {
        __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
      }

      vs_stpq_post(vs1_1, result);
      vs_stpq_post(vs1_2, result);
      if (i < 2) {
        __ str(vs1_3, __ Q, __ post(result, 16));
      }
    }

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
6345
6346
6347 // Dilithium-specific montmul helper routines that generate parallel
6348 // code for, respectively, a single 4x4s vector sequence montmul or
6349 // two such multiplies in a row.
6350
6351 // Perform 16 32-bit Montgomery multiplications in parallel
  // The result is written to va; vb and vc are the multiplicands,
  // vtmp supplies scratch registers and vq holds the Montgomery
  // constants (qInv, q) -- see the montmul usage at the call sites.
  void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                           const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // Use the helper routine to schedule a 4x4S Montgomery multiply.
    // It will assert that the register use is valid
    vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
  }
6358
6359 // Perform 2x16 32-bit Montgomery multiplications in parallel
  // As for dilithium_montmul16 the result is written to va; the extra
  // constraints documented below arise because the front- and
  // back-half multiplies share the same vtmp and vq registers.
  void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
                           const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // Schedule two successive 4x4S multiplies via the montmul helper
    // on the front and back halves of va, vb and vc. The helper will
    // assert that the register use has no overlap conflicts on each
    // individual call but we also need to ensure that the necessary
    // disjoint/equality constraints are met across both calls.

    // vb, vc, vtmp and vq must be disjoint. va must either be
    // disjoint from all other registers or equal vc

    assert(vs_disjoint(vb, vc), "vb and vc overlap");
    assert(vs_disjoint(vb, vq), "vb and vq overlap");
    assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");

    assert(vs_disjoint(vc, vq), "vc and vq overlap");
    assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");

    assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");

    assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
    assert(vs_disjoint(va, vb), "va and vb overlap");
    assert(vs_disjoint(va, vq), "va and vq overlap");
    assert(vs_disjoint(va, vtmp), "va and vtmp overlap");

    // We multiply the front and back halves of each sequence 4 at a
    // time because
    //
    // 1) we are currently only able to get 4-way instruction
    // parallelism at best
    //
    // 2) we need registers for the constants in vq and temporary
    // scratch registers to hold intermediate results so vtmp can only
    // be a VSeq<4> which means we only have 4 scratch slots.

    vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
    vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
  }
6398
6399 // Perform combined montmul then add/sub on 4x4S vectors.
  void dilithium_montmul16_sub_add(
    const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
    const VSeq<4>& vtmp, const VSeq<2>& vq) {
    // compute a = montmul(a1, c) -- result lands in vc
    dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
    vs_subv(va1, __ T4S, va0, vc);
    // and a0 = a0 + a
    vs_addv(va0, __ T4S, va0, vc);
  }
6410
  // Perform combined add/sub then montmul on 4x4S vectors.
  void dilithium_sub_add_montmul16(
    const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
    const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
    // compute c = a0 - a1 into scratch registers vtmp1
    vs_subv(vtmp1, __ T4S, va0, va1);
    // output a0 = a0 + a1
    vs_addv(va0, __ T4S, va0, va1);
    // output a1 = b montmul c
    dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
  }
6422
6423 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6424 // in the Java implementation come in sequences of at least 8, so we
6425 // can use ldpq to collect the corresponding data into pairs of vector
6426 // registers.
6427 // We collect the coefficients corresponding to the 'j+l' indexes into
6428 // the vector registers v0-v7, the zetas into the vector registers v16-v23
6429 // then we do the (Montgomery) multiplications by the zetas in parallel
6430 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6431 // v0-v7, then do the additions into v24-v31 and the subtractions into
6432 // v0-v7 and finally save the results back to the coeffs array.
  void dilithiumNttLevel0_4(const Register dilithiumConsts,
                            const Register coeffs, const Register zetas) {
    // c1/c2 are byte offsets into coeffs for the 'j' and 'j+l'
    // coefficients; the gap between them halves at each level
    // (c2 /= 2 at the bottom of the outer loop)
    int c1 = 0;
    int c2 = 512;
    int startIncr;
    // don't use callee save registers v8 - v15
    VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
    VSeq<2> vq(30); // n.b. constants overlap vs3
    int offsets[4] = { 0, 32, 64, 96 };

    for (int level = 0; level < 5; level++) {
      int c1Start = c1;
      int c2Start = c2;
      // levels 3 and 4 access the coefficients with wider strides
      if (level == 3) {
        offsets[1] = 32;
        offsets[2] = 128;
        offsets[3] = 160;
      } else if (level == 4) {
        offsets[1] = 64;
        offsets[2] = 128;
        offsets[3] = 192;
      }

      // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
      // time at 4 different offsets and multiply them in order by the
      // next set of input values. So we employ indexed load and store
      // pair instructions with arrangement 4S.
      for (int i = 0; i < 4; i++) {
        // reload q and qinv
        vs_ldpq(vq, dilithiumConsts); // qInv, q
        // load 8x4S coefficients via second start pos == c2
        vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
        // load next 8x4S inputs == b
        vs_ldpq_post(vs2, zetas);
        // compute a == c2 * b mod MONT_Q
        dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
        // load 8x4s coefficients via first start pos == c1
        vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
        // compute a1 = c1 + a
        vs_addv(vs3, __ T4S, vs1, vs2);
        // compute a2 = c1 - a
        vs_subv(vs1, __ T4S, vs1, vs2);
        // output a1 and a2
        vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
        vs_stpq_indexed(vs1, coeffs, c2Start, offsets);

        // k counts the inner steps across all levels and selects the
        // advance to the next pair of start positions
        int k = 4 * level + i;

        if (k > 7) {
          startIncr = 256;
        } else if (k == 5) {
          startIncr = 384;
        } else {
          startIncr = 128;
        }

        c1Start += startIncr;
        c2Start += startIncr;
      }

      c2 /= 2;
    }
  }
6497
6498 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
6499 // Implements the method
6500 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
6501 // of the Java class sun.security.provider
6502 //
6503 // coeffs (int[256]) = c_rarg0
6504 // zetas (int[256]) = c_rarg1
  address generate_dilithiumAlmostNtt() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register coeffs = c_rarg0;
    const Register zetas = c_rarg1;

    const Register tmpAddr = r9;
    const Register dilithiumConsts = r10;
    const Register result = r11;
    // don't use callee save registers v8 - v15
    VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
    VSeq<2> vq(30); // n.b. constants overlap vs3
    int offsets[4] = { 0, 32, 64, 96};
    int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
    int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
    __ add(result, coeffs, 0);
    __ lea(dilithiumConsts,
        ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));

    // Each level represents one iteration of the outer for loop of the Java version.

    // level 0-4
    dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);

    // level 5

    // At level 5 the coefficients we need to combine with the zetas
    // are grouped in memory in blocks of size 4. So, for both sets of
    // coefficients we load 4 adjacent values at 8 different offsets
    // using an indexed ldr with register variant Q and multiply them
    // in sequence order by the next set of inputs. Likewise we store
    // the results using an indexed str with register variant Q.
    for (int i = 0; i < 1024; i += 256) {
      // reload constants q, qinv each iteration as they get clobbered later
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // load 32 (8x4S) coefficients via first offsets = c1
      vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
      // load next 32 (8x4S) inputs = b
      vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
      dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
      // load 32 (8x4S) coefficients via second offsets = c2
      vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
      // add/sub with result of multiply
      vs_addv(vs3, __ T4S, vs1, vs2); // vs3 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2); // vs1 = c2 - a
      // write back new coefficients using same offsets
      vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
      vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
    }

    // level 6
    // At level 6 the coefficients we need to combine with the zetas
    // are grouped in memory in pairs, the first two being montmul
    // inputs and the second add/sub inputs. We can still implement
    // the montmul+sub+add using 4-way parallelism but only if we
    // combine the coefficients with the zetas 16 at a time. We load 8
    // adjacent values at 4 different offsets using an ld2 load with
    // arrangement 2D. That interleaves the lower and upper halves of
    // each pair of quadwords into successive vector registers. We
    // then need to montmul the 4 even elements of the coefficients
    // register sequence by the zetas in order and then add/sub the 4
    // odd elements of the coefficients register sequence. We use an
    // equivalent st2 operation to store the results back into memory
    // de-interleaved.
    for (int i = 0; i < 1024; i += 128) {
      // reload constants q, qinv each iteration as they get clobbered later
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // load interleaved 16 (4x2D) coefficients via offsets
      vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
      // load next 16 (4x4S) inputs
      vs_ldpq_post(vs_front(vs2), zetas);
      // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
      dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
                                  vs_front(vs2), vtmp, vq);
      // store interleaved 16 (4x2D) coefficients via offsets
      vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
    }

    // level 7
    // At level 7 the coefficients we need to combine with the zetas
    // occur singly with montmul inputs alterating with add/sub
    // inputs. Once again we can use 4-way parallelism to combine 16
    // zetas at a time. However, we have to load 8 adjacent values at
    // 4 different offsets using an ld2 load with arrangement 4S. That
    // interleaves the odd words of each pair into one
    // coefficients vector register and the even words of the pair
    // into the next register. We then need to montmul the 4 even
    // elements of the coefficients register sequence by the zetas in
    // order and then add/sub the 4 odd elements of the coefficients
    // register sequence. We use an equivalent st2 operation to store
    // the results back into memory de-interleaved.

    for (int i = 0; i < 1024; i += 128) {
      // reload constants q, qinv each iteration as they get clobbered later
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // load interleaved 16 (4x4S) coefficients via offsets
      vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
      // load next 16 (4x4S) inputs
      vs_ldpq_post(vs_front(vs2), zetas);
      // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
      dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
                                  vs_front(vs2), vtmp, vq);
      // store interleaved 16 (4x4S) coefficients via offsets
      vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
    }
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
6623
6624 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6625 // in the Java implementation come in sequences of at least 8, so we
6626 // can use ldpq to collect the corresponding data into pairs of vector
6627 // registers
6628 // We collect the coefficients that correspond to the 'j's into vs1
  // the coefficients that correspond to the 'j+l's into vs2 then
6630 // do the additions into vs3 and the subtractions into vs1 then
6631 // save the result of the additions, load the zetas into vs2
6632 // do the (Montgomery) multiplications by zeta in parallel into vs2
6633 // finally save the results back to the coeffs array
  void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
                                   const Register coeffs, const Register zetas) {
    // c1/c2 are byte offsets into coeffs for the 'j' and 'j+l'
    // coefficients; the gap between them doubles at each level
    // (c2 *= 2 at the bottom of the outer loop)
    int c1 = 0;
    int c2 = 32;
    int startIncr;
    int offsets[4];
    VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
    VSeq<2> vq(30); // n.b. constants overlap vs3

    offsets[0] = 0;

    for (int level = 3; level < 8; level++) {
      int c1Start = c1;
      int c2Start = c2;
      // levels 3 and 4 access the coefficients with wider strides
      if (level == 3) {
        offsets[1] = 64;
        offsets[2] = 128;
        offsets[3] = 192;
      } else if (level == 4) {
        offsets[1] = 32;
        offsets[2] = 128;
        offsets[3] = 160;
      } else {
        offsets[1] = 32;
        offsets[2] = 64;
        offsets[3] = 96;
      }

      // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
      // time at 4 different offsets and multiply them in order by the
      // next set of input values. So we employ indexed load and store
      // pair instructions with arrangement 4S.
      for (int i = 0; i < 4; i++) {
        // load v1 32 (8x4S) coefficients relative to first start index
        vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
        // load v2 32 (8x4S) coefficients relative to second start index
        vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
        // a0 = v1 + v2 -- n.b. clobbers vq which overlaps vs3
        vs_addv(vs3, __ T4S, vs1, vs2);
        // a1 = v1 - v2
        vs_subv(vs1, __ T4S, vs1, vs2);
        // save a0 relative to first start index
        vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
        // load constants q, qinv each iteration as they get clobbered above
        vs_ldpq(vq, dilithiumConsts); // qInv, q
        // load b next 32 (8x4S) inputs
        vs_ldpq_post(vs2, zetas);
        // a = a1 montmul b
        dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
        // save a relative to second start index
        vs_stpq_indexed(vs2, coeffs, c2Start, offsets);

        // k counts the inner steps across all levels and selects the
        // advance to the next pair of start positions
        int k = 4 * level + i;

        if (k < 24) {
          startIncr = 256;
        } else if (k == 25) {
          startIncr = 384;
        } else {
          startIncr = 128;
        }

        c1Start += startIncr;
        c2Start += startIncr;
      }

      c2 *= 2;
    }
  }
6704
6705 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
6706 // Implements the method
6707 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
6708 // the sun.security.provider.ML_DSA class.
6709 //
6710 // coeffs (int[256]) = c_rarg0
6711 // zetas (int[256]) = c_rarg1
  address generate_dilithiumAlmostInverseNtt() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    const Register coeffs = c_rarg0;
    const Register zetas = c_rarg1;

    const Register tmpAddr = r9;
    const Register dilithiumConsts = r10;
    const Register result = r11;
    // don't use callee save registers v8 - v15
    VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
    VSeq<2> vq(30); // n.b. constants overlap vs3
    int offsets[4] = { 0, 32, 64, 96 };
    int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
    int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };

    __ add(result, coeffs, 0);
    __ lea(dilithiumConsts,
        ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));

    // Each level represents one iteration of the outer for loop of the Java version

    // level 0
    // At level 0 we need to interleave adjacent quartets of
    // coefficients before we multiply and add/sub by the next 16
    // zetas just as we did for level 7 in the multiply code. So we
    // load and store the values using an ld2/st2 with arrangement 4S.
    for (int i = 0; i < 1024; i += 128) {
      // load constants q, qinv
      // n.b. this can be moved out of the loop as they do not get
      // clobbered by first two loops
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // a0/a1 load interleaved 32 (8x4S) coefficients
      vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
      // b load next 32 (8x4S) inputs
      vs_ldpq_post(vs_front(vs2), zetas);
      // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
      // n.b. second half of vs2 provides temporary register storage
      dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
                                  vs_front(vs2), vs_back(vs2), vtmp, vq);
      // a0/a1 store interleaved 32 (8x4S) coefficients
      vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
    }

    // level 1
    // At level 1 we need to interleave pairs of adjacent pairs of
    // coefficients before we multiply by the next 16 zetas just as we
    // did for level 6 in the multiply code. So we load and store the
    // values an ld2/st2 with arrangement 2D.
    for (int i = 0; i < 1024; i += 128) {
      // a0/a1 load interleaved 32 (8x2D) coefficients
      vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
      // b load next 16 (4x4S) inputs
      vs_ldpq_post(vs_front(vs2), zetas);
      // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
      // n.b. second half of vs2 provides temporary register storage
      // vq still holds qInv, q loaded in the level 0 loop above
      dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
                                  vs_front(vs2), vs_back(vs2), vtmp, vq);
      // a0/a1 store interleaved 32 (8x2D) coefficients
      vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
    }

    // level 2
    // At level 2 coefficients come in blocks of 4. So, we load 4
    // adjacent coefficients at 8 distinct offsets for both the first
    // and second coefficient sequences, using an ldr with register
    // variant Q then combine them with next set of 32 zetas. Likewise
    // we store the results using an str with register variant Q.
    for (int i = 0; i < 1024; i += 256) {
      // c0 load 32 (8x4S) coefficients via first offsets
      vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
      // c1 load 32 (8x4S) coefficients via second offsets
      vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2);
      // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
      vs_addv(vs3, __ T4S, vs1, vs2);
      // c = c0 - c1
      vs_subv(vs1, __ T4S, vs1, vs2);
      // store a0 32 (8x4S) coefficients via first offsets
      vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
      // b load 32 (8x4S) next inputs
      vs_ldpq_post(vs2, zetas);
      // reload constants q, qinv -- they were clobbered earlier
      vs_ldpq(vq, dilithiumConsts); // qInv, q
      // compute a1 = b montmul c
      dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
      // store a1 32 (8x4S) coefficients via second offsets
      vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
    }

    // level 3-7
    dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
6815
6816 // Dilithium multiply polynomials in the NTT domain.
6817 // Straightforward implementation of the method
6818 // static int implDilithiumNttMult(
6819 // int[] result, int[] ntta, int[] nttb {} of
6820 // the sun.security.provider.ML_DSA class.
6821 //
6822 // result (int[256]) = c_rarg0
6823 // poly1 (int[256]) = c_rarg1
6824 // poly2 (int[256]) = c_rarg2
  address generate_dilithiumNttMult() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    Label L_loop;

    const Register result = c_rarg0;
    const Register poly1 = c_rarg1;
    const Register poly2 = c_rarg2;

    const Register dilithiumConsts = r10;
    const Register len = r11;

    VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
    VSeq<2> vq(30); // n.b. constants overlap vs3
    VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE

    __ lea(dilithiumConsts,
        ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));

    // load constants q, qinv
    vs_ldpq(vq, dilithiumConsts); // qInv, q
    // load constant rSquare into v29
    __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare

    // len counts bytes remaining: 256 ints = 1024 bytes, consumed
    // 128 bytes (32 ints) per loop iteration i.e. 8 iterations
    __ mov(len, zr);
    __ add(len, len, 1024);

    __ BIND(L_loop);

    // b load 32 (8x4S) next inputs from poly1
    vs_ldpq_post(vs1, poly1);
    // c load 32 (8x4S) next inputs from poly2
    vs_ldpq_post(vs2, poly2);
    // compute a = b montmul c
    dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
    // compute a = rsquare montmul a
    dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
    // save a 32 (8x4S) results
    vs_stpq_post(vs2, result);

    __ sub(len, len, 128);
    __ cmp(len, (u1)128);
    __ br(Assembler::GE, L_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
6881
  // Dilithium Montgomery multiply an array by a constant.
6883 // A straightforward implementation of the method
6884 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
6885 // of the sun.security.provider.MLDSA class
6886 //
6887 // coeffs (int[256]) = c_rarg0
6888 // constant (int) = c_rarg1
  address generate_dilithiumMontMulByConstant() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ enter();

    Label L_loop;

    const Register coeffs = c_rarg0;
    const Register constant = c_rarg1;

    const Register dilithiumConsts = r10;
    const Register result = r11;
    const Register len = r12;

    VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
    VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
    VSeq<2> vq(30); // n.b. constants overlap vs3
    VSeq<8> vconst(29, 0); // for montmul by constant

    // results track inputs
    __ add(result, coeffs, 0);
    __ lea(dilithiumConsts,
        ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));

    // load constants q, qinv once -- the loop below only writes vs2
    // and vtmp so vq (v30/v31) is not clobbered
    vs_ldpq(vq, dilithiumConsts); // qInv, q
    // copy caller supplied constant across vconst
    __ dup(vconst[0], __ T4S, constant);
    // len counts bytes remaining: 256 ints = 1024 bytes, consumed
    // 128 bytes (32 ints) per loop iteration i.e. 8 iterations
    __ mov(len, zr);
    __ add(len, len, 1024);

    __ BIND(L_loop);

    // load next 32 inputs
    vs_ldpq_post(vs2, coeffs);
    // mont mul by constant
    dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
    // write next 32 results
    vs_stpq_post(vs2, result);

    __ sub(len, len, 128);
    __ cmp(len, (u1)128);
    __ br(Assembler::GE, L_loop);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
6942
6943 // Dilithium decompose poly.
6944 // Implements the method
6945 // static int implDilithiumDecomposePoly(int[] coeffs, int constant) {}
6946 // of the sun.security.provider.ML_DSA class
6947 //
6948 // input (int[256]) = c_rarg0
6949 // lowPart (int[256]) = c_rarg1
6950 // highPart (int[256]) = c_rarg2
6951 // twoGamma2 (int) = c_rarg3
6952 // multiplier (int) = c_rarg4
  address generate_dilithiumDecomposePoly() {

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    Label L_loop;

    const Register input = c_rarg0;
    const Register lowPart = c_rarg1;
    const Register highPart = c_rarg2;
    const Register twoGamma2 = c_rarg3;
    const Register multiplier = c_rarg4;

    const Register len = r9;
    const Register dilithiumConsts = r10;
    const Register tmp = r11;

    // 6 independent sets of 4x4s values
    VSeq<4> vs1(0), vs2(4), vs3(8);
    VSeq<4> vs4(12), vs5(16), vtmp(20);

    // 7 constants for cross-multiplying; each uses stride 0 so the four
    // sequence slots all alias one vector register (v25..v31)
    VSeq<4> one(25, 0);
    VSeq<4> qminus1(26, 0);
    VSeq<4> g2(27, 0);
    VSeq<4> twog2(28, 0);
    VSeq<4> mult(29, 0);
    VSeq<4> q(30, 0);
    VSeq<4> qadd(31, 0);

    __ enter();

    __ lea(dilithiumConsts,
             ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));

    // save callee-saved registers (v8..v15 lower halves must be preserved
    // by the AArch64 procedure call standard; vtmp/vs4/vs5 overlap them)
    __ stpd(v8, v9, __ pre(sp, -64));
    __ stpd(v10, v11, Address(sp, 16));
    __ stpd(v12, v13, Address(sp, 32));
    __ stpd(v14, v15, Address(sp, 48));

    // populate constant registers
    __ mov(tmp, zr);
    __ add(tmp, tmp, 1);
    __ dup(one[0], __ T4S, tmp); // 1
    __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
    __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
    __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
    __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
    // n.b. v30 holds q and v25 holds 1 (just set above), v28 holds 2*gamma2
    __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
    __ sshr(g2[0], __ T4S, v28, 1); // gamma2

    // len counts remaining input bytes: 256 ints * 4 bytes = 1024; each
    // iteration consumes 16 ints (64 bytes), so the loop runs 16 times
    __ mov(len, zr);
    __ add(len, len, 1024);

    __ BIND(L_loop);

    // load next 4x4S inputs interleaved: rplus --> vs1
    __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));

    // rplus = rplus - ((rplus + qadd) >> 23) * q
    vs_addv(vtmp, __ T4S, vs1, qadd);
    vs_sshr(vtmp, __ T4S, vtmp, 23);
    vs_mulv(vtmp, __ T4S, vtmp, q);
    vs_subv(vs1, __ T4S, vs1, vtmp);

    // rplus = rplus + ((rplus >> 31) & dilithium_q);
    // i.e. add q back in iff the previous step went negative
    vs_sshr(vtmp, __ T4S, vs1, 31);
    vs_andr(vtmp, vtmp, q);
    vs_addv(vs1, __ T4S, vs1, vtmp);

    // quotient --> vs2
    // int quotient = (rplus * multiplier) >> 22;
    vs_mulv(vtmp, __ T4S, vs1, mult);
    vs_sshr(vs2, __ T4S, vtmp, 22);

    // r0 --> vs3
    // int r0 = rplus - quotient * twoGamma2;
    vs_mulv(vtmp, __ T4S, vs2, twog2);
    vs_subv(vs3, __ T4S, vs1, vtmp);

    // mask --> vs4
    // int mask = (twoGamma2 - r0) >> 22;
    vs_subv(vtmp, __ T4S, twog2, vs3);
    vs_sshr(vs4, __ T4S, vtmp, 22);

    // r0 -= (mask & twoGamma2);
    vs_andr(vtmp, vs4, twog2);
    vs_subv(vs3, __ T4S, vs3, vtmp);

    // quotient += (mask & 1);
    vs_andr(vtmp, vs4, one);
    vs_addv(vs2, __ T4S, vs2, vtmp);

    // mask = (twoGamma2 / 2 - r0) >> 31;
    vs_subv(vtmp, __ T4S, g2, vs3);
    vs_sshr(vs4, __ T4S, vtmp, 31);

    // r0 -= (mask & twoGamma2);
    vs_andr(vtmp, vs4, twog2);
    vs_subv(vs3, __ T4S, vs3, vtmp);

    // quotient += (mask & 1);
    vs_andr(vtmp, vs4, one);
    vs_addv(vs2, __ T4S, vs2, vtmp);

    // r1 --> vs5
    // int r1 = rplus - r0 - (dilithium_q - 1);
    vs_subv(vtmp, __ T4S, vs1, vs3);
    vs_subv(vs5, __ T4S, vtmp, qminus1);

    // r1 --> vs1 (overwriting rplus)
    // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
    vs_negr(vtmp, __ T4S, vs5);
    vs_orr(vtmp, vs5, vtmp);
    vs_sshr(vs1, __ T4S, vtmp, 31);

    // r0 += ~r1;
    vs_notr(vtmp, vs1);
    vs_addv(vs3, __ T4S, vs3, vtmp);

    // r1 = r1 & quotient;
    vs_andr(vs1, vs2, vs1);

    // store results interleaved
    // lowPart[m] = r0;
    // highPart[m] = r1;
    __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
    __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));

    __ sub(len, len, 64);
    __ cmp(len, (u1)64);
    __ br(Assembler::GE, L_loop);

    // restore callee-saved vector registers
    __ ldpd(v14, v15, Address(sp, 48));
    __ ldpd(v12, v13, Address(sp, 32));
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 64));

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ mov(r0, zr); // return 0
    __ ret(lr);

    return start;
  }
7100
  // Keccak chi step for one row of five 64-bit lanes, in place:
  //   a_i = a_i ^ (~a_{i+1} & a_{i+2})   (indices mod 5)
  // bic(d, x, y) computes x & ~y. The statement order is load-bearing:
  // each lane's pre-chi value is consumed by every bic that needs it
  // before the corresponding eor overwrites that lane.
  void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
             Register tmp0, Register tmp1, Register tmp2) {
    __ bic(tmp0, a2, a1); // for a0
    __ bic(tmp1, a3, a2); // for a1
    __ bic(tmp2, a4, a3); // for a2
    __ eor(a2, a2, tmp2); // a2 no longer needed as an input after this point
    __ bic(tmp2, a0, a4); // for a3 (a0, a4 still hold pre-chi values)
    __ eor(a3, a3, tmp2);
    __ bic(tmp2, a1, a0); // for a4 (a1, a0 still hold pre-chi values)
    __ eor(a0, a0, tmp0);
    __ eor(a1, a1, tmp1);
    __ eor(a4, a4, tmp2);
  }
7114
  // One Keccak-f[1600] round on general-purpose registers.
  // a0..a24 hold the 5x5 state lanes (a[x + 5*y] layout as in SHA3.java);
  // rc points at the current round constant and is post-incremented by 8.
  // The round performs theta, rho+pi (the rotation chain), chi (bcax5 per
  // row) and iota, using only three caller-supplied temporaries plus --
  // when available -- rfp and r18 as two extra scratch registers.
  void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
    Register a0, Register a1, Register a2, Register a3, Register a4,
    Register a5, Register a6, Register a7, Register a8, Register a9,
    Register a10, Register a11, Register a12, Register a13, Register a14,
    Register a15, Register a16, Register a17, Register a18, Register a19,
    Register a20, Register a21, Register a22, Register a23, Register a24,
    Register tmp0, Register tmp1, Register tmp2) {
    // theta: compute column parities c0..c4 and mix d_x = c_{x-1} ^ rotl(c_{x+1}, 1)
    // into every lane of column x
    __ eor3(tmp1, a4, a9, a14);
    __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
    __ eor3(tmp2, a1, a6, a11);
    __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
    __ rax1(tmp2, tmp0, tmp1); // d0
    {

      // theta needs two more scratch registers; borrow rfp/r18 when the VM
      // configuration allows it, otherwise spill a4/a9 (whose d4 update is
      // deferred until after the reload below)
      Register tmp3, tmp4;
      if (can_use_fp && can_use_r18) {
        tmp3 = rfp;
        tmp4 = r18_tls;
      } else {
        tmp3 = a4;
        tmp4 = a9;
        __ stp(tmp3, tmp4, __ pre(sp, -16));
      }

      __ eor3(tmp3, a0, a5, a10);
      __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
      __ eor(a0, a0, tmp2);
      __ eor(a5, a5, tmp2);
      __ eor(a10, a10, tmp2);
      __ eor(a15, a15, tmp2);
      __ eor(a20, a20, tmp2); // d0(tmp2)
      __ eor3(tmp3, a2, a7, a12);
      __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
      __ rax1(tmp3, tmp4, tmp2); // d1
      __ eor(a1, a1, tmp3);
      __ eor(a6, a6, tmp3);
      __ eor(a11, a11, tmp3);
      __ eor(a16, a16, tmp3);
      __ eor(a21, a21, tmp3); // d1(tmp3)
      __ rax1(tmp3, tmp2, tmp0); // d3
      __ eor3(tmp2, a3, a8, a13);
      __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
      __ eor(a3, a3, tmp3);
      __ eor(a8, a8, tmp3);
      __ eor(a13, a13, tmp3);
      __ eor(a18, a18, tmp3);
      __ eor(a23, a23, tmp3);
      __ rax1(tmp2, tmp1, tmp0); // d2
      __ eor(a2, a2, tmp2);
      __ eor(a7, a7, tmp2);
      __ eor(a12, a12, tmp2);
      __ rax1(tmp0, tmp0, tmp4); // d4
      // restore spilled a4/a9 before their d4 update is applied
      if (!can_use_fp || !can_use_r18) {
        __ ldp(tmp3, tmp4, __ post(sp, 16));
      }
      __ eor(a17, a17, tmp2);
      __ eor(a22, a22, tmp2);
      __ eor(a4, a4, tmp0);
      __ eor(a9, a9, tmp0);
      __ eor(a14, a14, tmp0);
      __ eor(a19, a19, tmp0);
      __ eor(a24, a24, tmp0);
    }

    // rho + pi: the single-cycle permutation a[pi(i)] = rotl(a[i], rho(i)),
    // implemented as a chain of rotates with a10's new value buffered in tmp0
    __ rol(tmp0, a10, 3);
    __ rol(a10, a1, 1);
    __ rol(a1, a6, 44);
    __ rol(a6, a9, 20);
    __ rol(a9, a22, 61);
    __ rol(a22, a14, 39);
    __ rol(a14, a20, 18);
    __ rol(a20, a2, 62);
    __ rol(a2, a12, 43);
    __ rol(a12, a13, 25);
    __ rol(a13, a19, 8) ;
    __ rol(a19, a23, 56);
    __ rol(a23, a15, 41);
    __ rol(a15, a4, 27);
    __ rol(a4, a24, 14);
    __ rol(a24, a21, 2);
    __ rol(a21, a8, 55);
    __ rol(a8, a16, 45);
    __ rol(a16, a5, 36);
    __ rol(a5, a3, 28);
    __ rol(a3, a18, 21);
    __ rol(a18, a17, 15);
    __ rol(a17, a11, 10);
    __ rol(a11, a7, 6);
    __ mov(a7, tmp0); // close the cycle: a7 receives rotl(a10_old, 3)

    // chi: one bcax5 per row of the 5x5 state
    bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
    bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
    bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
    bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
    bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);

    // iota: xor the round constant into lane (0,0), advancing rc
    __ ldr(tmp1, __ post(rc, 8));
    __ eor(a0, a0, tmp1);

  }
7215
7216 // Arguments:
7217 //
7218 // Inputs:
7219 // c_rarg0 - byte[] source+offset
7220 // c_rarg1 - byte[] SHA.state
7221 // c_rarg2 - int block_size
7222 // c_rarg3 - int offset
7223 // c_rarg4 - int limit
7224 //
  address generate_sha3_implCompress_gpr(StubId stub_id) {
    bool multi_block;
    switch (stub_id) {
    case StubId::stubgen_sha3_implCompress_id:
      multi_block = false;
      break;
    case StubId::stubgen_sha3_implCompressMB_id:
      multi_block = true;
      break;
    default:
      ShouldNotReachHere();
    }

    // Keccak-f[1600] round constants, one per round, consumed in order
    static const uint64_t round_consts[24] = {
      0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
      0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
      0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
      0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
      0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
      0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
      0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
      0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
    };

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register buf = c_rarg0;
    Register state = c_rarg1;
    Register block_size = c_rarg2;
    Register ofs = c_rarg3;
    Register limit = c_rarg4;

    // use r3.r17,r19..r28 to keep a0..a24.
    // a0..a24 are respective locals from SHA3.java
    Register a0 = r25,
             a1 = r26,
             a2 = r27,
             a3 = r3,
             a4 = r4,
             a5 = r5,
             a6 = r6,
             a7 = r7,
             a8 = rscratch1, // r8
             a9 = rscratch2, // r9
             a10 = r10,
             a11 = r11,
             a12 = r12,
             a13 = r13,
             a14 = r14,
             a15 = r15,
             a16 = r16,
             a17 = r17,
             a18 = r28,
             a19 = r19,
             a20 = r20,
             a21 = r21,
             a22 = r22,
             a23 = r23,
             a24 = r24;

    // incoming argument registers are reused as temporaries once their
    // values have been spilled to the stack frame below
    Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;

    Label sha3_loop, rounds24_preloop, loop_body;
    Label sha3_512_or_sha3_384, shake128;

    bool can_use_r18 = false;
#ifndef R18_RESERVED
    can_use_r18 = true;
#endif
    bool can_use_fp = !PreserveFramePointer;

    __ enter();

    // save almost all yet unsaved gpr registers on stack
    // frame layout: sp+0 block_size, sp+8 ofs/limit (multi-block only),
    // sp+16 buf (stored later), sp+32..sp+104 r19..r28, sp+112 r18/state
    __ str(block_size, __ pre(sp, -128));
    if (multi_block) {
      __ stpw(ofs, limit, Address(sp, 8));
    }
    // 8 bytes at sp+16 will be used to keep buf
    __ stp(r19, r20, Address(sp, 32));
    __ stp(r21, r22, Address(sp, 48));
    __ stp(r23, r24, Address(sp, 64));
    __ stp(r25, r26, Address(sp, 80));
    __ stp(r27, r28, Address(sp, 96));
    if (can_use_r18 && can_use_fp) {
      __ stp(r18_tls, state, Address(sp, 112));
    } else {
      __ str(state, Address(sp, 112));
    }

    // begin sha3 calculations: loading a0..a24 from state array
    __ ldp(a0, a1, state);
    __ ldp(a2, a3, Address(state, 16));
    __ ldp(a4, a5, Address(state, 32));
    __ ldp(a6, a7, Address(state, 48));
    __ ldp(a8, a9, Address(state, 64));
    __ ldp(a10, a11, Address(state, 80));
    __ ldp(a12, a13, Address(state, 96));
    __ ldp(a14, a15, Address(state, 112));
    __ ldp(a16, a17, Address(state, 128));
    __ ldp(a18, a19, Address(state, 144));
    __ ldp(a20, a21, Address(state, 160));
    __ ldp(a22, a23, Address(state, 176));
    __ ldr(a24, Address(state, 192));

    __ BIND(sha3_loop);

    // absorb: xor the next block of input into the first block_size/8
    // state lanes; the first 7 lanes (56 bytes) are common to all digests
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a0, a0, tmp3);
    __ eor(a1, a1, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a2, a2, tmp3);
    __ eor(a3, a3, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a4, a4, tmp3);
    __ eor(a5, a5, tmp2);
    __ ldr(tmp3, __ post(buf, 8));
    __ eor(a6, a6, tmp3);

    // block_size == 72, SHA3-512; block_size == 104, SHA3-384
    // (both < 128, i.e. bit 7 clear)
    __ tbz(block_size, 7, sha3_512_or_sha3_384);

    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a7, a7, tmp3);
    __ eor(a8, a8, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a9, a9, tmp3);
    __ eor(a10, a10, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a11, a11, tmp3);
    __ eor(a12, a12, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a13, a13, tmp3);
    __ eor(a14, a14, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a15, a15, tmp3);
    __ eor(a16, a16, tmp2);

    // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
    __ andw(tmp2, block_size, 48);
    __ cbzw(tmp2, rounds24_preloop);
    __ tbnz(block_size, 5, shake128);
    // block_size == 144, bit5 == 0, SHA3-224
    __ ldr(tmp3, __ post(buf, 8));
    __ eor(a17, a17, tmp3);
    __ b(rounds24_preloop);

    __ BIND(shake128);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a17, a17, tmp3);
    __ eor(a18, a18, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a19, a19, tmp3);
    __ eor(a20, a20, tmp2);
    __ b(rounds24_preloop); // block_size == 168, SHAKE128

    __ BIND(sha3_512_or_sha3_384);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a7, a7, tmp3);
    __ eor(a8, a8, tmp2);
    __ tbz(block_size, 5, rounds24_preloop); // SHA3-512

    // SHA3-384
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a9, a9, tmp3);
    __ eor(a10, a10, tmp2);
    __ ldp(tmp3, tmp2, __ post(buf, 16));
    __ eor(a11, a11, tmp3);
    __ eor(a12, a12, tmp2);

    __ BIND(rounds24_preloop);
    // the round counter lives in v0 to avoid consuming a 25th GPR; small
    // integers are exactly representable in float, so the fsubs/fcmps
    // countdown below is exact
    __ fmovs(v0, 24.0); // float loop counter,
    __ fmovs(v1, 1.0);  // exact representation

    __ str(buf, Address(sp, 16));
    __ lea(tmp3, ExternalAddress((address) round_consts));

    __ BIND(loop_body);
    keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
                     a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
                     a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
                     tmp0, tmp1, tmp2);
    __ fsubs(v0, v0, v1);
    __ fcmps(v0, 0.0);
    __ br(__ NE, loop_body);

    if (multi_block) {
      __ ldrw(block_size, sp); // block_size
      __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
      __ addw(tmp2, tmp2, block_size);
      __ cmpw(tmp2, tmp1);
      __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
      __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
      __ br(Assembler::LE, sha3_loop);
      __ movw(c_rarg0, tmp2); // return offset
    }
    if (can_use_fp && can_use_r18) {
      __ ldp(r18_tls, state, Address(sp, 112));
    } else {
      __ ldr(state, Address(sp, 112));
    }
    // save calculated sha3 state
    __ stp(a0, a1, Address(state));
    __ stp(a2, a3, Address(state, 16));
    __ stp(a4, a5, Address(state, 32));
    __ stp(a6, a7, Address(state, 48));
    __ stp(a8, a9, Address(state, 64));
    __ stp(a10, a11, Address(state, 80));
    __ stp(a12, a13, Address(state, 96));
    __ stp(a14, a15, Address(state, 112));
    __ stp(a16, a17, Address(state, 128));
    __ stp(a18, a19, Address(state, 144));
    __ stp(a20, a21, Address(state, 160));
    __ stp(a22, a23, Address(state, 176));
    __ str(a24, Address(state, 192));

    // restore required registers from stack
    __ ldp(r19, r20, Address(sp, 32));
    __ ldp(r21, r22, Address(sp, 48));
    __ ldp(r23, r24, Address(sp, 64));
    __ ldp(r25, r26, Address(sp, 80));
    __ ldp(r27, r28, Address(sp, 96));
    if (can_use_fp && can_use_r18) {
      __ add(rfp, sp, 128); // leave() will copy rfp to sp below
    } // else no need to recalculate rfp, since it wasn't changed

    __ leave();

    __ ret(lr);

    return start;
  }
7460
7461 /**
7462 * Arguments:
7463 *
7464 * Inputs:
7465 * c_rarg0 - int crc
7466 * c_rarg1 - byte* buf
7467 * c_rarg2 - int length
7468 *
7469 * Output:
   *   r0 - int crc result
7471 */
7472 address generate_updateBytesCRC32() {
7473 assert(UseCRC32Intrinsics, "what are we doing here?");
7474
7475 __ align(CodeEntryAlignment);
7476 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7477 StubCodeMark mark(this, stub_id);
7478
7479 address start = __ pc();
7480
7481 const Register crc = c_rarg0; // crc
7482 const Register buf = c_rarg1; // source java byte array address
7483 const Register len = c_rarg2; // length
7484 const Register table0 = c_rarg3; // crc_table address
7485 const Register table1 = c_rarg4;
7486 const Register table2 = c_rarg5;
7487 const Register table3 = c_rarg6;
7488 const Register tmp3 = c_rarg7;
7489
7490 BLOCK_COMMENT("Entry:");
7491 __ enter(); // required for proper stackwalking of RuntimeStub frame
7492
7493 __ kernel_crc32(crc, buf, len,
7494 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7495
7496 __ leave(); // required for proper stackwalking of RuntimeStub frame
7497 __ ret(lr);
7498
7499 return start;
7500 }
7501
7502 /**
7503 * Arguments:
7504 *
7505 * Inputs:
7506 * c_rarg0 - int crc
7507 * c_rarg1 - byte* buf
7508 * c_rarg2 - int length
7509 * c_rarg3 - int* table
7510 *
7511 * Output:
7512 * r0 - int crc result
7513 */
7514 address generate_updateBytesCRC32C() {
7515 assert(UseCRC32CIntrinsics, "what are we doing here?");
7516
7517 __ align(CodeEntryAlignment);
7518 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
7519 StubCodeMark mark(this, stub_id);
7520
7521 address start = __ pc();
7522
7523 const Register crc = c_rarg0; // crc
7524 const Register buf = c_rarg1; // source java byte array address
7525 const Register len = c_rarg2; // length
7526 const Register table0 = c_rarg3; // crc_table address
7527 const Register table1 = c_rarg4;
7528 const Register table2 = c_rarg5;
7529 const Register table3 = c_rarg6;
7530 const Register tmp3 = c_rarg7;
7531
7532 BLOCK_COMMENT("Entry:");
7533 __ enter(); // required for proper stackwalking of RuntimeStub frame
7534
7535 __ kernel_crc32c(crc, buf, len,
7536 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7537
7538 __ leave(); // required for proper stackwalking of RuntimeStub frame
7539 __ ret(lr);
7540
7541 return start;
7542 }
7543
7544 /***
7545 * Arguments:
7546 *
7547 * Inputs:
7548 * c_rarg0 - int adler
7549 * c_rarg1 - byte* buff
7550 * c_rarg2 - int len
7551 *
7552 * Output:
7553 * c_rarg0 - int adler result
7554 */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    Register adler = c_rarg0;
    Register s1 = c_rarg0;
    Register s2 = c_rarg3;
    Register buff = c_rarg1;
    Register len = c_rarg2;
    Register nmax = r4;
    Register base = r5;
    Register count = r6;
    Register temp0 = rscratch1;
    Register temp1 = rscratch2;
    FloatRegister vbytes = v0;
    FloatRegister vs1acc = v1;
    FloatRegister vs2acc = v2;
    FloatRegister vtable = v3;

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    uint64_t BASE = 0xfff1;
    uint64_t NMAX = 0x15B0;

    __ mov(base, BASE);
    __ mov(nmax, NMAX);

    // Load accumulation coefficients for the upper 16 bits
    // (the dot-product weights 16, 15, ..., 1 used by the vector loop)
    __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
    __ ld1(vtable, __ T16B, Address(temp0));

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
    __ uxth(s1, adler); // s1 = (adler & 0xffff)

    // The pipelined loop needs at least 16 elements for 1 iteration
    // It does check this, but it is more effective to skip to the cleanup loop
    __ cmp(len, (u1)16);
    __ br(Assembler::HS, L_nmax);
    __ cbz(len, L_combine);

    // short input (< 16 bytes): plain byte-at-a-time accumulation
    __ bind(L_simple_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, s1, temp0);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HI, L_simple_by1_loop);

    // s1 = s1 % BASE
    // (s1 < 2 * BASE here, so a single conditional subtract suffices)
    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    // Uses 2^16 == 15 (mod BASE): fold s2 as 15 * (s2 >> 16) + (s2 & 0xffff),
    // with 15 * x computed as (x << 4) - x, then conditionally subtract BASE.
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(s2, temp1, s2, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ b(L_combine);

    // long input: process NMAX bytes between mod-BASE reductions
    __ bind(L_nmax);
    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::LO, L_by16);

    __ bind(L_nmax_loop);

    generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
                                      vbytes, vs1acc, vs2acc, vtable);

    __ subs(count, count, 16);
    __ br(Assembler::HS, L_nmax_loop);

    // After NMAX bytes the accumulators may exceed 32 bits of headroom,
    // so reduce both mod BASE. The 2^16 == 15 (mod BASE) folding is applied
    // twice (values can exceed 2^16 after one fold), then a conditional
    // subtract finishes the reduction.
    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext:: uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext:: uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::HS, L_nmax_loop);

    // fewer than NMAX bytes remain: finish them 16 at a time
    __ bind(L_by16);
    __ adds(len, len, count);
    __ br(Assembler::LO, L_by1);

    __ bind(L_by16_loop);

    generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
                                      vbytes, vs1acc, vs2acc, vtable);

    __ subs(len, len, 16);
    __ br(Assembler::HS, L_by16_loop);

    // final tail of 0..15 bytes
    __ bind(L_by1);
    __ adds(len, len, 15);
    __ br(Assembler::LO, L_do_mod);

    __ bind(L_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HS, L_by1_loop);

    __ bind(L_do_mod);
    // final reduction, same double-fold scheme as above
    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext:: uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext:: uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    // Combine lower bits and higher bits
    __ bind(L_combine);
    __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)

    __ ret(lr);

    return start;
  }
7729
  // Emit one 16-byte vectorized Adler32 accumulation step. Advances buff by
  // 16 and updates s1/s2 in place; vbytes/vs1acc/vs2acc are clobbered,
  // vtable must hold the weights (16, 15, ..., 1) and is preserved.
  void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
          Register temp0, Register temp1, FloatRegister vbytes,
          FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
    // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
    // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
    // In non-vectorized code, we update s1 and s2 as:
    //   s1 <- s1 + b1
    //   s2 <- s2 + s1
    //   s1 <- s1 + b2
    //   s2 <- s2 + b1
    //   ...
    //   s1 <- s1 + b16
    //   s2 <- s2 + s1
    // Putting above assignments together, we have:
    //   s1_new = s1 + b1 + b2 + ... + b16
    //   s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
    //          = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
    //          = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
    __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));

    // s2 = s2 + s1 * 16
    __ add(s2, s2, s1, Assembler::LSL, 4);

    // vs1acc = b1 + b2 + b3 + ... + b16
    // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
    // umullv handles the low 8 byte lanes, umlalv accumulates the high 8
    __ umullv(vs2acc, __ T8B, vtable, vbytes);
    __ umlalv(vs2acc, __ T16B, vtable, vbytes);
    __ uaddlv(vs1acc, __ T16B, vbytes);  // horizontal sum of the 16 bytes
    __ uaddlv(vs2acc, __ T8H, vs2acc);   // horizontal sum of the products

    // s1 = s1 + vs1acc, s2 = s2 + vs2acc
    __ fmovd(temp0, vs1acc);
    __ fmovd(temp1, vs2acc);
    __ add(s1, s1, temp0);
    __ add(s2, s2, temp1);
  }
7766
7767 /**
7768 * Arguments:
7769 *
7770 * Input:
7771 * c_rarg0 - x address
7772 * c_rarg1 - x length
7773 * c_rarg2 - y address
7774 * c_rarg3 - y length
7775 * c_rarg4 - z address
7776 */
7777 address generate_multiplyToLen() {
7778 __ align(CodeEntryAlignment);
7779 StubId stub_id = StubId::stubgen_multiplyToLen_id;
7780 StubCodeMark mark(this, stub_id);
7781
7782 address start = __ pc();
7783 const Register x = r0;
7784 const Register xlen = r1;
7785 const Register y = r2;
7786 const Register ylen = r3;
7787 const Register z = r4;
7788
7789 const Register tmp0 = r5;
7790 const Register tmp1 = r10;
7791 const Register tmp2 = r11;
7792 const Register tmp3 = r12;
7793 const Register tmp4 = r13;
7794 const Register tmp5 = r14;
7795 const Register tmp6 = r15;
7796 const Register tmp7 = r16;
7797
7798 BLOCK_COMMENT("Entry:");
7799 __ enter(); // required for proper stackwalking of RuntimeStub frame
7800 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7801 __ leave(); // required for proper stackwalking of RuntimeStub frame
7802 __ ret(lr);
7803
7804 return start;
7805 }
7806
7807 address generate_squareToLen() {
7808 // squareToLen algorithm for sizes 1..127 described in java code works
7809 // faster than multiply_to_len on some CPUs and slower on others, but
7810 // multiply_to_len shows a bit better overall results
7811 __ align(CodeEntryAlignment);
7812 StubId stub_id = StubId::stubgen_squareToLen_id;
7813 StubCodeMark mark(this, stub_id);
7814 address start = __ pc();
7815
7816 const Register x = r0;
7817 const Register xlen = r1;
7818 const Register z = r2;
7819 const Register y = r4; // == x
7820 const Register ylen = r5; // == xlen
7821
7822 const Register tmp0 = r3;
7823 const Register tmp1 = r10;
7824 const Register tmp2 = r11;
7825 const Register tmp3 = r12;
7826 const Register tmp4 = r13;
7827 const Register tmp5 = r14;
7828 const Register tmp6 = r15;
7829 const Register tmp7 = r16;
7830
7831 RegSet spilled_regs = RegSet::of(y, ylen);
7832 BLOCK_COMMENT("Entry:");
7833 __ enter();
7834 __ push(spilled_regs, sp);
7835 __ mov(y, x);
7836 __ mov(ylen, xlen);
7837 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7838 __ pop(spilled_regs, sp);
7839 __ leave();
7840 __ ret(lr);
7841 return start;
7842 }
7843
7844 address generate_mulAdd() {
7845 __ align(CodeEntryAlignment);
7846 StubId stub_id = StubId::stubgen_mulAdd_id;
7847 StubCodeMark mark(this, stub_id);
7848
7849 address start = __ pc();
7850
7851 const Register out = r0;
7852 const Register in = r1;
7853 const Register offset = r2;
7854 const Register len = r3;
7855 const Register k = r4;
7856
7857 BLOCK_COMMENT("Entry:");
7858 __ enter();
7859 __ mul_add(out, in, offset, len, k);
7860 __ leave();
7861 __ ret(lr);
7862
7863 return start;
7864 }
7865
7866 // Arguments:
7867 //
7868 // Input:
7869 // c_rarg0 - newArr address
7870 // c_rarg1 - oldArr address
7871 // c_rarg2 - newIdx
7872 // c_rarg3 - shiftCount
7873 // c_rarg4 - numIter
7874 //
  address generate_bigIntegerRightShift() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

    Register newArr = c_rarg0;
    Register oldArr = c_rarg1;
    Register newIdx = c_rarg2;
    Register shiftCount = c_rarg3;
    Register numIter = c_rarg4;
    Register idx = numIter;  // idx aliases numIter and counts down to 0

    Register newArrCur = rscratch1;
    Register shiftRevCount = rscratch2;
    Register oldArrCur = r13;
    Register oldArrNext = r14;

    FloatRegister oldElem0 = v0;
    FloatRegister oldElem1 = v1;
    FloatRegister newElem = v2;
    FloatRegister shiftVCount = v3;
    FloatRegister shiftVRevCount = v4;

    __ cbz(idx, Exit);  // nothing to do for numIter == 0

    // advance newArr to the destination slot (newIdx is an int index)
    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);

    // left shift count: each output word is
    //   (oldArr[i+1] >>> shiftCount) | (oldArr[i] << shiftRevCount)
    __ movw(shiftRevCount, 32);
    __ subw(shiftRevCount, shiftRevCount, shiftCount);

    // numIter too small to allow a 4-words SIMD loop, rolling back
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftThree);

    // NEON ushl shifts left by a signed per-lane count, so the right shift
    // is expressed as ushl by -shiftCount
    __ dup(shiftVCount, __ T4S, shiftCount);
    __ dup(shiftVRevCount, __ T4S, shiftRevCount);
    __ negr(shiftVCount, __ T4S, shiftVCount);

    __ BIND(ShiftSIMDLoop);

    // Calculate the load addresses (walking the arrays from the high end down)
    __ sub(idx, idx, 4);
    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
    __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
    __ add(oldArrCur, oldArrNext, 4);

    // Load 4 words and process
    __ ld1(oldElem0, __ T4S, Address(oldArrCur));
    __ ld1(oldElem1, __ T4S, Address(oldArrNext));
    __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);    // logical right shift
    __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); // left shift
    __ orr(newElem, __ T16B, oldElem0, oldElem1);
    __ st1(newElem, __ T4S, Address(newArrCur));

    __ cmp(idx, (u1)4);
    __ br(Assembler::LT, ShiftTwoLoop);
    __ b(ShiftSIMDLoop);

    // SIMD tail: handle the remaining 0..3 words two at a time
    __ BIND(ShiftTwoLoop);
    __ cbz(idx, Exit);
    __ cmp(idx, (u1)1);
    __ br(Assembler::EQ, ShiftOne);

    // Calculate the load addresses
    __ sub(idx, idx, 2);
    __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
    __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
    __ add(oldArrCur, oldArrNext, 4);

    // Load 2 words and process
    __ ld1(oldElem0, __ T2S, Address(oldArrCur));
    __ ld1(oldElem1, __ T2S, Address(oldArrNext));
    __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
    __ orr(newElem, __ T8B, oldElem0, oldElem1);
    __ st1(newElem, __ T2S, Address(newArrCur));
    __ b(ShiftTwoLoop);

    // Scalar path for numIter < 4. The three cases deliberately fall
    // through: ShiftThree handles word 2 then falls into ShiftTwo (word 1)
    // and ShiftOne (word 0).
    __ BIND(ShiftThree);
    __ tbz(idx, 1, ShiftOne);  // idx == 1
    __ tbz(idx, 0, ShiftTwo);  // idx == 2
    __ ldrw(r10, Address(oldArr, 12));
    __ ldrw(r11, Address(oldArr, 8));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, Address(newArr, 8));

    __ BIND(ShiftTwo);
    __ ldrw(r10, Address(oldArr, 8));
    __ ldrw(r11, Address(oldArr, 4));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, Address(newArr, 4));

    __ BIND(ShiftOne);
    __ ldrw(r10, Address(oldArr, 4));
    __ ldrw(r11, Address(oldArr));
    __ lsrvw(r10, r10, shiftCount);
    __ lslvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, Address(newArr));

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }
7988
7989 // Arguments:
7990 //
7991 // Input:
7992 // c_rarg0 - newArr address
7993 // c_rarg1 - oldArr address
7994 // c_rarg2 - newIdx
7995 // c_rarg3 - shiftCount
7996 // c_rarg4 - numIter
7997 //
  // Left-shifts a numIter-word big-integer word array by shiftCount bits.
  // Each output word combines oldArr[i] << shiftCount with
  // oldArr[i+1] >> (32 - shiftCount); results are stored starting at
  // newArr + 4 * newIdx. Words are processed front-to-back with
  // post-increment addressing: a 4-word SIMD loop while >= 4 words remain,
  // then a 2-word SIMD loop, then a scalar tail for the last 0-3 words.
  // NOTE(review): assumes 0 < shiftCount < 32 (a zero count would make the
  // 32-bit reverse shift degenerate) — callers are expected to guarantee
  // this; not visible here.
  address generate_bigIntegerLeftShift() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;

    Register newArr = c_rarg0;
    Register oldArr = c_rarg1;
    Register newIdx = c_rarg2;
    Register shiftCount = c_rarg3;
    Register numIter = c_rarg4;

    Register shiftRevCount = rscratch1;
    Register oldArrNext = rscratch2;

    FloatRegister oldElem0 = v0;
    FloatRegister oldElem1 = v1;
    FloatRegister newElem = v2;
    FloatRegister shiftVCount = v3;
    FloatRegister shiftVRevCount = v4;

    // Nothing to do for an empty iteration count.
    __ cbz(numIter, Exit);

    // oldArrNext trails one word behind oldArr: each result word needs the
    // current word and the next one.
    __ add(oldArrNext, oldArr, 4);
    __ add(newArr, newArr, newIdx, Assembler::LSL, 2);

    // right shift count
    __ movw(shiftRevCount, 32);
    __ subw(shiftRevCount, shiftRevCount, shiftCount);

    // numIter too small to allow a 4-words SIMD loop, rolling back
    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftThree);

    // Vector shift counts. ushl shifts left for positive counts and right
    // for negative ones, so the reverse (right-shift) count is negated.
    __ dup(shiftVCount, __ T4S, shiftCount);
    __ dup(shiftVRevCount, __ T4S, shiftRevCount);
    __ negr(shiftVRevCount, __ T4S, shiftVRevCount);

    __ BIND(ShiftSIMDLoop);

    // load 4 words and process
    __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
    __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
    __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
    __ orr(newElem, __ T16B, oldElem0, oldElem1);
    __ st1(newElem, __ T4S, __ post(newArr, 16));
    __ sub(numIter, numIter, 4);

    __ cmp(numIter, (u1)4);
    __ br(Assembler::LT, ShiftTwoLoop);
    __ b(ShiftSIMDLoop);

    // 2-word SIMD loop for the 0-3 remaining words (1 left over for ShiftOne).
    __ BIND(ShiftTwoLoop);
    __ cbz(numIter, Exit);
    __ cmp(numIter, (u1)1);
    __ br(Assembler::EQ, ShiftOne);

    // load 2 words and process
    __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
    __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
    __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
    __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
    __ orr(newElem, __ T8B, oldElem0, oldElem1);
    __ st1(newElem, __ T2S, __ post(newArr, 8));
    __ sub(numIter, numIter, 2);
    __ b(ShiftTwoLoop);

    // Scalar tail, reached when numIter < 4 on entry. Emits one word, then
    // uses numIter's low bits to dispatch the remaining 0-2 words.
    __ BIND(ShiftThree);
    __ ldrw(r10, __ post(oldArr, 4));
    __ ldrw(r11, __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, __ post(newArr, 4));
    __ tbz(numIter, 1, Exit);     // numIter == 1: done
    __ tbz(numIter, 0, ShiftOne); // numIter == 2: one word left

    __ BIND(ShiftTwo);
    __ ldrw(r10, __ post(oldArr, 4));
    __ ldrw(r11, __ post(oldArrNext, 4));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, __ post(newArr, 4));

    // Last word: no post-increment needed.
    __ BIND(ShiftOne);
    __ ldrw(r10, Address(oldArr));
    __ ldrw(r11, Address(oldArrNext));
    __ lslvw(r10, r10, shiftCount);
    __ lsrvw(r11, r11, shiftRevCount);
    __ orrw(r12, r10, r11);
    __ strw(r12, Address(newArr));

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }
8099
  // Counts leading non-negative bytes (sign bit clear) of a byte array.
  //   ary1 (r1)   - array address
  //   len  (r2)   - number of bytes
  //   result (r0) - must hold a copy of len on entry (see precondition below);
  //                 on return, a conservative count of leading bytes known to
  //                 be non-negative (== len when no negative byte was found).
  // Also sets count_positives_long to a second entry point that may be used
  // directly when len > 15.
  address generate_count_positives(address &count_positives_long) {
    const u1 large_loop_size = 64;
    // Sign bit of each of the 8 bytes in a 64-bit load: tst against this
    // mask detects a negative byte in any lane at once.
    const uint64_t UPPER_BIT_MASK=0x8080808080808080;
    int dcache_line = VM_Version::dcache_line_size();

    Register ary1 = r1, len = r2, result = r0;

    __ align(CodeEntryAlignment);

    StubId stub_id = StubId::stubgen_count_positives_id;
    StubCodeMark mark(this, stub_id);

    address entry = __ pc();

    __ enter();
    // precondition: a copy of len is already in result
    // __ mov(result, len);

    Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
        LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;

    __ cmp(len, (u1)15);
    __ br(Assembler::GT, LEN_OVER_15);
    // The only case when execution falls into this code is when pointer is near
    // the end of memory page and we have to avoid reading next page
    __ add(ary1, ary1, len);
    __ subs(len, len, 8);
    __ br(Assembler::GT, LEN_OVER_8);
    // 0 < len <= 8: load the 8 bytes ending at ary1+len, then shift out the
    // low (8 - len) bytes that precede the array.
    __ ldr(rscratch2, Address(ary1, -8));
    __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
    __ lsrv(rscratch2, rscratch2, rscratch1);
    __ tst(rscratch2, UPPER_BIT_MASK);
    // Any negative byte => 0; otherwise result keeps its initial value (len).
    __ csel(result, zr, result, Assembler::NE);
    __ leave();
    __ ret(lr);
    __ bind(LEN_OVER_8);
    // 8 < len <= 15: two (overlapping) 8-byte checks ending at ary1+len.
    __ ldp(rscratch1, rscratch2, Address(ary1, -16));
    __ sub(len, len, 8); // no data dep., then sub can be executed while loading
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_NO_POP);
    __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ tst(rscratch1, UPPER_BIT_MASK);
    __ bind(RET_NO_POP);
    __ csel(result, zr, result, Assembler::NE);
    __ leave();
    __ ret(lr);

    Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
    const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

    count_positives_long = __ pc(); // 2nd entry point

    __ enter();

    __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    // Unaligned start: check the first 16 bytes, then round ary1 up to the
    // next 16-byte boundary (re-checking the overlap is harmless).
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
    __ add(ary1, ary1, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, rscratch1);

    __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform 16-byte load as early return in pre-loop to handle situation
    // when initially aligned large array has negative values at starting bytes,
    // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
    // slower. Cases with negative bytes further ahead won't be affected that
    // much. In fact, it'll be faster due to early loads, less instructions and
    // less branches in LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
    __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue load instructions first, since it can save few CPU/MEM cycles, also
    // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
    // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
    // instructions per cycle and have less branches, but this approach disables
    // early return, thus, all 64 bytes are loaded and checked every time.
    __ ldp(tmp2, tmp3, Address(ary1));
    __ ldp(tmp4, tmp5, Address(ary1, 16));
    __ ldp(rscratch1, rscratch2, Address(ary1, 32));
    __ ldp(tmp6, tmp1, Address(ary1, 48));
    __ add(ary1, ary1, large_loop_size);
    __ sub(len, len, large_loop_size);
    __ orr(tmp2, tmp2, tmp3);
    __ orr(tmp4, tmp4, tmp5);
    __ orr(rscratch1, rscratch1, rscratch2);
    __ orr(tmp6, tmp6, tmp1);
    __ orr(tmp2, tmp2, tmp4);
    __ orr(rscratch1, rscratch1, tmp6);
    __ orr(tmp2, tmp2, rscratch1);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_LONG);
    __ cmp(len, large_loop_size);
    __ br(Assembler::GE, LARGE_LOOP);

    __ bind(CHECK_16); // small 16-byte load pre-loop
    __ cmp(len, (u1)16);
    __ br(Assembler::LT, POST_LOOP16);

    __ bind(LOOP16); // small 16-byte load loop
    __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp2, tmp2, tmp3);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST_16);
    __ cmp(len, (u1)16);
    __ br(Assembler::GE, LOOP16); // 16-byte load loop end

    __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
    __ cmp(len, (u1)8);
    __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
    __ ldr(tmp3, Address(__ post(ary1, 8)));
    __ tst(tmp3, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    __ sub(len, len, 8);

    __ bind(POST_LOOP16_LOAD_TAIL);
    __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
    __ ldr(tmp1, Address(ary1));
    __ mov(tmp2, 64);
    // Shift out the (8 - len) trailing bytes that lie beyond the array.
    __ sub(tmp4, tmp2, len, __ LSL, 3);
    __ lslv(tmp1, tmp1, tmp4);
    __ tst(tmp1, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_ADJUST);
    // Fallthrough

    __ bind(RET_LEN);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);

    // difference result - len is the count of guaranteed to be
    // positive bytes

    __ bind(RET_ADJUST_LONG);
    __ add(len, len, (u1)(large_loop_size - 16));
    __ bind(RET_ADJUST_16);
    __ add(len, len, 16);
    __ bind(RET_ADJUST);
    __ pop(spilled_regs, sp);
    __ leave();
    __ sub(result, result, len);
    __ ret(lr);

    return entry;
  }
8268
  // Emits the non-SIMD comparison loop for generate_large_array_equals.
  // Each iteration consumes 64 bytes from each array (a1, a2 advance by
  // post-increment) and decrements cnt1 by 64. Loads are software-pipelined:
  // the pair loaded before the loop (and on each step) is compared on the
  // following step, hiding load latency. Branches to NOT_EQUAL on the first
  // mismatching 16-byte group; falls through once cnt1 < loopThreshold.
  void generate_large_array_equals_loop_nonsimd(int loopThreshold,
        bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label LOOP;

    // Prime the pipeline with the first 16 bytes of each array.
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    // Compare the previously loaded pair while fetching the next one; a
    // mismatch shows up as a non-zero XOR.
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
    __ orr(tmp1, tmp1, tmp3);
    __ cbnz(tmp1, NOT_EQUAL);
    __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
    __ eor(tmp5, tmp5, tmp6);
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ eor(tmp7, tmp7, tmp8);
    __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    // tmp6 is not used. MacroAssembler::subs is used here (rather than
    // cmp) because subs allows an unlimited range of immediate operand.
    __ subs(tmp6, cnt1, loopThreshold);
    __ orr(tmp5, tmp5, tmp7);
    __ cbnz(tmp5, NOT_EQUAL);
    __ br(__ GE, LOOP);
    // post-loop
    // Drain the pipeline: compare the pair loaded on the final iteration.
    __ eor(tmp1, tmp1, tmp2);
    __ eor(tmp3, tmp3, tmp4);
    __ orr(tmp1, tmp1, tmp3);
    __ sub(cnt1, cnt1, 2 * wordSize);
    __ cbnz(tmp1, NOT_EQUAL);
  }
8319
  // Emits the SIMD comparison loop for generate_large_array_equals. Each
  // iteration loads 64 bytes from each array into v0-v3 / v4-v7, XORs the
  // corresponding registers and ORs all results together: any set bit means
  // a mismatch (branch to NOT_EQUAL). cnt1 is decremented by 64 per
  // iteration; the loop exits once cnt1 < loopThreshold.
  void generate_large_array_equals_loop_simd(int loopThreshold,
        bool usePrefetch, Label &NOT_EQUAL) {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2;
    Label LOOP;

    __ bind(LOOP);
    if (usePrefetch) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
    __ sub(cnt1, cnt1, 8 * wordSize);
    __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
    // Sets flags for the loop-back branch below; SIMD ops don't touch them.
    __ subs(tmp1, cnt1, loopThreshold);
    __ eor(v0, __ T16B, v0, v4);
    __ eor(v1, __ T16B, v1, v5);
    __ eor(v2, __ T16B, v2, v6);
    __ eor(v3, __ T16B, v3, v7);
    __ orr(v0, __ T16B, v0, v1);
    __ orr(v1, __ T16B, v2, v3);
    __ orr(v0, __ T16B, v0, v1);
    // Reduce the 128-bit OR result to a scalar for the zero test.
    __ umov(tmp1, v0, __ D, 0);
    __ umov(tmp2, v0, __ D, 1);
    __ orr(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GE, LOOP);
  }
8348
8349 // a1 = r1 - array1 address
8350 // a2 = r2 - array2 address
8351 // result = r0 - return value. Already contains "false"
8352 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
8353 // r3-r5 are reserved temporary registers
8354 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
  // Compares two large byte ranges for equality. See the register contract
  // in the comment block above; result (r0) arrives as "false" and is set
  // to true only when all bytes match. Dispatches to a SIMD or non-SIMD
  // 64-bytes-per-iteration loop, then finishes with an 8-byte tail loop and
  // a final (possibly overlapping) 8-byte compare.
  address generate_large_array_equals() {
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
        tmp7 = r12, tmp8 = r13;
    Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
        SMALL_LOOP, POST_LOOP;
    const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
    // calculate if at least 32 prefetched bytes are used
    int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
    int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
    RegSet spilled_regs = RegSet::range(tmp6, tmp8);
    assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
        tmp5, tmp6, tmp7, tmp8);

    __ align(CodeEntryAlignment);

    StubId stub_id = StubId::stubgen_large_array_equals_id;
    StubCodeMark mark(this, stub_id);

    address entry = __ pc();
    __ enter();
    __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
    // also advance pointers to use post-increment instead of pre-increment
    __ add(a1, a1, wordSize);
    __ add(a2, a2, wordSize);
    if (AvoidUnalignedAccesses) {
      // both implementations (SIMD/nonSIMD) are using relatively large load
      // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
      // on some CPUs in case of address is not at least 16-byte aligned.
      // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
      // load if needed at least for 1st address and make it 16-byte aligned.
      Label ALIGNED16;
      __ tbz(a1, 3, ALIGNED16);
      __ ldr(tmp1, Address(__ post(a1, wordSize)));
      __ ldr(tmp2, Address(__ post(a2, wordSize)));
      __ sub(cnt1, cnt1, wordSize);
      __ eor(tmp1, tmp1, tmp2);
      // Registers aren't spilled yet, so exit through the no-pop label.
      __ cbnz(tmp1, NOT_EQUAL_NO_POP);
      __ bind(ALIGNED16);
    }
    if (UseSIMDForArrayEquals) {
      if (SoftwarePrefetchHintDistance >= 0) {
        __ subs(tmp1, cnt1, prefetchLoopThreshold);
        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
        generate_large_array_equals_loop_simd(prefetchLoopThreshold,
            /* prfm = */ true, NOT_EQUAL);
        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
        __ br(__ LT, TAIL);
      }
      __ bind(NO_PREFETCH_LARGE_LOOP);
      generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
          /* prfm = */ false, NOT_EQUAL);
    } else {
      // Non-SIMD path needs tmp6..tmp8 as extra scratch; spill them here.
      __ push(spilled_regs, sp);
      if (SoftwarePrefetchHintDistance >= 0) {
        __ subs(tmp1, cnt1, prefetchLoopThreshold);
        __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
        generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
            /* prfm = */ true, NOT_EQUAL);
        __ subs(zr, cnt1, nonPrefetchLoopThreshold);
        __ br(__ LT, TAIL);
      }
      __ bind(NO_PREFETCH_LARGE_LOOP);
      generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
          /* prfm = */ false, NOT_EQUAL);
    }
    __ bind(TAIL);
    __ cbz(cnt1, EQUAL);
    __ subs(cnt1, cnt1, wordSize);
    __ br(__ LE, POST_LOOP);
    __ bind(SMALL_LOOP);
    __ ldr(tmp1, Address(__ post(a1, wordSize)));
    __ ldr(tmp2, Address(__ post(a2, wordSize)));
    __ subs(cnt1, cnt1, wordSize);
    __ eor(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ br(__ GT, SMALL_LOOP);
    __ bind(POST_LOOP);
    // Final 8-byte compare at the very end of the ranges; cnt1 is <= 0
    // here, so Address(a1, cnt1) backs up to the last full word (this load
    // may overlap bytes already compared — that's harmless).
    __ ldr(tmp1, Address(a1, cnt1));
    __ ldr(tmp2, Address(a2, cnt1));
    __ eor(tmp1, tmp1, tmp2);
    __ cbnz(tmp1, NOT_EQUAL);
    __ bind(EQUAL);
    __ mov(result, true);
    __ bind(NOT_EQUAL);
    if (!UseSIMDForArrayEquals) {
      __ pop(spilled_regs, sp);
    }
    __ bind(NOT_EQUAL_NO_POP);
    __ leave();
    __ ret(lr);
    return entry;
  }
8448
8449 // result = r0 - return value. Contains initial hashcode value on entry.
8450 // ary = r1 - array address
8451 // cnt = r2 - elements count
8452 // Clobbers: v0-v13, rscratch1, rscratch2
  // Computes the 31-based polynomial hash of a large array of the given
  // element type (see the register contract in the comment block above).
  // Structure: a SMALL loop handles (cnt % evf) elements vf at a time, a
  // LARGE loop handles evf elements per iteration with 4 parallel vector
  // accumulators, and an unrolled scalar TAIL handles the last cnt % vf
  // elements via a computed branch into the unrolled sequence.
  address generate_large_arrays_hashcode(BasicType eltype) {
    const Register result = r0, ary = r1, cnt = r2;
    const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
    const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
    const FloatRegister vpow = v12;  // powers of 31: <31^3, ..., 31^0>
    const FloatRegister vpowm = v13;

    ARRAYS_HASHCODE_REGISTERS;

    Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;

    // Per-element-type load shape: subword types are loaded 8 at a time and
    // widened to 8H before accumulating; ints are loaded 4 at a time.
    unsigned int vf; // vectorization factor
    bool multiply_by_halves;
    Assembler::SIMD_Arrangement load_arrangement;
    switch (eltype) {
    case T_BOOLEAN:
    case T_BYTE:
      load_arrangement = Assembler::T8B;
      multiply_by_halves = true;
      vf = 8;
      break;
    case T_CHAR:
    case T_SHORT:
      load_arrangement = Assembler::T8H;
      multiply_by_halves = true;
      vf = 8;
      break;
    case T_INT:
      load_arrangement = Assembler::T4S;
      multiply_by_halves = false;
      vf = 4;
      break;
    default:
      ShouldNotReachHere();
    }

    // Unroll factor
    const unsigned uf = 4;

    // Effective vectorization factor
    const unsigned evf = vf * uf;

    __ align(CodeEntryAlignment);

    StubId stub_id;
    switch (eltype) {
    case T_BOOLEAN:
      stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
      break;
    case T_BYTE:
      stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
      break;
    case T_CHAR:
      stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
      break;
    case T_SHORT:
      stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
      break;
    case T_INT:
      stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
      break;
    default:
      stub_id = StubId::NO_STUBID;
      ShouldNotReachHere();
    };

    StubCodeMark mark(this, stub_id);

    address entry = __ pc();
    __ enter();

    // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
    // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
    // value shouldn't change throughout both loops.
    __ movw(rscratch1, intpow(31U, 3));
    __ mov(vpow, Assembler::S, 0, rscratch1);
    __ movw(rscratch1, intpow(31U, 2));
    __ mov(vpow, Assembler::S, 1, rscratch1);
    __ movw(rscratch1, intpow(31U, 1));
    __ mov(vpow, Assembler::S, 2, rscratch1);
    __ movw(rscratch1, intpow(31U, 0));
    __ mov(vpow, Assembler::S, 3, rscratch1);

    // Seed lane 3 of the primary accumulator with the incoming hash value.
    __ mov(vmul0, Assembler::T16B, 0);
    __ mov(vmul0, Assembler::S, 3, result);

    // Elements for the SMALL loop: cnt % evf rounded down to a multiple of
    // vf (the mask (uf - 1) * vf extracts exactly those bits of cnt).
    __ andr(rscratch2, cnt, (uf - 1) * vf);
    __ cbz(rscratch2, LARGE_LOOP_PREHEADER);

    // Per-step accumulator multiplier: 31^(elements folded per mulvs).
    __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
    __ mov(vpowm, Assembler::S, 0, rscratch1);

    // SMALL LOOP
    __ bind(SMALL_LOOP);

    __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
    __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
    __ subsw(rscratch2, rscratch2, vf);

    if (load_arrangement == Assembler::T8B) {
      // Extend 8B to 8H to be able to use vector multiply
      // instructions
      assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
      if (is_signed_subword_type(eltype)) {
        __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      } else {
        __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      }
    }

    switch (load_arrangement) {
    case Assembler::T4S:
      __ addv(vmul0, load_arrangement, vmul0, vdata0);
      break;
    case Assembler::T8B:
    case Assembler::T8H:
      assert(is_subword_type(eltype), "subword type expected");
      // Widening add of the lower 4 halfwords into the 4S accumulator.
      if (is_signed_subword_type(eltype)) {
        __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      } else {
        __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      }
      break;
    default:
      __ should_not_reach_here();
    }

    // Process the upper half of a vector
    if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
      __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
      if (is_signed_subword_type(eltype)) {
        __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      } else {
        __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      }
    }

    __ br(Assembler::HI, SMALL_LOOP);

    // SMALL LOOP'S EPILOGUE
    // Skip the reduction if the LARGE loop still has work to do.
    __ lsr(rscratch2, cnt, exact_log2(evf));
    __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);

    // Reduce: dot the accumulator lanes with <31^3..31^0> and sum across.
    __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
    __ addv(vmul0, Assembler::T4S, vmul0);
    __ umov(result, vmul0, Assembler::S, 0);

    // TAIL
    __ bind(TAIL);

    // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
    // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
    assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
    __ andr(rscratch2, cnt, vf - 1);
    __ bind(TAIL_SHORTCUT);
    __ adr(rscratch1, BR_BASE);
    // For Cortex-A53 offset is 4 because 2 nops are generated.
    __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
    __ movw(rscratch2, 0x1f); // 31, the multiplier for the scalar madd below
    __ br(rscratch1);

    for (size_t i = 0; i < vf - 1; ++i) {
      __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
              eltype);
      __ maddw(result, result, rscratch2, rscratch1);
      // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
      // Generate 2nd nop to have 4 instructions per iteration.
      if (VM_Version::supports_a53mac()) {
        __ nop();
      }
    }
    __ bind(BR_BASE);

    __ leave();
    __ ret(lr);

    // LARGE LOOP
    __ bind(LARGE_LOOP_PREHEADER);

    // Iteration count: cnt / evf.
    __ lsr(rscratch2, cnt, exact_log2(evf));

    if (multiply_by_halves) {
      // 31^4 - multiplier between lower and upper parts of a register
      __ movw(rscratch1, intpow(31U, vf / 2));
      __ mov(vpowm, Assembler::S, 1, rscratch1);
      // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
      __ movw(rscratch1, intpow(31U, evf - vf / 2));
      __ mov(vpowm, Assembler::S, 0, rscratch1);
    } else {
      // 31^16
      __ movw(rscratch1, intpow(31U, evf));
      __ mov(vpowm, Assembler::S, 0, rscratch1);
    }

    __ mov(vmul3, Assembler::T16B, 0);
    __ mov(vmul2, Assembler::T16B, 0);
    __ mov(vmul1, Assembler::T16B, 0);

    __ bind(LARGE_LOOP);

    // Scale all four accumulators by the per-iteration power of 31.
    __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
    __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
    __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
    __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);

    __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
           Address(__ post(ary, evf * type2aelembytes(eltype))));

    if (load_arrangement == Assembler::T8B) {
      // Extend 8B to 8H to be able to use vector multiply
      // instructions
      assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
      if (is_signed_subword_type(eltype)) {
        __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
        __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
        __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
        __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      } else {
        __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
        __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
        __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
        __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
      }
    }

    switch (load_arrangement) {
    case Assembler::T4S:
      __ addv(vmul3, load_arrangement, vmul3, vdata3);
      __ addv(vmul2, load_arrangement, vmul2, vdata2);
      __ addv(vmul1, load_arrangement, vmul1, vdata1);
      __ addv(vmul0, load_arrangement, vmul0, vdata0);
      break;
    case Assembler::T8B:
    case Assembler::T8H:
      assert(is_subword_type(eltype), "subword type expected");
      if (is_signed_subword_type(eltype)) {
        __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
        __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
        __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
        __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      } else {
        __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
        __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
        __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
        __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
      }
      break;
    default:
      __ should_not_reach_here();
    }

    // Process the upper half of a vector
    if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
      // Scale by 31^(vf/2) (vpowm lane 1) before folding in the upper halves.
      __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
      __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
      __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
      __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
      if (is_signed_subword_type(eltype)) {
        __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
        __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
        __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
        __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      } else {
        __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
        __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
        __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
        __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
      }
    }

    __ subsw(rscratch2, rscratch2, 1);
    __ br(Assembler::HI, LARGE_LOOP);

    // LARGE LOOP'S EPILOGUE: reduce each accumulator (dotted with
    // <31^3..31^0>, summed across lanes) and chain the four partial hashes
    // together with multiplier 31^vf:
    //   result = ((vmul3 * 31^vf + vmul2) * 31^vf + vmul1) * 31^vf + vmul0
    __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
    __ addv(vmul3, Assembler::T4S, vmul3);
    __ umov(result, vmul3, Assembler::S, 0);

    __ mov(rscratch2, intpow(31U, vf));

    __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
    __ addv(vmul2, Assembler::T4S, vmul2);
    __ umov(rscratch1, vmul2, Assembler::S, 0);
    __ maddw(result, result, rscratch2, rscratch1);

    __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
    __ addv(vmul1, Assembler::T4S, vmul1);
    __ umov(rscratch1, vmul1, Assembler::S, 0);
    __ maddw(result, result, rscratch2, rscratch1);

    __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
    __ addv(vmul0, Assembler::T4S, vmul0);
    __ umov(rscratch1, vmul0, Assembler::S, 0);
    __ maddw(result, result, rscratch2, rscratch1);

    // Fewer than vf elements may remain — finish them in the scalar tail.
    __ andr(rscratch2, cnt, vf - 1);
    __ cbnz(rscratch2, TAIL_SHORTCUT);

    __ leave();
    __ ret(lr);

    return entry;
  }
8755
  // Generates the dsin/dcos stub (isCos selects which StubId and which
  // function body). Delegates the actual code emission to
  // MacroAssembler::generate_dsin_dcos, passing the precomputed constant
  // tables (pi-related constants and polynomial coefficients) from
  // StubRoutines::aarch64.
  address generate_dsin_dcos(bool isCos) {
    __ align(CodeEntryAlignment);
    StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
        (address)StubRoutines::aarch64::_two_over_pi,
        (address)StubRoutines::aarch64::_pio2,
        (address)StubRoutines::aarch64::_dsin_coef,
        (address)StubRoutines::aarch64::_dcos_coef);
    return start;
  }
8768
8769 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
  // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
  // On entry (set up by the caller): tmp2 (r11) points into the Latin1
  // string, cnt1 (r2) points into the UTF-16 string, and vtmpZ (v0) is
  // zeroed. 16 Latin1 bytes are loaded and inflated to UTF-16 by
  // interleaving with zero bytes (zip1/zip2), then compared 8 bytes
  // (4 characters) at a time against the UTF-16 data, which alternates
  // between tmp3 and tmpU. On mismatch, branches to DIFF1 (difference vs
  // the tmpU-held word) or DIFF2 (difference vs the tmp3-held word).
  // All pointers advance via post-increment.
  void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
      Label &DIFF2) {
    Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
    FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;

    // Load 16 Latin1 bytes and the first 8 UTF-16 bytes; inflate the low
    // 8 Latin1 bytes into vtmp3.
    __ ldrq(vtmp, Address(__ post(tmp2, 16)));
    __ ldr(tmpU, Address(__ post(cnt1, 8)));
    __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
    // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3

    // Compare inflated chars 0-3 against the word previously loaded in tmp3.
    __ fmovd(tmpL, vtmp3);
    __ eor(rscratch2, tmp3, tmpL);
    __ cbnz(rscratch2, DIFF2);

    // Compare inflated chars 4-7 against tmpU while prefetching the next word.
    __ ldr(tmp3, Address(__ post(cnt1, 8)));
    __ umov(tmpL, vtmp3, __ D, 1);
    __ eor(rscratch2, tmpU, tmpL);
    __ cbnz(rscratch2, DIFF1);

    // Inflate the high 8 Latin1 bytes and compare chars 8-11.
    __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
    __ ldr(tmpU, Address(__ post(cnt1, 8)));
    __ fmovd(tmpL, vtmp);
    __ eor(rscratch2, tmp3, tmpL);
    __ cbnz(rscratch2, DIFF2);

    // Compare chars 12-15.
    __ ldr(tmp3, Address(__ post(cnt1, 8)));
    __ umov(tmpL, vtmp, __ D, 1);
    __ eor(rscratch2, tmpU, tmpL);
    __ cbnz(rscratch2, DIFF1);
  }
8800
8801 // r0 = result
8802 // r1 = str1
8803 // r2 = cnt1
8804 // r3 = str2
8805 // r4 = cnt2
8806 // r10 = tmp1
8807 // r11 = tmp2
  // Generates the intrinsic stub that compares a long Latin1 string with a
  // long UTF-16 string. Latin1 bytes are inflated to 16-bit chars on the fly
  // (zip1/zip2 against the zero vector vtmpZ) and compared against the UTF-16
  // data 8 bytes at a time; the first mismatching longword falls through to
  // CALCULATE_DIFFERENCE, which extracts the first differing char pair and
  // returns their signed difference in r0.
  //
  // isLU == true : str1 is Latin1, str2 is UTF-16; false: the reverse.
  //
  // On entry the caller has already loaded the first 4 chars of each string
  // (vtmp holds Latin1 bytes; tmp2 (LU) / tmp1 (UL) holds the UTF-16
  // longword) -- see the comments at the first zip1/fmovd below.
  // tmp3/tmp4 are callee-saved here via spilled_regs.
  address generate_compare_long_string_different_encoding(bool isLU) {
    __ align(CodeEntryAlignment);
    StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
        DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
        LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
    FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
    RegSet spilled_regs = RegSet::of(tmp3, tmp4);

    // Stay in the prefetching loop only while at least this many chars
    // remain, so software prefetches don't run far past the array ends.
    int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);

    __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); // zero vector used to inflate Latin1
    // cnt2 == amount of characters left to compare
    // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
    __ add(str1, str1, isLU ? wordSize/2 : wordSize);
    __ add(str2, str2, isLU ? wordSize : wordSize/2);
    __ fmovd(isLU ? tmp1 : tmp2, vtmp);
    __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
    __ eor(rscratch2, tmp1, tmp2);
    __ mov(rscratch1, tmp2);
    __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
    Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
        tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
    __ push(spilled_regs, sp);
    __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
    __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load

    // Pre-load the first UTF-16 longword; compare_string_16_x_LU keeps
    // this one-load-ahead pipeline going.
    __ ldr(tmp3, Address(__ post(cnt1, 8)));

    if (SoftwarePrefetchHintDistance >= 0) {
      __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
      __ br(__ LT, NO_PREFETCH);
      __ bind(LARGE_LOOP_PREFETCH);
        __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
        __ mov(tmp4, 2);
        __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
        // Two inner repeat loops of 2 x 16 chars each cover 64 chars per
        // prefetched region before re-checking the exit condition.
        __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
          __ subs(tmp4, tmp4, 1);
          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
          __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
          __ mov(tmp4, 2);
        __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
          compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
          __ subs(tmp4, tmp4, 1);
          __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
        __ sub(cnt2, cnt2, 64);
        __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
        __ br(__ GE, LARGE_LOOP_PREFETCH);
    }
    __ cbz(cnt2, LOAD_LAST); // no characters left except last load
    __ bind(NO_PREFETCH);
    __ subs(cnt2, cnt2, 16);
    __ br(__ LT, TAIL);
    __ align(OptoLoopAlignment);
    __ bind(SMALL_LOOP); // smaller loop
      __ subs(cnt2, cnt2, 16);
      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
      __ br(__ GE, SMALL_LOOP);
      __ cmn(cnt2, (u1)16);
      __ br(__ EQ, LOAD_LAST);
    __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
      // Back up both pointers so the final 16-char compare ends exactly at
      // the last-4-chars position, re-comparing some already-equal chars.
      __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
      __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
      __ ldr(tmp3, Address(cnt1, -8));
      compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
      __ b(LOAD_LAST);
    __ bind(DIFF2);
      __ mov(tmpU, tmp3);
    __ bind(DIFF1);
      __ pop(spilled_regs, sp);
      __ b(CALCULATE_DIFFERENCE);
    __ bind(LOAD_LAST);
    // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
    // No need to load it again
    __ mov(tmpU, tmp3);
    __ pop(spilled_regs, sp);

    // tmp2 points to the address of the last 4 Latin1 characters right now
    __ ldrs(vtmp, Address(tmp2));
    __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
    __ fmovd(tmpL, vtmp);

    __ eor(rscratch2, tmpU, tmpL);
    __ cbz(rscratch2, DONE);

    // Find the first different characters in the longwords and
    // compute their difference.
    __ bind(CALCULATE_DIFFERENCE);
      // rscratch2 holds (U ^ L); locate the lowest differing 16-bit char,
      // shift both longwords down to it, and subtract the zero-extended chars.
      __ rev(rscratch2, rscratch2);
      __ clz(rscratch2, rscratch2);
      __ andr(rscratch2, rscratch2, -16);
      __ lsrv(tmp1, tmp1, rscratch2);
      __ uxthw(tmp1, tmp1);
      __ lsrv(rscratch1, rscratch1, rscratch2);
      __ uxthw(rscratch1, rscratch1);
      __ subw(result, tmp1, rscratch1);
    __ bind(DONE);
      __ ret(lr);
    return entry;
  }
8914
8915 // r0 = input (float16)
8916 // v0 = result (float)
8917 // v1 = temporary float register
8918 address generate_float16ToFloat() {
8919 __ align(CodeEntryAlignment);
8920 StubId stub_id = StubId::stubgen_hf2f_id;
8921 StubCodeMark mark(this, stub_id);
8922 address entry = __ pc();
8923 BLOCK_COMMENT("Entry:");
8924 __ flt16_to_flt(v0, r0, v1);
8925 __ ret(lr);
8926 return entry;
8927 }
8928
8929 // v0 = input (float)
8930 // r0 = result (float16)
8931 // v1 = temporary float register
8932 address generate_floatToFloat16() {
8933 __ align(CodeEntryAlignment);
8934 StubId stub_id = StubId::stubgen_f2hf_id;
8935 StubCodeMark mark(this, stub_id);
8936 address entry = __ pc();
8937 BLOCK_COMMENT("Entry:");
8938 __ flt_to_flt16(r0, v0, v1);
8939 __ ret(lr);
8940 return entry;
8941 }
8942
  // Generates the nmethod entry barrier slow-path stub. It calls
  // BarrierSetNMethod::nmethod_stub_entry_barrier via call_VM_leaf with the
  // address of the caller's saved lr as argument; a non-zero return value
  // means the nmethod is bad and we must deoptimize by restoring the frame
  // state that the runtime wrote into the 4-word scratch area reserved below.
  address generate_method_entry_barrier() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_method_entry_barrier_id;
    StubCodeMark mark(this, stub_id);

    Label deoptimize_label;

    address start = __ pc();

    BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();

    if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
      BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
      // We can get here despite the nmethod being good, if we have not
      // yet applied our cross modification fence (or data fence).
      Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
      // Record the global patching epoch in the thread, then fence so that
      // subsequently fetched instructions/data are up to date.
      __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
      __ ldrw(rscratch2, rscratch2);
      __ strw(rscratch2, thread_epoch_addr);
      __ isb();
      __ membar(__ LoadLoad);
    }

    __ set_last_Java_frame(sp, rfp, lr, rscratch1);

    __ enter();
    __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr

    __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}

    __ push_call_clobbered_registers();

    __ mov(c_rarg0, rscratch2);
    __ call_VM_leaf
         (CAST_FROM_FN_PTR
          (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);

    __ reset_last_Java_frame(true);

    // Preserve the runtime's answer across the register restore.
    __ mov(rscratch1, r0);

    __ pop_call_clobbered_registers();

    // Non-zero result: the nmethod was disarmed/made non-entrant; deoptimize.
    __ cbnz(rscratch1, deoptimize_label);

    __ leave();
    __ ret(lr);

    __ BIND(deoptimize_label);

    // The runtime filled the scratch area with the frame to resume:
    // {new sp, new fp, lr, new pc}. Install it and jump.
    __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
    __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));

    __ mov(sp, rscratch1);
    __ br(rscratch2);

    return start;
  }
9001
9002 // r0 = result
9003 // r1 = str1
9004 // r2 = cnt1
9005 // r3 = str2
9006 // r4 = cnt2
9007 // r10 = tmp1
9008 // r11 = tmp2
  // Generates the intrinsic stub comparing two long strings of the SAME
  // encoding (both Latin1 when isLL, both UTF-16 otherwise). Data is compared
  // 16 bytes per iteration with ldp/cmp/ccmp; the first differing longword is
  // resolved in CAL_DIFFERENCE to the signed difference of the first
  // mismatching char (byte for LL, 16-bit char for UU), returned in r0.
  //
  // On entry the caller has pre-loaded the first 8 bytes of each string into
  // tmp1/tmp2 -- see the comment before the first eor below.
  address generate_compare_long_string_same_encoding(bool isLL) {
    __ align(CodeEntryAlignment);
    StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
        tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;

    Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;

    // exit from large loop when less than 64 bytes left to read or we're about
    // to prefetch memory behind array border
    int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);

    // before jumping to stub, pre-load 8 bytes already, so do comparison directly
    __ eor(rscratch2, tmp1, tmp2);
    __ cbnz(rscratch2, CAL_DIFFERENCE);

    __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
    // update pointers, because of previous read
    __ add(str1, str1, wordSize);
    __ add(str2, str2, wordSize);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ align(OptoLoopAlignment);
      __ bind(LARGE_LOOP_PREFETCH);
        __ prfm(Address(str1, SoftwarePrefetchHintDistance));
        __ prfm(Address(str2, SoftwarePrefetchHintDistance));

        // 4 x 16-byte compares = 64 bytes per prefetched region.
        for (int i = 0; i < 4; i++) {
          __ ldp(tmp1, tmp1h, Address(str1, i * 16));
          __ ldp(tmp2, tmp2h, Address(str2, i * 16));
          __ cmp(tmp1, tmp2);
          __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
          __ br(Assembler::NE, DIFF);
        }
        __ sub(cnt2, cnt2, isLL ? 64 : 32);
        __ add(str1, str1, 64);
        __ add(str2, str2, 64);
        __ subs(rscratch2, cnt2, largeLoopExitCondition);
        __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
        __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
    }

    // Non-prefetching path: unrolled-by-two 16-byte compare loop.
    __ subs(rscratch1, cnt2, isLL ? 16 : 8);
    __ br(Assembler::LE, LESS16);
    __ align(OptoLoopAlignment);
    __ bind(LOOP_COMPARE16);
      __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
      __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
      __ cmp(tmp1, tmp2);
      __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
      __ br(Assembler::NE, DIFF);
      __ sub(cnt2, cnt2, isLL ? 16 : 8);
      __ subs(rscratch2, cnt2, isLL ? 16 : 8);
      __ br(Assembler::LT, LESS16);

      __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
      __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
      __ cmp(tmp1, tmp2);
      __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
      __ br(Assembler::NE, DIFF);
      __ sub(cnt2, cnt2, isLL ? 16 : 8);
      __ subs(rscratch2, cnt2, isLL ? 16 : 8);
      __ br(Assembler::GE, LOOP_COMPARE16);
      __ cbz(cnt2, LENGTH_DIFF);

    __ bind(LESS16);
      // each 8 compare
      __ subs(cnt2, cnt2, isLL ? 8 : 4);
      __ br(Assembler::LE, LESS8);
      __ ldr(tmp1, Address(__ post(str1, 8)));
      __ ldr(tmp2, Address(__ post(str2, 8)));
      __ eor(rscratch2, tmp1, tmp2);
      __ cbnz(rscratch2, CAL_DIFFERENCE);
      __ sub(cnt2, cnt2, isLL ? 8 : 4);

    __ bind(LESS8); // directly load last 8 bytes
      // Load the final 8 bytes ending at the string ends (may overlap bytes
      // already compared equal; harmless).
      if (!isLL) {
        __ add(cnt2, cnt2, cnt2);
      }
      __ ldr(tmp1, Address(str1, cnt2));
      __ ldr(tmp2, Address(str2, cnt2));
      __ eor(rscratch2, tmp1, tmp2);
      __ cbz(rscratch2, LENGTH_DIFF);
      __ b(CAL_DIFFERENCE);

    __ bind(DIFF);
      // Select whichever half of the 16-byte chunk actually differed.
      __ cmp(tmp1, tmp2);
      __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
      __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
      // reuse rscratch2 register for the result of eor instruction
      __ eor(rscratch2, tmp1, tmp2);

    __ bind(CAL_DIFFERENCE);
      // Locate the lowest differing char via rev+clz, shift it down in both
      // longwords, zero-extend, and subtract.
      __ rev(rscratch2, rscratch2);
      __ clz(rscratch2, rscratch2);
      __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
      __ lsrv(tmp1, tmp1, rscratch2);
      __ lsrv(tmp2, tmp2, rscratch2);
      if (isLL) {
        __ uxtbw(tmp1, tmp1);
        __ uxtbw(tmp2, tmp2);
      } else {
        __ uxthw(tmp1, tmp1);
        __ uxthw(tmp2, tmp2);
      }
      __ subw(result, tmp1, tmp2);

    __ bind(LENGTH_DIFF);
      __ ret(lr);
    return entry;
  }
9121
  // Encoding pair for the string-compare stubs: the first letter is str1's
  // encoding and the second is str2's (L = Latin1, U = UTF-16).
  enum string_compare_mode {
    LL,
    LU,
    UL,
    UU,
  };
9128
9129 // The following registers are declared in aarch64.ad
9130 // r0 = result
9131 // r1 = str1
9132 // r2 = cnt1
9133 // r3 = str2
9134 // r4 = cnt2
9135 // r10 = tmp1
9136 // r11 = tmp2
9137 // z0 = ztmp1
9138 // z1 = ztmp2
9139 // p0 = pgtmp1
9140 // p1 = pgtmp2
  // SVE variant of the long-string compare stubs, one per encoding pair.
  // Processes one full vector of chars per iteration using whilelt-governed
  // predicated loads (Latin1 inputs are widened by the ld1b-to-H form), with
  // a predicated last iteration for the tail. On mismatch, brkb crops the
  // predicate up to the first differing lane and lasta extracts the two
  // chars whose signed difference is returned in r0.
  address generate_compare_long_string_sve(string_compare_mode mode) {
    StubId stub_id;
    switch (mode) {
      case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
      case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
      case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
      case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
      default: ShouldNotReachHere();
    }

    __ align(CodeEntryAlignment);
    address entry = __ pc();
    Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
             tmp1 = r10, tmp2 = r11;

    Label LOOP, DONE, MISMATCH;
    Register vec_len = tmp1;
    Register idx = tmp2;
    // The minimum of the string lengths has been stored in cnt2.
    Register cnt = cnt2;
    FloatRegister ztmp1 = z0, ztmp2 = z1;
    PRegister pgtmp1 = p0, pgtmp2 = p1;

// Load one vector of chars from each string at element index idx, governed
// by pgtmp1. Latin1 sides use ld1b with an H-sized element so bytes are
// zero-extended to 16-bit chars; UTF-16 sides use ld1h with a *2 scaled
// address.
#define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx)                       \
    switch (mode) {                                                            \
      case LL:                                                                 \
        __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx));                  \
        __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx));                  \
        break;                                                                 \
      case LU:                                                                 \
        __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx));                  \
        __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
        break;                                                                 \
      case UL:                                                                 \
        __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
        __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx));                  \
        break;                                                                 \
      case UU:                                                                 \
        __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
        __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
        break;                                                                 \
      default:                                                                 \
        ShouldNotReachHere();                                                  \
    }

    StubCodeMark mark(this, stub_id);

    __ mov(idx, 0);
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    // vec_len = number of chars a vector holds (bytes for LL, halfwords else).
    if (mode == LL) {
      __ sve_cntb(vec_len);
    } else {
      __ sve_cnth(vec_len);
    }

    // rscratch1 = last index from which a full vector can still be read.
    __ sub(rscratch1, cnt, vec_len);

    __ bind(LOOP);

      // main loop
      LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
      __ add(idx, idx, vec_len);
      // Compare strings.
      __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
      __ br(__ NE, MISMATCH);
      __ cmp(idx, rscratch1);
      __ br(__ LT, LOOP);

    // post loop, last iteration
    __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);

    LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
    __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
    __ br(__ EQ, DONE);

    __ bind(MISMATCH);

      // Crop the vector to find its location.
      __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
      // Extract the first different characters of each string.
      __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
      __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);

      // Compute the difference of the first different characters.
      __ sub(result, rscratch1, rscratch2);

    __ bind(DONE);
      __ ret(lr);
#undef LOAD_PAIR
    return entry;
  }
9233
9234 void generate_compare_long_strings() {
9235 if (UseSVE == 0) {
9236 StubRoutines::aarch64::_compare_long_string_LL
9237 = generate_compare_long_string_same_encoding(true);
9238 StubRoutines::aarch64::_compare_long_string_UU
9239 = generate_compare_long_string_same_encoding(false);
9240 StubRoutines::aarch64::_compare_long_string_LU
9241 = generate_compare_long_string_different_encoding(true);
9242 StubRoutines::aarch64::_compare_long_string_UL
9243 = generate_compare_long_string_different_encoding(false);
9244 } else {
9245 StubRoutines::aarch64::_compare_long_string_LL
9246 = generate_compare_long_string_sve(LL);
9247 StubRoutines::aarch64::_compare_long_string_UU
9248 = generate_compare_long_string_sve(UU);
9249 StubRoutines::aarch64::_compare_long_string_LU
9250 = generate_compare_long_string_sve(LU);
9251 StubRoutines::aarch64::_compare_long_string_UL
9252 = generate_compare_long_string_sve(UL);
9253 }
9254 }
9255
9256 // R0 = result
9257 // R1 = str2
9258 // R2 = cnt1
9259 // R3 = str1
9260 // R4 = cnt2
9261 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
9262 //
  // This generic linear code uses a few additional ideas, which make it faster:
9264 // 1) we can safely keep at least 1st register of pattern(since length >= 8)
9265 // in order to skip initial loading(help in systems with 1 ld pipeline)
9266 // 2) we can use "fast" algorithm of finding single character to search for
9267 // first symbol with less branches(1 branch per each loaded register instead
9268 // of branch for each symbol), so, this is where constants like
9269 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from
9270 // 3) after loading and analyzing 1st register of source string, it can be
9271 // used to search for every 1st character entry, saving few loads in
  //    comparison with "simpler-but-slower" implementation
9273 // 4) in order to avoid lots of push/pop operations, code below is heavily
9274 // re-using/re-initializing/compressing register values, which makes code
9275 // larger and a bit less readable, however, most of extra operations are
9276 // issued during loads or branches, so, penalty is minimal
  // Generates the linear-scan string indexOf stub for the given encodings
  // (str1 = pattern, str2 = source; see the register/algorithm notes above).
  // The first pattern char is broadcast across a register and candidate
  // positions in each 8-byte source chunk are found with the classic
  // SWAR "find byte/halfword in word" trick (sub/orr/bics against the
  // 0x01..01 and 0x7f..7f style constants); each candidate is then verified
  // by a char-by-char compare loop. Returns the match index in r0, or -1.
  //
  // NOTE(review): the LU combination (Latin1 pattern in a UTF-16 source) is
  // the one rejected by ShouldNotReachHere below; only LL, UL and UU ids
  // are generated here.
  address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
    StubId stub_id;
    if (str1_isL) {
      if (str2_isL) {
        stub_id = StubId::stubgen_string_indexof_linear_ll_id;
      } else {
        stub_id = StubId::stubgen_string_indexof_linear_ul_id;
      }
    } else {
      if (str2_isL) {
        ShouldNotReachHere();
      } else {
        stub_id = StubId::stubgen_string_indexof_linear_uu_id;
      }
    }
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();

    int str1_chr_size = str1_isL ? 1 : 2;  // bytes per pattern char
    int str2_chr_size = str2_isL ? 1 : 2;  // bytes per source char
    int str1_chr_shift = str1_isL ? 0 : 1;
    int str2_chr_shift = str2_isL ? 0 : 1;
    bool isL = str1_isL && str2_isL;
   // parameters
    Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
    // temporary registers
    Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
    RegSet spilled_regs = RegSet::range(tmp1, tmp4);
    // redefinitions
    Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;

    __ push(spilled_regs, sp);
    Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
        L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
        L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
        L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
        L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
        L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read whole register from str1. It is safe, because length >=8 here
    __ ldr(ch1, Address(str1));
    // Read whole register from str2. It is safe, because length >=8 here
    __ ldr(ch2, Address(str2));
    __ sub(cnt2, cnt2, cnt1);  // cnt2 = number of candidate start positions
    __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);  // first pattern char
    if (str1_isL != str2_isL) {
      __ eor(v0, __ T16B, v0, v0);  // zero vector for Latin1->UTF-16 inflation
    }
    __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
    __ mul(first, first, tmp1);  // broadcast first char to every lane
    // check if we have less than 1 register to check
    __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
    if (str1_isL != str2_isL) {
      __ fmovd(v1, ch1);
    }
    __ br(__ LE, L_SMALL);
    // SWAR zero-detect: after eor with the broadcast char, any zero lane in
    // ch2 marks a candidate position; bics leaves its mask in tmp2/flags.
    __ eor(ch2, first, ch2);
    if (str1_isL != str2_isL) {
      __ zip1(v1, __ T16B, v1, v0);  // inflate first 8 pattern bytes
    }
    __ sub(tmp2, ch2, tmp1);
    __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
    __ bics(tmp2, tmp2, ch2);
    if (str1_isL != str2_isL) {
      __ fmovd(ch1, v1);  // ch1 = inflated pattern chars for quick compare
    }
    __ br(__ NE, L_HAS_ZERO);
    __ subs(cnt2, cnt2, wordSize/str2_chr_size);
    __ add(result, result, wordSize/str2_chr_size);
    __ add(str2, str2, wordSize);
    __ br(__ LT, L_POST_LOOP);
    __ BIND(L_LOOP);
      // Main scan: one 8-byte source chunk per iteration, SWAR candidate test.
      __ ldr(ch2, Address(str2));
      __ eor(ch2, first, ch2);
      __ sub(tmp2, ch2, tmp1);
      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      __ bics(tmp2, tmp2, ch2);
      __ br(__ NE, L_HAS_ZERO);
    __ BIND(L_LOOP_PROCEED);
      __ subs(cnt2, cnt2, wordSize/str2_chr_size);
      __ add(str2, str2, wordSize);
      __ add(result, result, wordSize/str2_chr_size);
      __ br(__ GE, L_LOOP);
    __ BIND(L_POST_LOOP);
      __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
      __ br(__ LE, NOMATCH);
      __ ldr(ch2, Address(str2));
      __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
      __ eor(ch2, first, ch2);
      __ sub(tmp2, ch2, tmp1);
      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      __ mov(tmp4, -1); // all bits set
      __ b(L_SMALL_PROCEED);
    __ align(OptoLoopAlignment);
    __ BIND(L_SMALL);
      // Short source: one (possibly partial) chunk; cnt2 becomes a negated
      // bit count used to mask off lanes beyond the string end.
      __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
      __ eor(ch2, first, ch2);
      if (str1_isL != str2_isL) {
        __ zip1(v1, __ T16B, v1, v0);
      }
      __ sub(tmp2, ch2, tmp1);
      __ mov(tmp4, -1); // all bits set
      __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
      if (str1_isL != str2_isL) {
        __ fmovd(ch1, v1); // move converted 4 symbols
      }
    __ BIND(L_SMALL_PROCEED);
      __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
      __ bic(tmp2, tmp2, ch2);
      __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
      __ rbit(tmp2, tmp2);       // candidates now ordered low-to-high for clz
      __ br(__ EQ, NOMATCH);
    __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
      __ cmp(cnt1, u1(wordSize/str2_chr_size));
      __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
      if (str2_isL) { // LL
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
        __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
      } else {
        __ mov(ch2, 0xE); // all bits in byte set except last one
        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      }
      __ cmp(ch1, ch2); // quick compare of first 8 bytes at the candidate
      __ mov(tmp4, wordSize/str2_chr_size);
      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
    __ BIND(L_SMALL_CMP_LOOP);
      // Verify remaining pattern chars one at a time from index tmp4.
      str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
               : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
      str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
               : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
      __ add(tmp4, tmp4, 1);
      __ cmp(tmp4, cnt1);
      __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
      __ cmp(first, ch2);
      __ br(__ EQ, L_SMALL_CMP_LOOP);
    __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
      __ cbz(tmp2, NOMATCH); // no more matches. exit
      __ clz(tmp4, tmp2);    // advance to the next candidate bit
      __ add(result, result, 1); // advance index
      __ add(str2, str2, str2_chr_size); // advance pointer
      __ b(L_SMALL_HAS_ZERO_LOOP);
    __ align(OptoLoopAlignment);
    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
      __ cmp(first, ch2);
      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
      __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
      // Pattern fits in one register: a single 8-byte compare decides it.
      if (str2_isL) { // LL
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
        __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
      } else {
        __ mov(ch2, 0xE); // all bits in byte set except last one
        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
      }
      __ cmp(ch1, ch2);
      __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
      __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_HAS_ZERO);
      __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
      // Now, perform compression of counters(cnt2 and cnt1) into one register.
      // It's fine because both counters are 32bit and are not changed in this
      // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
      __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
      __ sub(result, result, 1);
    __ BIND(L_HAS_ZERO_LOOP);
      __ mov(cnt1, wordSize/str2_chr_size);
      __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
      __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
      if (str2_isL) {
        __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(tmp4, tmp4, 1);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ lsl(tmp2, tmp2, 1);
        __ mov(tmp4, wordSize/str2_chr_size);
      } else {
        __ mov(ch2, 0xE);
        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(tmp4, tmp4, 1);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
        __ lsl(tmp2, tmp2, 1);
        __ mov(tmp4, wordSize/str2_chr_size);
        __ sub(str2, str2, str2_chr_size);
      }
      __ cmp(ch1, ch2);
      __ mov(tmp4, wordSize/str2_chr_size);
      __ br(__ NE, L_CMP_LOOP_NOMATCH);
    __ BIND(L_CMP_LOOP);
      // Verify the candidate char-by-char; cnt1 is reused as the loaded
      // pattern char here (restored from compressed cnt2 on exit).
      str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
               : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
      str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
               : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
      __ add(tmp4, tmp4, 1);
      __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
      __ br(__ GE, L_CMP_LOOP_LAST_CMP);
      __ cmp(cnt1, ch2);
      __ br(__ EQ, L_CMP_LOOP);
    __ BIND(L_CMP_LOOP_NOMATCH);
      // here we're not matched
      __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
      __ clz(tmp4, tmp2);
      __ add(str2, str2, str2_chr_size); // advance pointer
      __ b(L_HAS_ZERO_LOOP);
    __ align(OptoLoopAlignment);
    __ BIND(L_CMP_LOOP_LAST_CMP);
      __ cmp(cnt1, ch2);
      __ br(__ NE, L_CMP_LOOP_NOMATCH);
      __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_CMP_LOOP_LAST_CMP2);
      if (str2_isL) {
        __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(tmp4, tmp4, 1);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ lsl(tmp2, tmp2, 1);
      } else {
        __ mov(ch2, 0xE);
        __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
        __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
        __ lslv(tmp2, tmp2, tmp4);
        __ add(tmp4, tmp4, 1);
        __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
        __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
        __ lsl(tmp2, tmp2, 1);
        __ sub(str2, str2, str2_chr_size);
      }
      __ cmp(ch1, ch2);
      __ br(__ NE, L_CMP_LOOP_NOMATCH);
      __ b(DONE);
    __ align(OptoLoopAlignment);
    __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
      // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
      // so, result was increased at max by wordSize/str2_chr_size - 1, so,
      // respective high bit wasn't changed. L_LOOP_PROCEED will increase
      // result by analyzed characters value, so, we can just reset lower bits
      // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
      // 2) restore cnt1 and cnt2 values from "compressed" cnt2
      // 3) advance str2 value to represent next str2 octet. result & 7/3 is
      // index of last analyzed substring inside current octet. So, str2 in at
      // respective start address. We need to advance it to next octet
      __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
      __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
      __ bfm(result, zr, 0, 2 - str2_chr_shift);
      __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
      __ movw(cnt2, cnt2);
      __ b(L_LOOP_PROCEED);
    __ align(OptoLoopAlignment);
    __ BIND(NOMATCH);
      __ mov(result, -1);
    __ BIND(DONE);
      __ pop(spilled_regs, sp);
      __ ret(lr);
    return entry;
  }
9562
9563 void generate_string_indexof_stubs() {
9564 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
9565 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
9566 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
9567 }
9568
  // Helper for generate_large_byte_array_inflate: inflates the 16 Latin1
  // bytes in each of src1/src2 to 16-bit chars (zip1/zip2 against the zero
  // vector in v0) and stores all 64 result bytes through r1, post-incremented.
  // Optionally emits a PSTL1STRM prefetch of the store stream in the middle
  // to overlap it with the zips. Clobbers v1-v4.
  void inflate_and_store_2_fp_registers(bool generatePrfm,
      FloatRegister src1, FloatRegister src2) {
    Register dst = r1;
    __ zip1(v1, __ T16B, src1, v0);
    __ zip2(v2, __ T16B, src1, v0);
    if (generatePrfm) {
      __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
    }
    __ zip1(v3, __ T16B, src2, v0);
    __ zip2(v4, __ T16B, src2, v0);
    __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
  }
9581
9582 // R0 = src
9583 // R1 = dst
9584 // R2 = len
9585 // R3 = len >> 3
9586 // V0 = 0
9587 // v1 = loaded 8 bytes
9588 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
  // Generates the stub that inflates a large Latin1 byte array to UTF-16:
  // 64 source bytes per iteration are loaded with ld1 and widened/stored via
  // inflate_and_store_2_fp_registers. A prefetching loop variant is used
  // while enough octets remain (see large_loop_threshold). The caller has
  // already loaded the first 8 bytes into v1 (see register comment above).
  address generate_large_byte_array_inflate() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
    StubCodeMark mark(this, stub_id);
    address entry = __ pc();
    Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
    Register src = r0, dst = r1, len = r2, octetCounter = r3;
    const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;

    // do one more 8-byte read to have address 16-byte aligned in most cases
    // also use single store instruction
    __ ldrd(v2, __ post(src, 8));
    __ sub(octetCounter, octetCounter, 2);
    __ zip1(v1, __ T16B, v1, v0);
    __ zip1(v2, __ T16B, v2, v0);
    __ st1(v1, v2, __ T16B, __ post(dst, 32));
    __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ subs(rscratch1, octetCounter, large_loop_threshold);
    __ br(__ LE, LOOP_START);
    __ b(LOOP_PRFM_START);
    // Prefetching loop: same body as LOOP below plus a software prefetch of
    // the source read stream (store-side prefetches are emitted by the
    // helper when generatePrfm is true).
    __ bind(LOOP_PRFM);
      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_PRFM_START);
      __ prfm(Address(src, SoftwarePrefetchHintDistance));
      __ sub(octetCounter, octetCounter, 8);
      __ subs(rscratch1, octetCounter, large_loop_threshold);
      inflate_and_store_2_fp_registers(true, v3, v4);
      inflate_and_store_2_fp_registers(true, v5, v6);
      __ br(__ GT, LOOP_PRFM);
      __ cmp(octetCounter, (u1)8);
      __ br(__ LT, DONE);
    __ bind(LOOP);
      __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
    __ bind(LOOP_START);
      __ sub(octetCounter, octetCounter, 8);
      __ cmp(octetCounter, (u1)8);
      inflate_and_store_2_fp_registers(false, v3, v4);
      inflate_and_store_2_fp_registers(false, v5, v6);
      __ br(__ GE, LOOP);
    __ bind(DONE);
      __ ret(lr);
    return entry;
  }
9632
9633 /**
9634 * Arguments:
9635 *
9636 * Input:
9637 * c_rarg0 - current state address
9638 * c_rarg1 - H key address
9639 * c_rarg2 - data address
9640 * c_rarg3 - number of blocks
9641 *
9642 * Output:
9643 * Updated state at c_rarg0
9644 */
  address generate_ghash_processBlocks() {
    // GHASH kernel for AES/GCM: for each 16-byte block of input,
    // state <- (state ^ block) * H in GF(2^128).
    //
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order. For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian. On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order through the
    // calculation, bit-reversing the inputs and outputs.

    StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
    StubCodeMark mark(this, stub_id);
    Label polynomial; // local data generated at end of stub
    __ align(CodeEntryAlignment);
    address start = __ pc();

    // Incoming arguments (see the block comment above this function).
    Register state = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data = c_rarg2;
    Register blocks = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    // Load the field polynomial from the data emitted after the code.
    __ adr(rscratch1, polynomial);
    __ ldrq(v24, rscratch1); // The field polynomial

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
    __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                        /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
                        /*temps*/v6, v3, /*reuse/clobber b*/v2);
      // Reduce v7:v5 by the field polynomial
      __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0;
    // undo the input bit/word reversal before storing.
    __ rev64(v0, __ T16B, v0);
    __ rbit(v0, __ T16B, v0);

    __ st1(v0, __ T16B, state);
    __ ret(lr);

    // bind label and generate local polynomial data
    __ align(wordSize * 2);
    __ bind(polynomial);
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    return start;
  }
9723
  address generate_ghash_processBlocks_wide() {
    // Unrolled GHASH variant for long inputs. Generates (and falls
    // back to) the single-block stub above for short inputs and for
    // any blocks left over after the unrolled loop.
    address small = generate_ghash_processBlocks();

    StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
    StubCodeMark mark(this, stub_id);
    Label polynomial; // local data generated after stub
    __ align(CodeEntryAlignment);
    address start = __ pc();

    // Same ABI as the small stub, so we can tail-branch to it.
    Register state = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data = c_rarg2;
    Register blocks = c_rarg3;

    const int unroll = 4;

    // Not worth entering the unrolled loop for fewer than
    // unroll * 2 blocks; branch straight to the small stub.
    __ cmp(blocks, (unsigned char)(unroll * 2));
    __ br(__ LT, small);

    if (unroll > 1) {
      // Save state before entering routine
      __ sub(sp, sp, 4 * 16);
      __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
      __ sub(sp, sp, 4 * 16);
      __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
    }

    __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);

    if (unroll > 1) {
      // And restore state
      __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
    }

    // If any blocks remain, tail-branch to the small stub which
    // processes them one at a time and returns to our caller.
    __ cmp(blocks, (unsigned char)0);
    __ br(__ GT, small);

    __ ret(lr);

    // bind label and generate polynomial data
    __ align(wordSize * 2);
    __ bind(polynomial);
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    return start;

  }
9776
  // Emit one SIMD round of Base64 encoding: read 3 * size input bytes
  // from src, split each 3-byte group into four 6-bit indices, map each
  // index through the codec table (held in 4 consecutive vector
  // registers starting at 'codec') and store 4 * size output bytes to
  // dst. src and dst are post-incremented. size is 16 (full vector) or
  // 8 (half vector).
  void generate_base64_encode_simdround(Register src, Register dst,
            FloatRegister codec, u8 size) {

    FloatRegister in0 = v4, in1 = v5, in2 = v6;
    FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
    FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    // De-interleave the three input byte streams: in0 = byte 0 of each
    // group, in1 = byte 1, in2 = byte 2.
    __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));

    // Index 0: top 6 bits of byte 0.
    __ ushr(ind0, arrangement, in0, 2);

    // Index 1: low 2 bits of byte 0 | top 4 bits of byte 1
    // (note: in0 is clobbered here, after its last use above).
    __ ushr(ind1, arrangement, in1, 2);
    __ shl(in0, arrangement, in0, 6);
    __ orr(ind1, arrangement, ind1, in0);
    __ ushr(ind1, arrangement, ind1, 2);

    // Index 2: low 4 bits of byte 1 | top 2 bits of byte 2
    // (in1 is clobbered after its last use).
    __ ushr(ind2, arrangement, in2, 4);
    __ shl(in1, arrangement, in1, 4);
    __ orr(ind2, arrangement, in1, ind2);
    __ ushr(ind2, arrangement, ind2, 2);

    // Index 3: low 6 bits of byte 2 (shift left then right to mask).
    __ shl(ind3, arrangement, in2, 2);
    __ ushr(ind3, arrangement, ind3, 2);

    // Map each 6-bit index to its Base64 character via table lookup.
    __ tbl(out0, arrangement, codec, 4, ind0);
    __ tbl(out1, arrangement, codec, 4, ind1);
    __ tbl(out2, arrangement, codec, 4, ind2);
    __ tbl(out3, arrangement, codec, 4, ind3);

    // Interleave the four output streams back into memory order.
    __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
  }
9810
9811 /**
9812 * Arguments:
9813 *
9814 * Input:
9815 * c_rarg0 - src_start
9816 * c_rarg1 - src_offset
9817 * c_rarg2 - src_length
9818 * c_rarg3 - dest_start
9819 * c_rarg4 - dest_offset
9820 * c_rarg5 - isURL
9821 *
9822 */
  address generate_base64_encodeBlock() {

    // Encoding alphabets per RFC 4648: standard and URL-safe
    // (differ only in the last two characters).
    static const char toBase64[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
    };

    static const char toBase64URL[64] = {
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
      'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
      'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
      'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
      '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
    };

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register src = c_rarg0; // source array
    Register soff = c_rarg1; // source start offset
    Register send = c_rarg2; // source end offset
    Register dst = c_rarg3; // dest array
    Register doff = c_rarg4; // position for writing to dest array
    Register isURL = c_rarg5; // Base64 or URL character set

    // c_rarg6 and c_rarg7 are free to use as temps
    Register codec = c_rarg6;
    Register length = c_rarg7;

    Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;

    __ add(src, src, soff);
    __ add(dst, dst, doff);
    __ sub(length, send, soff);

    // load the codec base address; URL variant overrides the default
    __ lea(codec, ExternalAddress((address) toBase64));
    __ cbz(isURL, ProcessData);
    __ lea(codec, ExternalAddress((address) toBase64URL));

    __ BIND(ProcessData);

    // too short to form up a SIMD loop, fall back to scalar processing
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, Process3B);

    // Load the whole 64-byte codec table into v0..v3 for tbl lookups.
    __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));

    // 48 input bytes -> 64 output bytes per full-vector round.
    __ BIND(Process48B);
    __ cmp(length, (u1)48);
    __ br(Assembler::LT, Process24B);
    generate_base64_encode_simdround(src, dst, v0, 16);
    __ sub(length, length, 48);
    __ b(Process48B);

    // 24 input bytes -> 32 output bytes, one half-vector round.
    __ BIND(Process24B);
    __ cmp(length, (u1)24);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_encode_simdround(src, dst, v0, 8);
    __ sub(length, length, 24);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);

    // Scalar tail: encode one 3-byte group per iteration.
    __ BIND(Process3B);
    // 3 src bytes, 24 bits
    __ ldrb(r10, __ post(src, 1));
    __ ldrb(r11, __ post(src, 1));
    __ ldrb(r12, __ post(src, 1));
    __ orrw(r11, r11, r10, Assembler::LSL, 8);
    __ orrw(r12, r12, r11, Assembler::LSL, 8);
    // codec index: extract the four 6-bit groups of the 24-bit word
    __ ubfmw(r15, r12, 18, 23);
    __ ubfmw(r14, r12, 12, 17);
    __ ubfmw(r13, r12, 6, 11);
    __ andw(r12, r12, 63);
    // get the code based on the codec
    __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
    __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
    __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
    __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
    __ strb(r15, __ post(dst, 1));
    __ strb(r14, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ sub(length, length, 3);
    __ cbnz(length, Process3B);

    __ BIND(Exit);
    __ ret(lr);

    return start;
  }
9921
  // Emit one SIMD round of Base64 decoding: read 4 * size encoded bytes
  // from src and write 3 * size decoded bytes to dst (both registers
  // post-incremented). codecL/codecH each name the first of 4
  // consecutive vector registers holding the lower (codes 0..63) and
  // higher (codes 64..127) halves of the decode table; v27 must hold 63
  // in every byte lane. On an illegal input byte, store the bytes
  // decoded before it and branch to Exit.
  void generate_base64_decode_simdround(Register src, Register dst,
        FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {

    FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
    FloatRegister out0 = v20, out1 = v21, out2 = v22;

    FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
    FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;

    Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;

    Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

    // De-interleave four encoded byte streams.
    __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));

    // we need unsigned saturating subtract, to make sure all input values
    // in range [0, 63] will have 0U value in the higher half lookup
    __ uqsubv(decH0, __ T16B, in0, v27);
    __ uqsubv(decH1, __ T16B, in1, v27);
    __ uqsubv(decH2, __ T16B, in2, v27);
    __ uqsubv(decH3, __ T16B, in3, v27);

    // lower half lookup
    __ tbl(decL0, arrangement, codecL, 4, in0);
    __ tbl(decL1, arrangement, codecL, 4, in1);
    __ tbl(decL2, arrangement, codecL, 4, in2);
    __ tbl(decL3, arrangement, codecL, 4, in3);

    // higher half lookup (tbx leaves out-of-range lanes unchanged)
    __ tbx(decH0, arrangement, codecH, 4, decH0);
    __ tbx(decH1, arrangement, codecH, 4, decH1);
    __ tbx(decH2, arrangement, codecH, 4, decH2);
    __ tbx(decH3, arrangement, codecH, 4, decH3);

    // combine lower and higher
    __ orr(decL0, arrangement, decL0, decH0);
    __ orr(decL1, arrangement, decL1, decH1);
    __ orr(decL2, arrangement, decL2, decH2);
    __ orr(decL3, arrangement, decL3, decH3);

    // check illegal inputs, value larger than 63 (maximum of 6 bits)
    __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
    __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
    __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
    __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
    // Fold per-stream error masks into one, then into a scalar flag.
    __ orr(in0, arrangement, decH0, decH1);
    __ orr(in1, arrangement, decH2, decH3);
    __ orr(in2, arrangement, in0, in1);
    __ umaxv(in3, arrangement, in2);
    __ umov(rscratch2, in3, __ B, 0);

    // get the data to output: pack four 6-bit values into three bytes
    __ shl(out0, arrangement, decL0, 2);
    __ ushr(out1, arrangement, decL1, 4);
    __ orr(out0, arrangement, out0, out1);
    __ shl(out1, arrangement, decL1, 4);
    __ ushr(out2, arrangement, decL2, 2);
    __ orr(out1, arrangement, out1, out2);
    __ shl(out2, arrangement, decL2, 6);
    __ orr(out2, arrangement, out2, decL3);

    __ cbz(rscratch2, NoIllegalData);

    // handle illegal input: locate the bad byte via the error mask in
    // in2 and store only the bytes that precede it, one at a time.
    __ umov(r10, in2, __ D, 0);
    if (size == 16) {
      __ cbnz(r10, ErrorInLowerHalf);

      // illegal input is in higher half, store the lower half now.
      __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));

      __ umov(r10, in2, __ D, 1);
      __ umov(r11, out0, __ D, 1);
      __ umov(r12, out1, __ D, 1);
      __ umov(r13, out2, __ D, 1);
      __ b(StoreLegalData);

      __ BIND(ErrorInLowerHalf);
    }
    __ umov(r11, out0, __ D, 0);
    __ umov(r12, out1, __ D, 0);
    __ umov(r13, out2, __ D, 0);

    // Byte-at-a-time store loop; r10 holds the error mask, r11..r13 the
    // packed output bytes, all shifted down 8 bits per iteration.
    __ BIND(StoreLegalData);
    __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
    __ strb(r11, __ post(dst, 1));
    __ strb(r12, __ post(dst, 1));
    __ strb(r13, __ post(dst, 1));
    __ lsr(r10, r10, 8);
    __ lsr(r11, r11, 8);
    __ lsr(r12, r12, 8);
    __ lsr(r13, r13, 8);
    __ b(StoreLegalData);

    // Fast path: everything legal, store the whole round at once.
    __ BIND(NoIllegalData);
    __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
  }
10019
10020
10021 /**
10022 * Arguments:
10023 *
10024 * Input:
10025 * c_rarg0 - src_start
10026 * c_rarg1 - src_offset
10027 * c_rarg2 - src_length
10028 * c_rarg3 - dest_start
10029 * c_rarg4 - dest_offset
10030 * c_rarg5 - isURL
10031 * c_rarg6 - isMIME
10032 *
10033 */
  address generate_base64_decodeBlock() {

    // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
    // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
    // titled "Base64 decoding".

    // Non-SIMD lookup tables are mostly dumped from fromBase64 array used in java.util.Base64,
    // except the trailing character '=' is also treated illegal value in this intrinsic. That
    // is java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
    static const uint8_t fromBase64ForNoSIMD[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
      52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
      15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
      255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
      41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    static const uint8_t fromBase64URLForNoSIMD[256] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
      52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
      15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
      255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
      41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
    };

    // A legal value of base64 code is in range [0, 127]. We need two lookups
    // with tbl/tbx and combine them to get the decode data. The 1st table vector
    // lookup use tbl, out of range indices are set to 0 in destination. The 2nd
    // table vector lookup use tbx, out of range indices are unchanged in
    // destination. Input [64..126] is mapped to index [65, 127] in second lookup.
    // The value of index 64 is set to 0, so that we know that we already get the
    // decoded data with the 1st lookup.
    static const uint8_t fromBase64ForSIMD[128] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
      52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
      0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
      14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
      255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
      40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
    };

    static const uint8_t fromBase64URLForSIMD[128] = {
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
      255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
      52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
      0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
      14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
      63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
      40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
    };

    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    Register src = c_rarg0; // source array
    Register soff = c_rarg1; // source start offset
    Register send = c_rarg2; // source end offset
    Register dst = c_rarg3; // dest array
    Register doff = c_rarg4; // position for writing to dest array
    Register isURL = c_rarg5; // Base64 or URL character set
    Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation

    Register length = send; // reuse send as length of source data to process

    Register simd_codec = c_rarg6;
    Register nosimd_codec = c_rarg7;

    Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;

    __ enter();

    __ add(src, src, soff);
    __ add(dst, dst, doff);

    // doff now holds the original dst so the number of bytes written
    // can be computed at Exit and returned in c_rarg0.
    __ mov(doff, dst);

    __ sub(length, send, soff);
    // Clear the low two bits of length: round it down to a multiple
    // of 4, the Base64 quantum.
    __ bfm(length, zr, 0, 1);

    __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
    __ cbz(isURL, ProcessData);
    __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));

    __ BIND(ProcessData);
    __ mov(rscratch1, length);
    __ cmp(length, (u1)144); // 144 = 80 + 64
    __ br(Assembler::LT, Process4B);

    // In the MIME case, the line length cannot be more than 76
    // bytes (see RFC 2045). This is too short a block for SIMD
    // to be worthwhile, so we use non-SIMD here.
    // rscratch1 = 79 makes the scalar loop below consume 80 bytes
    // (a pre-process step) before entering the SIMD path.
    __ movw(rscratch1, 79);

    __ BIND(Process4B);
    __ ldrw(r14, __ post(src, 4));
    // Split the word into its four encoded bytes.
    __ ubfxw(r10, r14, 0, 8);
    __ ubfxw(r11, r14, 8, 8);
    __ ubfxw(r12, r14, 16, 8);
    __ ubfxw(r13, r14, 24, 8);
    // get the de-code
    __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
    __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
    __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
    __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
    // error detection, 255u indicates an illegal input
    __ orrw(r14, r10, r11);
    __ orrw(r15, r12, r13);
    __ orrw(r14, r14, r15);
    __ tbnz(r14, 7, Exit);
    // recover the data: pack four 6-bit values into three output bytes
    __ lslw(r14, r10, 10);
    __ bfiw(r14, r11, 4, 6);
    __ bfmw(r14, r12, 2, 5);
    __ rev16w(r14, r14);
    __ bfiw(r13, r12, 6, 2);
    __ strh(r14, __ post(dst, 2));
    __ strb(r13, __ post(dst, 1));
    // non-simd loop
    __ subsw(rscratch1, rscratch1, 4);
    __ br(Assembler::GT, Process4B);

    // if exiting from PreProcess80B, rscratch1 == -1;
    // otherwise, rscratch1 == 0.
    __ cbzw(rscratch1, Exit);
    __ sub(length, length, 80);

    __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
    __ cbz(isURL, SIMDEnter);
    __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));

    __ BIND(SIMDEnter);
    // Lower table half in v0..v3, upper half in v4..v7; v27 = 63 in
    // every lane (used by the simdround for range checks).
    __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
    __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
    __ mov(rscratch1, 63);
    __ dup(v27, __ T16B, rscratch1);

    __ BIND(Process64B);
    __ cmp(length, (u1)64);
    __ br(Assembler::LT, Process32B);
    generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
    __ sub(length, length, 64);
    __ b(Process64B);

    __ BIND(Process32B);
    __ cmp(length, (u1)32);
    __ br(Assembler::LT, SIMDExit);
    generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
    __ sub(length, length, 32);
    __ b(Process32B);

    __ BIND(SIMDExit);
    __ cbz(length, Exit);
    __ movw(rscratch1, length);
    __ b(Process4B);

    __ BIND(Exit);
    // Return the number of bytes written (dst - original dst).
    __ sub(c_rarg0, dst, doff);

    __ leave();
    __ ret(lr);

    return start;
  }
10224
10225 // Support for spin waits.
10226 address generate_spin_wait() {
10227 __ align(CodeEntryAlignment);
10228 StubId stub_id = StubId::stubgen_spin_wait_id;
10229 StubCodeMark mark(this, stub_id);
10230 address start = __ pc();
10231
10232 __ spin_wait();
10233 __ ret(lr);
10234
10235 return start;
10236 }
10237
  // Generate one stub per bitmap slot for the secondary-supers table
  // lookup; each entry point is recorded in
  // StubRoutines::_lookup_secondary_supers_table_stubs[slot].
  void generate_lookup_secondary_supers_table_stub() {
    StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
    StubCodeMark mark(this, stub_id);

    // Fixed register assignments shared with the callers of these stubs.
    const Register
      r_super_klass = r0,
      r_array_base = r1,
      r_array_length = r2,
      r_array_index = r3,
      r_sub_klass = r4,
      r_bitmap = rscratch2,
      result = r5;
    const FloatRegister
      vtemp = v0;

    for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
      StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
      Label L_success;
      __ enter();
      // Each stub bakes in its own slot constant.
      __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
                                             r_array_base, r_array_length, r_array_index,
                                             vtemp, result, slot,
                                             /*stub_is_near*/true);
      __ leave();
      __ ret(lr);
    }
  }
10265
  // Slow path implementation for UseSecondarySupersTable.
  // Performs the linear search of the secondary supers array; reached
  // from the table-lookup stubs above when the hashed probe misses.
  address generate_lookup_secondary_supers_table_slow_path_stub() {
    StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
    StubCodeMark mark(this, stub_id);

    address start = __ pc();
    const Register
      r_super_klass = r0, // argument
      r_array_base = r1, // argument
      temp1 = r2, // temp
      r_array_index = r3, // argument
      r_bitmap = rscratch2, // argument
      result = r5; // argument

    __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
    __ ret(lr);

    return start;
  }
10285
10286 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10287
10288 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10289 //
10290 // If LSE is in use, generate LSE versions of all the stubs. The
10291 // non-LSE versions are in atomic_aarch64.S.
10292
10293 // class AtomicStubMark records the entry point of a stub and the
10294 // stub pointer which will point to it. The stub pointer is set to
10295 // the entry point when ~AtomicStubMark() is called, which must be
10296 // after ICache::invalidate_range. This ensures safe publication of
10297 // the generated code.
10298 class AtomicStubMark {
10299 address _entry_point;
10300 aarch64_atomic_stub_t *_stub;
10301 MacroAssembler *_masm;
10302 public:
10303 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10304 _masm = masm;
10305 __ align(32);
10306 _entry_point = __ pc();
10307 _stub = stub;
10308 }
10309 ~AtomicStubMark() {
10310 *_stub = (aarch64_atomic_stub_t)_entry_point;
10311 }
10312 };
10313
10314 // NB: For memory_order_conservative we need a trailing membar after
10315 // LSE atomic operations but not a leading membar.
10316 //
10317 // We don't need a leading membar because a clause in the Arm ARM
10318 // says:
10319 //
10320 // Barrier-ordered-before
10321 //
10322 // Barrier instructions order prior Memory effects before subsequent
10323 // Memory effects generated by the same Observer. A read or a write
10324 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same
10325 // Observer if and only if RW1 appears in program order before RW 2
10326 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
10327 // instruction with both Acquire and Release semantics.
10328 //
10329 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10330 // and Release semantics, therefore we don't need a leading
10331 // barrier. However, there is no corresponding Barrier-ordered-after
10332 // relationship, therefore we need a trailing membar to prevent a
10333 // later store or load from being reordered with the store in an
10334 // atomic instruction.
10335 //
10336 // This was checked by using the herd7 consistency model simulator
10337 // (http://diy.inria.fr/) with this test case:
10338 //
10339 // AArch64 LseCas
10340 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10341 // P0 | P1;
10342 // LDR W4, [X2] | MOV W3, #0;
10343 // DMB LD | MOV W4, #1;
10344 // LDR W3, [X1] | CASAL W3, W4, [X1];
10345 // | DMB ISH;
10346 // | STR W4, [X2];
10347 // exists
10348 // (0:X3=0 /\ 0:X4=1)
10349 //
10350 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10351 // with the store to x in P1. Without the DMB in P1 this may happen.
10352 //
10353 // At the time of writing we don't know of any AArch64 hardware that
10354 // reorders stores in this way, but the Reference Manual permits it.
10355
10356 void gen_cas_entry(Assembler::operand_size size,
10357 atomic_memory_order order) {
10358 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10359 exchange_val = c_rarg2;
10360 bool acquire, release;
10361 switch (order) {
10362 case memory_order_relaxed:
10363 acquire = false;
10364 release = false;
10365 break;
10366 case memory_order_release:
10367 acquire = false;
10368 release = true;
10369 break;
10370 default:
10371 acquire = true;
10372 release = true;
10373 break;
10374 }
10375 __ mov(prev, compare_val);
10376 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10377 if (order == memory_order_conservative) {
10378 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10379 }
10380 if (size == Assembler::xword) {
10381 __ mov(r0, prev);
10382 } else {
10383 __ movw(r0, prev);
10384 }
10385 __ ret(lr);
10386 }
10387
10388 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10389 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10390 // If not relaxed, then default to conservative. Relaxed is the only
10391 // case we use enough to be worth specializing.
10392 if (order == memory_order_relaxed) {
10393 __ ldadd(size, incr, prev, addr);
10394 } else {
10395 __ ldaddal(size, incr, prev, addr);
10396 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10397 }
10398 if (size == Assembler::xword) {
10399 __ mov(r0, prev);
10400 } else {
10401 __ movw(r0, prev);
10402 }
10403 __ ret(lr);
10404 }
10405
10406 void gen_swpal_entry(Assembler::operand_size size) {
10407 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10408 __ swpal(size, incr, prev, addr);
10409 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10410 if (size == Assembler::xword) {
10411 __ mov(r0, prev);
10412 } else {
10413 __ movw(r0, prev);
10414 }
10415 __ ret(lr);
10416 }
10417
  // Generate the LSE implementations of the atomic stubs referenced by
  // AtomicAccess::PlatformXX. Each AtomicStubMark publishes its stub
  // pointer from its destructor, i.e. only after the whole batch has
  // been generated and the ICache invalidated below.
  void generate_atomic_entry_points() {
    if (! UseLSE) {
      return;
    }
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_atomic_entry_points_id;
    StubCodeMark mark(this, stub_id);
    address first_entry = __ pc();

    // ADD, memory_order_conservative
    AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
    gen_ldadd_entry(Assembler::word, memory_order_conservative);
    AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
    gen_ldadd_entry(Assembler::xword, memory_order_conservative);

    // ADD, memory_order_relaxed
    AtomicStubMark mark_fetch_add_4_relaxed
      (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
    gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
    AtomicStubMark mark_fetch_add_8_relaxed
      (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
    gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);

    // XCHG, memory_order_conservative
    AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
    gen_swpal_entry(Assembler::word);
    AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
    gen_swpal_entry(Assembler::xword);

    // CAS, memory_order_conservative
    AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
    gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
    AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_conservative);
    AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_conservative);

    // CAS, memory_order_relaxed
    AtomicStubMark mark_cmpxchg_1_relaxed
      (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
    gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
    AtomicStubMark mark_cmpxchg_4_relaxed
      (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
    AtomicStubMark mark_cmpxchg_8_relaxed
      (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);

    // CAS, memory_order_release
    AtomicStubMark mark_cmpxchg_4_release
      (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_release);
    AtomicStubMark mark_cmpxchg_8_release
      (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_release);

    // CAS, memory_order_seq_cst
    AtomicStubMark mark_cmpxchg_4_seq_cst
      (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
    AtomicStubMark mark_cmpxchg_8_seq_cst
      (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);

    // Invalidate before the marks' destructors publish the entry
    // points, ensuring safe publication of the generated code.
    ICache::invalidate_range(first_entry, __ pc() - first_entry);
  }
10482 #endif // LINUX
10483
  // Spill the registers that may hold a method's return value so they
  // survive a runtime call.  With InlineTypeReturnedAsFields the result
  // can presumably occupy r0-r7 and v0-v7 (multiple scalarized fields --
  // TODO confirm against the calling convention), so all of them are
  // saved; otherwise only r0 and v0 can carry the result.
  // Must be kept in sync with restore_return_registers() below.
  static void save_return_registers(MacroAssembler* masm) {
    if (InlineTypeReturnedAsFields) {
      masm->push(RegSet::range(r0, r7), sp);
      // 8 vector registers, stored as 1x64-bit lanes in two st1 groups.
      masm->sub(sp, sp, 4 * wordSize);
      masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
      masm->sub(sp, sp, 4 * wordSize);
      masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
    } else {
      // Move v0 into a GP scratch so a single pre-indexed stp covers both.
      masm->fmovd(rscratch1, v0);
      masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
    }
  }
10496
  // Reload the return-value registers saved by save_return_registers().
  // The pops mirror the pushes exactly, in reverse order.
  static void restore_return_registers(MacroAssembler* masm) {
    if (InlineTypeReturnedAsFields) {
      masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
      masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
      masm->pop(RegSet::range(r0, r7), sp);
    } else {
      masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
      masm->fmovd(v0, rscratch1);
    }
  }
10507
  // Emit the body of a continuation-thaw stub.  kind selects between the
  // initial thaw (thaw_top) and the return-barrier variants; the
  // return-barrier variants must preserve a possible method return value
  // across the two runtime calls (prepare_thaw and thaw) made below.
  // Returns the entry address of the generated code.
  address generate_cont_thaw(Continuation::thaw_kind kind) {
    bool return_barrier = Continuation::is_thaw_return_barrier(kind);
    bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);

    address start = __ pc();

    if (return_barrier) {
      // Reset sp to the continuation entry recorded in the current thread.
      __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
      __ mov(sp, rscratch1);
    }
    assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");

    if (return_barrier) {
      // preserve possible return value from a method returning to the return barrier
      save_return_registers(_masm);
    }

    __ movw(c_rarg1, (return_barrier ? 1 : 0));
    __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
    __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      restore_return_registers(_masm);
    }
    assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");


    Label thaw_success;
    // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
    __ cbnz(rscratch2, thaw_success);
    // Zero size: tail-call into the StackOverflowError thrower.
    __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
    __ br(rscratch1);
    __ bind(thaw_success);

    // make room for the thawed frames
    __ sub(rscratch1, sp, rscratch2);
    __ andr(rscratch1, rscratch1, -16); // align
    __ mov(sp, rscratch1);

    if (return_barrier) {
      // save original return value -- again
      save_return_registers(_masm);
    }

    // If we want, we can templatize thaw by kind, and have three different entries
    __ movw(c_rarg1, (uint32_t)kind);

    __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
    __ mov(rscratch2, r0); // r0 is the sp of the yielding frame

    if (return_barrier) {
      // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
      restore_return_registers(_masm);
    } else {
      __ mov(r0, zr); // return 0 (success) from doYield
    }

    // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
    __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
    __ mov(rfp, sp);

    if (return_barrier_exception) {
      __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
      __ authenticate_return_address(c_rarg1);
      __ verify_oop(r0);
      // save return value containing the exception oop in callee-saved R19
      __ mov(r19, r0);

      __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);

      // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
      // __ reinitialize_ptrue();

      // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc

      __ mov(r1, r0); // the exception handler
      __ mov(r0, r19); // restore return value containing the exception oop
      __ verify_oop(r0);

      __ leave();
      __ mov(r3, lr);
      __ br(r1); // the exception handler
    } else {
      // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
      __ leave();
      __ ret(lr);
    }

    return start;
  }
10599
10600 address generate_cont_thaw() {
10601 if (!Continuations::enabled()) return nullptr;
10602
10603 StubId stub_id = StubId::stubgen_cont_thaw_id;
10604 StubCodeMark mark(this, stub_id);
10605 address start = __ pc();
10606 generate_cont_thaw(Continuation::thaw_top);
10607 return start;
10608 }
10609
10610 address generate_cont_returnBarrier() {
10611 if (!Continuations::enabled()) return nullptr;
10612
10613 // TODO: will probably need multiple return barriers depending on return type
10614 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10615 StubCodeMark mark(this, stub_id);
10616 address start = __ pc();
10617
10618 generate_cont_thaw(Continuation::thaw_return_barrier);
10619
10620 return start;
10621 }
10622
10623 address generate_cont_returnBarrier_exception() {
10624 if (!Continuations::enabled()) return nullptr;
10625
10626 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10627 StubCodeMark mark(this, stub_id);
10628 address start = __ pc();
10629
10630 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10631
10632 return start;
10633 }
10634
  // Stub run after a continuation has been preempted (frozen from inside
  // the VM).  Unwinds back to the enterSpecial frame, then either returns
  // to Continuation.run() to unmount, or -- if the preemption was
  // cancelled -- re-enters via the thaw call.
  address generate_cont_preempt_stub() {
    if (!Continuations::enabled()) return nullptr;
    StubId stub_id = StubId::stubgen_cont_preempt_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ reset_last_Java_frame(true);

    // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
    __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
    __ mov(sp, rscratch2);

    Label preemption_cancelled;
    __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
    __ cbnz(rscratch1, preemption_cancelled);

    // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
    SharedRuntime::continuation_enter_cleanup(_masm);
    __ leave();
    __ ret(lr);

    // We acquired the monitor after freezing the frames so call thaw to continue execution.
    __ bind(preemption_cancelled);
    // Clear the cancellation flag before resuming.
    __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
    __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
    // Jump to the shared thaw-call pc published by ContinuationEntry.
    __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
    __ ldr(rscratch1, Address(rscratch1));
    __ br(rscratch1);

    return start;
  }
10666
  // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
  // are represented as long[5], with BITS_PER_LIMB = 26.
  // Pack five 26-bit limbs into three 64-bit registers.
  // dest2 may be noreg, in which case the top two bits must be zero
  // (debug builds verify this).  Clobbers rscratch1 and rscratch2.
  void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
    __ ldp(dest0, rscratch1, Address(src, 0));           // 26 bits
    __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
    __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
    __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits

    __ add(dest1, zr, rscratch1, Assembler::LSR, 12);    // 14 bits
    __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
    __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
    __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits

    if (dest2->is_valid()) {
      __ add(dest2, zr, rscratch1, Assembler::LSR, 24);  // 2 bits
    } else {
#ifdef ASSERT
      Label OK;
      __ cmp(zr, rscratch1, Assembler::LSR, 24);         // 2 bits
      __ br(__ EQ, OK);
      __ stop("high bits of Poly1305 integer should be zero");
      __ should_not_reach_here();
      __ bind(OK);
#endif
    }
  }
10694
  // As above, but return only a 128-bit integer, packed into two
  // 64-bit registers.  In debug builds the discarded high bits are
  // checked to be zero.
  void pack_26(Register dest0, Register dest1, Register src) {
    pack_26(dest0, dest1, noreg, src);
  }
10700
  // Multiply and multiply-accumulate unsigned 64-bit registers.
  // prod_hi:prod_lo = n * m (full 128-bit product).  The mul is emitted
  // first; note prod_lo must not alias n or m, or the umulh would see a
  // clobbered operand.
  void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
    __ mul(prod_lo, n, m);
    __ umulh(prod_hi, n, m);
  }
  // sum_hi:sum_lo += n * m, computing the 128-bit product in the scratch
  // registers and adding it with carry propagation.  Clobbers rscratch1
  // and rscratch2.
  void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
    wide_mul(rscratch1, rscratch2, n, m);
    __ adds(sum_lo, sum_lo, rscratch1);
    __ adc(sum_hi, sum_hi, rscratch2);
  }
10711
10712 // Poly1305, RFC 7539
10713
10714 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10715 // description of the tricks used to simplify and accelerate this
10716 // computation.
10717
  // Poly1305 processBlocks intrinsic (RFC 7539).
  //
  // Arguments (Java long[5] limb arrays, BITS_PER_LIMB = 26):
  //   c_rarg0 - byte[] input start
  //   c_rarg1 - int    length in bytes
  //   c_rarg2 - long[] accumulator
  //   c_rarg3 - long[] key R
  //
  // Processes the input 16 bytes at a time, maintaining the accumulator
  // as a 130-bit value in three 64-bit registers, and writes the result
  // back as five 26-bit limbs.
  address generate_poly1305_processBlocks() {
    __ align(CodeEntryAlignment);
    StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();
    Label here;
    __ enter();
    RegSet callee_saved = RegSet::range(r19, r28);
    __ push(callee_saved, sp);

    // Allocate working registers from the argument/callee-saved pool,
    // excluding the platform register and the scratch registers.
    RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();

    // Arguments
    const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;

    // R_n is the 128-bit randomly-generated key, packed into two
    // registers. The caller passes this key to us as long[5], with
    // BITS_PER_LIMB = 26.
    const Register R_0 = *++regs, R_1 = *++regs;
    pack_26(R_0, R_1, r_start);

    // RR_n is (R_n >> 2) * 5
    const Register RR_0 = *++regs, RR_1 = *++regs;
    __ lsr(RR_0, R_0, 2);
    __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
    __ lsr(RR_1, R_1, 2);
    __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);

    // U_n is the current checksum
    const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
    pack_26(U_0, U_1, U_2, acc_start);

    static constexpr int BLOCK_LENGTH = 16;
    Label DONE, LOOP;

    __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
    __ br(Assembler::LT, DONE); {
      __ bind(LOOP);

      // S_n is to be the sum of U_n and the next block of data
      const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
      __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
      __ adds(S_0, U_0, S_0);
      __ adcs(S_1, U_1, S_1);
      __ adc(S_2, U_2, zr);
      __ add(S_2, S_2, 1);  // the 2^128 bit appended to each block

      const Register U_0HI = *++regs, U_1HI = *++regs;

      // NB: this logic depends on some of the special properties of
      // Poly1305 keys. In particular, because we know that the top
      // four bits of R_0 and R_1 are zero, we can add together
      // partial products without any risk of needing to propagate a
      // carry out.
      wide_mul(U_0, U_0HI, S_0, R_0);  wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
      wide_mul(U_1, U_1HI, S_0, R_1);  wide_madd(U_1, U_1HI, S_1, R_0);  wide_madd(U_1, U_1HI, S_2, RR_1);
      __ andr(U_2, R_0, 3);
      __ mul(U_2, S_2, U_2);

      // Recycle registers S_0, S_1, S_2
      regs = (regs.remaining() + S_0 + S_1 + S_2).begin();

      // Partial reduction mod 2**130 - 5
      __ adds(U_1, U_0HI, U_1);
      __ adc(U_2, U_1HI, U_2);
      // Sum now in U_2:U_1:U_0.
      // Dead: U_0HI, U_1HI.
      regs = (regs.remaining() + U_0HI + U_1HI).begin();

      // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps

      // First, U_2:U_1:U_0 += (U_2 >> 2)
      __ lsr(rscratch1, U_2, 2);
      __ andr(U_2, U_2, (u8)3);
      __ adds(U_0, U_0, rscratch1);
      __ adcs(U_1, U_1, zr);
      __ adc(U_2, U_2, zr);
      // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
      __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
      __ adcs(U_1, U_1, zr);
      __ adc(U_2, U_2, zr);

      __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
      __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
      __ br(~ Assembler::LT, LOOP);
    }

    // Further reduce modulo 2^130 - 5
    __ lsr(rscratch1, U_2, 2);
    __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
    __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
    __ adcs(U_1, U_1, zr);
    __ andr(U_2, U_2, (u1)3);
    __ adc(U_2, U_2, zr);

    // Unpack the sum into five 26-bit limbs and write to memory.
    __ ubfiz(rscratch1, U_0, 0, 26);
    __ ubfx(rscratch2, U_0, 26, 26);
    __ stp(rscratch1, rscratch2, Address(acc_start));
    __ ubfx(rscratch1, U_0, 52, 12);
    __ bfi(rscratch1, U_1, 12, 14);
    __ ubfx(rscratch2, U_1, 14, 26);
    __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
    __ ubfx(rscratch1, U_1, 40, 24);
    __ bfi(rscratch1, U_2, 24, 3);
    __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));

    __ bind(DONE);
    __ pop(callee_saved, sp);
    __ leave();
    __ ret(lr);

    return start;
  }
10832
10833 // exception handler for upcall stubs
  // Exception handler for upcall stubs: there is no Java caller to
  // propagate to, so hand the exception oop (in r0) to the runtime,
  // which terminates the VM.  Never returns.
  address generate_upcall_stub_exception_handler() {
    StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    // Native caller has no idea how to handle exceptions,
    // so we just crash here. Up to callee to catch exceptions.
    __ verify_oop(r0);
    __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
    __ blr(rscratch1);
    __ should_not_reach_here();

    return start;
  }
10848
10849 // load Method* target of MethodHandle
10850 // j_rarg0 = jobject receiver
10851 // rmethod = result
  // Resolve the Method* target of a MethodHandle receiver for an upcall.
  //   j_rarg0 = jobject receiver (global handle)
  //   rmethod = resulting Method*
  // Clobbers rscratch1/rscratch2.
  address generate_upcall_stub_load_target() {
    StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
    StubCodeMark mark(this, stub_id);
    address start = __ pc();

    __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
      // Load target method from receiver: walk
      // MethodHandle.form -> LambdaForm.vmentry -> MemberName.method -> vmtarget.
    __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
    __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
    __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
    __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
                      Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
                      noreg, noreg);
    __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized

    __ ret(lr);

    return start;
  }
10871
10872 #undef __
10873 #define __ masm->
10874
10875 class MontgomeryMultiplyGenerator : public MacroAssembler {
10876
10877 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10878 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10879
10880 RegSet _toSave;
10881 bool _squaring;
10882
10883 public:
    // Allocate all working registers for the multiply/square stubs from
    // r0..r26 (minus the platform register r18).  When squaring, a and b
    // share a base register.  Registers r19 and above that we hand out
    // are callee-saved and are recorded in _toSave, together with
    // Pm_base which is needed after the main loops.
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
      Pa_base = *regs;       // Argument registers
      if (squaring)
        Pb_base = Pa_base;   // squaring: b aliases a
      else
        Pb_base = *++regs;
      Pn_base = *++regs;
      Rlen= *++regs;
      inv = *++regs;
      Pm_base = *++regs;

      // Working registers:
      Ra =  *++regs;        // The current digit of a, b, n, and m.
      Rb =  *++regs;
      Rm =  *++regs;
      Rn =  *++regs;

      Pa =  *++regs;        // Pointers to the current/next digit of a, b, n, and m.
      Pb =  *++regs;
      Pm =  *++regs;
      Pn =  *++regs;

      t0 =  *++regs;        // Three registers which form a
      t1 =  *++regs;        // triple-precision accumulator.
      t2 =  *++regs;

      Ri =  *++regs;        // Inner and outer loop indexes.
      Rj =  *++regs;

      Rhi_ab = *++regs;     // Product registers: low and high parts
      Rlo_ab = *++regs;     // of a*b and m*n.
      Rhi_mn = *++regs;
      Rlo_mn = *++regs;

      // r19 and up are callee-saved.
      _toSave = RegSet::range(r19, *regs) + Pm_base;
    }
10926
10927 private:
    // Push the callee-saved registers we allocated (plus Pm_base).
    void save_regs() {
      push(_toSave, sp);
    }
10931
    // Pop the registers pushed by save_regs().
    void restore_regs() {
      pop(_toSave, sp);
    }
10935
    // Emit `block` unrolled twice per loop iteration, executing it
    // `count` times in total.  An odd count enters at the second copy
    // (label `odd`); a zero count skips the loop entirely.  `count` is
    // destroyed.
    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }
10950
    // As unroll_2 above, but for member functions taking three register
    // arguments (d, s, tmp), which are forwarded to each emitted copy.
    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }
10965
    // Set up the digit pointers and preload the first digits for
    // iteration i of the first (i < len) outer loop, and clear the
    // pending m*n product.
    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }
10989
10990 // The core multiply-accumulate step of a Montgomery
10991 // multiplication. The idea is to schedule operations as a
10992 // pipeline so that instructions with long latencies (loads and
10993 // multiplies) have time to complete before their results are
10994 // used. This most benefits in-order implementations of the
10995 // architecture but out-of-order ones also benefit.
    // The core multiply-accumulate step of a Montgomery
    // multiplication. The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used. This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));    // prefetch next a digit
      ldr(Rb, pre(Pb, -wordSize));   // prefetch next b digit
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));    // prefetch next m digit
      ldr(Rn, pre(Pn, -wordSize));   // prefetch next n digit
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }
11016
    // Finish iteration i of the first outer loop: accumulate the last
    // a*b and pending m*n products, compute and store the new digit
    // m[i] = t0 * inv, fold in m[i]*n[0], and shift the accumulator
    // down one digit (t0 = t1; t1 = t2; t2 = 0).
    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0. t0 + (-t0) must generate a carry iff
      // t0 != 0. So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }
11061
    // Set up the digit pointers and preload the first digits for
    // iteration i of the second (i >= len) outer loop, and clear the
    // pending m*n product.  Uses Rj as a temporary for i-len.
    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      if (i.is_register()) {
        sub(Rj, i.as_register(), len);
      } else {
        mov(Rj, i.as_constant());
        sub(Rj, Rj, len);
      }
      // Rj == i-len

      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));

      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }
11094
    // Finish iteration i of the second outer loop: fold in the pending
    // m*n product, store result digit Pm_base[i-len], and shift the
    // accumulator down one digit.
    void post2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("post2");
      if (i.is_constant()) {
        mov(Rj, i.as_constant()-len.as_constant());
      } else {
        sub(Rj, i.as_register(), len);
      }

      adds(t0, t0, Rlo_mn); // The pending m*n, low part

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = t0;
      str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

      // t0 = t1; t1 = t2; t2 = 0;
      adcs(t0, t1, Rhi_mn); // The pending m*n, high part
      adc(t1, t2, zr);
      mov(t2, zr);
    }
11115
11116 // A carry in t0 after Montgomery multiplication means that we
11117 // should subtract multiples of n from our result in m. We'll
11118 // keep doing that until there is no carry.
    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m. We'll
    // keep doing that until there is no carry.
    // Re-uses t1/t2 as loop counter and index; clobbers Rm and Rn.
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr); // subtract the final borrow from the carry digit
        } cbnz(t0, again);
      } bind(post);
    }
11145
11146 // Move memory at s to d, reversing words.
11147 // Increments d to end of copied memory
11148 // Destroys tmp1, tmp2
11149 // Preserves len
11150 // Leaves s pointing to the address which was in d at start
    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    // The temporaries must be caller-saved (below r19) because this runs
    // before/after save_regs().
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1->encoding() < r19->encoding(), "register corruption");
      assert(tmp2->encoding() < r19->encoding(), "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
11160 // where
    // One step of reverse(): copy a word from *--s to *d++, swapping its
    // 32-bit halves with a rotate.
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }
11166
    // Squaring step: as step(), but the a*b product is accumulated a
    // second time (off-diagonal products appear twice in a square).
    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }
11172
    // Accumulate the diagonal a*b product once, but only on even
    // iterations (odd iterations have no unpaired diagonal term).
    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }
11185
    // Squaring-only step that advances the m*n side without any a*b
    // work: fold in the pending m*n product and compute the next one.
    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }
11197
    // Squaring counterpart of post1(): fold in the pending m*n product,
    // compute and store m[i] = t0 * inv, fold in m[i]*n[0], and shift
    // the accumulator down one digit.
    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0. t0 + (-t0) must generate a carry iff
      // t0 != 0. So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }
11234
    // Add the 128-bit product Rhi:Rlo into the triple-precision
    // accumulator t2:t1:t0, propagating carries with the flag-setting
    // add sequence.
    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }
11241
11242 public:
11243 /**
11244 * Fast Montgomery multiplication. The derivation of the
11245 * algorithm is in A Cryptographic Library for the Motorola
11246 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11247 *
11248 * Arguments:
11249 *
11250 * Inputs for multiplication:
11251 * c_rarg0 - int array elements a
11252 * c_rarg1 - int array elements b
11253 * c_rarg2 - int array elements n (the modulus)
11254 * c_rarg3 - int length
11255 * c_rarg4 - int inv
11256 * c_rarg5 - int array elements m (the result)
11257 *
11258 * Inputs for squaring:
11259 * c_rarg0 - int array elements a
11260 * c_rarg1 - int array elements n (the modulus)
11261 * c_rarg2 - int length
11262 * c_rarg3 - int inv
11263 * c_rarg4 - int array elements m (the result)
11264 *
11265 */
    // Generate the Montgomery multiply stub (see the class comment for
    // argument registers).  The int arrays are copied to scratch stack
    // space as reversed julong arrays, the two-pass multiply/reduce is
    // run over them, and the normalized result is copied back to the
    // caller's m array.  Input length is capped at 512 ints (8192 bytes
    // of scratch); larger lengths trap at `argh`.
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      cbzw(Rlen, nothing);     // zero-length input: nothing to do

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go. We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        subs(zr, Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      // Result goes into the on-stack scratch area; Ra still points to it.
      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
11378 // In C, approximately:
11379
11380 // void
11381 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11382 // julong Pn_base[], julong Pm_base[],
11383 // julong inv, int len) {
11384 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11385 // julong *Pa, *Pb, *Pn, *Pm;
11386 // julong Ra, Rb, Rn, Rm;
11387
11388 // int i;
11389
11390 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11391
11392 // for (i = 0; i < len; i++) {
11393 // int j;
11394
11395 // Pa = Pa_base;
11396 // Pb = Pb_base + i;
11397 // Pm = Pm_base;
11398 // Pn = Pn_base + i;
11399
11400 // Ra = *Pa;
11401 // Rb = *Pb;
11402 // Rm = *Pm;
11403 // Rn = *Pn;
11404
11405 // int iters = i;
11406 // for (j = 0; iters--; j++) {
11407 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11408 // MACC(Ra, Rb, t0, t1, t2);
11409 // Ra = *++Pa;
11410 // Rb = *--Pb;
11411 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11412 // MACC(Rm, Rn, t0, t1, t2);
11413 // Rm = *++Pm;
11414 // Rn = *--Pn;
11415 // }
11416
11417 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11418 // MACC(Ra, Rb, t0, t1, t2);
11419 // *Pm = Rm = t0 * inv;
11420 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11421 // MACC(Rm, Rn, t0, t1, t2);
11422
11423 // assert(t0 == 0, "broken Montgomery multiply");
11424
11425 // t0 = t1; t1 = t2; t2 = 0;
11426 // }
11427
11428 // for (i = len; i < 2*len; i++) {
11429 // int j;
11430
11431 // Pa = Pa_base + i-len;
11432 // Pb = Pb_base + len;
11433 // Pm = Pm_base + i-len;
11434 // Pn = Pn_base + len;
11435
11436 // Ra = *++Pa;
11437 // Rb = *--Pb;
11438 // Rm = *++Pm;
11439 // Rn = *--Pn;
11440
11441 // int iters = len*2-i-1;
11442 // for (j = i-len+1; iters--; j++) {
11443 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11444 // MACC(Ra, Rb, t0, t1, t2);
11445 // Ra = *++Pa;
11446 // Rb = *--Pb;
11447 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11448 // MACC(Rm, Rn, t0, t1, t2);
11449 // Rm = *++Pm;
11450 // Rn = *--Pn;
11451 // }
11452
11453 // Pm_base[i-len] = t0;
11454 // t0 = t1; t1 = t2; t2 = 0;
11455 // }
11456
11457 // while (t0)
11458 // t0 = sub(Pm_base, Pn_base, t0, len);
11459 // }
11460
11461 /**
11462 * Fast Montgomery squaring. This uses asymptotically 25% fewer
11463 * multiplies than Montgomery multiplication so it should be up to
11464 * 25% faster. However, its loop control is more complex and it
11465 * may actually run slower on some machines.
11466 *
11467 * Arguments:
11468 *
11469 * Inputs:
11470 * c_rarg0 - int array elements a
11471 * c_rarg1 - int array elements n (the modulus)
11472 * c_rarg2 - int length
11473 * c_rarg3 - int inv
11474 * c_rarg4 - int array elements m (the result)
11475 *
11476 */
    address generate_square() {
      // The failure stub is emitted *before* the aligned entry point so
      // that the conditional branch below has a target; control never
      // falls into it from straight-line execution.
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      // Each of the (up to 512) jint input elements needs 4*sizeof(jint)
      // = 16 bytes of scratch space, so total allocation tops out at
      // 512 * 16 = 8192 bytes; larger lengths bail out to the stop above.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      // Keep sp 16-byte aligned as required by the AArch64 ABI.
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go. We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      // After the reversals above Ra holds the scratch area base; it now
      // becomes the working Pm_base for the duration of the computation.
      mov(Pm_base, Ra);

      // Clear the triple-precision accumulator (t2:t1:t0).
      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      // First phase: columns 0 .. len-1 of the squaring, interleaved
      // with the Montgomery reduction steps (see the C model below).
      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        // (i+1)/2 doubled product steps: squaring needs only half the
        // multiplies of the general case because Pa[j]*Pa[i-j] appears
        // twice and is accumulated via MACC2.
        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        // i/2 remaining reduction-only steps (Pm/Pn products).
        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      // Second phase: columns len .. 2*len-1; same structure but the
      // index windows shrink as i grows (see the C model below).
      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);   // compare Ri with 2*Rlen
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        // (2*len-i-1)/2 doubled product steps.
        block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        // (2*len-i)/2 remaining reduction-only steps.
        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      // Conditionally subtract the modulus to bring the result into
      // canonical range (the final "while (t0) t0 = sub(...)" step).
      normalize(Rlen);

      mov(Ra, Pm_base); // Save Pm_base in Ra
      restore_regs(); // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
11591 // In C, approximately:
11592
11593 // void
11594 // montgomery_square(julong Pa_base[], julong Pn_base[],
11595 // julong Pm_base[], julong inv, int len) {
11596 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11597 // julong *Pa, *Pb, *Pn, *Pm;
11598 // julong Ra, Rb, Rn, Rm;
11599
11600 // int i;
11601
11602 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11603
11604 // for (i = 0; i < len; i++) {
11605 // int j;
11606
11607 // Pa = Pa_base;
11608 // Pb = Pa_base + i;
11609 // Pm = Pm_base;
11610 // Pn = Pn_base + i;
11611
11612 // Ra = *Pa;
11613 // Rb = *Pb;
11614 // Rm = *Pm;
11615 // Rn = *Pn;
11616
11617 // int iters = (i+1)/2;
11618 // for (j = 0; iters--; j++) {
11619 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11620 // MACC2(Ra, Rb, t0, t1, t2);
11621 // Ra = *++Pa;
11622 // Rb = *--Pb;
11623 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11624 // MACC(Rm, Rn, t0, t1, t2);
11625 // Rm = *++Pm;
11626 // Rn = *--Pn;
11627 // }
11628 // if ((i & 1) == 0) {
11629 // assert(Ra == Pa_base[j], "must be");
11630 // MACC(Ra, Ra, t0, t1, t2);
11631 // }
11632 // iters = i/2;
11633 // assert(iters == i-j, "must be");
11634 // for (; iters--; j++) {
11635 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11636 // MACC(Rm, Rn, t0, t1, t2);
11637 // Rm = *++Pm;
11638 // Rn = *--Pn;
11639 // }
11640
11641 // *Pm = Rm = t0 * inv;
11642 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11643 // MACC(Rm, Rn, t0, t1, t2);
11644
11645 // assert(t0 == 0, "broken Montgomery multiply");
11646
11647 // t0 = t1; t1 = t2; t2 = 0;
11648 // }
11649
11650 // for (i = len; i < 2*len; i++) {
11651 // int start = i-len+1;
11652 // int end = start + (len - start)/2;
11653 // int j;
11654
11655 // Pa = Pa_base + i-len;
11656 // Pb = Pa_base + len;
11657 // Pm = Pm_base + i-len;
11658 // Pn = Pn_base + len;
11659
11660 // Ra = *++Pa;
11661 // Rb = *--Pb;
11662 // Rm = *++Pm;
11663 // Rn = *--Pn;
11664
11665 // int iters = (2*len-i-1)/2;
11666 // assert(iters == end-start, "must be");
11667 // for (j = start; iters--; j++) {
11668 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11669 // MACC2(Ra, Rb, t0, t1, t2);
11670 // Ra = *++Pa;
11671 // Rb = *--Pb;
11672 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11673 // MACC(Rm, Rn, t0, t1, t2);
11674 // Rm = *++Pm;
11675 // Rn = *--Pn;
11676 // }
11677 // if ((i & 1) == 0) {
11678 // assert(Ra == Pa_base[j], "must be");
11679 // MACC(Ra, Ra, t0, t1, t2);
11680 // }
11681 // iters = (2*len-i)/2;
11682 // assert(iters == len-j, "must be");
11683 // for (; iters--; j++) {
11684 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11685 // MACC(Rm, Rn, t0, t1, t2);
11686 // Rm = *++Pm;
11687 // Rn = *--Pn;
11688 // }
11689 // Pm_base[i-len] = t0;
11690 // t0 = t1; t1 = t2; t2 = 0;
11691 // }
11692
11693 // while (t0)
11694 // t0 = sub(Pm_base, Pn_base, t0, len);
11695 // }
11696 };
11697
  // Call here from the interpreter or compiled code to either load
  // multiple returned values from the inline type instance being
  // returned to registers or to store returned values to a newly
  // allocated inline type instance.
  //
  //   destination - address of the runtime entry to call; invoked as
  //                 destination(current_thread, r0)
  //   name        - name for the generated RuntimeStub
  //   has_res     - if true, fetch the runtime call's oop result into r0
  //                 after the call returns
  //
  // Returns the entry point of the newly generated RuntimeStub.
  address generate_return_value_stub(address destination, const char* name, bool has_res) {
    // We need to save all registers the calling convention may use so
    // the runtime calls read or update those registers. This needs to
    // be in sync with SharedRuntime::java_return_convention().
    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    // Frame layout in 32-bit (VMReg slot) units; each register takes
    // two slots. Offsets must match the push/pop order emitted below.
    enum layout {
      j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
      j_rarg6_off, j_rarg6_2,
      j_rarg5_off, j_rarg5_2,
      j_rarg4_off, j_rarg4_2,
      j_rarg3_off, j_rarg3_2,
      j_rarg2_off, j_rarg2_2,
      j_rarg1_off, j_rarg1_2,
      j_rarg0_off, j_rarg0_2,

      j_farg7_off, j_farg7_2,
      j_farg6_off, j_farg6_2,
      j_farg5_off, j_farg5_2,
      j_farg4_off, j_farg4_2,
      j_farg3_off, j_farg3_2,
      j_farg2_off, j_farg2_2,
      j_farg1_off, j_farg1_2,
      j_farg0_off, j_farg0_2,

      rfp_off, rfp_off2,
      return_off, return_off2,

      framesize // inclusive of return address
    };

    CodeBuffer code(name, 512, 64);
    MacroAssembler* masm = new MacroAssembler(&code);

    // Frame size must already be 16-byte aligned given the layout above.
    int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
    assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
    int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
    int frame_size_in_words = frame_size_in_bytes / wordSize;

    OopMapSet* oop_maps = new OopMapSet();
    OopMap* map = new OopMap(frame_size_in_slots, 0);

    // Record where each Java return-convention register is saved so the
    // GC can find (and update) any oops held in them across the call.
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());

    map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
    map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());

    address start = __ pc();

    __ enter(); // Save FP and LR before call

    // Spill the FP then the integer Java argument registers, pairwise,
    // in the reverse of the order declared in the layout enum above.
    __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
    __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));

    __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
    __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));

    int frame_complete = __ offset();

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);

    // Call runtime
    // Argument order matters: r0 (the inline-type oop) must be copied to
    // c_rarg1 before rthread overwrites c_rarg0 (r0 and c_rarg0 alias).
    __ mov(c_rarg1, r0);
    __ mov(c_rarg0, rthread);

    __ mov(rscratch1, destination);
    __ blr(rscratch1);

    // Register the oop map at the same pc recorded as last_Java_pc.
    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(false);

    // Reload all saved registers in LIFO order.
    __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
    __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));

    __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
    __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));

    __ leave();

    // check for pending exceptions
    Label pending;
    __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
    __ cbnz(rscratch1, pending);

    if (has_res) {
      // Fetch the runtime call's oop result into r0.
      __ get_vm_result_oop(r0, rthread);
    }

    __ ret(lr);

    // A pending exception was raised by the runtime call: forward it.
    __ bind(pending);
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

    // -------------
    // make sure all code is generated
    masm->flush();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
    return stub->entry_point();
  }
11825
11826 // Initialization
  // Pre-universe stub group: intentionally empty on this platform.
  void generate_preuniverse_stubs() {
    // preuniverse stubs are not needed for aarch64
  }
11830
  void generate_initial_stubs() {
    // Generate initial stubs and initializes the entry points

    // entry points that exist in all platforms Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Initialize table for copy memory (arraycopy) check.
    // Create it only once; this method may run for more than one blob id.
    if (UnsafeMemoryAccess::_table == nullptr) {
      UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
    }

    // Optional intrinsic stubs, each guarded by its enabling flag.
    if (UseCRC32Intrinsics) {
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    // Half-float conversions are generated only as a pair: each is
    // useful alone only if the other is also intrinsified.
    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
        vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
      StubRoutines::_hf2f = generate_float16ToFloat();
      StubRoutines::_f2hf = generate_floatToFloat16();
    }

    // Helpers for returning inline types in multiple registers
    // (see generate_return_value_stub above).
    if (InlineTypeReturnedAsFields) {
      StubRoutines::_load_inline_type_fields_in_regs =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
      StubRoutines::_store_inline_type_fields_to_buf =
        generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
    }

  }
11883
  // Generate the stubs used by virtual-thread continuations
  // (thaw, return barriers and preemption).
  void generate_continuation_stubs() {
    // Continuation stubs:
    StubRoutines::_cont_thaw          = generate_cont_thaw();
    StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
    StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
    StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
  }
11891
  // Final stub group, generated after universe initialization.
  void generate_final_stubs() {
    // support for verify_oop (must happen after universe_init)
    if (VerifyOops) {
      StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    }

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

    StubRoutines::aarch64::_spin_wait = generate_spin_wait();

    // Panama upcall (foreign-call-into-Java) support stubs.
    StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
    StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

    // Runtime-selected replacements for the default atomic stubs
    // declared at the bottom of this file.
    generate_atomic_entry_points();

#endif // LINUX

#ifdef COMPILER2
    if (UseSecondarySupersTable) {
      StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
      if (! InlineSecondarySupersTest) {
        generate_lookup_secondary_supers_table_stub();
      }
    }
#endif

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
      StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
    }

    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
  }
11929
  // Stubs used only by the JIT compilers (C2/JVMCI); most are guarded
  // by their corresponding Use*Intrinsic(s) flag.
  void generate_compiler_stubs() {
#if COMPILER2_OR_JVMCI

    // Without SVE, vector intrinsics need a NEON iota-indices constant.
    if (UseSVE == 0) {
      StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
    }

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    // arrays_hashcode stub for large arrays, one per element type.
    StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
    StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
    StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
    StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
    StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    // countPositives stub for large arrays.
    StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

    generate_compare_long_strings();

    generate_string_indexof_stubs();

#ifdef COMPILER2
    // BigInteger arithmetic intrinsics.
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseSIMDForBigIntegerShiftIntrinsics) {
      StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
      StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubId stub_id = StubId::stubgen_montgomerySquare_id;
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#endif // COMPILER2

    if (UseChaCha20Intrinsics) {
      StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
    }

    // ML-KEM (Kyber) post-quantum crypto intrinsics.
    if (UseKyberIntrinsics) {
      StubRoutines::_kyberNtt = generate_kyberNtt();
      StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
      StubRoutines::_kyberNttMult = generate_kyberNttMult();
      StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
      StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
      StubRoutines::_kyber12To16 = generate_kyber12To16();
      StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
    }

    // ML-DSA (Dilithium) post-quantum crypto intrinsics.
    if (UseDilithiumIntrinsics) {
      StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
      StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
      StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
      StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
      StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
    }

    if (UseBASE64Intrinsics) {
      StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
      StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
      StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
    }
    if (UseGHASHIntrinsics) {
      // The wide variant is used unconditionally; the narrow generator
      // is kept here, commented out, for reference.
      // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
    }
    // GCM requires both AES and GHASH support.
    if (UseAESIntrinsics && UseGHASHIntrinsics) {
      StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
    }

    // Message-digest compression stubs (single- and multi-block forms).
    if (UseMD5Intrinsics) {
      StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
      StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
    }
    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
      StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
    }
    if (UseSHA3Intrinsics) {

      StubRoutines::_double_keccak         = generate_double_keccak();
      // SHA3 has both a SIMD and a general-purpose-register variant.
      if (UseSIMDForSHA3Intrinsic) {
        StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
        StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
      } else {
        StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
        StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
      }
    }

    if (UsePoly1305Intrinsics) {
      StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

#endif // COMPILER2_OR_JVMCI
  }
12080
12081 public:
12082 StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
12083 switch(blob_id) {
12084 case BlobId::stubgen_preuniverse_id:
12085 generate_preuniverse_stubs();
12086 break;
12087 case BlobId::stubgen_initial_id:
12088 generate_initial_stubs();
12089 break;
12090 case BlobId::stubgen_continuation_id:
12091 generate_continuation_stubs();
12092 break;
12093 case BlobId::stubgen_compiler_id:
12094 generate_compiler_stubs();
12095 break;
12096 case BlobId::stubgen_final_id:
12097 generate_final_stubs();
12098 break;
12099 default:
12100 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12101 break;
12102 };
12103 }
12104 }; // end class declaration
12105
// Shared-runtime entry point: generate the stub group identified by
// blob_id into the given buffer. All work is done by the StubGenerator
// constructor; the object itself is only needed for its side effects.
void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
  StubGenerator g(code, blob_id);
}
12109
12110
#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.
//
// For each (operation, size, memory-order suffix) triple this macro
// declares the out-of-line default implementation (assembled from
// atomic_aarch64.S) and defines a function pointer, initially bound to
// that default. The pointers may later be repointed at runtime-generated
// code (see generate_atomic_entry_points in generate_final_stubs).

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

// Fetch-and-add, 4- and 8-byte, default and relaxed orderings.
DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
// Exchange, 4- and 8-byte.
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
// Compare-and-exchange: 1-, 4- and 8-byte, across the memory orderings
// the shared Atomic layer requires.
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX