1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "compiler/oopMap.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/interpreter.hpp"
36 #include "memory/universe.hpp"
37 #include "nativeInst_aarch64.hpp"
38 #include "oops/instanceOop.hpp"
39 #include "oops/method.hpp"
40 #include "oops/objArrayKlass.hpp"
41 #include "oops/oop.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "prims/upcallLinker.hpp"
44 #include "runtime/arguments.hpp"
45 #include "runtime/atomicAccess.hpp"
46 #include "runtime/continuation.hpp"
47 #include "runtime/continuationEntry.inline.hpp"
48 #include "runtime/frame.inline.hpp"
49 #include "runtime/handles.inline.hpp"
50 #include "runtime/javaThread.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubCodeGenerator.hpp"
53 #include "runtime/stubRoutines.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/checkedCast.hpp"
56 #include "utilities/debug.hpp"
57 #include "utilities/globalDefinitions.hpp"
58 #include "utilities/intpow.hpp"
59 #include "utilities/powerOfTwo.hpp"
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_ZGC
64 #include "gc/z/zThreadLocalData.hpp"
65 #endif
66
67 // Declaration and definition of StubGenerator (no .hpp file).
68 // For a more detailed description of the stub routine structure
69 // see the comment in stubRoutines.hpp
70
71 #undef __
72 #define __ _masm->
73
74 #ifdef PRODUCT
75 #define BLOCK_COMMENT(str) /* nothing */
76 #else
77 #define BLOCK_COMMENT(str) __ block_comment(str)
78 #endif
79
80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
81
82 // Stub Code definitions
83
84 class StubGenerator: public StubCodeGenerator {
85 private:
86
87 #ifdef PRODUCT
88 #define inc_counter_np(counter) ((void)0)
89 #else
90 void inc_counter_np_(uint& counter) {
91 __ incrementw(ExternalAddress((address)&counter));
92 }
93 #define inc_counter_np(counter) \
94 BLOCK_COMMENT("inc_counter " #counter); \
95 inc_counter_np_(counter);
96 #endif
97
98 // Call stubs are used to call Java from C
99 //
100 // Arguments:
101 // c_rarg0: call wrapper address address
102 // c_rarg1: result address
103 // c_rarg2: result type BasicType
104 // c_rarg3: method Method*
105 // c_rarg4: (interpreter) entry point address
106 // c_rarg5: parameters intptr_t*
107 // c_rarg6: parameter size (in words) int
108 // c_rarg7: thread Thread*
109 //
110 // There is no return from the stub itself as any Java result
111 // is written to result
112 //
113 // we save r30 (lr) as the return PC at the base of the frame and
114 // link r29 (fp) below it as the frame pointer installing sp (r31)
115 // into fp.
116 //
117 // we save r0-r7, which accounts for all the c arguments.
118 //
119 // TODO: strictly do we need to save them all? they are treated as
120 // volatile by C so could we omit saving the ones we are going to
121 // place in global registers (thread? method?) or those we only use
122 // during setup of the Java call?
123 //
124 // we don't need to save r8 which C uses as an indirect result location
125 // return register.
126 //
127 // we don't need to save r9-r15 which both C and Java treat as
128 // volatile
129 //
130 // we don't need to save r16-18 because Java does not use them
131 //
132 // we save r19-r28 which Java uses as scratch registers and C
133 // expects to be callee-save
134 //
135 // we save the bottom 64 bits of each value stored in v8-v15; it is
136 // the responsibility of the caller to preserve larger values.
137 //
138 // so the stub frame looks like this when we enter Java code
139 //
140 // [ return_from_Java ] <--- sp
141 // [ argument word n ]
142 // ...
143 // -29 [ argument word 1 ]
144 // -28 [ saved Floating-point Control Register ]
145 // -26 [ saved v15 ] <--- sp_after_call
146 // -25 [ saved v14 ]
147 // -24 [ saved v13 ]
148 // -23 [ saved v12 ]
149 // -22 [ saved v11 ]
150 // -21 [ saved v10 ]
151 // -20 [ saved v9 ]
152 // -19 [ saved v8 ]
153 // -18 [ saved r28 ]
154 // -17 [ saved r27 ]
155 // -16 [ saved r26 ]
156 // -15 [ saved r25 ]
157 // -14 [ saved r24 ]
158 // -13 [ saved r23 ]
159 // -12 [ saved r22 ]
160 // -11 [ saved r21 ]
161 // -10 [ saved r20 ]
162 // -9 [ saved r19 ]
163 // -8 [ call wrapper (r0) ]
164 // -7 [ result (r1) ]
165 // -6 [ result type (r2) ]
166 // -5 [ method (r3) ]
167 // -4 [ entry point (r4) ]
168 // -3 [ parameters (r5) ]
169 // -2 [ parameter size (r6) ]
170 // -1 [ thread (r7) ]
171 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
172 // 1 [ saved lr (r30) ]
173
174 // Call stub stack layout word offsets from fp
175 enum call_stub_layout {
176 sp_after_call_off = -28,
177
178 fpcr_off = sp_after_call_off,
179 d15_off = -26,
180 d13_off = -24,
181 d11_off = -22,
182 d9_off = -20,
183
184 r28_off = -18,
185 r26_off = -16,
186 r24_off = -14,
187 r22_off = -12,
188 r20_off = -10,
189 call_wrapper_off = -8,
190 result_off = -7,
191 result_type_off = -6,
192 method_off = -5,
193 entry_point_off = -4,
194 parameter_size_off = -2,
195 thread_off = -1,
196 fp_f = 0,
197 retaddr_off = 1,
198 };
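
  // A quick worked example of the offsets above (a sketch, assuming the
  // usual 64-bit wordSize of 8): thread_off == -1 puts the saved thread
  // at [rfp - 8], d15_off == -26 puts the v15/v14 pair at [rfp - 208],
  // and sp_after_call_off == -28 corresponds to rfp - 224, which is
  // exactly where generate_call_stub() below parks sp with
  // sub(sp, rfp, -sp_after_call_off * wordSize).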
199
200 address generate_call_stub(address& return_address) {
201 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
202 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
203 "adjust this code");
204
205 StubId stub_id = StubId::stubgen_call_stub_id;
206 StubCodeMark mark(this, stub_id);
207 address start = __ pc();
208
209 const Address sp_after_call (rfp, sp_after_call_off * wordSize);
210
211 const Address fpcr_save (rfp, fpcr_off * wordSize);
212 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
213 const Address result (rfp, result_off * wordSize);
214 const Address result_type (rfp, result_type_off * wordSize);
215 const Address method (rfp, method_off * wordSize);
216 const Address entry_point (rfp, entry_point_off * wordSize);
217 const Address parameter_size(rfp, parameter_size_off * wordSize);
218
219 const Address thread (rfp, thread_off * wordSize);
220
221 const Address d15_save (rfp, d15_off * wordSize);
222 const Address d13_save (rfp, d13_off * wordSize);
223 const Address d11_save (rfp, d11_off * wordSize);
224 const Address d9_save (rfp, d9_off * wordSize);
225
226 const Address r28_save (rfp, r28_off * wordSize);
227 const Address r26_save (rfp, r26_off * wordSize);
228 const Address r24_save (rfp, r24_off * wordSize);
229 const Address r22_save (rfp, r22_off * wordSize);
230 const Address r20_save (rfp, r20_off * wordSize);
231
232 // stub code
233
234 address aarch64_entry = __ pc();
235
236 // set up frame and move sp to end of save area
237 __ enter();
238 __ sub(sp, rfp, -sp_after_call_off * wordSize);
239
240 // save register parameters and Java scratch/global registers
241 // n.b. we save thread even though it gets installed in
242 // rthread because we want to sanity check rthread later
243 __ str(c_rarg7, thread);
244 __ strw(c_rarg6, parameter_size);
245 __ stp(c_rarg4, c_rarg5, entry_point);
246 __ stp(c_rarg2, c_rarg3, result_type);
247 __ stp(c_rarg0, c_rarg1, call_wrapper);
248
249 __ stp(r20, r19, r20_save);
250 __ stp(r22, r21, r22_save);
251 __ stp(r24, r23, r24_save);
252 __ stp(r26, r25, r26_save);
253 __ stp(r28, r27, r28_save);
254
255 __ stpd(v9, v8, d9_save);
256 __ stpd(v11, v10, d11_save);
257 __ stpd(v13, v12, d13_save);
258 __ stpd(v15, v14, d15_save);
259
260 __ get_fpcr(rscratch1);
261 __ str(rscratch1, fpcr_save);
262 // Set FPCR to the state we need. We do want Round to Nearest. We
263 // don't want non-IEEE rounding modes or floating-point traps.
264 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
265 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
266 __ set_fpcr(rscratch1);
267
268 // install Java thread in global register now we have saved
269 // whatever value it held
270 __ mov(rthread, c_rarg7);
271 // And method
272 __ mov(rmethod, c_rarg3);
273
274 // set up the heapbase register
275 __ reinit_heapbase();
276
277 #ifdef ASSERT
278 // make sure we have no pending exceptions
279 {
280 Label L;
281 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
282 __ cmp(rscratch1, (u1)NULL_WORD);
283 __ br(Assembler::EQ, L);
284 __ stop("StubRoutines::call_stub: entered with pending exception");
285 __ BIND(L);
286 }
287 #endif
288 // pass parameters if any
289 __ mov(esp, sp);
290 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
291 __ andr(sp, rscratch1, -2 * wordSize);
292
293 BLOCK_COMMENT("pass parameters if any");
294 Label parameters_done;
295 // parameter count is still in c_rarg6
296 // and parameter pointer identifying param 1 is in c_rarg5
297 __ cbzw(c_rarg6, parameters_done);
298
299 address loop = __ pc();
300 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
301 __ subsw(c_rarg6, c_rarg6, 1);
302 __ push(rscratch1);
303 __ br(Assembler::GT, loop);
304
305 __ BIND(parameters_done);
306
    // call Java entry -- passing Method*, and current sp
308 // rmethod: Method*
309 // r19_sender_sp: sender sp
310 BLOCK_COMMENT("call Java function");
311 __ mov(r19_sender_sp, sp);
312 __ blr(c_rarg4);
313
314 // we do this here because the notify will already have been done
315 // if we get to the next instruction via an exception
316 //
317 // n.b. adding this instruction here affects the calculation of
318 // whether or not a routine returns to the call stub (used when
319 // doing stack walks) since the normal test is to check the return
320 // pc against the address saved below. so we may need to allow for
321 // this extra instruction in the check.
322
323 // save current address for use by exception handling code
324
325 return_address = __ pc();
326
327 // store result depending on type (everything that is not
328 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
329 // n.b. this assumes Java returns an integral result in r0
330 // and a floating result in j_farg0
331 __ ldr(j_rarg2, result);
332 Label is_long, is_float, is_double, exit;
333 __ ldr(j_rarg1, result_type);
334 __ cmp(j_rarg1, (u1)T_OBJECT);
335 __ br(Assembler::EQ, is_long);
336 __ cmp(j_rarg1, (u1)T_LONG);
337 __ br(Assembler::EQ, is_long);
338 __ cmp(j_rarg1, (u1)T_FLOAT);
339 __ br(Assembler::EQ, is_float);
340 __ cmp(j_rarg1, (u1)T_DOUBLE);
341 __ br(Assembler::EQ, is_double);
342
343 // handle T_INT case
344 __ strw(r0, Address(j_rarg2));
345
346 __ BIND(exit);
347
348 // pop parameters
349 __ sub(esp, rfp, -sp_after_call_off * wordSize);
350
351 #ifdef ASSERT
352 // verify that threads correspond
353 {
354 Label L, S;
355 __ ldr(rscratch1, thread);
356 __ cmp(rthread, rscratch1);
357 __ br(Assembler::NE, S);
358 __ get_thread(rscratch1);
359 __ cmp(rthread, rscratch1);
360 __ br(Assembler::EQ, L);
361 __ BIND(S);
362 __ stop("StubRoutines::call_stub: threads must correspond");
363 __ BIND(L);
364 }
365 #endif
366
367 __ pop_cont_fastpath(rthread);
368
369 // restore callee-save registers
370 __ ldpd(v15, v14, d15_save);
371 __ ldpd(v13, v12, d13_save);
372 __ ldpd(v11, v10, d11_save);
373 __ ldpd(v9, v8, d9_save);
374
375 __ ldp(r28, r27, r28_save);
376 __ ldp(r26, r25, r26_save);
377 __ ldp(r24, r23, r24_save);
378 __ ldp(r22, r21, r22_save);
379 __ ldp(r20, r19, r20_save);
380
381 // restore fpcr
382 __ ldr(rscratch1, fpcr_save);
383 __ set_fpcr(rscratch1);
384
385 __ ldp(c_rarg0, c_rarg1, call_wrapper);
386 __ ldrw(c_rarg2, result_type);
387 __ ldr(c_rarg3, method);
388 __ ldp(c_rarg4, c_rarg5, entry_point);
389 __ ldp(c_rarg6, c_rarg7, parameter_size);
390
391 // leave frame and return to caller
392 __ leave();
393 __ ret(lr);
394
395 // handle return types different from T_INT
396
397 __ BIND(is_long);
398 __ str(r0, Address(j_rarg2, 0));
399 __ br(Assembler::AL, exit);
400
401 __ BIND(is_float);
402 __ strs(j_farg0, Address(j_rarg2, 0));
403 __ br(Assembler::AL, exit);
404
405 __ BIND(is_double);
406 __ strd(j_farg0, Address(j_rarg2, 0));
407 __ br(Assembler::AL, exit);
408
409 return start;
410 }
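
  // For reference, the stub above is entered from C++ land via the
  // CallStub function pointer type declared in stubRoutines.hpp. A
  // minimal sketch of the call site (argument names here are
  // illustrative, not the exact locals used by JavaCalls::call_helper):
  //
  //   StubRoutines::call_stub()(
  //       (address)&link,      // c_rarg0: call wrapper
  //       result_address,      // c_rarg1: where to write the result
  //       result_type,         // c_rarg2: BasicType of the result
  //       method,              // c_rarg3: Method*
  //       entry_point,         // c_rarg4: interpreter entry point
  //       parameters,          // c_rarg5: intptr_t* argument block
  //       parameter_size,      // c_rarg6: argument count in words
  //       thread);             // c_rarg7: current thread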
411
412 // Return point for a Java call if there's an exception thrown in
413 // Java code. The exception is caught and transformed into a
414 // pending exception stored in JavaThread that can be tested from
415 // within the VM.
416 //
417 // Note: Usually the parameters are removed by the callee. In case
418 // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // stack pointer.
421 //
422 // r0: exception oop
423
424 address generate_catch_exception() {
425 StubId stub_id = StubId::stubgen_catch_exception_id;
426 StubCodeMark mark(this, stub_id);
427 address start = __ pc();
428
429 // same as in generate_call_stub():
430 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
431 const Address thread (rfp, thread_off * wordSize);
432
433 #ifdef ASSERT
434 // verify that threads correspond
435 {
436 Label L, S;
437 __ ldr(rscratch1, thread);
438 __ cmp(rthread, rscratch1);
439 __ br(Assembler::NE, S);
440 __ get_thread(rscratch1);
441 __ cmp(rthread, rscratch1);
442 __ br(Assembler::EQ, L);
443 __ bind(S);
444 __ stop("StubRoutines::catch_exception: threads must correspond");
445 __ bind(L);
446 }
447 #endif
448
449 // set pending exception
450 __ verify_oop(r0);
451
452 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
453 __ mov(rscratch1, (address)__FILE__);
454 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
455 __ movw(rscratch1, (int)__LINE__);
456 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
457
458 // complete return to VM
459 assert(StubRoutines::_call_stub_return_address != nullptr,
460 "_call_stub_return_address must have been generated before");
461 __ b(StubRoutines::_call_stub_return_address);
462
463 return start;
464 }
465
466 // Continuation point for runtime calls returning with a pending
467 // exception. The pending exception check happened in the runtime
468 // or native call stub. The pending exception in Thread is
469 // converted into a Java-level exception.
470 //
471 // Contract with Java-level exception handlers:
472 // r0: exception
473 // r3: throwing pc
474 //
475 // NOTE: At entry of this stub, exception-pc must be in LR !!
476
477 // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
479
480 address generate_forward_exception() {
481 StubId stub_id = StubId::stubgen_forward_exception_id;
482 StubCodeMark mark(this, stub_id);
483 address start = __ pc();
484
485 // Upon entry, LR points to the return address returning into
486 // Java (interpreted or compiled) code; i.e., the return address
487 // becomes the throwing pc.
488 //
489 // Arguments pushed before the runtime call are still on the stack
490 // but the exception handler will reset the stack pointer ->
491 // ignore them. A potential result in registers can be ignored as
492 // well.
493
494 #ifdef ASSERT
495 // make sure this code is only executed if there is a pending exception
496 {
497 Label L;
498 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
499 __ cbnz(rscratch1, L);
500 __ stop("StubRoutines::forward exception: no pending exception (1)");
501 __ bind(L);
502 }
503 #endif
504
505 // compute exception handler into r19
506
507 // call the VM to find the handler address associated with the
508 // caller address. pass thread in r0 and caller pc (ret address)
509 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
510 // the stack.
511 __ mov(c_rarg1, lr);
512 // lr will be trashed by the VM call so we move it to R19
513 // (callee-saved) because we also need to pass it to the handler
514 // returned by this call.
515 __ mov(r19, lr);
516 BLOCK_COMMENT("call exception_handler_for_return_address");
517 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
518 SharedRuntime::exception_handler_for_return_address),
519 rthread, c_rarg1);
520 // Reinitialize the ptrue predicate register, in case the external runtime
521 // call clobbers ptrue reg, as we may return to SVE compiled code.
522 __ reinitialize_ptrue();
523
524 // we should not really care that lr is no longer the callee
525 // address. we saved the value the handler needs in r19 so we can
526 // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
528 // the PC for the frame above the handler belongs to a compiled
529 // Java method. So, we restore lr here to satisfy that assert.
530 __ mov(lr, r19);
531 // setup r0 & r3 & clear pending exception
532 __ mov(r3, r19);
533 __ mov(r19, r0);
534 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
535 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
536
537 #ifdef ASSERT
538 // make sure exception is set
539 {
540 Label L;
541 __ cbnz(r0, L);
542 __ stop("StubRoutines::forward exception: no pending exception (2)");
543 __ bind(L);
544 }
545 #endif
546
547 // continue at exception handler
548 // r0: exception
549 // r3: throwing pc
550 // r19: exception handler
551 __ verify_oop(r0);
552 __ br(r19);
553
554 return start;
555 }
556
557 // Non-destructive plausibility checks for oops
558 //
559 // Arguments:
560 // r0: oop to verify
561 // rscratch1: error message
562 //
563 // Stack after saving c_rarg3:
564 // [tos + 0]: saved c_rarg3
565 // [tos + 1]: saved c_rarg2
566 // [tos + 2]: saved lr
567 // [tos + 3]: saved rscratch2
568 // [tos + 4]: saved r0
569 // [tos + 5]: saved rscratch1
570 address generate_verify_oop() {
571 StubId stub_id = StubId::stubgen_verify_oop_id;
572 StubCodeMark mark(this, stub_id);
573 address start = __ pc();
574
575 Label exit, error;
576
577 // save c_rarg2 and c_rarg3
578 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
579
580 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
581 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
582 __ ldr(c_rarg3, Address(c_rarg2));
583 __ add(c_rarg3, c_rarg3, 1);
584 __ str(c_rarg3, Address(c_rarg2));
585
586 // object is in r0
587 // make sure object is 'reasonable'
588 __ cbz(r0, exit); // if obj is null it is OK
589
590 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
591 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
592
593 // return if everything seems ok
594 __ bind(exit);
595
596 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
597 __ ret(lr);
598
599 // handle errors
600 __ bind(error);
601 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
602
603 __ push(RegSet::range(r0, r29), sp);
604 // debug(char* msg, int64_t pc, int64_t regs[])
605 __ mov(c_rarg0, rscratch1); // pass address of error message
606 __ mov(c_rarg1, lr); // pass return address
607 __ mov(c_rarg2, sp); // pass address of regs on stack
608 #ifndef PRODUCT
609 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
610 #endif
611 BLOCK_COMMENT("call MacroAssembler::debug");
612 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
613 __ blr(rscratch1);
614 __ hlt(0);
615
616 return start;
617 }
618
619 // Generate indices for iota vector.
620 address generate_iota_indices(StubId stub_id) {
621 __ align(CodeEntryAlignment);
622 StubCodeMark mark(this, stub_id);
623 address start = __ pc();
624 // B
625 __ emit_data64(0x0706050403020100, relocInfo::none);
626 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
627 // H
628 __ emit_data64(0x0003000200010000, relocInfo::none);
629 __ emit_data64(0x0007000600050004, relocInfo::none);
630 // S
631 __ emit_data64(0x0000000100000000, relocInfo::none);
632 __ emit_data64(0x0000000300000002, relocInfo::none);
633 // D
634 __ emit_data64(0x0000000000000000, relocInfo::none);
635 __ emit_data64(0x0000000000000001, relocInfo::none);
636 // S - FP
637 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
638 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
639 // D - FP
640 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
641 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
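    // n.b. each 64-bit row above is a little-endian lane sequence; for
    // example the first H row 0x0003000200010000 decodes to the halfword
    // lanes 0, 1, 2, 3 and the first S-FP row 0x3F80000000000000 decodes
    // to the float lane pair 0.0f, 1.0f.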
642 return start;
643 }
644
645 // The inner part of zero_words(). This is the bulk operation,
646 // zeroing words in blocks, possibly using DC ZVA to do it. The
647 // caller is responsible for zeroing the last few words.
648 //
649 // Inputs:
650 // r10: the HeapWord-aligned base address of an array to zero.
651 // r11: the count in HeapWords, r11 > 0.
652 //
653 // Returns r10 and r11, adjusted for the caller to clear.
654 // r10: the base address of the tail of words left to clear.
655 // r11: the number of words in the tail.
656 // r11 < MacroAssembler::zero_words_block_size.
657
658 address generate_zero_blocks() {
659 Label done;
660 Label base_aligned;
661
662 Register base = r10, cnt = r11;
663
664 __ align(CodeEntryAlignment);
665 StubId stub_id = StubId::stubgen_zero_blocks_id;
666 StubCodeMark mark(this, stub_id);
667 address start = __ pc();
668
669 if (UseBlockZeroing) {
670 int zva_length = VM_Version::zva_length();
671
672 // Ensure ZVA length can be divided by 16. This is required by
673 // the subsequent operations.
674 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
675
676 __ tbz(base, 3, base_aligned);
677 __ str(zr, Address(__ post(base, 8)));
678 __ sub(cnt, cnt, 1);
679 __ bind(base_aligned);
680
681 // Ensure count >= zva_length * 2 so that it still deserves a zva after
682 // alignment.
683 Label small;
684 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
685 __ subs(rscratch1, cnt, low_limit >> 3);
686 __ br(Assembler::LT, small);
687 __ zero_dcache_blocks(base, cnt);
688 __ bind(small);
689 }
690
691 {
692 // Number of stp instructions we'll unroll
693 const int unroll =
694 MacroAssembler::zero_words_block_size / 2;
695 // Clear the remaining blocks.
696 Label loop;
697 __ subs(cnt, cnt, unroll * 2);
698 __ br(Assembler::LT, done);
699 __ bind(loop);
700 for (int i = 0; i < unroll; i++)
701 __ stp(zr, zr, __ post(base, 16));
702 __ subs(cnt, cnt, unroll * 2);
703 __ br(Assembler::GE, loop);
704 __ bind(done);
705 __ add(cnt, cnt, unroll * 2);
706 }
707
708 __ ret(lr);
709
710 return start;
711 }
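
  // As a worked example of the DC ZVA gate above (a sketch; the exact
  // numbers depend on the CPU and on BlockZeroingLowLimit): with
  // zva_length == 64 bytes and a BlockZeroingLowLimit of 256 bytes,
  // low_limit == MAX2(128, 256) == 256 bytes, i.e. 32 words after the
  // >> 3, so the zero_dcache_blocks() path is only taken when at least
  // 32 words remain after the 8-byte alignment step.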
712
713
714 typedef enum {
715 copy_forwards = 1,
716 copy_backwards = -1
717 } copy_direction;
718
719 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
720 // for arraycopy stubs.
721 class ArrayCopyBarrierSetHelper : StackObj {
722 BarrierSetAssembler* _bs_asm;
723 MacroAssembler* _masm;
724 DecoratorSet _decorators;
725 BasicType _type;
726 Register _gct1;
727 Register _gct2;
728 Register _gct3;
729 FloatRegister _gcvt1;
730 FloatRegister _gcvt2;
731 FloatRegister _gcvt3;
732
733 public:
734 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
735 DecoratorSet decorators,
736 BasicType type,
737 Register gct1,
738 Register gct2,
739 Register gct3,
740 FloatRegister gcvt1,
741 FloatRegister gcvt2,
742 FloatRegister gcvt3)
743 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
744 _masm(masm),
745 _decorators(decorators),
746 _type(type),
747 _gct1(gct1),
748 _gct2(gct2),
749 _gct3(gct3),
750 _gcvt1(gcvt1),
751 _gcvt2(gcvt2),
752 _gcvt3(gcvt3) {
753 }
754
755 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
756 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
757 dst1, dst2, src,
758 _gct1, _gct2, _gcvt1);
759 }
760
761 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
762 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
763 dst, src1, src2,
764 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
765 }
766
767 void copy_load_at_16(Register dst1, Register dst2, Address src) {
768 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
769 dst1, dst2, src,
770 _gct1);
771 }
772
773 void copy_store_at_16(Address dst, Register src1, Register src2) {
774 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
775 dst, src1, src2,
776 _gct1, _gct2, _gct3);
777 }
778
779 void copy_load_at_8(Register dst, Address src) {
780 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
781 dst, noreg, src,
782 _gct1);
783 }
784
785 void copy_store_at_8(Address dst, Register src) {
786 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
787 dst, src, noreg,
788 _gct1, _gct2, _gct3);
789 }
790 };
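
  // Typical use, as in generate_copy_longs() and copy_memory() below:
  // construct one helper per stub with the stub's GC temp registers,
  // e.g.
  //
  //   ArrayCopyBarrierSetHelper bs(_masm, decorators, type,
  //                                rscratch1, rscratch2, r10, v6, v7, v16);
  //   bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
  //   bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
  //
  // so that the GC barrier set can interpose on each load/store pair.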
791
792 // Bulk copy of blocks of 8 words.
793 //
794 // count is a count of words.
795 //
796 // Precondition: count >= 8
797 //
798 // Postconditions:
799 //
800 // The least significant bit of count contains the remaining count
801 // of words to copy. The rest of count is trash.
802 //
803 // s and d are adjusted to point to the remaining words to copy
804 //
805 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
806 BasicType type;
807 copy_direction direction;
808
809 switch (stub_id) {
810 case StubId::stubgen_copy_byte_f_id:
811 direction = copy_forwards;
812 type = T_BYTE;
813 break;
814 case StubId::stubgen_copy_byte_b_id:
815 direction = copy_backwards;
816 type = T_BYTE;
817 break;
818 case StubId::stubgen_copy_oop_f_id:
819 direction = copy_forwards;
820 type = T_OBJECT;
821 break;
822 case StubId::stubgen_copy_oop_b_id:
823 direction = copy_backwards;
824 type = T_OBJECT;
825 break;
826 case StubId::stubgen_copy_oop_uninit_f_id:
827 direction = copy_forwards;
828 type = T_OBJECT;
829 break;
830 case StubId::stubgen_copy_oop_uninit_b_id:
831 direction = copy_backwards;
832 type = T_OBJECT;
833 break;
834 default:
835 ShouldNotReachHere();
836 }
837
838 int unit = wordSize * direction;
839 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
840
841 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
842 t4 = r7, t5 = r11, t6 = r12, t7 = r13;
843 const Register stride = r14;
844 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
845 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
846 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
847
848 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
849 assert_different_registers(s, d, count, rscratch1, rscratch2);
850
851 Label again, drain;
852
853 __ align(CodeEntryAlignment);
854
855 StubCodeMark mark(this, stub_id);
856
857 address start = __ pc();
858
859 Label unaligned_copy_long;
860 if (AvoidUnalignedAccesses) {
861 __ tbnz(d, 3, unaligned_copy_long);
862 }
863
864 if (direction == copy_forwards) {
865 __ sub(s, s, bias);
866 __ sub(d, d, bias);
867 }
868
869 #ifdef ASSERT
870 // Make sure we are never given < 8 words
871 {
872 Label L;
873 __ cmp(count, (u1)8);
874 __ br(Assembler::GE, L);
875 __ stop("genrate_copy_longs called with < 8 words");
876 __ bind(L);
877 }
878 #endif
879
880 // Fill 8 registers
881 if (UseSIMDForMemoryOps) {
882 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
883 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
884 } else {
885 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
886 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
887 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
888 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
889 }
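
    // e.g. in the non-SIMD forwards case (unit == 8, bias == 16) the
    // sub(s, s, bias) above means Address(s, 2 * unit) reads words 0
    // and 1 of the original source block, Address(s, 4 * unit) words 2
    // and 3, and the final pre-indexed load leaves s pointing 48 bytes
    // past the original base having just read words 6 and 7.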
890
891 __ subs(count, count, 16);
892 __ br(Assembler::LO, drain);
893
894 int prefetch = PrefetchCopyIntervalInBytes;
895 bool use_stride = false;
896 if (direction == copy_backwards) {
897 use_stride = prefetch > 256;
898 prefetch = -prefetch;
899 if (use_stride) __ mov(stride, prefetch);
900 }
901
902 __ bind(again);
903
904 if (PrefetchCopyIntervalInBytes > 0)
905 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
906
907 if (UseSIMDForMemoryOps) {
908 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
909 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
910 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
911 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
912 } else {
913 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
914 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
915 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
916 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
917 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
918 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
919 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
920 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
921 }
922
923 __ subs(count, count, 8);
924 __ br(Assembler::HS, again);
925
926 // Drain
927 __ bind(drain);
928 if (UseSIMDForMemoryOps) {
929 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
930 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
931 } else {
932 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
933 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
934 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
935 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
936 }
937
938 {
939 Label L1, L2;
940 __ tbz(count, exact_log2(4), L1);
941 if (UseSIMDForMemoryOps) {
942 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
943 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
944 } else {
945 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
946 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
947 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
948 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
949 }
950 __ bind(L1);
951
952 if (direction == copy_forwards) {
953 __ add(s, s, bias);
954 __ add(d, d, bias);
955 }
956
957 __ tbz(count, 1, L2);
958 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
959 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
960 __ bind(L2);
961 }
962
963 __ ret(lr);
964
965 if (AvoidUnalignedAccesses) {
966 Label drain, again;
967 // Register order for storing. Order is different for backward copy.
968
969 __ bind(unaligned_copy_long);
970
971 // source address is even aligned, target odd aligned
972 //
973 // when forward copying word pairs we read long pairs at offsets
974 // {0, 2, 4, 6} (in long words). when backwards copying we read
975 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
976 // address by -2 in the forwards case so we can compute the
977 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
978 // or -1.
979 //
980 // when forward copying we need to store 1 word, 3 pairs and
981 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
    // zero offset we adjust the destination by -1, which means we
983 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
984 //
    // When backwards copying we need to store 1 word, 3 pairs and
986 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
987 // offsets {1, 3, 5, 7, 8} * unit.
988
989 if (direction == copy_forwards) {
990 __ sub(s, s, 16);
991 __ sub(d, d, 8);
992 }
993
994 // Fill 8 registers
995 //
996 // for forwards copy s was offset by -16 from the original input
997 // value of s so the register contents are at these offsets
    // relative to the 64 byte block addressed by that original input
999 // and so on for each successive 64 byte block when s is updated
1000 //
1001 // t0 at offset 0, t1 at offset 8
1002 // t2 at offset 16, t3 at offset 24
1003 // t4 at offset 32, t5 at offset 40
1004 // t6 at offset 48, t7 at offset 56
1005
1006 // for backwards copy s was not offset so the register contents
1007 // are at these offsets into the preceding 64 byte block
1008 // relative to that original input and so on for each successive
1009 // preceding 64 byte block when s is updated. this explains the
1010 // slightly counter-intuitive looking pattern of register usage
1011 // in the stp instructions for backwards copy.
1012 //
1013 // t0 at offset -16, t1 at offset -8
1014 // t2 at offset -32, t3 at offset -24
1015 // t4 at offset -48, t5 at offset -40
1016 // t6 at offset -64, t7 at offset -56
1017
1018 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1019 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1020 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1021 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1022
1023 __ subs(count, count, 16);
1024 __ br(Assembler::LO, drain);
1025
1026 int prefetch = PrefetchCopyIntervalInBytes;
1027 bool use_stride = false;
1028 if (direction == copy_backwards) {
1029 use_stride = prefetch > 256;
1030 prefetch = -prefetch;
1031 if (use_stride) __ mov(stride, prefetch);
1032 }
1033
1034 __ bind(again);
1035
1036 if (PrefetchCopyIntervalInBytes > 0)
1037 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1038
1039 if (direction == copy_forwards) {
1040 // allowing for the offset of -8 the store instructions place
1041 // registers into the target 64 bit block at the following
1042 // offsets
1043 //
1044 // t0 at offset 0
1045 // t1 at offset 8, t2 at offset 16
1046 // t3 at offset 24, t4 at offset 32
1047 // t5 at offset 40, t6 at offset 48
1048 // t7 at offset 56
1049
1050 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1051 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1052 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1053 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1054 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1055 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1056 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1057 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1058 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1059 } else {
1060 // d was not offset when we started so the registers are
        // written into the 64 byte block preceding d with the following
1062 // offsets
1063 //
1064 // t1 at offset -8
1065 // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
1067 // t7 at offset -56, t4 at offset -48
1068 // t6 at offset -64
1069 //
1070 // note that this matches the offsets previously noted for the
1071 // loads
1072
1073 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1074 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1075 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1076 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1077 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1078 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1079 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1080 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1081 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1082 }
1083
1084 __ subs(count, count, 8);
1085 __ br(Assembler::HS, again);
1086
1087 // Drain
1088 //
1089 // this uses the same pattern of offsets and register arguments
1090 // as above
1091 __ bind(drain);
1092 if (direction == copy_forwards) {
1093 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1094 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1095 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1096 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1097 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1098 } else {
1099 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1100 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1101 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1102 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1103 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1104 }
1105 // now we need to copy any remaining part block which may
    // include a 4 word subblock and/or a 2 word subblock.
1107 // bits 2 and 1 in the count are the tell-tale for whether we
1108 // have each such subblock
1109 {
1110 Label L1, L2;
1111 __ tbz(count, exact_log2(4), L1);
1112 // this is the same as above but copying only 4 longs hence
1113 // with only one intervening stp between the str instructions
1114 // but note that the offsets and registers still follow the
1115 // same pattern
1116 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1117 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1118 if (direction == copy_forwards) {
1119 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1120 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1121 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1122 } else {
1123 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1124 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1125 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1126 }
1127 __ bind(L1);
1128
1129 __ tbz(count, 1, L2);
1130 // this is the same as above but copying only 2 longs hence
1131 // there is no intervening stp between the str instructions
1132 // but note that the offset and register patterns are still
1133 // the same
1134 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1135 if (direction == copy_forwards) {
1136 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1137 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1138 } else {
1139 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1140 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1141 }
1142 __ bind(L2);
1143
1144 // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written
1146
1147 if (direction == copy_forwards) {
1148 __ add(s, s, 16);
1149 __ add(d, d, 8);
1150 }
1151
1152 }
1153
1154 __ ret(lr);
1155 }
1156
1157 return start;
1158 }
1159
1160 // Small copy: less than 16 bytes.
1161 //
1162 // NB: Ignores all of the bits of count which represent more than 15
1163 // bytes, so a caller doesn't have to mask them.
1164
1165 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1166 bool is_backwards = step < 0;
1167 size_t granularity = g_uabs(step);
1168 int direction = is_backwards ? -1 : 1;
1169
1170 Label Lword, Lint, Lshort, Lbyte;
1171
1172 assert(granularity
1173 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1174
1175 const Register t0 = r3;
1176 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1177 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1178
1179 // ??? I don't know if this bit-test-and-branch is the right thing
1180 // to do. It does a lot of jumping, resulting in several
1181 // mispredicted branches. It might make more sense to do this
1182 // with something like Duff's device with a single computed branch.
1183
1184 __ tbz(count, 3 - exact_log2(granularity), Lword);
1185 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1186 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1187 __ bind(Lword);
1188
1189 if (granularity <= sizeof (jint)) {
1190 __ tbz(count, 2 - exact_log2(granularity), Lint);
1191 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1192 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1193 __ bind(Lint);
1194 }
1195
1196 if (granularity <= sizeof (jshort)) {
1197 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1198 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1199 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1200 __ bind(Lshort);
1201 }
1202
1203 if (granularity <= sizeof (jbyte)) {
1204 __ tbz(count, 0, Lbyte);
1205 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1206 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1207 __ bind(Lbyte);
1208 }
1209 }
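
  // Worked example: a 7 byte tail with granularity == 1 has count ==
  // 0b111, so bit 3 is clear (no 8 byte word), bit 2 copies a 4 byte
  // int, bit 1 copies a 2 byte short and bit 0 copies the final byte,
  // for 4 + 2 + 1 == 7 bytes in all.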
1210
1211 // All-singing all-dancing memory copy.
1212 //
1213 // Copy count units of memory from s to d. The size of a unit is
1214 // step, which can be positive or negative depending on the direction
1215 // of copy. If is_aligned is false, we align the source address.
1216 //
1217
1218 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1219 Register s, Register d, Register count, int step) {
1220 copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1221 bool is_backwards = step < 0;
1222 unsigned int granularity = g_uabs(step);
1223 const Register t0 = r3, t1 = r4;
1224
    // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction
    // doesn't matter because we always
1226 // load all the data before writing anything
1227 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1228 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1229 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1230 const Register send = r17, dend = r16;
1231 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1232 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1233 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1234
1235 if (PrefetchCopyIntervalInBytes > 0)
1236 __ prfm(Address(s, 0), PLDL1KEEP);
1237 __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1238 __ br(Assembler::HI, copy_big);
1239
1240 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1241 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1242
1243 __ cmp(count, u1(16/granularity));
1244 __ br(Assembler::LS, copy16);
1245
1246 __ cmp(count, u1(64/granularity));
1247 __ br(Assembler::HI, copy80);
1248
1249 __ cmp(count, u1(32/granularity));
1250 __ br(Assembler::LS, copy32);
1251
1252 // 33..64 bytes
1253 if (UseSIMDForMemoryOps) {
1254 bs.copy_load_at_32(v0, v1, Address(s, 0));
1255 bs.copy_load_at_32(v2, v3, Address(send, -32));
1256 bs.copy_store_at_32(Address(d, 0), v0, v1);
1257 bs.copy_store_at_32(Address(dend, -32), v2, v3);
1258 } else {
1259 bs.copy_load_at_16(t0, t1, Address(s, 0));
1260 bs.copy_load_at_16(t2, t3, Address(s, 16));
1261 bs.copy_load_at_16(t4, t5, Address(send, -32));
1262 bs.copy_load_at_16(t6, t7, Address(send, -16));
1263
1264 bs.copy_store_at_16(Address(d, 0), t0, t1);
1265 bs.copy_store_at_16(Address(d, 16), t2, t3);
1266 bs.copy_store_at_16(Address(dend, -32), t4, t5);
1267 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1268 }
1269 __ b(finish);
1270
1271 // 17..32 bytes
1272 __ bind(copy32);
1273 bs.copy_load_at_16(t0, t1, Address(s, 0));
1274 bs.copy_load_at_16(t6, t7, Address(send, -16));
1275
1276 bs.copy_store_at_16(Address(d, 0), t0, t1);
1277 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1278 __ b(finish);
1279
1280 // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1282 __ bind(copy80);
1283 if (UseSIMDForMemoryOps) {
1284 bs.copy_load_at_32(v0, v1, Address(s, 0));
1285 bs.copy_load_at_32(v2, v3, Address(s, 32));
1286 // Unaligned pointers can be an issue for copying.
      // The issue is more likely when the granularity of the data is
      // less than 4 (sizeof(jint)). Pointers for arrays of jint are at
      // least 4 byte aligned. Pointers for arrays of jlong are 8 byte
      // aligned.
      // The largest performance drop has been seen for the range 65-80
      // bytes. For such cases using a pair of ldp/stp instead of the
      // third pair of ldpq/stpq fixes the performance issue.
1293 if (granularity < sizeof (jint)) {
1294 Label copy96;
1295 __ cmp(count, u1(80/granularity));
1296 __ br(Assembler::HI, copy96);
1297 bs.copy_load_at_16(t0, t1, Address(send, -16));
1298
1299 bs.copy_store_at_32(Address(d, 0), v0, v1);
1300 bs.copy_store_at_32(Address(d, 32), v2, v3);
1301
1302 bs.copy_store_at_16(Address(dend, -16), t0, t1);
1303 __ b(finish);
1304
1305 __ bind(copy96);
1306 }
1307 bs.copy_load_at_32(v4, v5, Address(send, -32));
1308
1309 bs.copy_store_at_32(Address(d, 0), v0, v1);
1310 bs.copy_store_at_32(Address(d, 32), v2, v3);
1311
1312 bs.copy_store_at_32(Address(dend, -32), v4, v5);
1313 } else {
1314 bs.copy_load_at_16(t0, t1, Address(s, 0));
1315 bs.copy_load_at_16(t2, t3, Address(s, 16));
1316 bs.copy_load_at_16(t4, t5, Address(s, 32));
1317 bs.copy_load_at_16(t6, t7, Address(s, 48));
1318 bs.copy_load_at_16(t8, t9, Address(send, -16));
1319
1320 bs.copy_store_at_16(Address(d, 0), t0, t1);
1321 bs.copy_store_at_16(Address(d, 16), t2, t3);
1322 bs.copy_store_at_16(Address(d, 32), t4, t5);
1323 bs.copy_store_at_16(Address(d, 48), t6, t7);
1324 bs.copy_store_at_16(Address(dend, -16), t8, t9);
1325 }
1326 __ b(finish);
1327
1328 // 0..16 bytes
1329 __ bind(copy16);
1330 __ cmp(count, u1(8/granularity));
1331 __ br(Assembler::LO, copy8);
1332
1333 // 8..16 bytes
1334 bs.copy_load_at_8(t0, Address(s, 0));
1335 bs.copy_load_at_8(t1, Address(send, -8));
1336 bs.copy_store_at_8(Address(d, 0), t0);
1337 bs.copy_store_at_8(Address(dend, -8), t1);
1338 __ b(finish);
1339
1340 if (granularity < 8) {
1341 // 4..7 bytes
1342 __ bind(copy8);
1343 __ tbz(count, 2 - exact_log2(granularity), copy4);
1344 __ ldrw(t0, Address(s, 0));
1345 __ ldrw(t1, Address(send, -4));
1346 __ strw(t0, Address(d, 0));
1347 __ strw(t1, Address(dend, -4));
1348 __ b(finish);
1349 if (granularity < 4) {
1350 // 0..3 bytes
1351 __ bind(copy4);
1352 __ cbz(count, finish); // get rid of 0 case
1353 if (granularity == 2) {
1354 __ ldrh(t0, Address(s, 0));
1355 __ strh(t0, Address(d, 0));
1356 } else { // granularity == 1
1357 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1358 // the first and last byte.
1359 // Handle the 3 byte case by loading and storing base + count/2
1360 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
1362 // byte 3 times.
1363 __ lsr(count, count, 1);
1364 __ ldrb(t0, Address(s, 0));
1365 __ ldrb(t1, Address(send, -1));
1366 __ ldrb(t2, Address(s, count));
1367 __ strb(t0, Address(d, 0));
1368 __ strb(t1, Address(dend, -1));
1369 __ strb(t2, Address(d, count));
1370 }
1371 __ b(finish);
1372 }
1373 }
1374
1375 __ bind(copy_big);
1376 if (is_backwards) {
1377 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1378 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1379 }
1380
1381 // Now we've got the small case out of the way we can align the
1382 // source address on a 2-word boundary.
1383
1384 // Here we will materialize a count in r15, which is used by copy_memory_small
1385 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1386 // Up until here, we have used t9, which aliases r15, but from here on, that register
    // cannot be used as a temp register, as it contains the count.
1388
1389 Label aligned;
1390
1391 if (is_aligned) {
1392 // We may have to adjust by 1 word to get s 2-word-aligned.
1393 __ tbz(s, exact_log2(wordSize), aligned);
1394 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1395 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1396 __ sub(count, count, wordSize/granularity);
1397 } else {
1398 if (is_backwards) {
1399 __ andr(r15, s, 2 * wordSize - 1);
1400 } else {
1401 __ neg(r15, s);
1402 __ andr(r15, r15, 2 * wordSize - 1);
1403 }
1404 // r15 is the byte adjustment needed to align s.
1405 __ cbz(r15, aligned);
1406 int shift = exact_log2(granularity);
1407 if (shift > 0) {
1408 __ lsr(r15, r15, shift);
1409 }
1410 __ sub(count, count, r15);
1411
1412 #if 0
1413 // ?? This code is only correct for a disjoint copy. It may or
1414 // may not make sense to use it in that case.
1415
1416 // Copy the first pair; s and d may not be aligned.
1417 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1418 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1419
1420 // Align s and d, adjust count
1421 if (is_backwards) {
1422 __ sub(s, s, r15);
1423 __ sub(d, d, r15);
1424 } else {
1425 __ add(s, s, r15);
1426 __ add(d, d, r15);
1427 }
1428 #else
1429 copy_memory_small(decorators, type, s, d, r15, step);
1430 #endif
1431 }
1432
1433 __ bind(aligned);
1434
1435 // s is now 2-word-aligned.
1436
1437 // We have a count of units and some trailing bytes. Adjust the
1438 // count and do a bulk copy of words. If the shift is zero
1439 // perform a move instead to benefit from zero latency moves.
1440 int shift = exact_log2(wordSize/granularity);
1441 if (shift > 0) {
1442 __ lsr(r15, count, shift);
1443 } else {
1444 __ mov(r15, count);
1445 }
1446 if (direction == copy_forwards) {
1447 if (type != T_OBJECT) {
1448 __ bl(StubRoutines::aarch64::copy_byte_f());
1449 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1450 __ bl(StubRoutines::aarch64::copy_oop_uninit_f());
1451 } else {
1452 __ bl(StubRoutines::aarch64::copy_oop_f());
1453 }
1454 } else {
1455 if (type != T_OBJECT) {
1456 __ bl(StubRoutines::aarch64::copy_byte_b());
1457 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1458 __ bl(StubRoutines::aarch64::copy_oop_uninit_b());
1459 } else {
1460 __ bl(StubRoutines::aarch64::copy_oop_b());
1461 }
1462 }
1463
1464 // And the tail.
1465 copy_memory_small(decorators, type, s, d, count, step);
1466
1467 if (granularity >= 8) __ bind(copy8);
1468 if (granularity >= 4) __ bind(copy4);
1469 __ bind(finish);
1470 }
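
  // Dispatch example for the size classes above (non-SIMD, granularity
  // == 1): a 40 byte copy is not > 80, not <= 16, not > 64 and not
  // <= 32, so it falls into the 33..64 byte case, which loads 32 bytes
  // from the front and 32 bytes from the back of the range (the two
  // spans may overlap, which is fine because every load completes
  // before any store is issued) and then stores them the same way.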
1471
1472
1473 void clobber_registers() {
1474 #ifdef ASSERT
1475 RegSet clobbered
1476 = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1477 __ mov(rscratch1, (uint64_t)0xdeadbeef);
1478 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1479 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1480 __ mov(*it, rscratch1);
1481 }
1482 #endif
1483
1484 }
1485
1486 // Scan over array at a for count oops, verifying each one.
1487 // Preserves a and count, clobbers rscratch1 and rscratch2.
1488 void verify_oop_array (int size, Register a, Register count, Register temp) {
1489 Label loop, end;
1490 __ mov(rscratch1, a);
1491 __ mov(rscratch2, zr);
1492 __ bind(loop);
1493 __ cmp(rscratch2, count);
1494 __ br(Assembler::HS, end);
1495 if (size == wordSize) {
1496 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1497 __ verify_oop(temp);
1498 } else {
1499 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1500 __ decode_heap_oop(temp); // calls verify_oop
1501 }
1502 __ add(rscratch2, rscratch2, 1);
1503 __ b(loop);
1504 __ bind(end);
1505 }
1506
1507 // Arguments:
1508 // stub_id - is used to name the stub and identify all details of
1509 // how to perform the copy.
1510 //
  //   nopush_entry - is assigned to the stub's post push entry point
  //       unless it is null
1513 //
1514 // Inputs:
1515 // c_rarg0 - source array address
1516 // c_rarg1 - destination array address
1517 // c_rarg2 - element count, treated as ssize_t, can be zero
1518 //
1519 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1520 // the hardware handle it. The two dwords within qwords that span
1521 // cache line boundaries will still be loaded and stored atomically.
1522 //
1523 // Side Effects: nopush_entry is set to the (post push) entry point
1524 // so it can be used by the corresponding conjoint
1525 // copy method
1526 //
1527 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
1528 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1529 RegSet saved_reg = RegSet::of(s, d, count);
1530 int size;
1531 bool aligned;
1532 bool is_oop;
1533 bool dest_uninitialized;
1534 switch (stub_id) {
1535 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1536 size = sizeof(jbyte);
1537 aligned = false;
1538 is_oop = false;
1539 dest_uninitialized = false;
1540 break;
1541 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1542 size = sizeof(jbyte);
1543 aligned = true;
1544 is_oop = false;
1545 dest_uninitialized = false;
1546 break;
1547 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1548 size = sizeof(jshort);
1549 aligned = false;
1550 is_oop = false;
1551 dest_uninitialized = false;
1552 break;
1553 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1554 size = sizeof(jshort);
1555 aligned = true;
1556 is_oop = false;
1557 dest_uninitialized = false;
1558 break;
1559 case StubId::stubgen_jint_disjoint_arraycopy_id:
1560 size = sizeof(jint);
1561 aligned = false;
1562 is_oop = false;
1563 dest_uninitialized = false;
1564 break;
1565 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1566 size = sizeof(jint);
1567 aligned = true;
1568 is_oop = false;
1569 dest_uninitialized = false;
1570 break;
1571 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1572 // since this is always aligned we can (should!) use the same
1573 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
1574 ShouldNotReachHere();
1575 break;
1576 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1577 size = sizeof(jlong);
1578 aligned = true;
1579 is_oop = false;
1580 dest_uninitialized = false;
1581 break;
1582 case StubId::stubgen_oop_disjoint_arraycopy_id:
1583 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1584 aligned = !UseCompressedOops;
1585 is_oop = true;
1586 dest_uninitialized = false;
1587 break;
1588 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1589 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1590 aligned = !UseCompressedOops;
1591 is_oop = true;
1592 dest_uninitialized = false;
1593 break;
1594 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1595 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1596 aligned = !UseCompressedOops;
1597 is_oop = true;
1598 dest_uninitialized = true;
1599 break;
1600 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1601 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1602 aligned = !UseCompressedOops;
1603 is_oop = true;
1604 dest_uninitialized = true;
1605 break;
1606 default:
1607 ShouldNotReachHere();
1608 break;
1609 }
1610
1611 __ align(CodeEntryAlignment);
1612 StubCodeMark mark(this, stub_id);
1613 address start = __ pc();
1614 __ enter();
1615
1616 if (nopush_entry != nullptr) {
1617 *nopush_entry = __ pc();
1618 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1619 BLOCK_COMMENT("Entry:");
1620 }
1621
1622 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1623 if (dest_uninitialized) {
1624 decorators |= IS_DEST_UNINITIALIZED;
1625 }
1626 if (aligned) {
1627 decorators |= ARRAYCOPY_ALIGNED;
1628 }
1629
1630 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1631 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1632
1633 if (is_oop) {
1634 // save regs before copy_memory
1635 __ push(RegSet::of(d, count), sp);
1636 }
1637 {
1638 // UnsafeMemoryAccess page error: continue after unsafe access
1639 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1640 UnsafeMemoryAccessMark umam(this, add_entry, true);
1641 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1642 }
1643
1644 if (is_oop) {
1645 __ pop(RegSet::of(d, count), sp);
1646 if (VerifyOops)
1647 verify_oop_array(size, d, count, r16);
1648 }
1649
1650 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1651
1652 __ leave();
1653 __ mov(r0, zr); // return 0
1654 __ ret(lr);
1655 return start;
1656 }
1657
1658 // Arguments:
1659 // stub_id - is used to name the stub and identify all details of
1660 // how to perform the copy.
1661 //
  //   nooverlap_target - identifies the (post push) entry for the
1663 // corresponding disjoint copy routine which can be
1664 // jumped to if the ranges do not actually overlap
1665 //
  //   nopush_entry - is assigned to the stub's post push entry point
  //       unless it is null
1668 //
1669 //
1670 // Inputs:
1671 // c_rarg0 - source array address
1672 // c_rarg1 - destination array address
1673 // c_rarg2 - element count, treated as ssize_t, can be zero
1674 //
1675 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1676 // the hardware handle it. The two dwords within qwords that span
1677 // cache line boundaries will still be loaded and stored atomically.
1678 //
1679 // Side Effects:
1680 // nopush_entry is set to the stub's post-push entry point so it
1681 // can be used as a fallback target by the unsafe/generic copy stubs
1682 //
1683 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1684 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1685 RegSet saved_regs = RegSet::of(s, d, count);
1686 int size;
1687 bool aligned;
1688 bool is_oop;
1689 bool dest_uninitialized;
1690 switch (stub_id) {
1691 case StubId::stubgen_jbyte_arraycopy_id:
1692 size = sizeof(jbyte);
1693 aligned = false;
1694 is_oop = false;
1695 dest_uninitialized = false;
1696 break;
1697 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1698 size = sizeof(jbyte);
1699 aligned = true;
1700 is_oop = false;
1701 dest_uninitialized = false;
1702 break;
1703 case StubId::stubgen_jshort_arraycopy_id:
1704 size = sizeof(jshort);
1705 aligned = false;
1706 is_oop = false;
1707 dest_uninitialized = false;
1708 break;
1709 case StubId::stubgen_arrayof_jshort_arraycopy_id:
1710 size = sizeof(jshort);
1711 aligned = true;
1712 is_oop = false;
1713 dest_uninitialized = false;
1714 break;
1715 case StubId::stubgen_jint_arraycopy_id:
1716 size = sizeof(jint);
1717 aligned = false;
1718 is_oop = false;
1719 dest_uninitialized = false;
1720 break;
1721 case StubId::stubgen_arrayof_jint_arraycopy_id:
1722 size = sizeof(jint);
1723 aligned = true;
1724 is_oop = false;
1725 dest_uninitialized = false;
1726 break;
1727 case StubId::stubgen_jlong_arraycopy_id:
1728 // since this is always aligned we can (should!) use the same
1729 // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
1730 ShouldNotReachHere();
1731 break;
1732 case StubId::stubgen_arrayof_jlong_arraycopy_id:
1733 size = sizeof(jlong);
1734 aligned = true;
1735 is_oop = false;
1736 dest_uninitialized = false;
1737 break;
1738 case StubId::stubgen_oop_arraycopy_id:
1739 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1740 aligned = !UseCompressedOops;
1741 is_oop = true;
1742 dest_uninitialized = false;
1743 break;
1744 case StubId::stubgen_arrayof_oop_arraycopy_id:
1745 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1746 aligned = !UseCompressedOops;
1747 is_oop = true;
1748 dest_uninitialized = false;
1749 break;
1750 case StubId::stubgen_oop_arraycopy_uninit_id:
1751 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1752 aligned = !UseCompressedOops;
1753 is_oop = true;
1754 dest_uninitialized = true;
1755 break;
1756 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
1757 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1758 aligned = !UseCompressedOops;
1759 is_oop = true;
1760 dest_uninitialized = true;
1761 break;
1762 default:
1763 ShouldNotReachHere();
1764 }
1765
1766 StubCodeMark mark(this, stub_id);
1767 address start = __ pc();
1768 __ enter();
1769
1770 if (nopush_entry != nullptr) {
1771 *nopush_entry = __ pc();
1772 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1773 BLOCK_COMMENT("Entry:");
1774 }
1775
1776 // use fwd copy when (d-s) above_equal (count*size)
1777 Label L_overlapping;
1778 __ sub(rscratch1, d, s);
1779 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1780 __ br(Assembler::LO, L_overlapping);
1781 __ b(RuntimeAddress(nooverlap_target));
1782 __ bind(L_overlapping);
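    // The unsigned compare above implements the disjointness test
    //   (d - s) >=u count * size
    // If the destination starts at or past the end of the source range
    // (or before the source, which wraps to a huge unsigned value), a
    // forward copy never reads a byte it has already written. For example,
    // with s = 0x1000, d = 0x1008 and count * size = 16, the difference is
    // 8 <u 16, so the LO branch is taken and this stub performs the
    // overlap-safe backward copy instead of jumping to the disjoint stub.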
1783
1784 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1785 if (dest_uninitialized) {
1786 decorators |= IS_DEST_UNINITIALIZED;
1787 }
1788 if (aligned) {
1789 decorators |= ARRAYCOPY_ALIGNED;
1790 }
1791
1792 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1793 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
1794
1795 if (is_oop) {
1796 // save regs before copy_memory
1797 __ push(RegSet::of(d, count), sp);
1798 }
1799 {
1800 // UnsafeMemoryAccess page error: continue after unsafe access
1801 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
1802 UnsafeMemoryAccessMark umam(this, add_entry, true);
1803 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
1804 }
1805 if (is_oop) {
1806 __ pop(RegSet::of(d, count), sp);
1807 if (VerifyOops)
1808 verify_oop_array(size, d, count, r16);
1809 }
1810 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1811 __ leave();
1812 __ mov(r0, zr); // return 0
1813 __ ret(lr);
1814 return start;
1815 }
1816
1817 // Helper for generating a dynamic type check.
1818 // Smashes rscratch1, rscratch2.
1819 void generate_type_check(Register sub_klass,
1820 Register super_check_offset,
1821 Register super_klass,
1822 Register temp1,
1823 Register temp2,
1824 Register result,
1825 Label& L_success) {
1826 assert_different_registers(sub_klass, super_check_offset, super_klass);
1827
1828 BLOCK_COMMENT("type_check:");
1829
1830 Label L_miss;
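    // The fast path compares sub_klass against the cached super type at
    // super_check_offset and may branch directly to L_success or L_miss;
    // if it cannot decide, the slow path searches the secondary supers.
    // A slow-path miss falls through to L_miss below.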
1831
1832 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
1833 super_check_offset);
1834 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
1835
1836 // Fall through on failure!
1837 __ BIND(L_miss);
1838 }
1839
1840 //
1841 // Generate checkcasting array copy stub
1842 //
1843 // Input:
1844 // c_rarg0 - source array address
1845 // c_rarg1 - destination array address
1846 // c_rarg2 - element count, treated as ssize_t, can be zero
1847 // c_rarg3 - size_t ckoff (super_check_offset)
1848 // c_rarg4 - oop ckval (super_klass)
1849 //
1850 // Output:
1851 // r0 == 0 - success
1852 // r0 == -1^K - failure, where K is partial transfer count
1853 //
1854 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
1855 bool dest_uninitialized;
1856 switch (stub_id) {
1857 case StubId::stubgen_checkcast_arraycopy_id:
1858 dest_uninitialized = false;
1859 break;
1860 case StubId::stubgen_checkcast_arraycopy_uninit_id:
1861 dest_uninitialized = true;
1862 break;
1863 default:
1864 ShouldNotReachHere();
1865 }
1866
1867 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1868
1869 // Input registers (after setup_arg_regs)
1870 const Register from = c_rarg0; // source array address
1871 const Register to = c_rarg1; // destination array address
1872 const Register count = c_rarg2; // elements count
1873 const Register ckoff = c_rarg3; // super_check_offset
1874 const Register ckval = c_rarg4; // super_klass
1875
1876 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1877 RegSet wb_post_saved_regs = RegSet::of(count);
1878
1879 // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
1880 const Register copied_oop = r22; // actual oop copied
1881 const Register count_save = r21; // orig elements count
1882 const Register start_to = r20; // destination array start address
1883 const Register r19_klass = r19; // oop._klass
1884
1885 // Registers used as gc temps (r5, r6, r7 are save-on-call)
1886 const Register gct1 = r5, gct2 = r6, gct3 = r7;
1887
1888 //---------------------------------------------------------------
1889 // Assembler stub will be used for this call to arraycopy
1890 // if the two arrays are subtypes of Object[] but the
1891 // destination array type is not equal to or a supertype
1892 // of the source type. Each element must be separately
1893 // checked.
1894
1895 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1896 copied_oop, r19_klass, count_save);
1897
1898 __ align(CodeEntryAlignment);
1899 StubCodeMark mark(this, stub_id);
1900 address start = __ pc();
1901
1902 __ enter(); // required for proper stackwalking of RuntimeStub frame
1903
1904 #ifdef ASSERT
1905 // caller guarantees that the arrays really are different
1906 // otherwise, we would have to make conjoint checks
1907 { Label L;
1908 __ b(L); // conjoint check not yet implemented
1909 __ stop("checkcast_copy within a single array");
1910 __ bind(L);
1911 }
1912 #endif //ASSERT
1913
1914 // Caller of this entry point must set up the argument registers.
1915 if (nopush_entry != nullptr) {
1916 *nopush_entry = __ pc();
1917 BLOCK_COMMENT("Entry:");
1918 }
1919
1920 // Empty array: Nothing to do.
1921 __ cbz(count, L_done);
1922 __ push(RegSet::of(r19, r20, r21, r22), sp);
1923
1924 #ifdef ASSERT
1925 BLOCK_COMMENT("assert consistent ckoff/ckval");
1926 // The ckoff and ckval must be mutually consistent,
1927 // even though caller generates both.
1928 { Label L;
1929 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1930 __ ldrw(start_to, Address(ckval, sco_offset));
1931 __ cmpw(ckoff, start_to);
1932 __ br(Assembler::EQ, L);
1933 __ stop("super_check_offset inconsistent");
1934 __ bind(L);
1935 }
1936 #endif //ASSERT
1937
1938 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
1939 bool is_oop = true;
1940 int element_size = UseCompressedOops ? 4 : 8;
1941 if (dest_uninitialized) {
1942 decorators |= IS_DEST_UNINITIALIZED;
1943 }
1944
1945 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1946 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
1947
1948 // save the original count
1949 __ mov(count_save, count);
1950
1951 // Copy from low to high addresses
1952 __ mov(start_to, to); // Save destination array start address
1953 __ b(L_load_element);
1954
1955 // ======== begin loop ========
1956 // (Loop is rotated; its entry is L_load_element.)
1957 // Loop control:
1958 // for (; count != 0; count--) {
1959 // copied_oop = load_heap_oop(from++);
1960 // ... generate_type_check ...;
1961 // store_heap_oop(to++, copied_oop);
1962 // }
1963 __ align(OptoLoopAlignment);
1964
1965 __ BIND(L_store_element);
1966 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
1967 __ post(to, element_size), copied_oop, noreg,
1968 gct1, gct2, gct3);
1969 __ sub(count, count, 1);
1970 __ cbz(count, L_do_card_marks);
1971
1972 // ======== loop entry is here ========
1973 __ BIND(L_load_element);
1974 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
1975 copied_oop, noreg, __ post(from, element_size),
1976 gct1);
1977 __ cbz(copied_oop, L_store_element);
1978
1979 __ load_klass(r19_klass, copied_oop);// query the object klass
1980
1981 BLOCK_COMMENT("type_check:");
1982 generate_type_check(/*sub_klass*/r19_klass,
1983 /*super_check_offset*/ckoff,
1984 /*super_klass*/ckval,
1985 /*r_array_base*/gct1,
1986 /*temp2*/gct2,
1987 /*result*/r10, L_store_element);
1988
1989 // Fall through on failure!
1990
1991 // ======== end loop ========
1992
1993 // It was a real error; we must depend on the caller to finish the job.
1994 // Register count = remaining oops, count_orig = total oops.
1995 // Emit GC store barriers for the oops we have copied and report
1996 // their number to the caller.
1997
1998 __ subs(count, count_save, count); // K = partially copied oop count
1999 __ eon(count, count, zr); // report (-1^K) to caller
2000 __ br(Assembler::EQ, L_done_pop);
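    // Return-value example: if 3 oops were copied before the type check
    // failed, the subs above leaves K = 3 and the eon turns it into
    // ~3 = -4, which is what ends up in r0 at L_done. The caller recovers
    // the partial transfer count as ~r0 (= -1 - r0).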
2001
2002 __ BIND(L_do_card_marks);
2003 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
2004
2005 __ bind(L_done_pop);
2006 __ pop(RegSet::of(r19, r20, r21, r22), sp);
2007 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2008
2009 __ bind(L_done);
2010 __ mov(r0, count);
2011 __ leave();
2012 __ ret(lr);
2013
2014 return start;
2015 }
2016
2017 // Perform range checks on the proposed arraycopy.
2018 // Kills temp, but nothing else.
2019 // Also, clean the sign bits of src_pos and dst_pos.
2020 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2021 Register src_pos, // source position (c_rarg1)
2022 Register dst, // destination array oop (c_rarg2)
2023 Register dst_pos, // destination position (c_rarg3)
2024 Register length,
2025 Register temp,
2026 Label& L_failed) {
2027 BLOCK_COMMENT("arraycopy_range_checks:");
2028
2029 assert_different_registers(rscratch1, temp);
2030
2031 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2032 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2033 __ addw(temp, length, src_pos);
2034 __ cmpw(temp, rscratch1);
2035 __ br(Assembler::HI, L_failed);
2036
2037 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2038 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2039 __ addw(temp, length, dst_pos);
2040 __ cmpw(temp, rscratch1);
2041 __ br(Assembler::HI, L_failed);
2042
2043 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2044 __ movw(src_pos, src_pos);
2045 __ movw(dst_pos, dst_pos);
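    // Writing the 32-bit (W) form of a register zero-extends into the full
    // 64-bit register, so any stale upper bits are discarded and the
    // positions can later be used as scaled 64-bit indexes.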
2046
2047 BLOCK_COMMENT("arraycopy_range_checks done");
2048 }
2049
2050 // These stubs get called from some dumb test routine.
2051 // I'll write them properly when they're called from
2052 // something that's actually doing something.
2053 static void fake_arraycopy_stub(address src, address dst, int count) {
2054 assert(count == 0, "huh?");
2055 }
2056
2057
2058 //
2059 // Generate 'unsafe' array copy stub
2060 // Though just as safe as the other stubs, it takes an unscaled
2061 // size_t argument instead of an element count.
2062 //
2063 // Input:
2064 // c_rarg0 - source array address
2065 // c_rarg1 - destination array address
2066 // c_rarg2 - byte count, treated as ssize_t, can be zero
2067 //
2068 // Examines the alignment of the operands and dispatches
2069 // to a long, int, short, or byte copy loop.
2070 //
2071 address generate_unsafe_copy(address byte_copy_entry,
2072 address short_copy_entry,
2073 address int_copy_entry,
2074 address long_copy_entry) {
2075 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2076
2077 Label L_long_aligned, L_int_aligned, L_short_aligned;
2078 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2079
2080 __ align(CodeEntryAlignment);
2081 StubCodeMark mark(this, stub_id);
2082 address start = __ pc();
2083 __ enter(); // required for proper stackwalking of RuntimeStub frame
2084
2085 // bump this on entry, not on exit:
2086 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2087
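    // OR together the source address, destination address and byte count:
    // a low-order bit is clear in the result only if it is clear in all
    // three, so testing the low bits of rscratch1 selects the widest
    // element size (long, int, short or byte) they are all aligned to.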
2088 __ orr(rscratch1, s, d);
2089 __ orr(rscratch1, rscratch1, count);
2090
2091 __ andr(rscratch1, rscratch1, BytesPerLong-1);
2092 __ cbz(rscratch1, L_long_aligned);
2093 __ andr(rscratch1, rscratch1, BytesPerInt-1);
2094 __ cbz(rscratch1, L_int_aligned);
2095 __ tbz(rscratch1, 0, L_short_aligned);
2096 __ b(RuntimeAddress(byte_copy_entry));
2097
2098 __ BIND(L_short_aligned);
2099 __ lsr(count, count, LogBytesPerShort); // size => short_count
2100 __ b(RuntimeAddress(short_copy_entry));
2101 __ BIND(L_int_aligned);
2102 __ lsr(count, count, LogBytesPerInt); // size => int_count
2103 __ b(RuntimeAddress(int_copy_entry));
2104 __ BIND(L_long_aligned);
2105 __ lsr(count, count, LogBytesPerLong); // size => long_count
2106 __ b(RuntimeAddress(long_copy_entry));
2107
2108 return start;
2109 }
2110
2111 //
2112 // Generate generic array copy stubs
2113 //
2114 // Input:
2115 // c_rarg0 - src oop
2116 // c_rarg1 - src_pos (32-bits)
2117 // c_rarg2 - dst oop
2118 // c_rarg3 - dst_pos (32-bits)
2119 // c_rarg4 - element count (32-bits)
2120 //
2121 // Output:
2122 // r0 == 0 - success
2123 // r0 == -1^K - failure, where K is partial transfer count
2124 //
2125 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2126 address int_copy_entry, address oop_copy_entry,
2127 address long_copy_entry, address checkcast_copy_entry) {
2128 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2129
2130 Label L_failed, L_objArray;
2131 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2132
2133 // Input registers
2134 const Register src = c_rarg0; // source array oop
2135 const Register src_pos = c_rarg1; // source position
2136 const Register dst = c_rarg2; // destination array oop
2137 const Register dst_pos = c_rarg3; // destination position
2138 const Register length = c_rarg4;
2139
2140
2141 // Registers used as temps
2142 const Register dst_klass = c_rarg5;
2143
2144 __ align(CodeEntryAlignment);
2145
2146 StubCodeMark mark(this, stub_id);
2147
2148 address start = __ pc();
2149
2150 __ enter(); // required for proper stackwalking of RuntimeStub frame
2151
2152 // bump this on entry, not on exit:
2153 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2154
2155 //-----------------------------------------------------------------------
2156 // Assembler stub will be used for this call to arraycopy
2157 // if the following conditions are met:
2158 //
2159 // (1) src and dst must not be null.
2160 // (2) src_pos must not be negative.
2161 // (3) dst_pos must not be negative.
2162 // (4) length must not be negative.
2163 // (5) src klass and dst klass should be the same and not null.
2164 // (6) src and dst should be arrays.
2165 // (7) src_pos + length must not exceed length of src.
2166 // (8) dst_pos + length must not exceed length of dst.
2167 //
2168
2169 // if (src == nullptr) return -1;
2170 __ cbz(src, L_failed);
2171
2172 // if (src_pos < 0) return -1;
2173 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2174
2175 // if (dst == nullptr) return -1;
2176 __ cbz(dst, L_failed);
2177
2178 // if (dst_pos < 0) return -1;
2179 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2180
2181 // registers used as temp
2182 const Register scratch_length = r16; // elements count to copy
2183 const Register scratch_src_klass = r17; // array klass
2184 const Register lh = r15; // layout helper
2185
2186 // if (length < 0) return -1;
2187 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2188 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2189
2190 __ load_klass(scratch_src_klass, src);
2191 #ifdef ASSERT
2192 // assert(src->klass() != nullptr);
2193 {
2194 BLOCK_COMMENT("assert klasses not null {");
2195 Label L1, L2;
2196 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2197 __ bind(L1);
2198 __ stop("broken null klass");
2199 __ bind(L2);
2200 __ load_klass(rscratch1, dst);
2201 __ cbz(rscratch1, L1); // this would be broken also
2202 BLOCK_COMMENT("} assert klasses not null done");
2203 }
2204 #endif
2205
2206 // Load layout helper (32-bits)
2207 //
2208 //  |array_tag|     | header_size | element_type |     |log2_element_size|
2209 // 32        30    24            16              8     2                 0
2210 //
2211 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2212 //
2213
2214 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2215
2216 // Handle objArrays completely differently...
2217 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2218 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2219 __ movw(rscratch1, objArray_lh);
2220 __ eorw(rscratch2, lh, rscratch1);
2221 __ cbzw(rscratch2, L_objArray);
2222
2223 // if (src->klass() != dst->klass()) return -1;
2224 __ load_klass(rscratch2, dst);
2225 __ eor(rscratch2, rscratch2, scratch_src_klass);
2226 __ cbnz(rscratch2, L_failed);
2227
2228 // if (!src->is_Array()) return -1;
2229 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2230
2231 // At this point, it is known to be a typeArray (array_tag 0x3).
2232 #ifdef ASSERT
2233 {
2234 BLOCK_COMMENT("assert primitive array {");
2235 Label L;
2236 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2237 __ cmpw(lh, rscratch2);
2238 __ br(Assembler::GE, L);
2239 __ stop("must be a primitive array");
2240 __ bind(L);
2241 BLOCK_COMMENT("} assert primitive array done");
2242 }
2243 #endif
2244
2245 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2246 rscratch2, L_failed);
2247
2248 // TypeArrayKlass
2249 //
2250 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2251 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2252 //
2253
2254 const Register rscratch1_offset = rscratch1; // array offset
2255 const Register r15_elsize = lh; // element size
2256
2257 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2258 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2259 __ add(src, src, rscratch1_offset); // src array offset
2260 __ add(dst, dst, rscratch1_offset); // dst array offset
2261 BLOCK_COMMENT("choose copy loop based on element size");
2262
2263 // next registers should be set before the jump to corresponding stub
2264 const Register from = c_rarg0; // source array address
2265 const Register to = c_rarg1; // destination array address
2266 const Register count = c_rarg2; // elements count
2267
2268 // 'from', 'to', 'count' registers should be set in such order
2269 // since they are the same as 'src', 'src_pos', 'dst'.
2270
2271 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2272
2273 // The possible values of elsize are 0-3, i.e. exact_log2(element
2274 // size in bytes). We do a simple bitwise binary search.
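    // Bits 0 and 1 of lh are the low bits of log2_element_size (the shift
    // is asserted to be zero above): bit 1 splits {byte, short} from
    // {int, long} and bit 0 then picks within each pair, so at most two
    // tbnz tests route to the right copy loop.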
2275 __ BIND(L_copy_bytes);
2276 __ tbnz(r15_elsize, 1, L_copy_ints);
2277 __ tbnz(r15_elsize, 0, L_copy_shorts);
2278 __ lea(from, Address(src, src_pos));// src_addr
2279 __ lea(to, Address(dst, dst_pos));// dst_addr
2280 __ movw(count, scratch_length); // length
2281 __ b(RuntimeAddress(byte_copy_entry));
2282
2283 __ BIND(L_copy_shorts);
2284 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2285 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2286 __ movw(count, scratch_length); // length
2287 __ b(RuntimeAddress(short_copy_entry));
2288
2289 __ BIND(L_copy_ints);
2290 __ tbnz(r15_elsize, 0, L_copy_longs);
2291 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2292 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2293 __ movw(count, scratch_length); // length
2294 __ b(RuntimeAddress(int_copy_entry));
2295
2296 __ BIND(L_copy_longs);
2297 #ifdef ASSERT
2298 {
2299 BLOCK_COMMENT("assert long copy {");
2300 Label L;
2301 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2302 __ cmpw(r15_elsize, LogBytesPerLong);
2303 __ br(Assembler::EQ, L);
2304 __ stop("must be long copy, but elsize is wrong");
2305 __ bind(L);
2306 BLOCK_COMMENT("} assert long copy done");
2307 }
2308 #endif
2309 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2310 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2311 __ movw(count, scratch_length); // length
2312 __ b(RuntimeAddress(long_copy_entry));
2313
2314 // ObjArrayKlass
2315 __ BIND(L_objArray);
2316 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2317
2318 Label L_plain_copy, L_checkcast_copy;
2319 // test array classes for subtyping
2320 __ load_klass(r15, dst);
2321 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2322 __ br(Assembler::NE, L_checkcast_copy);
2323
2324 // Identically typed arrays can be copied without element-wise checks.
2325 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2326 rscratch2, L_failed);
2327
2328 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2329 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2330 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2331 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2332 __ movw(count, scratch_length); // length
2333 __ BIND(L_plain_copy);
2334 __ b(RuntimeAddress(oop_copy_entry));
2335
2336 __ BIND(L_checkcast_copy);
2337 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2338 {
2339 // Before looking at dst.length, make sure dst is also an objArray.
2340 __ ldrw(rscratch1, Address(r15, lh_offset));
2341 __ movw(rscratch2, objArray_lh);
2342 __ eorw(rscratch1, rscratch1, rscratch2);
2343 __ cbnzw(rscratch1, L_failed);
2344
2345 // It is safe to examine both src.length and dst.length.
2346 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2347 r15, L_failed);
2348
2349 __ load_klass(dst_klass, dst); // reload
2350
2351 // Marshal the base address arguments now, freeing registers.
2352 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2353 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2354 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2355 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2356 __ movw(count, length); // length (reloaded)
2357 Register sco_temp = c_rarg3; // this register is free now
2358 assert_different_registers(from, to, count, sco_temp,
2359 dst_klass, scratch_src_klass);
2360 // assert_clean_int(count, sco_temp);
2361
2362 // Generate the type check.
2363 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2364 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2365
2366 // Smashes rscratch1, rscratch2
2367 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2368 L_plain_copy);
2369
2370 // Fetch destination element klass from the ObjArrayKlass header.
2371 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2372 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2373 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2374
2375 // the checkcast_copy loop needs two extra arguments:
2376 assert(c_rarg3 == sco_temp, "#3 already in place");
2377 // Set up arguments for checkcast_copy_entry.
2378 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2379 __ b(RuntimeAddress(checkcast_copy_entry));
2380 }
2381
2382 __ BIND(L_failed);
2383 __ mov(r0, -1);
2384 __ leave(); // required for proper stackwalking of RuntimeStub frame
2385 __ ret(lr);
2386
2387 return start;
2388 }
2389
2390 //
2391 // Generate stub for array fill. If "aligned" is true, the
2392 // "to" address is assumed to be heapword aligned.
2393 //
2394 // Arguments for generated stub:
2395 // to: c_rarg0
2396 // value: c_rarg1
2397 // count: c_rarg2 treated as signed
2398 //
2399 address generate_fill(StubId stub_id) {
2400 BasicType t;
2401 bool aligned;
2402
2403 switch (stub_id) {
2404 case StubId::stubgen_jbyte_fill_id:
2405 t = T_BYTE;
2406 aligned = false;
2407 break;
2408 case StubId::stubgen_jshort_fill_id:
2409 t = T_SHORT;
2410 aligned = false;
2411 break;
2412 case StubId::stubgen_jint_fill_id:
2413 t = T_INT;
2414 aligned = false;
2415 break;
2416 case StubId::stubgen_arrayof_jbyte_fill_id:
2417 t = T_BYTE;
2418 aligned = true;
2419 break;
2420 case StubId::stubgen_arrayof_jshort_fill_id:
2421 t = T_SHORT;
2422 aligned = true;
2423 break;
2424 case StubId::stubgen_arrayof_jint_fill_id:
2425 t = T_INT;
2426 aligned = true;
2427 break;
2428 default:
2429 ShouldNotReachHere();
2430 };
2431
2432 __ align(CodeEntryAlignment);
2433 StubCodeMark mark(this, stub_id);
2434 address start = __ pc();
2435
2436 BLOCK_COMMENT("Entry:");
2437
2438 const Register to = c_rarg0; // destination array address
2439 const Register value = c_rarg1; // value
2440 const Register count = c_rarg2; // elements count
2441
2442 const Register bz_base = r10; // base for block_zero routine
2443 const Register cnt_words = r11; // temp register
2444
2445 __ enter();
2446
2447 Label L_fill_elements, L_exit1;
2448
2449 int shift = -1;
2450 switch (t) {
2451 case T_BYTE:
2452 shift = 0;
2453 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2454 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2455 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2456 __ br(Assembler::LO, L_fill_elements);
2457 break;
2458 case T_SHORT:
2459 shift = 1;
2460 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2461 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2462 __ br(Assembler::LO, L_fill_elements);
2463 break;
2464 case T_INT:
2465 shift = 2;
2466 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2467 __ br(Assembler::LO, L_fill_elements);
2468 break;
2469 default: ShouldNotReachHere();
2470 }
2471
2472 // Align source address at 8 bytes address boundary.
2473 Label L_skip_align1, L_skip_align2, L_skip_align4;
2474 if (!aligned) {
2475 switch (t) {
2476 case T_BYTE:
2477 // One byte misalignment happens only for byte arrays.
2478 __ tbz(to, 0, L_skip_align1);
2479 __ strb(value, Address(__ post(to, 1)));
2480 __ subw(count, count, 1);
2481 __ bind(L_skip_align1);
2482 // Fallthrough
2483 case T_SHORT:
2484 // Two bytes misalignment happens only for byte and short (char) arrays.
2485 __ tbz(to, 1, L_skip_align2);
2486 __ strh(value, Address(__ post(to, 2)));
2487 __ subw(count, count, 2 >> shift);
2488 __ bind(L_skip_align2);
2489 // Fallthrough
2490 case T_INT:
2491 // Align to 8 bytes, we know we are 4 byte aligned to start.
2492 __ tbz(to, 2, L_skip_align4);
2493 __ strw(value, Address(__ post(to, 4)));
2494 __ subw(count, count, 4 >> shift);
2495 __ bind(L_skip_align4);
2496 break;
2497 default: ShouldNotReachHere();
2498 }
2499 }
2500
2501 //
2502 // Fill large chunks
2503 //
2504 __ lsrw(cnt_words, count, 3 - shift); // number of words
2505 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2506 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
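    // value now holds the fill pattern replicated across 64 bits; e.g. a
    // byte fill of 0xAB has been widened by the successive bfi instructions
    // to 0xABABABABABABABAB, so each doubleword store below writes 8, 4 or
    // 2 elements at once.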
2507 if (UseBlockZeroing) {
2508 Label non_block_zeroing, rest;
2509 // If the fill value is zero we can use the fast zero_words().
2510 __ cbnz(value, non_block_zeroing);
2511 __ mov(bz_base, to);
2512 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2513 address tpc = __ zero_words(bz_base, cnt_words);
2514 if (tpc == nullptr) {
2515 fatal("CodeCache is full at generate_fill");
2516 }
2517 __ b(rest);
2518 __ bind(non_block_zeroing);
2519 __ fill_words(to, cnt_words, value);
2520 __ bind(rest);
2521 } else {
2522 __ fill_words(to, cnt_words, value);
2523 }
2524
2525 // Remaining count is less than 8 bytes. Fill it by a single store.
2526 // Note that the total length is no less than 8 bytes.
2527 if (t == T_BYTE || t == T_SHORT) {
2528 Label L_exit1;
2529 __ cbzw(count, L_exit1);
2530 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2531 __ str(value, Address(to, -8)); // overwrite some elements
2532 __ bind(L_exit1);
2533 __ leave();
2534 __ ret(lr);
2535 }
2536
2537 // Handle copies less than 8 bytes.
2538 Label L_fill_2, L_fill_4, L_exit2;
2539 __ bind(L_fill_elements);
2540 switch (t) {
2541 case T_BYTE:
2542 __ tbz(count, 0, L_fill_2);
2543 __ strb(value, Address(__ post(to, 1)));
2544 __ bind(L_fill_2);
2545 __ tbz(count, 1, L_fill_4);
2546 __ strh(value, Address(__ post(to, 2)));
2547 __ bind(L_fill_4);
2548 __ tbz(count, 2, L_exit2);
2549 __ strw(value, Address(to));
2550 break;
2551 case T_SHORT:
2552 __ tbz(count, 0, L_fill_4);
2553 __ strh(value, Address(__ post(to, 2)));
2554 __ bind(L_fill_4);
2555 __ tbz(count, 1, L_exit2);
2556 __ strw(value, Address(to));
2557 break;
2558 case T_INT:
2559 __ cbzw(count, L_exit2);
2560 __ strw(value, Address(to));
2561 break;
2562 default: ShouldNotReachHere();
2563 }
2564 __ bind(L_exit2);
2565 __ leave();
2566 __ ret(lr);
2567 return start;
2568 }
2569
2570 address generate_unsafecopy_common_error_exit() {
2571 address start_pc = __ pc();
2572 __ leave();
2573 __ mov(r0, 0);
2574 __ ret(lr);
2575 return start_pc;
2576 }
2577
2578 //
2579 // Generate 'unsafe' set memory stub
2580 // Though just as safe as the other stubs, it takes an unscaled
2581 // size_t (# bytes) argument instead of an element count.
2582 //
2583 // This fill operation is atomicity preserving: as long as the
2584 // address supplied is sufficiently aligned, all writes of up to 64
2585 // bits in size are single-copy atomic.
2586 //
2587 // Input:
2588 // c_rarg0 - destination array address
2589 // c_rarg1 - byte count (size_t)
2590 // c_rarg2 - byte value
2591 //
2592 address generate_unsafe_setmemory() {
2593 __ align(CodeEntryAlignment);
2594 StubCodeMark mark(this, StubId::stubgen_unsafe_setmemory_id);
2595 address start = __ pc();
2596
2597 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
2598 Label tail;
2599
2600 UnsafeMemoryAccessMark umam(this, true, false);
2601
2602 __ enter(); // required for proper stackwalking of RuntimeStub frame
2603
2604 __ dup(v0, __ T16B, value);
2605
2606 if (AvoidUnalignedAccesses) {
2607 __ cmp(count, (u1)16);
2608 __ br(__ LO, tail);
2609
2610 __ mov(rscratch1, 16);
2611 __ andr(rscratch2, dest, 15);
2612 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
2613 __ strq(v0, Address(dest));
2614 __ sub(count, count, rscratch1);
2615 __ add(dest, dest, rscratch1);
2616 }
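    // The strq above filled 16 bytes starting at the original, possibly
    // unaligned, dest; dest was then advanced only as far as the next
    // 16-byte boundary, so any bytes revisited by the aligned stores that
    // follow are simply rewritten with the same value.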
2617
2618 __ subs(count, count, (u1)64);
2619 __ br(__ LO, tail);
2620 {
2621 Label again;
2622 __ bind(again);
2623 __ stpq(v0, v0, Address(dest));
2624 __ stpq(v0, v0, Address(dest, 32));
2625
2626 __ subs(count, count, 64);
2627 __ add(dest, dest, 64);
2628 __ br(__ HS, again);
2629 }
2630
2631 __ bind(tail);
2632 // The count of bytes is off by 64, but we don't need to correct
2633 // it because we're only going to use the least-significant few
2634 // count bits from here on.
2635 // __ add(count, count, 64);
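    // For example, if 45 bytes were left when the 64-byte loop exited,
    // count holds 45 - 64 = -19 here; its low six bits are still
    // 0b101101 (= 45), so the tbz/tst tests on bits 5..0 below see the
    // correct residue.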
2636
2637 {
2638 Label dont;
2639 __ tbz(count, exact_log2(32), dont);
2640 __ stpq(v0, v0, __ post(dest, 32));
2641 __ bind(dont);
2642 }
2643 {
2644 Label dont;
2645 __ tbz(count, exact_log2(16), dont);
2646 __ strq(v0, __ post(dest, 16));
2647 __ bind(dont);
2648 }
2649 {
2650 Label dont;
2651 __ tbz(count, exact_log2(8), dont);
2652 __ strd(v0, __ post(dest, 8));
2653 __ bind(dont);
2654 }
2655
2656 Label finished;
2657 __ tst(count, 7);
2658 __ br(__ EQ, finished);
2659
2660 {
2661 Label dont;
2662 __ tbz(count, exact_log2(4), dont);
2663 __ strs(v0, __ post(dest, 4));
2664 __ bind(dont);
2665 }
2666 {
2667 Label dont;
2668 __ tbz(count, exact_log2(2), dont);
2669 __ bfi(value, value, 8, 8);
2670 __ strh(value, __ post(dest, 2));
2671 __ bind(dont);
2672 }
2673 {
2674 Label dont;
2675 __ tbz(count, exact_log2(1), dont);
2676 __ strb(value, Address(dest));
2677 __ bind(dont);
2678 }
2679
2680 __ bind(finished);
2681 __ leave();
2682 __ ret(lr);
2683
2684 return start;
2685 }
2686
2687 address generate_data_cache_writeback() {
2688 const Register line = c_rarg0; // address of line to write back
2689
2690 __ align(CodeEntryAlignment);
2691
2692 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
2693 StubCodeMark mark(this, stub_id);
2694
2695 address start = __ pc();
2696 __ enter();
2697 __ cache_wb(Address(line, 0));
2698 __ leave();
2699 __ ret(lr);
2700
2701 return start;
2702 }
2703
2704 address generate_data_cache_writeback_sync() {
2705 const Register is_pre = c_rarg0; // pre or post sync
2706
2707 __ align(CodeEntryAlignment);
2708
2709 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
2710 StubCodeMark mark(this, stub_id);
2711
2712 // pre wbsync is a no-op
2713 // post wbsync translates to an sfence
2714
2715 Label skip;
2716 address start = __ pc();
2717 __ enter();
2718 __ cbnz(is_pre, skip);
2719 __ cache_wbsync(false);
2720 __ bind(skip);
2721 __ leave();
2722 __ ret(lr);
2723
2724 return start;
2725 }
2726
2727 void generate_arraycopy_stubs() {
2728 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
2729 // entry immediately following their stack push. This can be used
2730 // as a post-push branch target for compatible stubs when they
2731 // identify a special case that can be handled by the fallback
2732 // stub, e.g. a disjoint copy stub may be used as a special case
2733 // fallback for its compatible conjoint copy stub.
2734 //
2735 // A nopush entry is always returned in the following local and
2736 // then published by assigning to the appropriate entry field in
2737 // class StubRoutines. The entry value is then passed to the
2738 // generator for the compatible stub. That means the entry must be
2739 // listed when saving to/restoring from the AOT cache, ensuring
2740 // that the inter-stub jumps are noted at AOT-cache save and
2741 // relocated at AOT cache load.
2742 address nopush_entry;
2743
2744 // generate the common exit first so later stubs can rely on it if
2745 // they want an UnsafeMemoryAccess exit non-local to the stub
2746 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
2747 // register the stub as the default exit with class UnsafeMemoryAccess
2748 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
2749
2750 // generate and publish aarch64-specific bulk copy routines first
2751 // so we can call them from other copy stubs
2752 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2753 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2754
2755 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2756 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
2757
2758 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2759 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
2760
2761 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2762
2763 //*** jbyte
2764 // Always need aligned and unaligned versions
2765 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
2766 // disjoint nopush entry is needed by conjoint copy
2767 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2768 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
2769 // conjoint nopush entry is needed by generic/unsafe copy
2770 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
2771 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
2772 // disjoint arrayof nopush entry is needed by conjoint copy
2773 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
2774 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
2775
2776 //*** jshort
2777 // Always need aligned and unaligned versions
2778 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
2779 // disjoint nopush entry is needed by conjoint copy
2780 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
2781 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
2782 // conjoint nopush entry is used by generic/unsafe copy
2783 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
2784 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
2785 // disjoint arrayof nopush entry is needed by conjoint copy
2786 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
2787 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
2788
2789 //*** jint
2790 // Aligned versions
2791 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
2792 // disjoint arrayof nopush entry is needed by conjoint copy
2793 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
2794 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
2795 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2796 // jint_arraycopy_nopush always points to the unaligned version
2797 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
2798 // disjoint nopush entry is needed by conjoint copy
2799 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
2800 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
2801 // conjoint nopush entry is needed by generic/unsafe copy
2802 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
2803
2804 //*** jlong
2805 // It is always aligned
2806 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
2807 // disjoint arrayof nopush entry is needed by conjoint copy
2808 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
2809 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
2810 // conjoint nopush entry is needed by generic/unsafe copy
2811 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
2812 // disjoint normal/nopush and conjoint normal entries are not
2813 // generated since the arrayof versions are the same
2814 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2815 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
2816 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
2817
2818 //*** oops
2819 {
2820 StubRoutines::_arrayof_oop_disjoint_arraycopy
2821 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
2822 // disjoint arrayof nopush entry is needed by conjoint copy
2823 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
2824 StubRoutines::_arrayof_oop_arraycopy
2825 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
2826 // conjoint arrayof nopush entry is needed by generic/unsafe copy
2827 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
2828 // Aligned versions without pre-barriers
2829 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2830 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
2831 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
2832 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
2833 // note that we don't need a returned nopush entry because the
2834 // generic/unsafe copy does not cater for uninit arrays.
2835 StubRoutines::_arrayof_oop_arraycopy_uninit
2836 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
2837 }
2838
2839 // for oop copies reuse arrayof entries for non-arrayof cases
2840 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2841 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
2842 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
2843 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2844 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
2845 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
2846
2847 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
2848 // checkcast nopush entry is needed by generic copy
2849 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
2850 // note that we don't need a returned nopush entry because the
2851 // generic copy does not cater for uninit arrays.
2852 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
2853
2854 // unsafe arraycopy may fallback on conjoint stubs
2855 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
2856 StubRoutines::_jshort_arraycopy_nopush,
2857 StubRoutines::_jint_arraycopy_nopush,
2858 StubRoutines::_jlong_arraycopy_nopush);
2859
2860 // generic arraycopy may fallback on conjoint stubs
2861 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
2862 StubRoutines::_jshort_arraycopy_nopush,
2863 StubRoutines::_jint_arraycopy_nopush,
2864 StubRoutines::_oop_arraycopy_nopush,
2865 StubRoutines::_jlong_arraycopy_nopush,
2866 StubRoutines::_checkcast_arraycopy_nopush);
2867
2868 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
2869 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
2870 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
2871 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
2872 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
2873 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
2874 }
2875
2876 void generate_math_stubs() { Unimplemented(); }
2877
2878 // Arguments:
2879 //
2880 // Inputs:
2881 // c_rarg0 - source byte array address
2882 // c_rarg1 - destination byte array address
2883 // c_rarg2 - K (key) in little endian int array
2884 //
2885 address generate_aescrypt_encryptBlock() {
2886 __ align(CodeEntryAlignment);
2887 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
2888 StubCodeMark mark(this, stub_id);
2889
2890 const Register from = c_rarg0; // source array address
2891 const Register to = c_rarg1; // destination array address
2892 const Register key = c_rarg2; // key array address
2893 const Register keylen = rscratch1;
2894
2895 address start = __ pc();
2896 __ enter();
2897
2898 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2899
2900 __ aesenc_loadkeys(key, keylen);
2901 __ aesecb_encrypt(from, to, keylen);
2902
2903 __ mov(r0, 0);
2904
2905 __ leave();
2906 __ ret(lr);
2907
2908 return start;
2909 }
2910
2911 // Arguments:
2912 //
2913 // Inputs:
2914 // c_rarg0 - source byte array address
2915 // c_rarg1 - destination byte array address
2916 // c_rarg2 - K (key) in little endian int array
2917 //
2918 address generate_aescrypt_decryptBlock() {
2919 assert(UseAES, "need AES cryptographic extension support");
2920 __ align(CodeEntryAlignment);
2921 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
2922 StubCodeMark mark(this, stub_id);
2923 Label L_doLast;
2924
2925 const Register from = c_rarg0; // source array address
2926 const Register to = c_rarg1; // destination array address
2927 const Register key = c_rarg2; // key array address
2928 const Register keylen = rscratch1;
2929
2930 address start = __ pc();
2931 __ enter(); // required for proper stackwalking of RuntimeStub frame
2932
2933 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2934
2935 __ aesecb_decrypt(from, to, key, keylen);
2936
2937 __ mov(r0, 0);
2938
2939 __ leave();
2940 __ ret(lr);
2941
2942 return start;
2943 }
2944
2945 // Arguments:
2946 //
2947 // Inputs:
2948 // c_rarg0 - source byte array address
2949 // c_rarg1 - destination byte array address
2950 // c_rarg2 - K (key) in little endian int array
2951 // c_rarg3 - r vector byte array address
2952 // c_rarg4 - input length
2953 //
2954 // Output:
2955 // x0 - input length
2956 //
2957 address generate_cipherBlockChaining_encryptAESCrypt() {
2958 assert(UseAES, "need AES cryptographic extension support");
2959 __ align(CodeEntryAlignment);
2960 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
2961 StubCodeMark mark(this, stub_id);
2962
2963 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2964
2965 const Register from = c_rarg0; // source array address
2966 const Register to = c_rarg1; // destination array address
2967 const Register key = c_rarg2; // key array address
2968 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
2969 // and left with the results of the last encryption block
2970 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
2971 const Register keylen = rscratch1;
2972
2973 address start = __ pc();
2974
2975 __ enter();
2976
2977 __ movw(rscratch2, len_reg);
2978
2979 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2980
2981 __ ld1(v0, __ T16B, rvec);
2982
2983 __ cmpw(keylen, 52);
2984 __ br(Assembler::CC, L_loadkeys_44);
2985 __ br(Assembler::EQ, L_loadkeys_52);
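    // keylen is the expanded key length in 32-bit words: 44, 52 or 60 for
    // 128-, 192- and 256-bit AES keys. A 44-word key skips both extra
    // round-key pairs (v17/v18 and v19/v20); a 52-word key skips only
    // v17/v18. The flags set by the cmpw above stay live all the way into
    // L_aes_loop (nothing in the loop sets flags), where the same CC/EQ
    // tests skip the corresponding encryption rounds.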
2986
2987 __ ld1(v17, v18, __ T16B, __ post(key, 32));
2988 __ rev32(v17, __ T16B, v17);
2989 __ rev32(v18, __ T16B, v18);
2990 __ BIND(L_loadkeys_52);
2991 __ ld1(v19, v20, __ T16B, __ post(key, 32));
2992 __ rev32(v19, __ T16B, v19);
2993 __ rev32(v20, __ T16B, v20);
2994 __ BIND(L_loadkeys_44);
2995 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2996 __ rev32(v21, __ T16B, v21);
2997 __ rev32(v22, __ T16B, v22);
2998 __ rev32(v23, __ T16B, v23);
2999 __ rev32(v24, __ T16B, v24);
3000 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3001 __ rev32(v25, __ T16B, v25);
3002 __ rev32(v26, __ T16B, v26);
3003 __ rev32(v27, __ T16B, v27);
3004 __ rev32(v28, __ T16B, v28);
3005 __ ld1(v29, v30, v31, __ T16B, key);
3006 __ rev32(v29, __ T16B, v29);
3007 __ rev32(v30, __ T16B, v30);
3008 __ rev32(v31, __ T16B, v31);
3009
3010 __ BIND(L_aes_loop);
3011 __ ld1(v1, __ T16B, __ post(from, 16));
3012 __ eor(v0, __ T16B, v0, v1);
3013
3014 __ br(Assembler::CC, L_rounds_44);
3015 __ br(Assembler::EQ, L_rounds_52);
3016
3017 __ aese(v0, v17); __ aesmc(v0, v0);
3018 __ aese(v0, v18); __ aesmc(v0, v0);
3019 __ BIND(L_rounds_52);
3020 __ aese(v0, v19); __ aesmc(v0, v0);
3021 __ aese(v0, v20); __ aesmc(v0, v0);
3022 __ BIND(L_rounds_44);
3023 __ aese(v0, v21); __ aesmc(v0, v0);
3024 __ aese(v0, v22); __ aesmc(v0, v0);
3025 __ aese(v0, v23); __ aesmc(v0, v0);
3026 __ aese(v0, v24); __ aesmc(v0, v0);
3027 __ aese(v0, v25); __ aesmc(v0, v0);
3028 __ aese(v0, v26); __ aesmc(v0, v0);
3029 __ aese(v0, v27); __ aesmc(v0, v0);
3030 __ aese(v0, v28); __ aesmc(v0, v0);
3031 __ aese(v0, v29); __ aesmc(v0, v0);
3032 __ aese(v0, v30);
3033 __ eor(v0, __ T16B, v0, v31);
3034
3035 __ st1(v0, __ T16B, __ post(to, 16));
3036
3037 __ subw(len_reg, len_reg, 16);
3038 __ cbnzw(len_reg, L_aes_loop);
3039
3040 __ st1(v0, __ T16B, rvec);
3041
3042 __ mov(r0, rscratch2);
3043
3044 __ leave();
3045 __ ret(lr);
3046
3047 return start;
3048 }
3049
3050 // Arguments:
3051 //
3052 // Inputs:
3053 // c_rarg0 - source byte array address
3054 // c_rarg1 - destination byte array address
3055 // c_rarg2 - K (key) in little endian int array
3056 // c_rarg3 - r vector byte array address
3057 // c_rarg4 - input length
3058 //
3059 // Output:
3060 // r0 - input length
3061 //
3062 address generate_cipherBlockChaining_decryptAESCrypt() {
3063 assert(UseAES, "need AES cryptographic extension support");
3064 __ align(CodeEntryAlignment);
3065 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
3066 StubCodeMark mark(this, stub_id);
3067
3068 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3069
3070 const Register from = c_rarg0; // source array address
3071 const Register to = c_rarg1; // destination array address
3072 const Register key = c_rarg2; // key array address
3073 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3074 // and left with the last ciphertext block (the new chaining value)
3075 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3076 const Register keylen = rscratch1;
3077
3078 address start = __ pc();
3079
3080 __ enter();
3081
3082 __ movw(rscratch2, len_reg);
3083
3084 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3085
3086 __ ld1(v2, __ T16B, rvec);
3087
3088 __ ld1(v31, __ T16B, __ post(key, 16));
3089 __ rev32(v31, __ T16B, v31);
3090
3091 __ cmpw(keylen, 52);
3092 __ br(Assembler::CC, L_loadkeys_44);
3093 __ br(Assembler::EQ, L_loadkeys_52);
3094
3095 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3096 __ rev32(v17, __ T16B, v17);
3097 __ rev32(v18, __ T16B, v18);
3098 __ BIND(L_loadkeys_52);
3099 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3100 __ rev32(v19, __ T16B, v19);
3101 __ rev32(v20, __ T16B, v20);
3102 __ BIND(L_loadkeys_44);
3103 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3104 __ rev32(v21, __ T16B, v21);
3105 __ rev32(v22, __ T16B, v22);
3106 __ rev32(v23, __ T16B, v23);
3107 __ rev32(v24, __ T16B, v24);
3108 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3109 __ rev32(v25, __ T16B, v25);
3110 __ rev32(v26, __ T16B, v26);
3111 __ rev32(v27, __ T16B, v27);
3112 __ rev32(v28, __ T16B, v28);
3113 __ ld1(v29, v30, __ T16B, key);
3114 __ rev32(v29, __ T16B, v29);
3115 __ rev32(v30, __ T16B, v30);
3116
3117 __ BIND(L_aes_loop);
3118 __ ld1(v0, __ T16B, __ post(from, 16));
3119 __ orr(v1, __ T16B, v0, v0);
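    // Keep an unmodified copy of the ciphertext block in v1: CBC decryption
    // XORs each decrypted block with the previous ciphertext block, so v1
    // is moved into v2 after the store below to chain the next iteration.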
3120
3121 __ br(Assembler::CC, L_rounds_44);
3122 __ br(Assembler::EQ, L_rounds_52);
3123
3124 __ aesd(v0, v17); __ aesimc(v0, v0);
3125 __ aesd(v0, v18); __ aesimc(v0, v0);
3126 __ BIND(L_rounds_52);
3127 __ aesd(v0, v19); __ aesimc(v0, v0);
3128 __ aesd(v0, v20); __ aesimc(v0, v0);
3129 __ BIND(L_rounds_44);
3130 __ aesd(v0, v21); __ aesimc(v0, v0);
3131 __ aesd(v0, v22); __ aesimc(v0, v0);
3132 __ aesd(v0, v23); __ aesimc(v0, v0);
3133 __ aesd(v0, v24); __ aesimc(v0, v0);
3134 __ aesd(v0, v25); __ aesimc(v0, v0);
3135 __ aesd(v0, v26); __ aesimc(v0, v0);
3136 __ aesd(v0, v27); __ aesimc(v0, v0);
3137 __ aesd(v0, v28); __ aesimc(v0, v0);
3138 __ aesd(v0, v29); __ aesimc(v0, v0);
3139 __ aesd(v0, v30);
3140 __ eor(v0, __ T16B, v0, v31);
3141 __ eor(v0, __ T16B, v0, v2);
3142
3143 __ st1(v0, __ T16B, __ post(to, 16));
3144 __ orr(v2, __ T16B, v1, v1);
3145
3146 __ subw(len_reg, len_reg, 16);
3147 __ cbnzw(len_reg, L_aes_loop);
3148
3149 __ st1(v2, __ T16B, rvec);
3150
3151 __ mov(r0, rscratch2);
3152
3153 __ leave();
3154 __ ret(lr);
3155
3156 return start;
3157 }
3158
3159 // Big-endian 128-bit + 64-bit -> 128-bit addition.
3160 // Inputs: in (the 128-bit addend) is preserved.
3161 // The least-significant 64-bit word is in the upper dword of each vector.
3162 // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
3163 // Output: result
3164 void be_add_128_64(FloatRegister result, FloatRegister in,
3165 FloatRegister inc, FloatRegister tmp) {
3166 assert_different_registers(result, tmp, inc);
3167
3168 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
3169 // input
3170 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
3171 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
3172 // MSD == 0 (must be!) to LSD
3173 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
3174 }
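  // Worked example (lanes shown as {MSD, LSD}): adding inc = {0, 1} to
  // in = {0x0000000000000001, 0xFFFFFFFFFFFFFFFF} gives {1, 0} after the
  // addv; the unsigned-higher compare flags the LSD overflow as all-ones,
  // the ext moves that flag into the MSD lane, and the subv subtracts -1
  // there, yielding {2, 0}, i.e. the carry has propagated into the most
  // significant half.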
3175
3176 // CTR AES crypt.
3177 // Arguments:
3178 //
3179 // Inputs:
3180 // c_rarg0 - source byte array address
3181 // c_rarg1 - destination byte array address
3182 // c_rarg2 - K (key) in little endian int array
3183 // c_rarg3 - counter vector byte array address
3184 // c_rarg4 - input length
3185 // c_rarg5 - saved encryptedCounter start
3186 // c_rarg6 - saved used length
3187 //
3188 // Output:
3189 // r0 - input length
3190 //
3191 address generate_counterMode_AESCrypt() {
3192 const Register in = c_rarg0;
3193 const Register out = c_rarg1;
3194 const Register key = c_rarg2;
3195 const Register counter = c_rarg3;
3196 const Register saved_len = c_rarg4, len = r10;
3197 const Register saved_encrypted_ctr = c_rarg5;
3198 const Register used_ptr = c_rarg6, used = r12;
3199
3200 const Register offset = r7;
3201 const Register keylen = r11;
3202
3203 const unsigned char block_size = 16;
3204 const int bulk_width = 4;
3205 // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3206 // performance with larger data sizes, but it also means that the
3207 // fast path isn't used until you have at least 8 blocks, and up
3208 // to 127 bytes of data will be processed on the slow path. For
3209 // that reason, and also so as not to blow away too much icache, 4
3210 // blocks seems like a sensible compromise.
3211
3212 // Algorithm:
3213 //
3214 // if (len == 0) {
3215 // goto DONE;
3216 // }
3217 // int result = len;
3218 // do {
3219 // if (used >= blockSize) {
3220 // if (len >= bulk_width * blockSize) {
3221 // CTR_large_block();
3222 // if (len == 0)
3223 // goto DONE;
3224 // }
3225 // for (;;) {
3226 // 16ByteVector v0 = counter;
3227 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3228 // used = 0;
3229 // if (len < blockSize)
3230 // break; /* goto NEXT */
3231 // 16ByteVector v1 = load16Bytes(in, offset);
3232 // v1 = v1 ^ encryptedCounter;
    //        store16Bytes(v1, out, offset);
3234 // used = blockSize;
3235 // offset += blockSize;
3236 // len -= blockSize;
3237 // if (len == 0)
3238 // goto DONE;
3239 // }
3240 // }
3241 // NEXT:
3242 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3243 // len--;
3244 // } while (len != 0);
3245 // DONE:
3246 // return result;
3247 //
3248 // CTR_large_block()
3249 // Wide bulk encryption of whole blocks.
3250
3251 __ align(CodeEntryAlignment);
3252 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
3253 StubCodeMark mark(this, stub_id);
3254 const address start = __ pc();
3255 __ enter();
3256
3257 Label DONE, CTR_large_block, large_block_return;
3258 __ ldrw(used, Address(used_ptr));
3259 __ cbzw(saved_len, DONE);
3260
3261 __ mov(len, saved_len);
3262 __ mov(offset, 0);
3263
3264 // Compute #rounds for AES based on the length of the key array
3265 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3266
3267 __ aesenc_loadkeys(key, keylen);
3268
3269 {
3270 Label L_CTR_loop, NEXT;
3271
3272 __ bind(L_CTR_loop);
3273
3274 __ cmp(used, block_size);
3275 __ br(__ LO, NEXT);
3276
3277 // Maybe we have a lot of data
3278 __ subsw(rscratch1, len, bulk_width * block_size);
3279 __ br(__ HS, CTR_large_block);
3280 __ BIND(large_block_return);
3281 __ cbzw(len, DONE);
3282
3283 // Setup the counter
3284 __ movi(v4, __ T4S, 0);
3285 __ movi(v5, __ T4S, 1);
3286 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3287
3288 // 128-bit big-endian increment
3289 __ ld1(v0, __ T16B, counter);
3290 __ rev64(v16, __ T16B, v0);
3291 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3292 __ rev64(v16, __ T16B, v16);
3293 __ st1(v16, __ T16B, counter);
3294 // Previous counter value is in v0
3295 // v4 contains { 0, 1 }
3296
3297 {
3298 // We have fewer than bulk_width blocks of data left. Encrypt
3299 // them one by one until there is less than a full block
3300 // remaining, being careful to save both the encrypted counter
3301 // and the counter.
3302
3303 Label inner_loop;
3304 __ bind(inner_loop);
3305 // Counter to encrypt is in v0
3306 __ aesecb_encrypt(noreg, noreg, keylen);
3307 __ st1(v0, __ T16B, saved_encrypted_ctr);
3308
3309 // Do we have a remaining full block?
3310
3311 __ mov(used, 0);
3312 __ cmp(len, block_size);
3313 __ br(__ LO, NEXT);
3314
3315 // Yes, we have a full block
3316 __ ldrq(v1, Address(in, offset));
3317 __ eor(v1, __ T16B, v1, v0);
3318 __ strq(v1, Address(out, offset));
3319 __ mov(used, block_size);
3320 __ add(offset, offset, block_size);
3321
3322 __ subw(len, len, block_size);
3323 __ cbzw(len, DONE);
3324
3325 // Increment the counter, store it back
3326 __ orr(v0, __ T16B, v16, v16);
3327 __ rev64(v16, __ T16B, v16);
3328 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3329 __ rev64(v16, __ T16B, v16);
3330 __ st1(v16, __ T16B, counter); // Save the incremented counter back
3331
3332 __ b(inner_loop);
3333 }
3334
3335 __ BIND(NEXT);
3336
3337 // Encrypt a single byte, and loop.
3338 // We expect this to be a rare event.
3339 __ ldrb(rscratch1, Address(in, offset));
3340 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3341 __ eor(rscratch1, rscratch1, rscratch2);
3342 __ strb(rscratch1, Address(out, offset));
3343 __ add(offset, offset, 1);
3344 __ add(used, used, 1);
      __ subw(len, len, 1);
3346 __ cbnzw(len, L_CTR_loop);
3347 }
3348
3349 __ bind(DONE);
3350 __ strw(used, Address(used_ptr));
3351 __ mov(r0, saved_len);
3352
3353 __ leave(); // required for proper stackwalking of RuntimeStub frame
3354 __ ret(lr);
3355
3356 // Bulk encryption
3357
    __ BIND(CTR_large_block);
3359 assert(bulk_width == 4 || bulk_width == 8, "must be");
3360
3361 if (bulk_width == 8) {
3362 __ sub(sp, sp, 4 * 16);
3363 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3364 }
3365 __ sub(sp, sp, 4 * 16);
3366 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3367 RegSet saved_regs = (RegSet::of(in, out, offset)
3368 + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3369 __ push(saved_regs, sp);
3370 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption
3371 __ add(in, in, offset);
3372 __ add(out, out, offset);
3373
3374 // Keys should already be loaded into the correct registers
3375
3376 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3377 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3378
3379 // AES/CTR loop
3380 {
3381 Label L_CTR_loop;
3382 __ BIND(L_CTR_loop);
3383
3384 // Setup the counters
3385 __ movi(v8, __ T4S, 0);
3386 __ movi(v9, __ T4S, 1);
3387 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3388
3389 for (int i = 0; i < bulk_width; i++) {
3390 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3391 __ rev64(v0_ofs, __ T16B, v16);
3392 be_add_128_64(v16, v16, v8, /*tmp*/v9);
3393 }
3394
3395 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3396
3397 // Encrypt the counters
3398 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3399
3400 if (bulk_width == 8) {
3401 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3402 }
3403
3404 // XOR the encrypted counters with the inputs
3405 for (int i = 0; i < bulk_width; i++) {
3406 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3407 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3408 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3409 }
3410
3411 // Write the encrypted data
3412 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3413 if (bulk_width == 8) {
3414 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3415 }
3416
3417 __ subw(len, len, 16 * bulk_width);
3418 __ cbnzw(len, L_CTR_loop);
3419 }
3420
3421 // Save the counter back where it goes
3422 __ rev64(v16, __ T16B, v16);
3423 __ st1(v16, __ T16B, counter);
3424
3425 __ pop(saved_regs, sp);
3426
3427 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3428 if (bulk_width == 8) {
3429 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3430 }
3431
3432 __ andr(rscratch1, len, -16 * bulk_width);
3433 __ sub(len, len, rscratch1);
3434 __ add(offset, offset, rscratch1);
3435 __ mov(used, 16);
3436 __ strw(used, Address(used_ptr));
3437 __ b(large_block_return);
3438
3439 return start;
3440 }
3441
3442 // Vector AES Galois Counter Mode implementation. Parameters:
3443 //
3444 // in = c_rarg0
3445 // len = c_rarg1
3446 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3447 // out = c_rarg3
3448 // key = c_rarg4
3449 // state = c_rarg5 - GHASH.state
3450 // subkeyHtbl = c_rarg6 - powers of H
3451 // counter = c_rarg7 - 16 bytes of CTR
3452 // return - number of processed bytes
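  //
  // For reference, a pseudocode sketch of the bulk path implemented below
  // (illustration only; standard GCM definitions assumed):
  //
  //   processed = len & ~127;                  // whole 8-block bundles only
  //   for (each bundle of 8 16-byte blocks) {
  //     for (i = 0; i < 8; i++) { E[i] = AES_K(ctr); ctr = inc32(ctr); }
  //     out[0..127] = in[0..127] ^ E[0..7];    // CTR encryption/decryption
  //   }
  //   for (each 16-byte block C of ct)         // GHASH over the ciphertext
  //     state = (state ^ C) * H;               // carry-less multiply modulo the
  //                                            // GCM polynomial (emitted below)
  //   return processed;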
3453 address generate_galoisCounterMode_AESCrypt() {
3454 Label ghash_polynomial; // local data generated after code
3455
3456 __ align(CodeEntryAlignment);
3457 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
3458 StubCodeMark mark(this, stub_id);
3459 address start = __ pc();
3460 __ enter();
3461
3462 const Register in = c_rarg0;
3463 const Register len = c_rarg1;
3464 const Register ct = c_rarg2;
    const Register out = c_rarg3;

    const Register key = c_rarg4;
    const Register state = c_rarg5;

    const Register subkeyHtbl = c_rarg6;

    // counter is read here and written back with its incremented value at the end
    const Register counter = c_rarg7;
3474
3475 const Register keylen = r10;
3476 // Save state before entering routine
3477 __ sub(sp, sp, 4 * 16);
3478 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3479 __ sub(sp, sp, 4 * 16);
3480 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3481
3483 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
3484 __ str(len, __ pre(sp, -2 * wordSize));
3485
3486 Label DONE;
3487 __ cbz(len, DONE);
3488
3489 // Compute #rounds for AES based on the length of the key array
3490 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3491
3492 __ aesenc_loadkeys(key, keylen);
3493 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3494 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3495
3496 // AES/CTR loop
3497 {
3498 Label L_CTR_loop;
3499 __ BIND(L_CTR_loop);
3500
3501 // Setup the counters
3502 __ movi(v8, __ T4S, 0);
3503 __ movi(v9, __ T4S, 1);
3504 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3505
      assert(v0->encoding() < v8->encoding(), "counter registers v0..v7 must precede the input registers");
3507 for (int i = v0->encoding(); i < v8->encoding(); i++) {
3508 FloatRegister f = as_FloatRegister(i);
3509 __ rev32(f, __ T16B, v16);
3510 __ addv(v16, __ T4S, v16, v8);
3511 }
3512
3513 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3514
3515 // Encrypt the counters
3516 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
3517
3518 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3519
3520 // XOR the encrypted counters with the inputs
3521 for (int i = 0; i < 8; i++) {
3522 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3523 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3524 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3525 }
3526 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3527 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3528
3529 __ subw(len, len, 16 * 8);
3530 __ cbnzw(len, L_CTR_loop);
3531 }
3532
3533 __ rev32(v16, __ T16B, v16);
3534 __ st1(v16, __ T16B, counter);
3535
3536 __ ldr(len, Address(sp));
3537 __ lsr(len, len, exact_log2(16)); // We want the count of blocks
3538
3539 // GHASH/CTR loop
3540 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
3541 len, /*unrolls*/4);
3542
3543 #ifdef ASSERT
3544 { Label L;
3545 __ cmp(len, (unsigned char)0);
3546 __ br(Assembler::EQ, L);
3547 __ stop("stubGenerator: abort");
3548 __ bind(L);
3549 }
3550 #endif
3551
3552 __ bind(DONE);
3553 // Return the number of bytes processed
3554 __ ldr(r0, __ post(sp, 2 * wordSize));
3555
3556 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3557 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3558
3559 __ leave(); // required for proper stackwalking of RuntimeStub frame
3560 __ ret(lr);
3561
3562 // bind label and generate polynomial data
3563 __ align(wordSize * 2);
3564 __ bind(ghash_polynomial);
3565 __ emit_int64(0x87); // The low-order bits of the field
3566 // polynomial (i.e. p = z^7+z^2+z+1)
3567 // repeated in the low and high parts of a
3568 // 128-bit vector
3569 __ emit_int64(0x87);
3570
3571 return start;
3572 }
3573
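  // Caches a 64-byte block of input in eight 64-bit general-purpose
  // registers so that the MD5 round helpers below can extract individual
  // 32-bit words without going back to memory.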
3574 class Cached64Bytes {
3575 private:
3576 MacroAssembler *_masm;
3577 Register _regs[8];
3578
3579 public:
3580 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
      assert(rs.size() == 8, "expected 8 registers to cache 16 4-byte words, got %u", rs.size());
3582 auto it = rs.begin();
3583 for (auto &r: _regs) {
3584 r = *it;
3585 ++it;
3586 }
3587 }
3588
3589 void gen_loads(Register base) {
3590 for (int i = 0; i < 8; i += 2) {
3591 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
3592 }
3593 }
3594
3595 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
3596 void extract_u32(Register dest, int i) {
3597 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
3598 }
3599 };
3600
3601 // Utility routines for md5.
3602 // Clobbers r10 and r11.
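  //
  // For reference, the auxiliary functions and step structure from RFC 1321
  // that md5_FF/md5_GG/md5_HH/md5_II implement (C-style sketch, illustration
  // only):
  //
  //   F(x,y,z) = (x & y) | (~x & z)
  //   G(x,y,z) = (x & z) | (y & ~z)
  //   H(x,y,z) = x ^ y ^ z
  //   I(x,y,z) = y ^ (x | ~z)
  //
  //   // one round-1 step: a = b + rotl32(a + F(b,c,d) + x[k] + t, s)
  //   // (the GG/HH/II steps use G/H/I in place of F)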
3603 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3604 int k, int s, int t) {
3605 Register rscratch3 = r10;
3606 Register rscratch4 = r11;
3607
3608 __ eorw(rscratch3, r3, r4);
3609 __ movw(rscratch2, t);
3610 __ andw(rscratch3, rscratch3, r2);
3611 __ addw(rscratch4, r1, rscratch2);
3612 reg_cache.extract_u32(rscratch1, k);
3613 __ eorw(rscratch3, rscratch3, r4);
3614 __ addw(rscratch4, rscratch4, rscratch1);
3615 __ addw(rscratch3, rscratch3, rscratch4);
3616 __ rorw(rscratch2, rscratch3, 32 - s);
3617 __ addw(r1, rscratch2, r2);
3618 }
3619
3620 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3621 int k, int s, int t) {
3622 Register rscratch3 = r10;
3623 Register rscratch4 = r11;
3624
3625 reg_cache.extract_u32(rscratch1, k);
3626 __ movw(rscratch2, t);
3627 __ addw(rscratch4, r1, rscratch2);
3628 __ addw(rscratch4, rscratch4, rscratch1);
3629 __ bicw(rscratch2, r3, r4);
3630 __ andw(rscratch3, r2, r4);
3631 __ addw(rscratch2, rscratch2, rscratch4);
3632 __ addw(rscratch2, rscratch2, rscratch3);
3633 __ rorw(rscratch2, rscratch2, 32 - s);
3634 __ addw(r1, rscratch2, r2);
3635 }
3636
3637 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3638 int k, int s, int t) {
3639 Register rscratch3 = r10;
3640 Register rscratch4 = r11;
3641
3642 __ eorw(rscratch3, r3, r4);
3643 __ movw(rscratch2, t);
3644 __ addw(rscratch4, r1, rscratch2);
3645 reg_cache.extract_u32(rscratch1, k);
3646 __ eorw(rscratch3, rscratch3, r2);
3647 __ addw(rscratch4, rscratch4, rscratch1);
3648 __ addw(rscratch3, rscratch3, rscratch4);
3649 __ rorw(rscratch2, rscratch3, 32 - s);
3650 __ addw(r1, rscratch2, r2);
3651 }
3652
3653 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
3654 int k, int s, int t) {
3655 Register rscratch3 = r10;
3656 Register rscratch4 = r11;
3657
3658 __ movw(rscratch3, t);
3659 __ ornw(rscratch2, r2, r4);
3660 __ addw(rscratch4, r1, rscratch3);
3661 reg_cache.extract_u32(rscratch1, k);
3662 __ eorw(rscratch3, rscratch2, r3);
3663 __ addw(rscratch4, rscratch4, rscratch1);
3664 __ addw(rscratch3, rscratch3, rscratch4);
3665 __ rorw(rscratch2, rscratch3, 32 - s);
3666 __ addw(r1, rscratch2, r2);
3667 }
3668
3669 // Arguments:
3670 //
3671 // Inputs:
3672 // c_rarg0 - byte[] source+offset
  //   c_rarg1   - int[]  MD5.state
3674 // c_rarg2 - int offset
3675 // c_rarg3 - int limit
3676 //
3677 address generate_md5_implCompress(StubId stub_id) {
3678 bool multi_block;
3679 switch (stub_id) {
3680 case StubId::stubgen_md5_implCompress_id:
3681 multi_block = false;
3682 break;
3683 case StubId::stubgen_md5_implCompressMB_id:
3684 multi_block = true;
3685 break;
3686 default:
3687 ShouldNotReachHere();
3688 }
3689 __ align(CodeEntryAlignment);
3690
3691 StubCodeMark mark(this, stub_id);
3692 address start = __ pc();
3693
3694 Register buf = c_rarg0;
3695 Register state = c_rarg1;
3696 Register ofs = c_rarg2;
3697 Register limit = c_rarg3;
3698 Register a = r4;
3699 Register b = r5;
3700 Register c = r6;
3701 Register d = r7;
3702 Register rscratch3 = r10;
3703 Register rscratch4 = r11;
3704
3705 Register state_regs[2] = { r12, r13 };
3706 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
3707 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
3708
3709 __ push(saved_regs, sp);
3710
3711 __ ldp(state_regs[0], state_regs[1], Address(state));
3712 __ ubfx(a, state_regs[0], 0, 32);
3713 __ ubfx(b, state_regs[0], 32, 32);
3714 __ ubfx(c, state_regs[1], 0, 32);
3715 __ ubfx(d, state_regs[1], 32, 32);
3716
3717 Label md5_loop;
3718 __ BIND(md5_loop);
3719
3720 reg_cache.gen_loads(buf);
3721
3722 // Round 1
3723 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
3724 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
3725 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
3726 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
3727 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
3728 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
3729 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
3730 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
3731 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
3732 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
3733 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
3734 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
3735 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
3736 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
3737 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
3738 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
3739
3740 // Round 2
3741 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
3742 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
3743 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
3744 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
3745 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
3746 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
3747 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
3748 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
3749 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
3750 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
3751 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
3752 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
3753 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
3754 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
3755 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
3756 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
3757
3758 // Round 3
3759 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
3760 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
3761 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
3762 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
3763 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
3764 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
3765 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
3766 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
3767 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
3768 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
3769 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
3770 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
3771 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
3772 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
3773 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
3774 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
3775
3776 // Round 4
3777 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
3778 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
3779 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
3780 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
3781 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
3782 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
3783 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
3784 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
3785 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
3786 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
3787 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
3788 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
3789 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
3790 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
3791 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
3792 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
3793
3794 __ addw(a, state_regs[0], a);
3795 __ ubfx(rscratch2, state_regs[0], 32, 32);
3796 __ addw(b, rscratch2, b);
3797 __ addw(c, state_regs[1], c);
3798 __ ubfx(rscratch4, state_regs[1], 32, 32);
3799 __ addw(d, rscratch4, d);
3800
3801 __ orr(state_regs[0], a, b, Assembler::LSL, 32);
3802 __ orr(state_regs[1], c, d, Assembler::LSL, 32);
3803
3804 if (multi_block) {
3805 __ add(buf, buf, 64);
3806 __ add(ofs, ofs, 64);
3807 __ cmp(ofs, limit);
3808 __ br(Assembler::LE, md5_loop);
3809 __ mov(c_rarg0, ofs); // return ofs
3810 }
3811
3812 // write hash values back in the correct order
3813 __ stp(state_regs[0], state_regs[1], Address(state));
3814
3815 __ pop(saved_regs, sp);
3816
3817 __ ret(lr);
3818
3819 return start;
3820 }
3821
3822 // Arguments:
3823 //
3824 // Inputs:
3825 // c_rarg0 - byte[] source+offset
3826 // c_rarg1 - int[] SHA.state
3827 // c_rarg2 - int offset
3828 // c_rarg3 - int limit
3829 //
3830 address generate_sha1_implCompress(StubId stub_id) {
3831 bool multi_block;
3832 switch (stub_id) {
3833 case StubId::stubgen_sha1_implCompress_id:
3834 multi_block = false;
3835 break;
3836 case StubId::stubgen_sha1_implCompressMB_id:
3837 multi_block = true;
3838 break;
3839 default:
3840 ShouldNotReachHere();
3841 }
3842
3843 __ align(CodeEntryAlignment);
3844
3845 StubCodeMark mark(this, stub_id);
3846 address start = __ pc();
3847
3848 Register buf = c_rarg0;
3849 Register state = c_rarg1;
3850 Register ofs = c_rarg2;
3851 Register limit = c_rarg3;
3852
3853 Label keys;
3854 Label sha1_loop;
3855
3856 // load the keys into v0..v3
3857 __ adr(rscratch1, keys);
3858 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word (160-bit) SHA-1 state into v6, v7
3860 __ ldrq(v6, Address(state, 0));
3861 __ ldrs(v7, Address(state, 16));
3862
3863
3864 __ BIND(sha1_loop);
3865 // load 64 bytes of data into v16..v19
3866 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3867 __ rev32(v16, __ T16B, v16);
3868 __ rev32(v17, __ T16B, v17);
3869 __ rev32(v18, __ T16B, v18);
3870 __ rev32(v19, __ T16B, v19);
3871
3872 // do the sha1
3873 __ addv(v4, __ T4S, v16, v0);
3874 __ orr(v20, __ T16B, v6, v6);
3875
3876 FloatRegister d0 = v16;
3877 FloatRegister d1 = v17;
3878 FloatRegister d2 = v18;
3879 FloatRegister d3 = v19;
3880
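    // Each iteration of this loop covers four of SHA-1's 80 rounds:
    // sha1c applies the Ch function (rounds 0..19), sha1m the Maj function
    // (rounds 40..59) and sha1p the parity function (rounds 20..39 and
    // 60..79), matching the selection on 'round' below.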
3881 for (int round = 0; round < 20; round++) {
3882 FloatRegister tmp1 = (round & 1) ? v4 : v5;
3883 FloatRegister tmp2 = (round & 1) ? v21 : v22;
3884 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3885 FloatRegister tmp4 = (round & 1) ? v5 : v4;
3886 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3887
3888 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3889 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3890 __ sha1h(tmp2, __ T4S, v20);
3891 if (round < 5)
3892 __ sha1c(v20, __ T4S, tmp3, tmp4);
3893 else if (round < 10 || round >= 15)
3894 __ sha1p(v20, __ T4S, tmp3, tmp4);
3895 else
3896 __ sha1m(v20, __ T4S, tmp3, tmp4);
3897 if (round < 16) __ sha1su1(d0, __ T4S, d3);
3898
3899 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3900 }
3901
3902 __ addv(v7, __ T2S, v7, v21);
3903 __ addv(v6, __ T4S, v6, v20);
3904
3905 if (multi_block) {
3906 __ add(ofs, ofs, 64);
3907 __ cmp(ofs, limit);
3908 __ br(Assembler::LE, sha1_loop);
3909 __ mov(c_rarg0, ofs); // return ofs
3910 }
3911
3912 __ strq(v6, Address(state, 0));
3913 __ strs(v7, Address(state, 16));
3914
3915 __ ret(lr);
3916
3917 __ bind(keys);
3918 __ emit_int32(0x5a827999);
3919 __ emit_int32(0x6ed9eba1);
3920 __ emit_int32(0x8f1bbcdc);
3921 __ emit_int32(0xca62c1d6);
3922
3923 return start;
3924 }
3925
3926
3927 // Arguments:
3928 //
3929 // Inputs:
3930 // c_rarg0 - byte[] source+offset
3931 // c_rarg1 - int[] SHA.state
3932 // c_rarg2 - int offset
3933 // c_rarg3 - int limit
3934 //
3935 address generate_sha256_implCompress(StubId stub_id) {
3936 bool multi_block;
3937 switch (stub_id) {
3938 case StubId::stubgen_sha256_implCompress_id:
3939 multi_block = false;
3940 break;
3941 case StubId::stubgen_sha256_implCompressMB_id:
3942 multi_block = true;
3943 break;
3944 default:
3945 ShouldNotReachHere();
3946 }
3947
3948 static const uint32_t round_consts[64] = {
3949 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3950 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3951 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3952 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3953 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3954 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3955 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3956 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3957 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3958 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3959 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3960 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3961 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3962 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3963 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3964 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3965 };
3966
3967 __ align(CodeEntryAlignment);
3968
3969 StubCodeMark mark(this, stub_id);
3970 address start = __ pc();
3971
3972 Register buf = c_rarg0;
3973 Register state = c_rarg1;
3974 Register ofs = c_rarg2;
3975 Register limit = c_rarg3;
3976
    Label sha256_loop;
3978
3979 __ stpd(v8, v9, __ pre(sp, -32));
3980 __ stpd(v10, v11, Address(sp, 16));
3981
3982 // dga == v0
3983 // dgb == v1
3984 // dg0 == v2
3985 // dg1 == v3
3986 // dg2 == v4
3987 // t0 == v6
3988 // t1 == v7
3989
    // load the 64 round constants into v16..v31
3991 __ lea(rscratch1, ExternalAddress((address)round_consts));
3992 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3993 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3994 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3995 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3996
3997 // load 8 words (256 bits) state
3998 __ ldpq(v0, v1, state);
3999
    __ BIND(sha256_loop);
4001 // load 64 bytes of data into v8..v11
4002 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
4003 __ rev32(v8, __ T16B, v8);
4004 __ rev32(v9, __ T16B, v9);
4005 __ rev32(v10, __ T16B, v10);
4006 __ rev32(v11, __ T16B, v11);
4007
4008 __ addv(v6, __ T4S, v8, v16);
4009 __ orr(v2, __ T16B, v0, v0);
4010 __ orr(v3, __ T16B, v1, v1);
4011
4012 FloatRegister d0 = v8;
4013 FloatRegister d1 = v9;
4014 FloatRegister d2 = v10;
4015 FloatRegister d3 = v11;
4016
4017
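    // Each iteration performs four of SHA-256's 64 rounds: sha256h and
    // sha256h2 update the two halves of the working state while
    // sha256su0/sha256su1 extend the message schedule in place.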
4018 for (int round = 0; round < 16; round++) {
4019 FloatRegister tmp1 = (round & 1) ? v6 : v7;
4020 FloatRegister tmp2 = (round & 1) ? v7 : v6;
4021 FloatRegister tmp3 = (round & 1) ? v2 : v4;
4022 FloatRegister tmp4 = (round & 1) ? v4 : v2;
4023
4024 if (round < 12) __ sha256su0(d0, __ T4S, d1);
4025 __ orr(v4, __ T16B, v2, v2);
4026 if (round < 15)
4027 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
4028 __ sha256h(v2, __ T4S, v3, tmp2);
4029 __ sha256h2(v3, __ T4S, v4, tmp2);
4030 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
4031
4032 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4033 }
4034
4035 __ addv(v0, __ T4S, v0, v2);
4036 __ addv(v1, __ T4S, v1, v3);
4037
4038 if (multi_block) {
4039 __ add(ofs, ofs, 64);
4040 __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
4042 __ mov(c_rarg0, ofs); // return ofs
4043 }
4044
4045 __ ldpd(v10, v11, Address(sp, 16));
4046 __ ldpd(v8, v9, __ post(sp, 32));
4047
4048 __ stpq(v0, v1, state);
4049
4050 __ ret(lr);
4051
4052 return start;
4053 }
4054
  // One "double round" (two of SHA-512's 80 rounds) using the sha512h,
  // sha512h2, sha512su0 and sha512su1 instructions; it is invoked 40 times
  // per block below.
4056 void sha512_dround(int dr,
4057 FloatRegister vi0, FloatRegister vi1,
4058 FloatRegister vi2, FloatRegister vi3,
4059 FloatRegister vi4, FloatRegister vrc0,
4060 FloatRegister vrc1, FloatRegister vin0,
4061 FloatRegister vin1, FloatRegister vin2,
4062 FloatRegister vin3, FloatRegister vin4) {
4063 if (dr < 36) {
4064 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
4065 }
4066 __ addv(v5, __ T2D, vrc0, vin0);
4067 __ ext(v6, __ T16B, vi2, vi3, 8);
4068 __ ext(v5, __ T16B, v5, v5, 8);
4069 __ ext(v7, __ T16B, vi1, vi2, 8);
4070 __ addv(vi3, __ T2D, vi3, v5);
4071 if (dr < 32) {
4072 __ ext(v5, __ T16B, vin3, vin4, 8);
4073 __ sha512su0(vin0, __ T2D, vin1);
4074 }
4075 __ sha512h(vi3, __ T2D, v6, v7);
4076 if (dr < 32) {
4077 __ sha512su1(vin0, __ T2D, vin2, v5);
4078 }
4079 __ addv(vi4, __ T2D, vi1, vi3);
4080 __ sha512h2(vi3, __ T2D, vi1, vi0);
4081 }
4082
4083 // Arguments:
4084 //
4085 // Inputs:
4086 // c_rarg0 - byte[] source+offset
4087 // c_rarg1 - int[] SHA.state
4088 // c_rarg2 - int offset
4089 // c_rarg3 - int limit
4090 //
4091 address generate_sha512_implCompress(StubId stub_id) {
4092 bool multi_block;
4093 switch (stub_id) {
4094 case StubId::stubgen_sha512_implCompress_id:
4095 multi_block = false;
4096 break;
4097 case StubId::stubgen_sha512_implCompressMB_id:
4098 multi_block = true;
4099 break;
4100 default:
4101 ShouldNotReachHere();
4102 }
4103
4104 static const uint64_t round_consts[80] = {
4105 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
4106 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
4107 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
4108 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
4109 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
4110 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
4111 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
4112 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
4113 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
4114 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
4115 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
4116 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
4117 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
4118 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
4119 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
4120 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
4121 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
4122 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
4123 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
4124 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
4125 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
4126 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
4127 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
4128 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
4129 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
4130 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
4131 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
4132 };
4133
4134 __ align(CodeEntryAlignment);
4135
4136 StubCodeMark mark(this, stub_id);
4137 address start = __ pc();
4138
4139 Register buf = c_rarg0;
4140 Register state = c_rarg1;
4141 Register ofs = c_rarg2;
4142 Register limit = c_rarg3;
4143
4144 __ stpd(v8, v9, __ pre(sp, -64));
4145 __ stpd(v10, v11, Address(sp, 16));
4146 __ stpd(v12, v13, Address(sp, 32));
4147 __ stpd(v14, v15, Address(sp, 48));
4148
4149 Label sha512_loop;
4150
4151 // load state
4152 __ ld1(v8, v9, v10, v11, __ T2D, state);
4153
4154 // load first 4 round constants
4155 __ lea(rscratch1, ExternalAddress((address)round_consts));
4156 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
4157
4158 __ BIND(sha512_loop);
4159 // load 128B of data into v12..v19
4160 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
4161 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
4162 __ rev64(v12, __ T16B, v12);
4163 __ rev64(v13, __ T16B, v13);
4164 __ rev64(v14, __ T16B, v14);
4165 __ rev64(v15, __ T16B, v15);
4166 __ rev64(v16, __ T16B, v16);
4167 __ rev64(v17, __ T16B, v17);
4168 __ rev64(v18, __ T16B, v18);
4169 __ rev64(v19, __ T16B, v19);
4170
4171 __ mov(rscratch2, rscratch1);
4172
4173 __ mov(v0, __ T16B, v8);
4174 __ mov(v1, __ T16B, v9);
4175 __ mov(v2, __ T16B, v10);
4176 __ mov(v3, __ T16B, v11);
4177
4178 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4179 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4180 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4181 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4182 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4183 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4184 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4185 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4186 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4187 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4188 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4189 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4190 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4191 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4192 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4193 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4194 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4195 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4196 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4197 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4198 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4199 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4200 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4201 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4202 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4203 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4204 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4205 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4206 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4207 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4208 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4209 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4210 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
4211 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4212 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4213 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4214 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4215 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4216 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4217 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4218
4219 __ addv(v8, __ T2D, v8, v0);
4220 __ addv(v9, __ T2D, v9, v1);
4221 __ addv(v10, __ T2D, v10, v2);
4222 __ addv(v11, __ T2D, v11, v3);
4223
4224 if (multi_block) {
4225 __ add(ofs, ofs, 128);
4226 __ cmp(ofs, limit);
4227 __ br(Assembler::LE, sha512_loop);
4228 __ mov(c_rarg0, ofs); // return ofs
4229 }
4230
4231 __ st1(v8, v9, v10, v11, __ T2D, state);
4232
4233 __ ldpd(v14, v15, Address(sp, 48));
4234 __ ldpd(v12, v13, Address(sp, 32));
4235 __ ldpd(v10, v11, Address(sp, 16));
4236 __ ldpd(v8, v9, __ post(sp, 64));
4237
4238 __ ret(lr);
4239
4240 return start;
4241 }
4242
4243 // Execute one round of keccak of two computations in parallel.
4244 // One of the states should be loaded into the lower halves of
4245 // the vector registers v0-v24, the other should be loaded into
4246 // the upper halves of those registers. The ld1r instruction loads
4247 // the round constant into both halves of register v31.
4248 // Intermediate results c0...c5 and d0...d5 are computed
4249 // in registers v25...v30.
4250 // All vector instructions that are used operate on both register
4251 // halves in parallel.
  // If only a single computation is needed, it suffices to load only the lower halves.
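  //
  // For reference, one Keccak-f[1600] round in pseudocode (a[x][y] is the
  // 5x5 lane state and rc the round constant; the code below interleaves
  // these steps and renames lanes for scheduling, so the mapping to
  // registers is not one-to-one):
  //
  //   // theta
  //   c[x] = a[x][0] ^ a[x][1] ^ a[x][2] ^ a[x][3] ^ a[x][4]
  //   d[x] = c[x-1] ^ rotl64(c[x+1], 1)
  //   a[x][y] ^= d[x]
  //   // rho and pi: rotate each lane by a fixed offset and permute lanes
  //   // chi
  //   a[x][y] ^= ~a[x+1][y] & a[x+2][y]
  //   // iota
  //   a[0][0] ^= rc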
4253 void keccak_round(Register rscratch1) {
4254 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
    __ eor3(v26, __ T16B, v1, v6, v11);  // c1 = a1 ^ a6 ^ a11
    __ eor3(v28, __ T16B, v3, v8, v13);  // c3 = a3 ^ a8 ^ a13
4257 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4258 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4259 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4260 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4261 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4262 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4263 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4264
4265 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4266 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4267 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4268 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4269 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4270
4271 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4272 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
    __ xar(v1, __ T2D, v6, v25, (64 - 44));  // a1 = rol((a6^d1), 44)
4274 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4275 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4276 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4277 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4278 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4279 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4280 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4281 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4282 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4283 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4284 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4285 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4286 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4287 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4288 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4289 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4290 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4291 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4292 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4293 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4294 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4295 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4296
    __ bcax(v20, __ T16B, v31, v22, v8);  // a20 = a20' ^ (~a21' & a22)
4298 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4299 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4300 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4301 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4302
4303 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4304
4305 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4306 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4307 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4308 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4309 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
4310
4311 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
4312 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
4313 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
4314 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
4315 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
4316
4317 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
4318 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
4319 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
4320 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7)
4321 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8')
4322
4323 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
4324 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
4325 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
4326 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3)
4327 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4')
4328
4329 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
4330 }
4331
4332 // Arguments:
4333 //
4334 // Inputs:
4335 // c_rarg0 - byte[] source+offset
4336 // c_rarg1 - byte[] SHA.state
4337 // c_rarg2 - int block_size
4338 // c_rarg3 - int offset
4339 // c_rarg4 - int limit
4340 //
4341 address generate_sha3_implCompress(StubId stub_id) {
4342 bool multi_block;
4343 switch (stub_id) {
4344 case StubId::stubgen_sha3_implCompress_id:
4345 multi_block = false;
4346 break;
4347 case StubId::stubgen_sha3_implCompressMB_id:
4348 multi_block = true;
4349 break;
4350 default:
4351 ShouldNotReachHere();
4352 }
4353
4354 static const uint64_t round_consts[24] = {
4355 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4356 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4357 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4358 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4359 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4360 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4361 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4362 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4363 };
4364
4365 __ align(CodeEntryAlignment);
4366
4367 StubCodeMark mark(this, stub_id);
4368 address start = __ pc();
4369
4370 Register buf = c_rarg0;
4371 Register state = c_rarg1;
4372 Register block_size = c_rarg2;
4373 Register ofs = c_rarg3;
4374 Register limit = c_rarg4;
4375
4376 Label sha3_loop, rounds24_loop;
4377 Label sha3_512_or_sha3_384, shake128;
4378
4379 __ stpd(v8, v9, __ pre(sp, -64));
4380 __ stpd(v10, v11, Address(sp, 16));
4381 __ stpd(v12, v13, Address(sp, 32));
4382 __ stpd(v14, v15, Address(sp, 48));
4383
4384 // load state
4385 __ add(rscratch1, state, 32);
4386 __ ld1(v0, v1, v2, v3, __ T1D, state);
4387 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4388 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4389 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4390 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4391 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4392 __ ld1(v24, __ T1D, rscratch1);
4393
4394 __ BIND(sha3_loop);
4395
4396 // 24 keccak rounds
4397 __ movw(rscratch2, 24);
4398
4399 // load round_constants base
4400 __ lea(rscratch1, ExternalAddress((address) round_consts));
4401
4402 // load input
4403 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4404 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4405 __ eor(v0, __ T8B, v0, v25);
4406 __ eor(v1, __ T8B, v1, v26);
4407 __ eor(v2, __ T8B, v2, v27);
4408 __ eor(v3, __ T8B, v3, v28);
4409 __ eor(v4, __ T8B, v4, v29);
4410 __ eor(v5, __ T8B, v5, v30);
4411 __ eor(v6, __ T8B, v6, v31);
4412
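    // block_size is the SHA-3 "rate" in bytes and identifies the variant:
    // 72 -> SHA3-512, 104 -> SHA3-384, 136 -> SHA3-256 or SHAKE256,
    // 144 -> SHA3-224, 168 -> SHAKE128. The bit tests below dispatch on it
    // to absorb the correct number of remaining input bytes.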
4413 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4414 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4415
4416 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4417 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4418 __ eor(v7, __ T8B, v7, v25);
4419 __ eor(v8, __ T8B, v8, v26);
4420 __ eor(v9, __ T8B, v9, v27);
4421 __ eor(v10, __ T8B, v10, v28);
4422 __ eor(v11, __ T8B, v11, v29);
4423 __ eor(v12, __ T8B, v12, v30);
4424 __ eor(v13, __ T8B, v13, v31);
4425
4426 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4427 __ eor(v14, __ T8B, v14, v25);
4428 __ eor(v15, __ T8B, v15, v26);
4429 __ eor(v16, __ T8B, v16, v27);
4430
4431 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4432 __ andw(c_rarg5, block_size, 48);
4433 __ cbzw(c_rarg5, rounds24_loop);
4434
4435 __ tbnz(block_size, 5, shake128);
4436 // block_size == 144, bit5 == 0, SHA3-224
4437 __ ldrd(v28, __ post(buf, 8));
4438 __ eor(v17, __ T8B, v17, v28);
4439 __ b(rounds24_loop);
4440
4441 __ BIND(shake128);
4442 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4443 __ eor(v17, __ T8B, v17, v28);
4444 __ eor(v18, __ T8B, v18, v29);
4445 __ eor(v19, __ T8B, v19, v30);
4446 __ eor(v20, __ T8B, v20, v31);
4447 __ b(rounds24_loop); // block_size == 168, SHAKE128
4448
4449 __ BIND(sha3_512_or_sha3_384);
4450 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4451 __ eor(v7, __ T8B, v7, v25);
4452 __ eor(v8, __ T8B, v8, v26);
4453 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4454
4455 // SHA3-384
4456 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4457 __ eor(v9, __ T8B, v9, v27);
4458 __ eor(v10, __ T8B, v10, v28);
4459 __ eor(v11, __ T8B, v11, v29);
4460 __ eor(v12, __ T8B, v12, v30);
4461
4462 __ BIND(rounds24_loop);
4463 __ subw(rscratch2, rscratch2, 1);
4464
4465 keccak_round(rscratch1);
4466
4467 __ cbnzw(rscratch2, rounds24_loop);
4468
4469 if (multi_block) {
4470 __ add(ofs, ofs, block_size);
4471 __ cmp(ofs, limit);
4472 __ br(Assembler::LE, sha3_loop);
4473 __ mov(c_rarg0, ofs); // return ofs
4474 }
4475
4476 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
4477 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4478 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4479 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4480 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4481 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4482 __ st1(v24, __ T1D, state);
4483
4484 // restore callee-saved registers
4485 __ ldpd(v14, v15, Address(sp, 48));
4486 __ ldpd(v12, v13, Address(sp, 32));
4487 __ ldpd(v10, v11, Address(sp, 16));
4488 __ ldpd(v8, v9, __ post(sp, 64));
4489
4490 __ ret(lr);
4491
4492 return start;
4493 }
4494
4495 // Inputs:
4496 // c_rarg0 - long[] state0
4497 // c_rarg1 - long[] state1
4498 address generate_double_keccak() {
4499 static const uint64_t round_consts[24] = {
4500 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
4501 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
4502 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
4503 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
4504 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
4505 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
4506 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
4507 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
4508 };
4509
    // Implements the double_keccak() method of the
    // sun.security.provider.SHA3Parallel class
4512 __ align(CodeEntryAlignment);
4513 StubCodeMark mark(this, "StubRoutines", "double_keccak");
4514 address start = __ pc();
4515 __ enter();
4516
4517 Register state0 = c_rarg0;
4518 Register state1 = c_rarg1;
4519
4520 Label rounds24_loop;
4521
4522 // save callee-saved registers
4523 __ stpd(v8, v9, __ pre(sp, -64));
4524 __ stpd(v10, v11, Address(sp, 16));
4525 __ stpd(v12, v13, Address(sp, 32));
4526 __ stpd(v14, v15, Address(sp, 48));
4527
4528 // load states
4529 __ add(rscratch1, state0, 32);
4530 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
4531 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
4532 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
4533 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
4534 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
4535 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
4536 __ ld1(v24, __ D, 0, rscratch1);
4537 __ add(rscratch1, state1, 32);
4538 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
4539 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
4540 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
4541 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
4542 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
4543 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
4544 __ ld1(v24, __ D, 1, rscratch1);
4545
4546 // 24 keccak rounds
4547 __ movw(rscratch2, 24);
4548
4549 // load round_constants base
4550 __ lea(rscratch1, ExternalAddress((address) round_consts));
4551
4552 __ BIND(rounds24_loop);
4553 __ subw(rscratch2, rscratch2, 1);
4554 keccak_round(rscratch1);
4555 __ cbnzw(rscratch2, rounds24_loop);
4556
4557 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
4558 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
4559 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
4560 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
4561 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
4562 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
4563 __ st1(v24, __ D, 0, state0);
4564 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
4565 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
4566 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
4567 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
4568 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
4569 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
4570 __ st1(v24, __ D, 1, state1);
4571
4572 // restore callee-saved vector registers
4573 __ ldpd(v14, v15, Address(sp, 48));
4574 __ ldpd(v12, v13, Address(sp, 32));
4575 __ ldpd(v10, v11, Address(sp, 16));
4576 __ ldpd(v8, v9, __ post(sp, 64));
4577
4578 __ leave(); // required for proper stackwalking of RuntimeStub frame
4579 __ mov(r0, zr); // return 0
4580 __ ret(lr);
4581
4582 return start;
4583 }
4584
4585 // ChaCha20 block function. This version parallelizes the 32-bit
4586 // state elements on each of 16 vectors, producing 4 blocks of
4587 // keystream at a time.
4588 //
4589 // state (int[16]) = c_rarg0
4590 // keystream (byte[256]) = c_rarg1
4591 // return - number of bytes of produced keystream (always 256)
4592 //
4593 // This implementation takes each 32-bit integer from the state
4594 // array and broadcasts it across all 4 32-bit lanes of a vector register
4595 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
4596 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
4597 // the quarter round schedule is implemented as outlined in RFC 7539 section
4598 // 2.3. However, instead of sequentially processing the 3 quarter round
4599 // operations represented by one QUARTERROUND function, we instead stack all
4600 // the adds, xors and left-rotations from the first 4 quarter rounds together
4601 // and then do the same for the second set of 4 quarter rounds. This removes
4602 // some latency that would otherwise be incurred by waiting for an add to
4603 // complete before performing an xor (which depends on the result of the
4604 // add), etc. An adjustment happens between the first and second groups of 4
4605 // quarter rounds, but this is done only in the inputs to the macro functions
4606 // that generate the assembly instructions - these adjustments themselves are
4607 // not part of the resulting assembly.
4608 // The 4 registers v0-v3 are used during the quarter round operations as
4609 // scratch registers. Once the 20 rounds are complete, these 4 scratch
4610 // registers become the vectors involved in adding the start state back onto
4611 // the post-QR working state. After the adds are complete, each of the 16
4612 // vectors write their first lane back to the keystream buffer, followed
4613 // by the second lane from all vectors and so on.
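  //
  // For reference, one ChaCha20 quarter round as defined in RFC 7539
  // section 2.1 (the cc20_qr_* helpers below apply it to four (a,b,c,d)
  // column or diagonal tuples at a time):
  //
  //   a += b; d ^= a; d = rotl32(d, 16);
  //   c += d; b ^= c; b = rotl32(b, 12);
  //   a += b; d ^= a; d = rotl32(d, 8);
  //   c += d; b ^= c; b = rotl32(b, 7);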
4614 address generate_chacha20Block_blockpar() {
4615 Label L_twoRounds, L_cc20_const;
4616 __ align(CodeEntryAlignment);
4617 StubId stub_id = StubId::stubgen_chacha20Block_id;
4618 StubCodeMark mark(this, stub_id);
4619 address start = __ pc();
4620 __ enter();
4621
4622 int i, j;
4623 const Register state = c_rarg0;
4624 const Register keystream = c_rarg1;
4625 const Register loopCtr = r10;
4626 const Register tmpAddr = r11;
4627 const FloatRegister ctrAddOverlay = v28;
4628 const FloatRegister lrot8Tbl = v29;
4629
4630 // Organize SIMD registers in an array that facilitates
4631 // putting repetitive opcodes into loop structures. It is
4632 // important that each grouping of 4 registers is monotonically
4633 // increasing to support the requirements of multi-register
4634 // instructions (e.g. ld4r, st4, etc.)
4635 const FloatRegister workSt[16] = {
4636 v4, v5, v6, v7, v16, v17, v18, v19,
4637 v20, v21, v22, v23, v24, v25, v26, v27
4638 };
4639
4640 // Pull in constant data. The first 16 bytes are the add overlay
4641 // which is applied to the vector holding the counter (state[12]).
4642 // The second 16 bytes is the index register for the 8-bit left
4643 // rotation tbl instruction.
4644 __ adr(tmpAddr, L_cc20_const);
4645 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
4646
4647 // Load from memory and interlace across 16 SIMD registers,
    // with each word from memory being broadcast to all lanes of
4649 // each successive SIMD register.
4650 // Addr(0) -> All lanes in workSt[i]
4651 // Addr(4) -> All lanes workSt[i + 1], etc.
4652 __ mov(tmpAddr, state);
4653 for (i = 0; i < 16; i += 4) {
4654 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
4655 __ post(tmpAddr, 16));
4656 }
4657 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4658
4659 // Before entering the loop, create 5 4-register arrays. These
4660 // will hold the 4 registers that represent the a/b/c/d fields
4661 // in the quarter round operation. For instance the "b" field
4662 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
4663 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
4664 // since it is part of a diagonal organization. The aSet and scratch
4665 // register sets are defined at declaration time because they do not change
4666 // organization at any point during the 20-round processing.
4667 FloatRegister aSet[4] = { v4, v5, v6, v7 };
4668 FloatRegister bSet[4];
4669 FloatRegister cSet[4];
4670 FloatRegister dSet[4];
4671 FloatRegister scratch[4] = { v0, v1, v2, v3 };
4672
4673 // Set up the 10 iteration loop and perform all 8 quarter round ops
4674 __ mov(loopCtr, 10);
4675 __ BIND(L_twoRounds);
4676
4677 // Set to columnar organization and do the following 4 quarter-rounds:
4678 // QUARTERROUND(0, 4, 8, 12)
4679 // QUARTERROUND(1, 5, 9, 13)
4680 // QUARTERROUND(2, 6, 10, 14)
4681 // QUARTERROUND(3, 7, 11, 15)
4682 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
4683 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
4684 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
4685
4686 __ cc20_qr_add4(aSet, bSet); // a += b
4687 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4688 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4689
4690 __ cc20_qr_add4(cSet, dSet); // c += d
4691 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4692 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4693
4694 __ cc20_qr_add4(aSet, bSet); // a += b
4695 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4696 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4697
4698 __ cc20_qr_add4(cSet, dSet); // c += d
4699 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);  // b <<<= 7
4701
4702 // Set to diagonal organization and do the next 4 quarter-rounds:
4703 // QUARTERROUND(0, 5, 10, 15)
4704 // QUARTERROUND(1, 6, 11, 12)
4705 // QUARTERROUND(2, 7, 8, 13)
4706 // QUARTERROUND(3, 4, 9, 14)
4707 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
4708 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
4709 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
4710
4711 __ cc20_qr_add4(aSet, bSet); // a += b
4712 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4713 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
4714
4715 __ cc20_qr_add4(cSet, dSet); // c += d
4716 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
4717 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
4718
4719 __ cc20_qr_add4(aSet, bSet); // a += b
4720 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
4721 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
4722
4723 __ cc20_qr_add4(cSet, dSet); // c += d
4724 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
    __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl);  // b <<<= 7
4726
4727 // Decrement and iterate
4728 __ sub(loopCtr, loopCtr, 1);
4729 __ cbnz(loopCtr, L_twoRounds);
4730
4731 __ mov(tmpAddr, state);
4732
4733 // Add the starting state back to the post-loop keystream
4734 // state. We read/interlace the state array from memory into
4735 // 4 registers similar to what we did in the beginning. Then
4736 // add the counter overlay onto workSt[12] at the end.
4737 for (i = 0; i < 16; i += 4) {
4738 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
4739 __ addv(workSt[i], __ T4S, workSt[i], v0);
4740 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
4741 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
4742 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
4743 }
4744 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
4745
4746 // Write working state into the keystream buffer. This is accomplished
4747 // by taking the lane "i" from each of the four vectors and writing
4748 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
4749 // repeating with the next 4 vectors until all 16 vectors have been used.
4750 // Then move to the next lane and repeat the process until all lanes have
4751 // been written.
4752 for (i = 0; i < 4; i++) {
4753 for (j = 0; j < 16; j += 4) {
4754 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
4755 __ post(keystream, 16));
4756 }
4757 }
4758
4759 __ mov(r0, 256); // Return length of output keystream
4760 __ leave();
4761 __ ret(lr);
4762
4763 // bind label and generate local constant data used by this stub
4764 // The constant data is broken into two 128-bit segments to be loaded
4765 // onto FloatRegisters. The first 128 bits are a counter add overlay
4766 // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
4767 // The second 128 bits are a table constant used for 8-bit left rotations.
4768 __ BIND(L_cc20_const);
4769 __ emit_int64(0x0000000100000000UL);
4770 __ emit_int64(0x0000000300000002UL);
4771 __ emit_int64(0x0605040702010003UL);
4772 __ emit_int64(0x0E0D0C0F0A09080BUL);
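
// A sketch of how these two quadwords are used (names here are only
// this comment's shorthand): the first is loaded as the counter add
// overlay, so adding it to dup(state[12]) gives the four parallel
// blocks counters of +0/+1/+2/+3. The second is a tbl permutation
// index: within each 32-bit lane, output byte i is taken from input
// byte (i + 3) mod 4, which rotates every little-endian 32-bit lane
// left by 8 bits in a single tbl instruction.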
4773
4774 return start;
4775 }
4776
4777 // Helpers to schedule parallel operation bundles across vector
4778 // register sequences of size 2, 4 or 8.
4779
4780 // Implement various primitive computations across vector sequences
4781
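// As an illustrative example (shorthand only, not code in this file):
// with VSeq<4> va(0), vb(4), vc(8), a call to
//
//   vs_addv(va, __ T8H, vb, vc);
//
// simply schedules the four independent instructions
//
//   addv(v0, T8H, v4, v8);
//   addv(v1, T8H, v5, v9);
//   addv(v2, T8H, v6, v10);
//   addv(v3, T8H, v7, v11);
//
// one per register in each sequence.
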
4782 template<int N>
4783 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4784 const VSeq<N>& v1, const VSeq<N>& v2) {
4785 // output must not be constant
4786 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4787 // output cannot overwrite pending inputs
4788 assert(!vs_write_before_read(v, v1), "output overwrites input");
4789 assert(!vs_write_before_read(v, v2), "output overwrites input");
4790 for (int i = 0; i < N; i++) {
4791 __ addv(v[i], T, v1[i], v2[i]);
4792 }
4793 }
4794
4795 template<int N>
4796 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4797 const VSeq<N>& v1, const VSeq<N>& v2) {
4798 // output must not be constant
4799 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4800 // output cannot overwrite pending inputs
4801 assert(!vs_write_before_read(v, v1), "output overwrites input");
4802 assert(!vs_write_before_read(v, v2), "output overwrites input");
4803 for (int i = 0; i < N; i++) {
4804 __ subv(v[i], T, v1[i], v2[i]);
4805 }
4806 }
4807
4808 template<int N>
4809 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4810 const VSeq<N>& v1, const VSeq<N>& v2) {
4811 // output must not be constant
4812 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4813 // output cannot overwrite pending inputs
4814 assert(!vs_write_before_read(v, v1), "output overwrites input");
4815 assert(!vs_write_before_read(v, v2), "output overwrites input");
4816 for (int i = 0; i < N; i++) {
4817 __ mulv(v[i], T, v1[i], v2[i]);
4818 }
4819 }
4820
4821 template<int N>
4822 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
4823 // output must not be constant
4824 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4825 // output cannot overwrite pending inputs
4826 assert(!vs_write_before_read(v, v1), "output overwrites input");
4827 for (int i = 0; i < N; i++) {
4828 __ negr(v[i], T, v1[i]);
4829 }
4830 }
4831
4832 template<int N>
4833 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
4834 const VSeq<N>& v1, int shift) {
4835 // output must not be constant
4836 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4837 // output cannot overwrite pending inputs
4838 assert(!vs_write_before_read(v, v1), "output overwrites input");
4839 for (int i = 0; i < N; i++) {
4840 __ sshr(v[i], T, v1[i], shift);
4841 }
4842 }
4843
4844 template<int N>
4845 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4846 // output must not be constant
4847 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4848 // output cannot overwrite pending inputs
4849 assert(!vs_write_before_read(v, v1), "output overwrites input");
4850 assert(!vs_write_before_read(v, v2), "output overwrites input");
4851 for (int i = 0; i < N; i++) {
4852 __ andr(v[i], __ T16B, v1[i], v2[i]);
4853 }
4854 }
4855
4856 template<int N>
4857 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
4858 // output must not be constant
4859 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4860 // output cannot overwrite pending inputs
4861 assert(!vs_write_before_read(v, v1), "output overwrites input");
4862 assert(!vs_write_before_read(v, v2), "output overwrites input");
4863 for (int i = 0; i < N; i++) {
4864 __ orr(v[i], __ T16B, v1[i], v2[i]);
4865 }
4866 }
4867
4868 template<int N>
4869 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
4870 // output must not be constant
4871 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4872 // output cannot overwrite pending inputs
4873 assert(!vs_write_before_read(v, v1), "output overwrites input");
4874 for (int i = 0; i < N; i++) {
4875 __ notr(v[i], __ T16B, v1[i]);
4876 }
4877 }
4878
4879 template<int N>
4880 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
4881 // output must not be constant
4882 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4883 // output cannot overwrite pending inputs
4884 assert(!vs_write_before_read(v, v1), "output overwrites input");
4885 assert(!vs_write_before_read(v, v2), "output overwrites input");
4886 for (int i = 0; i < N; i++) {
4887 __ sqdmulh(v[i], T, v1[i], v2[i]);
4888 }
4889 }
4890
4891 template<int N>
4892 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
4893 // output must not be constant
4894 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
4895 // output cannot overwrite pending inputs
4896 assert(!vs_write_before_read(v, v1), "output overwrites input");
4897 assert(!vs_write_before_read(v, v2), "output overwrites input");
4898 for (int i = 0; i < N; i++) {
4899 __ mlsv(v[i], T, v1[i], v2[i]);
4900 }
4901 }
4902
4903 // load N/2 successive pairs of quadword values from memory in order
4904 // into N successive vector registers of the sequence via the
4905 // address supplied in base.
4906 template<int N>
4907 void vs_ldpq(const VSeq<N>& v, Register base) {
4908 for (int i = 0; i < N; i += 2) {
4909 __ ldpq(v[i], v[i+1], Address(base, 32 * i));
4910 }
4911 }
4912
4913 // load N/2 successive pairs of quadword values from memory in order
4914 // into N vector registers of the sequence via the address supplied
4915 // in base using post-increment addressing
4916 template<int N>
4917 void vs_ldpq_post(const VSeq<N>& v, Register base) {
4918 static_assert((N & 1) == 0, "sequence length must be even");
4919 for (int i = 0; i < N; i += 2) {
4920 __ ldpq(v[i], v[i+1], __ post(base, 32));
4921 }
4922 }
4923
4924 // store N successive vector registers of the sequence into N/2
4925 // successive pairs of quadword memory locations via the address
4926 // supplied in base using post-increment addressing
4927 template<int N>
4928 void vs_stpq_post(const VSeq<N>& v, Register base) {
4929 static_assert((N & 1) == 0, "sequence length must be even");
4930 for (int i = 0; i < N; i += 2) {
4931 __ stpq(v[i], v[i+1], __ post(base, 32));
4932 }
4933 }
4934
4935 // load N/2 pairs of quadword values from memory de-interleaved into
4936 // N vector registers 2 at a time via the address supplied in base
4937 // using post-increment addressing.
4938 template<int N>
4939 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4940 static_assert((N & 1) == 0, "sequence length must be even");
4941 for (int i = 0; i < N; i += 2) {
4942 __ ld2(v[i], v[i+1], T, __ post(base, 32));
4943 }
4944 }
4945
4946 // store N vector registers interleaved into N/2 pairs of quadword
4947 // memory locations via the address supplied in base using
4948 // post-increment addressing.
4949 template<int N>
4950 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4951 static_assert((N & 1) == 0, "sequence length must be even");
4952 for (int i = 0; i < N; i += 2) {
4953 __ st2(v[i], v[i+1], T, __ post(base, 32));
4954 }
4955 }
4956
4957 // load N quadword values from memory de-interleaved into N vector
4958 // registers 3 elements at a time via the address supplied in base.
4959 template<int N>
4960 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4961 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4962 for (int i = 0; i < N; i += 3) {
4963 __ ld3(v[i], v[i+1], v[i+2], T, base);
4964 }
4965 }
4966
4967 // load N quadword values from memory de-interleaved into N vector
4968 // registers 3 elements at a time via the address supplied in base
4969 // using post-increment addressing.
4970 template<int N>
4971 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
4972 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
4973 for (int i = 0; i < N; i += 3) {
4974 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
4975 }
4976 }
4977
4978 // load N/2 pairs of quadword values from memory into N vector
4979 // registers via the address supplied in base with each pair indexed
4980 // using the start offset plus the corresponding entry in the
4981 // offsets array
4982 template<int N>
4983 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
4984 for (int i = 0; i < N/2; i++) {
4985 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4986 }
4987 }
4988
4989 // store N vector registers into N/2 pairs of quadword memory
4990 // locations via the address supplied in base with each pair indexed
4991 // using the start offset plus the corresponding entry in the
4992 // offsets array
4993 template<int N>
4994 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
4995 for (int i = 0; i < N/2; i++) {
4996 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
4997 }
4998 }
4999
5000 // load N single quadword values from memory into N vector registers
5001 // via the address supplied in base with each value indexed using
5002 // the start offset plus the corresponding entry in the offsets
5003 // array
5004 template<int N>
5005 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5006 int start, int (&offsets)[N]) {
5007 for (int i = 0; i < N; i++) {
5008 __ ldr(v[i], T, Address(base, start + offsets[i]));
5009 }
5010 }
5011
5012 // store N vector registers into N single quadword memory locations
5013 // via the address supplied in base with each value indexed using
5014 // the start offset plus the corresponding entry in the offsets
5015 // array
5016 template<int N>
5017 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5018 int start, int (&offsets)[N]) {
5019 for (int i = 0; i < N; i++) {
5020 __ str(v[i], T, Address(base, start + offsets[i]));
5021 }
5022 }
5023
5024 // load N/2 pairs of quadword values from memory de-interleaved into
5025 // N vector registers 2 at a time via the address supplied in base
5026 // with each pair indexed using the start offset plus the
5027 // corresponding entry in the offsets array
5028 template<int N>
5029 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5030 Register tmp, int start, int (&offsets)[N/2]) {
5031 for (int i = 0; i < N/2; i++) {
5032 __ add(tmp, base, start + offsets[i]);
5033 __ ld2(v[2*i], v[2*i+1], T, tmp);
5034 }
5035 }
5036
5037 // store N vector registers 2 at a time interleaved into N/2 pairs
5038 // of quadword memory locations via the address supplied in base
5039 // with each pair indexed using the start offset plus the
5040 // corresponding entry in the offsets array
5041 template<int N>
5042 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5043 Register tmp, int start, int (&offsets)[N/2]) {
5044 for (int i = 0; i < N/2; i++) {
5045 __ add(tmp, base, start + offsets[i]);
5046 __ st2(v[2*i], v[2*i+1], T, tmp);
5047 }
5048 }
5049
5050 // Helper routines for various flavours of Montgomery multiply
5051
5052 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
5053 // multiplications in parallel
5054 //
5055
5056 // See the montMul() method of the sun.security.provider.ML_DSA
5057 // class.
5058 //
5059 // Computes 4x4S results or 4x8H results
5060 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5061 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5062 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5063 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5064 // Outputs: va - 4x4S or 4x8H vector register sequences
5065 // vb, vc, vtmp and vq must all be disjoint
5066 // va must be disjoint from all other inputs/temps or must equal vc
5067 // va must have a non-zero delta i.e. it must not be a constant vseq.
5068 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5069 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5070 Assembler::SIMD_Arrangement T,
5071 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5072 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5073 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5074 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5075 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5076
5077 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5078 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5079
5080 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5081
5082 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5083 assert(vs_disjoint(va, vb), "va and vb overlap");
5084 assert(vs_disjoint(va, vq), "va and vq overlap");
5085 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5086 assert(!va.is_constant(), "output vector must identify 4 different registers");
5087
5088 // schedule 4 streams of instructions across the vector sequences
5089 for (int i = 0; i < 4; i++) {
5090 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5091 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5092 }
5093
5094 for (int i = 0; i < 4; i++) {
5095 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5096 }
5097
5098 for (int i = 0; i < 4; i++) {
5099 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5100 }
5101
5102 for (int i = 0; i < 4; i++) {
5103 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5104 }
5105 }
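
// A rough scalar model of the four instruction steps above (shown for
// the 16-bit/8H case; "hi16" denotes the high half of the 32-bit
// product, ignoring the saturation sqdmulh applies):
//
//   int32_t t  = (int32_t)b * c;
//   int16_t aH = hi16(2 * t);                 // sqdmulh
//   int16_t aL = (int16_t)t;                  // mulv (low half)
//   int16_t m  = (int16_t)(aL * q_inv);       // mulv (low half)
//   int16_t n  = hi16(2 * m * q);             // sqdmulh
//   int16_t a  = (int16_t)((aH - n) >> 1);    // shsubv (halving subtract)
//
// which leaves a congruent to b * c * 2^-16 mod q, i.e. a Montgomery
// product with R = 2^16 (R = 2^32 in the 4S case).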
5106
5107 // Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
5108 // multiplications in parallel
5109 //
5110
5111 // See the montMul() method of the sun.security.provider.ML_DSA
5112 // class.
5113 //
5114 // Computes 2x4S results or 2x8H results
5115 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5116 // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
5117 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5118 // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
5119 // Outputs: va - 2x4S or 2x8H vector register sequences
5120 // vb, vc, vtmp and vq must all be disjoint
5121 // va must be disjoint from all other inputs/temps or must equal vc
5122 // va must have a non-zero delta i.e. it must not be a constant vseq.
5123 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5124 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5125 Assembler::SIMD_Arrangement T,
5126 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5127 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5128 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5129 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5130 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5131
5132 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5133 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5134
5135 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5136
5137 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5138 assert(vs_disjoint(va, vb), "va and vb overlap");
5139 assert(vs_disjoint(va, vq), "va and vq overlap");
5140 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5141 assert(!va.is_constant(), "output vector must identify 2 different registers");
5142
5143 // schedule 2 streams of instructions across the vector sequences
5144 for (int i = 0; i < 2; i++) {
5145 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5146 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5147 }
5148
5149 for (int i = 0; i < 2; i++) {
5150 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5151 }
5152
5153 for (int i = 0; i < 2; i++) {
5154 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5155 }
5156
5157 for (int i = 0; i < 2; i++) {
5158 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5159 }
5160 }
5161
5162 // Perform 16 16-bit Montgomery multiplications in parallel.
5163 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5164 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5165 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5166 // It will assert that the register use is valid
5167 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5168 }
5169
5170 // Perform 32 16-bit Montgomery multiplications in parallel.
5171 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5172 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5173 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5174 // It will assert that the register use is valid
5175 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5176 }
5177
5178 // Perform 64 16-bit Montgomery multiplications in parallel.
5179 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
5180 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5181 // Schedule two successive 4x8H multiplies via the montmul helper
5182 // on the front and back halves of va, vb and vc. The helper will
5183 // assert that the register use has no overlap conflicts on each
5184 // individual call but we also need to ensure that the necessary
5185 // disjoint/equality constraints are met across both calls.
5186
5187 // vb, vc, vtmp and vq must be disjoint. va must either be
5188 // disjoint from all other registers or equal vc
5189
5190 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5191 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5192 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5193
5194 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5195 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5196
5197 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5198
5199 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5200 assert(vs_disjoint(va, vb), "va and vb overlap");
5201 assert(vs_disjoint(va, vq), "va and vq overlap");
5202 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5203
5204 // we multiply the front and back halves of each sequence 4 at a
5205 // time because
5206 //
5207 // 1) we are currently only able to get 4-way instruction
5208 // parallelism at best
5209 //
5210 // 2) we need registers for the constants in vq and temporary
5211 // scratch registers to hold intermediate results so vtmp can only
5212 // be a VSeq<4> which means we only have 4 scratch slots
5213
5214 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5215 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5216 }
5217
5218 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5219 const VSeq<4>& vc,
5220 const VSeq<4>& vtmp,
5221 const VSeq<2>& vq) {
5222 // compute a = montmul(a1, c)
5223 kyber_montmul32(vc, va1, vc, vtmp, vq);
5224 // output a1 = a0 - a
5225 vs_subv(va1, __ T8H, va0, vc);
5226 // and a0 = a0 + a
5227 vs_addv(va0, __ T8H, va0, vc);
5228 }
5229
5230 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5231 const VSeq<4>& vb,
5232 const VSeq<4>& vtmp1,
5233 const VSeq<4>& vtmp2,
5234 const VSeq<2>& vq) {
5235 // compute c = a0 - a1
5236 vs_subv(vtmp1, __ T8H, va0, va1);
5237 // output a0 = a0 + a1
5238 vs_addv(va0, __ T8H, va0, va1);
5239 // output a1 = b montmul c
5240 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5241 }
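
// Together these two helpers are vectorized forms of the usual NTT
// butterflies (sketch, with montmul as defined above):
//
//   kyber_montmul32_sub_add (forward, Cooley-Tukey style butterfly):
//     t = montmul(a1, c);   a1' = a0 - t;   a0' = a0 + t;
//
//   kyber_sub_add_montmul32 (inverse, Gentleman-Sande style butterfly):
//     t = a0 - a1;   a0' = a0 + a1;   a1' = montmul(t, b);
//
// where c and b hold the relevant zetas at the call sites.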
5242
5243 void load64shorts(const VSeq<8>& v, Register shorts) {
5244 vs_ldpq_post(v, shorts);
5245 }
5246
5247 void load32shorts(const VSeq<4>& v, Register shorts) {
5248 vs_ldpq_post(v, shorts);
5249 }
5250
5251 void store64shorts(VSeq<8> v, Register tmpAddr) {
5252 vs_stpq_post(v, tmpAddr);
5253 }
5254
5255 // Kyber NTT function.
5256 // Implements
5257 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5258 //
5259 // coeffs (short[256]) = c_rarg0
5260 // ntt_zetas (short[256]) = c_rarg1
5261 address generate_kyberNtt() {
5262
5263 __ align(CodeEntryAlignment);
5264 StubId stub_id = StubId::stubgen_kyberNtt_id;
5265 StubCodeMark mark(this, stub_id);
5266 address start = __ pc();
5267 __ enter();
5268
5269 const Register coeffs = c_rarg0;
5270 const Register zetas = c_rarg1;
5271
5272 const Register kyberConsts = r10;
5273 const Register tmpAddr = r11;
5274
5275 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5276 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5277 VSeq<2> vq(30); // n.b. constants overlap vs3
5278
5279 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5280 // load the montmul constants
5281 vs_ldpq(vq, kyberConsts);
5282
5283 // Each level corresponds to an iteration of the outermost loop of the
5284 // Java method seilerNTT(int[] coeffs). There are some differences
5285 // from what is done in the seilerNTT() method, though:
5286 // 1. The computation uses 16-bit signed values; we do not convert them
5287 // to ints here.
5288 // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
5289 // this array for each level, which makes it easier to fill up the vector
5290 // registers.
5291 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5292 // multiplications (that way there should not be any overflow during
5293 // the inverse NTT computation), whereas here we use R = 2^16 so
5294 // that we can use 16-bit arithmetic in the vector unit.
5295 //
5296 // On each level, we fill up the vector registers in such a way that the
5297 // array elements that need to be multiplied by the zetas go into one
5298 // set of vector registers while the corresponding ones that don't need to
5299 // be multiplied, go into another set.
5300 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5301 // registers interleaving the steps of 4 identical computations,
5302 // each done on 8 16-bit values per register.
5303
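// As a scalar point of reference (shorthand for this comment only, not
// code in this file), each level performs the classic Cooley-Tukey
// butterfly pass
//
//   for (int start = 0; start < 256; start += 2 * len) {
//     short zeta = /* per-block constant from the zetas table */;
//     for (int j = start; j < start + len; j++) {
//       short t = montMul(zeta, coeffs[j + len]);
//       coeffs[j + len] = (short)(coeffs[j] - t);
//       coeffs[j]       = (short)(coeffs[j] + t);
//     }
//   }
//
// with len halving from 128 down to 2 across the seven levels below.
// The code unrolls this completely and chooses load/store shapes (ldpq,
// ldr, ld2) to match the block size at each level.
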
5304 // At levels 0-3 the coefficients multiplied by or added/subtracted
5305 // to the zetas occur in discrete blocks whose size is some multiple
5306 // of 32.
5307
5308 // level 0
5309 __ add(tmpAddr, coeffs, 256);
5310 load64shorts(vs1, tmpAddr);
5311 load64shorts(vs2, zetas);
5312 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5313 __ add(tmpAddr, coeffs, 0);
5314 load64shorts(vs1, tmpAddr);
5315 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5316 vs_addv(vs1, __ T8H, vs1, vs2);
5317 __ add(tmpAddr, coeffs, 0);
5318 vs_stpq_post(vs1, tmpAddr);
5319 __ add(tmpAddr, coeffs, 256);
5320 vs_stpq_post(vs3, tmpAddr);
5321 // restore montmul constants
5322 vs_ldpq(vq, kyberConsts);
5323 load64shorts(vs1, tmpAddr);
5324 load64shorts(vs2, zetas);
5325 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5326 __ add(tmpAddr, coeffs, 128);
5327 load64shorts(vs1, tmpAddr);
5328 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5329 vs_addv(vs1, __ T8H, vs1, vs2);
5330 __ add(tmpAddr, coeffs, 128);
5331 store64shorts(vs1, tmpAddr);
5332 __ add(tmpAddr, coeffs, 384);
5333 store64shorts(vs3, tmpAddr);
5334
5335 // level 1
5336 // restore montmul constants
5337 vs_ldpq(vq, kyberConsts);
5338 __ add(tmpAddr, coeffs, 128);
5339 load64shorts(vs1, tmpAddr);
5340 load64shorts(vs2, zetas);
5341 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5342 __ add(tmpAddr, coeffs, 0);
5343 load64shorts(vs1, tmpAddr);
5344 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5345 vs_addv(vs1, __ T8H, vs1, vs2);
5346 __ add(tmpAddr, coeffs, 0);
5347 store64shorts(vs1, tmpAddr);
5348 store64shorts(vs3, tmpAddr);
5349 vs_ldpq(vq, kyberConsts);
5350 __ add(tmpAddr, coeffs, 384);
5351 load64shorts(vs1, tmpAddr);
5352 load64shorts(vs2, zetas);
5353 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5354 __ add(tmpAddr, coeffs, 256);
5355 load64shorts(vs1, tmpAddr);
5356 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5357 vs_addv(vs1, __ T8H, vs1, vs2);
5358 __ add(tmpAddr, coeffs, 256);
5359 store64shorts(vs1, tmpAddr);
5360 store64shorts(vs3, tmpAddr);
5361
5362 // level 2
5363 vs_ldpq(vq, kyberConsts);
5364 int offsets1[4] = { 0, 32, 128, 160 };
5365 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5366 load64shorts(vs2, zetas);
5367 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5368 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5369 // kyber_subv_addv64();
5370 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5371 vs_addv(vs1, __ T8H, vs1, vs2);
5372 __ add(tmpAddr, coeffs, 0);
5373 vs_stpq_post(vs_front(vs1), tmpAddr);
5374 vs_stpq_post(vs_front(vs3), tmpAddr);
5375 vs_stpq_post(vs_back(vs1), tmpAddr);
5376 vs_stpq_post(vs_back(vs3), tmpAddr);
5377 vs_ldpq(vq, kyberConsts);
5378 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5379 load64shorts(vs2, zetas);
5380 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5381 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5382 // kyber_subv_addv64();
5383 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5384 vs_addv(vs1, __ T8H, vs1, vs2);
5385 __ add(tmpAddr, coeffs, 256);
5386 vs_stpq_post(vs_front(vs1), tmpAddr);
5387 vs_stpq_post(vs_front(vs3), tmpAddr);
5388 vs_stpq_post(vs_back(vs1), tmpAddr);
5389 vs_stpq_post(vs_back(vs3), tmpAddr);
5390
5391 // level 3
5392 vs_ldpq(vq, kyberConsts);
5393 int offsets2[4] = { 0, 64, 128, 192 };
5394 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5395 load64shorts(vs2, zetas);
5396 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5397 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5398 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5399 vs_addv(vs1, __ T8H, vs1, vs2);
5400 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5401 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5402
5403 vs_ldpq(vq, kyberConsts);
5404 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5405 load64shorts(vs2, zetas);
5406 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5407 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5408 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5409 vs_addv(vs1, __ T8H, vs1, vs2);
5410 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5411 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5412
5413 // level 4
5414 // At level 4 coefficients occur in 8 discrete blocks of size 16
5415 // so they are loaded using an ldr at 8 distinct offsets.
5416
5417 vs_ldpq(vq, kyberConsts);
5418 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5419 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5420 load64shorts(vs2, zetas);
5421 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5422 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5423 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5424 vs_addv(vs1, __ T8H, vs1, vs2);
5425 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5426 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5427
5428 vs_ldpq(vq, kyberConsts);
5429 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5430 load64shorts(vs2, zetas);
5431 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5432 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5433 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5434 vs_addv(vs1, __ T8H, vs1, vs2);
5435 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5436 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5437
5438 // level 5
5439 // At level 5 related coefficients occur in discrete blocks of size 8, so
5440 // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
5441
5442 vs_ldpq(vq, kyberConsts);
5443 int offsets4[4] = { 0, 32, 64, 96 };
5444 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5445 load32shorts(vs_front(vs2), zetas);
5446 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5447 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5448 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5449 load32shorts(vs_front(vs2), zetas);
5450 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5451 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5452 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5453 load32shorts(vs_front(vs2), zetas);
5454 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5455 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5456
5457 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5458 load32shorts(vs_front(vs2), zetas);
5459 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5460 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5461
5462 // level 6
5463 // At level 6 related coefficients occur in discrete blocks of size 4, so
5464 // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
5465
5466 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5467 load32shorts(vs_front(vs2), zetas);
5468 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5469 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5470 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5471 // __ ldpq(v18, v19, __ post(zetas, 32));
5472 load32shorts(vs_front(vs2), zetas);
5473 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5474 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5475
5476 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5477 load32shorts(vs_front(vs2), zetas);
5478 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5479 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5480
5481 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5482 load32shorts(vs_front(vs2), zetas);
5483 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5484 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5485
5486 __ leave(); // required for proper stackwalking of RuntimeStub frame
5487 __ mov(r0, zr); // return 0
5488 __ ret(lr);
5489
5490 return start;
5491 }
5492
5493 // Kyber Inverse NTT function
5494 // Implements
5495 // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
5496 //
5497 // coeffs (short[256]) = c_rarg0
5498 // ntt_zetas (short[256]) = c_rarg1
5499 address generate_kyberInverseNtt() {
5500
5501 __ align(CodeEntryAlignment);
5502 StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
5503 StubCodeMark mark(this, stub_id);
5504 address start = __ pc();
5505 __ enter();
5506
5507 const Register coeffs = c_rarg0;
5508 const Register zetas = c_rarg1;
5509
5510 const Register kyberConsts = r10;
5511 const Register tmpAddr = r11;
5512 const Register tmpAddr2 = c_rarg2;
5513
5514 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5515 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5516 VSeq<2> vq(30); // n.b. constants overlap vs3
5517
5518 __ lea(kyberConsts,
5519 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5520
5521 // level 0
5522 // At level 0 related coefficients occur in discrete blocks of size 4, so
5523 // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
5524
5525 vs_ldpq(vq, kyberConsts);
5526 int offsets4[4] = { 0, 32, 64, 96 };
5527 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5528 load32shorts(vs_front(vs2), zetas);
5529 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5530 vs_front(vs2), vs_back(vs2), vtmp, vq);
5531 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5532 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5533 load32shorts(vs_front(vs2), zetas);
5534 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5535 vs_front(vs2), vs_back(vs2), vtmp, vq);
5536 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5537 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5538 load32shorts(vs_front(vs2), zetas);
5539 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5540 vs_front(vs2), vs_back(vs2), vtmp, vq);
5541 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5542 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5543 load32shorts(vs_front(vs2), zetas);
5544 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5545 vs_front(vs2), vs_back(vs2), vtmp, vq);
5546 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5547
5548 // level 1
5549 // At level 1 related coefficients occur in discrete blocks of size 8, so
5550 // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
5551
5552 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5553 load32shorts(vs_front(vs2), zetas);
5554 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5555 vs_front(vs2), vs_back(vs2), vtmp, vq);
5556 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5557 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5558 load32shorts(vs_front(vs2), zetas);
5559 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5560 vs_front(vs2), vs_back(vs2), vtmp, vq);
5561 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5562
5563 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5564 load32shorts(vs_front(vs2), zetas);
5565 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5566 vs_front(vs2), vs_back(vs2), vtmp, vq);
5567 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5568 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5569 load32shorts(vs_front(vs2), zetas);
5570 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
5571 vs_front(vs2), vs_back(vs2), vtmp, vq);
5572 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5573
5574 // level 2
5575 // At level 2 coefficients occur in 8 discrete blocks of size 16
5576 // so they are loaded using an ldr at 8 distinct offsets.
5577
5578 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5579 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5580 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
5581 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5582 vs_subv(vs1, __ T8H, vs1, vs2);
5583 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
5584 load64shorts(vs2, zetas);
5585 vs_ldpq(vq, kyberConsts);
5586 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5587 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
5588
5589 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5590 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5591 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5592 vs_subv(vs1, __ T8H, vs1, vs2);
5593 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
5594 load64shorts(vs2, zetas);
5595 vs_ldpq(vq, kyberConsts);
5596 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5597 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
5598
5599 // Barrett reduction at indexes where overflow may happen
5600
5601 // load q and the multiplier for the Barrett reduction
5602 __ add(tmpAddr, kyberConsts, 16);
5603 vs_ldpq(vq, tmpAddr);
5604
5605 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
5606 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
5607 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
5608 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5609 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5610 vs_sshr(vs2, __ T8H, vs2, 11);
5611 vs_mlsv(vs1, __ T8H, vs2, vq1);
5612 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5613 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5614 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5615 vs_sshr(vs2, __ T8H, vs2, 11);
5616 vs_mlsv(vs1, __ T8H, vs2, vq1);
5617 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
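
// Scalar model of the Barrett step above, applied per 16-bit
// coefficient a (barrett_mul is the per-lane constant loaded alongside
// q; for q = 3329 the usual choice is round(2^26 / q)):
//
//   int16_t t = (int16_t)(((int32_t)2 * a * barrett_mul) >> 16) >> 11; // ~ a / q
//   a = (int16_t)(a - t * q);                                          // mlsv
//
// which leaves a congruent to its old value mod q but reduced to a
// narrow range, so the remaining levels cannot overflow the 16-bit
// arithmetic.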
5618
5619 // level 3
5620 // From level 3 upwards coefficients occur in discrete blocks whose size is
5621 // some multiple of 32 so can be loaded using ldpq and suitable indexes.
5622
5623 int offsets2[4] = { 0, 64, 128, 192 };
5624 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5625 vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
5626 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5627 vs_subv(vs1, __ T8H, vs1, vs2);
5628 vs_stpq_indexed(vs3, coeffs, 0, offsets2);
5629 load64shorts(vs2, zetas);
5630 vs_ldpq(vq, kyberConsts);
5631 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5632 vs_stpq_indexed(vs2, coeffs, 32, offsets2);
5633
5634 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5635 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5636 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5637 vs_subv(vs1, __ T8H, vs1, vs2);
5638 vs_stpq_indexed(vs3, coeffs, 256, offsets2);
5639 load64shorts(vs2, zetas);
5640 vs_ldpq(vq, kyberConsts);
5641 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5642 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
5643
5644 // level 4
5645
5646 int offsets1[4] = { 0, 32, 128, 160 };
5647 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5648 vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
5649 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5650 vs_subv(vs1, __ T8H, vs1, vs2);
5651 vs_stpq_indexed(vs3, coeffs, 0, offsets1);
5652 load64shorts(vs2, zetas);
5653 vs_ldpq(vq, kyberConsts);
5654 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5655 vs_stpq_indexed(vs2, coeffs, 64, offsets1);
5656
5657 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5658 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5659 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5660 vs_subv(vs1, __ T8H, vs1, vs2);
5661 vs_stpq_indexed(vs3, coeffs, 256, offsets1);
5662 load64shorts(vs2, zetas);
5663 vs_ldpq(vq, kyberConsts);
5664 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5665 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
5666
5667 // level 5
5668
5669 __ add(tmpAddr, coeffs, 0);
5670 load64shorts(vs1, tmpAddr);
5671 __ add(tmpAddr, coeffs, 128);
5672 load64shorts(vs2, tmpAddr);
5673 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5674 vs_subv(vs1, __ T8H, vs1, vs2);
5675 __ add(tmpAddr, coeffs, 0);
5676 store64shorts(vs3, tmpAddr);
5677 load64shorts(vs2, zetas);
5678 vs_ldpq(vq, kyberConsts);
5679 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5680 __ add(tmpAddr, coeffs, 128);
5681 store64shorts(vs2, tmpAddr);
5682
5683 load64shorts(vs1, tmpAddr);
5684 __ add(tmpAddr, coeffs, 384);
5685 load64shorts(vs2, tmpAddr);
5686 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5687 vs_subv(vs1, __ T8H, vs1, vs2);
5688 __ add(tmpAddr, coeffs, 256);
5689 store64shorts(vs3, tmpAddr);
5690 load64shorts(vs2, zetas);
5691 vs_ldpq(vq, kyberConsts);
5692 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5693 __ add(tmpAddr, coeffs, 384);
5694 store64shorts(vs2, tmpAddr);
5695
5696 // Barrett reduction at indexes where overflow may happen
5697
5698 // load q and the multiplier for the Barrett reduction
5699 __ add(tmpAddr, kyberConsts, 16);
5700 vs_ldpq(vq, tmpAddr);
5701
5702 int offsets0[2] = { 0, 256 };
5703 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5704 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
5705 vs_sshr(vs2, __ T8H, vs2, 11);
5706 vs_mlsv(vs1, __ T8H, vs2, vq1);
5707 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
5708
5709 // level 6
5710
5711 __ add(tmpAddr, coeffs, 0);
5712 load64shorts(vs1, tmpAddr);
5713 __ add(tmpAddr, coeffs, 256);
5714 load64shorts(vs2, tmpAddr);
5715 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5716 vs_subv(vs1, __ T8H, vs1, vs2);
5717 __ add(tmpAddr, coeffs, 0);
5718 store64shorts(vs3, tmpAddr);
5719 load64shorts(vs2, zetas);
5720 vs_ldpq(vq, kyberConsts);
5721 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5722 __ add(tmpAddr, coeffs, 256);
5723 store64shorts(vs2, tmpAddr);
5724
5725 __ add(tmpAddr, coeffs, 128);
5726 load64shorts(vs1, tmpAddr);
5727 __ add(tmpAddr, coeffs, 384);
5728 load64shorts(vs2, tmpAddr);
5729 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5730 vs_subv(vs1, __ T8H, vs1, vs2);
5731 __ add(tmpAddr, coeffs, 128);
5732 store64shorts(vs3, tmpAddr);
5733 load64shorts(vs2, zetas);
5734 vs_ldpq(vq, kyberConsts);
5735 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5736 __ add(tmpAddr, coeffs, 384);
5737 store64shorts(vs2, tmpAddr);
5738
5739 // multiply by 2^-n
5740
5741 // load toMont(2^-n mod q)
5742 __ add(tmpAddr, kyberConsts, 48);
5743 __ ldr(v29, __ Q, tmpAddr);
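
// Why this works (sketch): montmul(x, toMont(c)) = x * (c * R) * R^-1
//                                                = x * c         (mod q)
// so Montgomery-multiplying every coefficient by toMont(2^-n mod q)
// scales it by 2^-n mod q without leaving an extra Montgomery factor
// behind.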
5744
5745 vs_ldpq(vq, kyberConsts);
5746 __ add(tmpAddr, coeffs, 0);
5747 load64shorts(vs1, tmpAddr);
5748 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5749 __ add(tmpAddr, coeffs, 0);
5750 store64shorts(vs2, tmpAddr);
5751
5752 // now tmpAddr contains coeffs + 128 because store64shorts post-incremented it
5753 load64shorts(vs1, tmpAddr);
5754 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5755 __ add(tmpAddr, coeffs, 128);
5756 store64shorts(vs2, tmpAddr);
5757
5758 // now tmpAddr contains coeffs + 256
5759 load64shorts(vs1, tmpAddr);
5760 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5761 __ add(tmpAddr, coeffs, 256);
5762 store64shorts(vs2, tmpAddr);
5763
5764 // now tmpAddr contains coeffs + 384
5765 load64shorts(vs1, tmpAddr);
5766 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
5767 __ add(tmpAddr, coeffs, 384);
5768 store64shorts(vs2, tmpAddr);
5769
5770 __ leave(); // required for proper stackwalking of RuntimeStub frame
5771 __ mov(r0, zr); // return 0
5772 __ ret(lr);
5773
5774 return start;
5775 }
5776
5777 // Kyber multiply polynomials in the NTT domain.
5778 // Implements
5779 // static int implKyberNttMult(
5780 // short[] result, short[] ntta, short[] nttb, short[] zetas) {}
5781 //
5782 // result (short[256]) = c_rarg0
5783 // ntta (short[256]) = c_rarg1
5784 // nttb (short[256]) = c_rarg2
5785 // zetas (short[128]) = c_rarg3
5786 address generate_kyberNttMult() {
5787
5788 __ align(CodeEntryAlignment);
5789 StubId stub_id = StubId::stubgen_kyberNttMult_id;
5790 StubCodeMark mark(this, stub_id);
5791 address start = __ pc();
5792 __ enter();
5793
5794 const Register result = c_rarg0;
5795 const Register ntta = c_rarg1;
5796 const Register nttb = c_rarg2;
5797 const Register zetas = c_rarg3;
5798
5799 const Register kyberConsts = r10;
5800 const Register limit = r11;
5801
5802 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
5803 VSeq<4> vs3(16), vs4(20);
5804 VSeq<2> vq(30); // pair of constants for montmul: q, qinv
5805 VSeq<2> vz(28); // pair of zetas
5806 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
5807
5808 __ lea(kyberConsts,
5809 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5810
5811 Label kyberNttMult_loop;
5812
5813 __ add(limit, result, 512);
5814
5815 // load q and qinv
5816 vs_ldpq(vq, kyberConsts);
5817
5818 // load R^2 mod q (to convert back from Montgomery representation)
5819 __ add(kyberConsts, kyberConsts, 64);
5820 __ ldr(v27, __ Q, kyberConsts);
5821
5822 __ BIND(kyberNttMult_loop);
5823
5824 // load 16 zetas
5825 vs_ldpq_post(vz, zetas);
5826
5827 // load 2 sets of 32 coefficients from the two input arrays
5828 // interleaved as shorts, i.e. pairs of shorts adjacent in memory
5829 // are striped across pairs of vector registers
5830 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
5831 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
5832 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
5833 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
5834
5835 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
5836 // i.e. montmul the first and second halves of vs1 in order and
5837 // then with one sequence reversed storing the two results in vs3
5838 //
5839 // vs3[0] <- montmul(a0, b0)
5840 // vs3[1] <- montmul(a1, b1)
5841 // vs3[2] <- montmul(a0, b1)
5842 // vs3[3] <- montmul(a1, b0)
5843 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
5844 kyber_montmul16(vs_back(vs3),
5845 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
5846
5847 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
5848 // i.e. montmul the first and second halves of vs4 in order and
5849 // then with one sequence reversed storing the two results in vs1
5850 //
5851 // vs1[0] <- montmul(a2, b2)
5852 // vs1[1] <- montmul(a3, b3)
5853 // vs1[2] <- montmul(a2, b3)
5854 // vs1[3] <- montmul(a3, b2)
5855 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
5856 kyber_montmul16(vs_back(vs1),
5857 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
5858
5859 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
5860 // We can schedule two montmuls at a time if we use a suitable vector
5861 // sequence <vs3[1], vs1[1]>.
5862 int delta = vs1[1]->encoding() - vs3[1]->encoding();
5863 VSeq<2> vs5(vs3[1], delta);
5864
5865 // vs3[1] <- montmul(montmul(a1, b1), z0)
5866 // vs1[1] <- montmul(montmul(a3, b3), z1)
5867 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
5868
5869 // add results in pairs storing in vs3
5870 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
5871 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
5872 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
5873
5874 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
5875 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
5876 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
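
// Net effect per lane (sketch): together with the additions above this
// computes the product in Z_q[X]/(X^2 - zeta), the quadratic factors
// the Kyber NTT decomposes its ring into:
//
//   (a0 + a1*X) * (b0 + b1*X)
//       = (a0*b0 + a1*b1*zeta) + (a0*b1 + a1*b0)*X   (mod X^2 - zeta)
//
// with each individual product taken as a Montgomery product.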
5877
5878 // vs1 <- montmul(vs3, montRSquareModQ)
5879 kyber_montmul32(vs1, vs3, vc, vs2, vq);
5880
5881 // store back the two pairs of result vectors de-interleaved as 8H elements
5882 // i.e. storing each pair of shorts striped across a register pair adjacent
5883 // in memory
5884 vs_st2_post(vs1, __ T8H, result);
5885
5886 __ cmp(result, limit);
5887 __ br(Assembler::NE, kyberNttMult_loop);
5888
5889 __ leave(); // required for proper stackwalking of RuntimeStub frame
5890 __ mov(r0, zr); // return 0
5891 __ ret(lr);
5892
5893 return start;
5894 }
5895
5896 // Kyber add 2 polynomials.
5897 // Implements
5898 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
5899 //
5900 // result (short[256]) = c_rarg0
5901 // a (short[256]) = c_rarg1
5902 // b (short[256]) = c_rarg2
5903 address generate_kyberAddPoly_2() {
5904
5905 __ align(CodeEntryAlignment);
5906 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
5907 StubCodeMark mark(this, stub_id);
5908 address start = __ pc();
5909 __ enter();
5910
5911 const Register result = c_rarg0;
5912 const Register a = c_rarg1;
5913 const Register b = c_rarg2;
5914
5915 const Register kyberConsts = r11;
5916
5917 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
5918 // So, we can load, add and store the data in 3 groups of 11,
5919 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
5920 // registers. A further constraint is that the mapping needs
5921 // to skip callee saves. So, we allocate the register
5922 // sequences using two 8 sequences, two 2 sequences and two
5923 // single registers.
5924 VSeq<8> vs1_1(0);
5925 VSeq<2> vs1_2(16);
5926 FloatRegister vs1_3 = v28;
5927 VSeq<8> vs2_1(18);
5928 VSeq<2> vs2_2(26);
5929 FloatRegister vs2_3 = v29;
5930
5931 // two constant vector sequences
5932 VSeq<8> vc_1(31, 0);
5933 VSeq<2> vc_2(31, 0);
5934
5935 FloatRegister vc_3 = v31;
5936 __ lea(kyberConsts,
5937 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5938
5939 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
5940 for (int i = 0; i < 3; i++) {
5941 // load 80 or 88 values from a into vs1_1/2/3
5942 vs_ldpq_post(vs1_1, a);
5943 vs_ldpq_post(vs1_2, a);
5944 if (i < 2) {
5945 __ ldr(vs1_3, __ Q, __ post(a, 16));
5946 }
5947 // load 80 or 88 values from b into vs2_1/2/3
5948 vs_ldpq_post(vs2_1, b);
5949 vs_ldpq_post(vs2_2, b);
5950 if (i < 2) {
5951 __ ldr(vs2_3, __ Q, __ post(b, 16));
5952 }
5953 // sum 80 or 88 values across vs1 and vs2 into vs1
5954 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
5955 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
5956 if (i < 2) {
5957 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
5958 }
5959 // add constant to all 80 or 88 results
5960 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
5961 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
5962 if (i < 2) {
5963 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
5964 }
5965 // store 80 or 88 values
5966 vs_stpq_post(vs1_1, result);
5967 vs_stpq_post(vs1_2, result);
5968 if (i < 2) {
5969 __ str(vs1_3, __ Q, __ post(result, 16));
5970 }
5971 }
5972
5973 __ leave(); // required for proper stackwalking of RuntimeStub frame
5974 __ mov(r0, zr); // return 0
5975 __ ret(lr);
5976
5977 return start;
5978 }
5979
5980 // Kyber add 3 polynomials.
5981 // Implements
5982 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
5983 //
5984 // result (short[256]) = c_rarg0
5985 // a (short[256]) = c_rarg1
5986 // b (short[256]) = c_rarg2
5987 // c (short[256]) = c_rarg3
5988 address generate_kyberAddPoly_3() {
5989
5990 __ align(CodeEntryAlignment);
5991 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
5992 StubCodeMark mark(this, stub_id);
5993 address start = __ pc();
5994 __ enter();
5995
5996 const Register result = c_rarg0;
5997 const Register a = c_rarg1;
5998 const Register b = c_rarg2;
5999 const Register c = c_rarg3;
6000
6001 const Register kyberConsts = r11;
6002
6003 // As above we sum 256 sets of values in total i.e. 32 x 8H
6004 // quadwords. So, we can load, add and store the data in 3
6005 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6006 // of 10 or 11 registers. A further constraint is that the
6007 // mapping needs to skip callee saves. So, we allocate the
6008 // register sequences using two 8 sequences, two 2 sequences
6009 // and two single registers.
6010 VSeq<8> vs1_1(0);
6011 VSeq<2> vs1_2(16);
6012 FloatRegister vs1_3 = v28;
6013 VSeq<8> vs2_1(18);
6014 VSeq<2> vs2_2(26);
6015 FloatRegister vs2_3 = v29;
6016
6017 // two constant vector sequences
6018 VSeq<8> vc_1(31, 0);
6019 VSeq<2> vc_2(31, 0);
6020
6021 FloatRegister vc_3 = v31;
6022
6023 __ lea(kyberConsts,
6024 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6025
6026 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6027 for (int i = 0; i < 3; i++) {
6028 // load 80 or 88 values from a into vs1_1/2/3
6029 vs_ldpq_post(vs1_1, a);
6030 vs_ldpq_post(vs1_2, a);
6031 if (i < 2) {
6032 __ ldr(vs1_3, __ Q, __ post(a, 16));
6033 }
6034 // load 80 or 88 values from b into vs2_1/2/3
6035 vs_ldpq_post(vs2_1, b);
6036 vs_ldpq_post(vs2_2, b);
6037 if (i < 2) {
6038 __ ldr(vs2_3, __ Q, __ post(b, 16));
6039 }
6040 // sum 80 or 88 values across vs1 and vs2 into vs1
6041 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6042 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6043 if (i < 2) {
6044 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6045 }
6046 // load 80 or 88 values from c into vs2_1/2/3
6047 vs_ldpq_post(vs2_1, c);
6048 vs_ldpq_post(vs2_2, c);
6049 if (i < 2) {
6050 __ ldr(vs2_3, __ Q, __ post(c, 16));
6051 }
6052 // sum 80 or 88 values across vs1 and vs2 into vs1
6053 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6054 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6055 if (i < 2) {
6056 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6057 }
6058 // add constant to all 80 or 88 results
6059 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6060 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6061 if (i < 2) {
6062 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6063 }
6064 // store 80 or 88 values
6065 vs_stpq_post(vs1_1, result);
6066 vs_stpq_post(vs1_2, result);
6067 if (i < 2) {
6068 __ str(vs1_3, __ Q, __ post(result, 16));
6069 }
6070 }
6071
6072 __ leave(); // required for proper stackwalking of RuntimeStub frame
6073 __ mov(r0, zr); // return 0
6074 __ ret(lr);
6075
6076 return start;
6077 }
6078
6079 // Kyber parse XOF output to polynomial coefficient candidates
6080 // or decodePoly(12, ...).
6081 // Implements
6082 // static int implKyber12To16(
6083 // byte[] condensed, int index, short[] parsed, int parsedLength) {}
6084 //
6085 // (parsedLength or (parsedLength - 48) must be divisible by 64.)
6086 //
6087 // condensed (byte[]) = c_rarg0
6088 // condensedIndex = c_rarg1
6089 // parsed (short[112 or 256]) = c_rarg2
6090 // parsedLength (112 or 256) = c_rarg3
6091 address generate_kyber12To16() {
6092 Label L_F00, L_loop, L_end;
6093
6094 __ align(CodeEntryAlignment);
6095 StubId stub_id = StubId::stubgen_kyber12To16_id;
6096 StubCodeMark mark(this, stub_id);
6097 address start = __ pc();
6098 __ enter();
6099
6100 const Register condensed = c_rarg0;
6101 const Register condensedOffs = c_rarg1;
6102 const Register parsed = c_rarg2;
6103 const Register parsedLength = c_rarg3;
6104
6105 const Register tmpAddr = r11;
6106
6107 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
6108 // quadwords so we need a 6 vector sequence for the inputs.
6109 // Parsing produces 64 shorts, employing two 8 vector
6110 // sequences to store and combine the intermediate data.
6111 VSeq<6> vin(24);
6112 VSeq<8> va(0), vb(16);
6113
6114 __ adr(tmpAddr, L_F00);
6115 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6116 __ add(condensed, condensed, condensedOffs);
6117
6118 __ BIND(L_loop);
6119 // load 96 (6 x 16B) byte values
6120 vs_ld3_post(vin, __ T16B, condensed);
6121
6122 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6123 // holds 48 (16x3) contiguous bytes from memory striped
6124 // horizontally across each of the 16 byte lanes. Equivalently,
6125 // that is 16 pairs of 12-bit integers. Likewise the back half
6126 // holds the next 48 bytes in the same arrangement.
6127
6128 // Each vector in the front half can also be viewed as a vertical
6129 // strip across the 16 pairs of 12 bit integers. Each byte in
6130 // vin[0] stores the low 8 bits of the first int in a pair. Each
6131 // byte in vin[1] stores the high 4 bits of the first int and the
6132 // low 4 bits of the second int. Each byte in vin[2] stores the
6133 // high 8 bits of the second int. Likewise the vectors in second
6134 // half.
6135
6136 // Converting the data to 16-bit shorts requires first of all
6137 // expanding each of the 6 x 16B vectors into 6 corresponding
6138 // pairs of 8H vectors. Mask, shift and add operations on the
6139 // resulting vector pairs can be used to combine 4 and 8 bit
6140 // parts of related 8H vector elements.
6141 //
6142 // The middle vectors (vin[2] and vin[5]) are actually expanded
6143 // twice, one copy manipulated to provide the lower 4 bits
6144 // belonging to the first short in a pair and another copy
6145 // manipulated to provide the higher 4 bits belonging to the
6146 // second short in a pair. This is why the vector sequences va
6147 // and vb used to hold the expanded 8H elements are of length 8.
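//
// As a scalar sketch (shorthand for this comment only): if bytes
// b0, b1, b2 hold one packed pair of 12-bit values then
//
//   s0 = b0 | ((b1 & 0x0f) << 8);
//   s1 = (b1 >> 4) | (b2 << 4);
//
// The code below builds s0 from the zero-extended low bytes (va[0:1])
// plus the masked, byte-shifted middle copy (va[2:3]), and s1 from the
// nibble-shifted middle copy (va[4:5]) plus the pre-shifted high bytes
// (va[6:7]); likewise for vb.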
6148
6149 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6150 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6151 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6152 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6153 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6154 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6155 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6156 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6157
6158 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6159 // and vb[4:5]
6160 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6161 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6162 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6163 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6164 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6165 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6166
6167 // shift lo byte of copy 1 of the middle stripe into the high byte
6168 __ shl(va[2], __ T8H, va[2], 8);
6169 __ shl(va[3], __ T8H, va[3], 8);
6170 __ shl(vb[2], __ T8H, vb[2], 8);
6171 __ shl(vb[3], __ T8H, vb[3], 8);
6172
6173 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6174 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6175 // are in bit positions [4..11].
6176 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6177 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6178 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6179 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
6180
6181 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
6182 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
6183 // copy2
6184 __ andr(va[2], __ T16B, va[2], v31);
6185 __ andr(va[3], __ T16B, va[3], v31);
6186 __ ushr(va[4], __ T8H, va[4], 4);
6187 __ ushr(va[5], __ T8H, va[5], 4);
6188 __ andr(vb[2], __ T16B, vb[2], v31);
6189 __ andr(vb[3], __ T16B, vb[3], v31);
6190 __ ushr(vb[4], __ T8H, vb[4], 4);
6191 __ ushr(vb[5], __ T8H, vb[5], 4);
6192
6193 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
6194 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
6195 // n.b. the ordering ensures: i) inputs are consumed before they
6196 // are overwritten ii) the order of 16-bit results across successive
6197 // pairs of vectors in va and then vb reflects the order of the
6198 // corresponding 12-bit inputs
6199 __ addv(va[0], __ T8H, va[0], va[2]);
6200 __ addv(va[2], __ T8H, va[1], va[3]);
6201 __ addv(va[1], __ T8H, va[4], va[6]);
6202 __ addv(va[3], __ T8H, va[5], va[7]);
6203 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6204 __ addv(vb[2], __ T8H, vb[1], vb[3]);
6205 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6206 __ addv(vb[3], __ T8H, vb[5], vb[7]);
6207
6208 // store 64 results interleaved as shorts
6209 vs_st2_post(vs_front(va), __ T8H, parsed);
6210 vs_st2_post(vs_front(vb), __ T8H, parsed);
6211
6212 __ sub(parsedLength, parsedLength, 64);
6213 __ cmp(parsedLength, (u1)64);
6214 __ br(Assembler::GE, L_loop);
6215 __ cbz(parsedLength, L_end);
6216
6217     // If anything is left it should be a final 72 bytes of input,
6218     // i.e. a final 48 12-bit values. We handle this by loading 48
6219     // bytes into all 16B lanes of front(vin) and only 24 bytes into
6220     // the lower 8B halves of back(vin).
6221 vs_ld3_post(vs_front(vin), __ T16B, condensed);
6222 vs_ld3(vs_back(vin), __ T8B, condensed);
6223
6224 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6225 // n.b. target elements 2 and 3 of va duplicate elements 4 and
6226 // 5 and target element 2 of vb duplicates element 4.
6227 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6228 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6229 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6230 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6231 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6232 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6233
6234 // This time expand just the lower 8 lanes
6235 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6236 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6237 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6238
6239 // shift lo byte of copy 1 of the middle stripe into the high byte
6240 __ shl(va[2], __ T8H, va[2], 8);
6241 __ shl(va[3], __ T8H, va[3], 8);
6242 __ shl(vb[2], __ T8H, vb[2], 8);
6243
6244 // expand vin[2] into va[6:7] and lower 8 lanes of vin[5] into
6245 // vb[6] pre-shifted by 4 to ensure top bits of the input 12-bit
6246 // int are in bit positions [4..11].
6247 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6248 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6249 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6250
6251 // mask hi 4 bits of each 1st 12-bit int in pair from copy1 and
6252 // shift lo 4 bits of each 2nd 12-bit int in pair to bottom of
6253 // copy2
6254 __ andr(va[2], __ T16B, va[2], v31);
6255 __ andr(va[3], __ T16B, va[3], v31);
6256 __ ushr(va[4], __ T8H, va[4], 4);
6257 __ ushr(va[5], __ T8H, va[5], 4);
6258 __ andr(vb[2], __ T16B, vb[2], v31);
6259 __ ushr(vb[4], __ T8H, vb[4], 4);
6260
6263 // sum hi 4 bits and lo 8 bits of each 1st 12-bit int in pair and
6264 // hi 8 bits plus lo 4 bits of each 2nd 12-bit int in pair
6265
6266 // n.b. ordering ensures: i) inputs are consumed before they are
6267     // overwritten ii) order of 16-bit results across successive
6268 // pairs of vectors in va and then lower half of vb reflects order
6269 // of corresponding 12-bit inputs
6270 __ addv(va[0], __ T8H, va[0], va[2]);
6271 __ addv(va[2], __ T8H, va[1], va[3]);
6272 __ addv(va[1], __ T8H, va[4], va[6]);
6273 __ addv(va[3], __ T8H, va[5], va[7]);
6274 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6275 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6276
6277 // store 48 results interleaved as shorts
6278 vs_st2_post(vs_front(va), __ T8H, parsed);
6279 vs_st2_post(vs_front(vs_front(vb)), __ T8H, parsed);
6280
6281 __ BIND(L_end);
6282
6283 __ leave(); // required for proper stackwalking of RuntimeStub frame
6284 __ mov(r0, zr); // return 0
6285 __ ret(lr);
6286
6287 // bind label and generate constant data used by this stub
6288 __ BIND(L_F00);
6289 __ emit_int64(0x0f000f000f000f00);
6290 __ emit_int64(0x0f000f000f000f00);
6291
6292 return start;
6293 }
6294
6295 // Kyber Barrett reduce function.
6296 // Implements
6297 // static int implKyberBarrettReduce(short[] coeffs) {}
6298 //
6299 // coeffs (short[256]) = c_rarg0
6300 address generate_kyberBarrettReduce() {
6301
6302 __ align(CodeEntryAlignment);
6303 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6304 StubCodeMark mark(this, stub_id);
6305 address start = __ pc();
6306 __ enter();
6307
6308 const Register coeffs = c_rarg0;
6309
6310 const Register kyberConsts = r10;
6311 const Register result = r11;
6312
6313 // As above we process 256 sets of values in total i.e. 32 x
6314     // 8H quadwords. So, we can load, reduce and store the data in 3
6315 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6316 // of 10 or 11 registers. A further constraint is that the
6317 // mapping needs to skip callee saves. So, we allocate the
6318 // register sequences using two 8 sequences, two 2 sequences
6319 // and two single registers.
6320 VSeq<8> vs1_1(0);
6321 VSeq<2> vs1_2(16);
6322 FloatRegister vs1_3 = v28;
6323 VSeq<8> vs2_1(18);
6324 VSeq<2> vs2_2(26);
6325 FloatRegister vs2_3 = v29;
6326
6327 // we also need a pair of corresponding constant sequences
6328
6329 VSeq<8> vc1_1(30, 0);
6330 VSeq<2> vc1_2(30, 0);
6331 FloatRegister vc1_3 = v30; // for kyber_q
6332
6333 VSeq<8> vc2_1(31, 0);
6334 VSeq<2> vc2_2(31, 0);
6335 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
6336
6337 __ add(result, coeffs, 0);
6338 __ lea(kyberConsts,
6339 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6340
6341 // load q and the multiplier for the Barrett reduction
6342 __ add(kyberConsts, kyberConsts, 16);
6343 __ ldpq(vc1_3, vc2_3, kyberConsts);
6344
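    // For reference, the loop below reduces each 16-bit coefficient a
    // with a Barrett reduction of the form (a scalar sketch with
    // illustrative names; the multiplier is the kyberBarrettMultiplier
    // constant loaded above):
    //
    //   int t = (a * kyberBarrettMultiplier) >> 26;   // approximates a / q
    //   a = a - t * kyber_q;
    //
    // The sqdmulh + sshr pair computes the >> 26 in two steps
    // ((2*a*m) >> 16 followed by >> 11) and mlsv performs the final
    // multiply-subtract.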
6345 for (int i = 0; i < 3; i++) {
6346 // load 80 or 88 coefficients
6347 vs_ldpq_post(vs1_1, coeffs);
6348 vs_ldpq_post(vs1_2, coeffs);
6349 if (i < 2) {
6350 __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
6351 }
6352
6353 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
6354 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
6355 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
6356 if (i < 2) {
6357 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
6358 }
6359
6360 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
6361 vs_sshr(vs2_1, __ T8H, vs2_1, 11);
6362 vs_sshr(vs2_2, __ T8H, vs2_2, 11);
6363 if (i < 2) {
6364 __ sshr(vs2_3, __ T8H, vs2_3, 11);
6365 }
6366
6367 // vs1 <- vs1 - vs2 * kyber_q
6368 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
6369 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
6370 if (i < 2) {
6371 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
6372 }
6373
6374 vs_stpq_post(vs1_1, result);
6375 vs_stpq_post(vs1_2, result);
6376 if (i < 2) {
6377 __ str(vs1_3, __ Q, __ post(result, 16));
6378 }
6379 }
6380
6381 __ leave(); // required for proper stackwalking of RuntimeStub frame
6382 __ mov(r0, zr); // return 0
6383 __ ret(lr);
6384
6385 return start;
6386 }
6387
6388
6389 // Dilithium-specific montmul helper routines that generate parallel
6390 // code for, respectively, a single 4x4s vector sequence montmul or
6391 // two such multiplies in a row.
6392
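  // For reference, each lane of these helpers computes a Montgomery
  // product montmul(b, c) = b * c * R^-1 mod q, with R = 2^32 and q the
  // Dilithium modulus 8380417. A scalar sketch of this kind of reduction
  // (names here are illustrative, not the helper's actual code):
  //
  //   int64_t t = (int64_t) b * c;
  //   int32_t m = (int32_t) t * qinv;                   // qinv = q^-1 mod 2^32
  //   int32_t r = (int32_t) ((t - (int64_t) m * q) >> 32);
  //
  // r is congruent to b * c * 2^-32 mod q and stays within (-q, q).
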
6393 // Perform 16 32-bit Montgomery multiplications in parallel
6394 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
6395 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6396 // Use the helper routine to schedule a 4x4S Montgomery multiply.
6397 // It will assert that the register use is valid
6398 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
6399 }
6400
6401 // Perform 2x16 32-bit Montgomery multiplications in parallel
6402 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
6403 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6404 // Schedule two successive 4x4S multiplies via the montmul helper
6405 // on the front and back halves of va, vb and vc. The helper will
6406 // assert that the register use has no overlap conflicts on each
6407 // individual call but we also need to ensure that the necessary
6408 // disjoint/equality constraints are met across both calls.
6409
6410 // vb, vc, vtmp and vq must be disjoint. va must either be
6411 // disjoint from all other registers or equal vc
6412
6413 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6414 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6415 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6416
6417 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6418 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6419
6420 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6421
6422 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6423 assert(vs_disjoint(va, vb), "va and vb overlap");
6424 assert(vs_disjoint(va, vq), "va and vq overlap");
6425 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6426
6427 // We multiply the front and back halves of each sequence 4 at a
6428 // time because
6429 //
6430 // 1) we are currently only able to get 4-way instruction
6431 // parallelism at best
6432 //
6433 // 2) we need registers for the constants in vq and temporary
6434 // scratch registers to hold intermediate results so vtmp can only
6435 // be a VSeq<4> which means we only have 4 scratch slots.
6436
6437 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6438 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6439 }
6440
6441 // Perform combined montmul then add/sub on 4x4S vectors.
6442 void dilithium_montmul16_sub_add(
6443 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6444 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6445 // compute a = montmul(a1, c)
6446 dilithium_montmul16(vc, va1, vc, vtmp, vq);
6447     // output a1 = a0 - a
6448 vs_subv(va1, __ T4S, va0, vc);
6449 // and a0 = a0 + a
6450 vs_addv(va0, __ T4S, va0, vc);
6451 }
6452
6453   // Perform combined add/sub then montmul on 4x4S vectors.
6454 void dilithium_sub_add_montmul16(
6455 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6456 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6457 // compute c = a0 - a1
6458 vs_subv(vtmp1, __ T4S, va0, va1);
6459 // output a0 = a0 + a1
6460 vs_addv(va0, __ T4S, va0, va1);
6461 // output a1 = b montmul c
6462 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6463 }
6464
6465 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6466 // in the Java implementation come in sequences of at least 8, so we
6467 // can use ldpq to collect the corresponding data into pairs of vector
6468 // registers.
6469 // We collect the coefficients corresponding to the 'j+l' indexes into
6470 // the vector registers v0-v7, the zetas into the vector registers v16-v23
6471 // then we do the (Montgomery) multiplications by the zetas in parallel
6472 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6473 // v0-v7, then do the additions into v24-v31 and the subtractions into
6474 // v0-v7 and finally save the results back to the coeffs array.
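  // For reference, each butterfly implemented here corresponds to the
  // scalar update (a sketch using illustrative names rather than the
  // exact Java locals):
  //
  //   int t = montMul(zetas[k], coeffs[j + l]);
  //   coeffs[j + l] = coeffs[j] - t;
  //   coeffs[j]     = coeffs[j] + t;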
6475 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6476 const Register coeffs, const Register zetas) {
6477 int c1 = 0;
6478 int c2 = 512;
6479 int startIncr;
6480 // don't use callee save registers v8 - v15
6481 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6482 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6483 VSeq<2> vq(30); // n.b. constants overlap vs3
6484 int offsets[4] = { 0, 32, 64, 96 };
6485
6486 for (int level = 0; level < 5; level++) {
6487 int c1Start = c1;
6488 int c2Start = c2;
6489 if (level == 3) {
6490 offsets[1] = 32;
6491 offsets[2] = 128;
6492 offsets[3] = 160;
6493 } else if (level == 4) {
6494 offsets[1] = 64;
6495 offsets[2] = 128;
6496 offsets[3] = 192;
6497 }
6498
6499       // For levels 0 - 4 we simply load 2 x 4 adjacent values at a
6500 // time at 4 different offsets and multiply them in order by the
6501 // next set of input values. So we employ indexed load and store
6502 // pair instructions with arrangement 4S.
6503 for (int i = 0; i < 4; i++) {
6504 // reload q and qinv
6505 vs_ldpq(vq, dilithiumConsts); // qInv, q
6506 // load 8x4S coefficients via second start pos == c2
6507 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
6508 // load next 8x4S inputs == b
6509 vs_ldpq_post(vs2, zetas);
6510 // compute a == c2 * b mod MONT_Q
6511 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6512 // load 8x4s coefficients via first start pos == c1
6513 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6514 // compute a1 = c1 + a
6515 vs_addv(vs3, __ T4S, vs1, vs2);
6516 // compute a2 = c1 - a
6517 vs_subv(vs1, __ T4S, vs1, vs2);
6518 // output a1 and a2
6519 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6520 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
6521
6522 int k = 4 * level + i;
6523
6524 if (k > 7) {
6525 startIncr = 256;
6526 } else if (k == 5) {
6527 startIncr = 384;
6528 } else {
6529 startIncr = 128;
6530 }
6531
6532 c1Start += startIncr;
6533 c2Start += startIncr;
6534 }
6535
6536 c2 /= 2;
6537 }
6538 }
6539
6540 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
6541 // Implements the method
6542 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
6543   // of the Java class sun.security.provider.ML_DSA
6544 //
6545 // coeffs (int[256]) = c_rarg0
6546 // zetas (int[256]) = c_rarg1
6547 address generate_dilithiumAlmostNtt() {
6548
6549 __ align(CodeEntryAlignment);
6550 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
6551 StubCodeMark mark(this, stub_id);
6552 address start = __ pc();
6553 __ enter();
6554
6555 const Register coeffs = c_rarg0;
6556 const Register zetas = c_rarg1;
6557
6558 const Register tmpAddr = r9;
6559 const Register dilithiumConsts = r10;
6560 const Register result = r11;
6561 // don't use callee save registers v8 - v15
6562 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6563 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6564 VSeq<2> vq(30); // n.b. constants overlap vs3
6565 int offsets[4] = { 0, 32, 64, 96};
6566 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6567 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6568 __ add(result, coeffs, 0);
6569 __ lea(dilithiumConsts,
6570 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6571
6572 // Each level represents one iteration of the outer for loop of the Java version.
6573
6574 // level 0-4
6575 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
6576
6577 // level 5
6578
6579 // At level 5 the coefficients we need to combine with the zetas
6580 // are grouped in memory in blocks of size 4. So, for both sets of
6581 // coefficients we load 4 adjacent values at 8 different offsets
6582 // using an indexed ldr with register variant Q and multiply them
6583 // in sequence order by the next set of inputs. Likewise we store
6584     // the results using an indexed str with register variant Q.
6585 for (int i = 0; i < 1024; i += 256) {
6586 // reload constants q, qinv each iteration as they get clobbered later
6587 vs_ldpq(vq, dilithiumConsts); // qInv, q
6588 // load 32 (8x4S) coefficients via first offsets = c1
6589 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6590 // load next 32 (8x4S) inputs = b
6591 vs_ldpq_post(vs2, zetas);
6592       // a = b montmul c1
6593 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6594 // load 32 (8x4S) coefficients via second offsets = c2
6595 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
6596 // add/sub with result of multiply
6597       vs_addv(vs3, __ T4S, vs1, vs2); // a0 = c2 + a
6598       vs_subv(vs1, __ T4S, vs1, vs2); // a1 = c2 - a
6599 // write back new coefficients using same offsets
6600 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
6601 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
6602 }
6603
6604 // level 6
6605 // At level 6 the coefficients we need to combine with the zetas
6606 // are grouped in memory in pairs, the first two being montmul
6607 // inputs and the second add/sub inputs. We can still implement
6608 // the montmul+sub+add using 4-way parallelism but only if we
6609 // combine the coefficients with the zetas 16 at a time. We load 8
6610 // adjacent values at 4 different offsets using an ld2 load with
6611 // arrangement 2D. That interleaves the lower and upper halves of
6612 // each pair of quadwords into successive vector registers. We
6613 // then need to montmul the 4 even elements of the coefficients
6614 // register sequence by the zetas in order and then add/sub the 4
6615 // odd elements of the coefficients register sequence. We use an
6616 // equivalent st2 operation to store the results back into memory
6617 // de-interleaved.
6618 for (int i = 0; i < 1024; i += 128) {
6619 // reload constants q, qinv each iteration as they get clobbered later
6620 vs_ldpq(vq, dilithiumConsts); // qInv, q
6621 // load interleaved 16 (4x2D) coefficients via offsets
6622 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6623 // load next 16 (4x4S) inputs
6624 vs_ldpq_post(vs_front(vs2), zetas);
6625 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6626 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6627 vs_front(vs2), vtmp, vq);
6628 // store interleaved 16 (4x2D) coefficients via offsets
6629 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6630 }
6631
6632 // level 7
6633 // At level 7 the coefficients we need to combine with the zetas
6634     // occur singly with montmul inputs alternating with add/sub
6635 // inputs. Once again we can use 4-way parallelism to combine 16
6636 // zetas at a time. However, we have to load 8 adjacent values at
6637 // 4 different offsets using an ld2 load with arrangement 4S. That
6638     // interleaves the odd words of each pair into one
6639 // coefficients vector register and the even words of the pair
6640 // into the next register. We then need to montmul the 4 even
6641 // elements of the coefficients register sequence by the zetas in
6642 // order and then add/sub the 4 odd elements of the coefficients
6643 // register sequence. We use an equivalent st2 operation to store
6644 // the results back into memory de-interleaved.
6645
6646 for (int i = 0; i < 1024; i += 128) {
6647 // reload constants q, qinv each iteration as they get clobbered later
6648 vs_ldpq(vq, dilithiumConsts); // qInv, q
6649 // load interleaved 16 (4x4S) coefficients via offsets
6650 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6651 // load next 16 (4x4S) inputs
6652 vs_ldpq_post(vs_front(vs2), zetas);
6653 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
6654 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
6655 vs_front(vs2), vtmp, vq);
6656 // store interleaved 16 (4x4S) coefficients via offsets
6657 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6658 }
6659 __ leave(); // required for proper stackwalking of RuntimeStub frame
6660 __ mov(r0, zr); // return 0
6661 __ ret(lr);
6662
6663 return start;
6664 }
6665
6666 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6667 // in the Java implementation come in sequences of at least 8, so we
6668 // can use ldpq to collect the corresponding data into pairs of vector
6669 // registers
6670   // We collect the coefficients that correspond to the 'j's into vs1
6671   // and the coefficients that correspond to the 'j+l's into vs2, then
6672   // do the additions into vs3 and the subtractions into vs1. We then
6673   // save the result of the additions, load the zetas into vs2,
6674   // do the (Montgomery) multiplications by zeta in parallel into vs2
6675   // and finally save the results back to the coeffs array.
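  // For reference, each inverse butterfly corresponds to the scalar
  // update (a sketch with illustrative names):
  //
  //   int t = coeffs[j];
  //   coeffs[j]     = t + coeffs[j + l];
  //   coeffs[j + l] = montMul(t - coeffs[j + l], zetas[k]);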
6676 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
6677 const Register coeffs, const Register zetas) {
6678 int c1 = 0;
6679 int c2 = 32;
6680 int startIncr;
6681 int offsets[4];
6682 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6683 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6684 VSeq<2> vq(30); // n.b. constants overlap vs3
6685
6686 offsets[0] = 0;
6687
6688 for (int level = 3; level < 8; level++) {
6689 int c1Start = c1;
6690 int c2Start = c2;
6691 if (level == 3) {
6692 offsets[1] = 64;
6693 offsets[2] = 128;
6694 offsets[3] = 192;
6695 } else if (level == 4) {
6696 offsets[1] = 32;
6697 offsets[2] = 128;
6698 offsets[3] = 160;
6699 } else {
6700 offsets[1] = 32;
6701 offsets[2] = 64;
6702 offsets[3] = 96;
6703 }
6704
6705 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
6706 // time at 4 different offsets and multiply them in order by the
6707 // next set of input values. So we employ indexed load and store
6708 // pair instructions with arrangement 4S.
6709 for (int i = 0; i < 4; i++) {
6710 // load v1 32 (8x4S) coefficients relative to first start index
6711 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6712 // load v2 32 (8x4S) coefficients relative to second start index
6713 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
6714 // a0 = v1 + v2 -- n.b. clobbers vqs
6715 vs_addv(vs3, __ T4S, vs1, vs2);
6716 // a1 = v1 - v2
6717 vs_subv(vs1, __ T4S, vs1, vs2);
6718         // save a0 relative to first start index
6719 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6720 // load constants q, qinv each iteration as they get clobbered above
6721 vs_ldpq(vq, dilithiumConsts); // qInv, q
6722 // load b next 32 (8x4S) inputs
6723 vs_ldpq_post(vs2, zetas);
6724 // a = a1 montmul b
6725 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6726 // save a relative to second start index
6727 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
6728
6729 int k = 4 * level + i;
6730
6731 if (k < 24) {
6732 startIncr = 256;
6733 } else if (k == 25) {
6734 startIncr = 384;
6735 } else {
6736 startIncr = 128;
6737 }
6738
6739 c1Start += startIncr;
6740 c2Start += startIncr;
6741 }
6742
6743 c2 *= 2;
6744 }
6745 }
6746
6747   // Dilithium Inverse NTT function except for the final mod Q division by 2^256.
6748 // Implements the method
6749 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
6750 // the sun.security.provider.ML_DSA class.
6751 //
6752 // coeffs (int[256]) = c_rarg0
6753 // zetas (int[256]) = c_rarg1
6754 address generate_dilithiumAlmostInverseNtt() {
6755
6756 __ align(CodeEntryAlignment);
6757 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
6758 StubCodeMark mark(this, stub_id);
6759 address start = __ pc();
6760 __ enter();
6761
6762 const Register coeffs = c_rarg0;
6763 const Register zetas = c_rarg1;
6764
6765 const Register tmpAddr = r9;
6766 const Register dilithiumConsts = r10;
6767 const Register result = r11;
6768 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6769 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6770 VSeq<2> vq(30); // n.b. constants overlap vs3
6771 int offsets[4] = { 0, 32, 64, 96 };
6772 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6773 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
6774
6775 __ add(result, coeffs, 0);
6776 __ lea(dilithiumConsts,
6777 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6778
6779 // Each level represents one iteration of the outer for loop of the Java version
6780
6781 // level 0
6782 // At level 0 we need to interleave adjacent quartets of
6783 // coefficients before we multiply and add/sub by the next 16
6784 // zetas just as we did for level 7 in the multiply code. So we
6785 // load and store the values using an ld2/st2 with arrangement 4S.
6786 for (int i = 0; i < 1024; i += 128) {
6787 // load constants q, qinv
6788 // n.b. this can be moved out of the loop as they do not get
6789 // clobbered by first two loops
6790 vs_ldpq(vq, dilithiumConsts); // qInv, q
6791 // a0/a1 load interleaved 32 (8x4S) coefficients
6792 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6793 // b load next 32 (8x4S) inputs
6794 vs_ldpq_post(vs_front(vs2), zetas);
6795 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6796 // n.b. second half of vs2 provides temporary register storage
6797 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6798 vs_front(vs2), vs_back(vs2), vtmp, vq);
6799 // a0/a1 store interleaved 32 (8x4S) coefficients
6800 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
6801 }
6802
6803 // level 1
6804 // At level 1 we need to interleave pairs of adjacent pairs of
6805 // coefficients before we multiply by the next 16 zetas just as we
6806 // did for level 6 in the multiply code. So we load and store the
6807     // values using an ld2/st2 with arrangement 2D.
6808 for (int i = 0; i < 1024; i += 128) {
6809 // a0/a1 load interleaved 32 (8x2D) coefficients
6810 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6811 // b load next 16 (4x4S) inputs
6812 vs_ldpq_post(vs_front(vs2), zetas);
6813 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
6814 // n.b. second half of vs2 provides temporary register storage
6815 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
6816 vs_front(vs2), vs_back(vs2), vtmp, vq);
6817 // a0/a1 store interleaved 32 (8x2D) coefficients
6818 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
6819 }
6820
6821 // level 2
6822 // At level 2 coefficients come in blocks of 4. So, we load 4
6823 // adjacent coefficients at 8 distinct offsets for both the first
6824 // and second coefficient sequences, using an ldr with register
6825 // variant Q then combine them with next set of 32 zetas. Likewise
6826 // we store the results using an str with register variant Q.
6827 for (int i = 0; i < 1024; i += 256) {
6828 // c0 load 32 (8x4S) coefficients via first offsets
6829 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
6830 // c1 load 32 (8x4S) coefficients via second offsets
6831       vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
6832 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
6833 vs_addv(vs3, __ T4S, vs1, vs2);
6834 // c = c0 - c1
6835 vs_subv(vs1, __ T4S, vs1, vs2);
6836 // store a0 32 (8x4S) coefficients via first offsets
6837 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
6838 // b load 32 (8x4S) next inputs
6839 vs_ldpq_post(vs2, zetas);
6840 // reload constants q, qinv -- they were clobbered earlier
6841 vs_ldpq(vq, dilithiumConsts); // qInv, q
6842 // compute a1 = b montmul c
6843 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6844 // store a1 32 (8x4S) coefficients via second offsets
6845 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
6846 }
6847
6848 // level 3-7
6849 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
6850
6851 __ leave(); // required for proper stackwalking of RuntimeStub frame
6852 __ mov(r0, zr); // return 0
6853 __ ret(lr);
6854
6855 return start;
6856 }
6857
6858 // Dilithium multiply polynomials in the NTT domain.
6859 // Straightforward implementation of the method
6860 // static int implDilithiumNttMult(
6861   // int[] result, int[] ntta, int[] nttb) {} of
6862 // the sun.security.provider.ML_DSA class.
6863 //
6864 // result (int[256]) = c_rarg0
6865 // poly1 (int[256]) = c_rarg1
6866 // poly2 (int[256]) = c_rarg2
6867 address generate_dilithiumNttMult() {
6868
6869 __ align(CodeEntryAlignment);
6870 StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
6871 StubCodeMark mark(this, stub_id);
6872 address start = __ pc();
6873 __ enter();
6874
6875 Label L_loop;
6876
6877 const Register result = c_rarg0;
6878 const Register poly1 = c_rarg1;
6879 const Register poly2 = c_rarg2;
6880
6881 const Register dilithiumConsts = r10;
6882 const Register len = r11;
6883
6884 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6885 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6886 VSeq<2> vq(30); // n.b. constants overlap vs3
6887 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
6888
6889 __ lea(dilithiumConsts,
6890 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6891
6892 // load constants q, qinv
6893 vs_ldpq(vq, dilithiumConsts); // qInv, q
6894 // load constant rSquare into v29
6895 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
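
    // For reference: montmul(a, b) = a * b * R^-1 mod q with R = 2^32,
    // so following the coefficient product with a second montmul by
    // rSquare (assumed here to be R^2 mod q) yields
    // a * b * R^-1 * R^2 * R^-1 = a * b mod q, i.e. the plain modular
    // product of the two input coefficients.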
6896
6897 __ mov(len, zr);
6898 __ add(len, len, 1024);
6899
6900 __ BIND(L_loop);
6901
6902 // b load 32 (8x4S) next inputs from poly1
6903 vs_ldpq_post(vs1, poly1);
6904 // c load 32 (8x4S) next inputs from poly2
6905 vs_ldpq_post(vs2, poly2);
6906 // compute a = b montmul c
6907 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6908 // compute a = rsquare montmul a
6909 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
6910 // save a 32 (8x4S) results
6911 vs_stpq_post(vs2, result);
6912
6913 __ sub(len, len, 128);
6914 __ cmp(len, (u1)128);
6915 __ br(Assembler::GE, L_loop);
6916
6917 __ leave(); // required for proper stackwalking of RuntimeStub frame
6918 __ mov(r0, zr); // return 0
6919 __ ret(lr);
6920
6921 return start;
6922 }
6923
6924   // Dilithium Montgomery multiply an array by a constant.
6925 // A straightforward implementation of the method
6926 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
6927   // of the sun.security.provider.ML_DSA class
6928 //
6929 // coeffs (int[256]) = c_rarg0
6930 // constant (int) = c_rarg1
6931 address generate_dilithiumMontMulByConstant() {
6932
6933 __ align(CodeEntryAlignment);
6934 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
6935 StubCodeMark mark(this, stub_id);
6936 address start = __ pc();
6937 __ enter();
6938
6939 Label L_loop;
6940
6941 const Register coeffs = c_rarg0;
6942 const Register constant = c_rarg1;
6943
6944 const Register dilithiumConsts = r10;
6945 const Register result = r11;
6946 const Register len = r12;
6947
6948 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6949 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6950 VSeq<2> vq(30); // n.b. constants overlap vs3
6951 VSeq<8> vconst(29, 0); // for montmul by constant
6952
6953 // results track inputs
6954 __ add(result, coeffs, 0);
6955 __ lea(dilithiumConsts,
6956 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
6957
6958     // load constants q, qinv -- they are not clobbered in the loop below
6959 vs_ldpq(vq, dilithiumConsts); // qInv, q
6960 // copy caller supplied constant across vconst
6961 __ dup(vconst[0], __ T4S, constant);
6962 __ mov(len, zr);
6963 __ add(len, len, 1024);
6964
6965 __ BIND(L_loop);
6966
6967 // load next 32 inputs
6968 vs_ldpq_post(vs2, coeffs);
6969 // mont mul by constant
6970 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
6971 // write next 32 results
6972 vs_stpq_post(vs2, result);
6973
6974 __ sub(len, len, 128);
6975 __ cmp(len, (u1)128);
6976 __ br(Assembler::GE, L_loop);
6977
6978 __ leave(); // required for proper stackwalking of RuntimeStub frame
6979 __ mov(r0, zr); // return 0
6980 __ ret(lr);
6981
6982 return start;
6983 }
6984
6985 // Dilithium decompose poly.
6986 // Implements the method
6987   // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
6988 // of the sun.security.provider.ML_DSA class
6989 //
6990 // input (int[256]) = c_rarg0
6991 // lowPart (int[256]) = c_rarg1
6992 // highPart (int[256]) = c_rarg2
6993 // twoGamma2 (int) = c_rarg3
6994 // multiplier (int) = c_rarg4
6995 address generate_dilithiumDecomposePoly() {
6996
6997 __ align(CodeEntryAlignment);
6998 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
6999 StubCodeMark mark(this, stub_id);
7000 address start = __ pc();
7001 Label L_loop;
7002
7003 const Register input = c_rarg0;
7004 const Register lowPart = c_rarg1;
7005 const Register highPart = c_rarg2;
7006 const Register twoGamma2 = c_rarg3;
7007 const Register multiplier = c_rarg4;
7008
7009 const Register len = r9;
7010 const Register dilithiumConsts = r10;
7011 const Register tmp = r11;
7012
7013 // 6 independent sets of 4x4s values
7014 VSeq<4> vs1(0), vs2(4), vs3(8);
7015 VSeq<4> vs4(12), vs5(16), vtmp(20);
7016
7017 // 7 constants for cross-multiplying
7018 VSeq<4> one(25, 0);
7019 VSeq<4> qminus1(26, 0);
7020 VSeq<4> g2(27, 0);
7021 VSeq<4> twog2(28, 0);
7022 VSeq<4> mult(29, 0);
7023 VSeq<4> q(30, 0);
7024 VSeq<4> qadd(31, 0);
7025
7026 __ enter();
7027
7028 __ lea(dilithiumConsts,
7029 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7030
7031 // save callee-saved registers
7032 __ stpd(v8, v9, __ pre(sp, -64));
7033 __ stpd(v10, v11, Address(sp, 16));
7034 __ stpd(v12, v13, Address(sp, 32));
7035 __ stpd(v14, v15, Address(sp, 48));
7036
7037 // populate constant registers
7038 __ mov(tmp, zr);
7039 __ add(tmp, tmp, 1);
7040 __ dup(one[0], __ T4S, tmp); // 1
7041 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
7042 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
7043 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
7044     __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma2 reduce
7045 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
7046 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
7047
7048 __ mov(len, zr);
7049 __ add(len, len, 1024);
7050
7051 __ BIND(L_loop);
7052
7053 // load next 4x4S inputs interleaved: rplus --> vs1
7054 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
7055
7056 // rplus = rplus - ((rplus + qadd) >> 23) * q
7057 vs_addv(vtmp, __ T4S, vs1, qadd);
7058 vs_sshr(vtmp, __ T4S, vtmp, 23);
7059 vs_mulv(vtmp, __ T4S, vtmp, q);
7060 vs_subv(vs1, __ T4S, vs1, vtmp);
7061
7062 // rplus = rplus + ((rplus >> 31) & dilithium_q);
7063 vs_sshr(vtmp, __ T4S, vs1, 31);
7064 vs_andr(vtmp, vtmp, q);
7065 vs_addv(vs1, __ T4S, vs1, vtmp);
7066
7067 // quotient --> vs2
7068 // int quotient = (rplus * multiplier) >> 22;
7069 vs_mulv(vtmp, __ T4S, vs1, mult);
7070 vs_sshr(vs2, __ T4S, vtmp, 22);
7071
7072 // r0 --> vs3
7073 // int r0 = rplus - quotient * twoGamma2;
7074 vs_mulv(vtmp, __ T4S, vs2, twog2);
7075 vs_subv(vs3, __ T4S, vs1, vtmp);
7076
7077 // mask --> vs4
7078 // int mask = (twoGamma2 - r0) >> 22;
7079 vs_subv(vtmp, __ T4S, twog2, vs3);
7080 vs_sshr(vs4, __ T4S, vtmp, 22);
7081
7082 // r0 -= (mask & twoGamma2);
7083 vs_andr(vtmp, vs4, twog2);
7084 vs_subv(vs3, __ T4S, vs3, vtmp);
7085
7086 // quotient += (mask & 1);
7087 vs_andr(vtmp, vs4, one);
7088 vs_addv(vs2, __ T4S, vs2, vtmp);
7089
7090 // mask = (twoGamma2 / 2 - r0) >> 31;
7091 vs_subv(vtmp, __ T4S, g2, vs3);
7092 vs_sshr(vs4, __ T4S, vtmp, 31);
7093
7094 // r0 -= (mask & twoGamma2);
7095 vs_andr(vtmp, vs4, twog2);
7096 vs_subv(vs3, __ T4S, vs3, vtmp);
7097
7098 // quotient += (mask & 1);
7099 vs_andr(vtmp, vs4, one);
7100 vs_addv(vs2, __ T4S, vs2, vtmp);
7101
7102 // r1 --> vs5
7103 // int r1 = rplus - r0 - (dilithium_q - 1);
7104 vs_subv(vtmp, __ T4S, vs1, vs3);
7105 vs_subv(vs5, __ T4S, vtmp, qminus1);
7106
7107 // r1 --> vs1 (overwriting rplus)
7108 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
7109 vs_negr(vtmp, __ T4S, vs5);
7110 vs_orr(vtmp, vs5, vtmp);
7111 vs_sshr(vs1, __ T4S, vtmp, 31);
7112
7113 // r0 += ~r1;
7114 vs_notr(vtmp, vs1);
7115 vs_addv(vs3, __ T4S, vs3, vtmp);
7116
7117 // r1 = r1 & quotient;
7118 vs_andr(vs1, vs2, vs1);
7119
7120     // store results interleaved
7121 // lowPart[m] = r0;
7122 // highPart[m] = r1;
7123 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
7124 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
7125
7126 __ sub(len, len, 64);
7127 __ cmp(len, (u1)64);
7128 __ br(Assembler::GE, L_loop);
7129
7130 // restore callee-saved vector registers
7131 __ ldpd(v14, v15, Address(sp, 48));
7132 __ ldpd(v12, v13, Address(sp, 32));
7133 __ ldpd(v10, v11, Address(sp, 16));
7134 __ ldpd(v8, v9, __ post(sp, 64));
7135
7136 __ leave(); // required for proper stackwalking of RuntimeStub frame
7137 __ mov(r0, zr); // return 0
7138 __ ret(lr);
7139
7140 return start;
7141 }
7142
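  // Keccak chi step on one 5-lane plane. The bic/eor sequencing below is
  // arranged so that no input lane is overwritten before it has been
  // consumed. For reference, each lane is updated as (sketch):
  //
  //   a[i] ^= ~a[(i + 1) % 5] & a[(i + 2) % 5]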
7143 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
7144 Register tmp0, Register tmp1, Register tmp2) {
7145 __ bic(tmp0, a2, a1); // for a0
7146 __ bic(tmp1, a3, a2); // for a1
7147 __ bic(tmp2, a4, a3); // for a2
7148 __ eor(a2, a2, tmp2);
7149 __ bic(tmp2, a0, a4); // for a3
7150 __ eor(a3, a3, tmp2);
7151 __ bic(tmp2, a1, a0); // for a4
7152 __ eor(a0, a0, tmp0);
7153 __ eor(a1, a1, tmp1);
7154 __ eor(a4, a4, tmp2);
7155 }
7156
7157 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
7158 Register a0, Register a1, Register a2, Register a3, Register a4,
7159 Register a5, Register a6, Register a7, Register a8, Register a9,
7160 Register a10, Register a11, Register a12, Register a13, Register a14,
7161 Register a15, Register a16, Register a17, Register a18, Register a19,
7162 Register a20, Register a21, Register a22, Register a23, Register a24,
7163 Register tmp0, Register tmp1, Register tmp2) {
7164 __ eor3(tmp1, a4, a9, a14);
7165 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
7166 __ eor3(tmp2, a1, a6, a11);
7167 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
7168 __ rax1(tmp2, tmp0, tmp1); // d0
7169 {
7170
7171 Register tmp3, tmp4;
7172 if (can_use_fp && can_use_r18) {
7173 tmp3 = rfp;
7174 tmp4 = r18_tls;
7175 } else {
7176 tmp3 = a4;
7177 tmp4 = a9;
7178 __ stp(tmp3, tmp4, __ pre(sp, -16));
7179 }
7180
7181 __ eor3(tmp3, a0, a5, a10);
7182 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
7183 __ eor(a0, a0, tmp2);
7184 __ eor(a5, a5, tmp2);
7185 __ eor(a10, a10, tmp2);
7186 __ eor(a15, a15, tmp2);
7187 __ eor(a20, a20, tmp2); // d0(tmp2)
7188 __ eor3(tmp3, a2, a7, a12);
7189 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
7190 __ rax1(tmp3, tmp4, tmp2); // d1
7191 __ eor(a1, a1, tmp3);
7192 __ eor(a6, a6, tmp3);
7193 __ eor(a11, a11, tmp3);
7194 __ eor(a16, a16, tmp3);
7195 __ eor(a21, a21, tmp3); // d1(tmp3)
7196 __ rax1(tmp3, tmp2, tmp0); // d3
7197 __ eor3(tmp2, a3, a8, a13);
7198 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
7199 __ eor(a3, a3, tmp3);
7200 __ eor(a8, a8, tmp3);
7201 __ eor(a13, a13, tmp3);
7202 __ eor(a18, a18, tmp3);
7203 __ eor(a23, a23, tmp3);
7204 __ rax1(tmp2, tmp1, tmp0); // d2
7205 __ eor(a2, a2, tmp2);
7206 __ eor(a7, a7, tmp2);
7207 __ eor(a12, a12, tmp2);
7208 __ rax1(tmp0, tmp0, tmp4); // d4
7209 if (!can_use_fp || !can_use_r18) {
7210 __ ldp(tmp3, tmp4, __ post(sp, 16));
7211 }
7212 __ eor(a17, a17, tmp2);
7213 __ eor(a22, a22, tmp2);
7214 __ eor(a4, a4, tmp0);
7215 __ eor(a9, a9, tmp0);
7216 __ eor(a14, a14, tmp0);
7217 __ eor(a19, a19, tmp0);
7218 __ eor(a24, a24, tmp0);
7219 }
7220
7221 __ rol(tmp0, a10, 3);
7222 __ rol(a10, a1, 1);
7223 __ rol(a1, a6, 44);
7224 __ rol(a6, a9, 20);
7225 __ rol(a9, a22, 61);
7226 __ rol(a22, a14, 39);
7227 __ rol(a14, a20, 18);
7228 __ rol(a20, a2, 62);
7229 __ rol(a2, a12, 43);
7230 __ rol(a12, a13, 25);
7231 __ rol(a13, a19, 8) ;
7232 __ rol(a19, a23, 56);
7233 __ rol(a23, a15, 41);
7234 __ rol(a15, a4, 27);
7235 __ rol(a4, a24, 14);
7236 __ rol(a24, a21, 2);
7237 __ rol(a21, a8, 55);
7238 __ rol(a8, a16, 45);
7239 __ rol(a16, a5, 36);
7240 __ rol(a5, a3, 28);
7241 __ rol(a3, a18, 21);
7242 __ rol(a18, a17, 15);
7243 __ rol(a17, a11, 10);
7244 __ rol(a11, a7, 6);
7245 __ mov(a7, tmp0);
7246
7247 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
7248 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
7249 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
7250 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
7251 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
7252
7253 __ ldr(tmp1, __ post(rc, 8));
7254 __ eor(a0, a0, tmp1);
7255
7256 }
7257
7258 // Arguments:
7259 //
7260 // Inputs:
7261 // c_rarg0 - byte[] source+offset
7262 // c_rarg1 - byte[] SHA.state
7263 // c_rarg2 - int block_size
7264 // c_rarg3 - int offset
7265 // c_rarg4 - int limit
7266 //
7267 address generate_sha3_implCompress_gpr(StubId stub_id) {
7268 bool multi_block;
7269 switch (stub_id) {
7270 case StubId::stubgen_sha3_implCompress_id:
7271 multi_block = false;
7272 break;
7273 case StubId::stubgen_sha3_implCompressMB_id:
7274 multi_block = true;
7275 break;
7276 default:
7277 ShouldNotReachHere();
7278 }
7279
7280 static const uint64_t round_consts[24] = {
7281 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
7282 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
7283 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
7284 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
7285 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
7286 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
7287 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
7288 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
7289 };
7290
7291 __ align(CodeEntryAlignment);
7292 StubCodeMark mark(this, stub_id);
7293 address start = __ pc();
7294
7295 Register buf = c_rarg0;
7296 Register state = c_rarg1;
7297 Register block_size = c_rarg2;
7298 Register ofs = c_rarg3;
7299 Register limit = c_rarg4;
7300
7301     // use r3..r17, r19..r28 to keep a0..a24.
7302 // a0..a24 are respective locals from SHA3.java
7303 Register a0 = r25,
7304 a1 = r26,
7305 a2 = r27,
7306 a3 = r3,
7307 a4 = r4,
7308 a5 = r5,
7309 a6 = r6,
7310 a7 = r7,
7311 a8 = rscratch1, // r8
7312 a9 = rscratch2, // r9
7313 a10 = r10,
7314 a11 = r11,
7315 a12 = r12,
7316 a13 = r13,
7317 a14 = r14,
7318 a15 = r15,
7319 a16 = r16,
7320 a17 = r17,
7321 a18 = r28,
7322 a19 = r19,
7323 a20 = r20,
7324 a21 = r21,
7325 a22 = r22,
7326 a23 = r23,
7327 a24 = r24;
7328
7329 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
7330
7331 Label sha3_loop, rounds24_preloop, loop_body;
7332 Label sha3_512_or_sha3_384, shake128;
7333
7334 bool can_use_r18 = false;
7335 #ifndef R18_RESERVED
7336 can_use_r18 = true;
7337 #endif
7338 bool can_use_fp = !PreserveFramePointer;
7339
7340 __ enter();
7341
7342 // save almost all yet unsaved gpr registers on stack
7343 __ str(block_size, __ pre(sp, -128));
7344 if (multi_block) {
7345 __ stpw(ofs, limit, Address(sp, 8));
7346 }
7347 // 8 bytes at sp+16 will be used to keep buf
7348 __ stp(r19, r20, Address(sp, 32));
7349 __ stp(r21, r22, Address(sp, 48));
7350 __ stp(r23, r24, Address(sp, 64));
7351 __ stp(r25, r26, Address(sp, 80));
7352 __ stp(r27, r28, Address(sp, 96));
7353 if (can_use_r18 && can_use_fp) {
7354 __ stp(r18_tls, state, Address(sp, 112));
7355 } else {
7356 __ str(state, Address(sp, 112));
7357 }
7358
7359     // begin sha3 calculations: loading a0..a24 from the state array
7360 __ ldp(a0, a1, state);
7361 __ ldp(a2, a3, Address(state, 16));
7362 __ ldp(a4, a5, Address(state, 32));
7363 __ ldp(a6, a7, Address(state, 48));
7364 __ ldp(a8, a9, Address(state, 64));
7365 __ ldp(a10, a11, Address(state, 80));
7366 __ ldp(a12, a13, Address(state, 96));
7367 __ ldp(a14, a15, Address(state, 112));
7368 __ ldp(a16, a17, Address(state, 128));
7369 __ ldp(a18, a19, Address(state, 144));
7370 __ ldp(a20, a21, Address(state, 160));
7371 __ ldp(a22, a23, Address(state, 176));
7372 __ ldr(a24, Address(state, 192));
7373
7374 __ BIND(sha3_loop);
7375
7376 // load input
7377 __ ldp(tmp3, tmp2, __ post(buf, 16));
7378 __ eor(a0, a0, tmp3);
7379 __ eor(a1, a1, tmp2);
7380 __ ldp(tmp3, tmp2, __ post(buf, 16));
7381 __ eor(a2, a2, tmp3);
7382 __ eor(a3, a3, tmp2);
7383 __ ldp(tmp3, tmp2, __ post(buf, 16));
7384 __ eor(a4, a4, tmp3);
7385 __ eor(a5, a5, tmp2);
7386 __ ldr(tmp3, __ post(buf, 8));
7387 __ eor(a6, a6, tmp3);
7388
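    // For reference, the Keccak rate (block_size, in bytes) for each
    // digest handled here is: SHA3-512 -> 72, SHA3-384 -> 104,
    // SHA3-256/SHAKE256 -> 136, SHA3-224 -> 144, SHAKE128 -> 168.
    // The absorb code below distinguishes them by testing individual
    // bits of block_size rather than comparing against each value.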
7389 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
7390 __ tbz(block_size, 7, sha3_512_or_sha3_384);
7391
7392 __ ldp(tmp3, tmp2, __ post(buf, 16));
7393 __ eor(a7, a7, tmp3);
7394 __ eor(a8, a8, tmp2);
7395 __ ldp(tmp3, tmp2, __ post(buf, 16));
7396 __ eor(a9, a9, tmp3);
7397 __ eor(a10, a10, tmp2);
7398 __ ldp(tmp3, tmp2, __ post(buf, 16));
7399 __ eor(a11, a11, tmp3);
7400 __ eor(a12, a12, tmp2);
7401 __ ldp(tmp3, tmp2, __ post(buf, 16));
7402 __ eor(a13, a13, tmp3);
7403 __ eor(a14, a14, tmp2);
7404 __ ldp(tmp3, tmp2, __ post(buf, 16));
7405 __ eor(a15, a15, tmp3);
7406 __ eor(a16, a16, tmp2);
7407
7408 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
7409 __ andw(tmp2, block_size, 48);
7410 __ cbzw(tmp2, rounds24_preloop);
7411 __ tbnz(block_size, 5, shake128);
7412     // block_size == 144, bit5 == 0, SHA3-224
7413 __ ldr(tmp3, __ post(buf, 8));
7414 __ eor(a17, a17, tmp3);
7415 __ b(rounds24_preloop);
7416
7417 __ BIND(shake128);
7418 __ ldp(tmp3, tmp2, __ post(buf, 16));
7419 __ eor(a17, a17, tmp3);
7420 __ eor(a18, a18, tmp2);
7421 __ ldp(tmp3, tmp2, __ post(buf, 16));
7422 __ eor(a19, a19, tmp3);
7423 __ eor(a20, a20, tmp2);
7424 __ b(rounds24_preloop); // block_size == 168, SHAKE128
7425
7426 __ BIND(sha3_512_or_sha3_384);
7427 __ ldp(tmp3, tmp2, __ post(buf, 16));
7428 __ eor(a7, a7, tmp3);
7429 __ eor(a8, a8, tmp2);
7430 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
7431
7432 // SHA3-384
7433 __ ldp(tmp3, tmp2, __ post(buf, 16));
7434 __ eor(a9, a9, tmp3);
7435 __ eor(a10, a10, tmp2);
7436 __ ldp(tmp3, tmp2, __ post(buf, 16));
7437 __ eor(a11, a11, tmp3);
7438 __ eor(a12, a12, tmp2);
7439
7440 __ BIND(rounds24_preloop);
7441 __ fmovs(v0, 24.0); // float loop counter,
7442 __ fmovs(v1, 1.0); // exact representation
7443
7444 __ str(buf, Address(sp, 16));
7445 __ lea(tmp3, ExternalAddress((address) round_consts));
7446
7447 __ BIND(loop_body);
7448 keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
7449 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
7450 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
7451 tmp0, tmp1, tmp2);
7452 __ fsubs(v0, v0, v1);
7453 __ fcmps(v0, 0.0);
7454 __ br(__ NE, loop_body);
7455
7456 if (multi_block) {
7457 __ ldrw(block_size, sp); // block_size
7458 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
7459 __ addw(tmp2, tmp2, block_size);
7460 __ cmpw(tmp2, tmp1);
7461 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
7462 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
7463 __ br(Assembler::LE, sha3_loop);
7464 __ movw(c_rarg0, tmp2); // return offset
7465 }
7466 if (can_use_fp && can_use_r18) {
7467 __ ldp(r18_tls, state, Address(sp, 112));
7468 } else {
7469 __ ldr(state, Address(sp, 112));
7470 }
7471 // save calculated sha3 state
7472 __ stp(a0, a1, Address(state));
7473 __ stp(a2, a3, Address(state, 16));
7474 __ stp(a4, a5, Address(state, 32));
7475 __ stp(a6, a7, Address(state, 48));
7476 __ stp(a8, a9, Address(state, 64));
7477 __ stp(a10, a11, Address(state, 80));
7478 __ stp(a12, a13, Address(state, 96));
7479 __ stp(a14, a15, Address(state, 112));
7480 __ stp(a16, a17, Address(state, 128));
7481 __ stp(a18, a19, Address(state, 144));
7482 __ stp(a20, a21, Address(state, 160));
7483 __ stp(a22, a23, Address(state, 176));
7484 __ str(a24, Address(state, 192));
7485
7486 // restore required registers from stack
7487 __ ldp(r19, r20, Address(sp, 32));
7488 __ ldp(r21, r22, Address(sp, 48));
7489 __ ldp(r23, r24, Address(sp, 64));
7490 __ ldp(r25, r26, Address(sp, 80));
7491 __ ldp(r27, r28, Address(sp, 96));
7492 if (can_use_fp && can_use_r18) {
7493 __ add(rfp, sp, 128); // leave() will copy rfp to sp below
7494 } // else no need to recalculate rfp, since it wasn't changed
7495
7496 __ leave();
7497
7498 __ ret(lr);
7499
7500 return start;
7501 }
7502
7503 /**
7504 * Arguments:
7505 *
7506 * Inputs:
7507 * c_rarg0 - int crc
7508 * c_rarg1 - byte* buf
7509 * c_rarg2 - int length
7510 *
7511 * Output:
7512    *       r0   - int crc result
7513 */
7514 address generate_updateBytesCRC32() {
7515 assert(UseCRC32Intrinsics, "what are we doing here?");
7516
7517 __ align(CodeEntryAlignment);
7518 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
7519 StubCodeMark mark(this, stub_id);
7520
7521 address start = __ pc();
7522
7523 const Register crc = c_rarg0; // crc
7524 const Register buf = c_rarg1; // source java byte array address
7525 const Register len = c_rarg2; // length
7526 const Register table0 = c_rarg3; // crc_table address
7527 const Register table1 = c_rarg4;
7528 const Register table2 = c_rarg5;
7529 const Register table3 = c_rarg6;
7530 const Register tmp3 = c_rarg7;
7531
7532 BLOCK_COMMENT("Entry:");
7533 __ enter(); // required for proper stackwalking of RuntimeStub frame
7534
7535 __ kernel_crc32(crc, buf, len,
7536 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7537
7538 __ leave(); // required for proper stackwalking of RuntimeStub frame
7539 __ ret(lr);
7540
7541 return start;
7542 }
7543
7544 /**
7545 * Arguments:
7546 *
7547 * Inputs:
7548 * c_rarg0 - int crc
7549 * c_rarg1 - byte* buf
7550 * c_rarg2 - int length
7551 * c_rarg3 - int* table
7552 *
7553 * Output:
7554 * r0 - int crc result
7555 */
7556 address generate_updateBytesCRC32C() {
7557 assert(UseCRC32CIntrinsics, "what are we doing here?");
7558
7559 __ align(CodeEntryAlignment);
7560 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
7561 StubCodeMark mark(this, stub_id);
7562
7563 address start = __ pc();
7564
7565 const Register crc = c_rarg0; // crc
7566 const Register buf = c_rarg1; // source java byte array address
7567 const Register len = c_rarg2; // length
7568 const Register table0 = c_rarg3; // crc_table address
7569 const Register table1 = c_rarg4;
7570 const Register table2 = c_rarg5;
7571 const Register table3 = c_rarg6;
7572 const Register tmp3 = c_rarg7;
7573
7574 BLOCK_COMMENT("Entry:");
7575 __ enter(); // required for proper stackwalking of RuntimeStub frame
7576
7577 __ kernel_crc32c(crc, buf, len,
7578 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
7579
7580 __ leave(); // required for proper stackwalking of RuntimeStub frame
7581 __ ret(lr);
7582
7583 return start;
7584 }
7585
7586 /***
7587 * Arguments:
7588 *
7589 * Inputs:
7590 * c_rarg0 - int adler
7591 * c_rarg1 - byte* buff
7592 * c_rarg2 - int len
7593 *
7594 * Output:
7595 * c_rarg0 - int adler result
7596 */
7597 address generate_updateBytesAdler32() {
7598 __ align(CodeEntryAlignment);
7599 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
7600 StubCodeMark mark(this, stub_id);
7601 address start = __ pc();
7602
7603 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
7604
7605 // Aliases
7606 Register adler = c_rarg0;
7607 Register s1 = c_rarg0;
7608 Register s2 = c_rarg3;
7609 Register buff = c_rarg1;
7610 Register len = c_rarg2;
7611 Register nmax = r4;
7612 Register base = r5;
7613 Register count = r6;
7614 Register temp0 = rscratch1;
7615 Register temp1 = rscratch2;
7616 FloatRegister vbytes = v0;
7617 FloatRegister vs1acc = v1;
7618 FloatRegister vs2acc = v2;
7619 FloatRegister vtable = v3;
7620
7621 // Max number of bytes we can process before having to take the mod
7622 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
7623 uint64_t BASE = 0xfff1;
7624 uint64_t NMAX = 0x15B0;
7625
7626 __ mov(base, BASE);
7627 __ mov(nmax, NMAX);
7628
7629 // Load accumulation coefficients for the upper 16 bits
7630 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
7631 __ ld1(vtable, __ T16B, Address(temp0));
7632
7633 // s1 is initialized to the lower 16 bits of adler
7634 // s2 is initialized to the upper 16 bits of adler
7635 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
7636 __ uxth(s1, adler); // s1 = (adler & 0xffff)
7637
7638     // The pipelined loop needs at least 16 elements for 1 iteration.
7639     // It does check this, but it is more efficient to branch straight to the cleanup loop for short inputs.
7640 __ cmp(len, (u1)16);
7641 __ br(Assembler::HS, L_nmax);
7642 __ cbz(len, L_combine);
7643
7644 __ bind(L_simple_by1_loop);
7645 __ ldrb(temp0, Address(__ post(buff, 1)));
7646 __ add(s1, s1, temp0);
7647 __ add(s2, s2, s1);
7648 __ subs(len, len, 1);
7649 __ br(Assembler::HI, L_simple_by1_loop);
7650
7651 // s1 = s1 % BASE
7652 __ subs(temp0, s1, base);
7653 __ csel(s1, temp0, s1, Assembler::HS);
7654
7655 // s2 = s2 % BASE
7656 __ lsr(temp0, s2, 16);
7657 __ lsl(temp1, temp0, 4);
7658 __ sub(temp1, temp1, temp0);
7659 __ add(s2, temp1, s2, ext::uxth);
7660
7661 __ subs(temp0, s2, base);
7662 __ csel(s2, temp0, s2, Assembler::HS);
7663
7664 __ b(L_combine);
7665
7666 __ bind(L_nmax);
7667 __ subs(len, len, nmax);
7668 __ sub(count, nmax, 16);
7669 __ br(Assembler::LO, L_by16);
7670
7671 __ bind(L_nmax_loop);
7672
7673 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7674 vbytes, vs1acc, vs2acc, vtable);
7675
7676 __ subs(count, count, 16);
7677 __ br(Assembler::HS, L_nmax_loop);
7678
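    // The reductions below use the fact that 2^16 mod BASE == 15
    // (BASE == 65521), so x mod BASE can be computed by folding the
    // high half into the low half and then conditionally subtracting
    // BASE once. For reference (a scalar sketch of one fold):
    //
    //   x = (x & 0xffff) + 15 * (x >> 16);   // the lsr/lsl/sub/add below
    //
    // Two folds bring the accumulators below 2 * BASE, after which a
    // single subs/csel completes the reduction.
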
7679 // s1 = s1 % BASE
7680 __ lsr(temp0, s1, 16);
7681 __ lsl(temp1, temp0, 4);
7682 __ sub(temp1, temp1, temp0);
7683 __ add(temp1, temp1, s1, ext::uxth);
7684
7685 __ lsr(temp0, temp1, 16);
7686 __ lsl(s1, temp0, 4);
7687 __ sub(s1, s1, temp0);
7688 __ add(s1, s1, temp1, ext:: uxth);
7689
7690 __ subs(temp0, s1, base);
7691 __ csel(s1, temp0, s1, Assembler::HS);
7692
7693 // s2 = s2 % BASE
7694 __ lsr(temp0, s2, 16);
7695 __ lsl(temp1, temp0, 4);
7696 __ sub(temp1, temp1, temp0);
7697 __ add(temp1, temp1, s2, ext::uxth);
7698
7699 __ lsr(temp0, temp1, 16);
7700 __ lsl(s2, temp0, 4);
7701 __ sub(s2, s2, temp0);
7702 __ add(s2, s2, temp1, ext:: uxth);
7703
7704 __ subs(temp0, s2, base);
7705 __ csel(s2, temp0, s2, Assembler::HS);
7706
7707 __ subs(len, len, nmax);
7708 __ sub(count, nmax, 16);
7709 __ br(Assembler::HS, L_nmax_loop);
7710
7711 __ bind(L_by16);
7712 __ adds(len, len, count);
7713 __ br(Assembler::LO, L_by1);
7714
7715 __ bind(L_by16_loop);
7716
7717 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
7718 vbytes, vs1acc, vs2acc, vtable);
7719
7720 __ subs(len, len, 16);
7721 __ br(Assembler::HS, L_by16_loop);
7722
7723 __ bind(L_by1);
7724 __ adds(len, len, 15);
7725 __ br(Assembler::LO, L_do_mod);
7726
7727 __ bind(L_by1_loop);
7728 __ ldrb(temp0, Address(__ post(buff, 1)));
7729 __ add(s1, temp0, s1);
7730 __ add(s2, s2, s1);
7731 __ subs(len, len, 1);
7732 __ br(Assembler::HS, L_by1_loop);
7733
7734 __ bind(L_do_mod);
7735 // s1 = s1 % BASE
7736 __ lsr(temp0, s1, 16);
7737 __ lsl(temp1, temp0, 4);
7738 __ sub(temp1, temp1, temp0);
7739 __ add(temp1, temp1, s1, ext::uxth);
7740
7741 __ lsr(temp0, temp1, 16);
7742 __ lsl(s1, temp0, 4);
7743 __ sub(s1, s1, temp0);
7744 __ add(s1, s1, temp1, ext:: uxth);
7745
7746 __ subs(temp0, s1, base);
7747 __ csel(s1, temp0, s1, Assembler::HS);
7748
7749 // s2 = s2 % BASE
7750 __ lsr(temp0, s2, 16);
7751 __ lsl(temp1, temp0, 4);
7752 __ sub(temp1, temp1, temp0);
7753 __ add(temp1, temp1, s2, ext::uxth);
7754
7755 __ lsr(temp0, temp1, 16);
7756 __ lsl(s2, temp0, 4);
7757 __ sub(s2, s2, temp0);
7758 __ add(s2, s2, temp1, ext:: uxth);
7759
7760 __ subs(temp0, s2, base);
7761 __ csel(s2, temp0, s2, Assembler::HS);
7762
7763 // Combine lower bits and higher bits
7764 __ bind(L_combine);
7765 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
7766
7767 __ ret(lr);
7768
7769 return start;
7770 }
7771
7772 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
7773 Register temp0, Register temp1, FloatRegister vbytes,
7774 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
7775 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
7776 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
7777 // In non-vectorized code, we update s1 and s2 as:
7778 // s1 <- s1 + b1
7779 // s2 <- s2 + s1
7780 // s1 <- s1 + b2
7781     //   s2 <- s2 + s1
7782 // ...
7783 // s1 <- s1 + b16
7784 // s2 <- s2 + s1
7785 // Putting above assignments together, we have:
7786 // s1_new = s1 + b1 + b2 + ... + b16
7787 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
7788 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
7789 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
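    // A scalar sketch of the same 16-byte step (illustrative only):
    //
    //   uint32_t sum = 0, weighted = 0;
    //   for (int i = 0; i < 16; i++) {
    //     sum      += b[i];
    //     weighted += (16 - i) * b[i];      // weights 16, 15, ..., 1
    //   }
    //   s2 += 16 * s1 + weighted;
    //   s1 += sum;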
7790 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
7791
7792 // s2 = s2 + s1 * 16
7793 __ add(s2, s2, s1, Assembler::LSL, 4);
7794
7795 // vs1acc = b1 + b2 + b3 + ... + b16
7796 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
7797 __ umullv(vs2acc, __ T8B, vtable, vbytes);
7798 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
7799 __ uaddlv(vs1acc, __ T16B, vbytes);
7800 __ uaddlv(vs2acc, __ T8H, vs2acc);
7801
7802 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
7803 __ fmovd(temp0, vs1acc);
7804 __ fmovd(temp1, vs2acc);
7805 __ add(s1, s1, temp0);
7806 __ add(s2, s2, temp1);
7807 }
7808
7809 /**
7810 * Arguments:
7811 *
7812 * Input:
7813 * c_rarg0 - x address
7814 * c_rarg1 - x length
7815 * c_rarg2 - y address
7816 * c_rarg3 - y length
7817 * c_rarg4 - z address
7818 */
7819 address generate_multiplyToLen() {
7820 __ align(CodeEntryAlignment);
7821 StubId stub_id = StubId::stubgen_multiplyToLen_id;
7822 StubCodeMark mark(this, stub_id);
7823
7824 address start = __ pc();
7825 const Register x = r0;
7826 const Register xlen = r1;
7827 const Register y = r2;
7828 const Register ylen = r3;
7829 const Register z = r4;
7830
7831 const Register tmp0 = r5;
7832 const Register tmp1 = r10;
7833 const Register tmp2 = r11;
7834 const Register tmp3 = r12;
7835 const Register tmp4 = r13;
7836 const Register tmp5 = r14;
7837 const Register tmp6 = r15;
7838 const Register tmp7 = r16;
7839
7840 BLOCK_COMMENT("Entry:");
7841 __ enter(); // required for proper stackwalking of RuntimeStub frame
7842 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7843 __ leave(); // required for proper stackwalking of RuntimeStub frame
7844 __ ret(lr);
7845
7846 return start;
7847 }
7848
7849 address generate_squareToLen() {
7850     // The squareToLen algorithm for sizes 1..127 described in the Java code is
7851     // faster than multiply_to_len on some CPUs and slower on others, but
7852     // multiply_to_len shows slightly better results overall.
7853 __ align(CodeEntryAlignment);
7854 StubId stub_id = StubId::stubgen_squareToLen_id;
7855 StubCodeMark mark(this, stub_id);
7856 address start = __ pc();
7857
7858 const Register x = r0;
7859 const Register xlen = r1;
7860 const Register z = r2;
7861 const Register y = r4; // == x
7862 const Register ylen = r5; // == xlen
7863
7864 const Register tmp0 = r3;
7865 const Register tmp1 = r10;
7866 const Register tmp2 = r11;
7867 const Register tmp3 = r12;
7868 const Register tmp4 = r13;
7869 const Register tmp5 = r14;
7870 const Register tmp6 = r15;
7871 const Register tmp7 = r16;
7872
7873 RegSet spilled_regs = RegSet::of(y, ylen);
7874 BLOCK_COMMENT("Entry:");
7875 __ enter();
7876 __ push(spilled_regs, sp);
7877 __ mov(y, x);
7878 __ mov(ylen, xlen);
7879 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
7880 __ pop(spilled_regs, sp);
7881 __ leave();
7882 __ ret(lr);
7883 return start;
7884 }
7885
7886 address generate_mulAdd() {
7887 __ align(CodeEntryAlignment);
7888 StubId stub_id = StubId::stubgen_mulAdd_id;
7889 StubCodeMark mark(this, stub_id);
7890
7891 address start = __ pc();
7892
7893 const Register out = r0;
7894 const Register in = r1;
7895 const Register offset = r2;
7896 const Register len = r3;
7897 const Register k = r4;
7898
7899 BLOCK_COMMENT("Entry:");
7900 __ enter();
7901 __ mul_add(out, in, offset, len, k);
7902 __ leave();
7903 __ ret(lr);
7904
7905 return start;
7906 }
7907
7908 // Arguments:
7909 //
7910 // Input:
7911 // c_rarg0 - newArr address
7912 // c_rarg1 - oldArr address
7913 // c_rarg2 - newIdx
7914 // c_rarg3 - shiftCount
7915 // c_rarg4 - numIter
7916 //
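  //
  // In scalar terms each output word is (illustrative sketch, unsigned 32-bit shifts):
  //
  //   newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount) | (oldArr[i] << (32 - shiftCount));
  //
  // for i in [0, numIter); the SIMD loop below produces 4 such words per iteration, walking
  // from the highest index down.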
7917 address generate_bigIntegerRightShift() {
7918 __ align(CodeEntryAlignment);
7919 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
7920 StubCodeMark mark(this, stub_id);
7921 address start = __ pc();
7922
7923 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
7924
7925 Register newArr = c_rarg0;
7926 Register oldArr = c_rarg1;
7927 Register newIdx = c_rarg2;
7928 Register shiftCount = c_rarg3;
7929 Register numIter = c_rarg4;
7930 Register idx = numIter;
7931
7932 Register newArrCur = rscratch1;
7933 Register shiftRevCount = rscratch2;
7934 Register oldArrCur = r13;
7935 Register oldArrNext = r14;
7936
7937 FloatRegister oldElem0 = v0;
7938 FloatRegister oldElem1 = v1;
7939 FloatRegister newElem = v2;
7940 FloatRegister shiftVCount = v3;
7941 FloatRegister shiftVRevCount = v4;
7942
7943 __ cbz(idx, Exit);
7944
7945 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
7946
7947 // left shift count
7948 __ movw(shiftRevCount, 32);
7949 __ subw(shiftRevCount, shiftRevCount, shiftCount);
7950
7951     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
7952 __ cmp(numIter, (u1)4);
7953 __ br(Assembler::LT, ShiftThree);
7954
7955 __ dup(shiftVCount, __ T4S, shiftCount);
7956 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
7957 __ negr(shiftVCount, __ T4S, shiftVCount);
7958
7959 __ BIND(ShiftSIMDLoop);
7960
7961 // Calculate the load addresses
7962 __ sub(idx, idx, 4);
7963 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
7964 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
7965 __ add(oldArrCur, oldArrNext, 4);
7966
7967 // Load 4 words and process
7968 __ ld1(oldElem0, __ T4S, Address(oldArrCur));
7969 __ ld1(oldElem1, __ T4S, Address(oldArrNext));
7970 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
7971 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
7972 __ orr(newElem, __ T16B, oldElem0, oldElem1);
7973 __ st1(newElem, __ T4S, Address(newArrCur));
7974
7975 __ cmp(idx, (u1)4);
7976 __ br(Assembler::LT, ShiftTwoLoop);
7977 __ b(ShiftSIMDLoop);
7978
7979 __ BIND(ShiftTwoLoop);
7980 __ cbz(idx, Exit);
7981 __ cmp(idx, (u1)1);
7982 __ br(Assembler::EQ, ShiftOne);
7983
7984 // Calculate the load addresses
7985 __ sub(idx, idx, 2);
7986 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
7987 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
7988 __ add(oldArrCur, oldArrNext, 4);
7989
7990 // Load 2 words and process
7991 __ ld1(oldElem0, __ T2S, Address(oldArrCur));
7992 __ ld1(oldElem1, __ T2S, Address(oldArrNext));
7993 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
7994 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
7995 __ orr(newElem, __ T8B, oldElem0, oldElem1);
7996 __ st1(newElem, __ T2S, Address(newArrCur));
7997 __ b(ShiftTwoLoop);
7998
7999 __ BIND(ShiftThree);
8000 __ tbz(idx, 1, ShiftOne);
8001 __ tbz(idx, 0, ShiftTwo);
8002 __ ldrw(r10, Address(oldArr, 12));
8003 __ ldrw(r11, Address(oldArr, 8));
8004 __ lsrvw(r10, r10, shiftCount);
8005 __ lslvw(r11, r11, shiftRevCount);
8006 __ orrw(r12, r10, r11);
8007 __ strw(r12, Address(newArr, 8));
8008
8009 __ BIND(ShiftTwo);
8010 __ ldrw(r10, Address(oldArr, 8));
8011 __ ldrw(r11, Address(oldArr, 4));
8012 __ lsrvw(r10, r10, shiftCount);
8013 __ lslvw(r11, r11, shiftRevCount);
8014 __ orrw(r12, r10, r11);
8015 __ strw(r12, Address(newArr, 4));
8016
8017 __ BIND(ShiftOne);
8018 __ ldrw(r10, Address(oldArr, 4));
8019 __ ldrw(r11, Address(oldArr));
8020 __ lsrvw(r10, r10, shiftCount);
8021 __ lslvw(r11, r11, shiftRevCount);
8022 __ orrw(r12, r10, r11);
8023 __ strw(r12, Address(newArr));
8024
8025 __ BIND(Exit);
8026 __ ret(lr);
8027
8028 return start;
8029 }
8030
8031 // Arguments:
8032 //
8033 // Input:
8034 // c_rarg0 - newArr address
8035 // c_rarg1 - oldArr address
8036 // c_rarg2 - newIdx
8037 // c_rarg3 - shiftCount
8038 // c_rarg4 - numIter
8039 //
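  //
  // Scalar sketch (illustrative only, unsigned 32-bit shifts):
  //
  //   newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >> (32 - shiftCount));
  //
  // processed 4 ints per SIMD iteration from the lowest index upwards.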
8040 address generate_bigIntegerLeftShift() {
8041 __ align(CodeEntryAlignment);
8042 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
8043 StubCodeMark mark(this, stub_id);
8044 address start = __ pc();
8045
8046 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
8047
8048 Register newArr = c_rarg0;
8049 Register oldArr = c_rarg1;
8050 Register newIdx = c_rarg2;
8051 Register shiftCount = c_rarg3;
8052 Register numIter = c_rarg4;
8053
8054 Register shiftRevCount = rscratch1;
8055 Register oldArrNext = rscratch2;
8056
8057 FloatRegister oldElem0 = v0;
8058 FloatRegister oldElem1 = v1;
8059 FloatRegister newElem = v2;
8060 FloatRegister shiftVCount = v3;
8061 FloatRegister shiftVRevCount = v4;
8062
8063 __ cbz(numIter, Exit);
8064
8065 __ add(oldArrNext, oldArr, 4);
8066 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8067
8068 // right shift count
8069 __ movw(shiftRevCount, 32);
8070 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8071
8072     // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
8073 __ cmp(numIter, (u1)4);
8074 __ br(Assembler::LT, ShiftThree);
8075
8076 __ dup(shiftVCount, __ T4S, shiftCount);
8077 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8078 __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
8079
8080 __ BIND(ShiftSIMDLoop);
8081
8082 // load 4 words and process
8083 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
8084 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
8085 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8086 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8087 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8088 __ st1(newElem, __ T4S, __ post(newArr, 16));
8089 __ sub(numIter, numIter, 4);
8090
8091 __ cmp(numIter, (u1)4);
8092 __ br(Assembler::LT, ShiftTwoLoop);
8093 __ b(ShiftSIMDLoop);
8094
8095 __ BIND(ShiftTwoLoop);
8096 __ cbz(numIter, Exit);
8097 __ cmp(numIter, (u1)1);
8098 __ br(Assembler::EQ, ShiftOne);
8099
8100 // load 2 words and process
8101 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
8102 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
8103 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8104 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8105 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8106 __ st1(newElem, __ T2S, __ post(newArr, 8));
8107 __ sub(numIter, numIter, 2);
8108 __ b(ShiftTwoLoop);
8109
8110 __ BIND(ShiftThree);
8111 __ ldrw(r10, __ post(oldArr, 4));
8112 __ ldrw(r11, __ post(oldArrNext, 4));
8113 __ lslvw(r10, r10, shiftCount);
8114 __ lsrvw(r11, r11, shiftRevCount);
8115 __ orrw(r12, r10, r11);
8116 __ strw(r12, __ post(newArr, 4));
8117 __ tbz(numIter, 1, Exit);
8118 __ tbz(numIter, 0, ShiftOne);
8119
8120 __ BIND(ShiftTwo);
8121 __ ldrw(r10, __ post(oldArr, 4));
8122 __ ldrw(r11, __ post(oldArrNext, 4));
8123 __ lslvw(r10, r10, shiftCount);
8124 __ lsrvw(r11, r11, shiftRevCount);
8125 __ orrw(r12, r10, r11);
8126 __ strw(r12, __ post(newArr, 4));
8127
8128 __ BIND(ShiftOne);
8129 __ ldrw(r10, Address(oldArr));
8130 __ ldrw(r11, Address(oldArrNext));
8131 __ lslvw(r10, r10, shiftCount);
8132 __ lsrvw(r11, r11, shiftRevCount);
8133 __ orrw(r12, r10, r11);
8134 __ strw(r12, Address(newArr));
8135
8136 __ BIND(Exit);
8137 __ ret(lr);
8138
8139 return start;
8140 }
8141
8142 address generate_count_positives(address &count_positives_long) {
8143 const u1 large_loop_size = 64;
8144 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
8145 int dcache_line = VM_Version::dcache_line_size();
8146
8147 Register ary1 = r1, len = r2, result = r0;
8148
8149 __ align(CodeEntryAlignment);
8150
8151 StubId stub_id = StubId::stubgen_count_positives_id;
8152 StubCodeMark mark(this, stub_id);
8153
8154 address entry = __ pc();
8155
8156 __ enter();
8157 // precondition: a copy of len is already in result
8158 // __ mov(result, len);
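    //
    // In scalar terms the stub computes roughly (illustrative sketch only):
    //
    //   for (int i = 0; i < len; i++) {
    //     if (ary1[i] & 0x80) return i;   // stop at the first non-positive byte
    //   }
    //   return len;
    //
    // except that the vectorized paths may return a smaller value than the exact count: on a
    // hit they back up to the start of the block in which the negative byte was seen (see the
    // RET_ADJUST labels).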
8159
8160 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
8161 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
8162
8163 __ cmp(len, (u1)15);
8164 __ br(Assembler::GT, LEN_OVER_15);
8165     // The only case when execution falls into this code is when the pointer is
8166     // near the end of a memory page and we have to avoid reading the next page.
8167 __ add(ary1, ary1, len);
8168 __ subs(len, len, 8);
8169 __ br(Assembler::GT, LEN_OVER_8);
8170 __ ldr(rscratch2, Address(ary1, -8));
8171 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
8172 __ lsrv(rscratch2, rscratch2, rscratch1);
8173 __ tst(rscratch2, UPPER_BIT_MASK);
8174 __ csel(result, zr, result, Assembler::NE);
8175 __ leave();
8176 __ ret(lr);
8177 __ bind(LEN_OVER_8);
8178 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
8179     __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
8180 __ tst(rscratch2, UPPER_BIT_MASK);
8181 __ br(Assembler::NE, RET_NO_POP);
8182 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
8183 __ lsrv(rscratch1, rscratch1, rscratch2);
8184 __ tst(rscratch1, UPPER_BIT_MASK);
8185 __ bind(RET_NO_POP);
8186 __ csel(result, zr, result, Assembler::NE);
8187 __ leave();
8188 __ ret(lr);
8189
8190 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
8191 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
8192
8193 count_positives_long = __ pc(); // 2nd entry point
8194
8195 __ enter();
8196
8197 __ bind(LEN_OVER_15);
8198 __ push(spilled_regs, sp);
8199 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
8200 __ cbz(rscratch2, ALIGNED);
8201 __ ldp(tmp6, tmp1, Address(ary1));
8202 __ mov(tmp5, 16);
8203     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
8204 __ add(ary1, ary1, rscratch1);
8205 __ orr(tmp6, tmp6, tmp1);
8206 __ tst(tmp6, UPPER_BIT_MASK);
8207 __ br(Assembler::NE, RET_ADJUST);
8208 __ sub(len, len, rscratch1);
8209
8210 __ bind(ALIGNED);
8211 __ cmp(len, large_loop_size);
8212 __ br(Assembler::LT, CHECK_16);
8213     // Perform a 16-byte load as an early return in the pre-loop to handle the
8214     // situation when an initially aligned large array has negative values at its
8215     // starting bytes, in which case LARGE_LOOP would do 4 reads instead of 1 (in
8216     // the worst case), which is slower. Cases with negative bytes further ahead
8217     // are not affected that much; in fact they are faster due to the early loads,
8218     // fewer instructions and fewer branches in LARGE_LOOP.
8219 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
8220 __ sub(len, len, 16);
8221 __ orr(tmp6, tmp6, tmp1);
8222 __ tst(tmp6, UPPER_BIT_MASK);
8223 __ br(Assembler::NE, RET_ADJUST_16);
8224 __ cmp(len, large_loop_size);
8225 __ br(Assembler::LT, CHECK_16);
8226
8227 if (SoftwarePrefetchHintDistance >= 0
8228 && SoftwarePrefetchHintDistance >= dcache_line) {
8229 // initial prefetch
8230 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
8231 }
8232 __ bind(LARGE_LOOP);
8233 if (SoftwarePrefetchHintDistance >= 0) {
8234 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
8235 }
8236     // Issue the load instructions first since this can save a few CPU/memory cycles.
8237     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp), it is
8238     // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
8239     // instructions per loop iteration and has fewer branches. The downside is that this
8240     // approach disables the early return, so all 64 bytes are loaded and checked every time.
8241 __ ldp(tmp2, tmp3, Address(ary1));
8242 __ ldp(tmp4, tmp5, Address(ary1, 16));
8243 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
8244 __ ldp(tmp6, tmp1, Address(ary1, 48));
8245 __ add(ary1, ary1, large_loop_size);
8246 __ sub(len, len, large_loop_size);
8247 __ orr(tmp2, tmp2, tmp3);
8248 __ orr(tmp4, tmp4, tmp5);
8249 __ orr(rscratch1, rscratch1, rscratch2);
8250 __ orr(tmp6, tmp6, tmp1);
8251 __ orr(tmp2, tmp2, tmp4);
8252 __ orr(rscratch1, rscratch1, tmp6);
8253 __ orr(tmp2, tmp2, rscratch1);
8254 __ tst(tmp2, UPPER_BIT_MASK);
8255 __ br(Assembler::NE, RET_ADJUST_LONG);
8256 __ cmp(len, large_loop_size);
8257 __ br(Assembler::GE, LARGE_LOOP);
8258
8259 __ bind(CHECK_16); // small 16-byte load pre-loop
8260 __ cmp(len, (u1)16);
8261 __ br(Assembler::LT, POST_LOOP16);
8262
8263 __ bind(LOOP16); // small 16-byte load loop
8264 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
8265 __ sub(len, len, 16);
8266 __ orr(tmp2, tmp2, tmp3);
8267 __ tst(tmp2, UPPER_BIT_MASK);
8268 __ br(Assembler::NE, RET_ADJUST_16);
8269 __ cmp(len, (u1)16);
8270 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
8271
8272 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
8273 __ cmp(len, (u1)8);
8274 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
8275 __ ldr(tmp3, Address(__ post(ary1, 8)));
8276 __ tst(tmp3, UPPER_BIT_MASK);
8277 __ br(Assembler::NE, RET_ADJUST);
8278 __ sub(len, len, 8);
8279
8280 __ bind(POST_LOOP16_LOAD_TAIL);
8281 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
8282 __ ldr(tmp1, Address(ary1));
8283 __ mov(tmp2, 64);
8284 __ sub(tmp4, tmp2, len, __ LSL, 3);
8285 __ lslv(tmp1, tmp1, tmp4);
8286 __ tst(tmp1, UPPER_BIT_MASK);
8287 __ br(Assembler::NE, RET_ADJUST);
8288 // Fallthrough
8289
8290 __ bind(RET_LEN);
8291 __ pop(spilled_regs, sp);
8292 __ leave();
8293 __ ret(lr);
8294
8295     // The difference result - len is the count of bytes that are guaranteed
8296     // to be positive.
8297
8298 __ bind(RET_ADJUST_LONG);
8299 __ add(len, len, (u1)(large_loop_size - 16));
8300 __ bind(RET_ADJUST_16);
8301 __ add(len, len, 16);
8302 __ bind(RET_ADJUST);
8303 __ pop(spilled_regs, sp);
8304 __ leave();
8305 __ sub(result, result, len);
8306 __ ret(lr);
8307
8308 return entry;
8309 }
8310
8311 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
8312 bool usePrefetch, Label &NOT_EQUAL) {
8313 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8314 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8315 tmp7 = r12, tmp8 = r13;
8316 Label LOOP;
8317
8318 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8319 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8320 __ bind(LOOP);
8321 if (usePrefetch) {
8322 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8323 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8324 }
8325 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8326 __ eor(tmp1, tmp1, tmp2);
8327 __ eor(tmp3, tmp3, tmp4);
8328 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8329 __ orr(tmp1, tmp1, tmp3);
8330 __ cbnz(tmp1, NOT_EQUAL);
8331 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8332 __ eor(tmp5, tmp5, tmp6);
8333 __ eor(tmp7, tmp7, tmp8);
8334 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8335 __ orr(tmp5, tmp5, tmp7);
8336 __ cbnz(tmp5, NOT_EQUAL);
8337 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8338 __ eor(tmp1, tmp1, tmp2);
8339 __ eor(tmp3, tmp3, tmp4);
8340 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8341 __ orr(tmp1, tmp1, tmp3);
8342 __ cbnz(tmp1, NOT_EQUAL);
8343 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8344 __ eor(tmp5, tmp5, tmp6);
8345 __ sub(cnt1, cnt1, 8 * wordSize);
8346 __ eor(tmp7, tmp7, tmp8);
8347 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8348     // tmp6 is not used. MacroAssembler::subs is used here (rather than
8349     // cmp) because subs allows an unrestricted range of immediate operands.
8350 __ subs(tmp6, cnt1, loopThreshold);
8351 __ orr(tmp5, tmp5, tmp7);
8352 __ cbnz(tmp5, NOT_EQUAL);
8353 __ br(__ GE, LOOP);
8354 // post-loop
8355 __ eor(tmp1, tmp1, tmp2);
8356 __ eor(tmp3, tmp3, tmp4);
8357 __ orr(tmp1, tmp1, tmp3);
8358 __ sub(cnt1, cnt1, 2 * wordSize);
8359 __ cbnz(tmp1, NOT_EQUAL);
8360 }
8361
8362 void generate_large_array_equals_loop_simd(int loopThreshold,
8363 bool usePrefetch, Label &NOT_EQUAL) {
8364 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8365 tmp2 = rscratch2;
8366 Label LOOP;
8367
8368 __ bind(LOOP);
8369 if (usePrefetch) {
8370 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8371 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8372 }
8373 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
8374 __ sub(cnt1, cnt1, 8 * wordSize);
8375 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
8376 __ subs(tmp1, cnt1, loopThreshold);
8377 __ eor(v0, __ T16B, v0, v4);
8378 __ eor(v1, __ T16B, v1, v5);
8379 __ eor(v2, __ T16B, v2, v6);
8380 __ eor(v3, __ T16B, v3, v7);
8381 __ orr(v0, __ T16B, v0, v1);
8382 __ orr(v1, __ T16B, v2, v3);
8383 __ orr(v0, __ T16B, v0, v1);
8384 __ umov(tmp1, v0, __ D, 0);
8385 __ umov(tmp2, v0, __ D, 1);
8386 __ orr(tmp1, tmp1, tmp2);
8387 __ cbnz(tmp1, NOT_EQUAL);
8388 __ br(__ GE, LOOP);
8389 }
8390
8391 // a1 = r1 - array1 address
8392 // a2 = r2 - array2 address
8393 // result = r0 - return value. Already contains "false"
8394 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
8395 // r3-r5 are reserved temporary registers
8396 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
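  //
  // Scalar equivalent (illustrative sketch only; a1/a2 viewed as byte pointers, cnt in bytes):
  //
  //   bool equal = true;
  //   for (size_t i = 0; i < cnt; i += wordSize) {
  //     if (*(const uint64_t*)(a1 + i) != *(const uint64_t*)(a2 + i)) { equal = false; break; }
  //   }
  //
  // The stub unrolls this several words per iteration, XOR-ing pairs and OR-ing the results so
  // one non-zero check covers multiple words, optionally with software prefetch and SIMD.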
8397 address generate_large_array_equals() {
8398 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8399 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8400 tmp7 = r12, tmp8 = r13;
8401 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
8402 SMALL_LOOP, POST_LOOP;
8403 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
8404     // threshold chosen so that at least 32 of the prefetched bytes are actually used
8405 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
8406 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
8407 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
8408 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
8409 tmp5, tmp6, tmp7, tmp8);
8410
8411 __ align(CodeEntryAlignment);
8412
8413 StubId stub_id = StubId::stubgen_large_array_equals_id;
8414 StubCodeMark mark(this, stub_id);
8415
8416 address entry = __ pc();
8417 __ enter();
8418 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
8419 // also advance pointers to use post-increment instead of pre-increment
8420 __ add(a1, a1, wordSize);
8421 __ add(a2, a2, wordSize);
8422 if (AvoidUnalignedAccesses) {
8423       // Both implementations (SIMD/non-SIMD) use relatively large load
8424       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
8425       // time) on some CPUs when the address is not at least 16-byte aligned.
8426       // Arrays are currently 8-byte aligned, so we can do an additional 8-byte
8427       // load for the 1st address if needed, to make it 16-byte aligned.
8428 Label ALIGNED16;
8429 __ tbz(a1, 3, ALIGNED16);
8430 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8431 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8432 __ sub(cnt1, cnt1, wordSize);
8433 __ eor(tmp1, tmp1, tmp2);
8434 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
8435 __ bind(ALIGNED16);
8436 }
8437 if (UseSIMDForArrayEquals) {
8438 if (SoftwarePrefetchHintDistance >= 0) {
8439 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8440 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8441 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
8442 /* prfm = */ true, NOT_EQUAL);
8443 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8444 __ br(__ LT, TAIL);
8445 }
8446 __ bind(NO_PREFETCH_LARGE_LOOP);
8447 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
8448 /* prfm = */ false, NOT_EQUAL);
8449 } else {
8450 __ push(spilled_regs, sp);
8451 if (SoftwarePrefetchHintDistance >= 0) {
8452 __ subs(tmp1, cnt1, prefetchLoopThreshold);
8453 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
8454 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
8455 /* prfm = */ true, NOT_EQUAL);
8456 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
8457 __ br(__ LT, TAIL);
8458 }
8459 __ bind(NO_PREFETCH_LARGE_LOOP);
8460 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
8461 /* prfm = */ false, NOT_EQUAL);
8462 }
8463 __ bind(TAIL);
8464 __ cbz(cnt1, EQUAL);
8465 __ subs(cnt1, cnt1, wordSize);
8466 __ br(__ LE, POST_LOOP);
8467 __ bind(SMALL_LOOP);
8468 __ ldr(tmp1, Address(__ post(a1, wordSize)));
8469 __ ldr(tmp2, Address(__ post(a2, wordSize)));
8470 __ subs(cnt1, cnt1, wordSize);
8471 __ eor(tmp1, tmp1, tmp2);
8472 __ cbnz(tmp1, NOT_EQUAL);
8473 __ br(__ GT, SMALL_LOOP);
8474 __ bind(POST_LOOP);
8475 __ ldr(tmp1, Address(a1, cnt1));
8476 __ ldr(tmp2, Address(a2, cnt1));
8477 __ eor(tmp1, tmp1, tmp2);
8478 __ cbnz(tmp1, NOT_EQUAL);
8479 __ bind(EQUAL);
8480 __ mov(result, true);
8481 __ bind(NOT_EQUAL);
8482 if (!UseSIMDForArrayEquals) {
8483 __ pop(spilled_regs, sp);
8484 }
8485 __ bind(NOT_EQUAL_NO_POP);
8486 __ leave();
8487 __ ret(lr);
8488 return entry;
8489 }
8490
8491 // result = r0 - return value. Contains initial hashcode value on entry.
8492 // ary = r1 - array address
8493 // cnt = r2 - elements count
8494 // Clobbers: v0-v13, rscratch1, rscratch2
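  //
  // The stub vectorizes the scalar polynomial hash (illustrative sketch, not emitted code):
  //
  //   int h = result;
  //   for (int i = 0; i < cnt; i++) {
  //     h = 31 * h + ary[i];
  //   }
  //
  // using the identity
  //
  //   h_new = h * 31^vf + (a[i] * 31^(vf-1) + a[i+1] * 31^(vf-2) + ... + a[i+vf-1]),
  //
  // so each accumulator lane is scaled by powers of 31 matching the elements consumed per
  // step (kept in vpowm), and the epilogues weight the lanes by <31^3, 31^2, 31, 1> (vpow)
  // and horizontally add them to fold the vectors back into a scalar hash.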
8495 address generate_large_arrays_hashcode(BasicType eltype) {
8496 const Register result = r0, ary = r1, cnt = r2;
8497 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
8498 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
8499 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
8500 const FloatRegister vpowm = v13;
8501
8502 ARRAYS_HASHCODE_REGISTERS;
8503
8504 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
8505
8506 unsigned int vf; // vectorization factor
8507 bool multiply_by_halves;
8508 Assembler::SIMD_Arrangement load_arrangement;
8509 switch (eltype) {
8510 case T_BOOLEAN:
8511 case T_BYTE:
8512 load_arrangement = Assembler::T8B;
8513 multiply_by_halves = true;
8514 vf = 8;
8515 break;
8516 case T_CHAR:
8517 case T_SHORT:
8518 load_arrangement = Assembler::T8H;
8519 multiply_by_halves = true;
8520 vf = 8;
8521 break;
8522 case T_INT:
8523 load_arrangement = Assembler::T4S;
8524 multiply_by_halves = false;
8525 vf = 4;
8526 break;
8527 default:
8528 ShouldNotReachHere();
8529 }
8530
8531 // Unroll factor
8532 const unsigned uf = 4;
8533
8534 // Effective vectorization factor
8535 const unsigned evf = vf * uf;
8536
8537 __ align(CodeEntryAlignment);
8538
8539 StubId stub_id;
8540 switch (eltype) {
8541 case T_BOOLEAN:
8542 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
8543 break;
8544 case T_BYTE:
8545 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
8546 break;
8547 case T_CHAR:
8548 stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
8549 break;
8550 case T_SHORT:
8551 stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
8552 break;
8553 case T_INT:
8554 stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
8555 break;
8556 default:
8557 stub_id = StubId::NO_STUBID;
8558 ShouldNotReachHere();
8559 };
8560
8561 StubCodeMark mark(this, stub_id);
8562
8563 address entry = __ pc();
8564 __ enter();
8565
8566     // Put the 0th-3rd powers of 31 together into a single SIMD register. The register will be used
8567     // in the SMALL and LARGE LOOPs' epilogues. The initialization is hoisted here and the register's
8568     // value must not change throughout both loops.
8569 __ movw(rscratch1, intpow(31U, 3));
8570 __ mov(vpow, Assembler::S, 0, rscratch1);
8571 __ movw(rscratch1, intpow(31U, 2));
8572 __ mov(vpow, Assembler::S, 1, rscratch1);
8573 __ movw(rscratch1, intpow(31U, 1));
8574 __ mov(vpow, Assembler::S, 2, rscratch1);
8575 __ movw(rscratch1, intpow(31U, 0));
8576 __ mov(vpow, Assembler::S, 3, rscratch1);
8577
8578 __ mov(vmul0, Assembler::T16B, 0);
8579 __ mov(vmul0, Assembler::S, 3, result);
8580
8581 __ andr(rscratch2, cnt, (uf - 1) * vf);
8582 __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
8583
8584 __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
8585 __ mov(vpowm, Assembler::S, 0, rscratch1);
8586
8587 // SMALL LOOP
8588 __ bind(SMALL_LOOP);
8589
8590 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
8591 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8592 __ subsw(rscratch2, rscratch2, vf);
8593
8594 if (load_arrangement == Assembler::T8B) {
8595 // Extend 8B to 8H to be able to use vector multiply
8596 // instructions
8597 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8598 if (is_signed_subword_type(eltype)) {
8599 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8600 } else {
8601 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8602 }
8603 }
8604
8605 switch (load_arrangement) {
8606 case Assembler::T4S:
8607 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8608 break;
8609 case Assembler::T8B:
8610 case Assembler::T8H:
8611 assert(is_subword_type(eltype), "subword type expected");
8612 if (is_signed_subword_type(eltype)) {
8613 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8614 } else {
8615 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8616 }
8617 break;
8618 default:
8619 __ should_not_reach_here();
8620 }
8621
8622 // Process the upper half of a vector
8623 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8624 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8625 if (is_signed_subword_type(eltype)) {
8626 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8627 } else {
8628 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8629 }
8630 }
8631
8632 __ br(Assembler::HI, SMALL_LOOP);
8633
8634     // SMALL LOOP'S EPILOGUE
8635 __ lsr(rscratch2, cnt, exact_log2(evf));
8636 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
8637
8638 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8639 __ addv(vmul0, Assembler::T4S, vmul0);
8640 __ umov(result, vmul0, Assembler::S, 0);
8641
8642 // TAIL
8643 __ bind(TAIL);
8644
8645     // The andr computes cnt % vf. The shifted subtract below moves the branch target past
8646     // vf - 1 - (cnt % vf) pairs of load + madd insns, i.e. only cnt % vf load + madd pairs execute.
8647 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
8648 __ andr(rscratch2, cnt, vf - 1);
8649 __ bind(TAIL_SHORTCUT);
8650 __ adr(rscratch1, BR_BASE);
8651 // For Cortex-A53 offset is 4 because 2 nops are generated.
8652 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
8653 __ movw(rscratch2, 0x1f);
8654 __ br(rscratch1);
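    // The branch above lands inside the unrolled block emitted below: the target is BR_BASE
    // minus (cnt % vf) * 8 bytes (or * 16 on Cortex-A53, where each pair is padded to 4
    // instructions), so exactly the last cnt % vf of the vf - 1 unrolled load + madd pairs
    // are executed before falling through to BR_BASE.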
8655
8656 for (size_t i = 0; i < vf - 1; ++i) {
8657 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
8658 eltype);
8659 __ maddw(result, result, rscratch2, rscratch1);
8660 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
8661 // Generate 2nd nop to have 4 instructions per iteration.
8662 if (VM_Version::supports_a53mac()) {
8663 __ nop();
8664 }
8665 }
8666 __ bind(BR_BASE);
8667
8668 __ leave();
8669 __ ret(lr);
8670
8671 // LARGE LOOP
8672 __ bind(LARGE_LOOP_PREHEADER);
8673
8674 __ lsr(rscratch2, cnt, exact_log2(evf));
8675
8676 if (multiply_by_halves) {
8677 // 31^4 - multiplier between lower and upper parts of a register
8678 __ movw(rscratch1, intpow(31U, vf / 2));
8679 __ mov(vpowm, Assembler::S, 1, rscratch1);
8680       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
8681 __ movw(rscratch1, intpow(31U, evf - vf / 2));
8682 __ mov(vpowm, Assembler::S, 0, rscratch1);
8683 } else {
8684 // 31^16
8685 __ movw(rscratch1, intpow(31U, evf));
8686 __ mov(vpowm, Assembler::S, 0, rscratch1);
8687 }
8688
8689 __ mov(vmul3, Assembler::T16B, 0);
8690 __ mov(vmul2, Assembler::T16B, 0);
8691 __ mov(vmul1, Assembler::T16B, 0);
8692
8693 __ bind(LARGE_LOOP);
8694
8695 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
8696 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
8697 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
8698 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
8699
8700 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
8701 Address(__ post(ary, evf * type2aelembytes(eltype))));
8702
8703 if (load_arrangement == Assembler::T8B) {
8704 // Extend 8B to 8H to be able to use vector multiply
8705 // instructions
8706 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
8707 if (is_signed_subword_type(eltype)) {
8708 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8709 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8710 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8711 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8712 } else {
8713 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
8714 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
8715 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
8716 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
8717 }
8718 }
8719
8720 switch (load_arrangement) {
8721 case Assembler::T4S:
8722 __ addv(vmul3, load_arrangement, vmul3, vdata3);
8723 __ addv(vmul2, load_arrangement, vmul2, vdata2);
8724 __ addv(vmul1, load_arrangement, vmul1, vdata1);
8725 __ addv(vmul0, load_arrangement, vmul0, vdata0);
8726 break;
8727 case Assembler::T8B:
8728 case Assembler::T8H:
8729 assert(is_subword_type(eltype), "subword type expected");
8730 if (is_signed_subword_type(eltype)) {
8731 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8732 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8733 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8734 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8735 } else {
8736 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
8737 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
8738 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
8739 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
8740 }
8741 break;
8742 default:
8743 __ should_not_reach_here();
8744 }
8745
8746 // Process the upper half of a vector
8747 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
8748 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
8749 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
8750 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
8751 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
8752 if (is_signed_subword_type(eltype)) {
8753 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8754 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8755 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8756 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8757 } else {
8758 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
8759 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
8760 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
8761 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
8762 }
8763 }
8764
8765 __ subsw(rscratch2, rscratch2, 1);
8766 __ br(Assembler::HI, LARGE_LOOP);
8767
8768 __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
8769 __ addv(vmul3, Assembler::T4S, vmul3);
8770 __ umov(result, vmul3, Assembler::S, 0);
8771
8772 __ mov(rscratch2, intpow(31U, vf));
8773
8774 __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
8775 __ addv(vmul2, Assembler::T4S, vmul2);
8776 __ umov(rscratch1, vmul2, Assembler::S, 0);
8777 __ maddw(result, result, rscratch2, rscratch1);
8778
8779 __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
8780 __ addv(vmul1, Assembler::T4S, vmul1);
8781 __ umov(rscratch1, vmul1, Assembler::S, 0);
8782 __ maddw(result, result, rscratch2, rscratch1);
8783
8784 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
8785 __ addv(vmul0, Assembler::T4S, vmul0);
8786 __ umov(rscratch1, vmul0, Assembler::S, 0);
8787 __ maddw(result, result, rscratch2, rscratch1);
8788
8789 __ andr(rscratch2, cnt, vf - 1);
8790 __ cbnz(rscratch2, TAIL_SHORTCUT);
8791
8792 __ leave();
8793 __ ret(lr);
8794
8795 return entry;
8796 }
8797
8798 address generate_dsin_dcos(bool isCos) {
8799 __ align(CodeEntryAlignment);
8800 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
8801 StubCodeMark mark(this, stub_id);
8802 address start = __ pc();
8803 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
8804 (address)StubRoutines::aarch64::_two_over_pi,
8805 (address)StubRoutines::aarch64::_pio2,
8806 (address)StubRoutines::aarch64::_dsin_coef,
8807 (address)StubRoutines::aarch64::_dcos_coef);
8808 return start;
8809 }
8810
8811   // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
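  // The Latin1 bytes are widened to UTF-16 on the fly: zip1/zip2 with a zeroed register
  // interleave each Latin1 byte with a 0x00 byte, which on little-endian AArch64 produces the
  // same 16-bit values as the corresponding UTF-16 characters, so plain 64-bit compares work.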
8812 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
8813 Label &DIFF2) {
8814 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
8815 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
8816
8817 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
8818 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8819 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
8820 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
8821
8822 __ fmovd(tmpL, vtmp3);
8823 __ eor(rscratch2, tmp3, tmpL);
8824 __ cbnz(rscratch2, DIFF2);
8825
8826 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8827 __ umov(tmpL, vtmp3, __ D, 1);
8828 __ eor(rscratch2, tmpU, tmpL);
8829 __ cbnz(rscratch2, DIFF1);
8830
8831 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
8832 __ ldr(tmpU, Address(__ post(cnt1, 8)));
8833 __ fmovd(tmpL, vtmp);
8834 __ eor(rscratch2, tmp3, tmpL);
8835 __ cbnz(rscratch2, DIFF2);
8836
8837 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8838 __ umov(tmpL, vtmp, __ D, 1);
8839 __ eor(rscratch2, tmpU, tmpL);
8840 __ cbnz(rscratch2, DIFF1);
8841 }
8842
8843 // r0 = result
8844 // r1 = str1
8845 // r2 = cnt1
8846 // r3 = str2
8847 // r4 = cnt2
8848 // r10 = tmp1
8849 // r11 = tmp2
8850 address generate_compare_long_string_different_encoding(bool isLU) {
8851 __ align(CodeEntryAlignment);
8852 StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
8853 StubCodeMark mark(this, stub_id);
8854 address entry = __ pc();
8855 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
8856 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
8857 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
8858 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
8859 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
8860 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
8861 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
8862
8863 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
8864
8865 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
8866     // cnt2 == number of characters left to compare
8867     // Check the first 4 symbols already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
8868 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8869 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
8870 __ add(str2, str2, isLU ? wordSize : wordSize/2);
8871 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
8872     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. The last 4 are a special case.
8873 __ eor(rscratch2, tmp1, tmp2);
8874 __ mov(rscratch1, tmp2);
8875 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
8876 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
8877 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
8878 __ push(spilled_regs, sp);
8879 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
8880 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
8881
8882 __ ldr(tmp3, Address(__ post(cnt1, 8)));
8883
8884 if (SoftwarePrefetchHintDistance >= 0) {
8885 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8886 __ br(__ LT, NO_PREFETCH);
8887 __ bind(LARGE_LOOP_PREFETCH);
8888 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
8889 __ mov(tmp4, 2);
8890 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8891 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
8892 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8893 __ subs(tmp4, tmp4, 1);
8894 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
8895 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
8896 __ mov(tmp4, 2);
8897 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
8898 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8899 __ subs(tmp4, tmp4, 1);
8900 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
8901 __ sub(cnt2, cnt2, 64);
8902 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
8903 __ br(__ GE, LARGE_LOOP_PREFETCH);
8904 }
8905 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
8906 __ bind(NO_PREFETCH);
8907 __ subs(cnt2, cnt2, 16);
8908 __ br(__ LT, TAIL);
8909 __ align(OptoLoopAlignment);
8910 __ bind(SMALL_LOOP); // smaller loop
8911 __ subs(cnt2, cnt2, 16);
8912 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
8913 __ br(__ GE, SMALL_LOOP);
8914 __ cmn(cnt2, (u1)16);
8915 __ br(__ EQ, LOAD_LAST);
8916 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
8917 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
8918 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
8919 __ ldr(tmp3, Address(cnt1, -8));
8920 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
8921 __ b(LOAD_LAST);
8922 __ bind(DIFF2);
8923 __ mov(tmpU, tmp3);
8924 __ bind(DIFF1);
8925 __ pop(spilled_regs, sp);
8926 __ b(CALCULATE_DIFFERENCE);
8927 __ bind(LOAD_LAST);
8928     // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
8929     // No need to load them again.
8930 __ mov(tmpU, tmp3);
8931 __ pop(spilled_regs, sp);
8932
8933 // tmp2 points to the address of the last 4 Latin1 characters right now
8934 __ ldrs(vtmp, Address(tmp2));
8935 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
8936 __ fmovd(tmpL, vtmp);
8937
8938 __ eor(rscratch2, tmpU, tmpL);
8939 __ cbz(rscratch2, DONE);
8940
8941 // Find the first different characters in the longwords and
8942 // compute their difference.
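    // How it works: rscratch2 holds the XOR of the two loaded words, so the first differing
    // character corresponds to its least-significant non-zero 16-bit group. rev byte-reverses
    // the value, clz counts the leading zeros of the reversed value, and andr(..., -16) rounds
    // that down to a multiple of 16, giving the bit offset of the first differing character in
    // the original words; lsrv + uxthw then extract the two characters to be subtracted.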
8943 __ bind(CALCULATE_DIFFERENCE);
8944 __ rev(rscratch2, rscratch2);
8945 __ clz(rscratch2, rscratch2);
8946 __ andr(rscratch2, rscratch2, -16);
8947 __ lsrv(tmp1, tmp1, rscratch2);
8948 __ uxthw(tmp1, tmp1);
8949 __ lsrv(rscratch1, rscratch1, rscratch2);
8950 __ uxthw(rscratch1, rscratch1);
8951 __ subw(result, tmp1, rscratch1);
8952 __ bind(DONE);
8953 __ ret(lr);
8954 return entry;
8955 }
8956
8957 // r0 = input (float16)
8958 // v0 = result (float)
8959 // v1 = temporary float register
8960 address generate_float16ToFloat() {
8961 __ align(CodeEntryAlignment);
8962 StubId stub_id = StubId::stubgen_hf2f_id;
8963 StubCodeMark mark(this, stub_id);
8964 address entry = __ pc();
8965 BLOCK_COMMENT("Entry:");
8966 __ flt16_to_flt(v0, r0, v1);
8967 __ ret(lr);
8968 return entry;
8969 }
8970
8971 // v0 = input (float)
8972 // r0 = result (float16)
8973 // v1 = temporary float register
8974 address generate_floatToFloat16() {
8975 __ align(CodeEntryAlignment);
8976 StubId stub_id = StubId::stubgen_f2hf_id;
8977 StubCodeMark mark(this, stub_id);
8978 address entry = __ pc();
8979 BLOCK_COMMENT("Entry:");
8980 __ flt_to_flt16(r0, v0, v1);
8981 __ ret(lr);
8982 return entry;
8983 }
8984
8985 address generate_method_entry_barrier() {
8986 __ align(CodeEntryAlignment);
8987 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
8988 StubCodeMark mark(this, stub_id);
8989
8990 Label deoptimize_label;
8991
8992 address start = __ pc();
8993
8994 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
8995
8996 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
8997 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
8998 // We can get here despite the nmethod being good, if we have not
8999 // yet applied our cross modification fence (or data fence).
9000 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
9001 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
9002 __ ldrw(rscratch2, rscratch2);
9003 __ strw(rscratch2, thread_epoch_addr);
9004 __ isb();
9005 __ membar(__ LoadLoad);
9006 }
9007
9008 __ set_last_Java_frame(sp, rfp, lr, rscratch1);
9009
9010 __ enter();
9011 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
9012
9013 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
9014
9015 __ push_call_clobbered_registers();
9016
9017 __ mov(c_rarg0, rscratch2);
9018 __ call_VM_leaf
9019 (CAST_FROM_FN_PTR
9020 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
9021
9022 __ reset_last_Java_frame(true);
9023
9024 __ mov(rscratch1, r0);
9025
9026 __ pop_call_clobbered_registers();
9027
9028 __ cbnz(rscratch1, deoptimize_label);
9029
9030 __ leave();
9031 __ ret(lr);
9032
9033 __ BIND(deoptimize_label);
9034
9035 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
9036 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
9037
9038 __ mov(sp, rscratch1);
9039 __ br(rscratch2);
9040
9041 return start;
9042 }
9043
9044 // r0 = result
9045 // r1 = str1
9046 // r2 = cnt1
9047 // r3 = str2
9048 // r4 = cnt2
9049 // r10 = tmp1
9050 // r11 = tmp2
9051 address generate_compare_long_string_same_encoding(bool isLL) {
9052 __ align(CodeEntryAlignment);
9053 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
9054 StubCodeMark mark(this, stub_id);
9055 address entry = __ pc();
9056 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9057 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
9058
9059 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
9060
9061     // exit from the large loop when fewer than 64 bytes are left to read or we're about
9062     // to prefetch memory past the end of the array
9063 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
9064
9065     // 8 bytes were already pre-loaded before jumping to the stub, so compare them directly
9066 __ eor(rscratch2, tmp1, tmp2);
9067 __ cbnz(rscratch2, CAL_DIFFERENCE);
9068
9069 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
9070 // update pointers, because of previous read
9071 __ add(str1, str1, wordSize);
9072 __ add(str2, str2, wordSize);
9073 if (SoftwarePrefetchHintDistance >= 0) {
9074 __ align(OptoLoopAlignment);
9075 __ bind(LARGE_LOOP_PREFETCH);
9076 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
9077 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
9078
9079 for (int i = 0; i < 4; i++) {
9080 __ ldp(tmp1, tmp1h, Address(str1, i * 16));
9081 __ ldp(tmp2, tmp2h, Address(str2, i * 16));
9082 __ cmp(tmp1, tmp2);
9083 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9084 __ br(Assembler::NE, DIFF);
9085 }
9086 __ sub(cnt2, cnt2, isLL ? 64 : 32);
9087 __ add(str1, str1, 64);
9088 __ add(str2, str2, 64);
9089 __ subs(rscratch2, cnt2, largeLoopExitCondition);
9090 __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
9091 __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
9092 }
9093
9094 __ subs(rscratch1, cnt2, isLL ? 16 : 8);
9095 __ br(Assembler::LE, LESS16);
9096 __ align(OptoLoopAlignment);
9097 __ bind(LOOP_COMPARE16);
9098 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9099 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9100 __ cmp(tmp1, tmp2);
9101 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9102 __ br(Assembler::NE, DIFF);
9103 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9104 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9105 __ br(Assembler::LT, LESS16);
9106
9107 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9108 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9109 __ cmp(tmp1, tmp2);
9110 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9111 __ br(Assembler::NE, DIFF);
9112 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9113 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9114 __ br(Assembler::GE, LOOP_COMPARE16);
9115 __ cbz(cnt2, LENGTH_DIFF);
9116
9117 __ bind(LESS16);
9118     // compare 8 bytes at a time (8 Latin1 or 4 UTF-16 characters)
9119 __ subs(cnt2, cnt2, isLL ? 8 : 4);
9120 __ br(Assembler::LE, LESS8);
9121 __ ldr(tmp1, Address(__ post(str1, 8)));
9122 __ ldr(tmp2, Address(__ post(str2, 8)));
9123 __ eor(rscratch2, tmp1, tmp2);
9124 __ cbnz(rscratch2, CAL_DIFFERENCE);
9125 __ sub(cnt2, cnt2, isLL ? 8 : 4);
9126
9127 __ bind(LESS8); // directly load last 8 bytes
9128 if (!isLL) {
9129 __ add(cnt2, cnt2, cnt2);
9130 }
9131 __ ldr(tmp1, Address(str1, cnt2));
9132 __ ldr(tmp2, Address(str2, cnt2));
9133 __ eor(rscratch2, tmp1, tmp2);
9134 __ cbz(rscratch2, LENGTH_DIFF);
9135 __ b(CAL_DIFFERENCE);
9136
9137 __ bind(DIFF);
9138 __ cmp(tmp1, tmp2);
9139 __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
9140 __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
9141 // reuse rscratch2 register for the result of eor instruction
9142 __ eor(rscratch2, tmp1, tmp2);
9143
9144 __ bind(CAL_DIFFERENCE);
9145 __ rev(rscratch2, rscratch2);
9146 __ clz(rscratch2, rscratch2);
9147 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
9148 __ lsrv(tmp1, tmp1, rscratch2);
9149 __ lsrv(tmp2, tmp2, rscratch2);
9150 if (isLL) {
9151 __ uxtbw(tmp1, tmp1);
9152 __ uxtbw(tmp2, tmp2);
9153 } else {
9154 __ uxthw(tmp1, tmp1);
9155 __ uxthw(tmp2, tmp2);
9156 }
9157 __ subw(result, tmp1, tmp2);
9158
9159 __ bind(LENGTH_DIFF);
9160 __ ret(lr);
9161 return entry;
9162 }
9163
9164 enum string_compare_mode {
9165 LL,
9166 LU,
9167 UL,
9168 UU,
9169 };
9170
9171 // The following registers are declared in aarch64.ad
9172 // r0 = result
9173 // r1 = str1
9174 // r2 = cnt1
9175 // r3 = str2
9176 // r4 = cnt2
9177 // r10 = tmp1
9178 // r11 = tmp2
9179 // z0 = ztmp1
9180 // z1 = ztmp2
9181 // p0 = pgtmp1
9182 // p1 = pgtmp2
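  //
  // Loop shape (sketch of the code below): the governing predicate pgtmp1 = whilelt(idx, cnt)
  // selects the lanes still in range; each iteration loads one vector per string under that
  // predicate, compares lane-wise and exits on the first mismatch. A final predicated
  // iteration handles the partial vector at the end, and brkb/lasta locate and extract the
  // first differing characters.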
9183 address generate_compare_long_string_sve(string_compare_mode mode) {
9184 StubId stub_id;
9185 switch (mode) {
9186 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
9187 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
9188 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
9189 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
9190 default: ShouldNotReachHere();
9191 }
9192
9193 __ align(CodeEntryAlignment);
9194 address entry = __ pc();
9195 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9196 tmp1 = r10, tmp2 = r11;
9197
9198 Label LOOP, DONE, MISMATCH;
9199 Register vec_len = tmp1;
9200 Register idx = tmp2;
9201 // The minimum of the string lengths has been stored in cnt2.
9202 Register cnt = cnt2;
9203 FloatRegister ztmp1 = z0, ztmp2 = z1;
9204 PRegister pgtmp1 = p0, pgtmp2 = p1;
9205
9206 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
9207 switch (mode) { \
9208 case LL: \
9209 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
9210 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
9211 break; \
9212 case LU: \
9213 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
9214 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9215 break; \
9216 case UL: \
9217 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9218 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
9219 break; \
9220 case UU: \
9221 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9222 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9223 break; \
9224 default: \
9225 ShouldNotReachHere(); \
9226 }
9227
9228 StubCodeMark mark(this, stub_id);
9229
9230 __ mov(idx, 0);
9231 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9232
9233 if (mode == LL) {
9234 __ sve_cntb(vec_len);
9235 } else {
9236 __ sve_cnth(vec_len);
9237 }
9238
9239 __ sub(rscratch1, cnt, vec_len);
9240
9241 __ bind(LOOP);
9242
9243 // main loop
9244 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9245 __ add(idx, idx, vec_len);
9246 // Compare strings.
9247 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9248 __ br(__ NE, MISMATCH);
9249 __ cmp(idx, rscratch1);
9250 __ br(__ LT, LOOP);
9251
9252 // post loop, last iteration
9253 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9254
9255 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9256 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9257 __ br(__ EQ, DONE);
9258
9259 __ bind(MISMATCH);
9260
9261     // Crop the predicate at the first mismatching lane so that lasta can extract it below.
9262 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
9263 // Extract the first different characters of each string.
9264 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
9265 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
9266
9267 // Compute the difference of the first different characters.
9268 __ sub(result, rscratch1, rscratch2);
9269
9270 __ bind(DONE);
9271 __ ret(lr);
9272 #undef LOAD_PAIR
9273 return entry;
9274 }
9275
9276 void generate_compare_long_strings() {
9277 if (UseSVE == 0) {
9278 StubRoutines::aarch64::_compare_long_string_LL
9279 = generate_compare_long_string_same_encoding(true);
9280 StubRoutines::aarch64::_compare_long_string_UU
9281 = generate_compare_long_string_same_encoding(false);
9282 StubRoutines::aarch64::_compare_long_string_LU
9283 = generate_compare_long_string_different_encoding(true);
9284 StubRoutines::aarch64::_compare_long_string_UL
9285 = generate_compare_long_string_different_encoding(false);
9286 } else {
9287 StubRoutines::aarch64::_compare_long_string_LL
9288 = generate_compare_long_string_sve(LL);
9289 StubRoutines::aarch64::_compare_long_string_UU
9290 = generate_compare_long_string_sve(UU);
9291 StubRoutines::aarch64::_compare_long_string_LU
9292 = generate_compare_long_string_sve(LU);
9293 StubRoutines::aarch64::_compare_long_string_UL
9294 = generate_compare_long_string_sve(UL);
9295 }
9296 }
9297
9298 // R0 = result
9299 // R1 = str2
9300 // R2 = cnt1
9301 // R3 = str1
9302 // R4 = cnt2
9303 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
9304 //
9305   // This generic linear code uses a few additional ideas which make it faster:
9306   // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
9307   //    in order to skip the initial load (helps on systems with 1 load pipeline)
9308   // 2) we can use a "fast" algorithm to find the first matching symbol: a whole loaded
9309   //    register is searched at once, with fewer branches (1 branch per loaded register
9310   //    instead of a branch per symbol); this is where constants like
9311   //    0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
9312   // 3) after loading and analyzing the 1st register of the source string, it can be
9313   //    reused to search for every occurrence of the 1st character, saving a few loads
9314   //    compared with a "simpler-but-slower" implementation
9315   // 4) in order to avoid lots of push/pop operations, the code below heavily
9316   //    re-uses/re-initializes/compresses register values, which makes the code
9317   //    larger and a bit less readable; however, most of the extra operations are
9318   //    issued during loads or branches, so the penalty is minimal
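  //
  // The "fast" search in idea 2) is the usual SWAR zero-byte test. An illustrative C sketch
  // for the Latin1 case (the UTF-16 case uses the 16-bit constants instead):
  //
  //   uint64_t x = chunk ^ (first * 0x0101010101010101ULL);   // 0x00 byte where chunk byte == first
  //   bool hit   = ((x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL) != 0;
  //
  // The code below computes "& ~x & 0x8080..." as bics(x - 0x0101..., x | 0x7f7f...), which
  // evaluates the same predicate.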
9319 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
9320 StubId stub_id;
9321 if (str1_isL) {
9322 if (str2_isL) {
9323 stub_id = StubId::stubgen_string_indexof_linear_ll_id;
9324 } else {
9325 stub_id = StubId::stubgen_string_indexof_linear_ul_id;
9326 }
9327 } else {
9328 if (str2_isL) {
9329 ShouldNotReachHere();
9330 } else {
9331 stub_id = StubId::stubgen_string_indexof_linear_uu_id;
9332 }
9333 }
9334 __ align(CodeEntryAlignment);
9335 StubCodeMark mark(this, stub_id);
9336 address entry = __ pc();
9337
9338 int str1_chr_size = str1_isL ? 1 : 2;
9339 int str2_chr_size = str2_isL ? 1 : 2;
9340 int str1_chr_shift = str1_isL ? 0 : 1;
9341 int str2_chr_shift = str2_isL ? 0 : 1;
9342 bool isL = str1_isL && str2_isL;
9343 // parameters
9344 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
9345 // temporary registers
9346 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
9347 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
9348 // redefinitions
9349 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
9350
9351 __ push(spilled_regs, sp);
9352 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
9353 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
9354 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
9355 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
9356 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
9357 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
9358 // Read whole register from str1. It is safe, because length >=8 here
9359 __ ldr(ch1, Address(str1));
9360 // Read whole register from str2. It is safe, because length >=8 here
9361 __ ldr(ch2, Address(str2));
9362 __ sub(cnt2, cnt2, cnt1);
9363 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
9364 if (str1_isL != str2_isL) {
9365 __ eor(v0, __ T16B, v0, v0);
9366 }
9367 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
9368 __ mul(first, first, tmp1);
9369 // check if we have less than 1 register to check
9370 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
9371 if (str1_isL != str2_isL) {
9372 __ fmovd(v1, ch1);
9373 }
9374 __ br(__ LE, L_SMALL);
9375 __ eor(ch2, first, ch2);
9376 if (str1_isL != str2_isL) {
9377 __ zip1(v1, __ T16B, v1, v0);
9378 }
9379 __ sub(tmp2, ch2, tmp1);
9380 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9381 __ bics(tmp2, tmp2, ch2);
9382 if (str1_isL != str2_isL) {
9383 __ fmovd(ch1, v1);
9384 }
9385 __ br(__ NE, L_HAS_ZERO);
9386 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9387 __ add(result, result, wordSize/str2_chr_size);
9388 __ add(str2, str2, wordSize);
9389 __ br(__ LT, L_POST_LOOP);
9390 __ BIND(L_LOOP);
9391 __ ldr(ch2, Address(str2));
9392 __ eor(ch2, first, ch2);
9393 __ sub(tmp2, ch2, tmp1);
9394 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9395 __ bics(tmp2, tmp2, ch2);
9396 __ br(__ NE, L_HAS_ZERO);
9397 __ BIND(L_LOOP_PROCEED);
9398 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
9399 __ add(str2, str2, wordSize);
9400 __ add(result, result, wordSize/str2_chr_size);
9401 __ br(__ GE, L_LOOP);
9402 __ BIND(L_POST_LOOP);
9403 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
9404 __ br(__ LE, NOMATCH);
9405 __ ldr(ch2, Address(str2));
9406 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9407 __ eor(ch2, first, ch2);
9408 __ sub(tmp2, ch2, tmp1);
9409 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9410 __ mov(tmp4, -1); // all bits set
9411 __ b(L_SMALL_PROCEED);
9412 __ align(OptoLoopAlignment);
9413 __ BIND(L_SMALL);
9414 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
9415 __ eor(ch2, first, ch2);
9416 if (str1_isL != str2_isL) {
9417 __ zip1(v1, __ T16B, v1, v0);
9418 }
9419 __ sub(tmp2, ch2, tmp1);
9420 __ mov(tmp4, -1); // all bits set
9421 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
9422 if (str1_isL != str2_isL) {
9423 __ fmovd(ch1, v1); // move converted 4 symbols
9424 }
9425 __ BIND(L_SMALL_PROCEED);
9426 __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the unused bit positions
9427 __ bic(tmp2, tmp2, ch2);
9428 __ ands(tmp2, tmp2, tmp4); // clear the unused bits and check
9429 __ rbit(tmp2, tmp2);
9430 __ br(__ EQ, NOMATCH);
9431 __ BIND(L_SMALL_HAS_ZERO_LOOP);
9432 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
9433 __ cmp(cnt1, u1(wordSize/str2_chr_size));
9434 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
9435 if (str2_isL) { // LL
9436 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9437 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9438 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9439 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9440 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9441 } else {
9442 __ mov(ch2, 0xE); // all bits in byte set except last one
9443 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9444 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9445 __ lslv(tmp2, tmp2, tmp4);
9446 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9447 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9448 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9449 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9450 }
9451 __ cmp(ch1, ch2);
9452 __ mov(tmp4, wordSize/str2_chr_size);
9453 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9454 __ BIND(L_SMALL_CMP_LOOP);
9455 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9456 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9457 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9458 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9459 __ add(tmp4, tmp4, 1);
9460 __ cmp(tmp4, cnt1);
9461 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
9462 __ cmp(first, ch2);
9463 __ br(__ EQ, L_SMALL_CMP_LOOP);
9464 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
9465 __ cbz(tmp2, NOMATCH); // no more matches. exit
9466 __ clz(tmp4, tmp2);
9467 __ add(result, result, 1); // advance index
9468 __ add(str2, str2, str2_chr_size); // advance pointer
9469 __ b(L_SMALL_HAS_ZERO_LOOP);
9470 __ align(OptoLoopAlignment);
9471 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
9472 __ cmp(first, ch2);
9473 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9474 __ b(DONE);
9475 __ align(OptoLoopAlignment);
9476 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
9477 if (str2_isL) { // LL
9478 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
9479 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
9480 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
9481 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
9482 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9483 } else {
9484 __ mov(ch2, 0xE); // all bits in byte set except last one
9485 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9486 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9487 __ lslv(tmp2, tmp2, tmp4);
9488 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9489 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9490 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
9491 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9492 }
9493 __ cmp(ch1, ch2);
9494 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
9495 __ b(DONE);
9496 __ align(OptoLoopAlignment);
9497 __ BIND(L_HAS_ZERO);
9498 __ rbit(tmp2, tmp2);
9499 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPUs
9500 // Now, compress the two counters (cnt2 and cnt1) into one register.
9501 // This is fine because both counters are 32-bit and are not changed in this
9502 // loop; just restore them on exit. This frees cnt1 for re-use in this loop.
9503 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
9504 __ sub(result, result, 1);
9505 __ BIND(L_HAS_ZERO_LOOP);
9506 __ mov(cnt1, wordSize/str2_chr_size);
9507 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
9508 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
9509 if (str2_isL) {
9510 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9511 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9512 __ lslv(tmp2, tmp2, tmp4);
9513 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9514 __ add(tmp4, tmp4, 1);
9515 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9516 __ lsl(tmp2, tmp2, 1);
9517 __ mov(tmp4, wordSize/str2_chr_size);
9518 } else {
9519 __ mov(ch2, 0xE);
9520 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9521 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9522 __ lslv(tmp2, tmp2, tmp4);
9523 __ add(tmp4, tmp4, 1);
9524 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9525 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9526 __ lsl(tmp2, tmp2, 1);
9527 __ mov(tmp4, wordSize/str2_chr_size);
9528 __ sub(str2, str2, str2_chr_size);
9529 }
9530 __ cmp(ch1, ch2);
9531 __ mov(tmp4, wordSize/str2_chr_size);
9532 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9533 __ BIND(L_CMP_LOOP);
9534 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
9535 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
9536 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
9537 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
9538 __ add(tmp4, tmp4, 1);
9539 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
9540 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
9541 __ cmp(cnt1, ch2);
9542 __ br(__ EQ, L_CMP_LOOP);
9543 __ BIND(L_CMP_LOOP_NOMATCH);
9544 // no match here
9545 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
9546 __ clz(tmp4, tmp2);
9547 __ add(str2, str2, str2_chr_size); // advance pointer
9548 __ b(L_HAS_ZERO_LOOP);
9549 __ align(OptoLoopAlignment);
9550 __ BIND(L_CMP_LOOP_LAST_CMP);
9551 __ cmp(cnt1, ch2);
9552 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9553 __ b(DONE);
9554 __ align(OptoLoopAlignment);
9555 __ BIND(L_CMP_LOOP_LAST_CMP2);
9556 if (str2_isL) {
9557 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
9558 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9559 __ lslv(tmp2, tmp2, tmp4);
9560 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9561 __ add(tmp4, tmp4, 1);
9562 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9563 __ lsl(tmp2, tmp2, 1);
9564 } else {
9565 __ mov(ch2, 0xE);
9566 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
9567 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
9568 __ lslv(tmp2, tmp2, tmp4);
9569 __ add(tmp4, tmp4, 1);
9570 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
9571 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
9572 __ lsl(tmp2, tmp2, 1);
9573 __ sub(str2, str2, str2_chr_size);
9574 }
9575 __ cmp(ch1, ch2);
9576 __ br(__ NE, L_CMP_LOOP_NOMATCH);
9577 __ b(DONE);
9578 __ align(OptoLoopAlignment);
9579 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
9580 // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until
9581 // the L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP,
9582 // so result was increased by at most wordSize/str2_chr_size - 1 and the
9583 // respective high bits were not changed. L_LOOP_PROCEED will increase
9584 // result by the number of analyzed characters, so we can just reset the
9585 // lower bits in result here: clear 2 lower bits for UU/UL and 3 bits for LL.
9586 // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
9587 // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the
9588 // index of the last analyzed substring inside the current octet, so str2 is
9589 // at the respective start address; we need to advance it to the next octet.
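// Roughly, in scalar terms (an illustrative sketch; the code below interleaves
// these steps):
//
//   analyzed = result & (wordSize/str2_chr_size - 1);
//   result  &= ~(wordSize/str2_chr_size - 1);       // step 1
//   cnt1 = cnt2 >> 32;  cnt2 = (uint32_t)cnt2;      // step 2
//   str2 -= analyzed << str2_chr_shift;             // step 3 (back to octet start)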
9590 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
9591 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
9592 __ bfm(result, zr, 0, 2 - str2_chr_shift);
9593 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
9594 __ movw(cnt2, cnt2);
9595 __ b(L_LOOP_PROCEED);
9596 __ align(OptoLoopAlignment);
9597 __ BIND(NOMATCH);
9598 __ mov(result, -1);
9599 __ BIND(DONE);
9600 __ pop(spilled_regs, sp);
9601 __ ret(lr);
9602 return entry;
9603 }
9604
9605 void generate_string_indexof_stubs() {
9606 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
9607 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
9608 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
9609 }
9610
9611 void inflate_and_store_2_fp_registers(bool generatePrfm,
9612 FloatRegister src1, FloatRegister src2) {
9613 Register dst = r1;
9614 __ zip1(v1, __ T16B, src1, v0);
9615 __ zip2(v2, __ T16B, src1, v0);
9616 if (generatePrfm) {
9617 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
9618 }
9619 __ zip1(v3, __ T16B, src2, v0);
9620 __ zip2(v4, __ T16B, src2, v0);
9621 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
9622 }
9623
9624 // R0 = src
9625 // R1 = dst
9626 // R2 = len
9627 // R3 = len >> 3
9628 // V0 = 0
9629 // v1 = loaded 8 bytes
9630 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
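// The inflation itself, both here and in inflate_and_store_2_fp_registers
// above, widens each Latin-1 byte to a little-endian UTF-16 char by
// interleaving the data with the zero register v0 (zip1/zip2). A rough
// scalar equivalent, for illustration only:
//
//   void inflate(const uint8_t* src, uint16_t* dst, size_t len) {
//     for (size_t i = 0; i < len; i++) {
//       dst[i] = src[i];   // zero-extend byte -> char
//     }
//   }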
9631 address generate_large_byte_array_inflate() {
9632 __ align(CodeEntryAlignment);
9633 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
9634 StubCodeMark mark(this, stub_id);
9635 address entry = __ pc();
9636 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
9637 Register src = r0, dst = r1, len = r2, octetCounter = r3;
9638 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
9639
9640 // do one more 8-byte read so that the address is 16-byte aligned in most cases;
9641 // this also lets us use a single store instruction
9642 __ ldrd(v2, __ post(src, 8));
9643 __ sub(octetCounter, octetCounter, 2);
9644 __ zip1(v1, __ T16B, v1, v0);
9645 __ zip1(v2, __ T16B, v2, v0);
9646 __ st1(v1, v2, __ T16B, __ post(dst, 32));
9647 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9648 __ subs(rscratch1, octetCounter, large_loop_threshold);
9649 __ br(__ LE, LOOP_START);
9650 __ b(LOOP_PRFM_START);
9651 __ bind(LOOP_PRFM);
9652 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9653 __ bind(LOOP_PRFM_START);
9654 __ prfm(Address(src, SoftwarePrefetchHintDistance));
9655 __ sub(octetCounter, octetCounter, 8);
9656 __ subs(rscratch1, octetCounter, large_loop_threshold);
9657 inflate_and_store_2_fp_registers(true, v3, v4);
9658 inflate_and_store_2_fp_registers(true, v5, v6);
9659 __ br(__ GT, LOOP_PRFM);
9660 __ cmp(octetCounter, (u1)8);
9661 __ br(__ LT, DONE);
9662 __ bind(LOOP);
9663 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
9664 __ bind(LOOP_START);
9665 __ sub(octetCounter, octetCounter, 8);
9666 __ cmp(octetCounter, (u1)8);
9667 inflate_and_store_2_fp_registers(false, v3, v4);
9668 inflate_and_store_2_fp_registers(false, v5, v6);
9669 __ br(__ GE, LOOP);
9670 __ bind(DONE);
9671 __ ret(lr);
9672 return entry;
9673 }
9674
9675 /**
9676 * Arguments:
9677 *
9678 * Input:
9679 * c_rarg0 - current state address
9680 * c_rarg1 - H key address
9681 * c_rarg2 - data address
9682 * c_rarg3 - number of blocks
9683 *
9684 * Output:
9685 * Updated state at c_rarg0
9686 */
9687 address generate_ghash_processBlocks() {
9688 // Bafflingly, GCM uses little-endian for the byte order, but
9689 // big-endian for the bit order. For example, the polynomial 1 is
9690 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
9691 //
9692 // So, we must either reverse the bytes in each word and do
9693 // everything big-endian or reverse the bits in each byte and do
9694 // it little-endian. On AArch64 it's more idiomatic to reverse
9695 // the bits in each byte (we have an instruction, RBIT, to do
9696 // that) and keep the data in little-endian bit order through the
9697 // calculation, bit-reversing the inputs and outputs.
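// For example, under this convention the GCM encoding of the polynomial 1,
// i.e. the byte string 80 00 .. 00 above, becomes 01 00 .. 00 once each byte
// has been RBIT-ed, which is simply the little-endian 128-bit integer 1.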
9698
9699 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
9700 StubCodeMark mark(this, stub_id);
9701 Label polynomial; // local data generated at end of stub
9702 __ align(CodeEntryAlignment);
9703 address start = __ pc();
9704
9705 Register state = c_rarg0;
9706 Register subkeyH = c_rarg1;
9707 Register data = c_rarg2;
9708 Register blocks = c_rarg3;
9709
9710 FloatRegister vzr = v30;
9711 __ eor(vzr, __ T16B, vzr, vzr); // zero register
9712
9713 __ adr(rscratch1, polynomial);
9714 __ ldrq(v24, rscratch1); // The field polynomial
9715
9716 __ ldrq(v0, Address(state));
9717 __ ldrq(v1, Address(subkeyH));
9718
9719 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
9720 __ rbit(v0, __ T16B, v0);
9721 __ rev64(v1, __ T16B, v1);
9722 __ rbit(v1, __ T16B, v1);
9723
9724 __ ext(v4, __ T16B, v1, v1, 0x08); // v4 = subkeyH (in v1) with its 64-bit halves swapped
9725 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
9726
9727 {
9728 Label L_ghash_loop;
9729 __ bind(L_ghash_loop);
9730
9731 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
9732 // reversing each byte
9733 __ rbit(v2, __ T16B, v2);
9734 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
9735
9736 // Multiply state in v2 by subkey in v1
9737 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
9738 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
9739 /*temps*/v6, v3, /*reuse/clobber b*/v2);
9740 // Reduce v7:v5 by the field polynomial
9741 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
9742
9743 __ sub(blocks, blocks, 1);
9744 __ cbnz(blocks, L_ghash_loop);
9745 }
9746
9747 // The bit-reversed result is at this point in v0
9748 __ rev64(v0, __ T16B, v0);
9749 __ rbit(v0, __ T16B, v0);
9750
9751 __ st1(v0, __ T16B, state);
9752 __ ret(lr);
9753
9754 // bind label and generate local polynomial data
9755 __ align(wordSize * 2);
9756 __ bind(polynomial);
9757 __ emit_int64(0x87); // The low-order bits of the field
9758 // polynomial (i.e. p = z^7+z^2+z+1)
9759 // repeated in the low and high parts of a
9760 // 128-bit vector
9761 __ emit_int64(0x87);
9762
9763 return start;
9764 }
9765
9766 address generate_ghash_processBlocks_wide() {
9767 address small = generate_ghash_processBlocks();
9768
9769 StubId stub_id = StubId::stubgen_ghash_processBlocks_wide_id;
9770 StubCodeMark mark(this, stub_id);
9771 Label polynomial; // local data generated after stub
9772 __ align(CodeEntryAlignment);
9773 address start = __ pc();
9774
9775 Register state = c_rarg0;
9776 Register subkeyH = c_rarg1;
9777 Register data = c_rarg2;
9778 Register blocks = c_rarg3;
9779
9780 const int unroll = 4;
9781
9782 __ cmp(blocks, (unsigned char)(unroll * 2));
9783 __ br(__ LT, small);
9784
9785 if (unroll > 1) {
9786 // Save state before entering routine
9787 __ sub(sp, sp, 4 * 16);
9788 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
9789 __ sub(sp, sp, 4 * 16);
9790 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
9791 }
9792
9793 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
9794
9795 if (unroll > 1) {
9796 // And restore state
9797 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
9798 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
9799 }
9800
9801 __ cmp(blocks, (unsigned char)0);
9802 __ br(__ GT, small);
9803
9804 __ ret(lr);
9805
9806 // bind label and generate polynomial data
9807 __ align(wordSize * 2);
9808 __ bind(polynomial);
9809 __ emit_int64(0x87); // The low-order bits of the field
9810 // polynomial (i.e. p = z^7+z^2+z+1)
9811 // repeated in the low and high parts of a
9812 // 128-bit vector
9813 __ emit_int64(0x87);
9814
9815 return start;
9816
9817 }
9818
9819 void generate_base64_encode_simdround(Register src, Register dst,
9820 FloatRegister codec, u8 size) {
9821
9822 FloatRegister in0 = v4, in1 = v5, in2 = v6;
9823 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
9824 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
9825
9826 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9827
9828 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
9829
9830 __ ushr(ind0, arrangement, in0, 2);
9831
9832 __ ushr(ind1, arrangement, in1, 2);
9833 __ shl(in0, arrangement, in0, 6);
9834 __ orr(ind1, arrangement, ind1, in0);
9835 __ ushr(ind1, arrangement, ind1, 2);
9836
9837 __ ushr(ind2, arrangement, in2, 4);
9838 __ shl(in1, arrangement, in1, 4);
9839 __ orr(ind2, arrangement, in1, ind2);
9840 __ ushr(ind2, arrangement, ind2, 2);
9841
9842 __ shl(ind3, arrangement, in2, 2);
9843 __ ushr(ind3, arrangement, ind3, 2);
9844
9845 __ tbl(out0, arrangement, codec, 4, ind0);
9846 __ tbl(out1, arrangement, codec, 4, ind1);
9847 __ tbl(out2, arrangement, codec, 4, ind2);
9848 __ tbl(out3, arrangement, codec, 4, ind3);
9849
9850 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
9851 }
9852
9853 /**
9854 * Arguments:
9855 *
9856 * Input:
9857 * c_rarg0 - src_start
9858 * c_rarg1 - src_offset
9859 * c_rarg2 - src_length
9860 * c_rarg3 - dest_start
9861 * c_rarg4 - dest_offset
9862 * c_rarg5 - isURL
9863 *
9864 */
9865 address generate_base64_encodeBlock() {
9866
9867 static const char toBase64[64] = {
9868 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9869 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9870 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9871 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9872 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
9873 };
9874
9875 static const char toBase64URL[64] = {
9876 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
9877 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
9878 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
9879 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
9880 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
9881 };
9882
9883 __ align(CodeEntryAlignment);
9884 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
9885 StubCodeMark mark(this, stub_id);
9886 address start = __ pc();
9887
9888 Register src = c_rarg0; // source array
9889 Register soff = c_rarg1; // source start offset
9890 Register send = c_rarg2; // source end offset
9891 Register dst = c_rarg3; // dest array
9892 Register doff = c_rarg4; // position for writing to dest array
9893 Register isURL = c_rarg5; // Base64 or URL character set
9894
9895 // c_rarg6 and c_rarg7 are free to use as temps
9896 Register codec = c_rarg6;
9897 Register length = c_rarg7;
9898
9899 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
9900
9901 __ add(src, src, soff);
9902 __ add(dst, dst, doff);
9903 __ sub(length, send, soff);
9904
9905 // load the codec base address
9906 __ lea(codec, ExternalAddress((address) toBase64));
9907 __ cbz(isURL, ProcessData);
9908 __ lea(codec, ExternalAddress((address) toBase64URL));
9909
9910 __ BIND(ProcessData);
9911
9912 // too short to form a SIMD loop; fall back to the 3-byte scalar loop
9913 __ cmp(length, (u1)24);
9914 __ br(Assembler::LT, Process3B);
9915
9916 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
9917
9918 __ BIND(Process48B);
9919 __ cmp(length, (u1)48);
9920 __ br(Assembler::LT, Process24B);
9921 generate_base64_encode_simdround(src, dst, v0, 16);
9922 __ sub(length, length, 48);
9923 __ b(Process48B);
9924
9925 __ BIND(Process24B);
9926 __ cmp(length, (u1)24);
9927 __ br(Assembler::LT, SIMDExit);
9928 generate_base64_encode_simdround(src, dst, v0, 8);
9929 __ sub(length, length, 24);
9930
9931 __ BIND(SIMDExit);
9932 __ cbz(length, Exit);
9933
9934 __ BIND(Process3B);
9935 // 3 src bytes, 24 bits
9936 __ ldrb(r10, __ post(src, 1));
9937 __ ldrb(r11, __ post(src, 1));
9938 __ ldrb(r12, __ post(src, 1));
9939 __ orrw(r11, r11, r10, Assembler::LSL, 8);
9940 __ orrw(r12, r12, r11, Assembler::LSL, 8);
9941 // codec index
9942 __ ubfmw(r15, r12, 18, 23);
9943 __ ubfmw(r14, r12, 12, 17);
9944 __ ubfmw(r13, r12, 6, 11);
9945 __ andw(r12, r12, 63);
9946 // get the code based on the codec
9947 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
9948 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
9949 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
9950 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
9951 __ strb(r15, __ post(dst, 1));
9952 __ strb(r14, __ post(dst, 1));
9953 __ strb(r13, __ post(dst, 1));
9954 __ strb(r12, __ post(dst, 1));
9955 __ sub(length, length, 3);
9956 __ cbnz(length, Process3B);
9957
9958 __ BIND(Exit);
9959 __ ret(lr);
9960
9961 return start;
9962 }
9963
9964 void generate_base64_decode_simdround(Register src, Register dst,
9965 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
9966
9967 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
9968 FloatRegister out0 = v20, out1 = v21, out2 = v22;
9969
9970 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
9971 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
9972
9973 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
9974
9975 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
9976
9977 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
9978
9979 // we need an unsigned saturating subtract to make sure that all input values
9980 // in the range [0, 63] yield 0U in the higher-half lookup
9981 __ uqsubv(decH0, __ T16B, in0, v27);
9982 __ uqsubv(decH1, __ T16B, in1, v27);
9983 __ uqsubv(decH2, __ T16B, in2, v27);
9984 __ uqsubv(decH3, __ T16B, in3, v27);
9985
9986 // lower half lookup
9987 __ tbl(decL0, arrangement, codecL, 4, in0);
9988 __ tbl(decL1, arrangement, codecL, 4, in1);
9989 __ tbl(decL2, arrangement, codecL, 4, in2);
9990 __ tbl(decL3, arrangement, codecL, 4, in3);
9991
9992 // higher half lookup
9993 __ tbx(decH0, arrangement, codecH, 4, decH0);
9994 __ tbx(decH1, arrangement, codecH, 4, decH1);
9995 __ tbx(decH2, arrangement, codecH, 4, decH2);
9996 __ tbx(decH3, arrangement, codecH, 4, decH3);
9997
9998 // combine lower and higher
9999 __ orr(decL0, arrangement, decL0, decH0);
10000 __ orr(decL1, arrangement, decL1, decH1);
10001 __ orr(decL2, arrangement, decL2, decH2);
10002 __ orr(decL3, arrangement, decL3, decH3);
10003
10004 // check for illegal inputs: values larger than 63 (the maximum for 6 bits)
10005 __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10006 __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10007 __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10008 __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10009 __ orr(in0, arrangement, decH0, decH1);
10010 __ orr(in1, arrangement, decH2, decH3);
10011 __ orr(in2, arrangement, in0, in1);
10012 __ umaxv(in3, arrangement, in2);
10013 __ umov(rscratch2, in3, __ B, 0);
10014
10015 // get the data to output
10016 __ shl(out0, arrangement, decL0, 2);
10017 __ ushr(out1, arrangement, decL1, 4);
10018 __ orr(out0, arrangement, out0, out1);
10019 __ shl(out1, arrangement, decL1, 4);
10020 __ ushr(out2, arrangement, decL2, 2);
10021 __ orr(out1, arrangement, out1, out2);
10022 __ shl(out2, arrangement, decL2, 6);
10023 __ orr(out2, arrangement, out2, decL3);
10024
10025 __ cbz(rscratch2, NoIllegalData);
10026
10027 // handle illegal input
10028 __ umov(r10, in2, __ D, 0);
10029 if (size == 16) {
10030 __ cbnz(r10, ErrorInLowerHalf);
10031
10032 // illegal input is in higher half, store the lower half now.
10033 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10034
10035 __ umov(r10, in2, __ D, 1);
10036 __ umov(r11, out0, __ D, 1);
10037 __ umov(r12, out1, __ D, 1);
10038 __ umov(r13, out2, __ D, 1);
10039 __ b(StoreLegalData);
10040
10041 __ BIND(ErrorInLowerHalf);
10042 }
10043 __ umov(r11, out0, __ D, 0);
10044 __ umov(r12, out1, __ D, 0);
10045 __ umov(r13, out2, __ D, 0);
10046
10047 __ BIND(StoreLegalData);
10048 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10049 __ strb(r11, __ post(dst, 1));
10050 __ strb(r12, __ post(dst, 1));
10051 __ strb(r13, __ post(dst, 1));
10052 __ lsr(r10, r10, 8);
10053 __ lsr(r11, r11, 8);
10054 __ lsr(r12, r12, 8);
10055 __ lsr(r13, r13, 8);
10056 __ b(StoreLegalData);
10057
10058 __ BIND(NoIllegalData);
10059 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10060 }
10061
10062
10063 /**
10064 * Arguments:
10065 *
10066 * Input:
10067 * c_rarg0 - src_start
10068 * c_rarg1 - src_offset
10069 * c_rarg2 - src_length
10070 * c_rarg3 - dest_start
10071 * c_rarg4 - dest_offset
10072 * c_rarg5 - isURL
10073 * c_rarg6 - isMIME
10074 *
10075 */
10076 address generate_base64_decodeBlock() {
10077
10078 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10079 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10080 // titled "Base64 decoding".
10081
10082 // The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
10083 // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That
10084 // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
10085 static const uint8_t fromBase64ForNoSIMD[256] = {
10086 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10087 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10088 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10089 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10090 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10091 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
10092 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10093 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10094 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10095 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10096 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10097 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10098 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10099 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10100 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10101 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10102 };
10103
10104 static const uint8_t fromBase64URLForNoSIMD[256] = {
10105 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10106 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10107 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10108 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10109 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
10110 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
10111 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
10112 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
10113 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10114 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10115 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10116 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10117 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10118 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10119 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10120 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10121 };
10122
10123 // A legal base64 code value is in the range [0, 127]. We need two lookups
10124 // with tbl/tbx and combine them to get the decoded data. The 1st table vector
10125 // lookup uses tbl; out-of-range indices are set to 0 in the destination. The 2nd
10126 // table vector lookup uses tbx; out-of-range indices are left unchanged in the
10127 // destination. Input values [64..126] are mapped to indices [65..127] in the
10128 // second lookup. The value at index 64 is set to 0, so that we know the decoded
10129 // data was already obtained by the 1st lookup.
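// For example, the input byte 'A' (65) is out of range for the first tbl
// lookup and yields 0 there; the saturating subtract gives 65 - 63 = 2, so
// the tbx lookup reads entry 64 + 2 = 66 of the table below, which is 0 --
// the decoded value of 'A'. Likewise '_' (95) reaches entry 96 of the URL
// table, which is 63.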
10130 static const uint8_t fromBase64ForSIMD[128] = {
10131 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10132 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10133 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
10134 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10135 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10136 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10137 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10138 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10139 };
10140
10141 static const uint8_t fromBase64URLForSIMD[128] = {
10142 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10143 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
10144 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
10145 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
10146 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
10147 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
10148 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
10149 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
10150 };
10151
10152 __ align(CodeEntryAlignment);
10153 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10154 StubCodeMark mark(this, stub_id);
10155 address start = __ pc();
10156
10157 Register src = c_rarg0; // source array
10158 Register soff = c_rarg1; // source start offset
10159 Register send = c_rarg2; // source end offset
10160 Register dst = c_rarg3; // dest array
10161 Register doff = c_rarg4; // position for writing to dest array
10162 Register isURL = c_rarg5; // Base64 or URL character set
10163 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation
10164
10165 Register length = send; // reuse send as length of source data to process
10166
10167 Register simd_codec = c_rarg6;
10168 Register nosimd_codec = c_rarg7;
10169
10170 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10171
10172 __ enter();
10173
10174 __ add(src, src, soff);
10175 __ add(dst, dst, doff);
10176
10177 __ mov(doff, dst); // remember the output start; used to compute the return value
10178
10179 __ sub(length, send, soff);
10180 __ bfm(length, zr, 0, 1); // clear the low 2 bits: round length down to a multiple of 4
10181
10182 __ lea(nosimd_codec, ExternalAddress((address) fromBase64ForNoSIMD));
10183 __ cbz(isURL, ProcessData);
10184 __ lea(nosimd_codec, ExternalAddress((address) fromBase64URLForNoSIMD));
10185
10186 __ BIND(ProcessData);
10187 __ mov(rscratch1, length);
10188 __ cmp(length, (u1)144); // 144 = 80 + 64
10189 __ br(Assembler::LT, Process4B);
10190
10191 // In the MIME case, the line length cannot be more than 76
10192 // bytes (see RFC 2045). This is too short a block for SIMD
10193 // to be worthwhile, so we use non-SIMD here.
10194 __ movw(rscratch1, 79);
10195
10196 __ BIND(Process4B);
10197 __ ldrw(r14, __ post(src, 4));
10198 __ ubfxw(r10, r14, 0, 8);
10199 __ ubfxw(r11, r14, 8, 8);
10200 __ ubfxw(r12, r14, 16, 8);
10201 __ ubfxw(r13, r14, 24, 8);
10202 // get the decoded values from the lookup table
10203 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10204 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10205 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10206 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10207 // error detection, 255u indicates an illegal input
10208 __ orrw(r14, r10, r11);
10209 __ orrw(r15, r12, r13);
10210 __ orrw(r14, r14, r15);
10211 __ tbnz(r14, 7, Exit);
10212 // recover the data
10213 __ lslw(r14, r10, 10);
10214 __ bfiw(r14, r11, 4, 6);
10215 __ bfmw(r14, r12, 2, 5);
10216 __ rev16w(r14, r14);
10217 __ bfiw(r13, r12, 6, 2);
10218 __ strh(r14, __ post(dst, 2));
10219 __ strb(r13, __ post(dst, 1));
10220 // non-simd loop
10221 __ subsw(rscratch1, rscratch1, 4);
10222 __ br(Assembler::GT, Process4B);
10223
10224 // if exiting from the 80-byte pre-processing above, rscratch1 == -1;
10225 // otherwise, rscratch1 == 0.
10226 __ cbzw(rscratch1, Exit);
10227 __ sub(length, length, 80);
10228
10229 __ lea(simd_codec, ExternalAddress((address) fromBase64ForSIMD));
10230 __ cbz(isURL, SIMDEnter);
10231 __ lea(simd_codec, ExternalAddress((address) fromBase64URLForSIMD));
10232
10233 __ BIND(SIMDEnter);
10234 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10235 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10236 __ mov(rscratch1, 63);
10237 __ dup(v27, __ T16B, rscratch1);
10238
10239 __ BIND(Process64B);
10240 __ cmp(length, (u1)64);
10241 __ br(Assembler::LT, Process32B);
10242 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10243 __ sub(length, length, 64);
10244 __ b(Process64B);
10245
10246 __ BIND(Process32B);
10247 __ cmp(length, (u1)32);
10248 __ br(Assembler::LT, SIMDExit);
10249 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10250 __ sub(length, length, 32);
10251 __ b(Process32B);
10252
10253 __ BIND(SIMDExit);
10254 __ cbz(length, Exit);
10255 __ movw(rscratch1, length);
10256 __ b(Process4B);
10257
10258 __ BIND(Exit);
10259 __ sub(c_rarg0, dst, doff);
10260
10261 __ leave();
10262 __ ret(lr);
10263
10264 return start;
10265 }
10266
10267 // Support for spin waits.
10268 address generate_spin_wait() {
10269 __ align(CodeEntryAlignment);
10270 StubId stub_id = StubId::stubgen_spin_wait_id;
10271 StubCodeMark mark(this, stub_id);
10272 address start = __ pc();
10273
10274 __ spin_wait();
10275 __ ret(lr);
10276
10277 return start;
10278 }
10279
10280 void generate_lookup_secondary_supers_table_stub() {
10281 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10282 StubCodeMark mark(this, stub_id);
10283
10284 const Register
10285 r_super_klass = r0,
10286 r_array_base = r1,
10287 r_array_length = r2,
10288 r_array_index = r3,
10289 r_sub_klass = r4,
10290 r_bitmap = rscratch2,
10291 result = r5;
10292 const FloatRegister
10293 vtemp = v0;
10294
10295 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10296 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = __ pc();
10297 Label L_success;
10298 __ enter();
10299 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10300 r_array_base, r_array_length, r_array_index,
10301 vtemp, result, slot,
10302 /*stub_is_near*/true);
10303 __ leave();
10304 __ ret(lr);
10305 }
10306 }
10307
10308 // Slow path implementation for UseSecondarySupersTable.
10309 address generate_lookup_secondary_supers_table_slow_path_stub() {
10310 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10311 StubCodeMark mark(this, stub_id);
10312
10313 address start = __ pc();
10314 const Register
10315 r_super_klass = r0, // argument
10316 r_array_base = r1, // argument
10317 temp1 = r2, // temp
10318 r_array_index = r3, // argument
10319 r_bitmap = rscratch2, // argument
10320 result = r5; // argument
10321
10322 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
10323 __ ret(lr);
10324
10325 return start;
10326 }
10327
10328 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
10329
10330 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
10331 //
10332 // If LSE is in use, generate LSE versions of all the stubs. The
10333 // non-LSE versions are in atomic_aarch64.S.
10334
10335 // class AtomicStubMark records the entry point of a stub and the
10336 // stub pointer which will point to it. The stub pointer is set to
10337 // the entry point when ~AtomicStubMark() is called, which must be
10338 // after ICache::invalidate_range. This ensures safe publication of
10339 // the generated code.
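// Typical use, as in generate_atomic_entry_points() below ("foo" names are
// placeholders for illustration):
//
//   AtomicStubMark mark_foo(_masm, &aarch64_atomic_foo_impl);
//   gen_foo_entry(...);                     // emit the stub body
//   ...
//   ICache::invalidate_range(first_entry, __ pc() - first_entry);
//   // ...the ~AtomicStubMark destructors then publish the entry points
//   // when the marks go out of scope.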
10340 class AtomicStubMark {
10341 address _entry_point;
10342 aarch64_atomic_stub_t *_stub;
10343 MacroAssembler *_masm;
10344 public:
10345 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
10346 _masm = masm;
10347 __ align(32);
10348 _entry_point = __ pc();
10349 _stub = stub;
10350 }
10351 ~AtomicStubMark() {
10352 *_stub = (aarch64_atomic_stub_t)_entry_point;
10353 }
10354 };
10355
10356 // NB: For memory_order_conservative we need a trailing membar after
10357 // LSE atomic operations but not a leading membar.
10358 //
10359 // We don't need a leading membar because a clause in the Arm ARM
10360 // says:
10361 //
10362 // Barrier-ordered-before
10363 //
10364 // Barrier instructions order prior Memory effects before subsequent
10365 // Memory effects generated by the same Observer. A read or a write
10366 // RW1 is Barrier-ordered-before a read or a write RW2 from the same
10367 // Observer if and only if RW1 appears in program order before RW2
10368 // and [ ... ] at least one of RW1 and RW2 is generated by an atomic
10369 // instruction with both Acquire and Release semantics.
10370 //
10371 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
10372 // and Release semantics, therefore we don't need a leading
10373 // barrier. However, there is no corresponding Barrier-ordered-after
10374 // relationship, therefore we need a trailing membar to prevent a
10375 // later store or load from being reordered with the store in an
10376 // atomic instruction.
10377 //
10378 // This was checked by using the herd7 consistency model simulator
10379 // (http://diy.inria.fr/) with this test case:
10380 //
10381 // AArch64 LseCas
10382 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
10383 // P0 | P1;
10384 // LDR W4, [X2] | MOV W3, #0;
10385 // DMB LD | MOV W4, #1;
10386 // LDR W3, [X1] | CASAL W3, W4, [X1];
10387 // | DMB ISH;
10388 // | STR W4, [X2];
10389 // exists
10390 // (0:X3=0 /\ 0:X4=1)
10391 //
10392 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
10393 // with the store to x in P1. Without the DMB in P1 this may happen.
10394 //
10395 // At the time of writing we don't know of any AArch64 hardware that
10396 // reorders stores in this way, but the Reference Manual permits it.
10397
10398 void gen_cas_entry(Assembler::operand_size size,
10399 atomic_memory_order order) {
10400 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
10401 exchange_val = c_rarg2;
10402 bool acquire, release;
10403 switch (order) {
10404 case memory_order_relaxed:
10405 acquire = false;
10406 release = false;
10407 break;
10408 case memory_order_release:
10409 acquire = false;
10410 release = true;
10411 break;
10412 default:
10413 acquire = true;
10414 release = true;
10415 break;
10416 }
10417 __ mov(prev, compare_val);
10418 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
10419 if (order == memory_order_conservative) {
10420 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10421 }
10422 if (size == Assembler::xword) {
10423 __ mov(r0, prev);
10424 } else {
10425 __ movw(r0, prev);
10426 }
10427 __ ret(lr);
10428 }
10429
10430 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
10431 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10432 // If not relaxed, then default to conservative. Relaxed is the only
10433 // case we use enough to be worth specializing.
10434 if (order == memory_order_relaxed) {
10435 __ ldadd(size, incr, prev, addr);
10436 } else {
10437 __ ldaddal(size, incr, prev, addr);
10438 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10439 }
10440 if (size == Assembler::xword) {
10441 __ mov(r0, prev);
10442 } else {
10443 __ movw(r0, prev);
10444 }
10445 __ ret(lr);
10446 }
10447
10448 void gen_swpal_entry(Assembler::operand_size size) {
10449 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
10450 __ swpal(size, incr, prev, addr);
10451 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
10452 if (size == Assembler::xword) {
10453 __ mov(r0, prev);
10454 } else {
10455 __ movw(r0, prev);
10456 }
10457 __ ret(lr);
10458 }
10459
10460 void generate_atomic_entry_points() {
10461 if (! UseLSE) {
10462 return;
10463 }
10464 __ align(CodeEntryAlignment);
10465 StubId stub_id = StubId::stubgen_atomic_entry_points_id;
10466 StubCodeMark mark(this, stub_id);
10467 address first_entry = __ pc();
10468
10469 // ADD, memory_order_conservative
10470 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
10471 gen_ldadd_entry(Assembler::word, memory_order_conservative);
10472 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
10473 gen_ldadd_entry(Assembler::xword, memory_order_conservative);
10474
10475 // ADD, memory_order_relaxed
10476 AtomicStubMark mark_fetch_add_4_relaxed
10477 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
10478 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
10479 AtomicStubMark mark_fetch_add_8_relaxed
10480 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
10481 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
10482
10483 // XCHG, memory_order_conservative
10484 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
10485 gen_swpal_entry(Assembler::word);
10486 AtomicStubMark mark_xchg_8_impl(_masm, &aarch64_atomic_xchg_8_impl);
10487 gen_swpal_entry(Assembler::xword);
10488
10489 // CAS, memory_order_conservative
10490 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
10491 gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
10492 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
10493 gen_cas_entry(MacroAssembler::word, memory_order_conservative);
10494 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
10495 gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
10496
10497 // CAS, memory_order_relaxed
10498 AtomicStubMark mark_cmpxchg_1_relaxed
10499 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
10500 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
10501 AtomicStubMark mark_cmpxchg_4_relaxed
10502 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
10503 gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
10504 AtomicStubMark mark_cmpxchg_8_relaxed
10505 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
10506 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
10507
10508 AtomicStubMark mark_cmpxchg_4_release
10509 (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
10510 gen_cas_entry(MacroAssembler::word, memory_order_release);
10511 AtomicStubMark mark_cmpxchg_8_release
10512 (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
10513 gen_cas_entry(MacroAssembler::xword, memory_order_release);
10514
10515 AtomicStubMark mark_cmpxchg_4_seq_cst
10516 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
10517 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
10518 AtomicStubMark mark_cmpxchg_8_seq_cst
10519 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
10520 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
10521
10522 ICache::invalidate_range(first_entry, __ pc() - first_entry);
10523 }
10524 #endif // LINUX && !__ARM_FEATURE_ATOMICS
10525
10526 address generate_cont_thaw(Continuation::thaw_kind kind) {
10527 bool return_barrier = Continuation::is_thaw_return_barrier(kind);
10528 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
10529
10530 address start = __ pc();
10531
10532 if (return_barrier) {
10533 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
10534 __ mov(sp, rscratch1);
10535 }
10536 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10537
10538 if (return_barrier) {
10539 // preserve possible return value from a method returning to the return barrier
10540 __ fmovd(rscratch1, v0);
10541 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10542 }
10543
10544 __ movw(c_rarg1, (return_barrier ? 1 : 0));
10545 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
10546 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
10547
10548 if (return_barrier) {
10549 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10550 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10551 __ fmovd(v0, rscratch1);
10552 }
10553 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
10554
10555
10556 Label thaw_success;
10557 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
10558 __ cbnz(rscratch2, thaw_success);
10559 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
10560 __ br(rscratch1);
10561 __ bind(thaw_success);
10562
10563 // make room for the thawed frames
10564 __ sub(rscratch1, sp, rscratch2);
10565 __ andr(rscratch1, rscratch1, -16); // align
10566 __ mov(sp, rscratch1);
10567
10568 if (return_barrier) {
10569 // save original return value -- again
10570 __ fmovd(rscratch1, v0);
10571 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
10572 }
10573
10574 // If we want, we can templatize thaw by kind, and have three different entries
10575 __ movw(c_rarg1, (uint32_t)kind);
10576
10577 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
10578 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
10579
10580 if (return_barrier) {
10581 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
10582 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
10583 __ fmovd(v0, rscratch1);
10584 } else {
10585 __ mov(r0, zr); // return 0 (success) from doYield
10586 }
10587
10588 // we're now on the yield frame (which is at an address above us because sp has been pushed down)
10589 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
10590 __ mov(rfp, sp);
10591
10592 if (return_barrier_exception) {
10593 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
10594 __ authenticate_return_address(c_rarg1);
10595 __ verify_oop(r0);
10596 // save return value containing the exception oop in callee-saved R19
10597 __ mov(r19, r0);
10598
10599 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
10600
10601 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
10602 // __ reinitialize_ptrue();
10603
10604 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
10605
10606 __ mov(r1, r0); // the exception handler
10607 __ mov(r0, r19); // restore return value containing the exception oop
10608 __ verify_oop(r0);
10609
10610 __ leave();
10611 __ mov(r3, lr);
10612 __ br(r1); // the exception handler
10613 } else {
10614 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
10615 __ leave();
10616 __ ret(lr);
10617 }
10618
10619 return start;
10620 }
10621
10622 address generate_cont_thaw() {
10623 if (!Continuations::enabled()) return nullptr;
10624
10625 StubId stub_id = StubId::stubgen_cont_thaw_id;
10626 StubCodeMark mark(this, stub_id);
10627 address start = __ pc();
10628 generate_cont_thaw(Continuation::thaw_top);
10629 return start;
10630 }
10631
10632 address generate_cont_returnBarrier() {
10633 if (!Continuations::enabled()) return nullptr;
10634
10635 // TODO: will probably need multiple return barriers depending on return type
10636 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
10637 StubCodeMark mark(this, stub_id);
10638 address start = __ pc();
10639
10640 generate_cont_thaw(Continuation::thaw_return_barrier);
10641
10642 return start;
10643 }
10644
10645 address generate_cont_returnBarrier_exception() {
10646 if (!Continuations::enabled()) return nullptr;
10647
10648 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
10649 StubCodeMark mark(this, stub_id);
10650 address start = __ pc();
10651
10652 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
10653
10654 return start;
10655 }
10656
10657 address generate_cont_preempt_stub() {
10658 if (!Continuations::enabled()) return nullptr;
10659 StubId stub_id = StubId::stubgen_cont_preempt_id;
10660 StubCodeMark mark(this, stub_id);
10661 address start = __ pc();
10662
10663 __ reset_last_Java_frame(true);
10664
10665 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
10666 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
10667 __ mov(sp, rscratch2);
10668
10669 Label preemption_cancelled;
10670 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
10671 __ cbnz(rscratch1, preemption_cancelled);
10672
10673 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
10674 SharedRuntime::continuation_enter_cleanup(_masm);
10675 __ leave();
10676 __ ret(lr);
10677
10678 // We acquired the monitor after freezing the frames so call thaw to continue execution.
10679 __ bind(preemption_cancelled);
10680 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
10681 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
10682 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
10683 __ ldr(rscratch1, Address(rscratch1));
10684 __ br(rscratch1);
10685
10686 return start;
10687 }
10688
10689 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
10690 // are represented as long[5], with BITS_PER_LIMB = 26.
10691 // Pack five 26-bit limbs into three 64-bit registers.
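// A rough scalar equivalent of the packing below, assuming the limbs are in
// their canonical (at most 26-bit) form; limb0..limb4 are the five array
// elements read from src (illustration only):
//
//   dest0 =  limb0        | (limb1 << 26) | (limb2 << 52);
//   dest1 = (limb2 >> 12) | (limb3 << 14) | (limb4 << 40);
//   dest2 =  limb4 >> 24;   // at most 2 bits (or must be zero, see below)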
10692 void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
10693 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
10694 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
10695 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
10696 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
10697
10698 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
10699 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
10700 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
10701 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
10702
10703 if (dest2->is_valid()) {
10704 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
10705 } else {
10706 #ifdef ASSERT
10707 Label OK;
10708 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
10709 __ br(__ EQ, OK);
10710 __ stop("high bits of Poly1305 integer should be zero");
10711 __ should_not_reach_here();
10712 __ bind(OK);
10713 #endif
10714 }
10715 }
10716
10717 // As above, but return only a 128-bit integer, packed into two
10718 // 64-bit registers.
10719 void pack_26(Register dest0, Register dest1, Register src) {
10720 pack_26(dest0, dest1, noreg, src);
10721 }
10722
10723 // Multiply and multiply-accumulate unsigned 64-bit registers.
10724 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
10725 __ mul(prod_lo, n, m);
10726 __ umulh(prod_hi, n, m);
10727 }
10728 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
10729 wide_mul(rscratch1, rscratch2, n, m);
10730 __ adds(sum_lo, sum_lo, rscratch1);
10731 __ adc(sum_hi, sum_hi, rscratch2);
10732 }
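// In effect:
//   (prod_hi:prod_lo)  =  n * m    // full 128-bit product
//   (sum_hi:sum_lo)   +=  n * m    // 128-bit accumulate, no carry-out kept
// The Poly1305 code below relies on this accumulation never carrying out of
// the high word; see the NB comment in generate_poly1305_processBlocks().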
10733
10734 // Poly1305, RFC 7539
10735
10736 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
10737 // description of the tricks used to simplify and accelerate this
10738 // computation.
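// In essence (a sketch of the arithmetic, not of the exact register-level
// steps), the per-block update computed by the loop below is
//
//   acc = ((acc + block + 2^128) * r) mod (2^130 - 5)
//
// where the 2^128 term is the "1" bit RFC 7539 appends to every full
// 16-byte block (the add(S_2, S_2, 1) below) and r is the clamped key whose
// zeroed top bits make the carry-free partial-product summation possible.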
10739
10740 address generate_poly1305_processBlocks() {
10741 __ align(CodeEntryAlignment);
10742 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
10743 StubCodeMark mark(this, stub_id);
10744 address start = __ pc();
10745 Label here;
10746 __ enter();
10747 RegSet callee_saved = RegSet::range(r19, r28);
10748 __ push(callee_saved, sp);
10749
10750 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
10751
10752 // Arguments
10753 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
10754
10755 // R_n is the 128-bit randomly-generated key, packed into two
10756 // registers. The caller passes this key to us as long[5], with
10757 // BITS_PER_LIMB = 26.
10758 const Register R_0 = *++regs, R_1 = *++regs;
10759 pack_26(R_0, R_1, r_start);
10760
10761 // RR_n is (R_n >> 2) * 5
10762 const Register RR_0 = *++regs, RR_1 = *++regs;
10763 __ lsr(RR_0, R_0, 2);
10764 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
10765 __ lsr(RR_1, R_1, 2);
10766 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
10767
10768 // U_n is the current checksum
10769 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
10770 pack_26(U_0, U_1, U_2, acc_start);
10771
10772 static constexpr int BLOCK_LENGTH = 16;
10773 Label DONE, LOOP;
10774
10775 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10776 __ br(Assembler::LT, DONE); {
10777 __ bind(LOOP);
10778
10779 // S_n is to be the sum of U_n and the next block of data
10780 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
10781 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
10782 __ adds(S_0, U_0, S_0);
10783 __ adcs(S_1, U_1, S_1);
10784 __ adc(S_2, U_2, zr);
10785 __ add(S_2, S_2, 1);
10786
10787 const Register U_0HI = *++regs, U_1HI = *++regs;
10788
10789 // NB: this logic depends on some of the special properties of
10790 // Poly1305 keys. In particular, because we know that the top
10791 // four bits of R_0 and R_1 are zero, we can add together
10792 // partial products without any risk of needing to propagate a
10793 // carry out.
10794 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
10795 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
10796 __ andr(U_2, R_0, 3);
10797 __ mul(U_2, S_2, U_2);
10798
10799 // Recycle registers S_0, S_1, S_2
10800 regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
10801
10802 // Partial reduction mod 2**130 - 5
10803 __ adds(U_1, U_0HI, U_1);
10804 __ adc(U_2, U_1HI, U_2);
10805 // Sum now in U_2:U_1:U_0.
10806 // Dead: U_0HI, U_1HI.
10807 regs = (regs.remaining() + U_0HI + U_1HI).begin();
10808
10809 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
10810
10811 // First, U_2:U_1:U_0 += (U_2 >> 2)
10812 __ lsr(rscratch1, U_2, 2);
10813 __ andr(U_2, U_2, (u8)3);
10814 __ adds(U_0, U_0, rscratch1);
10815 __ adcs(U_1, U_1, zr);
10816 __ adc(U_2, U_2, zr);
10817 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
10818 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
10819 __ adcs(U_1, U_1, zr);
10820 __ adc(U_2, U_2, zr);
10821
10822 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
10823 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
10824 __ br(~ Assembler::LT, LOOP);
10825 }
10826
10827 // Further reduce modulo 2^130 - 5
10828 __ lsr(rscratch1, U_2, 2);
10829 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
10830 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
10831 __ adcs(U_1, U_1, zr);
10832 __ andr(U_2, U_2, (u1)3);
10833 __ adc(U_2, U_2, zr);
10834
10835 // Unpack the sum into five 26-bit limbs and write to memory.
10836 __ ubfiz(rscratch1, U_0, 0, 26);
10837 __ ubfx(rscratch2, U_0, 26, 26);
10838 __ stp(rscratch1, rscratch2, Address(acc_start));
10839 __ ubfx(rscratch1, U_0, 52, 12);
10840 __ bfi(rscratch1, U_1, 12, 14);
10841 __ ubfx(rscratch2, U_1, 14, 26);
10842 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
10843 __ ubfx(rscratch1, U_1, 40, 24);
10844 __ bfi(rscratch1, U_2, 24, 3);
10845 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
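    // In C, approximately (acc[] holds five 26-bit limbs, as on entry):
    //   acc[0] = U_0 & 0x3ffffff;
    //   acc[1] = (U_0 >> 26) & 0x3ffffff;
    //   acc[2] = (U_0 >> 52) | ((U_1 & 0x3fff) << 12);
    //   acc[3] = (U_1 >> 14) & 0x3ffffff;
    //   acc[4] = (U_1 >> 40) | ((U_2 & 7) << 24);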
10846
10847 __ bind(DONE);
10848 __ pop(callee_saved, sp);
10849 __ leave();
10850 __ ret(lr);
10851
10852 return start;
10853 }
10854
10855 // exception handler for upcall stubs
10856 address generate_upcall_stub_exception_handler() {
10857 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
10858 StubCodeMark mark(this, stub_id);
10859 address start = __ pc();
10860
10861 // Native caller has no idea how to handle exceptions,
10862    // so we just crash here. It is up to the callee to catch exceptions.
10863 __ verify_oop(r0);
10864 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
10865 __ blr(rscratch1);
10866 __ should_not_reach_here();
10867
10868 return start;
10869 }
10870
10871 // load Method* target of MethodHandle
10872 // j_rarg0 = jobject receiver
10873 // rmethod = result
10874 address generate_upcall_stub_load_target() {
10875 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
10876 StubCodeMark mark(this, stub_id);
10877 address start = __ pc();
10878
10879 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
10880 // Load target method from receiver
10881 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
10882 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
10883 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
10884 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
10885 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
10886 noreg, noreg);
10887 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
10888
10889 __ ret(lr);
10890
10891 return start;
10892 }
10893
10894 #undef __
10895 #define __ masm->
10896
10897 class MontgomeryMultiplyGenerator : public MacroAssembler {
10898
10899 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
10900 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
10901
10902 RegSet _toSave;
10903 bool _squaring;
10904
10905 public:
10906 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
10907 : MacroAssembler(as->code()), _squaring(squaring) {
10908
10909 // Register allocation
10910
10911 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
10912 Pa_base = *regs; // Argument registers
10913 if (squaring)
10914 Pb_base = Pa_base;
10915 else
10916 Pb_base = *++regs;
10917 Pn_base = *++regs;
10918      Rlen = *++regs;
10919 inv = *++regs;
10920 Pm_base = *++regs;
10921
10922 // Working registers:
10923 Ra = *++regs; // The current digit of a, b, n, and m.
10924 Rb = *++regs;
10925 Rm = *++regs;
10926 Rn = *++regs;
10927
10928 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
10929 Pb = *++regs;
10930 Pm = *++regs;
10931 Pn = *++regs;
10932
10933 t0 = *++regs; // Three registers which form a
10934      t1 = *++regs;      // triple-precision accumulator.
10935 t2 = *++regs;
10936
10937 Ri = *++regs; // Inner and outer loop indexes.
10938 Rj = *++regs;
10939
10940 Rhi_ab = *++regs; // Product registers: low and high parts
10941 Rlo_ab = *++regs; // of a*b and m*n.
10942 Rhi_mn = *++regs;
10943 Rlo_mn = *++regs;
10944
10945 // r19 and up are callee-saved.
10946 _toSave = RegSet::range(r19, *regs) + Pm_base;
10947 }
10948
10949 private:
10950 void save_regs() {
10951 push(_toSave, sp);
10952 }
10953
10954 void restore_regs() {
10955 pop(_toSave, sp);
10956 }
10957
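  // Run `block` count times, emitting its body twice per loop iteration.
  // An odd count enters at the second copy (the odd label); a zero count
  // skips the loop entirely.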
10958 template <typename T>
10959 void unroll_2(Register count, T block) {
10960 Label loop, end, odd;
10961 tbnz(count, 0, odd);
10962 cbz(count, end);
10963 align(16);
10964 bind(loop);
10965 (this->*block)();
10966 bind(odd);
10967 (this->*block)();
10968 subs(count, count, 2);
10969 br(Assembler::GT, loop);
10970 bind(end);
10971 }
10972
10973 template <typename T>
10974 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
10975 Label loop, end, odd;
10976 tbnz(count, 0, odd);
10977 cbz(count, end);
10978 align(16);
10979 bind(loop);
10980 (this->*block)(d, s, tmp);
10981 bind(odd);
10982 (this->*block)(d, s, tmp);
10983 subs(count, count, 2);
10984 br(Assembler::GT, loop);
10985 bind(end);
10986 }
10987
10988 void pre1(RegisterOrConstant i) {
10989 block_comment("pre1");
10990 // Pa = Pa_base;
10991 // Pb = Pb_base + i;
10992 // Pm = Pm_base;
10993 // Pn = Pn_base + i;
10994 // Ra = *Pa;
10995 // Rb = *Pb;
10996 // Rm = *Pm;
10997 // Rn = *Pn;
10998 ldr(Ra, Address(Pa_base));
10999 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11000 ldr(Rm, Address(Pm_base));
11001 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11002 lea(Pa, Address(Pa_base));
11003 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11004 lea(Pm, Address(Pm_base));
11005 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11006
11007 // Zero the m*n result.
11008 mov(Rhi_mn, zr);
11009 mov(Rlo_mn, zr);
11010 }
11011
11012 // The core multiply-accumulate step of a Montgomery
11013 // multiplication. The idea is to schedule operations as a
11014 // pipeline so that instructions with long latencies (loads and
11015 // multiplies) have time to complete before their results are
11016 // used. This most benefits in-order implementations of the
11017  // architecture, but out-of-order ones also benefit.
11018 void step() {
11019 block_comment("step");
11020 // MACC(Ra, Rb, t0, t1, t2);
11021 // Ra = *++Pa;
11022 // Rb = *--Pb;
11023 umulh(Rhi_ab, Ra, Rb);
11024 mul(Rlo_ab, Ra, Rb);
11025 ldr(Ra, pre(Pa, wordSize));
11026 ldr(Rb, pre(Pb, -wordSize));
11027 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11028 // previous iteration.
11029 // MACC(Rm, Rn, t0, t1, t2);
11030 // Rm = *++Pm;
11031 // Rn = *--Pn;
11032 umulh(Rhi_mn, Rm, Rn);
11033 mul(Rlo_mn, Rm, Rn);
11034 ldr(Rm, pre(Pm, wordSize));
11035 ldr(Rn, pre(Pn, -wordSize));
11036 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11037 }
11038
11039 void post1() {
11040 block_comment("post1");
11041
11042 // MACC(Ra, Rb, t0, t1, t2);
11043 // Ra = *++Pa;
11044 // Rb = *--Pb;
11045 umulh(Rhi_ab, Ra, Rb);
11046 mul(Rlo_ab, Ra, Rb);
11047 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11048 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11049
11050 // *Pm = Rm = t0 * inv;
11051 mul(Rm, t0, inv);
11052 str(Rm, Address(Pm));
11053
11054 // MACC(Rm, Rn, t0, t1, t2);
11055 // t0 = t1; t1 = t2; t2 = 0;
11056 umulh(Rhi_mn, Rm, Rn);
11057
11058 #ifndef PRODUCT
11059 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11060 {
11061 mul(Rlo_mn, Rm, Rn);
11062 add(Rlo_mn, t0, Rlo_mn);
11063 Label ok;
11064 cbz(Rlo_mn, ok); {
11065 stop("broken Montgomery multiply");
11066 } bind(ok);
11067 }
11068 #endif
11069 // We have very carefully set things up so that
11070 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11071 // the lower half of Rm * Rn because we know the result already:
11072 // it must be -t0. t0 + (-t0) must generate a carry iff
11073 // t0 != 0. So, rather than do a mul and an adds we just set
11074 // the carry flag iff t0 is nonzero.
11075 //
11076 // mul(Rlo_mn, Rm, Rn);
11077 // adds(zr, t0, Rlo_mn);
11078 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11079 adcs(t0, t1, Rhi_mn);
11080 adc(t1, t2, zr);
11081 mov(t2, zr);
11082 }
11083
11084 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11085 block_comment("pre2");
11086 // Pa = Pa_base + i-len;
11087 // Pb = Pb_base + len;
11088 // Pm = Pm_base + i-len;
11089 // Pn = Pn_base + len;
11090
11091 if (i.is_register()) {
11092 sub(Rj, i.as_register(), len);
11093 } else {
11094 mov(Rj, i.as_constant());
11095 sub(Rj, Rj, len);
11096 }
11097 // Rj == i-len
11098
11099 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11100 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11101 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11102 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11103
11104 // Ra = *++Pa;
11105 // Rb = *--Pb;
11106 // Rm = *++Pm;
11107 // Rn = *--Pn;
11108 ldr(Ra, pre(Pa, wordSize));
11109 ldr(Rb, pre(Pb, -wordSize));
11110 ldr(Rm, pre(Pm, wordSize));
11111 ldr(Rn, pre(Pn, -wordSize));
11112
11113 mov(Rhi_mn, zr);
11114 mov(Rlo_mn, zr);
11115 }
11116
11117 void post2(RegisterOrConstant i, RegisterOrConstant len) {
11118 block_comment("post2");
11119 if (i.is_constant()) {
11120 mov(Rj, i.as_constant()-len.as_constant());
11121 } else {
11122 sub(Rj, i.as_register(), len);
11123 }
11124
11125 adds(t0, t0, Rlo_mn); // The pending m*n, low part
11126
11127 // As soon as we know the least significant digit of our result,
11128 // store it.
11129 // Pm_base[i-len] = t0;
11130 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11131
11132 // t0 = t1; t1 = t2; t2 = 0;
11133 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11134 adc(t1, t2, zr);
11135 mov(t2, zr);
11136 }
11137
11138 // A carry in t0 after Montgomery multiplication means that we
11139 // should subtract multiples of n from our result in m. We'll
11140 // keep doing that until there is no carry.
11141 void normalize(RegisterOrConstant len) {
11142 block_comment("normalize");
11143 // while (t0)
11144 // t0 = sub(Pm_base, Pn_base, t0, len);
11145 Label loop, post, again;
11146 Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11147 cbz(t0, post); {
11148 bind(again); {
11149 mov(i, zr);
11150 mov(cnt, len);
11151 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11152 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11153 subs(zr, zr, zr); // set carry flag, i.e. no borrow
11154 align(16);
11155 bind(loop); {
11156 sbcs(Rm, Rm, Rn);
11157 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11158 add(i, i, 1);
11159 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11160 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11161 sub(cnt, cnt, 1);
11162 } cbnz(cnt, loop);
11163 sbc(t0, t0, zr);
11164 } cbnz(t0, again);
11165 } bind(post);
11166 }
11167
11168 // Move memory at s to d, reversing words.
11169 // Increments d to end of copied memory
11170 // Destroys tmp1, tmp2
11171 // Preserves len
11172 // Leaves s pointing to the address which was in d at start
11173 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11174 assert(tmp1->encoding() < r19->encoding(), "register corruption");
11175 assert(tmp2->encoding() < r19->encoding(), "register corruption");
11176
11177 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11178 mov(tmp1, len);
11179 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11180 sub(s, d, len, ext::uxtw, LogBytesPerWord);
11181 }
11182 // where
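  // reverse1 moves one 64-bit word from *--s to *d++, swapping its two
  // 32-bit halves with the rotate by 32.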
11183 void reverse1(Register d, Register s, Register tmp) {
11184 ldr(tmp, pre(s, -wordSize));
11185 ror(tmp, tmp, 32);
11186 str(tmp, post(d, wordSize));
11187 }
11188
11189 void step_squaring() {
11190 // An extra ACC
11191 step();
11192 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11193 }
11194
11195 void last_squaring(RegisterOrConstant i) {
11196 Label dont;
11197 // if ((i & 1) == 0) {
11198 tbnz(i.as_register(), 0, dont); {
11199 // MACC(Ra, Rb, t0, t1, t2);
11200 // Ra = *++Pa;
11201 // Rb = *--Pb;
11202 umulh(Rhi_ab, Ra, Rb);
11203 mul(Rlo_ab, Ra, Rb);
11204 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11205 } bind(dont);
11206 }
11207
11208 void extra_step_squaring() {
11209 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11210
11211 // MACC(Rm, Rn, t0, t1, t2);
11212 // Rm = *++Pm;
11213 // Rn = *--Pn;
11214 umulh(Rhi_mn, Rm, Rn);
11215 mul(Rlo_mn, Rm, Rn);
11216 ldr(Rm, pre(Pm, wordSize));
11217 ldr(Rn, pre(Pn, -wordSize));
11218 }
11219
11220 void post1_squaring() {
11221 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11222
11223 // *Pm = Rm = t0 * inv;
11224 mul(Rm, t0, inv);
11225 str(Rm, Address(Pm));
11226
11227 // MACC(Rm, Rn, t0, t1, t2);
11228 // t0 = t1; t1 = t2; t2 = 0;
11229 umulh(Rhi_mn, Rm, Rn);
11230
11231 #ifndef PRODUCT
11232 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11233 {
11234 mul(Rlo_mn, Rm, Rn);
11235 add(Rlo_mn, t0, Rlo_mn);
11236 Label ok;
11237 cbz(Rlo_mn, ok); {
11238 stop("broken Montgomery multiply");
11239 } bind(ok);
11240 }
11241 #endif
11242 // We have very carefully set things up so that
11243 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11244 // the lower half of Rm * Rn because we know the result already:
11245 // it must be -t0. t0 + (-t0) must generate a carry iff
11246 // t0 != 0. So, rather than do a mul and an adds we just set
11247 // the carry flag iff t0 is nonzero.
11248 //
11249 // mul(Rlo_mn, Rm, Rn);
11250 // adds(zr, t0, Rlo_mn);
11251 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11252 adcs(t0, t1, Rhi_mn);
11253 adc(t1, t2, zr);
11254 mov(t2, zr);
11255 }
11256
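  // Triple-precision accumulate: t2:t1:t0 += Rhi:Rlo.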
11257 void acc(Register Rhi, Register Rlo,
11258 Register t0, Register t1, Register t2) {
11259 adds(t0, t0, Rlo);
11260 adcs(t1, t1, Rhi);
11261 adc(t2, t2, zr);
11262 }
11263
11264 public:
11265 /**
11266 * Fast Montgomery multiplication. The derivation of the
11267 * algorithm is in A Cryptographic Library for the Motorola
11268 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
11269 *
11270 * Arguments:
11271 *
11272 * Inputs for multiplication:
11273 * c_rarg0 - int array elements a
11274 * c_rarg1 - int array elements b
11275 * c_rarg2 - int array elements n (the modulus)
11276 * c_rarg3 - int length
11277 * c_rarg4 - int inv
11278 * c_rarg5 - int array elements m (the result)
11279 *
11280 * Inputs for squaring:
11281 * c_rarg0 - int array elements a
11282 * c_rarg1 - int array elements n (the modulus)
11283 * c_rarg2 - int length
11284 * c_rarg3 - int inv
11285 * c_rarg4 - int array elements m (the result)
11286 *
11287 */
11288 address generate_multiply() {
11289 Label argh, nothing;
11290 bind(argh);
11291 stop("MontgomeryMultiply total_allocation must be <= 8192");
11292
11293 align(CodeEntryAlignment);
11294 address entry = pc();
11295
11296 cbzw(Rlen, nothing);
11297
11298 enter();
11299
11300 // Make room.
11301 cmpw(Rlen, 512);
11302 br(Assembler::HI, argh);
11303 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11304 andr(sp, Ra, -2 * wordSize);
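      // sp now has Rlen * 4 * sizeof(jint) bytes (at most 8192, hence the
      // size check above) of 16-byte-aligned scratch space; the reversed
      // copies of the inputs, and the result before it is copied back,
      // live here.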
11305
11306 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11307
11308 {
11309 // Copy input args, reversing as we go. We use Ra as a
11310 // temporary variable.
11311 reverse(Ra, Pa_base, Rlen, t0, t1);
11312 if (!_squaring)
11313 reverse(Ra, Pb_base, Rlen, t0, t1);
11314 reverse(Ra, Pn_base, Rlen, t0, t1);
11315 }
11316
11317 // Push all call-saved registers and also Pm_base which we'll need
11318 // at the end.
11319 save_regs();
11320
11321 #ifndef PRODUCT
11322 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
11323 {
11324 ldr(Rn, Address(Pn_base, 0));
11325 mul(Rlo_mn, Rn, inv);
11326 subs(zr, Rlo_mn, -1);
11327 Label ok;
11328 br(EQ, ok); {
11329 stop("broken inverse in Montgomery multiply");
11330 } bind(ok);
11331 }
11332 #endif
11333
11334 mov(Pm_base, Ra);
11335
11336 mov(t0, zr);
11337 mov(t1, zr);
11338 mov(t2, zr);
11339
11340 block_comment("for (int i = 0; i < len; i++) {");
11341 mov(Ri, zr); {
11342 Label loop, end;
11343 cmpw(Ri, Rlen);
11344 br(Assembler::GE, end);
11345
11346 bind(loop);
11347 pre1(Ri);
11348
11349 block_comment(" for (j = i; j; j--) {"); {
11350 movw(Rj, Ri);
11351 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11352 } block_comment(" } // j");
11353
11354 post1();
11355 addw(Ri, Ri, 1);
11356 cmpw(Ri, Rlen);
11357 br(Assembler::LT, loop);
11358 bind(end);
11359 block_comment("} // i");
11360 }
11361
11362 block_comment("for (int i = len; i < 2*len; i++) {");
11363 mov(Ri, Rlen); {
11364 Label loop, end;
11365 cmpw(Ri, Rlen, Assembler::LSL, 1);
11366 br(Assembler::GE, end);
11367
11368 bind(loop);
11369 pre2(Ri, Rlen);
11370
11371 block_comment(" for (j = len*2-i-1; j; j--) {"); {
11372 lslw(Rj, Rlen, 1);
11373 subw(Rj, Rj, Ri);
11374 subw(Rj, Rj, 1);
11375 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
11376 } block_comment(" } // j");
11377
11378 post2(Ri, Rlen);
11379 addw(Ri, Ri, 1);
11380 cmpw(Ri, Rlen, Assembler::LSL, 1);
11381 br(Assembler::LT, loop);
11382 bind(end);
11383 }
11384 block_comment("} // i");
11385
11386 normalize(Rlen);
11387
11388 mov(Ra, Pm_base); // Save Pm_base in Ra
11389 restore_regs(); // Restore caller's Pm_base
11390
11391 // Copy our result into caller's Pm_base
11392 reverse(Pm_base, Ra, Rlen, t0, t1);
11393
11394 leave();
11395 bind(nothing);
11396 ret(lr);
11397
11398 return entry;
11399 }
11400 // In C, approximately:
11401
11402 // void
11403 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
11404 // julong Pn_base[], julong Pm_base[],
11405 // julong inv, int len) {
11406 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11407 // julong *Pa, *Pb, *Pn, *Pm;
11408 // julong Ra, Rb, Rn, Rm;
11409
11410 // int i;
11411
11412 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11413
11414 // for (i = 0; i < len; i++) {
11415 // int j;
11416
11417 // Pa = Pa_base;
11418 // Pb = Pb_base + i;
11419 // Pm = Pm_base;
11420 // Pn = Pn_base + i;
11421
11422 // Ra = *Pa;
11423 // Rb = *Pb;
11424 // Rm = *Pm;
11425 // Rn = *Pn;
11426
11427 // int iters = i;
11428 // for (j = 0; iters--; j++) {
11429 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11430 // MACC(Ra, Rb, t0, t1, t2);
11431 // Ra = *++Pa;
11432 // Rb = *--Pb;
11433 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11434 // MACC(Rm, Rn, t0, t1, t2);
11435 // Rm = *++Pm;
11436 // Rn = *--Pn;
11437 // }
11438
11439 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
11440 // MACC(Ra, Rb, t0, t1, t2);
11441 // *Pm = Rm = t0 * inv;
11442 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11443 // MACC(Rm, Rn, t0, t1, t2);
11444
11445 // assert(t0 == 0, "broken Montgomery multiply");
11446
11447 // t0 = t1; t1 = t2; t2 = 0;
11448 // }
11449
11450 // for (i = len; i < 2*len; i++) {
11451 // int j;
11452
11453 // Pa = Pa_base + i-len;
11454 // Pb = Pb_base + len;
11455 // Pm = Pm_base + i-len;
11456 // Pn = Pn_base + len;
11457
11458 // Ra = *++Pa;
11459 // Rb = *--Pb;
11460 // Rm = *++Pm;
11461 // Rn = *--Pn;
11462
11463 // int iters = len*2-i-1;
11464 // for (j = i-len+1; iters--; j++) {
11465 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
11466 // MACC(Ra, Rb, t0, t1, t2);
11467 // Ra = *++Pa;
11468 // Rb = *--Pb;
11469 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11470 // MACC(Rm, Rn, t0, t1, t2);
11471 // Rm = *++Pm;
11472 // Rn = *--Pn;
11473 // }
11474
11475 // Pm_base[i-len] = t0;
11476 // t0 = t1; t1 = t2; t2 = 0;
11477 // }
11478
11479 // while (t0)
11480 // t0 = sub(Pm_base, Pn_base, t0, len);
11481 // }
11482
11483 /**
11484 * Fast Montgomery squaring. This uses asymptotically 25% fewer
11485 * multiplies than Montgomery multiplication so it should be up to
11486 * 25% faster. However, its loop control is more complex and it
11487 * may actually run slower on some machines.
11488 *
11489 * Arguments:
11490 *
11491 * Inputs:
11492 * c_rarg0 - int array elements a
11493 * c_rarg1 - int array elements n (the modulus)
11494 * c_rarg2 - int length
11495 * c_rarg3 - int inv
11496 * c_rarg4 - int array elements m (the result)
11497 *
11498 */
11499 address generate_square() {
11500 Label argh;
11501 bind(argh);
11502 stop("MontgomeryMultiply total_allocation must be <= 8192");
11503
11504 align(CodeEntryAlignment);
11505 address entry = pc();
11506
11507 enter();
11508
11509 // Make room.
11510 cmpw(Rlen, 512);
11511 br(Assembler::HI, argh);
11512 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
11513 andr(sp, Ra, -2 * wordSize);
11514
11515 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
11516
11517 {
11518 // Copy input args, reversing as we go. We use Ra as a
11519 // temporary variable.
11520 reverse(Ra, Pa_base, Rlen, t0, t1);
11521 reverse(Ra, Pn_base, Rlen, t0, t1);
11522 }
11523
11524 // Push all call-saved registers and also Pm_base which we'll need
11525 // at the end.
11526 save_regs();
11527
11528 mov(Pm_base, Ra);
11529
11530 mov(t0, zr);
11531 mov(t1, zr);
11532 mov(t2, zr);
11533
11534 block_comment("for (int i = 0; i < len; i++) {");
11535 mov(Ri, zr); {
11536 Label loop, end;
11537 bind(loop);
11538 cmp(Ri, Rlen);
11539 br(Assembler::GE, end);
11540
11541 pre1(Ri);
11542
11543 block_comment("for (j = (i+1)/2; j; j--) {"); {
11544 add(Rj, Ri, 1);
11545 lsr(Rj, Rj, 1);
11546 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11547 } block_comment(" } // j");
11548
11549 last_squaring(Ri);
11550
11551 block_comment(" for (j = i/2; j; j--) {"); {
11552 lsr(Rj, Ri, 1);
11553 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11554 } block_comment(" } // j");
11555
11556 post1_squaring();
11557 add(Ri, Ri, 1);
11558 cmp(Ri, Rlen);
11559 br(Assembler::LT, loop);
11560
11561 bind(end);
11562 block_comment("} // i");
11563 }
11564
11565 block_comment("for (int i = len; i < 2*len; i++) {");
11566 mov(Ri, Rlen); {
11567 Label loop, end;
11568 bind(loop);
11569 cmp(Ri, Rlen, Assembler::LSL, 1);
11570 br(Assembler::GE, end);
11571
11572 pre2(Ri, Rlen);
11573
11574 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
11575 lsl(Rj, Rlen, 1);
11576 sub(Rj, Rj, Ri);
11577 sub(Rj, Rj, 1);
11578 lsr(Rj, Rj, 1);
11579 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
11580 } block_comment(" } // j");
11581
11582 last_squaring(Ri);
11583
11584 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
11585 lsl(Rj, Rlen, 1);
11586 sub(Rj, Rj, Ri);
11587 lsr(Rj, Rj, 1);
11588 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
11589 } block_comment(" } // j");
11590
11591 post2(Ri, Rlen);
11592 add(Ri, Ri, 1);
11593 cmp(Ri, Rlen, Assembler::LSL, 1);
11594
11595 br(Assembler::LT, loop);
11596 bind(end);
11597 block_comment("} // i");
11598 }
11599
11600 normalize(Rlen);
11601
11602 mov(Ra, Pm_base); // Save Pm_base in Ra
11603 restore_regs(); // Restore caller's Pm_base
11604
11605 // Copy our result into caller's Pm_base
11606 reverse(Pm_base, Ra, Rlen, t0, t1);
11607
11608 leave();
11609 ret(lr);
11610
11611 return entry;
11612 }
11613 // In C, approximately:
11614
11615 // void
11616 // montgomery_square(julong Pa_base[], julong Pn_base[],
11617 // julong Pm_base[], julong inv, int len) {
11618 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
11619 // julong *Pa, *Pb, *Pn, *Pm;
11620 // julong Ra, Rb, Rn, Rm;
11621
11622 // int i;
11623
11624 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
11625
11626 // for (i = 0; i < len; i++) {
11627 // int j;
11628
11629 // Pa = Pa_base;
11630 // Pb = Pa_base + i;
11631 // Pm = Pm_base;
11632 // Pn = Pn_base + i;
11633
11634 // Ra = *Pa;
11635 // Rb = *Pb;
11636 // Rm = *Pm;
11637 // Rn = *Pn;
11638
11639 // int iters = (i+1)/2;
11640 // for (j = 0; iters--; j++) {
11641 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11642 // MACC2(Ra, Rb, t0, t1, t2);
11643 // Ra = *++Pa;
11644 // Rb = *--Pb;
11645 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11646 // MACC(Rm, Rn, t0, t1, t2);
11647 // Rm = *++Pm;
11648 // Rn = *--Pn;
11649 // }
11650 // if ((i & 1) == 0) {
11651 // assert(Ra == Pa_base[j], "must be");
11652 // MACC(Ra, Ra, t0, t1, t2);
11653 // }
11654 // iters = i/2;
11655 // assert(iters == i-j, "must be");
11656 // for (; iters--; j++) {
11657 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11658 // MACC(Rm, Rn, t0, t1, t2);
11659 // Rm = *++Pm;
11660 // Rn = *--Pn;
11661 // }
11662
11663 // *Pm = Rm = t0 * inv;
11664 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
11665 // MACC(Rm, Rn, t0, t1, t2);
11666
11667 // assert(t0 == 0, "broken Montgomery multiply");
11668
11669 // t0 = t1; t1 = t2; t2 = 0;
11670 // }
11671
11672 // for (i = len; i < 2*len; i++) {
11673 // int start = i-len+1;
11674 // int end = start + (len - start)/2;
11675 // int j;
11676
11677 // Pa = Pa_base + i-len;
11678 // Pb = Pa_base + len;
11679 // Pm = Pm_base + i-len;
11680 // Pn = Pn_base + len;
11681
11682 // Ra = *++Pa;
11683 // Rb = *--Pb;
11684 // Rm = *++Pm;
11685 // Rn = *--Pn;
11686
11687 // int iters = (2*len-i-1)/2;
11688 // assert(iters == end-start, "must be");
11689 // for (j = start; iters--; j++) {
11690 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
11691 // MACC2(Ra, Rb, t0, t1, t2);
11692 // Ra = *++Pa;
11693 // Rb = *--Pb;
11694 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11695 // MACC(Rm, Rn, t0, t1, t2);
11696 // Rm = *++Pm;
11697 // Rn = *--Pn;
11698 // }
11699 // if ((i & 1) == 0) {
11700 // assert(Ra == Pa_base[j], "must be");
11701 // MACC(Ra, Ra, t0, t1, t2);
11702 // }
11703 // iters = (2*len-i)/2;
11704 // assert(iters == len-j, "must be");
11705 // for (; iters--; j++) {
11706 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
11707 // MACC(Rm, Rn, t0, t1, t2);
11708 // Rm = *++Pm;
11709 // Rn = *--Pn;
11710 // }
11711 // Pm_base[i-len] = t0;
11712 // t0 = t1; t1 = t2; t2 = 0;
11713 // }
11714
11715 // while (t0)
11716 // t0 = sub(Pm_base, Pn_base, t0, len);
11717 // }
11718 };
11719
11720 // Initialization
11721 void generate_preuniverse_stubs() {
11722 // preuniverse stubs are not needed for aarch64
11723 }
11724
11725 void generate_initial_stubs() {
11726    // Generate initial stubs and initialize the entry points
11727
11728    // Entry points that exist on all platforms. Note: this is code
11729    // that could be shared among different platforms; however, the
11730    // benefit seems to be smaller than the disadvantage of having a
11731    // much more complicated generator structure. See also the comment
11732    // in stubRoutines.hpp.
11733
11734 StubRoutines::_forward_exception_entry = generate_forward_exception();
11735
11736 StubRoutines::_call_stub_entry =
11737 generate_call_stub(StubRoutines::_call_stub_return_address);
11738
11739 // is referenced by megamorphic call
11740 StubRoutines::_catch_exception_entry = generate_catch_exception();
11741
11742 // Initialize table for copy memory (arraycopy) check.
11743 if (UnsafeMemoryAccess::_table == nullptr) {
11744 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
11745 }
11746
11747 if (UseCRC32Intrinsics) {
11748 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
11749 }
11750
11751 if (UseCRC32CIntrinsics) {
11752 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
11753 }
11754
11755 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
11756 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
11757 }
11758
11759 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
11760 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
11761 }
11762
11763 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
11764 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
11765 StubRoutines::_hf2f = generate_float16ToFloat();
11766 StubRoutines::_f2hf = generate_floatToFloat16();
11767 }
11768 }
11769
11770 void generate_continuation_stubs() {
11771 // Continuation stubs:
11772 StubRoutines::_cont_thaw = generate_cont_thaw();
11773 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
11774 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
11775 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
11776 }
11777
11778 void generate_final_stubs() {
11779 // support for verify_oop (must happen after universe_init)
11780 if (VerifyOops) {
11781 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
11782 }
11783
11784 // arraycopy stubs used by compilers
11785 generate_arraycopy_stubs();
11786
11787 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
11788
11789 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
11790
11791 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
11792 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
11793
11794 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11795
11796 generate_atomic_entry_points();
11797
11798 #endif // LINUX
11799
11800 #ifdef COMPILER2
11801 if (UseSecondarySupersTable) {
11802 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
11803 if (! InlineSecondarySupersTest) {
11804 generate_lookup_secondary_supers_table_stub();
11805 }
11806 }
11807 #endif
11808
11809 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
11810
11811    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
11812 }
11813
11814 void generate_compiler_stubs() {
11815 #if COMPILER2_OR_JVMCI
11816
11817 if (UseSVE == 0) {
11818 StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
11819 }
11820
11821 // array equals stub for large arrays.
11822 if (!UseSimpleArrayEquals) {
11823 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
11824 }
11825
11826 // arrays_hascode stub for large arrays.
11827 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
11828 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
11829 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
11830 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
11831 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
11832
11833 // byte_array_inflate stub for large arrays.
11834 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
11835
11836 // countPositives stub for large arrays.
11837 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
11838
11839 generate_compare_long_strings();
11840
11841 generate_string_indexof_stubs();
11842
11843 #ifdef COMPILER2
11844 if (UseMultiplyToLenIntrinsic) {
11845 StubRoutines::_multiplyToLen = generate_multiplyToLen();
11846 }
11847
11848 if (UseSquareToLenIntrinsic) {
11849 StubRoutines::_squareToLen = generate_squareToLen();
11850 }
11851
11852 if (UseMulAddIntrinsic) {
11853 StubRoutines::_mulAdd = generate_mulAdd();
11854 }
11855
11856 if (UseSIMDForBigIntegerShiftIntrinsics) {
11857 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
11858 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
11859 }
11860
11861 if (UseMontgomeryMultiplyIntrinsic) {
11862 StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
11863 StubCodeMark mark(this, stub_id);
11864 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
11865 StubRoutines::_montgomeryMultiply = g.generate_multiply();
11866 }
11867
11868 if (UseMontgomerySquareIntrinsic) {
11869 StubId stub_id = StubId::stubgen_montgomerySquare_id;
11870 StubCodeMark mark(this, stub_id);
11871 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
11872 // We use generate_multiply() rather than generate_square()
11873 // because it's faster for the sizes of modulus we care about.
11874 StubRoutines::_montgomerySquare = g.generate_multiply();
11875 }
11876
11877 #endif // COMPILER2
11878
11879 if (UseChaCha20Intrinsics) {
11880 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
11881 }
11882
11883 if (UseKyberIntrinsics) {
11884 StubRoutines::_kyberNtt = generate_kyberNtt();
11885 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
11886 StubRoutines::_kyberNttMult = generate_kyberNttMult();
11887 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
11888 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
11889 StubRoutines::_kyber12To16 = generate_kyber12To16();
11890 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
11891 }
11892
11893 if (UseDilithiumIntrinsics) {
11894 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
11895 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
11896 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
11897 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
11898 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
11899 }
11900
11901 if (UseBASE64Intrinsics) {
11902 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
11903 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
11904 }
11905
11906 // data cache line writeback
11907 StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
11908 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
11909
11910 if (UseAESIntrinsics) {
11911 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
11912 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
11913 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
11914 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
11915 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
11916 }
11917 if (UseGHASHIntrinsics) {
11918 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
11919 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks_wide();
11920 }
11921 if (UseAESIntrinsics && UseGHASHIntrinsics) {
11922 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
11923 }
11924
11925 if (UseMD5Intrinsics) {
11926 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
11927 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
11928 }
11929 if (UseSHA1Intrinsics) {
11930 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
11931 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
11932 }
11933 if (UseSHA256Intrinsics) {
11934 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
11935 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
11936 }
11937 if (UseSHA512Intrinsics) {
11938 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
11939 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
11940 }
11941 if (UseSHA3Intrinsics) {
11942
11943 StubRoutines::_double_keccak = generate_double_keccak();
11944 if (UseSIMDForSHA3Intrinsic) {
11945 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
11946 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
11947 } else {
11948 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
11949 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
11950 }
11951 }
11952
11953 if (UsePoly1305Intrinsics) {
11954 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
11955 }
11956
11957 // generate Adler32 intrinsics code
11958 if (UseAdler32Intrinsics) {
11959 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
11960 }
11961
11962 #endif // COMPILER2_OR_JVMCI
11963 }
11964
11965 public:
11966 StubGenerator(CodeBuffer* code, BlobId blob_id) : StubCodeGenerator(code, blob_id) {
11967 switch(blob_id) {
11968 case BlobId::stubgen_preuniverse_id:
11969 generate_preuniverse_stubs();
11970 break;
11971 case BlobId::stubgen_initial_id:
11972 generate_initial_stubs();
11973 break;
11974 case BlobId::stubgen_continuation_id:
11975 generate_continuation_stubs();
11976 break;
11977 case BlobId::stubgen_compiler_id:
11978 generate_compiler_stubs();
11979 break;
11980 case BlobId::stubgen_final_id:
11981 generate_final_stubs();
11982 break;
11983 default:
11984 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
11985 break;
11986 };
11987 }
11988 }; // end class declaration
11989
11990 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id) {
11991 StubGenerator g(code, blob_id);
11992 }
11993
11994
11995 #if defined (LINUX)
11996
11997 // Define pointers to atomic stubs and initialize them to point to the
11998 // code in atomic_aarch64.S.
11999
12000 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \
12001 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12002 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
12003 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12004 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
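
// For example, DEFAULT_ATOMIC_OP(fetch_add, 4, ) expands to a declaration of
// aarch64_atomic_fetch_add_4_default_impl (the code in atomic_aarch64.S) and a
// function pointer aarch64_atomic_fetch_add_4_impl initialized to point at it,
// so that a generated stub can later replace the default implementation.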
12005
12006 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12007 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12008 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12009 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12010 DEFAULT_ATOMIC_OP(xchg, 4, )
12011 DEFAULT_ATOMIC_OP(xchg, 8, )
12012 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12013 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12014 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12015 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12016 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12017 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12018 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12019 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12020 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12021 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12022
12023 #undef DEFAULT_ATOMIC_OP
12024
12025 #endif // LINUX