1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "compiler/oopMap.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/interpreter.hpp"
36 #include "memory/universe.hpp"
37 #include "nativeInst_aarch64.hpp"
38 #include "oops/instanceOop.hpp"
39 #include "oops/method.hpp"
40 #include "oops/objArrayKlass.hpp"
41 #include "oops/oop.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "prims/upcallLinker.hpp"
44 #include "runtime/arguments.hpp"
45 #include "runtime/atomicAccess.hpp"
46 #include "runtime/continuation.hpp"
47 #include "runtime/continuationEntry.inline.hpp"
48 #include "runtime/frame.inline.hpp"
49 #include "runtime/handles.inline.hpp"
50 #include "runtime/javaThread.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubCodeGenerator.hpp"
53 #include "runtime/stubRoutines.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/checkedCast.hpp"
56 #include "utilities/debug.hpp"
57 #include "utilities/globalDefinitions.hpp"
58 #include "utilities/intpow.hpp"
59 #include "utilities/powerOfTwo.hpp"
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_ZGC
64 #include "gc/z/zThreadLocalData.hpp"
65 #endif
66
67 // Declaration and definition of StubGenerator (no .hpp file).
68 // For a more detailed description of the stub routine structure
69 // see the comment in stubRoutines.hpp
70
// "__ foo(...)" is shorthand for emitting through the current assembler:
// it expands to "_masm->foo(...)".
#undef __
#define __ _masm->

// BLOCK_COMMENT(str) forwards str to block_comment() on the current
// assembler; in PRODUCT builds it expands to nothing.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

// BIND(label) is written as "__ BIND(label);". It binds the label and then
// issues BLOCK_COMMENT with the label's source name followed by ':'.
// Note that it expands to two statements, so it must not be used as the sole
// body of an unbraced if/else/loop.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
81
82 // Constant data definitions
83
// SHA-256 round constants K[0..63] (FIPS 180-4), eight per row.
static const uint32_t _sha256_round_consts[64] = {
  /*  0.. 7 */ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
  /*  8..15 */ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
  /* 16..23 */ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
  /* 24..31 */ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
  /* 32..39 */ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
  /* 40..47 */ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
  /* 48..55 */ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
  /* 56..63 */ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
102
// SHA-512 round constants K[0..79] (FIPS 180-4), four per row.
static const uint64_t _sha512_round_consts[80] = {
  /*  0 */ 0x428A2F98D728AE22ULL, 0x7137449123EF65CDULL, 0xB5C0FBCFEC4D3B2FULL, 0xE9B5DBA58189DBBCULL,
  /*  4 */ 0x3956C25BF348B538ULL, 0x59F111F1B605D019ULL, 0x923F82A4AF194F9BULL, 0xAB1C5ED5DA6D8118ULL,
  /*  8 */ 0xD807AA98A3030242ULL, 0x12835B0145706FBEULL, 0x243185BE4EE4B28CULL, 0x550C7DC3D5FFB4E2ULL,
  /* 12 */ 0x72BE5D74F27B896FULL, 0x80DEB1FE3B1696B1ULL, 0x9BDC06A725C71235ULL, 0xC19BF174CF692694ULL,
  /* 16 */ 0xE49B69C19EF14AD2ULL, 0xEFBE4786384F25E3ULL, 0x0FC19DC68B8CD5B5ULL, 0x240CA1CC77AC9C65ULL,
  /* 20 */ 0x2DE92C6F592B0275ULL, 0x4A7484AA6EA6E483ULL, 0x5CB0A9DCBD41FBD4ULL, 0x76F988DA831153B5ULL,
  /* 24 */ 0x983E5152EE66DFABULL, 0xA831C66D2DB43210ULL, 0xB00327C898FB213FULL, 0xBF597FC7BEEF0EE4ULL,
  /* 28 */ 0xC6E00BF33DA88FC2ULL, 0xD5A79147930AA725ULL, 0x06CA6351E003826FULL, 0x142929670A0E6E70ULL,
  /* 32 */ 0x27B70A8546D22FFCULL, 0x2E1B21385C26C926ULL, 0x4D2C6DFC5AC42AEDULL, 0x53380D139D95B3DFULL,
  /* 36 */ 0x650A73548BAF63DEULL, 0x766A0ABB3C77B2A8ULL, 0x81C2C92E47EDAEE6ULL, 0x92722C851482353BULL,
  /* 40 */ 0xA2BFE8A14CF10364ULL, 0xA81A664BBC423001ULL, 0xC24B8B70D0F89791ULL, 0xC76C51A30654BE30ULL,
  /* 44 */ 0xD192E819D6EF5218ULL, 0xD69906245565A910ULL, 0xF40E35855771202AULL, 0x106AA07032BBD1B8ULL,
  /* 48 */ 0x19A4C116B8D2D0C8ULL, 0x1E376C085141AB53ULL, 0x2748774CDF8EEB99ULL, 0x34B0BCB5E19B48A8ULL,
  /* 52 */ 0x391C0CB3C5C95A63ULL, 0x4ED8AA4AE3418ACBULL, 0x5B9CCA4F7763E373ULL, 0x682E6FF3D6B2B8A3ULL,
  /* 56 */ 0x748F82EE5DEFB2FCULL, 0x78A5636F43172F60ULL, 0x84C87814A1F0AB72ULL, 0x8CC702081A6439ECULL,
  /* 60 */ 0x90BEFFFA23631E28ULL, 0xA4506CEBDE82BDE9ULL, 0xBEF9A3F7B2C67915ULL, 0xC67178F2E372532BULL,
  /* 64 */ 0xCA273ECEEA26619CULL, 0xD186B8C721C0C207ULL, 0xEADA7DD6CDE0EB1EULL, 0xF57D4F7FEE6ED178ULL,
  /* 68 */ 0x06F067AA72176FBAULL, 0x0A637DC5A2C898A6ULL, 0x113F9804BEF90DAEULL, 0x1B710B35131C471BULL,
  /* 72 */ 0x28DB77F523047D84ULL, 0x32CAAB7B40C72493ULL, 0x3C9EBE0A15C9BEBCULL, 0x431D67C49C100D4CULL,
  /* 76 */ 0x4CC5D4BECB3E42B6ULL, 0x597F299CFC657E2AULL, 0x5FCB6FAB3AD6FAECULL, 0x6C44198C4A475817ULL
};
132
// Keccak-f[1600] round constants for the 24 rounds, four per row.
static const uint64_t _sha3_round_consts[24] = {
  /*  0 */ 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL, 0x8000000080008000ULL,
  /*  4 */ 0x000000000000808BULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL,
  /*  8 */ 0x000000000000008AULL, 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,
  /* 12 */ 0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL, 0x8000000000008003ULL,
  /* 16 */ 0x8000000000008002ULL, 0x8000000000000080ULL, 0x000000000000800AULL, 0x800000008000000AULL,
  /* 20 */ 0x8000000080008081ULL, 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
};
143
// Round constants for the double-Keccak stub: 24 entries, four per row.
// The values are the same as those in _sha3_round_consts.
static const uint64_t _double_keccak_round_consts[24] = {
  /*  0 */ 0x0000000000000001ULL, 0x0000000000008082ULL, 0x800000000000808AULL, 0x8000000080008000ULL,
  /*  4 */ 0x000000000000808BULL, 0x0000000080000001ULL, 0x8000000080008081ULL, 0x8000000000008009ULL,
  /*  8 */ 0x000000000000008AULL, 0x0000000000000088ULL, 0x0000000080008009ULL, 0x000000008000000AULL,
  /* 12 */ 0x000000008000808BULL, 0x800000000000008BULL, 0x8000000000008089ULL, 0x8000000000008003ULL,
  /* 16 */ 0x8000000000008002ULL, 0x8000000000000080ULL, 0x000000000000800AULL, 0x800000008000000AULL,
  /* 20 */ 0x8000000080008081ULL, 0x8000000000008080ULL, 0x0000000080000001ULL, 0x8000000080008008ULL
};
154
// Base64 encode alphabet: maps a 6-bit value to its output character.
// Values 62 and 63 encode as '+' and '/'.
static const char _encodeBlock_toBase64[64] = {
  /*  0..15 */ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
  /* 16..31 */ 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
  /* 32..47 */ 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
  /* 48..63 */ 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
};
162
// URL-safe Base64 encode alphabet: maps a 6-bit value to its output
// character. Values 62 and 63 encode as '-' and '_'.
static const char _encodeBlock_toBase64URL[64] = {
  /*  0..15 */ 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
  /* 16..31 */ 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f',
  /* 32..47 */ 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v',
  /* 48..63 */ 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
};
170
// Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
// except that the trailing character '=' is also treated as an illegal value in this intrinsic.
// That is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
// Decode table for the standard alphabet, indexed directly by the input
// byte (0..255). An entry of 255u marks an illegal input byte; every other
// entry is the 6-bit value of that character.
static const uint8_t _decodeBlock_fromBase64ForNoSIMD[256] = {
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0x00..0x0f
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0x10..0x1f
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, // 0x20..0x2f: '+' -> 62, '/' -> 63
  52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, // 0x30..0x3f: '0'..'9' -> 52..61, '=' illegal
  255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, // 0x40..0x4f: 'A'..'O' -> 0..14
  15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u, // 0x50..0x5f: 'P'..'Z' -> 15..25
  255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, // 0x60..0x6f: 'a'..'o' -> 26..40
  41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, // 0x70..0x7f: 'p'..'z' -> 41..51
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0x80..0x8f
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0x90..0x9f
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0xa0..0xaf
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0xb0..0xbf
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0xc0..0xcf
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0xd0..0xdf
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0xe0..0xef
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0xf0..0xff
};
192
// Decode table for the URL-safe alphabet, indexed directly by the input
// byte (0..255). An entry of 255u marks an illegal input byte. It differs
// from the standard table only in where 62 and 63 live: '-' and '_' here.
static const uint8_t _decodeBlock_fromBase64URLForNoSIMD[256] = {
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0x00..0x0f
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0x10..0x1f
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, // 0x20..0x2f: '-' -> 62
  52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, // 0x30..0x3f: '0'..'9' -> 52..61, '=' illegal
  255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u, // 0x40..0x4f: 'A'..'O' -> 0..14
  15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u, // 0x50..0x5f: 'P'..'Z' -> 15..25, '_' -> 63
  255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u, // 0x60..0x6f: 'a'..'o' -> 26..40
  41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u, // 0x70..0x7f: 'p'..'z' -> 41..51
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0x80..0x8f
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0x90..0x9f
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0xa0..0xaf
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0xb0..0xbf
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0xc0..0xcf
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0xd0..0xdf
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0xe0..0xef
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // 0xf0..0xff
};
211
212 // A legal value of base64 code is in range [0, 127]. We need two lookups
213 // with tbl/tbx and combine them to get the decode data. The 1st table vector
214 // lookup use tbl, out of range indices are set to 0 in destination. The 2nd
215 // table vector lookup use tbx, out of range indices are unchanged in
216 // destination. Input [64..126] is mapped to index [65, 127] in second lookup.
217 // The value of index 64 is set to 0, so that we know that we already get the
218 // decoded data with the 1st lookup.
// Two-part decode table for the standard alphabet, used by the tbl/tbx
// lookups. Indices 0..63 are addressed by the input byte itself. Index 64
// holds 0, and indices 65..127 hold the entries for input bytes 64..126
// (i.e. shifted up by one). 255u marks an illegal input byte.
static const uint8_t _decodeBlock_fromBase64ForSIMD[128] = {
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // index   0..15
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // index  16..31
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u, // index  32..47: '+' -> 62, '/' -> 63
  52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, // index  48..63: '0'..'9' -> 52..61
  0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, // index  64..79: 64 is the 0 marker, then '@', 'A'..'N'
  14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, // index  80..95: 'O'..'Z', then '['..'^'
  255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, // index  96..111: '_', '`', 'a'..'n'
  40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, // index 112..127: 'o'..'z', then '{'..'~'
};
229
// Two-part decode table for the URL-safe alphabet, used by the tbl/tbx
// lookups. Indices 0..63 are addressed by the input byte itself. Index 64
// holds 0, and indices 65..127 hold the entries for input bytes 64..126
// (i.e. shifted up by one). 255u marks an illegal input byte.
static const uint8_t _decodeBlock_fromBase64URLForSIMD[128] = {
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // index   0..15
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, // index  16..31
  255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, // index  32..47: '-' -> 62
  52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u, // index  48..63: '0'..'9' -> 52..61
  0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, // index  64..79: 64 is the 0 marker, then '@', 'A'..'N'
  14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, // index  80..95: 'O'..'Z', then '['..'^'
  63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, // index  96..111: '_' -> 63, '`', 'a'..'n'
  40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, // index 112..127: 'o'..'z', then '{'..'~'
};
240
241
242 // Stub Code definitions
243
244 class StubGenerator: public StubCodeGenerator {
245 private:
246
247 #ifdef PRODUCT
248 #define inc_counter_np(counter) ((void)0)
249 #else
250 void inc_counter_np_(uint& counter) {
251 __ incrementw(ExternalAddress((address)&counter));
252 }
253 #define inc_counter_np(counter) \
254 BLOCK_COMMENT("inc_counter " #counter); \
255 inc_counter_np_(counter);
256 #endif
257
258 // Call stubs are used to call Java from C
259 //
260 // Arguments:
261 // c_rarg0: call wrapper address address
262 // c_rarg1: result address
263 // c_rarg2: result type BasicType
264 // c_rarg3: method Method*
265 // c_rarg4: (interpreter) entry point address
266 // c_rarg5: parameters intptr_t*
267 // c_rarg6: parameter size (in words) int
268 // c_rarg7: thread Thread*
269 //
270 // There is no return from the stub itself as any Java result
271 // is written to result
272 //
273 // we save r30 (lr) as the return PC at the base of the frame and
274 // link r29 (fp) below it as the frame pointer installing sp (r31)
275 // into fp.
276 //
277 // we save r0-r7, which accounts for all the c arguments.
278 //
279 // TODO: strictly do we need to save them all? they are treated as
280 // volatile by C so could we omit saving the ones we are going to
281 // place in global registers (thread? method?) or those we only use
282 // during setup of the Java call?
283 //
284 // we don't need to save r8 which C uses as an indirect result location
285 // return register.
286 //
287 // we don't need to save r9-r15 which both C and Java treat as
288 // volatile
289 //
290 // we don't need to save r16-18 because Java does not use them
291 //
292 // we save r19-r28 which Java uses as scratch registers and C
293 // expects to be callee-save
294 //
295 // we save the bottom 64 bits of each value stored in v8-v15; it is
296 // the responsibility of the caller to preserve larger values.
297 //
298 // so the stub frame looks like this when we enter Java code
299 //
300 // [ return_from_Java ] <--- sp
301 // [ argument word n ]
302 // ...
303 // -29 [ argument word 1 ]
// -28 [ saved Floating-point Control Register ] <--- sp_after_call
// -26 [ saved v15 ]
306 // -25 [ saved v14 ]
307 // -24 [ saved v13 ]
308 // -23 [ saved v12 ]
309 // -22 [ saved v11 ]
310 // -21 [ saved v10 ]
311 // -20 [ saved v9 ]
312 // -19 [ saved v8 ]
313 // -18 [ saved r28 ]
314 // -17 [ saved r27 ]
315 // -16 [ saved r26 ]
316 // -15 [ saved r25 ]
317 // -14 [ saved r24 ]
318 // -13 [ saved r23 ]
319 // -12 [ saved r22 ]
320 // -11 [ saved r21 ]
321 // -10 [ saved r20 ]
322 // -9 [ saved r19 ]
323 // -8 [ call wrapper (r0) ]
324 // -7 [ result (r1) ]
325 // -6 [ result type (r2) ]
326 // -5 [ method (r3) ]
327 // -4 [ entry point (r4) ]
328 // -3 [ parameters (r5) ]
329 // -2 [ parameter size (r6) ]
330 // -1 [ thread (r7) ]
331 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
332 // 1 [ saved lr (r30) ]
333
334 // Call stub stack layout word offsets from fp
// Slot positions in the call stub frame, expressed in words relative to rfp.
// generate_call_stub() multiplies them by wordSize to form rfp-based addresses.
enum call_stub_layout {
  // lowest slot of the save area; the stub sets sp to rfp + this offset
  // right after building the frame
  sp_after_call_off  = -28,

  // the saved FPCR shares the lowest slot
  fpcr_off           = sp_after_call_off,
  // FP registers are saved in pairs with stpd; each constant names the
  // lower-addressed slot of its pair (e.g. d15_off holds v15, the next word v14)
  d15_off            = -26,
  d13_off            = -24,
  d11_off            = -22,
  d9_off             = -20,

  // general registers are saved in pairs with stp; each constant names the
  // lower-addressed slot of its pair (e.g. r20_off holds r20, the next word r19)
  r28_off            = -18,
  r26_off            = -16,
  r24_off            = -14,
  r22_off            = -12,
  r20_off            = -10,
  // incoming C arguments c_rarg0..c_rarg7
  call_wrapper_off   =  -8,
  result_off         =  -7,
  result_type_off    =  -6,
  method_off         =  -5,
  // slot -3 (the parameters pointer) has no constant of its own: it is
  // written and read together with entry_point by a single stp/ldp
  entry_point_off    =  -4,
  parameter_size_off =  -2,
  thread_off         =  -1,
  // saved fp and the return address (saved lr) above it
  fp_f               =   0,
  retaddr_off        =   1,
};
359
360 address generate_call_stub(address& return_address) {
361 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
362 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
363 "adjust this code");
364
365 StubId stub_id = StubId::stubgen_call_stub_id;
366 GrowableArray<address> entries;
367 int entry_count = StubInfo::entry_count(stub_id);
368 assert(entry_count == 2, "sanity check");
369 address start = load_archive_data(stub_id, &entries);
370 if (start != nullptr) {
371 assert(entries.length() == 1, "expected 1 extra entry");
372 return_address = entries.at(0);
373 return start;
374 }
375 StubCodeMark mark(this, stub_id);
376 start = __ pc();
377
378 const Address sp_after_call (rfp, sp_after_call_off * wordSize);
379
380 const Address fpcr_save (rfp, fpcr_off * wordSize);
381 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
382 const Address result (rfp, result_off * wordSize);
383 const Address result_type (rfp, result_type_off * wordSize);
384 const Address method (rfp, method_off * wordSize);
385 const Address entry_point (rfp, entry_point_off * wordSize);
386 const Address parameter_size(rfp, parameter_size_off * wordSize);
387
388 const Address thread (rfp, thread_off * wordSize);
389
390 const Address d15_save (rfp, d15_off * wordSize);
391 const Address d13_save (rfp, d13_off * wordSize);
392 const Address d11_save (rfp, d11_off * wordSize);
393 const Address d9_save (rfp, d9_off * wordSize);
394
395 const Address r28_save (rfp, r28_off * wordSize);
396 const Address r26_save (rfp, r26_off * wordSize);
397 const Address r24_save (rfp, r24_off * wordSize);
398 const Address r22_save (rfp, r22_off * wordSize);
399 const Address r20_save (rfp, r20_off * wordSize);
400
401 // stub code
402
403 address aarch64_entry = __ pc();
404
405 // set up frame and move sp to end of save area
406 __ enter();
407 __ sub(sp, rfp, -sp_after_call_off * wordSize);
408
409 // save register parameters and Java scratch/global registers
410 // n.b. we save thread even though it gets installed in
411 // rthread because we want to sanity check rthread later
412 __ str(c_rarg7, thread);
413 __ strw(c_rarg6, parameter_size);
414 __ stp(c_rarg4, c_rarg5, entry_point);
415 __ stp(c_rarg2, c_rarg3, result_type);
416 __ stp(c_rarg0, c_rarg1, call_wrapper);
417
418 __ stp(r20, r19, r20_save);
419 __ stp(r22, r21, r22_save);
420 __ stp(r24, r23, r24_save);
421 __ stp(r26, r25, r26_save);
422 __ stp(r28, r27, r28_save);
423
424 __ stpd(v9, v8, d9_save);
425 __ stpd(v11, v10, d11_save);
426 __ stpd(v13, v12, d13_save);
427 __ stpd(v15, v14, d15_save);
428
429 __ get_fpcr(rscratch1);
430 __ str(rscratch1, fpcr_save);
431 // Set FPCR to the state we need. We do want Round to Nearest. We
432 // don't want non-IEEE rounding modes or floating-point traps.
433 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
434 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
435 __ set_fpcr(rscratch1);
436
437 // install Java thread in global register now we have saved
438 // whatever value it held
439 __ mov(rthread, c_rarg7);
440 // And method
441 __ mov(rmethod, c_rarg3);
442
443 // set up the heapbase register
444 __ reinit_heapbase();
445
446 #ifdef ASSERT
447 // make sure we have no pending exceptions
448 {
449 Label L;
450 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
451 __ cmp(rscratch1, (u1)NULL_WORD);
452 __ br(Assembler::EQ, L);
453 __ stop("StubRoutines::call_stub: entered with pending exception");
454 __ BIND(L);
455 }
456 #endif
457 // pass parameters if any
458 __ mov(esp, sp);
459 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
460 __ andr(sp, rscratch1, -2 * wordSize);
461
462 BLOCK_COMMENT("pass parameters if any");
463 Label parameters_done;
464 // parameter count is still in c_rarg6
465 // and parameter pointer identifying param 1 is in c_rarg5
466 __ cbzw(c_rarg6, parameters_done);
467
468 address loop = __ pc();
469 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
470 __ subsw(c_rarg6, c_rarg6, 1);
471 __ push(rscratch1);
472 __ br(Assembler::GT, loop);
473
474 __ BIND(parameters_done);
475
476 // call Java entry -- passing methdoOop, and current sp
477 // rmethod: Method*
478 // r19_sender_sp: sender sp
479 BLOCK_COMMENT("call Java function");
480 __ mov(r19_sender_sp, sp);
481 __ blr(c_rarg4);
482
483 // we do this here because the notify will already have been done
484 // if we get to the next instruction via an exception
485 //
486 // n.b. adding this instruction here affects the calculation of
487 // whether or not a routine returns to the call stub (used when
488 // doing stack walks) since the normal test is to check the return
489 // pc against the address saved below. so we may need to allow for
490 // this extra instruction in the check.
491
492 // save current address for use by exception handling code
493
494 return_address = __ pc();
495 entries.append(return_address);
496
497 // store result depending on type (everything that is not
498 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
499 // n.b. this assumes Java returns an integral result in r0
500 // and a floating result in j_farg0
501 __ ldr(j_rarg2, result);
502 Label is_long, is_float, is_double, exit;
503 __ ldr(j_rarg1, result_type);
504 __ cmp(j_rarg1, (u1)T_OBJECT);
505 __ br(Assembler::EQ, is_long);
506 __ cmp(j_rarg1, (u1)T_LONG);
507 __ br(Assembler::EQ, is_long);
508 __ cmp(j_rarg1, (u1)T_FLOAT);
509 __ br(Assembler::EQ, is_float);
510 __ cmp(j_rarg1, (u1)T_DOUBLE);
511 __ br(Assembler::EQ, is_double);
512
513 // handle T_INT case
514 __ strw(r0, Address(j_rarg2));
515
516 __ BIND(exit);
517
518 // pop parameters
519 __ sub(esp, rfp, -sp_after_call_off * wordSize);
520
521 #ifdef ASSERT
522 // verify that threads correspond
523 {
524 Label L, S;
525 __ ldr(rscratch1, thread);
526 __ cmp(rthread, rscratch1);
527 __ br(Assembler::NE, S);
528 __ get_thread(rscratch1);
529 __ cmp(rthread, rscratch1);
530 __ br(Assembler::EQ, L);
531 __ BIND(S);
532 __ stop("StubRoutines::call_stub: threads must correspond");
533 __ BIND(L);
534 }
535 #endif
536
537 __ pop_cont_fastpath(rthread);
538
539 // restore callee-save registers
540 __ ldpd(v15, v14, d15_save);
541 __ ldpd(v13, v12, d13_save);
542 __ ldpd(v11, v10, d11_save);
543 __ ldpd(v9, v8, d9_save);
544
545 __ ldp(r28, r27, r28_save);
546 __ ldp(r26, r25, r26_save);
547 __ ldp(r24, r23, r24_save);
548 __ ldp(r22, r21, r22_save);
549 __ ldp(r20, r19, r20_save);
550
551 // restore fpcr
552 __ ldr(rscratch1, fpcr_save);
553 __ set_fpcr(rscratch1);
554
555 __ ldp(c_rarg0, c_rarg1, call_wrapper);
556 __ ldrw(c_rarg2, result_type);
557 __ ldr(c_rarg3, method);
558 __ ldp(c_rarg4, c_rarg5, entry_point);
559 __ ldp(c_rarg6, c_rarg7, parameter_size);
560
561 // leave frame and return to caller
562 __ leave();
563 __ ret(lr);
564
565 // handle return types different from T_INT
566
567 __ BIND(is_long);
568 __ str(r0, Address(j_rarg2, 0));
569 __ br(Assembler::AL, exit);
570
571 __ BIND(is_float);
572 __ strs(j_farg0, Address(j_rarg2, 0));
573 __ br(Assembler::AL, exit);
574
575 __ BIND(is_double);
576 __ strd(j_farg0, Address(j_rarg2, 0));
577 __ br(Assembler::AL, exit);
578
579 // record the stub entry and end plus the auxiliary entry
580 store_archive_data(stub_id, start, __ pc(), &entries);
581
582 return start;
583 }
584
585 // Return point for a Java call if there's an exception thrown in
586 // Java code. The exception is caught and transformed into a
587 // pending exception stored in JavaThread that can be tested from
588 // within the VM.
589 //
590 // Note: Usually the parameters are removed by the callee. In case
591 // of an exception crossing an activation frame boundary, that is
592 // not the case if the callee is compiled code => need to setup the
593 // rsp.
594 //
595 // r0: exception oop
596
597 address generate_catch_exception() {
598 StubId stub_id = StubId::stubgen_catch_exception_id;
599 int entry_count = StubInfo::entry_count(stub_id);
600 assert(entry_count == 1, "sanity check");
601 address start = load_archive_data(stub_id);
602 if (start != nullptr) {
603 return start;
604 }
605 StubCodeMark mark(this, stub_id);
606 start = __ pc();
607
608 // same as in generate_call_stub():
609 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
610 const Address thread (rfp, thread_off * wordSize);
611
612 #ifdef ASSERT
613 // verify that threads correspond
614 {
615 Label L, S;
616 __ ldr(rscratch1, thread);
617 __ cmp(rthread, rscratch1);
618 __ br(Assembler::NE, S);
619 __ get_thread(rscratch1);
620 __ cmp(rthread, rscratch1);
621 __ br(Assembler::EQ, L);
622 __ bind(S);
623 __ stop("StubRoutines::catch_exception: threads must correspond");
624 __ bind(L);
625 }
626 #endif
627
628 // set pending exception
629 __ verify_oop(r0);
630
631 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
632 // special case -- add file name string to AOT address table
633 address file = (address)AOTCodeCache::add_C_string(__FILE__);
634 __ lea(rscratch1, ExternalAddress(file));
635 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
636 __ movw(rscratch1, (int)__LINE__);
637 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
638
639 // complete return to VM
640 assert(StubRoutines::_call_stub_return_address != nullptr,
641 "_call_stub_return_address must have been generated before");
642 __ b(RuntimeAddress(StubRoutines::_call_stub_return_address));
643
644 // record the stub entry and end
645 store_archive_data(stub_id, start, __ pc());
646
647 return start;
648 }
649
650 // Continuation point for runtime calls returning with a pending
651 // exception. The pending exception check happened in the runtime
652 // or native call stub. The pending exception in Thread is
653 // converted into a Java-level exception.
654 //
655 // Contract with Java-level exception handlers:
656 // r0: exception
657 // r3: throwing pc
658 //
659 // NOTE: At entry of this stub, exception-pc must be in LR !!
660
661 // NOTE: this is always used as a jump target within generated code
662 // so it just needs to be generated code with no x86 prolog
663
664 address generate_forward_exception() {
665 StubId stub_id = StubId::stubgen_forward_exception_id;
666 int entry_count = StubInfo::entry_count(stub_id);
667 assert(entry_count == 1, "sanity check");
668 address start = load_archive_data(stub_id);
669 if (start != nullptr) {
670 return start;
671 }
672 StubCodeMark mark(this, stub_id);
673 start = __ pc();
674
675 // Upon entry, LR points to the return address returning into
676 // Java (interpreted or compiled) code; i.e., the return address
677 // becomes the throwing pc.
678 //
679 // Arguments pushed before the runtime call are still on the stack
680 // but the exception handler will reset the stack pointer ->
681 // ignore them. A potential result in registers can be ignored as
682 // well.
683
684 #ifdef ASSERT
685 // make sure this code is only executed if there is a pending exception
686 {
687 Label L;
688 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
689 __ cbnz(rscratch1, L);
690 __ stop("StubRoutines::forward exception: no pending exception (1)");
691 __ bind(L);
692 }
693 #endif
694
695 // compute exception handler into r19
696
697 // call the VM to find the handler address associated with the
698 // caller address. pass thread in r0 and caller pc (ret address)
699 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
700 // the stack.
701 __ mov(c_rarg1, lr);
702 // lr will be trashed by the VM call so we move it to R19
703 // (callee-saved) because we also need to pass it to the handler
704 // returned by this call.
705 __ mov(r19, lr);
706 BLOCK_COMMENT("call exception_handler_for_return_address");
707 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
708 SharedRuntime::exception_handler_for_return_address),
709 rthread, c_rarg1);
710 // Reinitialize the ptrue predicate register, in case the external runtime
711 // call clobbers ptrue reg, as we may return to SVE compiled code.
712 __ reinitialize_ptrue();
713
714 // we should not really care that lr is no longer the callee
715 // address. we saved the value the handler needs in r19 so we can
716 // just copy it to r3. however, the C2 handler will push its own
717 // frame and then calls into the VM and the VM code asserts that
718 // the PC for the frame above the handler belongs to a compiled
719 // Java method. So, we restore lr here to satisfy that assert.
720 __ mov(lr, r19);
721 // setup r0 & r3 & clear pending exception
722 __ mov(r3, r19);
723 __ mov(r19, r0);
724 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
725 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
726
727 #ifdef ASSERT
728 // make sure exception is set
729 {
730 Label L;
731 __ cbnz(r0, L);
732 __ stop("StubRoutines::forward exception: no pending exception (2)");
733 __ bind(L);
734 }
735 #endif
736
737 // continue at exception handler
738 // r0: exception
739 // r3: throwing pc
740 // r19: exception handler
741 __ verify_oop(r0);
742 __ br(r19);
743
744 // record the stub entry and end
745 store_archive_data(stub_id, start, __ pc());
746
747 return start;
748 }
749
750 // Non-destructive plausibility checks for oops
751 //
752 // Arguments:
753 // r0: oop to verify
754 // rscratch1: error message
755 //
756 // Stack after saving c_rarg3:
757 // [tos + 0]: saved c_rarg3
758 // [tos + 1]: saved c_rarg2
759 // [tos + 2]: saved lr
760 // [tos + 3]: saved rscratch2
761 // [tos + 4]: saved r0
762 // [tos + 5]: saved rscratch1
763 address generate_verify_oop() {
// Generates the verify_oop stub and returns its entry address.
//
// The emitted code increments the counter found at
// StubRoutines::verify_oop_count_addr(), accepts a null oop in r0
// immediately, and otherwise hands r0 to the current barrier set's
// check_oop(). On the error path it pushes r0..r29, calls
// MacroAssembler::debug64(msg, pc, regs) and then emits a hlt.
764 StubId stub_id = StubId::stubgen_verify_oop_id;
765 int entry_count = StubInfo::entry_count(stub_id);
766 assert(entry_count == 1, "sanity check");
// If load_archive_data() hands back an address, use it and emit nothing.
767 address start = load_archive_data(stub_id);
768 if (start != nullptr) {
769 return start;
770 }
771 StubCodeMark mark(this, stub_id);
772 start = __ pc();
773
774 Label exit, error;
775
776 // save c_rarg2 and c_rarg3
// (pre-indexed store: sp is moved down by 16 and the pair is stored there)
777 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
778
// Plain load / add 1 / store of the verify-oop counter, using the two
// registers saved above as scratch.
779 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
780 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
781 __ ldr(c_rarg3, Address(c_rarg2));
782 __ add(c_rarg3, c_rarg3, 1);
783 __ str(c_rarg3, Address(c_rarg2));
784
785 // object is in r0
786 // make sure object is 'reasonable'
787 __ cbz(r0, exit); // if obj is null it is OK
788
// The actual plausibility check is delegated to the barrier set assembler.
// NOTE(review): presumably check_oop branches to 'error' when the oop looks
// bad and falls through otherwise, using c_rarg2/c_rarg3 as temps - confirm
// against BarrierSetAssembler::check_oop.
789 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
790 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
791
792 // return if everything seems ok
793 __ bind(exit);
794
// Undo the save at the top (post-indexed load pops the 16 bytes) and return.
795 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
796 __ ret(lr);
797
798 // handle errors
799 __ bind(error);
// Restore c_rarg2/c_rarg3 first, then push r0..r29 so that sp points at the
// register block passed to debug64 as its third argument.
800 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
801
802 __ push(RegSet::range(r0, r29), sp);
803 // debug(char* msg, int64_t pc, int64_t regs[])
804 __ mov(c_rarg0, rscratch1); // pass address of error message
805 __ mov(c_rarg1, lr); // pass return address
806 __ mov(c_rarg2, sp); // pass address of regs on stack
807 #ifndef PRODUCT
808 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
809 #endif
810 BLOCK_COMMENT("call MacroAssembler::debug");
// rscratch1 was already copied into c_rarg0, so it is free to hold the target.
811 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
812 __ blr(rscratch1);
// NOTE(review): the hlt suggests debug64 is not expected to return - confirm.
813 __ hlt(0);
814
815 // record the stub entry and end
816 store_archive_data(stub_id, start, __ pc());
817
818 return start;
819 }
820
821 // Generate indices for iota vector.
822 void generate_iota_indices(StubId stub_id) {
// Emits a constant table of "iota" index vectors and publishes the address
// of each row in StubRoutines::aarch64::_vector_iota_indices[].
//
// The table has VECTOR_IOTA_COUNT rows, each made of two 64-bit words
// (16 bytes): B, H, S, D integer lanes, then S and D floating-point lanes.
// Row 0 is the stub start; the addresses of rows 1.. are collected in
// 'entries' as the pc reached after emitting the preceding row.
823 GrowableArray<address> entries;
824 int entry_count = StubInfo::entry_count(stub_id);
825 assert(entry_count == VECTOR_IOTA_COUNT, "sanity check");
// If load_archive_data() returns an address, it has also filled 'entries'
// with the secondary row addresses; install them and emit nothing.
826 address start = load_archive_data(stub_id, &entries);
827 if (start != nullptr) {
828 assert(entries.length() == entry_count - 1,
829 "unexpected entries count %d", entries.length());
830 StubRoutines::aarch64::_vector_iota_indices[0] = start;
831 for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
832 StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
833 }
834 return;
835 }
836 __ align(CodeEntryAlignment);
837 StubCodeMark mark(this, stub_id);
838 start = __ pc();
839 // B
// byte lanes 0..15
840 __ emit_data64(0x0706050403020100, relocInfo::none);
841 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
842 entries.append(__ pc());
843 // H
// 16-bit lanes 0..7
844 __ emit_data64(0x0003000200010000, relocInfo::none);
845 __ emit_data64(0x0007000600050004, relocInfo::none);
846 entries.append(__ pc());
847 // S
// 32-bit lanes 0..3
848 __ emit_data64(0x0000000100000000, relocInfo::none);
849 __ emit_data64(0x0000000300000002, relocInfo::none);
850 entries.append(__ pc());
851 // D
// 64-bit lanes 0..1
852 __ emit_data64(0x0000000000000000, relocInfo::none);
853 __ emit_data64(0x0000000000000001, relocInfo::none);
854 entries.append(__ pc());
855 // S - FP
856 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
857 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
858 entries.append(__ pc());
859 // D - FP
// Last row: no entry is appended after it, so 'entries' ends up holding
// one address fewer than there are rows (checked by the assert below).
860 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
861 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
862
863 // record the stub entry and end
864 store_archive_data(stub_id, start, __ pc(), &entries);
865
866 // install the entry addresses in the entry array
867 assert(entries.length() == entry_count - 1,
868 "unexpected entries count %d", entries.length());
869 StubRoutines::aarch64::_vector_iota_indices[0] = start;
870 for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
871 StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
872 }
873 }
874
875 // The inner part of zero_words(). This is the bulk operation,
876 // zeroing words in blocks, possibly using DC ZVA to do it. The
877 // caller is responsible for zeroing the last few words.
878 //
879 // Inputs:
880 // r10: the HeapWord-aligned base address of an array to zero.
881 // r11: the count in HeapWords, r11 > 0.
882 //
883 // Returns r10 and r11, adjusted for the caller to clear.
884 // r10: the base address of the tail of words left to clear.
885 // r11: the number of words in the tail.
886 // r11 < MacroAssembler::zero_words_block_size.
887
888 address generate_zero_blocks() {
// Generates the zero_blocks stub and returns its entry address.
//
// Emitted code, in order:
//   1. (only if UseBlockZeroing) make base 16-byte aligned by zeroing one
//      word if needed, then call zero_dcache_blocks() when enough words
//      remain;
//   2. an unrolled loop of stp zr, zr stores clearing
//      MacroAssembler::zero_words_block_size words per iteration;
//   3. ret, leaving base (r10) and the remaining word count (r11) for the
//      caller.
889 StubId stub_id = StubId::stubgen_zero_blocks_id;
890 int entry_count = StubInfo::entry_count(stub_id);
891 assert(entry_count == 1, "sanity check");
// If load_archive_data() hands back an address, use it and emit nothing.
892 address start = load_archive_data(stub_id);
893 if (start != nullptr) {
894 return start;
895 }
896 __ align(CodeEntryAlignment);
897 StubCodeMark mark(this, stub_id);
898 Label done;
899 Label base_aligned;
900
901 Register base = r10, cnt = r11;
902
903 start = __ pc();
904
905 if (UseBlockZeroing) {
906 int zva_length = VM_Version::zva_length();
907
908 // Ensure ZVA length can be divided by 16. This is required by
909 // the subsequent operations.
910 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
911
// base is HeapWord-aligned on entry (see the stub's header comment), so
// bit 3 tells whether it is also 16-byte aligned. If it is not, zero one
// word, advance base by 8 and drop the count by one.
912 __ tbz(base, 3, base_aligned);
913 __ str(zr, Address(__ post(base, 8)));
914 __ sub(cnt, cnt, 1);
915 __ bind(base_aligned);
916
917 // Ensure count >= zva_length * 2 so that it still deserves a zva after
918 // alignment.
919 Label small;
920 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
// cnt is in words, so the limit is shifted right by 3 before comparing
// (NOTE(review): this implies low_limit is a byte count - confirm).
// The result goes to rscratch1, so cnt itself is not modified here.
921 __ subs(rscratch1, cnt, low_limit >> 3);
922 __ br(Assembler::LT, small);
923 __ zero_dcache_blocks(base, cnt);
924 __ bind(small);
925 }
926
927 {
928 // Number of stp instructions we'll unroll
929 const int unroll =
930 MacroAssembler::zero_words_block_size / 2;
931 // Clear the remaining blocks.
// cnt is pre-decremented by one block (unroll * 2 words); the loop runs
// while the result stays non-negative, and the final add at 'done' undoes
// the last, unsatisfied decrement so cnt holds the tail word count.
932 Label loop;
933 __ subs(cnt, cnt, unroll * 2);
934 __ br(Assembler::LT, done);
935 __ bind(loop);
936 for (int i = 0; i < unroll; i++)
937 __ stp(zr, zr, __ post(base, 16));
938 __ subs(cnt, cnt, unroll * 2);
939 __ br(Assembler::GE, loop);
940 __ bind(done);
941 __ add(cnt, cnt, unroll * 2);
942 }
943
944 __ ret(lr);
945
946 // record the stub entry and end
947 store_archive_data(stub_id, start, __ pc());
948
949 return start;
950 }
951
952
953 typedef enum {
// Direction of an array copy. The numeric values are significant:
// generate_copy_longs computes its signed per-word step as
// wordSize * direction, so forwards must be +1 and backwards -1.
954 copy_forwards = 1,
955 copy_backwards = -1
956 } copy_direction;
957
958 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
959 // for arraycopy stubs.
960 class ArrayCopyBarrierSetHelper : StackObj {
961 BarrierSetAssembler* _bs_asm;
962 MacroAssembler* _masm;
963 DecoratorSet _decorators;
964 BasicType _type;
965 Register _gct1;
966 Register _gct2;
967 Register _gct3;
968 FloatRegister _gcvt1;
969 FloatRegister _gcvt2;
970 FloatRegister _gcvt3;
971
972 public:
973 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
974 DecoratorSet decorators,
975 BasicType type,
976 Register gct1,
977 Register gct2,
978 Register gct3,
979 FloatRegister gcvt1,
980 FloatRegister gcvt2,
981 FloatRegister gcvt3)
982 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
983 _masm(masm),
984 _decorators(decorators),
985 _type(type),
986 _gct1(gct1),
987 _gct2(gct2),
988 _gct3(gct3),
989 _gcvt1(gcvt1),
990 _gcvt2(gcvt2),
991 _gcvt3(gcvt3) {
992 }
993
994 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
995 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
996 dst1, dst2, src,
997 _gct1, _gct2, _gcvt1);
998 }
999
1000 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
1001 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
1002 dst, src1, src2,
1003 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
1004 }
1005
1006 void copy_load_at_16(Register dst1, Register dst2, Address src) {
1007 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
1008 dst1, dst2, src,
1009 _gct1);
1010 }
1011
1012 void copy_store_at_16(Address dst, Register src1, Register src2) {
1013 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
1014 dst, src1, src2,
1015 _gct1, _gct2, _gct3);
1016 }
1017
1018 void copy_load_at_8(Register dst, Address src) {
1019 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
1020 dst, noreg, src,
1021 _gct1);
1022 }
1023
1024 void copy_store_at_8(Address dst, Register src) {
1025 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
1026 dst, src, noreg,
1027 _gct1, _gct2, _gct3);
1028 }
1029 };
1030
1031 // Bulk copy of blocks of 8 words.
1032 //
1033 // count is a count of words.
1034 //
1035 // Precondition: count >= 8
1036 //
1037 // Postconditions:
1038 //
1039 // The least significant bit of count contains the remaining count
1040 // of words to copy. The rest of count is trash.
1041 //
1042 // s and d are adjusted to point to the remaining words to copy
1043 //
1044 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
// Generates one of the bulk "copy longs" stubs and returns its entry address.
//
//   stub_id    - selects the copy direction and element type (see the
//                switch below)
//   decorators - passed through to the barrier-set copy helpers
//   s, d       - source / destination address registers used by the stub
//   count      - register holding the word count (>= 8 on entry)
//
// The emitted code is software-pipelined: 8 words are loaded up front, the
// main loop stores the previous 8 words while loading the next 8, and the
// "drain" section stores the final 8. Sub-blocks of 4 and 2 words are then
// handled by testing bits 2 and 1 of count. Because count is only ever
// reduced by multiples of 8, those low bits still reflect the entry value.
//
// With AvoidUnalignedAccesses a second variant of the whole sequence is
// emitted after the first ret; it is entered when bit 3 of d is set.
1045 int entry_count = StubInfo::entry_count(stub_id);
1046 assert(entry_count == 1, "sanity check");
// If load_archive_data() hands back an address, use it and emit nothing.
1047 address start = load_archive_data(stub_id);
1048 if (start != nullptr) {
1049 return start;
1050 }
1051 BasicType type;
1052 copy_direction direction;
1053
// Map the stub id to (direction, type). The "uninit" oop stubs get the same
// direction/type as the plain oop stubs here; any difference between them
// must come from the 'decorators' argument.
1054 switch (stub_id) {
1055 case StubId::stubgen_copy_byte_f_id:
1056 direction = copy_forwards;
1057 type = T_BYTE;
1058 break;
1059 case StubId::stubgen_copy_byte_b_id:
1060 direction = copy_backwards;
1061 type = T_BYTE;
1062 break;
1063 case StubId::stubgen_copy_oop_f_id:
1064 direction = copy_forwards;
1065 type = T_OBJECT;
1066 break;
1067 case StubId::stubgen_copy_oop_b_id:
1068 direction = copy_backwards;
1069 type = T_OBJECT;
1070 break;
1071 case StubId::stubgen_copy_oop_uninit_f_id:
1072 direction = copy_forwards;
1073 type = T_OBJECT;
1074 break;
1075 case StubId::stubgen_copy_oop_uninit_b_id:
1076 direction = copy_backwards;
1077 type = T_OBJECT;
1078 break;
1079 default:
1080 ShouldNotReachHere();
1081 }
1082
// unit is the signed size of one word step (+wordSize forwards, -wordSize
// backwards). bias is the amount s and d are moved down for a forwards copy
// (4 words with SIMD, 2 words otherwise) so that the same positive
// multiples of unit can be used as offsets in both directions.
1083 int unit = wordSize * direction;
1084 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
1085
1086 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
1087 t4 = r7, t5 = r11, t6 = r12, t7 = r13;
1088 const Register stride = r14;
1089 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1090 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1091 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1092
1093 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
1094 assert_different_registers(s, d, count, rscratch1, rscratch2);
1095
1096 Label again, drain;
1097
1098 __ align(CodeEntryAlignment);
1099
1100 StubCodeMark mark(this, stub_id);
1101
1102 start = __ pc();
1103
// When unaligned accesses are to be avoided, a destination with bit 3 set
// is sent to the alternative sequence emitted at the end of this stub.
1104 Label unaligned_copy_long;
1105 if (AvoidUnalignedAccesses) {
1106 __ tbnz(d, 3, unaligned_copy_long);
1107 }
1108
1109 if (direction == copy_forwards) {
1110 __ sub(s, s, bias);
1111 __ sub(d, d, bias);
1112 }
1113
1114 #ifdef ASSERT
1115 // Make sure we are never given < 8 words
1116 {
1117 Label L;
1118 __ cmp(count, (u1)8);
1119 __ br(Assembler::GE, L);
1120 __ stop("genrate_copy_longs called with < 8 words");
1121 __ bind(L);
1122 }
1123 #endif
1124
1125 // Fill 8 registers
// The last load of the group uses pre-indexing, so s advances by 8 words
// (one block) per group.
1126 if (UseSIMDForMemoryOps) {
1127 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
1128 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
1129 } else {
1130 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1131 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1132 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1133 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1134 }
1135
// 8 words are already in registers. Enter the loop only if at least 8 more
// can be loaded, i.e. count >= 16; otherwise go straight to the drain.
1136 __ subs(count, count, 16);
1137 __ br(Assembler::LO, drain);
1138
// Prefetch distance. For a backwards copy the distance is negated, and if
// the interval exceeds 256 it is materialized in the 'stride' register
// instead of being used as an immediate offset.
// NOTE(review): presumably because such a negative offset does not fit the
// prfm immediate form - confirm.
1139 int prefetch = PrefetchCopyIntervalInBytes;
1140 bool use_stride = false;
1141 if (direction == copy_backwards) {
1142 use_stride = prefetch > 256;
1143 prefetch = -prefetch;
1144 if (use_stride) __ mov(stride, prefetch);
1145 }
1146
1147 __ bind(again);
1148
1149 if (PrefetchCopyIntervalInBytes > 0)
1150 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1151
// Main loop body: each store of previously loaded data is followed by the
// load that refills the same registers with the next block's data.
1152 if (UseSIMDForMemoryOps) {
1153 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
1154 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
1155 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
1156 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
1157 } else {
1158 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1159 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1160 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
1161 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1162 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
1163 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1164 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
1165 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1166 }
1167
1168 __ subs(count, count, 8);
1169 __ br(Assembler::HS, again);
1170
1171 // Drain
// Store the 8 words still held in registers; nothing further is loaded.
1172 __ bind(drain);
1173 if (UseSIMDForMemoryOps) {
1174 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
1175 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
1176 } else {
1177 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1178 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
1179 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
1180 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
1181 }
1182
// Tail: bit 2 of count selects a 4-word sub-block, bit 1 a 2-word one.
1183 {
1184 Label L1, L2;
1185 __ tbz(count, exact_log2(4), L1);
1186 if (UseSIMDForMemoryOps) {
1187 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
1188 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
1189 } else {
1190 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1191 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1192 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1193 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
1194 }
1195 __ bind(L1);
1196
// Remove the forwards bias before the 2-word step, which addresses s and d
// through adjust() rather than through biased offsets.
1197 if (direction == copy_forwards) {
1198 __ add(s, s, bias);
1199 __ add(d, d, bias);
1200 }
1201
1202 __ tbz(count, 1, L2);
1203 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
1204 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
1205 __ bind(L2);
1206 }
1207
1208 __ ret(lr);
1209
// Alternative sequence for a destination with bit 3 set. It never uses the
// SIMD helpers: loads are 16-byte pairs, stores are split into a single
// word, three pairs and a single word.
1210 if (AvoidUnalignedAccesses) {
1211 Label drain, again;
1212 // Register order for storing. Order is different for backward copy.
1213
1214 __ bind(unaligned_copy_long);
1215
1216 // source address is even aligned, target odd aligned
1217 //
1218 // when forward copying word pairs we read long pairs at offsets
1219 // {0, 2, 4, 6} (in long words). when backwards copying we read
1220 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
1221 // address by -2 in the forwards case so we can compute the
1222 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
1223 // or -1.
1224 //
1225 // when forward copying we need to store 1 word, 3 pairs and
1226 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
1227 // zero offset we adjust the destination by -1 which means we
1228 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
1229 //
1230 // When backwards copying we need to store 1 word, 3 pairs and
1231 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
1232 // offsets {1, 3, 5, 7, 8} * unit.
1233
1234 if (direction == copy_forwards) {
1235 __ sub(s, s, 16);
1236 __ sub(d, d, 8);
1237 }
1238
1239 // Fill 8 registers
1240 //
1241 // for forwards copy s was offset by -16 from the original input
1242 // value of s so the register contents are at these offsets
1243 // relative to the 64 byte block addressed by that original input
1244 // and so on for each successive 64 byte block when s is updated
1245 //
1246 // t0 at offset 0, t1 at offset 8
1247 // t2 at offset 16, t3 at offset 24
1248 // t4 at offset 32, t5 at offset 40
1249 // t6 at offset 48, t7 at offset 56
1250
1251 // for backwards copy s was not offset so the register contents
1252 // are at these offsets into the preceding 64 byte block
1253 // relative to that original input and so on for each successive
1254 // preceding 64 byte block when s is updated. this explains the
1255 // slightly counter-intuitive looking pattern of register usage
1256 // in the stp instructions for backwards copy.
1257 //
1258 // t0 at offset -16, t1 at offset -8
1259 // t2 at offset -32, t3 at offset -24
1260 // t4 at offset -48, t5 at offset -40
1261 // t6 at offset -64, t7 at offset -56
1262
1263 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1264 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1265 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1266 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1267
// Same pipelining as the aligned variant: loop only if count >= 16.
1268 __ subs(count, count, 16);
1269 __ br(Assembler::LO, drain);
1270
1271 int prefetch = PrefetchCopyIntervalInBytes;
1272 bool use_stride = false;
1273 if (direction == copy_backwards) {
1274 use_stride = prefetch > 256;
1275 prefetch = -prefetch;
1276 if (use_stride) __ mov(stride, prefetch);
1277 }
1278
1279 __ bind(again);
1280
1281 if (PrefetchCopyIntervalInBytes > 0)
1282 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1283
1284 if (direction == copy_forwards) {
1285 // allowing for the offset of -8 the store instructions place
1286 // registers into the target 64 byte block at the following
1287 // offsets
1288 //
1289 // t0 at offset 0
1290 // t1 at offset 8, t2 at offset 16
1291 // t3 at offset 24, t4 at offset 32
1292 // t5 at offset 40, t6 at offset 48
1293 // t7 at offset 56
1294
1295 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1296 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1297 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1298 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1299 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1300 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1301 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1302 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1303 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1304 } else {
1305 // d was not offset when we started so the registers are
1306 // written into the 64 byte block preceding d with the following
1307 // offsets
1308 //
1309 // t1 at offset -8
1310 // t3 at offset -24, t0 at offset -16
1311 // t5 at offset -40, t2 at offset -32
1312 // t7 at offset -56, t4 at offset -48
1313 // t6 at offset -64
1314 //
1315 // note that this matches the offsets previously noted for the
1316 // loads
1317
1318 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1319 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1320 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1321 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1322 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1323 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1324 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1325 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1326 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1327 }
1328
1329 __ subs(count, count, 8);
1330 __ br(Assembler::HS, again);
1331
1332 // Drain
1333 //
1334 // this uses the same pattern of offsets and register arguments
1335 // as above
1336 __ bind(drain);
1337 if (direction == copy_forwards) {
1338 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1339 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1340 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1341 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1342 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1343 } else {
1344 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1345 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1346 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1347 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1348 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1349 }
1350 // now we need to copy any remaining part block which may
1351 // include a 4 word block subblock and/or a 2 word subblock.
1352 // bits 2 and 1 in the count are the tell-tale for whether we
1353 // have each such subblock
1354 {
1355 Label L1, L2;
1356 __ tbz(count, exact_log2(4), L1);
1357 // this is the same as above but copying only 4 longs hence
1358 // with only one intervening stp between the str instructions
1359 // but note that the offsets and registers still follow the
1360 // same pattern
1361 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1362 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1363 if (direction == copy_forwards) {
1364 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1365 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1366 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1367 } else {
1368 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1369 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1370 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1371 }
1372 __ bind(L1);
1373
1374 __ tbz(count, 1, L2);
1375 // this is the same as above but copying only 2 longs hence
1376 // there is no intervening stp between the str instructions
1377 // but note that the offset and register patterns are still
1378 // the same
1379 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1380 if (direction == copy_forwards) {
1381 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1382 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1383 } else {
1384 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1385 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1386 }
1387 __ bind(L2);
1388
1389 // for forwards copy we need to re-adjust the offsets we
1390 // applied so that s and d follow the last words written
1391
1392 if (direction == copy_forwards) {
1393 __ add(s, s, 16);
1394 __ add(d, d, 8);
1395 }
1396
1397 }
1398
1399 __ ret(lr);
1400 }
1401
1402 // record the stub entry and end
1403 store_archive_data(stub_id, start, __ pc());
1404
1405 return start;
1406 }
1407
1408 // Small copy: less than 16 bytes.
1409 //
1410 // NB: Ignores all of the bits of count which represent more than 15
1411 // bytes, so a caller doesn't have to mask them.
1412
1413 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
// Emits inline code copying up to 15 bytes (8 + 4 + 2 + 1) from s to d.
//
//   decorators, type - passed to the barrier-set helper used for the
//                      8-byte piece
//   s, d             - source / destination address registers; each piece
//                      addresses them through adjust(), so they move as
//                      pieces are copied
//   count            - element count register; only the bits that select
//                      the 8/4/2/1-byte pieces are tested
//   step             - signed element size in bytes; negative means a
//                      backwards copy
//
// Pieces smaller than the element size can never be needed, so the 4-, 2-
// and 1-byte pieces are only emitted when granularity allows them.
1414 bool is_backwards = step < 0;
1415 size_t granularity = g_uabs(step);
1416 int direction = is_backwards ? -1 : 1;
1417
1418 Label Lword, Lint, Lshort, Lbyte;
1419
1420 assert(granularity
1421 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1422
1423 const Register t0 = r3;
1424 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1425 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1426
1427 // ??? I don't know if this bit-test-and-branch is the right thing
1428 // to do. It does a lot of jumping, resulting in several
1429 // mispredicted branches. It might make more sense to do this
1430 // with something like Duff's device with a single computed branch.
1431
// 8-byte piece: bit (3 - log2(granularity)) of count is the bit worth
// 8 bytes when count is expressed in elements. This is the only piece that
// goes through the barrier-set helper.
1432 __ tbz(count, 3 - exact_log2(granularity), Lword);
1433 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1434 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1435 __ bind(Lword);
1436
// 4-byte piece, using plain ldrw/strw.
1437 if (granularity <= sizeof (jint)) {
1438 __ tbz(count, 2 - exact_log2(granularity), Lint);
1439 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1440 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1441 __ bind(Lint);
1442 }
1443
// 2-byte piece, using plain ldrh/strh.
1444 if (granularity <= sizeof (jshort)) {
1445 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1446 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1447 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1448 __ bind(Lshort);
1449 }
1450
// 1-byte piece, using plain ldrb/strb; only possible for byte elements.
1451 if (granularity <= sizeof (jbyte)) {
1452 __ tbz(count, 0, Lbyte);
1453 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1454 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1455 __ bind(Lbyte);
1456 }
1457 }
1458
1459 // All-singing all-dancing memory copy.
1460 //
1461 // Copy count units of memory from s to d. The size of a unit is
1462 // step, which can be positive or negative depending on the direction
1463 // of copy. If is_aligned is false, we align the source address.
1464 //
1465
1466 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1467 Register s, Register d, Register count, int step) {
// Emits the inline body of an array copy of 'count' elements from s to d.
//
//   decorators, type - passed to the barrier-set helpers; they also select
//                      which bulk copy stub is called
//   is_aligned       - selects how s is brought to a 2-word boundary before
//                      the bulk copy (see the 'aligned' label below)
//   s, d, count      - source address, destination address, element count
//   step             - signed element size in bytes; negative means a
//                      backwards copy
//
// Small copies (up to 80 bytes, or 96 with SIMD) are done inline with
// possibly overlapping accesses anchored at both ends of the range. Larger
// copies align s, call one of the generate_copy_longs stubs with the word
// count in r15, and finish the tail with copy_memory_small.
1468 copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1469 bool is_backwards = step < 0;
1470 unsigned int granularity = g_uabs(step);
1471 const Register t0 = r3, t1 = r4;
1472
1473 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
1474 // load all the data before writing anything
1475 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1476 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1477 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1478 const Register send = r17, dend = r16;
1479 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1480 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1481 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1482
1483 if (PrefetchCopyIntervalInBytes > 0)
1484 __ prfm(Address(s, 0), PLDL1KEEP);
// All size thresholds below are byte sizes divided by granularity, because
// count is in elements.
1485 __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1486 __ br(Assembler::HI, copy_big);
1487
// send/dend point one past the last source/destination byte.
1488 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1489 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1490
1491 __ cmp(count, u1(16/granularity));
1492 __ br(Assembler::LS, copy16);
1493
1494 __ cmp(count, u1(64/granularity));
1495 __ br(Assembler::HI, copy80);
1496
1497 __ cmp(count, u1(32/granularity));
1498 __ br(Assembler::LS, copy32);
1499
1500 // 33..64 bytes
// First 32 bytes from the start plus last 32 bytes from the end; the two
// halves may overlap in the middle.
1501 if (UseSIMDForMemoryOps) {
1502 bs.copy_load_at_32(v0, v1, Address(s, 0));
1503 bs.copy_load_at_32(v2, v3, Address(send, -32));
1504 bs.copy_store_at_32(Address(d, 0), v0, v1);
1505 bs.copy_store_at_32(Address(dend, -32), v2, v3);
1506 } else {
1507 bs.copy_load_at_16(t0, t1, Address(s, 0));
1508 bs.copy_load_at_16(t2, t3, Address(s, 16));
1509 bs.copy_load_at_16(t4, t5, Address(send, -32));
1510 bs.copy_load_at_16(t6, t7, Address(send, -16));
1511
1512 bs.copy_store_at_16(Address(d, 0), t0, t1);
1513 bs.copy_store_at_16(Address(d, 16), t2, t3);
1514 bs.copy_store_at_16(Address(dend, -32), t4, t5);
1515 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1516 }
1517 __ b(finish);
1518
1519 // 17..32 bytes
1520 __ bind(copy32);
1521 bs.copy_load_at_16(t0, t1, Address(s, 0));
1522 bs.copy_load_at_16(t6, t7, Address(send, -16));
1523
1524 bs.copy_store_at_16(Address(d, 0), t0, t1);
1525 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1526 __ b(finish);
1527
1528 // 65..80/96 bytes
1529 // (96 bytes if SIMD because we do 32 bytes per instruction)
1530 __ bind(copy80);
1531 if (UseSIMDForMemoryOps) {
1532 bs.copy_load_at_32(v0, v1, Address(s, 0));
1533 bs.copy_load_at_32(v2, v3, Address(s, 32));
1534 // Unaligned pointers can be an issue for copying.
1535 // The issue has more chances to happen when granularity of data is
1536 // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
1537 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1538 // The most performance drop has been seen for the range 65-80 bytes.
1539 // For such cases using the pair of ldp/stp instead of the third pair of
1540 // ldpq/stpq fixes the performance issue.
1541 if (granularity < sizeof (jint)) {
1542 Label copy96;
1543 __ cmp(count, u1(80/granularity));
1544 __ br(Assembler::HI, copy96);
1545 bs.copy_load_at_16(t0, t1, Address(send, -16));
1546
1547 bs.copy_store_at_32(Address(d, 0), v0, v1);
1548 bs.copy_store_at_32(Address(d, 32), v2, v3);
1549
1550 bs.copy_store_at_16(Address(dend, -16), t0, t1);
1551 __ b(finish);
1552
1553 __ bind(copy96);
1554 }
1555 bs.copy_load_at_32(v4, v5, Address(send, -32));
1556
1557 bs.copy_store_at_32(Address(d, 0), v0, v1);
1558 bs.copy_store_at_32(Address(d, 32), v2, v3);
1559
1560 bs.copy_store_at_32(Address(dend, -32), v4, v5);
1561 } else {
1562 bs.copy_load_at_16(t0, t1, Address(s, 0));
1563 bs.copy_load_at_16(t2, t3, Address(s, 16));
1564 bs.copy_load_at_16(t4, t5, Address(s, 32));
1565 bs.copy_load_at_16(t6, t7, Address(s, 48));
1566 bs.copy_load_at_16(t8, t9, Address(send, -16));
1567
1568 bs.copy_store_at_16(Address(d, 0), t0, t1);
1569 bs.copy_store_at_16(Address(d, 16), t2, t3);
1570 bs.copy_store_at_16(Address(d, 32), t4, t5);
1571 bs.copy_store_at_16(Address(d, 48), t6, t7);
1572 bs.copy_store_at_16(Address(dend, -16), t8, t9);
1573 }
1574 __ b(finish);
1575
1576 // 0..16 bytes
1577 __ bind(copy16);
1578 __ cmp(count, u1(8/granularity));
1579 __ br(Assembler::LO, copy8);
1580
1581 // 8..16 bytes
1582 bs.copy_load_at_8(t0, Address(s, 0));
1583 bs.copy_load_at_8(t1, Address(send, -8));
1584 bs.copy_store_at_8(Address(d, 0), t0);
1585 bs.copy_store_at_8(Address(dend, -8), t1);
1586 __ b(finish);
1587
// The sub-8-byte cases only exist for elements smaller than 8 bytes. For
// larger granularities the copy8/copy4 labels are bound next to 'finish'
// at the very end of this function instead.
1588 if (granularity < 8) {
1589 // 4..7 bytes
1590 __ bind(copy8);
1591 __ tbz(count, 2 - exact_log2(granularity), copy4);
1592 __ ldrw(t0, Address(s, 0));
1593 __ ldrw(t1, Address(send, -4));
1594 __ strw(t0, Address(d, 0));
1595 __ strw(t1, Address(dend, -4));
1596 __ b(finish);
1597 if (granularity < 4) {
1598 // 0..3 bytes
1599 __ bind(copy4);
1600 __ cbz(count, finish); // get rid of 0 case
1601 if (granularity == 2) {
1602 __ ldrh(t0, Address(s, 0));
1603 __ strh(t0, Address(d, 0));
1604 } else { // granularity == 1
1605 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1606 // the first and last byte.
1607 // Handle the 3 byte case by loading and storing base + count/2
1608 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1609 // This does mean in the 1 byte case we load/store the same
1610 // byte 3 times.
1611 __ lsr(count, count, 1);
1612 __ ldrb(t0, Address(s, 0));
1613 __ ldrb(t1, Address(send, -1));
1614 __ ldrb(t2, Address(s, count));
1615 __ strb(t0, Address(d, 0));
1616 __ strb(t1, Address(dend, -1));
1617 __ strb(t2, Address(d, count));
1618 }
1619 __ b(finish);
1620 }
1621 }
1622
1623 __ bind(copy_big);
// A backwards copy starts from the end: move s and d past the last element.
1624 if (is_backwards) {
1625 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1626 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1627 }
1628
1629 // Now we've got the small case out of the way we can align the
1630 // source address on a 2-word boundary.
1631
1632 // Here we will materialize a count in r15, which is used by copy_memory_small
1633 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1634 // Up until here, we have used t9, which aliases r15, but from here on, that register
1635 // can not be used as a temp register, as it contains the count.
1636
1637 Label aligned;
1638
1639 if (is_aligned) {
1640 // We may have to adjust by 1 word to get s 2-word-aligned.
// Bit 3 of s clear means s is already on a 2-word boundary; otherwise copy
// exactly one word and reduce count by the elements that word holds.
1641 __ tbz(s, exact_log2(wordSize), aligned);
1642 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1643 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1644 __ sub(count, count, wordSize/granularity);
1645 } else {
// Distance in bytes from s to the next 2-word boundary in the direction of
// the copy: s mod 16 when going backwards, (-s) mod 16 when going forwards.
1646 if (is_backwards) {
1647 __ andr(r15, s, 2 * wordSize - 1);
1648 } else {
1649 __ neg(r15, s);
1650 __ andr(r15, r15, 2 * wordSize - 1);
1651 }
1652 // r15 is the byte adjustment needed to align s.
1653 __ cbz(r15, aligned);
// Convert the byte adjustment into elements before subtracting it from count.
1654 int shift = exact_log2(granularity);
1655 if (shift > 0) {
1656 __ lsr(r15, r15, shift);
1657 }
1658 __ sub(count, count, r15);
1659
1660 #if 0
1661 // ?? This code is only correct for a disjoint copy. It may or
1662 // may not make sense to use it in that case.
1663
1664 // Copy the first pair; s and d may not be aligned.
1665 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1666 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1667
1668 // Align s and d, adjust count
1669 if (is_backwards) {
1670 __ sub(s, s, r15);
1671 __ sub(d, d, r15);
1672 } else {
1673 __ add(s, s, r15);
1674 __ add(d, d, r15);
1675 }
1676 #else
// Copy the r15 leading elements (fewer than 16 bytes) piecewise.
1677 copy_memory_small(decorators, type, s, d, r15, step);
1678 #endif
1679 }
1680
1681 __ bind(aligned);
1682
1683 // s is now 2-word-aligned.
1684
1685 // We have a count of units and some trailing bytes. Adjust the
1686 // count and do a bulk copy of words. If the shift is zero
1687 // perform a move instead to benefit from zero latency moves.
1688 int shift = exact_log2(wordSize/granularity);
1689 if (shift > 0) {
1690 __ lsr(r15, count, shift);
1691 } else {
1692 __ mov(r15, count);
1693 }
// Call the bulk stub matching direction, element type and (for oops)
// whether the destination is uninitialized. r15 holds the word count.
1694 if (direction == copy_forwards) {
1695 if (type != T_OBJECT) {
1696 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_f()));
1697 __ blr(rscratch1);
1698 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1699 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_f()));
1700 __ blr(rscratch1);
1701 } else {
1702 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_f()));
1703 __ blr(rscratch1);
1704 }
1705 } else {
1706 if (type != T_OBJECT) {
1707 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_b()));
1708 __ blr(rscratch1);
1709 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1710 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_b()));
1711 __ blr(rscratch1);
1712 } else {
1713 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_b()));
1714 __ blr(rscratch1);
1715 }
1716 }
1717
1718 // And the tail.
// copy_memory_small only looks at the low bits of count, so the original
// element count register can be passed as is.
1719 copy_memory_small(decorators, type, s, d, count, step);
1720
// Bind the small-case labels that were not bound above: for these
// granularities a branch to them means there is nothing left to copy.
1721 if (granularity >= 8) __ bind(copy8);
1722 if (granularity >= 4) __ bind(copy4);
1723 __ bind(finish);
1724 }
1725
1726
// Debug-only helper. When ASSERT is defined it emits code that
// overwrites every call-clobbered general purpose register (other
// than rscratch1, which carries the pattern) with 0xdeadbeefdeadbeef.
// Without ASSERT nothing is emitted.
void clobber_registers() {
#ifdef ASSERT
  // Build the 64-bit pattern: load the low word, then OR in a copy of
  // it shifted left by 32 bits.
  __ mov(rscratch1, (uint64_t)0xdeadbeef);
  __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);

  // rscratch1 holds the pattern, so it is removed from the set.
  RegSet victims = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
  RegSetIterator<Register> cursor = victims.begin();
  while (*cursor != noreg) {
    __ mov(*cursor, rscratch1);
    ++cursor;
  }
#endif
}
1739
1740 // Scan over array at a for count oops, verifying each one.
1741 // Preserves a and count, clobbers rscratch1 and rscratch2.
1742 void verify_oop_array (int size, Register a, Register count, Register temp) {
1743 Label loop, end;
1744 __ mov(rscratch1, a);
1745 __ mov(rscratch2, zr);
1746 __ bind(loop);
1747 __ cmp(rscratch2, count);
1748 __ br(Assembler::HS, end);
1749 if (size == wordSize) {
1750 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1751 __ verify_oop(temp);
1752 } else {
1753 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1754 __ decode_heap_oop(temp); // calls verify_oop
1755 }
1756 __ add(rscratch2, rscratch2, 1);
1757 __ b(loop);
1758 __ bind(end);
1759 }
1760
1761 // Arguments:
1762 // stub_id - is used to name the stub and identify all details of
1763 // how to perform the copy.
1764 //
1765 // nopush_entry - is assigned to the stub's post push entry point
1766 // unless it is null
1767 //
1768 // Inputs:
1769 // c_rarg0 - source array address
1770 // c_rarg1 - destination array address
1771 // c_rarg2 - element count, treated as ssize_t, can be zero
1772 //
1773 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1774 // the hardware handle it. The two dwords within qwords that span
1775 // cache line boundaries will still be loaded and stored atomically.
1776 //
1777 // Side Effects: nopush_entry is set to the (post push) entry point
1778 // so it can be used by the corresponding conjoint
1779 // copy method
1780 //
1781 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
1782 int size;
1783 bool aligned;
1784 bool is_oop;
1785 bool dest_uninitialized;
1786 switch (stub_id) {
1787 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1788 size = sizeof(jbyte);
1789 aligned = false;
1790 is_oop = false;
1791 dest_uninitialized = false;
1792 break;
1793 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1794 size = sizeof(jbyte);
1795 aligned = true;
1796 is_oop = false;
1797 dest_uninitialized = false;
1798 break;
1799 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1800 size = sizeof(jshort);
1801 aligned = false;
1802 is_oop = false;
1803 dest_uninitialized = false;
1804 break;
1805 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1806 size = sizeof(jshort);
1807 aligned = true;
1808 is_oop = false;
1809 dest_uninitialized = false;
1810 break;
1811 case StubId::stubgen_jint_disjoint_arraycopy_id:
1812 size = sizeof(jint);
1813 aligned = false;
1814 is_oop = false;
1815 dest_uninitialized = false;
1816 break;
1817 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1818 size = sizeof(jint);
1819 aligned = true;
1820 is_oop = false;
1821 dest_uninitialized = false;
1822 break;
1823 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1824 // since this is always aligned we can (should!) use the same
1825 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
1826 ShouldNotReachHere();
1827 break;
1828 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1829 size = sizeof(jlong);
1830 aligned = true;
1831 is_oop = false;
1832 dest_uninitialized = false;
1833 break;
1834 case StubId::stubgen_oop_disjoint_arraycopy_id:
1835 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1836 aligned = !UseCompressedOops;
1837 is_oop = true;
1838 dest_uninitialized = false;
1839 break;
1840 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1841 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1842 aligned = !UseCompressedOops;
1843 is_oop = true;
1844 dest_uninitialized = false;
1845 break;
1846 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1847 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1848 aligned = !UseCompressedOops;
1849 is_oop = true;
1850 dest_uninitialized = true;
1851 break;
1852 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1853 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1854 aligned = !UseCompressedOops;
1855 is_oop = true;
1856 dest_uninitialized = true;
1857 break;
1858 default:
1859 ShouldNotReachHere();
1860 break;
1861 }
1862 // all stubs provide a 2nd entry which omits the frame push for
1863 // use when bailing out from a conjoint copy. However we may also
1864 // need some extra addressses for memory access protection.
1865 int entry_count = StubInfo::entry_count(stub_id);
1866 assert(entry_count == 2, "sanity check");
1867 assert(nopush_entry != nullptr, "all disjoint copy stubs export a nopush entry");
1868
1869 bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
1870 int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
1871 GrowableArray<address> entries;
1872 GrowableArray<address> extras;
1873 GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
1874 address start = load_archive_data(stub_id, &entries, extras_ptr);
1875 if (start != nullptr) {
1876 assert(entries.length() == entry_count - 1,
1877 "unexpected entries count %d", entries.length());
1878 *nopush_entry = entries.at(0);
1879 assert(extras.length() == extra_count,
1880 "unexpected extra count %d", extras.length());
1881 if (add_extras) {
1882 // register one handler at offset 0
1883 register_unsafe_access_handlers(extras, 0, 1);
1884 }
1885 return start;
1886 }
1887
1888 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1889 RegSet saved_reg = RegSet::of(s, d, count);
1890
1891 __ align(CodeEntryAlignment);
1892 StubCodeMark mark(this, stub_id);
1893 start = __ pc();
1894 __ enter();
1895
1896 *nopush_entry = __ pc();
1897 entries.append(*nopush_entry);
1898
1899 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1900 BLOCK_COMMENT("Post-Push Entry:");
1901
1902 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1903 if (dest_uninitialized) {
1904 decorators |= IS_DEST_UNINITIALIZED;
1905 }
1906 if (aligned) {
1907 decorators |= ARRAYCOPY_ALIGNED;
1908 }
1909
1910 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1911 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1912
1913 if (is_oop) {
1914 // save regs before copy_memory
1915 __ push(RegSet::of(d, count), sp);
1916 }
1917 {
1918 // UnsafeMemoryAccess page error: continue after unsafe access
1919 UnsafeMemoryAccessMark umam(this, add_extras, true);
1920 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1921 }
1922
1923 if (is_oop) {
1924 __ pop(RegSet::of(d, count), sp);
1925 if (VerifyOops)
1926 verify_oop_array(size, d, count, r16);
1927 }
1928
1929 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1930
1931 __ leave();
1932 __ mov(r0, zr); // return 0
1933 __ ret(lr);
1934
1935 address end = __ pc();
1936
1937 if (add_extras) {
1938 // retrieve the registered handler addresses
1939 retrieve_unsafe_access_handlers(start, end, extras);
1940 assert(extras.length() == extra_count
1941 , "incorrect handlers count %d", extras.length());
1942 }
1943
1944 // record the stub entry and end plus the no_push entry and any
1945 // extra handler addresses
1946 store_archive_data(stub_id, start, end, &entries, extras_ptr);
1947
1948 return start;
1949 }
1950
1951 // Arguments:
1952 // stub_id - is used to name the stub and identify all details of
1953 // how to perform the copy.
1954 //
1955 // nooverlap_target - identifes the (post push) entry for the
1956 // corresponding disjoint copy routine which can be
1957 // jumped to if the ranges do not actually overlap
1958 //
1959 // nopush_entry - is assigned to the stub's post push entry point
1960 // unless it is null
1961 //
1962 //
1963 // Inputs:
1964 // c_rarg0 - source array address
1965 // c_rarg1 - destination array address
1966 // c_rarg2 - element count, treated as ssize_t, can be zero
1967 //
1968 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1969 // the hardware handle it. The two dwords within qwords that span
1970 // cache line boundaries will still be loaded and stored atomically.
1971 //
1972 // Side Effects:
1973 // nopush_entry is set to the no-overlap entry point so it can be
1974 // used by some other conjoint copy method
1975 //
1976 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1977 int size;
1978 bool aligned;
1979 bool is_oop;
1980 bool dest_uninitialized;
1981 switch (stub_id) {
1982 case StubId::stubgen_jbyte_arraycopy_id:
1983 size = sizeof(jbyte);
1984 aligned = false;
1985 is_oop = false;
1986 dest_uninitialized = false;
1987 break;
1988 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1989 size = sizeof(jbyte);
1990 aligned = true;
1991 is_oop = false;
1992 dest_uninitialized = false;
1993 break;
1994 case StubId::stubgen_jshort_arraycopy_id:
1995 size = sizeof(jshort);
1996 aligned = false;
1997 is_oop = false;
1998 dest_uninitialized = false;
1999 break;
2000 case StubId::stubgen_arrayof_jshort_arraycopy_id:
2001 size = sizeof(jshort);
2002 aligned = true;
2003 is_oop = false;
2004 dest_uninitialized = false;
2005 break;
2006 case StubId::stubgen_jint_arraycopy_id:
2007 size = sizeof(jint);
2008 aligned = false;
2009 is_oop = false;
2010 dest_uninitialized = false;
2011 break;
2012 case StubId::stubgen_arrayof_jint_arraycopy_id:
2013 size = sizeof(jint);
2014 aligned = true;
2015 is_oop = false;
2016 dest_uninitialized = false;
2017 break;
2018 case StubId::stubgen_jlong_arraycopy_id:
2019 // since this is always aligned we can (should!) use the same
2020 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
2021 ShouldNotReachHere();
2022 break;
2023 case StubId::stubgen_arrayof_jlong_arraycopy_id:
2024 size = sizeof(jlong);
2025 aligned = true;
2026 is_oop = false;
2027 dest_uninitialized = false;
2028 break;
2029 case StubId::stubgen_oop_arraycopy_id:
2030 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2031 aligned = !UseCompressedOops;
2032 is_oop = true;
2033 dest_uninitialized = false;
2034 break;
2035 case StubId::stubgen_arrayof_oop_arraycopy_id:
2036 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2037 aligned = !UseCompressedOops;
2038 is_oop = true;
2039 dest_uninitialized = false;
2040 break;
2041 case StubId::stubgen_oop_arraycopy_uninit_id:
2042 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2043 aligned = !UseCompressedOops;
2044 is_oop = true;
2045 dest_uninitialized = true;
2046 break;
2047 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
2048 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2049 aligned = !UseCompressedOops;
2050 is_oop = true;
2051 dest_uninitialized = true;
2052 break;
2053 default:
2054 ShouldNotReachHere();
2055 }
2056 // only some conjoint stubs generate a 2nd entry
2057 int entry_count = StubInfo::entry_count(stub_id);
2058 int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
2059 assert(entry_count == expected_entry_count,
2060 "expected entry count %d does not match declared entry count %d for stub %s",
2061 expected_entry_count, entry_count, StubInfo::name(stub_id));
2062
2063 // We need to protect memory accesses in certain cases
2064 bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
2065 int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
2066 GrowableArray<address> entries;
2067 GrowableArray<address> extras;
2068 GrowableArray<address> *entries_ptr = (nopush_entry != nullptr ? &entries : nullptr);
2069 GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
2070 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2071 if (start != nullptr) {
2072 assert(entries.length() == expected_entry_count - 1,
2073 "unexpected entries count %d", entries.length());
2074 assert(extras.length() == extra_count,
2075 "unexpected extra count %d", extras.length());
2076 if (nopush_entry != nullptr) {
2077 *nopush_entry = entries.at(0);
2078 }
2079 if (add_extras) {
2080 // register one handler at offset 0
2081 register_unsafe_access_handlers(extras, 0, 1);
2082 }
2083 return start;
2084 }
2085
2086 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2087 RegSet saved_regs = RegSet::of(s, d, count);
2088 StubCodeMark mark(this, stub_id);
2089 start = __ pc();
2090 __ enter();
2091
2092 if (nopush_entry != nullptr) {
2093 *nopush_entry = __ pc();
2094 entries.append(*nopush_entry);
2095 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2096 BLOCK_COMMENT("Post-Push Entry:");
2097 }
2098
2099 // use fwd copy when (d-s) above_equal (count*size)
2100 Label L_overlapping;
2101 __ sub(rscratch1, d, s);
2102 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
2103 __ br(Assembler::LO, L_overlapping);
2104 __ b(RuntimeAddress(nooverlap_target));
2105 __ bind(L_overlapping);
2106
2107 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2108 if (dest_uninitialized) {
2109 decorators |= IS_DEST_UNINITIALIZED;
2110 }
2111 if (aligned) {
2112 decorators |= ARRAYCOPY_ALIGNED;
2113 }
2114
2115 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2116 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
2117
2118 if (is_oop) {
2119 // save regs before copy_memory
2120 __ push(RegSet::of(d, count), sp);
2121 }
2122 {
2123 // UnsafeMemoryAccess page error: continue after unsafe access
2124 UnsafeMemoryAccessMark umam(this, add_extras, true);
2125 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
2126 }
2127 if (is_oop) {
2128 __ pop(RegSet::of(d, count), sp);
2129 if (VerifyOops)
2130 verify_oop_array(size, d, count, r16);
2131 }
2132 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
2133 __ leave();
2134 __ mov(r0, zr); // return 0
2135 __ ret(lr);
2136
2137 assert(entries.length() == expected_entry_count - 1,
2138 "unexpected entries count %d", entries.length());
2139
2140 address end = __ pc();
2141
2142 if (add_extras) {
2143 // retrieve the registered handler addresses
2144 retrieve_unsafe_access_handlers(start, end, extras);
2145 assert(extras.length() == extra_count,
2146 "incorrect handlers count %d", extras.length());
2147 }
2148
2149 // record the stub entry and end plus any no_push entry and/or
2150 // extra handler addresses
2151 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2152
2153 return start;
2154 }
2155
2156 // Helper for generating a dynamic type check.
2157 // Smashes rscratch1, rscratch2.
2158 void generate_type_check(Register sub_klass,
2159 Register super_check_offset,
2160 Register super_klass,
2161 Register temp1,
2162 Register temp2,
2163 Register result,
2164 Label& L_success) {
2165 assert_different_registers(sub_klass, super_check_offset, super_klass);
2166
2167 BLOCK_COMMENT("type_check:");
2168
2169 Label L_miss;
2170
2171 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
2172 super_check_offset);
2173 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
2174
2175 // Fall through on failure!
2176 __ BIND(L_miss);
2177 }
2178
2179 //
2180 // Generate checkcasting array copy stub
2181 //
2182 // Input:
2183 // c_rarg0 - source array address
2184 // c_rarg1 - destination array address
2185 // c_rarg2 - element count, treated as ssize_t, can be zero
2186 // c_rarg3 - size_t ckoff (super_check_offset)
2187 // c_rarg4 - oop ckval (super_klass)
2188 //
2189 // Output:
2190 // r0 == 0 - success
2191 // r0 == -1^K - failure, where K is partial transfer count
2192 //
2193 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
2194 bool dest_uninitialized;
2195 switch (stub_id) {
2196 case StubId::stubgen_checkcast_arraycopy_id:
2197 dest_uninitialized = false;
2198 break;
2199 case StubId::stubgen_checkcast_arraycopy_uninit_id:
2200 dest_uninitialized = true;
2201 break;
2202 default:
2203 ShouldNotReachHere();
2204 }
2205
2206 // The normal stub provides a 2nd entry which omits the frame push
2207 // for use when bailing out from a disjoint copy.
2208 // Only some conjoint stubs generate a 2nd entry
2209 int entry_count = StubInfo::entry_count(stub_id);
2210 int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
2211 GrowableArray<address> entries;
2212 GrowableArray<address> *entries_ptr = (expected_entry_count == 1 ? nullptr : &entries);
2213 assert(entry_count == expected_entry_count,
2214 "expected entry count %d does not match declared entry count %d for stub %s",
2215 expected_entry_count, entry_count, StubInfo::name(stub_id));
2216 address start = load_archive_data(stub_id, entries_ptr);
2217 if (start != nullptr) {
2218 assert(entries.length() + 1 == expected_entry_count,
2219 "expected entry count %d does not match return entry count %d for stub %s",
2220 expected_entry_count, entries.length() + 1, StubInfo::name(stub_id));
2221 if (nopush_entry != nullptr) {
2222 *nopush_entry = entries.at(0);
2223 }
2224 return start;
2225 }
2226
2227 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
2228
2229 // Input registers (after setup_arg_regs)
2230 const Register from = c_rarg0; // source array address
2231 const Register to = c_rarg1; // destination array address
2232 const Register count = c_rarg2; // elementscount
2233 const Register ckoff = c_rarg3; // super_check_offset
2234 const Register ckval = c_rarg4; // super_klass
2235
2236 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
2237
2238 // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
2239 const Register copied_oop = r22; // actual oop copied
2240 const Register count_save = r21; // orig elementscount
2241 const Register start_to = r20; // destination array start address
2242 const Register r19_klass = r19; // oop._klass
2243
2244 // Registers used as gc temps (r5, r6, r7 are save-on-call)
2245 const Register gct1 = r5, gct2 = r6, gct3 = r7;
2246
2247 //---------------------------------------------------------------
2248 // Assembler stub will be used for this call to arraycopy
2249 // if the two arrays are subtypes of Object[] but the
2250 // destination array type is not equal to or a supertype
2251 // of the source type. Each element must be separately
2252 // checked.
2253
2254 assert_different_registers(from, to, count, ckoff, ckval, start_to,
2255 copied_oop, r19_klass, count_save);
2256
2257 __ align(CodeEntryAlignment);
2258 StubCodeMark mark(this, stub_id);
2259 start = __ pc();
2260
2261 __ enter(); // required for proper stackwalking of RuntimeStub frame
2262
2263 #ifdef ASSERT
2264 // caller guarantees that the arrays really are different
2265 // otherwise, we would have to make conjoint checks
2266 { Label L;
2267 __ b(L); // conjoint check not yet implemented
2268 __ stop("checkcast_copy within a single array");
2269 __ bind(L);
2270 }
2271 #endif //ASSERT
2272
2273 // Caller of this entry point must set up the argument registers.
2274 if (nopush_entry != nullptr) {
2275 *nopush_entry = __ pc();
2276 entries.append(*nopush_entry);
2277 BLOCK_COMMENT("Entry:");
2278 }
2279
2280 // Empty array: Nothing to do.
2281 __ cbz(count, L_done);
2282 __ push(RegSet::of(r19, r20, r21, r22), sp);
2283
2284 #ifdef ASSERT
2285 BLOCK_COMMENT("assert consistent ckoff/ckval");
2286 // The ckoff and ckval must be mutually consistent,
2287 // even though caller generates both.
2288 { Label L;
2289 int sco_offset = in_bytes(Klass::super_check_offset_offset());
2290 __ ldrw(start_to, Address(ckval, sco_offset));
2291 __ cmpw(ckoff, start_to);
2292 __ br(Assembler::EQ, L);
2293 __ stop("super_check_offset inconsistent");
2294 __ bind(L);
2295 }
2296 #endif //ASSERT
2297
2298 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2299 bool is_oop = true;
2300 int element_size = UseCompressedOops ? 4 : 8;
2301 if (dest_uninitialized) {
2302 decorators |= IS_DEST_UNINITIALIZED;
2303 }
2304
2305 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2306 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
2307
2308 // save the original count
2309 __ mov(count_save, count);
2310
2311 // Copy from low to high addresses
2312 __ mov(start_to, to); // Save destination array start address
2313 __ b(L_load_element);
2314
2315 // ======== begin loop ========
2316 // (Loop is rotated; its entry is L_load_element.)
2317 // Loop control:
2318 // for (; count != 0; count--) {
2319 // copied_oop = load_heap_oop(from++);
2320 // ... generate_type_check ...;
2321 // store_heap_oop(to++, copied_oop);
2322 // }
2323 __ align(OptoLoopAlignment);
2324
2325 __ BIND(L_store_element);
2326 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
2327 __ post(to, element_size), copied_oop, noreg,
2328 gct1, gct2, gct3);
2329 __ sub(count, count, 1);
2330 __ cbz(count, L_do_card_marks);
2331
2332 // ======== loop entry is here ========
2333 __ BIND(L_load_element);
2334 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
2335 copied_oop, noreg, __ post(from, element_size),
2336 gct1);
2337 __ cbz(copied_oop, L_store_element);
2338
2339 __ load_klass(r19_klass, copied_oop);// query the object klass
2340
2341 BLOCK_COMMENT("type_check:");
2342 generate_type_check(/*sub_klass*/r19_klass,
2343 /*super_check_offset*/ckoff,
2344 /*super_klass*/ckval,
2345 /*r_array_base*/gct1,
2346 /*temp2*/gct2,
2347 /*result*/r10, L_store_element);
2348
2349 // Fall through on failure!
2350
2351 // ======== end loop ========
2352
2353 // It was a real error; we must depend on the caller to finish the job.
2354 // Register count = remaining oops, count_orig = total oops.
2355 // Emit GC store barriers for the oops we have copied and report
2356 // their number to the caller.
2357
2358 __ subs(count, count_save, count); // K = partially copied oop count
2359 __ eon(count, count, zr); // report (-1^K) to caller
2360 __ br(Assembler::EQ, L_done_pop);
2361
2362 __ BIND(L_do_card_marks);
2363 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
2364
2365 __ bind(L_done_pop);
2366 __ pop(RegSet::of(r19, r20, r21, r22), sp);
2367 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2368
2369 __ bind(L_done);
2370 __ mov(r0, count);
2371 __ leave();
2372 __ ret(lr);
2373
2374 // record the stub entry and end plus any no_push entry
2375 store_archive_data(stub_id, start, __ pc() , entries_ptr);
2376 return start;
2377 }
2378
2379 // Perform range checks on the proposed arraycopy.
2380 // Kills temp, but nothing else.
2381 // Also, clean the sign bits of src_pos and dst_pos.
2382 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2383 Register src_pos, // source position (c_rarg1)
2384 Register dst, // destination array oo (c_rarg2)
2385 Register dst_pos, // destination position (c_rarg3)
2386 Register length,
2387 Register temp,
2388 Label& L_failed) {
2389 BLOCK_COMMENT("arraycopy_range_checks:");
2390
2391 assert_different_registers(rscratch1, temp);
2392
2393 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2394 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2395 __ addw(temp, length, src_pos);
2396 __ cmpw(temp, rscratch1);
2397 __ br(Assembler::HI, L_failed);
2398
2399 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2400 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2401 __ addw(temp, length, dst_pos);
2402 __ cmpw(temp, rscratch1);
2403 __ br(Assembler::HI, L_failed);
2404
2405 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2406 __ movw(src_pos, src_pos);
2407 __ movw(dst_pos, dst_pos);
2408
2409 BLOCK_COMMENT("arraycopy_range_checks done");
2410 }
2411
2412 // These stubs get called from some dumb test routine.
2413 // I'll write them properly when they're called from
2414 // something that's actually doing something.
2415 static void fake_arraycopy_stub(address src, address dst, int count) {
2416 assert(count == 0, "huh?");
2417 }
2418
2419
2420 //
2421 // Generate 'unsafe' array copy stub
2422 // Though just as safe as the other stubs, it takes an unscaled
2423 // size_t argument instead of an element count.
2424 //
2425 // Input:
2426 // c_rarg0 - source array address
2427 // c_rarg1 - destination array address
2428 // c_rarg2 - byte count, treated as ssize_t, can be zero
2429 //
2430 // Examines the alignment of the operands and dispatches
2431 // to a long, int, short, or byte copy loop.
2432 //
2433 address generate_unsafe_copy(address byte_copy_entry,
2434 address short_copy_entry,
2435 address int_copy_entry,
2436 address long_copy_entry) {
2437 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2438 int entry_count = StubInfo::entry_count(stub_id);
2439 assert(entry_count == 1, "sanity check");
2440 address start = load_archive_data(stub_id);
2441 if (start != nullptr) {
2442 return start;
2443 }
2444 Label L_long_aligned, L_int_aligned, L_short_aligned;
2445 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2446
2447 __ align(CodeEntryAlignment);
2448 StubCodeMark mark(this, stub_id);
2449 start = __ pc();
2450 __ enter(); // required for proper stackwalking of RuntimeStub frame
2451
2452 // bump this on entry, not on exit:
2453 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2454
2455 __ orr(rscratch1, s, d);
2456 __ orr(rscratch1, rscratch1, count);
2457
2458 __ andr(rscratch1, rscratch1, BytesPerLong-1);
2459 __ cbz(rscratch1, L_long_aligned);
2460 __ andr(rscratch1, rscratch1, BytesPerInt-1);
2461 __ cbz(rscratch1, L_int_aligned);
2462 __ tbz(rscratch1, 0, L_short_aligned);
2463 __ b(RuntimeAddress(byte_copy_entry));
2464
2465 __ BIND(L_short_aligned);
2466 __ lsr(count, count, LogBytesPerShort); // size => short_count
2467 __ b(RuntimeAddress(short_copy_entry));
2468 __ BIND(L_int_aligned);
2469 __ lsr(count, count, LogBytesPerInt); // size => int_count
2470 __ b(RuntimeAddress(int_copy_entry));
2471 __ BIND(L_long_aligned);
2472 __ lsr(count, count, LogBytesPerLong); // size => long_count
2473 __ b(RuntimeAddress(long_copy_entry));
2474
2475 // record the stub entry and end
2476 store_archive_data(stub_id, start, __ pc());
2477
2478 return start;
2479 }
2480
2481 //
2482 // Generate generic array copy stubs
2483 //
2484 // Input:
2485 // c_rarg0 - src oop
2486 // c_rarg1 - src_pos (32-bits)
2487 // c_rarg2 - dst oop
2488 // c_rarg3 - dst_pos (32-bits)
2489 // c_rarg4 - element count (32-bits)
2490 //
2491 // Output:
2492 // r0 == 0 - success
2493 // r0 == -1^K - failure, where K is partial transfer count
2494 //
2495 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2496 address int_copy_entry, address oop_copy_entry,
2497 address long_copy_entry, address checkcast_copy_entry) {
2498 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2499 int entry_count = StubInfo::entry_count(stub_id);
2500 assert(entry_count == 1, "sanity check");
2501 address start = load_archive_data(stub_id);
2502 if (start != nullptr) {
2503 return start;
2504 }
2505 Label L_failed, L_objArray;
2506 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2507
2508 // Input registers
2509 const Register src = c_rarg0; // source array oop
2510 const Register src_pos = c_rarg1; // source position
2511 const Register dst = c_rarg2; // destination array oop
2512 const Register dst_pos = c_rarg3; // destination position
2513 const Register length = c_rarg4;
2514
2515
2516 // Registers used as temps
2517 const Register dst_klass = c_rarg5;
2518
2519 __ align(CodeEntryAlignment);
2520
2521 StubCodeMark mark(this, stub_id);
2522
2523 start = __ pc();
2524
2525 __ enter(); // required for proper stackwalking of RuntimeStub frame
2526
2527 // bump this on entry, not on exit:
2528 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2529
2530 //-----------------------------------------------------------------------
2531 // Assembler stub will be used for this call to arraycopy
2532 // if the following conditions are met:
2533 //
2534 // (1) src and dst must not be null.
2535 // (2) src_pos must not be negative.
2536 // (3) dst_pos must not be negative.
2537 // (4) length must not be negative.
2538 // (5) src klass and dst klass should be the same and not null.
2539 // (6) src and dst should be arrays.
2540 // (7) src_pos + length must not exceed length of src.
2541 // (8) dst_pos + length must not exceed length of dst.
2542 //
2543
2544 // if (src == nullptr) return -1;
2545 __ cbz(src, L_failed);
2546
2547 // if (src_pos < 0) return -1;
2548 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2549
2550 // if (dst == nullptr) return -1;
2551 __ cbz(dst, L_failed);
2552
2553 // if (dst_pos < 0) return -1;
2554 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2555
2556 // registers used as temp
2557 const Register scratch_length = r16; // elements count to copy
2558 const Register scratch_src_klass = r17; // array klass
2559 const Register lh = r15; // layout helper
2560
2561 // if (length < 0) return -1;
2562 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2563 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2564
2565 __ load_klass(scratch_src_klass, src);
2566 #ifdef ASSERT
2567 // assert(src->klass() != nullptr);
2568 {
2569 BLOCK_COMMENT("assert klasses not null {");
2570 Label L1, L2;
2571 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2572 __ bind(L1);
2573 __ stop("broken null klass");
2574 __ bind(L2);
2575 __ load_klass(rscratch1, dst);
2576 __ cbz(rscratch1, L1); // this would be broken also
2577 BLOCK_COMMENT("} assert klasses not null done");
2578 }
2579 #endif
2580
2581 // Load layout helper (32-bits)
2582 //
2583 // |array_tag| | header_size | element_type | |log2_element_size|
2584 // 32 30 24 16 8 2 0
2585 //
2586 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2587 //
2588
2589 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2590
2591 // Handle objArrays completely differently...
2592 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2593 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2594 __ movw(rscratch1, objArray_lh);
2595 __ eorw(rscratch2, lh, rscratch1);
2596 __ cbzw(rscratch2, L_objArray);
2597
2598 // if (src->klass() != dst->klass()) return -1;
2599 __ load_klass(rscratch2, dst);
2600 __ eor(rscratch2, rscratch2, scratch_src_klass);
2601 __ cbnz(rscratch2, L_failed);
2602
2603 // if (!src->is_Array()) return -1;
2604 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2605
2606 // At this point, it is known to be a typeArray (array_tag 0x3).
2607 #ifdef ASSERT
2608 {
2609 BLOCK_COMMENT("assert primitive array {");
2610 Label L;
2611 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2612 __ cmpw(lh, rscratch2);
2613 __ br(Assembler::GE, L);
2614 __ stop("must be a primitive array");
2615 __ bind(L);
2616 BLOCK_COMMENT("} assert primitive array done");
2617 }
2618 #endif
2619
2620 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2621 rscratch2, L_failed);
2622
2623 // TypeArrayKlass
2624 //
2625 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2626 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2627 //
2628
2629 const Register rscratch1_offset = rscratch1; // array offset
2630 const Register r15_elsize = lh; // element size
2631
2632 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2633 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2634 __ add(src, src, rscratch1_offset); // src array offset
2635 __ add(dst, dst, rscratch1_offset); // dst array offset
2636 BLOCK_COMMENT("choose copy loop based on element size");
2637
2638 // next registers should be set before the jump to corresponding stub
2639 const Register from = c_rarg0; // source array address
2640 const Register to = c_rarg1; // destination array address
2641 const Register count = c_rarg2; // elements count
2642
2643 // 'from', 'to', 'count' registers should be set in such order
2644 // since they are the same as 'src', 'src_pos', 'dst'.
2645
2646 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2647
2648 // The possible values of elsize are 0-3, i.e. exact_log2(element
2649 // size in bytes). We do a simple bitwise binary search.
2650 __ BIND(L_copy_bytes);
2651 __ tbnz(r15_elsize, 1, L_copy_ints);
2652 __ tbnz(r15_elsize, 0, L_copy_shorts);
2653 __ lea(from, Address(src, src_pos));// src_addr
2654 __ lea(to, Address(dst, dst_pos));// dst_addr
2655 __ movw(count, scratch_length); // length
2656 __ b(RuntimeAddress(byte_copy_entry));
2657
2658 __ BIND(L_copy_shorts);
2659 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2660 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2661 __ movw(count, scratch_length); // length
2662 __ b(RuntimeAddress(short_copy_entry));
2663
2664 __ BIND(L_copy_ints);
2665 __ tbnz(r15_elsize, 0, L_copy_longs);
2666 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2667 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2668 __ movw(count, scratch_length); // length
2669 __ b(RuntimeAddress(int_copy_entry));
2670
2671 __ BIND(L_copy_longs);
2672 #ifdef ASSERT
2673 {
2674 BLOCK_COMMENT("assert long copy {");
2675 Label L;
2676 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2677 __ cmpw(r15_elsize, LogBytesPerLong);
2678 __ br(Assembler::EQ, L);
2679 __ stop("must be long copy, but elsize is wrong");
2680 __ bind(L);
2681 BLOCK_COMMENT("} assert long copy done");
2682 }
2683 #endif
2684 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2685 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2686 __ movw(count, scratch_length); // length
2687 __ b(RuntimeAddress(long_copy_entry));
2688
2689 // ObjArrayKlass
2690 __ BIND(L_objArray);
2691 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2692
2693 Label L_plain_copy, L_checkcast_copy;
2694 // test array classes for subtyping
2695 __ load_klass(r15, dst);
2696 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2697 __ br(Assembler::NE, L_checkcast_copy);
2698
2699 // Identically typed arrays can be copied without element-wise checks.
2700 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2701 rscratch2, L_failed);
2702
2703 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2704 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2705 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2706 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2707 __ movw(count, scratch_length); // length
2708 __ BIND(L_plain_copy);
2709 __ b(RuntimeAddress(oop_copy_entry));
2710
2711 __ BIND(L_checkcast_copy);
2712 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2713 {
2714 // Before looking at dst.length, make sure dst is also an objArray.
2715 __ ldrw(rscratch1, Address(r15, lh_offset));
2716 __ movw(rscratch2, objArray_lh);
2717 __ eorw(rscratch1, rscratch1, rscratch2);
2718 __ cbnzw(rscratch1, L_failed);
2719
2720 // It is safe to examine both src.length and dst.length.
2721 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2722 r15, L_failed);
2723
2724 __ load_klass(dst_klass, dst); // reload
2725
2726 // Marshal the base address arguments now, freeing registers.
2727 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2728 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2729 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2730 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2731 __ movw(count, length); // length (reloaded)
2732 Register sco_temp = c_rarg3; // this register is free now
2733 assert_different_registers(from, to, count, sco_temp,
2734 dst_klass, scratch_src_klass);
2735 // assert_clean_int(count, sco_temp);
2736
2737 // Generate the type check.
2738 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2739 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2740
2741 // Smashes rscratch1, rscratch2
2742 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2743 L_plain_copy);
2744
2745 // Fetch destination element klass from the ObjArrayKlass header.
2746 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2747 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2748 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2749
2750 // the checkcast_copy loop needs two extra arguments:
2751 assert(c_rarg3 == sco_temp, "#3 already in place");
2752 // Set up arguments for checkcast_copy_entry.
2753 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2754 __ b(RuntimeAddress(checkcast_copy_entry));
2755 }
2756
2757 __ BIND(L_failed);
2758 __ mov(r0, -1);
2759 __ leave(); // required for proper stackwalking of RuntimeStub frame
2760 __ ret(lr);
2761
2762 // record the stub entry and end
2763 store_archive_data(stub_id, start, __ pc());
2764
2765 return start;
2766 }
2767
2768 //
2769 // Generate stub for array fill. If "aligned" is true, the
2770 // "to" address is assumed to be heapword aligned.
2771 //
2772 // Arguments for generated stub:
2773 // to: c_rarg0
2774 // value: c_rarg1
2775 // count: c_rarg2 treated as signed
2776 //
2777 address generate_fill(StubId stub_id) {
2778 BasicType t;
2779 bool aligned;
2780
2781 switch (stub_id) {
2782 case StubId::stubgen_jbyte_fill_id:
2783 t = T_BYTE;
2784 aligned = false;
2785 break;
2786 case StubId::stubgen_jshort_fill_id:
2787 t = T_SHORT;
2788 aligned = false;
2789 break;
2790 case StubId::stubgen_jint_fill_id:
2791 t = T_INT;
2792 aligned = false;
2793 break;
2794 case StubId::stubgen_arrayof_jbyte_fill_id:
2795 t = T_BYTE;
2796 aligned = true;
2797 break;
2798 case StubId::stubgen_arrayof_jshort_fill_id:
2799 t = T_SHORT;
2800 aligned = true;
2801 break;
2802 case StubId::stubgen_arrayof_jint_fill_id:
2803 t = T_INT;
2804 aligned = true;
2805 break;
2806 default:
2807 ShouldNotReachHere();
2808 };
2809 int entry_count = StubInfo::entry_count(stub_id);
2810 assert(entry_count == 1, "sanity check");
2811 address start = load_archive_data(stub_id);
2812 if (start != nullptr) {
2813 return start;
2814 }
2815 __ align(CodeEntryAlignment);
2816 StubCodeMark mark(this, stub_id);
2817 start = __ pc();
2818
2819 BLOCK_COMMENT("Entry:");
2820
2821 const Register to = c_rarg0; // source array address
2822 const Register value = c_rarg1; // value
2823 const Register count = c_rarg2; // elements count
2824
2825 const Register bz_base = r10; // base for block_zero routine
2826 const Register cnt_words = r11; // temp register
2827
2828 __ enter();
2829
2830 Label L_fill_elements, L_exit1;
2831
2832 int shift = -1;
2833 switch (t) {
2834 case T_BYTE:
2835 shift = 0;
2836 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2837 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2838 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2839 __ br(Assembler::LO, L_fill_elements);
2840 break;
2841 case T_SHORT:
2842 shift = 1;
2843 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2844 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2845 __ br(Assembler::LO, L_fill_elements);
2846 break;
2847 case T_INT:
2848 shift = 2;
2849 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2850 __ br(Assembler::LO, L_fill_elements);
2851 break;
2852 default: ShouldNotReachHere();
2853 }
2854
2855 // Align source address at 8 bytes address boundary.
2856 Label L_skip_align1, L_skip_align2, L_skip_align4;
2857 if (!aligned) {
2858 switch (t) {
2859 case T_BYTE:
2860 // One byte misalignment happens only for byte arrays.
2861 __ tbz(to, 0, L_skip_align1);
2862 __ strb(value, Address(__ post(to, 1)));
2863 __ subw(count, count, 1);
2864 __ bind(L_skip_align1);
2865 // Fallthrough
2866 case T_SHORT:
2867 // Two bytes misalignment happens only for byte and short (char) arrays.
2868 __ tbz(to, 1, L_skip_align2);
2869 __ strh(value, Address(__ post(to, 2)));
2870 __ subw(count, count, 2 >> shift);
2871 __ bind(L_skip_align2);
2872 // Fallthrough
2873 case T_INT:
2874 // Align to 8 bytes, we know we are 4 byte aligned to start.
2875 __ tbz(to, 2, L_skip_align4);
2876 __ strw(value, Address(__ post(to, 4)));
2877 __ subw(count, count, 4 >> shift);
2878 __ bind(L_skip_align4);
2879 break;
2880 default: ShouldNotReachHere();
2881 }
2882 }
2883
2884 //
2885 // Fill large chunks
2886 //
2887 __ lsrw(cnt_words, count, 3 - shift); // number of words
2888 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2889 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2890 if (UseBlockZeroing) {
2891 Label non_block_zeroing, rest;
2892 // If the fill value is zero we can use the fast zero_words().
2893 __ cbnz(value, non_block_zeroing);
2894 __ mov(bz_base, to);
2895 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2896 address tpc = __ zero_words(bz_base, cnt_words);
2897 if (tpc == nullptr) {
2898 fatal("CodeCache is full at generate_fill");
2899 }
2900 __ b(rest);
2901 __ bind(non_block_zeroing);
2902 __ fill_words(to, cnt_words, value);
2903 __ bind(rest);
2904 } else {
2905 __ fill_words(to, cnt_words, value);
2906 }
2907
2908 // Remaining count is less than 8 bytes. Fill it by a single store.
2909 // Note that the total length is no less than 8 bytes.
2910 if (t == T_BYTE || t == T_SHORT) {
2911 Label L_exit1;
2912 __ cbzw(count, L_exit1);
2913 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2914 __ str(value, Address(to, -8)); // overwrite some elements
2915 __ bind(L_exit1);
2916 __ leave();
2917 __ ret(lr);
2918 }
2919
2920 // Handle copies less than 8 bytes.
2921 Label L_fill_2, L_fill_4, L_exit2;
2922 __ bind(L_fill_elements);
2923 switch (t) {
2924 case T_BYTE:
2925 __ tbz(count, 0, L_fill_2);
2926 __ strb(value, Address(__ post(to, 1)));
2927 __ bind(L_fill_2);
2928 __ tbz(count, 1, L_fill_4);
2929 __ strh(value, Address(__ post(to, 2)));
2930 __ bind(L_fill_4);
2931 __ tbz(count, 2, L_exit2);
2932 __ strw(value, Address(to));
2933 break;
2934 case T_SHORT:
2935 __ tbz(count, 0, L_fill_4);
2936 __ strh(value, Address(__ post(to, 2)));
2937 __ bind(L_fill_4);
2938 __ tbz(count, 1, L_exit2);
2939 __ strw(value, Address(to));
2940 break;
2941 case T_INT:
2942 __ cbzw(count, L_exit2);
2943 __ strw(value, Address(to));
2944 break;
2945 default: ShouldNotReachHere();
2946 }
2947 __ bind(L_exit2);
2948 __ leave();
2949 __ ret(lr);
2950
2951 // record the stub entry and end
2952 store_archive_data(stub_id, start, __ pc());
2953
2954 return start;
2955 }
2956
2957 address generate_unsafecopy_common_error_exit() {
2958 StubId stub_id = StubId::stubgen_unsafecopy_common_id;
2959 int entry_count = StubInfo::entry_count(stub_id);
2960 assert(entry_count == 1, "sanity check");
2961 address start = load_archive_data(stub_id);
2962 if (start != nullptr) {
2963 return start;
2964 }
2965 __ align(CodeEntryAlignment);
2966 StubCodeMark mark(this, stub_id);
2967 start = __ pc();
2968 __ leave();
2969 __ mov(r0, 0);
2970 __ ret(lr);
2971
2972 // record the stub entry and end
2973 store_archive_data(stub_id, start, __ pc());
2974
2975 return start;
2976 }
2977
2978 //
2979 // Generate 'unsafe' set memory stub
2980 // Though just as safe as the other stubs, it takes an unscaled
2981 // size_t (# bytes) argument instead of an element count.
2982 //
2983 // This fill operation is atomicity preserving: as long as the
2984 // address supplied is sufficiently aligned, all writes of up to 64
2985 // bits in size are single-copy atomic.
2986 //
2987 // Input:
2988 // c_rarg0 - destination array address
2989 // c_rarg1 - byte count (size_t)
2990 // c_rarg2 - byte value
2991 //
address generate_unsafe_setmemory() {
  // Emits the stub that stores the byte in c_rarg2 into c_rarg1 bytes
  // starting at c_rarg0, or returns the entry of an archived copy.
  // Returns the entry address of the stub.
  StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // we expect one set of extra unsafememory access handler entries
  GrowableArray<address> extras;
  int extra_count = 1 * UnsafeMemoryAccess::COLUMN_COUNT;
  address start = load_archive_data(stub_id, nullptr, &extras);
  if (start != nullptr) {
    assert(extras.length() == extra_count,
           "unexpected extra entry count %d", extras.length());
    // NOTE(review): presumably re-registers the archived handler
    // addresses for this stub; confirm the meaning of the (0, 1) arguments.
    register_unsafe_access_handlers(extras, 0, 1);
    return start;
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
  Label tail;

  {
    // NOTE(review): the meaning of the two boolean arguments is not
    // visible here — confirm against UnsafeMemoryAccessMark.
    UnsafeMemoryAccessMark umam(this, true, false);

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // replicate the fill byte into all 16 byte lanes of v0
    __ dup(v0, __ T16B, value);

    if (AvoidUnalignedAccesses) {
      // Fewer than 16 bytes: go straight to the tail stores.
      __ cmp(count, (u1)16);
      __ br(__ LO, tail);

      // Store 16 bytes at the original dest, then step dest forward to
      // the next 16-byte boundary (a full 16 when already aligned).
      __ mov(rscratch1, 16);
      __ andr(rscratch2, dest, 15);
      __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
      __ strq(v0, Address(dest));
      __ sub(count, count, rscratch1);
      __ add(dest, dest, rscratch1);
    }

    // Main loop: 64 bytes per iteration while at least 64 bytes remain.
    __ subs(count, count, (u1)64);
    __ br(__ LO, tail);
    {
      Label again;
      __ bind(again);
      __ stpq(v0, v0, Address(dest));
      __ stpq(v0, v0, Address(dest, 32));

      __ subs(count, count, 64);
      __ add(dest, dest, 64);
      __ br(__ HS, again);
    }

    __ bind(tail);
    // The count of bytes is off by 64, but we don't need to correct
    // it because we're only going to use the least-significant few
    // count bits from here on.
    // __ add(count, count, 64);

    // Each block below tests one bit of count and, when it is set,
    // stores that many bytes and advances dest.
    {
      Label dont;
      __ tbz(count, exact_log2(32), dont);
      __ stpq(v0, v0, __ post(dest, 32));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(16), dont);
      __ strq(v0, __ post(dest, 16));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(8), dont);
      __ strd(v0, __ post(dest, 8));
      __ bind(dont);
    }

    // Skip the sub-8-byte stores when the low three bits of count are clear.
    Label finished;
    __ tst(count, 7);
    __ br(__ EQ, finished);

    {
      Label dont;
      __ tbz(count, exact_log2(4), dont);
      __ strs(v0, __ post(dest, 4));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(2), dont);
      // copy the low byte of value into bits 8..15 before the 2-byte store
      __ bfi(value, value, 8, 8);
      __ strh(value, __ post(dest, 2));
      __ bind(dont);
    }
    {
      Label dont;
      __ tbz(count, exact_log2(1), dont);
      __ strb(value, Address(dest));
      __ bind(dont);
    }

    __ bind(finished);
    __ leave();
    __ ret(lr);
    // have to exit the block and destroy the UnsafeMemoryAccessMark
    // in order to retrieve the handler end address
  }

  // install saved handler addresses in extras
  address end = __ pc();
  retrieve_unsafe_access_handlers(start, end, extras);
  assert(extras.length() == extra_count,
         "incorrect handlers count %d", extras.length());
  // record the stub entry and end plus the extras
  store_archive_data(stub_id, start, end, nullptr, &extras);

  return start;
}
3112
3113 address generate_data_cache_writeback() {
3114 const Register line = c_rarg0; // address of line to write back
3115
3116 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
3117 int entry_count = StubInfo::entry_count(stub_id);
3118 assert(entry_count == 1, "sanity check");
3119 address start = load_archive_data(stub_id);
3120 if (start != nullptr) {
3121 return start;
3122 }
3123 __ align(CodeEntryAlignment);
3124 StubCodeMark mark(this, stub_id);
3125
3126 start = __ pc();
3127 __ enter();
3128 __ cache_wb(Address(line, 0));
3129 __ leave();
3130 __ ret(lr);
3131
3132 // record the stub entry and end
3133 store_archive_data(stub_id, start, __ pc());
3134
3135 return start;
3136 }
3137
3138 address generate_data_cache_writeback_sync() {
3139 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
3140 int entry_count = StubInfo::entry_count(stub_id);
3141 assert(entry_count == 1, "sanity check");
3142 address start = load_archive_data(stub_id);
3143 if (start != nullptr) {
3144 return start;
3145 }
3146 const Register is_pre = c_rarg0; // pre or post sync
3147 __ align(CodeEntryAlignment);
3148 StubCodeMark mark(this, stub_id);
3149
3150 // pre wbsync is a no-op
3151 // post wbsync translates to an sfence
3152
3153 Label skip;
3154 start = __ pc();
3155 __ enter();
3156 __ cbnz(is_pre, skip);
3157 __ cache_wbsync(false);
3158 __ bind(skip);
3159 __ leave();
3160 __ ret(lr);
3161
3162 // record the stub entry and end
3163 store_archive_data(stub_id, start, __ pc());
3164
3165 return start;
3166 }
3167
void generate_arraycopy_stubs() {
  // Generates the copy and fill stubs and publishes their entry points
  // in StubRoutines. The order of the calls matters: a stub's entry is
  // published before it is handed to the generator of a stub that
  // branches to it.
  //
  // Some copy stubs publish a normal entry and then a 2nd 'fallback'
  // entry immediately following their stack push. This can be used
  // as a post-push branch target for compatible stubs when they
  // identify a special case that can be handled by the fallback
  // stub, e.g. a disjoint copy stub may be used as a special case
  // fallback for its compatible conjoint copy stub.
  //
  // A no push entry is always returned in the following local and
  // then published by assigning to the appropriate entry field in
  // class StubRoutines. The entry value is then passed to the
  // generator for the compatible stub. That means the entry must be
  // listed when saving to/restoring from the AOT cache, ensuring
  // that the inter-stub jumps are noted at AOT-cache save and
  // relocated at AOT cache load.
  address nopush_entry;

  // generate the common exit first so later stubs can rely on it if
  // they want an UnsafeMemoryAccess exit non-local to the stub
  StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
  // register the stub as the default exit with class UnsafeMemoryAccess
  UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);

  // generate and publish aarch64-specific bulk copy routines first
  // so we can call them from other copy stubs
  StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
  StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);

  StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
  StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);

  StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
  StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);

  StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();

  //*** jbyte
  // Always need aligned and unaligned versions
  StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
  // disjoint arrayof nopush entry is needed by conjoint copy
  StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);

  //*** jshort
  // Always need aligned and unaligned versions
  StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is used by generic/unsafe copy
  StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
  StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
  // disjoint arrayof nopush entry is needed by conjoint copy
  StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);

  //*** jint
  // Aligned versions
  StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
  // disjoint arrayof nopush entry is needed by conjoint copy
  StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
  // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
  // jint_arraycopy_nopush always points to the unaligned version
  StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
  // disjoint nopush entry is needed by conjoint copy
  StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jint_arraycopy_nopush = nopush_entry;

  //*** jlong
  // It is always aligned
  StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
  // disjoint arrayof nopush entry is needed by conjoint copy
  StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
  StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
  // conjoint nopush entry is needed by generic/unsafe copy
  StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
  // disjoint normal/nopush and conjoint normal entries are not
  // generated since the arrayof versions are the same
  StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
  StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
  StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;

  //*** oops
  {
    StubRoutines::_arrayof_oop_disjoint_arraycopy
      = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
    // disjoint arrayof nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
    StubRoutines::_arrayof_oop_arraycopy
      = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
    // conjoint arrayof nopush entry is needed by generic/unsafe copy
    StubRoutines::_oop_arraycopy_nopush = nopush_entry;
    // Aligned versions without pre-barriers
    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
      = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
    // disjoint arrayof+uninit nopush entry is needed by conjoint copy
    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
    // note that we don't need a returned nopush entry because the
    // generic/unsafe copy does not cater for uninit arrays.
    StubRoutines::_arrayof_oop_arraycopy_uninit
      = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
  }

  // for oop copies reuse arrayof entries for non-arrayof cases
  StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
  StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
  StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
  StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
  StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
  StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;

  StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
  // checkcast nopush entry is needed by generic copy
  StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
  // note that we don't need a returned nopush entry because the
  // generic copy does not cater for uninit arrays.
  StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);

  // unsafe arraycopy may fallback on conjoint stubs
  StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                         StubRoutines::_jshort_arraycopy_nopush,
                                                         StubRoutines::_jint_arraycopy_nopush,
                                                         StubRoutines::_jlong_arraycopy_nopush);

  // generic arraycopy may fallback on conjoint stubs
  StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
                                                           StubRoutines::_jshort_arraycopy_nopush,
                                                           StubRoutines::_jint_arraycopy_nopush,
                                                           StubRoutines::_oop_arraycopy_nopush,
                                                           StubRoutines::_jlong_arraycopy_nopush,
                                                           StubRoutines::_checkcast_arraycopy_nopush);

  // fill stubs: unaligned variants first, then the arrayof (aligned) ones
  StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
  StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
  StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
  StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
  StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
  StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
}
3316
3317 void generate_math_stubs() { Unimplemented(); }
3318
3319 // Arguments:
3320 //
3321 // Inputs:
3322 // c_rarg0 - source byte array address
3323 // c_rarg1 - destination byte array address
3324 // c_rarg2 - sessionKe (key) in little endian int array
3325 //
3326 address generate_aescrypt_encryptBlock() {
3327 assert(UseAES, "need AES cryptographic extension support");
3328 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
3329 int entry_count = StubInfo::entry_count(stub_id);
3330 assert(entry_count == 1, "sanity check");
3331 address start = load_archive_data(stub_id);
3332 if (start != nullptr) {
3333 return start;
3334 }
3335 __ align(CodeEntryAlignment);
3336 StubCodeMark mark(this, stub_id);
3337
3338 const Register from = c_rarg0; // source array address
3339 const Register to = c_rarg1; // destination array address
3340 const Register key = c_rarg2; // key array address
3341 const Register keylen = rscratch1;
3342
3343 start = __ pc();
3344 __ enter();
3345
3346 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3347
3348 __ aesenc_loadkeys(key, keylen);
3349 __ aesecb_encrypt(from, to, keylen);
3350
3351 __ mov(r0, 0);
3352
3353 __ leave();
3354 __ ret(lr);
3355
3356 // record the stub entry and end
3357 store_archive_data(stub_id, start, __ pc());
3358
3359 return start;
3360 }
3361
3362 // Arguments:
3363 //
3364 // Inputs:
3365 // c_rarg0 - source byte array address
3366 // c_rarg1 - destination byte array address
3367 // c_rarg2 - sessionKd (key) in little endian int array
3368 //
3369 address generate_aescrypt_decryptBlock() {
3370 assert(UseAES, "need AES cryptographic extension support");
3371 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
3372 int entry_count = StubInfo::entry_count(stub_id);
3373 assert(entry_count == 1, "sanity check");
3374 address start = load_archive_data(stub_id);
3375 if (start != nullptr) {
3376 return start;
3377 }
3378 __ align(CodeEntryAlignment);
3379 StubCodeMark mark(this, stub_id);
3380 Label L_doLast;
3381
3382 const Register from = c_rarg0; // source array address
3383 const Register to = c_rarg1; // destination array address
3384 const Register key = c_rarg2; // key array address
3385 const Register keylen = rscratch1;
3386
3387 start = __ pc();
3388 __ enter(); // required for proper stackwalking of RuntimeStub frame
3389
3390 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3391
3392 __ aesecb_decrypt(from, to, key, keylen);
3393
3394 __ mov(r0, 0);
3395
3396 __ leave();
3397 __ ret(lr);
3398
3399 // record the stub entry and end
3400 store_archive_data(stub_id, start, __ pc());
3401
3402 return start;
3403 }
3404
3405 // Arguments:
3406 //
3407 // Inputs:
3408 // c_rarg0 - source byte array address
3409 // c_rarg1 - destination byte array address
3410 // c_rarg2 - sessionKe (key) in little endian int array
3411 // c_rarg3 - r vector byte array address
3412 // c_rarg4 - input length
3413 //
3414 // Output:
3415 // x0 - input length
3416 //
address generate_cipherBlockChaining_encryptAESCrypt() {
  // Emits the AES/CBC encryption stub and returns its entry address.
  //
  // Register roles in the generated code:
  //   v0        - chaining value: starts as the contents of rvec, is XORed
  //               with each input block, encrypted in place, written to
  //               'to', and carried into the next iteration
  //   v1        - current input block
  //   v17..v31  - round keys (only v21..v31 or v19..v31 are loaded for the
  //               shorter key lengths)
  //   rscratch1 - keylen, rscratch2 - copy of the input length for the result
  assert(UseAES, "need AES cryptographic extension support");
  StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // A non-null result means an entry for this stub already exists
  // (presumably restored from an archive), so no code is generated.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);

  Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

  const Register from        = c_rarg0;  // source array address
  const Register to          = c_rarg1;  // destination array address
  const Register key         = c_rarg2;  // key array address
  const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                         // and left with the results of the last encryption block
  const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
  const Register keylen      = rscratch1;

  start = __ pc();

  __ enter();

  // Keep the original length: len_reg is counted down to zero by the loop
  // and the saved value is what gets returned in r0.
  __ movw(rscratch2, len_reg);

  // 'key' addresses the first int element, so the (negative) difference
  // between the length offset and the T_INT base offset reaches the array
  // length field in the header.
  __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  // Initial chaining value.
  __ ld1(v0, __ T16B, rvec);

  // Select how many round keys to load from the key length in ints:
  // below 52 -> 11 keys (v21..v31), exactly 52 -> 13 keys (v19..v31),
  // above 52 -> 15 keys (v17..v31).
  // NOTE: the flags set here are consumed again inside L_aes_loop.
  __ cmpw(keylen, 52);
  __ br(Assembler::CC, L_loadkeys_44);
  __ br(Assembler::EQ, L_loadkeys_52);

  // Each key load is followed by rev32, which reverses the bytes within
  // every 32-bit word of the round key.
  __ ld1(v17, v18, __ T16B, __ post(key, 32));
  __ rev32(v17, __ T16B, v17);
  __ rev32(v18, __ T16B, v18);
  __ BIND(L_loadkeys_52);
  __ ld1(v19, v20, __ T16B, __ post(key, 32));
  __ rev32(v19, __ T16B, v19);
  __ rev32(v20, __ T16B, v20);
  __ BIND(L_loadkeys_44);
  __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
  __ rev32(v21, __ T16B, v21);
  __ rev32(v22, __ T16B, v22);
  __ rev32(v23, __ T16B, v23);
  __ rev32(v24, __ T16B, v24);
  __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
  __ rev32(v25, __ T16B, v25);
  __ rev32(v26, __ T16B, v26);
  __ rev32(v27, __ T16B, v27);
  __ rev32(v28, __ T16B, v28);
  __ ld1(v29, v30, v31, __ T16B, key);
  __ rev32(v29, __ T16B, v29);
  __ rev32(v30, __ T16B, v30);
  __ rev32(v31, __ T16B, v31);

  __ BIND(L_aes_loop);
  // CBC: XOR the next input block into the chaining value before encrypting.
  __ ld1(v1, __ T16B, __ post(from, 16));
  __ eor(v0, __ T16B, v0, v1);

  // These branches reuse the flags from cmpw(keylen, 52) above. That works
  // because nothing emitted in between (vector loads/stores, rev32, eor,
  // aese/aesmc, and the subw/cbnzw at the loop tail) is a flag-setting
  // instruction; keep it that way when changing the loop.
  __ br(Assembler::CC, L_rounds_44);
  __ br(Assembler::EQ, L_rounds_52);

  // Each aese/aesmc pair applies one round key; the entry label decides how
  // many of the leading pairs are skipped.
  __ aese(v0, v17); __ aesmc(v0, v0);
  __ aese(v0, v18); __ aesmc(v0, v0);
  __ BIND(L_rounds_52);
  __ aese(v0, v19); __ aesmc(v0, v0);
  __ aese(v0, v20); __ aesmc(v0, v0);
  __ BIND(L_rounds_44);
  __ aese(v0, v21); __ aesmc(v0, v0);
  __ aese(v0, v22); __ aesmc(v0, v0);
  __ aese(v0, v23); __ aesmc(v0, v0);
  __ aese(v0, v24); __ aesmc(v0, v0);
  __ aese(v0, v25); __ aesmc(v0, v0);
  __ aese(v0, v26); __ aesmc(v0, v0);
  __ aese(v0, v27); __ aesmc(v0, v0);
  __ aese(v0, v28); __ aesmc(v0, v0);
  __ aese(v0, v29); __ aesmc(v0, v0);
  // Final round: no aesmc, and the last round key is applied with a plain XOR.
  __ aese(v0, v30);
  __ eor(v0, __ T16B, v0, v31);

  // Emit the ciphertext block; v0 stays live as the next chaining value.
  __ st1(v0, __ T16B, __ post(to, 16));

  // The loop exits only when the count reaches exactly zero, so the length
  // is expected to be a non-zero multiple of 16 (see len_reg above).
  __ subw(len_reg, len_reg, 16);
  __ cbnzw(len_reg, L_aes_loop);

  // Hand the last ciphertext block back to the caller through rvec.
  __ st1(v0, __ T16B, rvec);

  // Result: the input length saved on entry.
  __ mov(r0, rscratch2);

  __ leave();
  __ ret(lr);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
3518
3519 // Arguments:
3520 //
3521 // Inputs:
3522 // c_rarg0 - source byte array address
3523 // c_rarg1 - destination byte array address
3524 // c_rarg2 - sessionKd (key) in little endian int array
3525 // c_rarg3 - r vector byte array address
3526 // c_rarg4 - input length
3527 //
3528 // Output:
3529 // r0 - input length
3530 //
address generate_cipherBlockChaining_decryptAESCrypt() {
  // Emits the AES/CBC decryption stub and returns its entry address.
  //
  // Register roles in the generated code:
  //   v0        - block being decrypted
  //   v1        - untouched copy of the current input (ciphertext) block
  //   v2        - previous ciphertext block (initially the contents of rvec),
  //               XORed into the decrypted block
  //   v31       - the first 16 bytes of the key array; applied last, by XOR
  //   v17..v30  - remaining round keys (v21..v30 or v19..v30 for shorter keys)
  //   rscratch1 - keylen, rscratch2 - copy of the input length for the result
  assert(UseAES, "need AES cryptographic extension support");
  StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // A non-null result means an entry for this stub already exists
  // (presumably restored from an archive), so no code is generated.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);

  Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

  const Register from        = c_rarg0;  // source array address
  const Register to          = c_rarg1;  // destination array address
  const Register key         = c_rarg2;  // key array address
  const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                         // and left with the last input (ciphertext) block read
  const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
  const Register keylen      = rscratch1;

  start = __ pc();

  __ enter();

  // Keep the original length: len_reg is counted down to zero by the loop
  // and the saved value is what gets returned in r0.
  __ movw(rscratch2, len_reg);

  // 'key' addresses the first int element, so the (negative) difference
  // between the length offset and the T_INT base offset reaches the array
  // length field in the header.
  __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  // Initial chaining value.
  __ ld1(v2, __ T16B, rvec);

  // The first key block is loaded unconditionally; it is the one XORed in
  // after the last aesd below.
  __ ld1(v31, __ T16B, __ post(key, 16));
  __ rev32(v31, __ T16B, v31);

  // Select how many further round keys to load from the key length in ints:
  // below 52 -> v21..v30, exactly 52 -> v19..v30, above 52 -> v17..v30.
  // NOTE: the flags set here are consumed again inside L_aes_loop.
  __ cmpw(keylen, 52);
  __ br(Assembler::CC, L_loadkeys_44);
  __ br(Assembler::EQ, L_loadkeys_52);

  // Each key load is followed by rev32, which reverses the bytes within
  // every 32-bit word of the round key.
  __ ld1(v17, v18, __ T16B, __ post(key, 32));
  __ rev32(v17, __ T16B, v17);
  __ rev32(v18, __ T16B, v18);
  __ BIND(L_loadkeys_52);
  __ ld1(v19, v20, __ T16B, __ post(key, 32));
  __ rev32(v19, __ T16B, v19);
  __ rev32(v20, __ T16B, v20);
  __ BIND(L_loadkeys_44);
  __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
  __ rev32(v21, __ T16B, v21);
  __ rev32(v22, __ T16B, v22);
  __ rev32(v23, __ T16B, v23);
  __ rev32(v24, __ T16B, v24);
  __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
  __ rev32(v25, __ T16B, v25);
  __ rev32(v26, __ T16B, v26);
  __ rev32(v27, __ T16B, v27);
  __ rev32(v28, __ T16B, v28);
  __ ld1(v29, v30, __ T16B, key);
  __ rev32(v29, __ T16B, v29);
  __ rev32(v30, __ T16B, v30);

  __ BIND(L_aes_loop);
  // Load the next ciphertext block and keep a copy (orr with itself is a
  // vector move): it becomes the chaining value for the following block.
  __ ld1(v0, __ T16B, __ post(from, 16));
  __ orr(v1, __ T16B, v0, v0);

  // These branches reuse the flags from cmpw(keylen, 52) above. That works
  // because nothing emitted in between (vector loads/stores, rev32, orr/eor,
  // aesd/aesimc, and the subw/cbnzw at the loop tail) is a flag-setting
  // instruction; keep it that way when changing the loop.
  __ br(Assembler::CC, L_rounds_44);
  __ br(Assembler::EQ, L_rounds_52);

  // Each aesd/aesimc pair applies one round key; the entry label decides how
  // many of the leading pairs are skipped.
  __ aesd(v0, v17); __ aesimc(v0, v0);
  __ aesd(v0, v18); __ aesimc(v0, v0);
  __ BIND(L_rounds_52);
  __ aesd(v0, v19); __ aesimc(v0, v0);
  __ aesd(v0, v20); __ aesimc(v0, v0);
  __ BIND(L_rounds_44);
  __ aesd(v0, v21); __ aesimc(v0, v0);
  __ aesd(v0, v22); __ aesimc(v0, v0);
  __ aesd(v0, v23); __ aesimc(v0, v0);
  __ aesd(v0, v24); __ aesimc(v0, v0);
  __ aesd(v0, v25); __ aesimc(v0, v0);
  __ aesd(v0, v26); __ aesimc(v0, v0);
  __ aesd(v0, v27); __ aesimc(v0, v0);
  __ aesd(v0, v28); __ aesimc(v0, v0);
  __ aesd(v0, v29); __ aesimc(v0, v0);
  // Final round: no aesimc; apply the last key with a plain XOR, then undo
  // the CBC chaining by XORing with the previous ciphertext block.
  __ aesd(v0, v30);
  __ eor(v0, __ T16B, v0, v31);
  __ eor(v0, __ T16B, v0, v2);

  // Emit the plaintext block, then promote the saved ciphertext copy to be
  // the chaining value for the next iteration.
  __ st1(v0, __ T16B, __ post(to, 16));
  __ orr(v2, __ T16B, v1, v1);

  // The loop exits only when the count reaches exactly zero, so the length
  // is expected to be a non-zero multiple of 16 (see len_reg above).
  __ subw(len_reg, len_reg, 16);
  __ cbnzw(len_reg, L_aes_loop);

  // Hand the last ciphertext block back to the caller through rvec.
  __ st1(v2, __ T16B, rvec);

  // Result: the input length saved on entry.
  __ mov(r0, rscratch2);

  __ leave();
  __ ret(lr);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
3636
3637 // Big-endian 128-bit + 64-bit -> 128-bit addition.
// Inputs: 128-bits. in is preserved unless the same register is also passed as result.
3639 // The least-significant 64-bit word is in the upper dword of each vector.
3640 // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
3641 // Output: result
void be_add_128_64(FloatRegister result, FloatRegister in,
                   FloatRegister inc, FloatRegister tmp) {
  // Emits a 128-bit add of a 64-bit increment with carry propagation, using
  // two-lane 64-bit vector arithmetic.
  //
  //   result - receives in + inc
  //   in     - 128-bit addend; not part of the distinctness check below, so
  //            it may be the same register as result
  //   inc    - increment; the lane paired with the most-significant dword
  //            of 'in' must be zero
  //   tmp    - scratch, overwritten
  assert_different_registers(result, tmp, inc);

  __ addv(result, __ T2D, in, inc);      // Add inc to the least-significant dword of
                                         // input
  // Per-lane unsigned compare: a lane of tmp becomes all ones when
  // inc > result, which in the least-significant lane means the add wrapped.
  // The other lane compares 0 > x and is therefore always zero.
  __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
  __ ext(tmp, __ T16B, tmp, tmp, 0x08);  // Swap LSD of comparison result to MSD and
                                         // MSD == 0 (must be!) to LSD
  // All ones is -1, so subtracting it adds the carry to the MSD.
  __ subv(result, __ T2D, result, tmp);  // Subtract -1 from MSD if there was an overflow
}
3653
3654 // CTR AES crypt.
3655 // Arguments:
3656 //
3657 // Inputs:
3658 // c_rarg0 - source byte array address
3659 // c_rarg1 - destination byte array address
3660 // c_rarg2 - sessionKe (key) in little endian int array
3661 // c_rarg3 - counter vector byte array address
3662 // c_rarg4 - input length
3663 // c_rarg5 - saved encryptedCounter start
3664 // c_rarg6 - saved used length
3665 //
3666 // Output:
3667 // r0 - input length
3668 //
address generate_counterMode_AESCrypt() {
  // Emits the AES/CTR stub and returns its entry address. The generated code
  // has three paths that share the labels declared below:
  //   - a byte-at-a-time path (NEXT) that consumes the previously saved
  //     encrypted counter while 'used' is below the block size,
  //   - a block-at-a-time path (inner_loop), and
  //   - a bulk path (CTR_large_block) handling bulk_width blocks per
  //     iteration, placed after the stub's return sequence.
  StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // A non-null result means an entry for this stub already exists
  // (presumably restored from an archive), so no code is generated.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  const Register in = c_rarg0;
  const Register out = c_rarg1;
  const Register key = c_rarg2;
  const Register counter = c_rarg3;
  // saved_len is left untouched so it can be returned; len is the working copy.
  const Register saved_len = c_rarg4, len = r10;
  const Register saved_encrypted_ctr = c_rarg5;
  // used_ptr addresses the 32-bit 'used' value; used is its in-register copy.
  const Register used_ptr = c_rarg6, used = r12;

  // Byte offset into both in and out for the non-bulk paths.
  const Register offset = r7;
  const Register keylen = r11;

  const unsigned char block_size = 16;
  const int bulk_width = 4;
  // NB: bulk_width can be 4 or 8. 8 gives slightly faster
  // performance with larger data sizes, but it also means that the
  // fast path isn't used until you have at least 8 blocks, and up
  // to 127 bytes of data will be executed on the slow path. For
  // that reason, and also so as not to blow away too much icache, 4
  // blocks seems like a sensible compromise.

  // Algorithm:
  //
  //    if (len == 0) {
  //        goto DONE;
  //    }
  //    int result = len;
  //    do {
  //        if (used >= blockSize) {
  //            if (len >= bulk_width * blockSize) {
  //                CTR_large_block();
  //                if (len == 0)
  //                    goto DONE;
  //            }
  //            for (;;) {
  //                16ByteVector v0 = counter;
  //                embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
  //                used = 0;
  //                if (len < blockSize)
  //                    break;    /* goto NEXT */
  //                16ByteVector v1 = load16Bytes(in, offset);
  //                v1 = v1 ^ encryptedCounter;
  //                store16Bytes(out, offset);
  //                used = blockSize;
  //                offset += blockSize;
  //                len -= blockSize;
  //                if (len == 0)
  //                    goto DONE;
  //            }
  //        }
  //      NEXT:
  //        out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
  //        len--;
  //    } while (len != 0);
  //  DONE:
  //    return result;
  //
  // CTR_large_block()
  //    Wide bulk encryption of whole blocks.

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();
  __ enter();

  Label DONE, CTR_large_block, large_block_return;
  // Load 'used' before the empty-input check: DONE stores it back
  // unconditionally.
  __ ldrw(used, Address(used_ptr));
  __ cbzw(saved_len, DONE);

  __ mov(len, saved_len);
  __ mov(offset, 0);

  // Compute #rounds for AES based on the length of the key array
  __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  __ aesenc_loadkeys(key, keylen);

  {
    Label L_CTR_loop, NEXT;

    __ bind(L_CTR_loop);

    // Bytes of the saved encrypted counter are still unconsumed: use them
    // one byte at a time.
    __ cmp(used, block_size);
    __ br(__ LO, NEXT);

    // Maybe we have a lot of data
    // (unsigned len >= bulk_width * block_size; rscratch1 only receives the
    // throw-away difference)
    __ subsw(rscratch1, len, bulk_width * block_size);
    __ br(__ HS, CTR_large_block);
    __ BIND(large_block_return);
    __ cbzw(len, DONE);

    // Setup the counter
    __ movi(v4, __ T4S, 0);
    __ movi(v5, __ T4S, 1);
    __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }

    // 128-bit big-endian increment
    __ ld1(v0, __ T16B, counter);
    __ rev64(v16, __ T16B, v0);
    be_add_128_64(v16, v16, v4, /*tmp*/v5);
    __ rev64(v16, __ T16B, v16);
    __ st1(v16, __ T16B, counter);
    // Previous counter value is in v0
    // v4 contains { 0, 1 }

    {
      // We have fewer than bulk_width blocks of data left. Encrypt
      // them one by one until there is less than a full block
      // remaining, being careful to save both the encrypted counter
      // and the counter.

      Label inner_loop;
      __ bind(inner_loop);
      // Counter to encrypt is in v0
      __ aesecb_encrypt(noreg, noreg, keylen);
      __ st1(v0, __ T16B, saved_encrypted_ctr);

      // Do we have a remaining full block?

      // A fresh encrypted counter was just saved, so none of it is used yet.
      __ mov(used, 0);
      __ cmp(len, block_size);
      __ br(__ LO, NEXT);

      // Yes, we have a full block
      __ ldrq(v1, Address(in, offset));
      __ eor(v1, __ T16B, v1, v0);
      __ strq(v1, Address(out, offset));
      __ mov(used, block_size);
      __ add(offset, offset, block_size);

      __ subw(len, len, block_size);
      __ cbzw(len, DONE);

      // Increment the counter, store it back
      // (v16 holds the already incremented counter: copy it to v0 as the
      // next value to encrypt, then advance v16 once more)
      __ orr(v0, __ T16B, v16, v16);
      __ rev64(v16, __ T16B, v16);
      be_add_128_64(v16, v16, v4, /*tmp*/v5);
      __ rev64(v16, __ T16B, v16);
      __ st1(v16, __ T16B, counter); // Save the incremented counter back

      __ b(inner_loop);
    }

    __ BIND(NEXT);

    // Encrypt a single byte, and loop.
    // We expect this to be a rare event.
    __ ldrb(rscratch1, Address(in, offset));
    __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
    __ eor(rscratch1, rscratch1, rscratch2);
    __ strb(rscratch1, Address(out, offset));
    __ add(offset, offset, 1);
    __ add(used, used, 1);
    __ subw(len, len,1);
    __ cbnzw(len, L_CTR_loop);
  }

  __ bind(DONE);
  // Publish the updated 'used' count and return the original input length.
  __ strw(used, Address(used_ptr));
  __ mov(r0, saved_len);

  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(lr);

  // Bulk encryption
  // (reached only by the branch above; rejoins at large_block_return)

  __ BIND (CTR_large_block);
  assert(bulk_width == 4 || bulk_width == 8, "must be");

  // v8..v11 (and v12..v15 for bulk_width == 8) are overwritten below, so
  // their contents are kept on the stack for the duration of the bulk path.
  if (bulk_width == 8) {
    __ sub(sp, sp, 4 * 16);
    __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
  }
  __ sub(sp, sp, 4 * 16);
  __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
  RegSet saved_regs = (RegSet::of(in, out, offset)
                       + RegSet::of(saved_encrypted_ctr, used_ptr, len));
  __ push(saved_regs, sp);
  // From here until the pop below, len is the number of bytes the bulk loop
  // handles, and in/out are advanced pointers instead of base + offset.
  __ andr(len, len, -16 * bulk_width);  // 8/4 encryptions, 16 bytes per encryption
  __ add(in, in, offset);
  __ add(out, out, offset);

  // Keys should already be loaded into the correct registers

  __ ld1(v0, __ T16B, counter); // v0 contains the first counter
  __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter

  // AES/CTR loop
  {
    Label L_CTR_loop;
    __ BIND(L_CTR_loop);

    // Setup the counters
    // (rebuilt every iteration because v8/v9 are reloaded with input data
    // further down)
    __ movi(v8, __ T4S, 0);
    __ movi(v9, __ T4S, 1);
    __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }

    // Materialize bulk_width consecutive counter values in v0.., advancing
    // the byte-reversed counter in v16 after each one.
    for (int i = 0; i < bulk_width; i++) {
      FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
      __ rev64(v0_ofs, __ T16B, v16);
      be_add_128_64(v16, v16, v8, /*tmp*/v9);
    }

    __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

    // Encrypt the counters
    __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);

    if (bulk_width == 8) {
      __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
    }

    // XOR the encrypted counters with the inputs
    for (int i = 0; i < bulk_width; i++) {
      FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
      FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
      __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
    }

    // Write the encrypted data
    __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
    if (bulk_width == 8) {
      __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
    }

    __ subw(len, len, 16 * bulk_width);
    __ cbnzw(len, L_CTR_loop);
  }

  // Save the counter back where it goes
  __ rev64(v16, __ T16B, v16);
  __ st1(v16, __ T16B, counter);

  // Restores in, out, offset and the pre-bulk value of len (among others).
  __ pop(saved_regs, sp);

  __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
  if (bulk_width == 8) {
    __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
  }

  // rscratch1 = number of bytes the bulk loop processed; account for them
  // in len and offset.
  __ andr(rscratch1, len, -16 * bulk_width);
  __ sub(len, len, rscratch1);
  __ add(offset, offset, rscratch1);
  // used == block size: no bytes of a saved encrypted counter remain.
  __ mov(used, 16);
  __ strw(used, Address(used_ptr));
  __ b(large_block_return);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
3928
3929 // Vector AES Galois Counter Mode implementation. Parameters:
3930 //
3931 // in = c_rarg0
3932 // len = c_rarg1
3933 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3934 // out = c_rarg3
3935 // key = c_rarg4
3936 // state = c_rarg5 - GHASH.state
3937 // subkeyHtbl = c_rarg6 - powers of H
3938 // counter = c_rarg7 - 16 bytes of CTR
3939 // return - number of processed bytes
address generate_galoisCounterMode_AESCrypt() {
  // Emits the AES/GCM stub and returns its entry address. The generated code
  // rounds len down to a multiple of 8 blocks (128 bytes), runs an 8-way
  // unrolled AES/CTR pass from 'in' to 'out', then runs GHASH over the same
  // number of blocks read from 'ct', and returns the rounded byte count.
  Label ghash_polynomial; // local data generated after code
  StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // A non-null result means an entry for this stub already exists
  // (presumably restored from an archive), so no code is generated.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();
  __ enter();

  const Register in = c_rarg0;
  const Register len = c_rarg1;
  const Register ct = c_rarg2;
  const Register out = c_rarg3;

  const Register key = c_rarg4;
  const Register state = c_rarg5;

  const Register subkeyHtbl = c_rarg6;

  // 16-byte counter block; rewritten with the incremented counter once the
  // CTR loop has finished
  const Register counter = c_rarg7;

  const Register keylen = r10;
  // Save state before entering routine
  // (v8..v15 are all overwritten by the CTR loop below)
  __ sub(sp, sp, 4 * 16);
  __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
  __ sub(sp, sp, 4 * 16);
  __ st1(v8, v9, v10, v11, __ T16B, Address(sp));

  // __ andr(len, len, -512);
  __ andr(len, len, -16 * 8);  // 8 encryptions, 16 bytes per encryption
  // Keep the rounded length on the stack: it is reloaded for the GHASH block
  // count and popped into r0 as the result. The slot is 2 words wide.
  __ str(len, __ pre(sp, -2 * wordSize));

  Label DONE;
  // Fewer than 128 bytes: nothing is processed and 0 is returned.
  __ cbz(len, DONE);

  // Compute #rounds for AES based on the length of the key array
  __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

  __ aesenc_loadkeys(key, keylen);
  __ ld1(v0, __ T16B, counter); // v0 contains the first counter
  __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter

  // AES/CTR loop
  {
    Label L_CTR_loop;
    __ BIND(L_CTR_loop);

    // Setup the counters
    // (rebuilt every iteration because v8/v9 are reloaded with input data
    // further down)
    __ movi(v8, __ T4S, 0);
    __ movi(v9, __ T4S, 1);
    __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }

    // Materialize eight consecutive counter blocks in v0..v7. The 4S add
    // with { 0, 0, 0, 1 } increments only the last 32-bit lane, without
    // carrying into the other lanes.
    assert(v0->encoding() < v8->encoding(), "");
    for (int i = v0->encoding(); i < v8->encoding(); i++) {
      FloatRegister f = as_FloatRegister(i);
      __ rev32(f, __ T16B, v16);
      __ addv(v16, __ T4S, v16, v8);
    }

    __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));

    // Encrypt the counters
    __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);

    __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));

    // XOR the encrypted counters with the inputs
    for (int i = 0; i < 8; i++) {
      FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
      FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
      __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
    }
    __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
    __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));

    __ subw(len, len, 16 * 8);
    __ cbnzw(len, L_CTR_loop);
  }

  // Store the advanced counter back in its original byte order.
  __ rev32(v16, __ T16B, v16);
  __ st1(v16, __ T16B, counter);

  // Reload the rounded byte count saved on entry (the loop counted len down
  // to zero); the stack slot stays in place for the final pop.
  __ ldr(len, Address(sp));
  __ lsr(len, len, exact_log2(16));  // We want the count of blocks

  // GHASH/CTR loop
  __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
                              len, /*unrolls*/4);

  // Debug builds check that the GHASH helper left len at zero.
#ifdef ASSERT
  { Label L;
    __ cmp(len, (unsigned char)0);
    __ br(Assembler::EQ, L);
    __ stop("stubGenerator: abort");
    __ bind(L);
  }
#endif

  __ bind(DONE);
  // Return the number of bytes processed
  __ ldr(r0, __ post(sp, 2 * wordSize));

  // Restore the vector registers in the reverse order of the saves above.
  __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
  __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));

  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ ret(lr);

  // bind label and generate polynomial data
  __ align(wordSize * 2);
  __ bind(ghash_polynomial);
  __ emit_int64(0x87);  // The low-order bits of the field
                        // polynomial (i.e. p = z^7+z^2+z+1)
                        // repeated in the low and high parts of a
                        // 128-bit vector
  __ emit_int64(0x87);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
4068
4069 class Cached64Bytes {
4070 private:
4071 MacroAssembler *_masm;
4072 Register _regs[8];
4073
4074 public:
4075 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
4076 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
4077 auto it = rs.begin();
4078 for (auto &r: _regs) {
4079 r = *it;
4080 ++it;
4081 }
4082 }
4083
4084 void gen_loads(Register base) {
4085 for (int i = 0; i < 8; i += 2) {
4086 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
4087 }
4088 }
4089
4090 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
4091 void extract_u32(Register dest, int i) {
4092 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
4093 }
4094 };
4095
// Utility routines for md5.
// Clobbers rscratch1, rscratch2, r10 and r11.
4098 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4099 int k, int s, int t) {
4100 Register rscratch3 = r10;
4101 Register rscratch4 = r11;
4102
4103 __ eorw(rscratch3, r3, r4);
4104 __ movw(rscratch2, t);
4105 __ andw(rscratch3, rscratch3, r2);
4106 __ addw(rscratch4, r1, rscratch2);
4107 reg_cache.extract_u32(rscratch1, k);
4108 __ eorw(rscratch3, rscratch3, r4);
4109 __ addw(rscratch4, rscratch4, rscratch1);
4110 __ addw(rscratch3, rscratch3, rscratch4);
4111 __ rorw(rscratch2, rscratch3, 32 - s);
4112 __ addw(r1, rscratch2, r2);
4113 }
4114
4115 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4116 int k, int s, int t) {
4117 Register rscratch3 = r10;
4118 Register rscratch4 = r11;
4119
4120 reg_cache.extract_u32(rscratch1, k);
4121 __ movw(rscratch2, t);
4122 __ addw(rscratch4, r1, rscratch2);
4123 __ addw(rscratch4, rscratch4, rscratch1);
4124 __ bicw(rscratch2, r3, r4);
4125 __ andw(rscratch3, r2, r4);
4126 __ addw(rscratch2, rscratch2, rscratch4);
4127 __ addw(rscratch2, rscratch2, rscratch3);
4128 __ rorw(rscratch2, rscratch2, 32 - s);
4129 __ addw(r1, rscratch2, r2);
4130 }
4131
4132 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4133 int k, int s, int t) {
4134 Register rscratch3 = r10;
4135 Register rscratch4 = r11;
4136
4137 __ eorw(rscratch3, r3, r4);
4138 __ movw(rscratch2, t);
4139 __ addw(rscratch4, r1, rscratch2);
4140 reg_cache.extract_u32(rscratch1, k);
4141 __ eorw(rscratch3, rscratch3, r2);
4142 __ addw(rscratch4, rscratch4, rscratch1);
4143 __ addw(rscratch3, rscratch3, rscratch4);
4144 __ rorw(rscratch2, rscratch3, 32 - s);
4145 __ addw(r1, rscratch2, r2);
4146 }
4147
4148 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4149 int k, int s, int t) {
4150 Register rscratch3 = r10;
4151 Register rscratch4 = r11;
4152
4153 __ movw(rscratch3, t);
4154 __ ornw(rscratch2, r2, r4);
4155 __ addw(rscratch4, r1, rscratch3);
4156 reg_cache.extract_u32(rscratch1, k);
4157 __ eorw(rscratch3, rscratch2, r3);
4158 __ addw(rscratch4, rscratch4, rscratch1);
4159 __ addw(rscratch3, rscratch3, rscratch4);
4160 __ rorw(rscratch2, rscratch3, 32 - s);
4161 __ addw(r1, rscratch2, r2);
4162 }
4163
4164 // Arguments:
4165 //
4166 // Inputs:
4167 // c_rarg0 - byte[] source+offset
// c_rarg1 - int[] MD5.state
4169 // c_rarg2 - int offset
4170 // c_rarg3 - int limit
4171 //
address generate_md5_implCompress(StubId stub_id) {
  // Emits the MD5 compression stub for one 64-byte block, or, for the
  // multi-block id, a loop over consecutive blocks. Returns the entry address.
  //
  // Register use in the generated code:
  //   r4..r7               - working values a, b, c, d
  //   r12, r13             - the 16-byte state packed as two 64-bit values
  //   r14..r17, r19..r22   - the current 64-byte block (see reg_cache)
  //   rscratch1, rscratch2, r10, r11 - scratch for the step helpers
  bool multi_block;
  switch (stub_id) {
  case StubId::stubgen_md5_implCompress_id:
    multi_block = false;
    break;
  case StubId::stubgen_md5_implCompressMB_id:
    multi_block = true;
    break;
  default:
    ShouldNotReachHere();
  }
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // A non-null result means an entry for this stub already exists
  // (presumably restored from an archive), so no code is generated.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);

  StubCodeMark mark(this, stub_id);
  start = __ pc();

  Register buf       = c_rarg0;
  Register state     = c_rarg1;
  Register ofs       = c_rarg2;
  Register limit     = c_rarg3;
  Register a         = r4;
  Register b         = r5;
  Register c         = r6;
  Register d         = r7;
  // NOTE(review): rscratch3 is not referenced anywhere in this function.
  Register rscratch3 = r10;
  Register rscratch4 = r11;

  Register state_regs[2] = { r12, r13 };
  // r16..r22 without r18 gives six registers; together with r14 and r15
  // they form the eight cache registers. Only the six are pushed and popped.
  RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
  Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs);  // using 8 registers

  __ push(saved_regs, sp);

  // Load the state as two 64-bit values and split each into its low and
  // high 32-bit halves.
  __ ldp(state_regs[0], state_regs[1], Address(state));
  __ ubfx(a, state_regs[0],  0, 32);
  __ ubfx(b, state_regs[0], 32, 32);
  __ ubfx(c, state_regs[1],  0, 32);
  __ ubfx(d, state_regs[1], 32, 32);

  Label md5_loop;
  __ BIND(md5_loop);

  // Pull the current 64-byte block into the cache registers.
  reg_cache.gen_loads(buf);

  // Each call below emits one step; the arguments after the four working
  // registers are the message word index, the rotate amount and the additive
  // constant.

  // Round 1
  md5_FF(reg_cache, a, b, c, d,  0,  7, 0xd76aa478);
  md5_FF(reg_cache, d, a, b, c,  1, 12, 0xe8c7b756);
  md5_FF(reg_cache, c, d, a, b,  2, 17, 0x242070db);
  md5_FF(reg_cache, b, c, d, a,  3, 22, 0xc1bdceee);
  md5_FF(reg_cache, a, b, c, d,  4,  7, 0xf57c0faf);
  md5_FF(reg_cache, d, a, b, c,  5, 12, 0x4787c62a);
  md5_FF(reg_cache, c, d, a, b,  6, 17, 0xa8304613);
  md5_FF(reg_cache, b, c, d, a,  7, 22, 0xfd469501);
  md5_FF(reg_cache, a, b, c, d,  8,  7, 0x698098d8);
  md5_FF(reg_cache, d, a, b, c,  9, 12, 0x8b44f7af);
  md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
  md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
  md5_FF(reg_cache, a, b, c, d, 12,  7, 0x6b901122);
  md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
  md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
  md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);

  // Round 2
  md5_GG(reg_cache, a, b, c, d,  1,  5, 0xf61e2562);
  md5_GG(reg_cache, d, a, b, c,  6,  9, 0xc040b340);
  md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
  md5_GG(reg_cache, b, c, d, a,  0, 20, 0xe9b6c7aa);
  md5_GG(reg_cache, a, b, c, d,  5,  5, 0xd62f105d);
  md5_GG(reg_cache, d, a, b, c, 10,  9, 0x02441453);
  md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
  md5_GG(reg_cache, b, c, d, a,  4, 20, 0xe7d3fbc8);
  md5_GG(reg_cache, a, b, c, d,  9,  5, 0x21e1cde6);
  md5_GG(reg_cache, d, a, b, c, 14,  9, 0xc33707d6);
  md5_GG(reg_cache, c, d, a, b,  3, 14, 0xf4d50d87);
  md5_GG(reg_cache, b, c, d, a,  8, 20, 0x455a14ed);
  md5_GG(reg_cache, a, b, c, d, 13,  5, 0xa9e3e905);
  md5_GG(reg_cache, d, a, b, c,  2,  9, 0xfcefa3f8);
  md5_GG(reg_cache, c, d, a, b,  7, 14, 0x676f02d9);
  md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);

  // Round 3
  md5_HH(reg_cache, a, b, c, d,  5,  4, 0xfffa3942);
  md5_HH(reg_cache, d, a, b, c,  8, 11, 0x8771f681);
  md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
  md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
  md5_HH(reg_cache, a, b, c, d,  1,  4, 0xa4beea44);
  md5_HH(reg_cache, d, a, b, c,  4, 11, 0x4bdecfa9);
  md5_HH(reg_cache, c, d, a, b,  7, 16, 0xf6bb4b60);
  md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
  md5_HH(reg_cache, a, b, c, d, 13,  4, 0x289b7ec6);
  md5_HH(reg_cache, d, a, b, c,  0, 11, 0xeaa127fa);
  md5_HH(reg_cache, c, d, a, b,  3, 16, 0xd4ef3085);
  md5_HH(reg_cache, b, c, d, a,  6, 23, 0x04881d05);
  md5_HH(reg_cache, a, b, c, d,  9,  4, 0xd9d4d039);
  md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
  md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
  md5_HH(reg_cache, b, c, d, a,  2, 23, 0xc4ac5665);

  // Round 4
  md5_II(reg_cache, a, b, c, d,  0,  6, 0xf4292244);
  md5_II(reg_cache, d, a, b, c,  7, 10, 0x432aff97);
  md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
  md5_II(reg_cache, b, c, d, a,  5, 21, 0xfc93a039);
  md5_II(reg_cache, a, b, c, d, 12,  6, 0x655b59c3);
  md5_II(reg_cache, d, a, b, c,  3, 10, 0x8f0ccc92);
  md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
  md5_II(reg_cache, b, c, d, a,  1, 21, 0x85845dd1);
  md5_II(reg_cache, a, b, c, d,  8,  6, 0x6fa87e4f);
  md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
  md5_II(reg_cache, c, d, a, b,  6, 15, 0xa3014314);
  md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
  md5_II(reg_cache, a, b, c, d,  4,  6, 0xf7537e82);
  md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
  md5_II(reg_cache, c, d, a, b,  2, 15, 0x2ad7d2bb);
  md5_II(reg_cache, b, c, d, a,  9, 21, 0xeb86d391);

  // Add the state values from before this block to the working values.
  // The high halves of the packed state are extracted into scratch first.
  __ addw(a, state_regs[0], a);
  __ ubfx(rscratch2, state_regs[0], 32, 32);
  __ addw(b, rscratch2, b);
  __ addw(c, state_regs[1], c);
  __ ubfx(rscratch4, state_regs[1], 32, 32);
  __ addw(d, rscratch4, d);

  // Re-pack the updated state: low word | (high word << 32).
  __ orr(state_regs[0], a, b, Assembler::LSL, 32);
  __ orr(state_regs[1], c, d, Assembler::LSL, 32);

  if (multi_block) {
    // Advance to the next block and continue while ofs <= limit. The state
    // stays in registers across iterations and is stored once after the loop.
    __ add(buf, buf, 64);
    __ add(ofs, ofs, 64);
    __ cmp(ofs, limit);
    __ br(Assembler::LE, md5_loop);
    __ mov(c_rarg0, ofs); // return ofs
  }

  // write hash values back in the correct order
  __ stp(state_regs[0], state_regs[1], Address(state));

  __ pop(saved_regs, sp);

  __ ret(lr);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
4325
4326 // Arguments:
4327 //
4328 // Inputs:
4329 // c_rarg0 - byte[] source+offset
4330 // c_rarg1 - int[] SHA.state
4331 // c_rarg2 - int offset
4332 // c_rarg3 - int limit
4333 //
address generate_sha1_implCompress(StubId stub_id) {
  // Emits the SHA-1 compression stub for one 64-byte block, or, for the
  // multi-block id, a loop over consecutive blocks. Returns the entry address.
  //
  // Vector register use in the generated code:
  //   v0..v3   - the four constants emitted at 'keys', one per register
  //   v6, v7   - hash state: words 0..3 in v6, word 4 in v7
  //   v16..v19 - the current 64-byte block, rotated through d0..d3
  //   v20      - working copy of v6 updated by sha1c/sha1p/sha1m
  //   v4, v5, v21, v22 - temporaries that alternate roles between rounds
  bool multi_block;
  switch (stub_id) {
  case StubId::stubgen_sha1_implCompress_id:
    multi_block = false;
    break;
  case StubId::stubgen_sha1_implCompressMB_id:
    multi_block = true;
    break;
  default:
    ShouldNotReachHere();
  }
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // A non-null result means an entry for this stub already exists
  // (presumably restored from an archive), so no code is generated.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);

  StubCodeMark mark(this, stub_id);
  start = __ pc();

  Register buf   = c_rarg0;
  Register state = c_rarg1;
  Register ofs   = c_rarg2;
  Register limit = c_rarg3;

  Label keys;
  Label sha1_loop;

  // load the keys into v0..v3
  // ('keys' is bound after the ret below, where the four words are emitted)
  __ adr(rscratch1, keys);
  __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
  // load 5 words state into v6, v7
  __ ldrq(v6, Address(state, 0));
  __ ldrs(v7, Address(state, 16));


  __ BIND(sha1_loop);
  // load 64 bytes of data into v16..v19
  // (buf is post-incremented only in the multi-block variant)
  __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
  // Reverse the bytes within each 32-bit word of the block.
  __ rev32(v16, __ T16B, v16);
  __ rev32(v17, __ T16B, v17);
  __ rev32(v18, __ T16B, v18);
  __ rev32(v19, __ T16B, v19);

  // do the sha1
  // Prime the first data+constant sum in v4 and copy the state into v20
  // (orr of a register with itself is a vector move).
  __ addv(v4, __ T4S, v16, v0);
  __ orr(v20, __ T16B, v6, v6);

  FloatRegister d0 = v16;
  FloatRegister d1 = v17;
  FloatRegister d2 = v18;
  FloatRegister d3 = v19;

  // Fully unrolled at generation time: 20 iterations, with temporaries
  // swapping between odd and even iterations so that each iteration
  // prepares the operands the next one consumes.
  for (int round = 0; round < 20; round++) {
    FloatRegister tmp1 = (round & 1) ? v4 : v5;
    FloatRegister tmp2 = (round & 1) ? v21 : v22;
    // The very first iteration takes its third operand from v7 (state word 4).
    FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
    FloatRegister tmp4 = (round & 1) ? v5 : v4;
    // Constant selection switches at iterations 4, 9 and 14 because the sum
    // computed here is consumed by the following iteration.
    FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));

    if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
    if (round < 19) __ addv(tmp1, __ T4S, d1, key);
    __ sha1h(tmp2, __ T4S, v20);
    // Iterations 0..4 use sha1c, 5..9 and 15..19 use sha1p, 10..14 use sha1m.
    if (round < 5)
      __ sha1c(v20, __ T4S, tmp3, tmp4);
    else if (round < 10 || round >= 15)
      __ sha1p(v20, __ T4S, tmp3, tmp4);
    else
      __ sha1m(v20, __ T4S, tmp3, tmp4);
    if (round < 16) __ sha1su1(d0, __ T4S, d3);

    // Rotate the four data registers for the next iteration.
    tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
  }

  // Fold the results of this block into the hash state.
  __ addv(v7, __ T2S, v7, v21);
  __ addv(v6, __ T4S, v6, v20);

  if (multi_block) {
    // Continue while ofs <= limit; the state stays in v6/v7 across
    // iterations and is stored once after the loop.
    __ add(ofs, ofs, 64);
    __ cmp(ofs, limit);
    __ br(Assembler::LE, sha1_loop);
    __ mov(c_rarg0, ofs); // return ofs
  }

  // Write the updated state back.
  __ strq(v6, Address(state, 0));
  __ strs(v7, Address(state, 16));

  __ ret(lr);

  // Constant data read by the adr/ld4r at the top of the stub.
  __ bind(keys);
  __ emit_int32(0x5a827999);
  __ emit_int32(0x6ed9eba1);
  __ emit_int32(0x8f1bbcdc);
  __ emit_int32(0xca62c1d6);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
4437
4438
4439 // Emits the SHA-256 compression stub selected by stub_id (single-block or multi-block variant).
4440 //
4441 // Inputs of the generated stub:
4442 // c_rarg0 - byte[] source+offset (buf; post-incremented by 64 per block in the multi-block variant only)
4443 // c_rarg1 - int[] SHA.state (32 bytes, read into v0/v1 on entry and written back on exit)
4444 // c_rarg2 - int offset (ofs; only used by the multi-block variant, which returns the updated value in c_rarg0)
4445 // c_rarg3 - int limit (only used by the multi-block variant)
4446 // Returns (to the C++ caller) the entry address of the stub.
4447 address generate_sha256_implCompress(StubId stub_id) {
4448 bool multi_block; // assigned by every case that does not end in ShouldNotReachHere()
4449 switch (stub_id) {
4450 case StubId::stubgen_sha256_implCompress_id:
4451 multi_block = false;
4452 break;
4453 case StubId::stubgen_sha256_implCompressMB_id:
4454 multi_block = true;
4455 break;
4456 default:
4457 ShouldNotReachHere();
4458 }
4459 int entry_count = StubInfo::entry_count(stub_id);
4460 assert(entry_count == 1, "sanity check");
4461 address start = load_archive_data(stub_id); // a non-null result is returned as-is and no code is emitted
4462 if (start != nullptr) {
4463 return start;
4464 }
4465 __ align(CodeEntryAlignment);
4466 StubCodeMark mark(this, stub_id);
4467 start = __ pc();
4468
4469 Register buf = c_rarg0;
4470 Register state = c_rarg1;
4471 Register ofs = c_rarg2;
4472 Register limit = c_rarg3;
4473
4474 Label sha1_loop; // per-block loop head. NOTE(review): the name looks carried over from the SHA-1 stub
4475
4476 __ stpd(v8, v9, __ pre(sp, -32)); // spill d8..d11: v8..v11 receive the message block below
4477 __ stpd(v10, v11, Address(sp, 16));
4478
4479 // dga == v0 (first 16 bytes of state, accumulated across blocks)
4480 // dgb == v1 (second 16 bytes of state, accumulated across blocks)
4481 // dg0 == v2 (per-block working copy of v0)
4482 // dg1 == v3 (per-block working copy of v1)
4483 // dg2 == v4 (copy of v2 taken before each sha256h, consumed by sha256h2)
4484 // t0 == v6 (data + key sum; alternates with v7 from one iteration to the next)
4485 // t1 == v7
4486
4487 // load 16 keys to v16..v31 (16 registers x 16 bytes = 256 bytes of _sha256_round_consts)
4488 __ lea(rscratch1, ExternalAddress((address)_sha256_round_consts));
4489 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
4490 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
4491 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
4492 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
4493
4494 // load 8 words (256 bits) state
4495 __ ldpq(v0, v1, state);
4496
4497 __ BIND(sha1_loop);
4498 // load 64 bytes of data into v8..v11 (buf only advances when multi_block)
4499 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
4500 __ rev32(v8, __ T16B, v8); // byte-reverse every 32-bit word of the block
4501 __ rev32(v9, __ T16B, v9);
4502 __ rev32(v10, __ T16B, v10);
4503 __ rev32(v11, __ T16B, v11);
4504
4505 __ addv(v6, __ T4S, v8, v16); // sum consumed by iteration 0: first data vector + first key vector
4506 __ orr(v2, __ T16B, v0, v0); // v2 = v0 (working copy)
4507 __ orr(v3, __ T16B, v1, v1); // v3 = v1 (working copy)
4508
4509 FloatRegister d0 = v8; // d0..d3 are generation-time names that rotate over v8..v11
4510 FloatRegister d1 = v9;
4511 FloatRegister d2 = v10;
4512 FloatRegister d3 = v11;
4513
4514
4515 for (int round = 0; round < 16; round++) { // runs at stub-generation time: 16 instruction groups are emitted back to back
4516 FloatRegister tmp1 = (round & 1) ? v6 : v7; // receives the sum for the next iteration
4517 FloatRegister tmp2 = (round & 1) ? v7 : v6; // holds the sum consumed by this iteration
4518 FloatRegister tmp3 = (round & 1) ? v2 : v4; // NOTE(review): tmp3 is never read
4519 FloatRegister tmp4 = (round & 1) ? v4 : v2; // NOTE(review): tmp4 is never read
4520
4521 if (round < 12) __ sha256su0(d0, __ T4S, d1); // schedule update, first half (emitted for iterations 0..11 only)
4522 __ orr(v4, __ T16B, v2, v2); // v4 = v2: sha256h overwrites v2 but sha256h2 still needs the old value
4523 if (round < 15)
4524 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); // next data vector + key vector v17..v31
4525 __ sha256h(v2, __ T4S, v3, tmp2);
4526 __ sha256h2(v3, __ T4S, v4, tmp2);
4527 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); // schedule update, second half
4528
4529 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; // rotate the four data-register names
4530 }
4531
4532 __ addv(v0, __ T4S, v0, v2); // fold the working copy back into the accumulated state
4533 __ addv(v1, __ T4S, v1, v3);
4534
4535 if (multi_block) {
4536 __ add(ofs, ofs, 64);
4537 __ cmp(ofs, limit);
4538 __ br(Assembler::LE, sha1_loop); // process another block while ofs <= limit
4539 __ mov(c_rarg0, ofs); // return ofs
4540 }
4541
4542 __ ldpd(v10, v11, Address(sp, 16)); // reload d8..d11 spilled on entry and pop the 32-byte frame
4543 __ ldpd(v8, v9, __ post(sp, 32));
4544
4545 __ stpq(v0, v1, state); // write the 32-byte state back
4546
4547 __ ret(lr);
4548
4549 // record the stub entry and end
4550 store_archive_data(stub_id, start, __ pc());
4551
4552 return start;
4553 }
4554
4555 // Double rounds for sha512: emits the instructions for double round number dr. Uses v5, v6 and v7 as scratch and rscratch2 as the round-constant cursor.
4556 void sha512_dround(int dr, // double-round index; the caller in this file passes 0..39
4557 FloatRegister vi0, FloatRegister vi1, // vi0..vi2: working-state inputs (only read here)
4558 FloatRegister vi2, FloatRegister vi3, // vi3: updated in place by addv/sha512h/sha512h2
4559 FloatRegister vi4, FloatRegister vrc0, // vi4: written with vi1 + vi3; vrc0: round constants added for this double round
4560 FloatRegister vrc1, FloatRegister vin0, // vrc1: receives the next 16 bytes of constants when dr < 36; vin0: data words for this double round
4561 FloatRegister vin1, FloatRegister vin2, // vin1..vin4: only read when dr < 32, to update vin0 in place
4562 FloatRegister vin3, FloatRegister vin4) {
4563 if (dr < 36) { // the last four double rounds load no further constants
4564 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
4565 }
4566 __ addv(v5, __ T2D, vrc0, vin0); // v5 = constants + data words
4567 __ ext(v6, __ T16B, vi2, vi3, 8); // v6 = high half of vi2 followed by low half of vi3
4568 __ ext(v5, __ T16B, v5, v5, 8); // swap the two 64-bit halves of v5
4569 __ ext(v7, __ T16B, vi1, vi2, 8); // v7 = high half of vi1 followed by low half of vi2
4570 __ addv(vi3, __ T2D, vi3, v5);
4571 if (dr < 32) { // schedule update is only emitted for the first 32 double rounds
4572 __ ext(v5, __ T16B, vin3, vin4, 8); // v5 is free again here; it feeds sha512su1 below
4573 __ sha512su0(vin0, __ T2D, vin1);
4574 }
4575 __ sha512h(vi3, __ T2D, v6, v7);
4576 if (dr < 32) {
4577 __ sha512su1(vin0, __ T2D, vin2, v5);
4578 }
4579 __ addv(vi4, __ T2D, vi1, vi3);
4580 __ sha512h2(vi3, __ T2D, vi1, vi0);
4581 }
4582
4583 // Emits the SHA-512 compression stub selected by stub_id (single-block or multi-block variant).
4584 //
4585 // Inputs of the generated stub:
4586 // c_rarg0 - byte[] source+offset (buf; post-incremented by 128 per block in both variants)
4587 // c_rarg1 - int[] SHA.state (NOTE(review): 64 bytes are read and written here as 4 x T2D, so "int[]" looks copied from the SHA-256 header -- confirm)
4588 // c_rarg2 - int offset (ofs; only used by the multi-block variant, which returns the updated value in c_rarg0)
4589 // c_rarg3 - int limit (only used by the multi-block variant)
4590 // Returns (to the C++ caller) the entry address of the stub.
4591 address generate_sha512_implCompress(StubId stub_id) {
4592 bool multi_block; // assigned by every case that does not end in ShouldNotReachHere()
4593 switch (stub_id) {
4594 case StubId::stubgen_sha512_implCompress_id:
4595 multi_block = false;
4596 break;
4597 case StubId::stubgen_sha512_implCompressMB_id:
4598 multi_block = true;
4599 break;
4600 default:
4601 ShouldNotReachHere();
4602 }
4603 int entry_count = StubInfo::entry_count(stub_id);
4604 assert(entry_count == 1, "sanity check");
4605 address start = load_archive_data(stub_id); // a non-null result is returned as-is and no code is emitted
4606 if (start != nullptr) {
4607 return start;
4608 }
4609 __ align(CodeEntryAlignment);
4610 StubCodeMark mark(this, stub_id);
4611 start = __ pc();
4612
4613 Register buf = c_rarg0;
4614 Register state = c_rarg1;
4615 Register ofs = c_rarg2;
4616 Register limit = c_rarg3;
4617
4618 __ stpd(v8, v9, __ pre(sp, -64)); // spill d8..d15: v8..v11 hold the state and v12..v15 message data below
4619 __ stpd(v10, v11, Address(sp, 16));
4620 __ stpd(v12, v13, Address(sp, 32));
4621 __ stpd(v14, v15, Address(sp, 48));
4622
4623 Label sha512_loop;
4624
4625 // load state (64 bytes into v8..v11)
4626 __ ld1(v8, v9, v10, v11, __ T2D, state);
4627
4628 // load first 4 round constants (64 bytes into v20..v23; nothing below writes these registers, so this is done once)
4629 __ lea(rscratch1, ExternalAddress((address)_sha512_round_consts));
4630 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); // rscratch1 now points just past those 64 bytes
4631
4632 __ BIND(sha512_loop);
4633 // load 128B of data into v12..v19
4634 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
4635 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
4636 __ rev64(v12, __ T16B, v12); // byte-reverse every 64-bit word of the block
4637 __ rev64(v13, __ T16B, v13);
4638 __ rev64(v14, __ T16B, v14);
4639 __ rev64(v15, __ T16B, v15);
4640 __ rev64(v16, __ T16B, v16);
4641 __ rev64(v17, __ T16B, v17);
4642 __ rev64(v18, __ T16B, v18);
4643 __ rev64(v19, __ T16B, v19);
4644
4645 __ mov(rscratch2, rscratch1); // reset the constant cursor for this block; sha512_dround post-increments rscratch2
4646
4647 __ mov(v0, __ T16B, v8); // v0..v3 = per-block working copy of the state
4648 __ mov(v1, __ T16B, v9);
4649 __ mov(v2, __ T16B, v10);
4650 __ mov(v3, __ T16B, v11);
4651
4652 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17); // 40 double rounds; the vi arguments cycle through v0..v4 with period 5
4653 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4654 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4655 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4656 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4657 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4658 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4659 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4660 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4661 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4662 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4663 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4664 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4665 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4666 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4667 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4668 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4669 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4670 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4671 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4672 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4673 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4674 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4675 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4676 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4677 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4678 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4679 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4680 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4681 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4682 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4683 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4684 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0); // dr >= 32: sha512_dround ignores vin1..vin4, v0 is only a placeholder
4685 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4686 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4687 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4688 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0); // dr >= 36: sha512_dround ignores vrc1 as well
4689 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4690 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4691 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4692
4693 __ addv(v8, __ T2D, v8, v0); // fold the working copy back into the accumulated state
4694 __ addv(v9, __ T2D, v9, v1);
4695 __ addv(v10, __ T2D, v10, v2);
4696 __ addv(v11, __ T2D, v11, v3);
4697
4698 if (multi_block) {
4699 __ add(ofs, ofs, 128);
4700 __ cmp(ofs, limit);
4701 __ br(Assembler::LE, sha512_loop); // process another block while ofs <= limit
4702 __ mov(c_rarg0, ofs); // return ofs
4703 }
4704
4705 __ st1(v8, v9, v10, v11, __ T2D, state); // write the 64-byte state back before v8..v11 are reloaded
4706
4707 __ ldpd(v14, v15, Address(sp, 48)); // reload d8..d15 spilled on entry and pop the 64-byte frame
4708 __ ldpd(v12, v13, Address(sp, 32));
4709 __ ldpd(v10, v11, Address(sp, 16));
4710 __ ldpd(v8, v9, __ post(sp, 64));
4711
4712 __ ret(lr);
4713
4714 // record the stub entry and end
4715 store_archive_data(stub_id, start, __ pc());
4716
4717 return start;
4718 }
4719
4720 // Execute one round of keccak of two computations in parallel.
4721 // One of the states should be loaded into the lower halves of
4722 // the vector registers v0-v24, the other should be loaded into
4723 // the upper halves of those registers. The ld1r instruction loads
4724 // the round constant into both halves of register v31.
4725 // Intermediate results c0...c4 and d0...d4 are computed
4726 // in registers v25...v30; v25...v31 are also reused as temporaries for rotated lanes.
4727 // All vector instructions that are used operate on both register
4728 // halves in parallel.
4729 // If only a single computation is needed, one can only load the lower halves.
4730 void keccak_round(Register rscratch1) { // the parameter (same name as the scratch register used elsewhere in this file) points at this round's constant and is post-incremented by 8
4731 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4732 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
4733 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4734 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4735 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4736 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4737 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4738 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4739 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4740 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4741
4742 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4743 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4744 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4745 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4746 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4747
4748 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4749 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4750 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4751 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4752 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4753 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4754 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4755 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4756 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4757 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4758 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4759 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4760 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4761 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4762 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4763 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4764 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4765 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4766 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4767 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4768 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4769 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4770 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4771 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4772 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4773
4774 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22')
4775 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4776 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4777 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4778 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4779
4780 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i] (v31 is free again: a20' was last read just above)
4781
4782 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4783 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4784 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4785 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4786 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
4787
4788 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
4789 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
4790 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
4791 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
4792 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
4793
4794 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
4795 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
4796 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
4797 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7)
4798 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8')
4799
4800 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
4801 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
4802 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
4803 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3)
4804 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4')
4805
4806 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
4807 }
4808
4809 // Emits the SHA-3 / SHAKE compression stub selected by stub_id (single-block or multi-block variant).
4810 //
4811 // Inputs of the generated stub (c_rarg5 is additionally used as a scratch register):
4812 // c_rarg0 - byte[] source+offset (buf; post-incremented as input is absorbed, in both variants)
4813 // c_rarg1 - byte[] SHA.state (25 64-bit lanes, read into v0..v24 on entry and written back on exit; the pointer is advanced while storing)
4814 // c_rarg2 - int block_size (the code below distinguishes 72, 104, 136, 144 and 168 by testing bits 7, 5 and 4)
4815 // c_rarg3 - int offset (ofs; only used by the multi-block variant, which returns the updated value in c_rarg0)
4816 // c_rarg4 - int limit (only used by the multi-block variant)
4817 // Returns (to the C++ caller) the entry address of the stub.
4818 address generate_sha3_implCompress(StubId stub_id) {
4819 bool multi_block; // assigned by every case that does not end in ShouldNotReachHere()
4820 switch (stub_id) {
4821 case StubId::stubgen_sha3_implCompress_id:
4822 multi_block = false;
4823 break;
4824 case StubId::stubgen_sha3_implCompressMB_id:
4825 multi_block = true;
4826 break;
4827 default:
4828 ShouldNotReachHere();
4829 }
4830 int entry_count = StubInfo::entry_count(stub_id);
4831 assert(entry_count == 1, "sanity check");
4832 address start = load_archive_data(stub_id); // a non-null result is returned as-is and no code is emitted
4833 if (start != nullptr) {
4834 return start;
4835 }
4836 __ align(CodeEntryAlignment);
4837 StubCodeMark mark(this, stub_id);
4838 start = __ pc();
4839
4840 Register buf = c_rarg0;
4841 Register state = c_rarg1;
4842 Register block_size = c_rarg2;
4843 Register ofs = c_rarg3;
4844 Register limit = c_rarg4;
4845
4846 Label sha3_loop, rounds24_loop;
4847 Label sha3_512_or_sha3_384, shake128;
4848
4849 __ stpd(v8, v9, __ pre(sp, -64)); // save callee-saved registers d8..d15 (restored before ret)
4850 __ stpd(v10, v11, Address(sp, 16));
4851 __ stpd(v12, v13, Address(sp, 32));
4852 __ stpd(v14, v15, Address(sp, 48));
4853
4854 // load state: one 64-bit lane per register, v0..v24
4855 __ add(rscratch1, state, 32); // rscratch1 walks the state so that the state register itself stays unchanged here
4856 __ ld1(v0, v1, v2, v3, __ T1D, state);
4857 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4858 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4859 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4860 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4861 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4862 __ ld1(v24, __ T1D, rscratch1);
4863
4864 __ BIND(sha3_loop);
4865
4866 // 24 keccak rounds (rscratch2 is the down-counter)
4867 __ movw(rscratch2, 24);
4868
4869 // load round_constants base (reloaded for every block because keccak_round post-increments it)
4870 __ lea(rscratch1, ExternalAddress((address) _sha3_round_consts));
4871
4872 // load input: the first 56 bytes are absorbed into lanes 0..6 for every block size
4873 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4874 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4875 __ eor(v0, __ T8B, v0, v25);
4876 __ eor(v1, __ T8B, v1, v26);
4877 __ eor(v2, __ T8B, v2, v27);
4878 __ eor(v3, __ T8B, v3, v28);
4879 __ eor(v4, __ T8B, v4, v29);
4880 __ eor(v5, __ T8B, v5, v30);
4881 __ eor(v6, __ T8B, v6, v31);
4882
4883 // block_size == 72, SHA3-512; block_size == 104, SHA3-384 (both have bit 7 clear)
4884 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4885
4886 // bit 7 set (136, 144 or 168): absorb lanes 7..16, i.e. 136 bytes in total so far
4887 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4887 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4888 __ eor(v7, __ T8B, v7, v25);
4889 __ eor(v8, __ T8B, v8, v26);
4890 __ eor(v9, __ T8B, v9, v27);
4891 __ eor(v10, __ T8B, v10, v28);
4892 __ eor(v11, __ T8B, v11, v29);
4893 __ eor(v12, __ T8B, v12, v30);
4894 __ eor(v13, __ T8B, v13, v31);
4895
4896 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4897 __ eor(v14, __ T8B, v14, v25);
4898 __ eor(v15, __ T8B, v15, v26);
4899 __ eor(v16, __ T8B, v16, v27);
4900
4901 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4902 __ andw(c_rarg5, block_size, 48); // 48 == bit4 | bit5; c_rarg5 is clobbered
4903 __ cbzw(c_rarg5, rounds24_loop);
4904
4905 __ tbnz(block_size, 5, shake128);
4906 // block_size == 144, bit5 == 0, SHA3-224: absorb lane 17
4907 __ ldrd(v28, __ post(buf, 8));
4908 __ eor(v17, __ T8B, v17, v28);
4909 __ b(rounds24_loop);
4910
4911 __ BIND(shake128); // absorb lanes 17..20
4912 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4913 __ eor(v17, __ T8B, v17, v28);
4914 __ eor(v18, __ T8B, v18, v29);
4915 __ eor(v19, __ T8B, v19, v30);
4916 __ eor(v20, __ T8B, v20, v31);
4917 __ b(rounds24_loop); // block_size == 168, SHAKE128
4918
4919 __ BIND(sha3_512_or_sha3_384); // absorb lanes 7..8 (72 bytes in total so far)
4920 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4921 __ eor(v7, __ T8B, v7, v25);
4922 __ eor(v8, __ T8B, v8, v26);
4923 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4924
4925 // SHA3-384: absorb lanes 9..12 (104 bytes in total)
4926 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4927 __ eor(v9, __ T8B, v9, v27);
4928 __ eor(v10, __ T8B, v10, v28);
4929 __ eor(v11, __ T8B, v11, v29);
4930 __ eor(v12, __ T8B, v12, v30);
4931
4932 __ BIND(rounds24_loop);
4933 __ subw(rscratch2, rscratch2, 1);
4934
4935 keccak_round(rscratch1); // emits one round; advances rscratch1 to the next round constant
4936
4937 __ cbnzw(rscratch2, rounds24_loop);
4938
4939 if (multi_block) {
4940 __ add(ofs, ofs, block_size);
4941 __ cmp(ofs, limit);
4942 __ br(Assembler::LE, sha3_loop); // absorb another block while ofs <= limit
4943 __ mov(c_rarg0, ofs); // return ofs
4944 }
4945
4946 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32)); // write the 25 lanes back; this advances the state register
4947 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4948 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4949 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4950 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4951 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4952 __ st1(v24, __ T1D, state);
4953
4954 // restore callee-saved registers
4955 __ ldpd(v14, v15, Address(sp, 48));
4956 __ ldpd(v12, v13, Address(sp, 32));
4957 __ ldpd(v10, v11, Address(sp, 16));
4958 __ ldpd(v8, v9, __ post(sp, 64));
4959
4960 __ ret(lr);
4961
4962 // record the stub entry and end
4963 store_archive_data(stub_id, start, __ pc());
4964
4965 return start;
4966 }
4967
4968 // Emits a stub that runs the 24 keccak rounds on two states at once and returns 0 in r0. Inputs:
4969 // c_rarg0 - long[] state0 (25 lanes; held in 64-bit lane 0 of v0..v24 and written back in place)
4970 // c_rarg1 - long[] state1 (25 lanes; held in 64-bit lane 1 of v0..v24 and written back in place)
4971 address generate_double_keccak() {
4972 StubId stub_id = StubId::stubgen_double_keccak_id;
4973 int entry_count = StubInfo::entry_count(stub_id);
4974 assert(entry_count == 1, "sanity check");
4975 address start = load_archive_data(stub_id); // a non-null result is returned as-is and no code is emitted
4976 if (start != nullptr) {
4977 return start;
4978 }
4979 // Implements the double_keccak() method of the
4980 // sun.security.provider.SHA3Parallel class
4981 __ align(CodeEntryAlignment);
4982 StubCodeMark mark(this, stub_id);
4983 start = __ pc();
4984 __ enter();
4985
4986 Register state0 = c_rarg0;
4987 Register state1 = c_rarg1;
4988
4989 Label rounds24_loop;
4990
4991 // save callee-saved registers
4992 __ stpd(v8, v9, __ pre(sp, -64));
4993 __ stpd(v10, v11, Address(sp, 16));
4994 __ stpd(v12, v13, Address(sp, 32));
4995 __ stpd(v14, v15, Address(sp, 48));
4996
4997 // load states: state0 into lane 0, then state1 into lane 1 of the same registers
4998 __ add(rscratch1, state0, 32); // rscratch1 walks the array so that state0 stays unchanged for the stores below
4999 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
5000 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
5001 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
5002 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
5003 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
5004 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
5005 __ ld1(v24, __ D, 0, rscratch1);
5006 __ add(rscratch1, state1, 32);
5007 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
5008 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
5009 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
5010 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
5011 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
5012 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
5013 __ ld1(v24, __ D, 1, rscratch1);
5014
5015 // 24 keccak rounds (rscratch2 is the down-counter)
5016 __ movw(rscratch2, 24);
5017
5018 // load round_constants base
5019 __ lea(rscratch1, ExternalAddress((address) _double_keccak_round_consts));
5020
5021 __ BIND(rounds24_loop);
5022 __ subw(rscratch2, rscratch2, 1);
5023 keccak_round(rscratch1); // emits one round; advances rscratch1 to the next round constant
5024 __ cbnzw(rscratch2, rounds24_loop);
5025
5026 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32)); // write lane 0 back to state0 (advances the state0 register)
5027 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
5028 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
5029 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
5030 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
5031 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
5032 __ st1(v24, __ D, 0, state0);
5033 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32)); // write lane 1 back to state1 (advances the state1 register)
5034 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
5035 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
5036 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
5037 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
5038 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
5039 __ st1(v24, __ D, 1, state1);
5040
5041 // restore callee-saved vector registers
5042 __ ldpd(v14, v15, Address(sp, 48));
5043 __ ldpd(v12, v13, Address(sp, 32));
5044 __ ldpd(v10, v11, Address(sp, 16));
5045 __ ldpd(v8, v9, __ post(sp, 64));
5046
5047 __ leave(); // required for proper stackwalking of RuntimeStub frame
5048 __ mov(r0, zr); // return 0
5049 __ ret(lr);
5050
5051 // record the stub entry and end
5052 store_archive_data(stub_id, start, __ pc());
5053
5054 return start;
5055 }
5056
5057 // ChaCha20 block function. This version parallelizes the 32-bit
5058 // state elements on each of 16 vectors, producing 4 blocks of
5059 // keystream at a time.
5060 //
5061 // state (int[16]) = c_rarg0
5062 // keystream (byte[256]) = c_rarg1 (the register is advanced past the buffer while storing)
5063 // return - number of bytes of produced keystream (always 256)
5064 //
5065 // This implementation takes each 32-bit integer from the state
5066 // array and broadcasts it across all 4 32-bit lanes of a vector register
5067 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
5068 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
5069 // the quarter round schedule is implemented as outlined in RFC 7539 section
5070 // 2.3. However, instead of sequentially processing the 3 quarter round
5071 // operations represented by one QUARTERROUND function, we instead stack all
5072 // the adds, xors and left-rotations from the first 4 quarter rounds together
5073 // and then do the same for the second set of 4 quarter rounds. This removes
5074 // some latency that would otherwise be incurred by waiting for an add to
5075 // complete before performing an xor (which depends on the result of the
5076 // add), etc. An adjustment happens between the first and second groups of 4
5077 // quarter rounds, but this is done only in the inputs to the macro functions
5078 // that generate the assembly instructions - these adjustments themselves are
5079 // not part of the resulting assembly.
5080 // The 4 registers v0-v3 are used during the quarter round operations as
5081 // scratch registers. Once the 20 rounds are complete, these 4 scratch
5082 // registers become the vectors involved in adding the start state back onto
5083 // the post-QR working state. After the adds are complete, each of the 16
5084 // vectors writes its first lane back to the keystream buffer, followed
5085 // by the second lane from all vectors and so on.
5086 address generate_chacha20Block_blockpar() {
5087 StubId stub_id = StubId::stubgen_chacha20Block_id;
5088 int entry_count = StubInfo::entry_count(stub_id);
5089 assert(entry_count == 1, "sanity check");
5090 address start = load_archive_data(stub_id); // a non-null result is returned as-is and no code is emitted
5091 if (start != nullptr) {
5092 return start;
5093 }
5094 Label L_twoRounds, L_cc20_const;
5095 __ align(CodeEntryAlignment);
5096 StubCodeMark mark(this, stub_id);
5097 start = __ pc();
5098 __ enter();
5099
5100 int i, j;
5101 const Register state = c_rarg0;
5102 const Register keystream = c_rarg1;
5103 const Register loopCtr = r10; // counts the 10 loop iterations down to zero
5104 const Register tmpAddr = r11; // cursor over the constants and the state array
5105 const FloatRegister ctrAddOverlay = v28;
5106 const FloatRegister lrot8Tbl = v29;
5107
5108 // Organize SIMD registers in an array that facilitates
5109 // putting repetitive opcodes into loop structures. It is
5110 // important that each grouping of 4 registers is monotonically
5111 // increasing to support the requirements of multi-register
5112 // instructions (e.g. ld4r, st4, etc.)
5113 const FloatRegister workSt[16] = {
5114 v4, v5, v6, v7, v16, v17, v18, v19,
5115 v20, v21, v22, v23, v24, v25, v26, v27
5116 };
5117
5118 // Pull in constant data. The first 16 bytes are the add overlay
5119 // which is applied to the vector holding the counter (state[12]).
5120 // The second 16 bytes is the index register for the 8-bit left
5121 // rotation tbl instruction.
5122 __ adr(tmpAddr, L_cc20_const);
5123 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
5124
5125 // Load from memory and interlace across 16 SIMD registers,
5126 // With each word from memory being broadcast to all lanes of
5127 // each successive SIMD register.
5128 // Addr(0) -> All lanes in workSt[i]
5129 // Addr(4) -> All lanes workSt[i + 1], etc.
5130 __ mov(tmpAddr, state); // use a copy so that the state register stays valid for the second pass below
5131 for (i = 0; i < 16; i += 4) {
5132 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
5133 __ post(tmpAddr, 16));
5134 }
5135 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
5136
5137 // Before entering the loop, create 5 4-register arrays. These
5138 // will hold the 4 registers that represent the a/b/c/d fields
5139 // in the quarter round operation. For instance the "b" field
5140 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
5141 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
5142 // since it is part of a diagonal organization. The aSet and scratch
5143 // register sets are defined at declaration time because they do not change
5144 // organization at any point during the 20-round processing.
5145 FloatRegister aSet[4] = { v4, v5, v6, v7 };
5146 FloatRegister bSet[4];
5147 FloatRegister cSet[4];
5148 FloatRegister dSet[4];
5149 FloatRegister scratch[4] = { v0, v1, v2, v3 };
5150
5151 // Set up the 10 iteration loop and perform all 8 quarter round ops
5152 __ mov(loopCtr, 10);
5153 __ BIND(L_twoRounds);
5154
5155 // Set to columnar organization and do the following 4 quarter-rounds:
5156 // QUARTERROUND(0, 4, 8, 12)
5157 // QUARTERROUND(1, 5, 9, 13)
5158 // QUARTERROUND(2, 6, 10, 14)
5159 // QUARTERROUND(3, 7, 11, 15)
5160 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
5161 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
5162 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
5163
5164 __ cc20_qr_add4(aSet, bSet); // a += b
5165 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5166 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
5167
5168 __ cc20_qr_add4(cSet, dSet); // c += d
5169 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5170 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
5171
5172 __ cc20_qr_add4(aSet, bSet); // a += b
5173 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5174 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
5175
5176 __ cc20_qr_add4(cSet, dSet); // c += d
5177 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5178 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
5179
5180 // Set to diagonal organization and do the next 4 quarter-rounds:
5181 // QUARTERROUND(0, 5, 10, 15)
5182 // QUARTERROUND(1, 6, 11, 12)
5183 // QUARTERROUND(2, 7, 8, 13)
5184 // QUARTERROUND(3, 4, 9, 14)
5185 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
5186 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
5187 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
5188
5189 __ cc20_qr_add4(aSet, bSet); // a += b
5190 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5191 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
5192
5193 __ cc20_qr_add4(cSet, dSet); // c += d
5194 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5195 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
5196
5197 __ cc20_qr_add4(aSet, bSet); // a += b
5198 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5199 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
5200
5201 __ cc20_qr_add4(cSet, dSet); // c += d
5202 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5203 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
5204
5205 // Decrement and iterate
5206 __ sub(loopCtr, loopCtr, 1);
5207 __ cbnz(loopCtr, L_twoRounds);
5208
5209 __ mov(tmpAddr, state); // rewind the cursor to the start of the state array
5210
5211 // Add the starting state back to the post-loop keystream
5212 // state. We read/interlace the state array from memory into
5213 // 4 registers similar to what we did in the beginning. Then
5214 // add the counter overlay onto workSt[12] at the end.
5215 for (i = 0; i < 16; i += 4) {
5216 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
5217 __ addv(workSt[i], __ T4S, workSt[i], v0);
5218 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
5219 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
5220 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
5221 }
5222 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
5223
5224 // Write working state into the keystream buffer. This is accomplished
5225 // by taking the lane "i" from each of the four vectors and writing
5226 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
5227 // repeating with the next 4 vectors until all 16 vectors have been used.
5228 // Then move to the next lane and repeat the process until all lanes have
5229 // been written.
5230 for (i = 0; i < 4; i++) { // 4 lanes x 4 stores x 16 bytes = 256 bytes written
5231 for (j = 0; j < 16; j += 4) {
5232 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
5233 __ post(keystream, 16));
5234 }
5235 }
5236
5237 __ mov(r0, 256); // Return length of output keystream
5238 __ leave();
5239 __ ret(lr);
5240
5241 // bind label and generate local constant data used by this stub
5242 // The constant data is broken into two 128-bit segments to be loaded
5243 // onto FloatRegisters. The first 128 bits are a counter add overlay
5244 // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
5245 // The second 128-bits is a table constant used for 8-bit left rotations.
5246 __ BIND(L_cc20_const);
5247 __ emit_int64(0x0000000100000000UL);
5248 __ emit_int64(0x0000000300000002UL);
5249 __ emit_int64(0x0605040702010003UL);
5250 __ emit_int64(0x0E0D0C0F0A09080BUL);
5251
5252 // record the stub entry and end
5253 store_archive_data(stub_id, start, __ pc());
5254
5255 return start;
5256 }
5257
5258 // Helpers to schedule parallel operation bundles across vector
5259 // register sequences of size 2, 4 or 8.
5260
5261 // Implement various primitive computations across vector sequences
5262
5263 template<int N>
5264 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5265 const VSeq<N>& v1, const VSeq<N>& v2) {
5266 // output must not be constant
5267 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5268 // output cannot overwrite pending inputs
5269 assert(!vs_write_before_read(v, v1), "output overwrites input");
5270 assert(!vs_write_before_read(v, v2), "output overwrites input");
5271 for (int i = 0; i < N; i++) {
5272 __ addv(v[i], T, v1[i], v2[i]);
5273 }
5274 }
5275
5276 template<int N>
5277 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5278 const VSeq<N>& v1, const VSeq<N>& v2) {
5279 // output must not be constant
5280 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5281 // output cannot overwrite pending inputs
5282 assert(!vs_write_before_read(v, v1), "output overwrites input");
5283 assert(!vs_write_before_read(v, v2), "output overwrites input");
5284 for (int i = 0; i < N; i++) {
5285 __ subv(v[i], T, v1[i], v2[i]);
5286 }
5287 }
5288
5289 template<int N>
5290 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5291 const VSeq<N>& v1, const VSeq<N>& v2) {
5292 // output must not be constant
5293 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5294 // output cannot overwrite pending inputs
5295 assert(!vs_write_before_read(v, v1), "output overwrites input");
5296 assert(!vs_write_before_read(v, v2), "output overwrites input");
5297 for (int i = 0; i < N; i++) {
5298 __ mulv(v[i], T, v1[i], v2[i]);
5299 }
5300 }
5301
5302 template<int N>
5303 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
5304 // output must not be constant
5305 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5306 // output cannot overwrite pending inputs
5307 assert(!vs_write_before_read(v, v1), "output overwrites input");
5308 for (int i = 0; i < N; i++) {
5309 __ negr(v[i], T, v1[i]);
5310 }
5311 }
5312
5313 template<int N>
5314 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5315 const VSeq<N>& v1, int shift) {
5316 // output must not be constant
5317 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5318 // output cannot overwrite pending inputs
5319 assert(!vs_write_before_read(v, v1), "output overwrites input");
5320 for (int i = 0; i < N; i++) {
5321 __ sshr(v[i], T, v1[i], shift);
5322 }
5323 }
5324
5325 template<int N>
5326 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5327 // output must not be constant
5328 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5329 // output cannot overwrite pending inputs
5330 assert(!vs_write_before_read(v, v1), "output overwrites input");
5331 assert(!vs_write_before_read(v, v2), "output overwrites input");
5332 for (int i = 0; i < N; i++) {
5333 __ andr(v[i], __ T16B, v1[i], v2[i]);
5334 }
5335 }
5336
5337 template<int N>
5338 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5339 // output must not be constant
5340 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5341 // output cannot overwrite pending inputs
5342 assert(!vs_write_before_read(v, v1), "output overwrites input");
5343 assert(!vs_write_before_read(v, v2), "output overwrites input");
5344 for (int i = 0; i < N; i++) {
5345 __ orr(v[i], __ T16B, v1[i], v2[i]);
5346 }
5347 }
5348
5349 template<int N>
5350 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
5351 // output must not be constant
5352 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5353 // output cannot overwrite pending inputs
5354 assert(!vs_write_before_read(v, v1), "output overwrites input");
5355 for (int i = 0; i < N; i++) {
5356 __ notr(v[i], __ T16B, v1[i]);
5357 }
5358 }
5359
5360 template<int N>
5361 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
5362 // output must not be constant
5363 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5364 // output cannot overwrite pending inputs
5365 assert(!vs_write_before_read(v, v1), "output overwrites input");
5366 assert(!vs_write_before_read(v, v2), "output overwrites input");
5367 for (int i = 0; i < N; i++) {
5368 __ sqdmulh(v[i], T, v1[i], v2[i]);
5369 }
5370 }
5371
5372 template<int N>
5373 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
5374 // output must not be constant
5375 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5376 // output cannot overwrite pending inputs
5377 assert(!vs_write_before_read(v, v1), "output overwrites input");
5378 assert(!vs_write_before_read(v, v2), "output overwrites input");
5379 for (int i = 0; i < N; i++) {
5380 __ mlsv(v[i], T, v1[i], v2[i]);
5381 }
5382 }
5383
5384 // load N/2 successive pairs of quadword values from memory in order
5385 // into N successive vector registers of the sequence via the
5386 // address supplied in base.
5387 template<int N>
5388 void vs_ldpq(const VSeq<N>& v, Register base) {
5389 for (int i = 0; i < N; i += 2) {
5390 __ ldpq(v[i], v[i+1], Address(base, 32 * i));
5391 }
5392 }
5393
5394 // load N/2 successive pairs of quadword values from memory in order
5395 // into N vector registers of the sequence via the address supplied
5396 // in base using post-increment addressing
5397 template<int N>
5398 void vs_ldpq_post(const VSeq<N>& v, Register base) {
5399 static_assert((N & (N - 1)) == 0, "sequence length must be even");
5400 for (int i = 0; i < N; i += 2) {
5401 __ ldpq(v[i], v[i+1], __ post(base, 32));
5402 }
5403 }
5404
5405 // store N successive vector registers of the sequence into N/2
5406 // successive pairs of quadword memory locations via the address
5407 // supplied in base using post-increment addressing
5408 template<int N>
5409 void vs_stpq_post(const VSeq<N>& v, Register base) {
5410 static_assert((N & (N - 1)) == 0, "sequence length must be even");
5411 for (int i = 0; i < N; i += 2) {
5412 __ stpq(v[i], v[i+1], __ post(base, 32));
5413 }
5414 }
5415
5416 // load N/2 pairs of quadword values from memory de-interleaved into
5417 // N vector registers 2 at a time via the address supplied in base
5418 // using post-increment addressing.
5419 template<int N>
5420 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5421 static_assert((N & (N - 1)) == 0, "sequence length must be even");
5422 for (int i = 0; i < N; i += 2) {
5423 __ ld2(v[i], v[i+1], T, __ post(base, 32));
5424 }
5425 }
5426
5427 // store N vector registers interleaved into N/2 pairs of quadword
5428 // memory locations via the address supplied in base using
5429 // post-increment addressing.
5430 template<int N>
5431 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5432 static_assert((N & (N - 1)) == 0, "sequence length must be even");
5433 for (int i = 0; i < N; i += 2) {
5434 __ st2(v[i], v[i+1], T, __ post(base, 32));
5435 }
5436 }
5437
5438 // load N quadword values from memory de-interleaved into N vector
5439 // registers 3 elements at a time via the address supplied in base.
5440 template<int N>
5441 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5442 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
5443 for (int i = 0; i < N; i += 3) {
5444 __ ld3(v[i], v[i+1], v[i+2], T, base);
5445 }
5446 }
5447
5448 // load N quadword values from memory de-interleaved into N vector
5449 // registers 3 elements at a time via the address supplied in base
5450 // using post-increment addressing.
5451 template<int N>
5452 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5453 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
5454 for (int i = 0; i < N; i += 3) {
5455 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
5456 }
5457 }
5458
5459 // load N/2 pairs of quadword values from memory into N vector
5460 // registers via the address supplied in base with each pair indexed
5461 // using the the start offset plus the corresponding entry in the
5462 // offsets array
5463 template<int N>
5464 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
5465 for (int i = 0; i < N/2; i++) {
5466 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5467 }
5468 }
5469
5470 // store N vector registers into N/2 pairs of quadword memory
5471 // locations via the address supplied in base with each pair indexed
5472 // using the the start offset plus the corresponding entry in the
5473 // offsets array
5474 template<int N>
5475 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
5476 for (int i = 0; i < N/2; i++) {
5477 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5478 }
5479 }
5480
5481 // load N single quadword values from memory into N vector registers
5482 // via the address supplied in base with each value indexed using
5483 // the the start offset plus the corresponding entry in the offsets
5484 // array
5485 template<int N>
5486 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5487 int start, int (&offsets)[N]) {
5488 for (int i = 0; i < N; i++) {
5489 __ ldr(v[i], T, Address(base, start + offsets[i]));
5490 }
5491 }
5492
5493 // store N vector registers into N single quadword memory locations
5494 // via the address supplied in base with each value indexed using
5495 // the the start offset plus the corresponding entry in the offsets
5496 // array
5497 template<int N>
5498 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5499 int start, int (&offsets)[N]) {
5500 for (int i = 0; i < N; i++) {
5501 __ str(v[i], T, Address(base, start + offsets[i]));
5502 }
5503 }
5504
5505 // load N/2 pairs of quadword values from memory de-interleaved into
5506 // N vector registers 2 at a time via the address supplied in base
5507 // with each pair indexed using the the start offset plus the
5508 // corresponding entry in the offsets array
5509 template<int N>
5510 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5511 Register tmp, int start, int (&offsets)[N/2]) {
5512 for (int i = 0; i < N/2; i++) {
5513 __ add(tmp, base, start + offsets[i]);
5514 __ ld2(v[2*i], v[2*i+1], T, tmp);
5515 }
5516 }
5517
5518 // store N vector registers 2 at a time interleaved into N/2 pairs
5519 // of quadword memory locations via the address supplied in base
5520 // with each pair indexed using the the start offset plus the
5521 // corresponding entry in the offsets array
5522 template<int N>
5523 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5524 Register tmp, int start, int (&offsets)[N/2]) {
5525 for (int i = 0; i < N/2; i++) {
5526 __ add(tmp, base, start + offsets[i]);
5527 __ st2(v[2*i], v[2*i+1], T, tmp);
5528 }
5529 }
5530
5531 // Helper routines for various flavours of Montgomery multiply
5532
5533 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
5534 // multiplications in parallel
5535 //
5536
5537 // See the montMul() method of the sun.security.provider.ML_DSA
5538 // class.
5539 //
5540 // Computes 4x4S results or 8x8H results
5541 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5542 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5543 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5544 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5545 // Outputs: va - 4x4S or 4x8H vector register sequences
5546 // vb, vc, vtmp and vq must all be disjoint
5547 // va must be disjoint from all other inputs/temps or must equal vc
5548 // va must have a non-zero delta i.e. it must not be a constant vseq.
5549 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5550 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5551 Assembler::SIMD_Arrangement T,
5552 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5553 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5554 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5555 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5556 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5557
5558 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5559 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5560
5561 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5562
5563 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5564 assert(vs_disjoint(va, vb), "va and vb overlap");
5565 assert(vs_disjoint(va, vq), "va and vq overlap");
5566 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5567 assert(!va.is_constant(), "output vector must identify 4 different registers");
5568
5569 // schedule 4 streams of instructions across the vector sequences
5570 for (int i = 0; i < 4; i++) {
5571 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5572 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5573 }
5574
5575 for (int i = 0; i < 4; i++) {
5576 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5577 }
5578
5579 for (int i = 0; i < 4; i++) {
5580 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5581 }
5582
5583 for (int i = 0; i < 4; i++) {
5584 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5585 }
5586 }
5587
5588 // Perform 8 32-bit (4x4S) or 16 16-bit (2 x 8H) Montgomery
5589 // multiplications in parallel
5590 //
5591
5592 // See the montMul() method of the sun.security.provider.ML_DSA
5593 // class.
5594 //
5595 // Computes 4x4S results or 8x8H results
5596 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5597 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5598 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5599 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5600 // Outputs: va - 4x4S or 4x8H vector register sequences
5601 // vb, vc, vtmp and vq must all be disjoint
5602 // va must be disjoint from all other inputs/temps or must equal vc
5603 // va must have a non-zero delta i.e. it must not be a constant vseq.
5604 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5605 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5606 Assembler::SIMD_Arrangement T,
5607 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5608 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5609 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5610 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5611 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5612
5613 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5614 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5615
5616 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5617
5618 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5619 assert(vs_disjoint(va, vb), "va and vb overlap");
5620 assert(vs_disjoint(va, vq), "va and vq overlap");
5621 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5622 assert(!va.is_constant(), "output vector must identify 2 different registers");
5623
5624 // schedule 2 streams of instructions across the vector sequences
5625 for (int i = 0; i < 2; i++) {
5626 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5627 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5628 }
5629
5630 for (int i = 0; i < 2; i++) {
5631 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5632 }
5633
5634 for (int i = 0; i < 2; i++) {
5635 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5636 }
5637
5638 for (int i = 0; i < 2; i++) {
5639 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5640 }
5641 }
5642
5643 // Perform 16 16-bit Montgomery multiplications in parallel.
5644 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5645 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5646 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5647 // It will assert that the register use is valid
5648 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5649 }
5650
5651 // Perform 32 16-bit Montgomery multiplications in parallel.
5652 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5653 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5654 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5655 // It will assert that the register use is valid
5656 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5657 }
5658
5659 // Perform 64 16-bit Montgomery multiplications in parallel.
5660 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
5661 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5662 // Schedule two successive 4x8H multiplies via the montmul helper
5663 // on the front and back halves of va, vb and vc. The helper will
5664 // assert that the register use has no overlap conflicts on each
5665 // individual call but we also need to ensure that the necessary
5666 // disjoint/equality constraints are met across both calls.
5667
5668 // vb, vc, vtmp and vq must be disjoint. va must either be
5669 // disjoint from all other registers or equal vc
5670
5671 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5672 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5673 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5674
5675 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5676 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5677
5678 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5679
5680 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5681 assert(vs_disjoint(va, vb), "va and vb overlap");
5682 assert(vs_disjoint(va, vq), "va and vq overlap");
5683 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5684
5685 // we multiply the front and back halves of each sequence 4 at a
5686 // time because
5687 //
5688 // 1) we are currently only able to get 4-way instruction
5689 // parallelism at best
5690 //
5691 // 2) we need registers for the constants in vq and temporary
5692 // scratch registers to hold intermediate results so vtmp can only
5693 // be a VSeq<4> which means we only have 4 scratch slots
5694
5695 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5696 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5697 }
5698
5699 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5700 const VSeq<4>& vc,
5701 const VSeq<4>& vtmp,
5702 const VSeq<2>& vq) {
5703 // compute a = montmul(a1, c)
5704 kyber_montmul32(vc, va1, vc, vtmp, vq);
5705 // ouptut a1 = a0 - a
5706 vs_subv(va1, __ T8H, va0, vc);
5707 // and a0 = a0 + a
5708 vs_addv(va0, __ T8H, va0, vc);
5709 }
5710
5711 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5712 const VSeq<4>& vb,
5713 const VSeq<4>& vtmp1,
5714 const VSeq<4>& vtmp2,
5715 const VSeq<2>& vq) {
5716 // compute c = a0 - a1
5717 vs_subv(vtmp1, __ T8H, va0, va1);
5718 // output a0 = a0 + a1
5719 vs_addv(va0, __ T8H, va0, va1);
5720 // output a1 = b montmul c
5721 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5722 }
5723
5724 void load64shorts(const VSeq<8>& v, Register shorts) {
5725 vs_ldpq_post(v, shorts);
5726 }
5727
5728 void load32shorts(const VSeq<4>& v, Register shorts) {
5729 vs_ldpq_post(v, shorts);
5730 }
5731
5732 void store64shorts(VSeq<8> v, Register tmpAddr) {
5733 vs_stpq_post(v, tmpAddr);
5734 }
5735
5736 // Kyber NTT function.
5737 // Implements
5738 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5739 //
5740 // coeffs (short[256]) = c_rarg0
5741 // ntt_zetas (short[256]) = c_rarg1
5742 address generate_kyberNtt() {
5743 StubId stub_id = StubId::stubgen_kyberNtt_id;
5744 int entry_count = StubInfo::entry_count(stub_id);
5745 assert(entry_count == 1, "sanity check");
5746 address start = load_archive_data(stub_id);
5747 if (start != nullptr) {
5748 return start;
5749 }
5750 __ align(CodeEntryAlignment);
5751 StubCodeMark mark(this, stub_id);
5752 start = __ pc();
5753 __ enter();
5754
5755 const Register coeffs = c_rarg0;
5756 const Register zetas = c_rarg1;
5757
5758 const Register kyberConsts = r10;
5759 const Register tmpAddr = r11;
5760
5761 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5762 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5763 VSeq<2> vq(30); // n.b. constants overlap vs3
5764
5765 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5766 // load the montmul constants
5767 vs_ldpq(vq, kyberConsts);
5768
5769 // Each level corresponds to an iteration of the outermost loop of the
5770 // Java method seilerNTT(int[] coeffs). There are some differences
5771 // from what is done in the seilerNTT() method, though:
5772 // 1. The computation is using 16-bit signed values, we do not convert them
5773 // to ints here.
5774 // 2. The zetas are delivered in a bigger array, 128 zetas are stored in
5775 // this array for each level, it is easier that way to fill up the vector
5776 // registers.
5777 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5778 // multiplications (this is because that way there should not be any
5779 // overflow during the inverse NTT computation), here we usr R = 2^16 so
5780 // that we can use the 16-bit arithmetic in the vector unit.
5781 //
5782 // On each level, we fill up the vector registers in such a way that the
5783 // array elements that need to be multiplied by the zetas go into one
5784 // set of vector registers while the corresponding ones that don't need to
5785 // be multiplied, go into another set.
5786 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5787 // registers interleaving the steps of 4 identical computations,
5788 // each done on 8 16-bit values per register.
5789
5790 // At levels 0-3 the coefficients multiplied by or added/subtracted
5791 // to the zetas occur in discrete blocks whose size is some multiple
5792 // of 32.
5793
5794 // level 0
5795 __ add(tmpAddr, coeffs, 256);
5796 load64shorts(vs1, tmpAddr);
5797 load64shorts(vs2, zetas);
5798 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5799 __ add(tmpAddr, coeffs, 0);
5800 load64shorts(vs1, tmpAddr);
5801 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5802 vs_addv(vs1, __ T8H, vs1, vs2);
5803 __ add(tmpAddr, coeffs, 0);
5804 vs_stpq_post(vs1, tmpAddr);
5805 __ add(tmpAddr, coeffs, 256);
5806 vs_stpq_post(vs3, tmpAddr);
5807 // restore montmul constants
5808 vs_ldpq(vq, kyberConsts);
5809 load64shorts(vs1, tmpAddr);
5810 load64shorts(vs2, zetas);
5811 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5812 __ add(tmpAddr, coeffs, 128);
5813 load64shorts(vs1, tmpAddr);
5814 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5815 vs_addv(vs1, __ T8H, vs1, vs2);
5816 __ add(tmpAddr, coeffs, 128);
5817 store64shorts(vs1, tmpAddr);
5818 __ add(tmpAddr, coeffs, 384);
5819 store64shorts(vs3, tmpAddr);
5820
5821 // level 1
5822 // restore montmul constants
5823 vs_ldpq(vq, kyberConsts);
5824 __ add(tmpAddr, coeffs, 128);
5825 load64shorts(vs1, tmpAddr);
5826 load64shorts(vs2, zetas);
5827 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5828 __ add(tmpAddr, coeffs, 0);
5829 load64shorts(vs1, tmpAddr);
5830 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5831 vs_addv(vs1, __ T8H, vs1, vs2);
5832 __ add(tmpAddr, coeffs, 0);
5833 store64shorts(vs1, tmpAddr);
5834 store64shorts(vs3, tmpAddr);
5835 vs_ldpq(vq, kyberConsts);
5836 __ add(tmpAddr, coeffs, 384);
5837 load64shorts(vs1, tmpAddr);
5838 load64shorts(vs2, zetas);
5839 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5840 __ add(tmpAddr, coeffs, 256);
5841 load64shorts(vs1, tmpAddr);
5842 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5843 vs_addv(vs1, __ T8H, vs1, vs2);
5844 __ add(tmpAddr, coeffs, 256);
5845 store64shorts(vs1, tmpAddr);
5846 store64shorts(vs3, tmpAddr);
5847
5848 // level 2
5849 vs_ldpq(vq, kyberConsts);
5850 int offsets1[4] = { 0, 32, 128, 160 };
5851 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5852 load64shorts(vs2, zetas);
5853 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5854 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5855 // kyber_subv_addv64();
5856 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5857 vs_addv(vs1, __ T8H, vs1, vs2);
5858 __ add(tmpAddr, coeffs, 0);
5859 vs_stpq_post(vs_front(vs1), tmpAddr);
5860 vs_stpq_post(vs_front(vs3), tmpAddr);
5861 vs_stpq_post(vs_back(vs1), tmpAddr);
5862 vs_stpq_post(vs_back(vs3), tmpAddr);
5863 vs_ldpq(vq, kyberConsts);
5864 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5865 load64shorts(vs2, zetas);
5866 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5867 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5868 // kyber_subv_addv64();
5869 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5870 vs_addv(vs1, __ T8H, vs1, vs2);
5871 __ add(tmpAddr, coeffs, 256);
5872 vs_stpq_post(vs_front(vs1), tmpAddr);
5873 vs_stpq_post(vs_front(vs3), tmpAddr);
5874 vs_stpq_post(vs_back(vs1), tmpAddr);
5875 vs_stpq_post(vs_back(vs3), tmpAddr);
5876
5877 // level 3
5878 vs_ldpq(vq, kyberConsts);
5879 int offsets2[4] = { 0, 64, 128, 192 };
5880 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5881 load64shorts(vs2, zetas);
5882 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5883 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5884 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5885 vs_addv(vs1, __ T8H, vs1, vs2);
5886 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5887 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5888
5889 vs_ldpq(vq, kyberConsts);
5890 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5891 load64shorts(vs2, zetas);
5892 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5893 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5894 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5895 vs_addv(vs1, __ T8H, vs1, vs2);
5896 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5897 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5898
5899 // level 4
5900 // At level 4 coefficients occur in 8 discrete blocks of size 16
5901 // so they are loaded using employing an ldr at 8 distinct offsets.
5902
5903 vs_ldpq(vq, kyberConsts);
5904 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5905 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5906 load64shorts(vs2, zetas);
5907 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5908 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5909 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5910 vs_addv(vs1, __ T8H, vs1, vs2);
5911 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5912 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5913
5914 vs_ldpq(vq, kyberConsts);
5915 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5916 load64shorts(vs2, zetas);
5917 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5918 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5919 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5920 vs_addv(vs1, __ T8H, vs1, vs2);
5921 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5922 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5923
5924 // level 5
5925 // At level 5 related coefficients occur in discrete blocks of size 8 so
5926 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
5927
5928 vs_ldpq(vq, kyberConsts);
5929 int offsets4[4] = { 0, 32, 64, 96 };
5930 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5931 load32shorts(vs_front(vs2), zetas);
5932 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5933 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5934 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5935 load32shorts(vs_front(vs2), zetas);
5936 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5937 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5938 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5939 load32shorts(vs_front(vs2), zetas);
5940 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5941 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5942
5943 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5944 load32shorts(vs_front(vs2), zetas);
5945 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5946 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5947
5948 // level 6
5949 // At level 6 related coefficients occur in discrete blocks of size 4 so
5950 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
5951
5952 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5953 load32shorts(vs_front(vs2), zetas);
5954 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5955 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5956 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5957 // __ ldpq(v18, v19, __ post(zetas, 32));
5958 load32shorts(vs_front(vs2), zetas);
5959 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5960 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5961
5962 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5963 load32shorts(vs_front(vs2), zetas);
5964 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5965 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5966
5967 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5968 load32shorts(vs_front(vs2), zetas);
5969 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5970 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5971
5972 __ leave(); // required for proper stackwalking of RuntimeStub frame
5973 __ mov(r0, zr); // return 0
5974 __ ret(lr);
5975
5976 // record the stub entry and end
5977 store_archive_data(stub_id, start, __ pc());
5978
5979 return start;
5980 }
5981
5982 // Kyber Inverse NTT function
5983 // Implements
5984 // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
5985 //
5986 // coeffs (short[256]) = c_rarg0
5987 // ntt_zetas (short[256]) = c_rarg1
5988 address generate_kyberInverseNtt() {
5989 StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
5990 int entry_count = StubInfo::entry_count(stub_id);
5991 assert(entry_count == 1, "sanity check");
5992 address start = load_archive_data(stub_id);
5993 if (start != nullptr) {
5994 return start;
5995 }
5996 __ align(CodeEntryAlignment);
5997 StubCodeMark mark(this, stub_id);
5998 start = __ pc();
5999 __ enter();
6000
6001 const Register coeffs = c_rarg0;
6002 const Register zetas = c_rarg1;
6003
6004 const Register kyberConsts = r10;
6005 const Register tmpAddr = r11;
6006 const Register tmpAddr2 = c_rarg2;
6007
6008 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
6009 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6010 VSeq<2> vq(30); // n.b. constants overlap vs3
6011
6012 __ lea(kyberConsts,
6013 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6014
6015 // level 0
6016 // At level 0 related coefficients occur in discrete blocks of size 4 so
6017 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
6018
6019 vs_ldpq(vq, kyberConsts);
6020 int offsets4[4] = { 0, 32, 64, 96 };
6021 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6022 load32shorts(vs_front(vs2), zetas);
6023 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6024 vs_front(vs2), vs_back(vs2), vtmp, vq);
6025 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6026 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6027 load32shorts(vs_front(vs2), zetas);
6028 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6029 vs_front(vs2), vs_back(vs2), vtmp, vq);
6030 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6031 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6032 load32shorts(vs_front(vs2), zetas);
6033 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6034 vs_front(vs2), vs_back(vs2), vtmp, vq);
6035 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6036 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6037 load32shorts(vs_front(vs2), zetas);
6038 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6039 vs_front(vs2), vs_back(vs2), vtmp, vq);
6040 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6041
6042 // level 1
6043 // At level 1 related coefficients occur in discrete blocks of size 8 so
6044 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
6045
6046 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6047 load32shorts(vs_front(vs2), zetas);
6048 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6049 vs_front(vs2), vs_back(vs2), vtmp, vq);
6050 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6051 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6052 load32shorts(vs_front(vs2), zetas);
6053 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6054 vs_front(vs2), vs_back(vs2), vtmp, vq);
6055 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6056
6057 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6058 load32shorts(vs_front(vs2), zetas);
6059 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6060 vs_front(vs2), vs_back(vs2), vtmp, vq);
6061 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6062 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6063 load32shorts(vs_front(vs2), zetas);
6064 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6065 vs_front(vs2), vs_back(vs2), vtmp, vq);
6066 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6067
6068 // level 2
6069 // At level 2 coefficients occur in 8 discrete blocks of size 16
6070 // so they are loaded using employing an ldr at 8 distinct offsets.
6071
6072 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6073 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
6074 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
6075 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6076 vs_subv(vs1, __ T8H, vs1, vs2);
6077 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
6078 load64shorts(vs2, zetas);
6079 vs_ldpq(vq, kyberConsts);
6080 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6081 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
6082
6083 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
6084 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
6085 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6086 vs_subv(vs1, __ T8H, vs1, vs2);
6087 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
6088 load64shorts(vs2, zetas);
6089 vs_ldpq(vq, kyberConsts);
6090 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6091 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
6092
6093 // Barrett reduction at indexes where overflow may happen
6094
6095 // load q and the multiplier for the Barrett reduction
6096 __ add(tmpAddr, kyberConsts, 16);
6097 vs_ldpq(vq, tmpAddr);
6098
6099 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
6100 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
6101 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
6102 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
6103 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6104 vs_sshr(vs2, __ T8H, vs2, 11);
6105 vs_mlsv(vs1, __ T8H, vs2, vq1);
6106 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
6107 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
6108 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6109 vs_sshr(vs2, __ T8H, vs2, 11);
6110 vs_mlsv(vs1, __ T8H, vs2, vq1);
6111 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
6112
6113 // level 3
6114 // From level 3 upwards coefficients occur in discrete blocks whose size is
6115 // some multiple of 32 so can be loaded using ldpq and suitable indexes.
6116
6117 int offsets2[4] = { 0, 64, 128, 192 };
6118 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
6119 vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
6120 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6121 vs_subv(vs1, __ T8H, vs1, vs2);
6122 vs_stpq_indexed(vs3, coeffs, 0, offsets2);
6123 load64shorts(vs2, zetas);
6124 vs_ldpq(vq, kyberConsts);
6125 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6126 vs_stpq_indexed(vs2, coeffs, 32, offsets2);
6127
6128 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
6129 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
6130 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6131 vs_subv(vs1, __ T8H, vs1, vs2);
6132 vs_stpq_indexed(vs3, coeffs, 256, offsets2);
6133 load64shorts(vs2, zetas);
6134 vs_ldpq(vq, kyberConsts);
6135 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6136 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
6137
6138 // level 4
6139
6140 int offsets1[4] = { 0, 32, 128, 160 };
6141 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
6142 vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
6143 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6144 vs_subv(vs1, __ T8H, vs1, vs2);
6145 vs_stpq_indexed(vs3, coeffs, 0, offsets1);
6146 load64shorts(vs2, zetas);
6147 vs_ldpq(vq, kyberConsts);
6148 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6149 vs_stpq_indexed(vs2, coeffs, 64, offsets1);
6150
6151 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
6152 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
6153 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6154 vs_subv(vs1, __ T8H, vs1, vs2);
6155 vs_stpq_indexed(vs3, coeffs, 256, offsets1);
6156 load64shorts(vs2, zetas);
6157 vs_ldpq(vq, kyberConsts);
6158 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6159 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
6160
6161 // level 5
6162
6163 __ add(tmpAddr, coeffs, 0);
6164 load64shorts(vs1, tmpAddr);
6165 __ add(tmpAddr, coeffs, 128);
6166 load64shorts(vs2, tmpAddr);
6167 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6168 vs_subv(vs1, __ T8H, vs1, vs2);
6169 __ add(tmpAddr, coeffs, 0);
6170 store64shorts(vs3, tmpAddr);
6171 load64shorts(vs2, zetas);
6172 vs_ldpq(vq, kyberConsts);
6173 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6174 __ add(tmpAddr, coeffs, 128);
6175 store64shorts(vs2, tmpAddr);
6176
6177 load64shorts(vs1, tmpAddr);
6178 __ add(tmpAddr, coeffs, 384);
6179 load64shorts(vs2, tmpAddr);
6180 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6181 vs_subv(vs1, __ T8H, vs1, vs2);
6182 __ add(tmpAddr, coeffs, 256);
6183 store64shorts(vs3, tmpAddr);
6184 load64shorts(vs2, zetas);
6185 vs_ldpq(vq, kyberConsts);
6186 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6187 __ add(tmpAddr, coeffs, 384);
6188 store64shorts(vs2, tmpAddr);
6189
6190 // Barrett reduction at indexes where overflow may happen
6191
6192 // load q and the multiplier for the Barrett reduction
6193 __ add(tmpAddr, kyberConsts, 16);
6194 vs_ldpq(vq, tmpAddr);
6195
6196 int offsets0[2] = { 0, 256 };
6197 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
6198 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6199 vs_sshr(vs2, __ T8H, vs2, 11);
6200 vs_mlsv(vs1, __ T8H, vs2, vq1);
6201 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
6202
6203 // level 6
6204
6205 __ add(tmpAddr, coeffs, 0);
6206 load64shorts(vs1, tmpAddr);
6207 __ add(tmpAddr, coeffs, 256);
6208 load64shorts(vs2, tmpAddr);
6209 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6210 vs_subv(vs1, __ T8H, vs1, vs2);
6211 __ add(tmpAddr, coeffs, 0);
6212 store64shorts(vs3, tmpAddr);
6213 load64shorts(vs2, zetas);
6214 vs_ldpq(vq, kyberConsts);
6215 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6216 __ add(tmpAddr, coeffs, 256);
6217 store64shorts(vs2, tmpAddr);
6218
6219 __ add(tmpAddr, coeffs, 128);
6220 load64shorts(vs1, tmpAddr);
6221 __ add(tmpAddr, coeffs, 384);
6222 load64shorts(vs2, tmpAddr);
6223 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6224 vs_subv(vs1, __ T8H, vs1, vs2);
6225 __ add(tmpAddr, coeffs, 128);
6226 store64shorts(vs3, tmpAddr);
6227 load64shorts(vs2, zetas);
6228 vs_ldpq(vq, kyberConsts);
6229 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6230 __ add(tmpAddr, coeffs, 384);
6231 store64shorts(vs2, tmpAddr);
6232
6233 // multiply by 2^-n
6234
6235 // load toMont(2^-n mod q)
6236 __ add(tmpAddr, kyberConsts, 48);
6237 __ ldr(v29, __ Q, tmpAddr);
6238
6239 vs_ldpq(vq, kyberConsts);
6240 __ add(tmpAddr, coeffs, 0);
6241 load64shorts(vs1, tmpAddr);
6242 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6243 __ add(tmpAddr, coeffs, 0);
6244 store64shorts(vs2, tmpAddr);
6245
6246 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so
6247 load64shorts(vs1, tmpAddr);
6248 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6249 __ add(tmpAddr, coeffs, 128);
6250 store64shorts(vs2, tmpAddr);
6251
6252 // now tmpAddr contains coeffs + 256
6253 load64shorts(vs1, tmpAddr);
6254 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6255 __ add(tmpAddr, coeffs, 256);
6256 store64shorts(vs2, tmpAddr);
6257
6258 // now tmpAddr contains coeffs + 384
6259 load64shorts(vs1, tmpAddr);
6260 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6261 __ add(tmpAddr, coeffs, 384);
6262 store64shorts(vs2, tmpAddr);
6263
6264 __ leave(); // required for proper stackwalking of RuntimeStub frame
6265 __ mov(r0, zr); // return 0
6266 __ ret(lr);
6267
6268 // record the stub entry and end
6269 store_archive_data(stub_id, start, __ pc());
6270
6271 return start;
6272 }
6273
6274 // Kyber multiply polynomials in the NTT domain.
6275 // Implements
6276 // static int implKyberNttMult(
6277 // short[] result, short[] ntta, short[] nttb, short[] zetas) {}
6278 //
6279 // The actual algorithm that is used here differs from the one in the Java
6280 // implementation, it uses Montgomery multiplications instead of Barrett
6281 // reduction, but the end result modulo MLKEM_Q is the same. This is the
6282 // Java equivalent of this intrinsic implementation:
6283 // static void implKyberNttMultJava(short[] result, short[] ntta, short[] nttb) {
6284 // for (int m = 0; m < ML_KEM_N / 2; m++) {
6285 // int a0 = ntta[2 * m];
6286 // int a1 = ntta[2 * m + 1];
6287 // int b0 = nttb[2 * m];
6288 // int b1 = nttb[2 * m + 1];
6289 // int r = montMul(a0, b0) +
6290 // montMul(montMul(a1, b1), MONT_ZETAS_FOR_NTT_MULT[m]);
6291 // result[2 * m] = (short) montMul(r, MONT_R_SQUARE_MOD_Q);
6292 // result[2 * m + 1] = (short) montMul(
6293 // (montMul(a0, b1) + montMul(a1, b0)), MONT_R_SQUARE_MOD_Q);
6294 // }
6295 // }
6296 //
6297 // result (short[256]) = c_rarg0
6298 // ntta (short[256]) = c_rarg1
6299 // nttb (short[256]) = c_rarg2
6300 // zetas (short[128]) = c_rarg3
6301 address generate_kyberNttMult() {
6302 StubId stub_id = StubId::stubgen_kyberNttMult_id;
6303 int entry_count = StubInfo::entry_count(stub_id);
6304 assert(entry_count == 1, "sanity check");
6305 address start = load_archive_data(stub_id);
6306 if (start != nullptr) {
6307 return start;
6308 }
6309 __ align(CodeEntryAlignment);
6310 StubCodeMark mark(this, stub_id);
6311 start = __ pc();
6312 __ enter();
6313
6314 const Register result = c_rarg0;
6315 const Register ntta = c_rarg1;
6316 const Register nttb = c_rarg2;
6317 const Register zetas = c_rarg3;
6318
6319 const Register kyberConsts = r10;
6320 const Register limit = r11;
6321
6322 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
6323 VSeq<4> vs3(16), vs4(20);
6324 VSeq<2> vq(30); // pair of constants for montmul: q, qinv
6325 VSeq<2> vz(28); // pair of zetas
6326 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
6327
6328 __ lea(kyberConsts,
6329 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6330
6331 Label kyberNttMult_loop;
6332
6333 __ add(limit, result, 512);
6334
6335 // load q and qinv
6336 vs_ldpq(vq, kyberConsts);
6337
6338 // load R^2 mod q (to convert back from Montgomery representation)
6339 __ add(kyberConsts, kyberConsts, 64);
6340 __ ldr(v27, __ Q, kyberConsts);
6341
6342 __ BIND(kyberNttMult_loop);
6343
6344 // load 16 zetas
6345 vs_ldpq_post(vz, zetas);
6346
6347 // load 2 sets of 32 coefficients from the two input arrays
6348 // interleaved as shorts. i.e. pairs of shorts adjacent in memory
6349 // are striped across pairs of vector registers
6350 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
6351 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
6352 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
6353 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
6354
6355 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
6356 // i.e. montmul the first and second halves of vs1 in order and
6357 // then with one sequence reversed storing the two results in vs3
6358 //
6359 // vs3[0] <- montmul(a0, b0)
6360 // vs3[1] <- montmul(a1, b1)
6361 // vs3[2] <- montmul(a0, b1)
6362 // vs3[3] <- montmul(a1, b0)
6363 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
6364 kyber_montmul16(vs_back(vs3),
6365 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
6366
6367 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
6368 // i.e. montmul the first and second halves of vs4 in order and
6369 // then with one sequence reversed storing the two results in vs1
6370 //
6371 // vs1[0] <- montmul(a2, b2)
6372 // vs1[1] <- montmul(a3, b3)
6373 // vs1[2] <- montmul(a2, b3)
6374 // vs1[3] <- montmul(a3, b2)
6375 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
6376 kyber_montmul16(vs_back(vs1),
6377 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
6378
6379 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
6380 // We can schedule two montmuls at a time if we use a suitable vector
6381 // sequence <vs3[1], vs1[1]>.
6382 int delta = vs1[1]->encoding() - vs3[1]->encoding();
6383 VSeq<2> vs5(vs3[1], delta);
6384
6385 // vs3[1] <- montmul(montmul(a1, b1), z0)
6386 // vs1[1] <- montmul(montmul(a3, b3), z1)
6387 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
6388
6389 // add results in pairs storing in vs3
6390 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
6391 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
6392 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
6393
6394 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
6395 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
6396 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
6397
6398 // vs1 <- montmul(vs3, montRSquareModQ)
6399 kyber_montmul32(vs1, vs3, vc, vs2, vq);
6400
6401 // store back the two pairs of result vectors de-interleaved as 8H elements
6402 // i.e. storing each pairs of shorts striped across a register pair adjacent
6403 // in memory
6404 vs_st2_post(vs1, __ T8H, result);
6405
6406 __ cmp(result, limit);
6407 __ br(Assembler::NE, kyberNttMult_loop);
6408
6409 __ leave(); // required for proper stackwalking of RuntimeStub frame
6410 __ mov(r0, zr); // return 0
6411 __ ret(lr);
6412
6413 // record the stub entry and end
6414 store_archive_data(stub_id, start, __ pc());
6415
6416 return start;
6417 }
6418
6419 // Kyber add 2 polynomials.
6420 // Implements
6421 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
6422 //
6423 // result (short[256]) = c_rarg0
6424 // a (short[256]) = c_rarg1
6425 // b (short[256]) = c_rarg2
6426 address generate_kyberAddPoly_2() {
6427 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
6428 int entry_count = StubInfo::entry_count(stub_id);
6429 assert(entry_count == 1, "sanity check");
6430 address start = load_archive_data(stub_id);
6431 if (start != nullptr) {
6432 return start;
6433 }
6434 __ align(CodeEntryAlignment);
6435 StubCodeMark mark(this, stub_id);
6436 start = __ pc();
6437 __ enter();
6438
6439 const Register result = c_rarg0;
6440 const Register a = c_rarg1;
6441 const Register b = c_rarg2;
6442
6443 const Register kyberConsts = r11;
6444
6445 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
6446 // So, we can load, add and store the data in 3 groups of 11,
6447 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
6448 // registers. A further constraint is that the mapping needs
6449 // to skip callee saves. So, we allocate the register
6450 // sequences using two 8 sequences, two 2 sequences and two
6451 // single registers.
6452 VSeq<8> vs1_1(0);
6453 VSeq<2> vs1_2(16);
6454 FloatRegister vs1_3 = v28;
6455 VSeq<8> vs2_1(18);
6456 VSeq<2> vs2_2(26);
6457 FloatRegister vs2_3 = v29;
6458
6459 // two constant vector sequences
6460 VSeq<8> vc_1(31, 0);
6461 VSeq<2> vc_2(31, 0);
6462
6463 FloatRegister vc_3 = v31;
6464 __ lea(kyberConsts,
6465 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6466
6467 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6468 for (int i = 0; i < 3; i++) {
6469 // load 80 or 88 values from a into vs1_1/2/3
6470 vs_ldpq_post(vs1_1, a);
6471 vs_ldpq_post(vs1_2, a);
6472 if (i < 2) {
6473 __ ldr(vs1_3, __ Q, __ post(a, 16));
6474 }
6475 // load 80 or 88 values from b into vs2_1/2/3
6476 vs_ldpq_post(vs2_1, b);
6477 vs_ldpq_post(vs2_2, b);
6478 if (i < 2) {
6479 __ ldr(vs2_3, __ Q, __ post(b, 16));
6480 }
6481 // sum 80 or 88 values across vs1 and vs2 into vs1
6482 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6483 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6484 if (i < 2) {
6485 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6486 }
6487 // add constant to all 80 or 88 results
6488 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6489 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6490 if (i < 2) {
6491 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6492 }
6493 // store 80 or 88 values
6494 vs_stpq_post(vs1_1, result);
6495 vs_stpq_post(vs1_2, result);
6496 if (i < 2) {
6497 __ str(vs1_3, __ Q, __ post(result, 16));
6498 }
6499 }
6500
6501 __ leave(); // required for proper stackwalking of RuntimeStub frame
6502 __ mov(r0, zr); // return 0
6503 __ ret(lr);
6504
6505 // record the stub entry and end
6506 store_archive_data(stub_id, start, __ pc());
6507
6508 return start;
6509 }
6510
6511 // Kyber add 3 polynomials.
6512 // Implements
6513 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
6514 //
6515 // result (short[256]) = c_rarg0
6516 // a (short[256]) = c_rarg1
6517 // b (short[256]) = c_rarg2
6518 // c (short[256]) = c_rarg3
6519 address generate_kyberAddPoly_3() {
6520 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
6521 int entry_count = StubInfo::entry_count(stub_id);
6522 assert(entry_count == 1, "sanity check");
6523 address start = load_archive_data(stub_id);
6524 if (start != nullptr) {
6525 return start;
6526 }
6527 __ align(CodeEntryAlignment);
6528 StubCodeMark mark(this, stub_id);
6529 start = __ pc();
6530 __ enter();
6531
6532 const Register result = c_rarg0;
6533 const Register a = c_rarg1;
6534 const Register b = c_rarg2;
6535 const Register c = c_rarg3;
6536
6537 const Register kyberConsts = r11;
6538
6539 // As above we sum 256 sets of values in total i.e. 32 x 8H
6540 // quadwords. So, we can load, add and store the data in 3
6541 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6542 // of 10 or 11 registers. A further constraint is that the
6543 // mapping needs to skip callee saves. So, we allocate the
6544 // register sequences using two 8 sequences, two 2 sequences
6545 // and two single registers.
6546 VSeq<8> vs1_1(0);
6547 VSeq<2> vs1_2(16);
6548 FloatRegister vs1_3 = v28;
6549 VSeq<8> vs2_1(18);
6550 VSeq<2> vs2_2(26);
6551 FloatRegister vs2_3 = v29;
6552
6553 // two constant vector sequences
6554 VSeq<8> vc_1(31, 0);
6555 VSeq<2> vc_2(31, 0);
6556
6557 FloatRegister vc_3 = v31;
6558
6559 __ lea(kyberConsts,
6560 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6561
6562 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6563 for (int i = 0; i < 3; i++) {
6564 // load 80 or 88 values from a into vs1_1/2/3
6565 vs_ldpq_post(vs1_1, a);
6566 vs_ldpq_post(vs1_2, a);
6567 if (i < 2) {
6568 __ ldr(vs1_3, __ Q, __ post(a, 16));
6569 }
6570 // load 80 or 88 values from b into vs2_1/2/3
6571 vs_ldpq_post(vs2_1, b);
6572 vs_ldpq_post(vs2_2, b);
6573 if (i < 2) {
6574 __ ldr(vs2_3, __ Q, __ post(b, 16));
6575 }
6576 // sum 80 or 88 values across vs1 and vs2 into vs1
6577 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6578 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6579 if (i < 2) {
6580 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6581 }
6582 // load 80 or 88 values from c into vs2_1/2/3
6583 vs_ldpq_post(vs2_1, c);
6584 vs_ldpq_post(vs2_2, c);
6585 if (i < 2) {
6586 __ ldr(vs2_3, __ Q, __ post(c, 16));
6587 }
6588 // sum 80 or 88 values across vs1 and vs2 into vs1
6589 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6590 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6591 if (i < 2) {
6592 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6593 }
6594 // add constant to all 80 or 88 results
6595 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6596 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6597 if (i < 2) {
6598 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6599 }
6600 // store 80 or 88 values
6601 vs_stpq_post(vs1_1, result);
6602 vs_stpq_post(vs1_2, result);
6603 if (i < 2) {
6604 __ str(vs1_3, __ Q, __ post(result, 16));
6605 }
6606 }
6607
6608 __ leave(); // required for proper stackwalking of RuntimeStub frame
6609 __ mov(r0, zr); // return 0
6610 __ ret(lr);
6611
6612 // record the stub entry and end
6613 store_archive_data(stub_id, start, __ pc());
6614
6615 return start;
6616 }
6617
// Kyber parse XOF output to polynomial coefficient candidates
// or decodePoly(12, ...).
// Implements
// static int implKyber12To16(
//         byte[] condensed, int index, short[] parsed, int parsedLength) {}
//
// we assume that parsed and condensed are allocated such that for
//   n = (parsedLength + 63) / 64
//   n blocks of 96 bytes of input can be processed, i.e.
//   index + n * 96 <= condensed.length and
//   n * 64 <= parsed.length
//
// condensed (byte[]) = c_rarg0
// condensedIndex = c_rarg1
// parsed (short[]) = c_rarg2
// parsedLength = c_rarg3
//
// Each pass of the generated loop reads 96 bytes and writes 64
// shorts, each holding one unpacked 12-bit value. The stub leaves 0
// in r0. The 16 bytes of constant data used for masking are emitted
// directly after the stub's return instruction.
address generate_kyber12To16() {
  StubId stub_id = StubId::stubgen_kyber12To16_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // a non-null result is returned as the stub entry and no code is
  // emitted for this stub
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  Label L_F00, L_loop;

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();
  __ enter();

  const Register condensed = c_rarg0;
  const Register condensedOffs = c_rarg1;
  const Register parsed = c_rarg2;
  const Register parsedLength = c_rarg3;

  const Register tmpAddr = r11;

  // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
  // quadwords so we need a 6 vector sequence for the inputs.
  // Parsing produces 64 shorts, employing two 8 vector
  // sequences to store and combine the intermediate data.
  VSeq<6> vin(24);
  VSeq<8> va(0), vb(16);

  // the mask constant lives in the code stream at L_F00 (bound after
  // the return below) so it is addressed pc-relative
  __ adr(tmpAddr, L_F00);
  __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
  // advance the input pointer to the first byte to be parsed
  __ add(condensed, condensed, condensedOffs);

  __ BIND(L_loop);
  // load 96 (6 x 16B) byte values
  vs_ld3_post(vin, __ T16B, condensed);

  // The front half of sequence vin (vin[0], vin[1] and vin[2])
  // holds 48 (16x3) contiguous bytes from memory striped
  // horizontally across each of the 16 byte lanes. Equivalently,
  // that is 16 pairs of 12-bit integers. Likewise the back half
  // holds the next 48 bytes in the same arrangement.

  // Each vector in the front half can also be viewed as a vertical
  // strip across the 16 pairs of 12 bit integers. Each byte in
  // vin[0] stores the low 8 bits of the first int in a pair. Each
  // byte in vin[1] stores the high 4 bits of the first int and the
  // low 4 bits of the second int. Each byte in vin[2] stores the
  // high 8 bits of the second int. Likewise the vectors in second
  // half.

  // Converting the data to 16-bit shorts requires first of all
  // expanding each of the 6 x 16B vectors into 6 corresponding
  // pairs of 8H vectors. Mask, shift and add operations on the
  // resulting vector pairs can be used to combine 4 and 8 bit
  // parts of related 8H vector elements.
  //
  // The middle vectors (vin[1] and vin[4]) are actually expanded
  // twice, one copy manipulated to provide the higher 4 bits
  // belonging to the first short in a pair and another copy
  // manipulated to provide the lower 4 bits belonging to the
  // second short in a pair. This is why the vector sequences va
  // and vb used to hold the expanded 8H elements are of length 8.

  // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
  // n.b. target elements 2 and 3 duplicate elements 4 and 5
  __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
  __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
  __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
  __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
  __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
  __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);

  // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
  // and vb[4:5]
  __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
  __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
  __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
  __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
  __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
  __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);

  // shift lo byte of copy 1 of the middle stripe into the high byte
  __ shl(va[2], __ T8H, va[2], 8);
  __ shl(va[3], __ T8H, va[3], 8);
  __ shl(vb[2], __ T8H, vb[2], 8);
  __ shl(vb[3], __ T8H, vb[3], 8);

  // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
  // time pre-shifted by 4 to ensure top bits of input 12-bit int
  // are in bit positions [4..11].
  __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
  __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
  __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
  __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);

  // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
  // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
  // copy2
  // n.b. after the shift by 8 above, the 0x0f00 mask keeps exactly the
  // low nibble of the middle byte, now sitting in bits [8..11]
  __ andr(va[2], __ T16B, va[2], v31);
  __ andr(va[3], __ T16B, va[3], v31);
  __ ushr(va[4], __ T8H, va[4], 4);
  __ ushr(va[5], __ T8H, va[5], 4);
  __ andr(vb[2], __ T16B, vb[2], v31);
  __ andr(vb[3], __ T16B, vb[3], v31);
  __ ushr(vb[4], __ T8H, vb[4], 4);
  __ ushr(vb[5], __ T8H, vb[5], 4);

  // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
  // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
  // n.b. the ordering ensures: i) inputs are consumed before they
  // are overwritten ii) the order of 16-bit results across successive
  // pairs of vectors in va and then vb reflects the order of the
  // corresponding 12-bit inputs
  // n.b. the results end up in elements 0..3 of va and of vb:
  // elements 0 and 1 hold the 1st and 2nd ints of the pairs taken
  // from the low 8 byte lanes, elements 2 and 3 those of the pairs
  // taken from the high 8 byte lanes
  __ addv(va[0], __ T8H, va[0], va[2]);
  __ addv(va[2], __ T8H, va[1], va[3]);
  __ addv(va[1], __ T8H, va[4], va[6]);
  __ addv(va[3], __ T8H, va[5], va[7]);
  __ addv(vb[0], __ T8H, vb[0], vb[2]);
  __ addv(vb[2], __ T8H, vb[1], vb[3]);
  __ addv(vb[1], __ T8H, vb[4], vb[6]);
  __ addv(vb[3], __ T8H, vb[5], vb[7]);

  // store 64 results interleaved as shorts
  vs_st2_post(vs_front(va), __ T8H, parsed);
  vs_st2_post(vs_front(vb), __ T8H, parsed);

  // 64 shorts have been produced. Loop while any remain: a final
  // partial block is processed as a whole block, which is why the
  // allocation assumption stated in the header is needed.
  __ sub(parsedLength, parsedLength, 64);
  __ cmp(parsedLength, (u1)0);
  __ br(Assembler::GT, L_loop);

  __ leave(); // required for proper stackwalking of RuntimeStub frame
  __ mov(r0, zr); // return 0
  __ ret(lr);

  // bind label and generate constant data used by this stub
  __ BIND(L_F00);
  __ emit_int64(0x0f000f000f000f00);
  __ emit_int64(0x0f000f000f000f00);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
6779
6780 // Kyber Barrett reduce function.
6781 // Implements
6782 // static int implKyberBarrettReduce(short[] coeffs) {}
6783 //
6784 // coeffs (short[256]) = c_rarg0
6785 address generate_kyberBarrettReduce() {
6786 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6787 int entry_count = StubInfo::entry_count(stub_id);
6788 assert(entry_count == 1, "sanity check");
6789 address start = load_archive_data(stub_id);
6790 if (start != nullptr) {
6791 return start;
6792 }
6793 __ align(CodeEntryAlignment);
6794 StubCodeMark mark(this, stub_id);
6795 start = __ pc();
6796 __ enter();
6797
6798 const Register coeffs = c_rarg0;
6799
6800 const Register kyberConsts = r10;
6801 const Register result = r11;
6802
6803 // As above we process 256 sets of values in total i.e. 32 x
6804 // 8H quadwords. So, we can load, add and store the data in 3
6805 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6806 // of 10 or 11 registers. A further constraint is that the
6807 // mapping needs to skip callee saves. So, we allocate the
6808 // register sequences using two 8 sequences, two 2 sequences
6809 // and two single registers.
6810 VSeq<8> vs1_1(0);
6811 VSeq<2> vs1_2(16);
6812 FloatRegister vs1_3 = v28;
6813 VSeq<8> vs2_1(18);
6814 VSeq<2> vs2_2(26);
6815 FloatRegister vs2_3 = v29;
6816
6817 // we also need a pair of corresponding constant sequences
6818
6819 VSeq<8> vc1_1(30, 0);
6820 VSeq<2> vc1_2(30, 0);
6821 FloatRegister vc1_3 = v30; // for kyber_q
6822
6823 VSeq<8> vc2_1(31, 0);
6824 VSeq<2> vc2_2(31, 0);
6825 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
6826
6827 __ add(result, coeffs, 0);
6828 __ lea(kyberConsts,
6829 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6830
6831 // load q and the multiplier for the Barrett reduction
6832 __ add(kyberConsts, kyberConsts, 16);
6833 __ ldpq(vc1_3, vc2_3, kyberConsts);
6834
6835 for (int i = 0; i < 3; i++) {
6836 // load 80 or 88 coefficients
6837 vs_ldpq_post(vs1_1, coeffs);
6838 vs_ldpq_post(vs1_2, coeffs);
6839 if (i < 2) {
6840 __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
6841 }
6842
6843 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
6844 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
6845 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
6846 if (i < 2) {
6847 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
6848 }
6849
6850 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
6851 vs_sshr(vs2_1, __ T8H, vs2_1, 11);
6852 vs_sshr(vs2_2, __ T8H, vs2_2, 11);
6853 if (i < 2) {
6854 __ sshr(vs2_3, __ T8H, vs2_3, 11);
6855 }
6856
6857 // vs1 <- vs1 - vs2 * kyber_q
6858 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
6859 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
6860 if (i < 2) {
6861 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
6862 }
6863
6864 vs_stpq_post(vs1_1, result);
6865 vs_stpq_post(vs1_2, result);
6866 if (i < 2) {
6867 __ str(vs1_3, __ Q, __ post(result, 16));
6868 }
6869 }
6870
6871 __ leave(); // required for proper stackwalking of RuntimeStub frame
6872 __ mov(r0, zr); // return 0
6873 __ ret(lr);
6874
6875 // record the stub entry and end
6876 store_archive_data(stub_id, start, __ pc());
6877
6878 return start;
6879 }
6880
6881
6882 // Dilithium-specific montmul helper routines that generate parallel
6883 // code for, respectively, a single 4x4s vector sequence montmul or
6884 // two such multiplies in a row.
6885
6886 // Perform 16 32-bit Montgomery multiplications in parallel
6887 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
6888 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6889 // Use the helper routine to schedule a 4x4S Montgomery multiply.
6890 // It will assert that the register use is valid
6891 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
6892 }
6893
6894 // Perform 2x16 32-bit Montgomery multiplications in parallel
6895 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
6896 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6897 // Schedule two successive 4x4S multiplies via the montmul helper
6898 // on the front and back halves of va, vb and vc. The helper will
6899 // assert that the register use has no overlap conflicts on each
6900 // individual call but we also need to ensure that the necessary
6901 // disjoint/equality constraints are met across both calls.
6902
6903 // vb, vc, vtmp and vq must be disjoint. va must either be
6904 // disjoint from all other registers or equal vc
6905
6906 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6907 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6908 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6909
6910 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6911 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6912
6913 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6914
6915 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6916 assert(vs_disjoint(va, vb), "va and vb overlap");
6917 assert(vs_disjoint(va, vq), "va and vq overlap");
6918 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6919
6920 // We multiply the front and back halves of each sequence 4 at a
6921 // time because
6922 //
6923 // 1) we are currently only able to get 4-way instruction
6924 // parallelism at best
6925 //
6926 // 2) we need registers for the constants in vq and temporary
6927 // scratch registers to hold intermediate results so vtmp can only
6928 // be a VSeq<4> which means we only have 4 scratch slots.
6929
6930 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6931 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6932 }
6933
6934 // Perform combined montmul then add/sub on 4x4S vectors.
6935 void dilithium_montmul16_sub_add(
6936 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6937 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6938 // compute a = montmul(a1, c)
6939 dilithium_montmul16(vc, va1, vc, vtmp, vq);
6940 // ouptut a1 = a0 - a
6941 vs_subv(va1, __ T4S, va0, vc);
6942 // and a0 = a0 + a
6943 vs_addv(va0, __ T4S, va0, vc);
6944 }
6945
// Perform combined add/sub then montmul on 4x4S vectors.
6947 void dilithium_sub_add_montmul16(
6948 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6949 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6950 // compute c = a0 - a1
6951 vs_subv(vtmp1, __ T4S, va0, va1);
6952 // output a0 = a0 + a1
6953 vs_addv(va0, __ T4S, va0, va1);
6954 // output a1 = b montmul c
6955 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6956 }
6957
6958 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6959 // in the Java implementation come in sequences of at least 8, so we
6960 // can use ldpq to collect the corresponding data into pairs of vector
6961 // registers.
6962 // We collect the coefficients corresponding to the 'j+l' indexes into
6963 // the vector registers v0-v7, the zetas into the vector registers v16-v23
6964 // then we do the (Montgomery) multiplications by the zetas in parallel
6965 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6966 // v0-v7, then do the additions into v24-v31 and the subtractions into
6967 // v0-v7 and finally save the results back to the coeffs array.
6968 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6969 const Register coeffs, const Register zetas) {
6970 int c1 = 0;
6971 int c2 = 512;
6972 int startIncr;
6973 // don't use callee save registers v8 - v15
6974 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6975 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6976 VSeq<2> vq(30); // n.b. constants overlap vs3
6977 int offsets[4] = { 0, 32, 64, 96 };
6978
6979 for (int level = 0; level < 5; level++) {
6980 int c1Start = c1;
6981 int c2Start = c2;
6982 if (level == 3) {
6983 offsets[1] = 32;
6984 offsets[2] = 128;
6985 offsets[3] = 160;
6986 } else if (level == 4) {
6987 offsets[1] = 64;
6988 offsets[2] = 128;
6989 offsets[3] = 192;
6990 }
6991
6992 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
6993 // time at 4 different offsets and multiply them in order by the
6994 // next set of input values. So we employ indexed load and store
6995 // pair instructions with arrangement 4S.
6996 for (int i = 0; i < 4; i++) {
6997 // reload q and qinv
6998 vs_ldpq(vq, dilithiumConsts); // qInv, q
6999 // load 8x4S coefficients via second start pos == c2
7000 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
7001 // load next 8x4S inputs == b
7002 vs_ldpq_post(vs2, zetas);
7003 // compute a == c2 * b mod MONT_Q
7004 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7005 // load 8x4s coefficients via first start pos == c1
7006 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
7007 // compute a1 = c1 + a
7008 vs_addv(vs3, __ T4S, vs1, vs2);
7009 // compute a2 = c1 - a
7010 vs_subv(vs1, __ T4S, vs1, vs2);
7011 // output a1 and a2
7012 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
7013 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
7014
7015 int k = 4 * level + i;
7016
7017 if (k > 7) {
7018 startIncr = 256;
7019 } else if (k == 5) {
7020 startIncr = 384;
7021 } else {
7022 startIncr = 128;
7023 }
7024
7025 c1Start += startIncr;
7026 c2Start += startIncr;
7027 }
7028
7029 c2 /= 2;
7030 }
7031 }
7032
7033 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
// Implements the method
// static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
// of the sun.security.provider.ML_DSA class.
7037 //
7038 // coeffs (int[256]) = c_rarg0
7039 // zetas (int[256]) = c_rarg1
7040 address generate_dilithiumAlmostNtt() {
7041 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
7042 int entry_count = StubInfo::entry_count(stub_id);
7043 assert(entry_count == 1, "sanity check");
7044 address start = load_archive_data(stub_id);
7045 if (start != nullptr) {
7046 return start;
7047 }
7048 __ align(CodeEntryAlignment);
7049 StubCodeMark mark(this, stub_id);
7050 start = __ pc();
7051 __ enter();
7052
7053 const Register coeffs = c_rarg0;
7054 const Register zetas = c_rarg1;
7055
7056 const Register tmpAddr = r9;
7057 const Register dilithiumConsts = r10;
7058 const Register result = r11;
7059 // don't use callee save registers v8 - v15
7060 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7061 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7062 VSeq<2> vq(30); // n.b. constants overlap vs3
7063 int offsets[4] = { 0, 32, 64, 96};
7064 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
7065 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
7066 __ add(result, coeffs, 0);
7067 __ lea(dilithiumConsts,
7068 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7069
7070 // Each level represents one iteration of the outer for loop of the Java version.
7071
7072 // level 0-4
7073 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
7074
7075 // level 5
7076
7077 // At level 5 the coefficients we need to combine with the zetas
7078 // are grouped in memory in blocks of size 4. So, for both sets of
7079 // coefficients we load 4 adjacent values at 8 different offsets
7080 // using an indexed ldr with register variant Q and multiply them
7081 // in sequence order by the next set of inputs. Likewise we store
7082 // the resuls using an indexed str with register variant Q.
7083 for (int i = 0; i < 1024; i += 256) {
7084 // reload constants q, qinv each iteration as they get clobbered later
7085 vs_ldpq(vq, dilithiumConsts); // qInv, q
7086 // load 32 (8x4S) coefficients via first offsets = c1
7087 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
7088 // load next 32 (8x4S) inputs = b
7089 vs_ldpq_post(vs2, zetas);
7090 // a = b montul c1
7091 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7092 // load 32 (8x4S) coefficients via second offsets = c2
7093 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
7094 // add/sub with result of multiply
7095 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = a - c2
7096 vs_subv(vs1, __ T4S, vs1, vs2); // a0 = a + c1
7097 // write back new coefficients using same offsets
7098 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
7099 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
7100 }
7101
7102 // level 6
7103 // At level 6 the coefficients we need to combine with the zetas
7104 // are grouped in memory in pairs, the first two being montmul
7105 // inputs and the second add/sub inputs. We can still implement
7106 // the montmul+sub+add using 4-way parallelism but only if we
7107 // combine the coefficients with the zetas 16 at a time. We load 8
7108 // adjacent values at 4 different offsets using an ld2 load with
7109 // arrangement 2D. That interleaves the lower and upper halves of
7110 // each pair of quadwords into successive vector registers. We
7111 // then need to montmul the 4 even elements of the coefficients
7112 // register sequence by the zetas in order and then add/sub the 4
7113 // odd elements of the coefficients register sequence. We use an
7114 // equivalent st2 operation to store the results back into memory
7115 // de-interleaved.
7116 for (int i = 0; i < 1024; i += 128) {
7117 // reload constants q, qinv each iteration as they get clobbered later
7118 vs_ldpq(vq, dilithiumConsts); // qInv, q
7119 // load interleaved 16 (4x2D) coefficients via offsets
7120 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7121 // load next 16 (4x4S) inputs
7122 vs_ldpq_post(vs_front(vs2), zetas);
7123 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
7124 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
7125 vs_front(vs2), vtmp, vq);
7126 // store interleaved 16 (4x2D) coefficients via offsets
7127 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7128 }
7129
7130 // level 7
7131 // At level 7 the coefficients we need to combine with the zetas
7132 // occur singly with montmul inputs alterating with add/sub
7133 // inputs. Once again we can use 4-way parallelism to combine 16
7134 // zetas at a time. However, we have to load 8 adjacent values at
7135 // 4 different offsets using an ld2 load with arrangement 4S. That
7136 // interleaves the the odd words of each pair into one
7137 // coefficients vector register and the even words of the pair
7138 // into the next register. We then need to montmul the 4 even
7139 // elements of the coefficients register sequence by the zetas in
7140 // order and then add/sub the 4 odd elements of the coefficients
7141 // register sequence. We use an equivalent st2 operation to store
7142 // the results back into memory de-interleaved.
7143
7144 for (int i = 0; i < 1024; i += 128) {
7145 // reload constants q, qinv each iteration as they get clobbered later
7146 vs_ldpq(vq, dilithiumConsts); // qInv, q
7147 // load interleaved 16 (4x4S) coefficients via offsets
7148 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7149 // load next 16 (4x4S) inputs
7150 vs_ldpq_post(vs_front(vs2), zetas);
7151 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
7152 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
7153 vs_front(vs2), vtmp, vq);
7154 // store interleaved 16 (4x4S) coefficients via offsets
7155 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7156 }
7157 __ leave(); // required for proper stackwalking of RuntimeStub frame
7158 __ mov(r0, zr); // return 0
7159 __ ret(lr);
7160
7161 // record the stub entry and end
7162 store_archive_data(stub_id, start, __ pc());
7163
7164 return start;
7165 }
7166
7167 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
7168 // in the Java implementation come in sequences of at least 8, so we
7169 // can use ldpq to collect the corresponding data into pairs of vector
7170 // registers
// We collect the coefficients that correspond to the 'j's into vs1,
// the coefficients that correspond to the 'j+l's into vs2, then
// do the additions into vs3 and the subtractions into vs1, then
// save the result of the additions, load the zetas into vs2,
// do the (Montgomery) multiplications by zeta in parallel into vs2
// and finally save the results back to the coeffs array.
7177 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
7178 const Register coeffs, const Register zetas) {
7179 int c1 = 0;
7180 int c2 = 32;
7181 int startIncr;
7182 int offsets[4];
7183 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7184 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7185 VSeq<2> vq(30); // n.b. constants overlap vs3
7186
7187 offsets[0] = 0;
7188
7189 for (int level = 3; level < 8; level++) {
7190 int c1Start = c1;
7191 int c2Start = c2;
7192 if (level == 3) {
7193 offsets[1] = 64;
7194 offsets[2] = 128;
7195 offsets[3] = 192;
7196 } else if (level == 4) {
7197 offsets[1] = 32;
7198 offsets[2] = 128;
7199 offsets[3] = 160;
7200 } else {
7201 offsets[1] = 32;
7202 offsets[2] = 64;
7203 offsets[3] = 96;
7204 }
7205
7206 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
7207 // time at 4 different offsets and multiply them in order by the
7208 // next set of input values. So we employ indexed load and store
7209 // pair instructions with arrangement 4S.
7210 for (int i = 0; i < 4; i++) {
7211 // load v1 32 (8x4S) coefficients relative to first start index
7212 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
7213 // load v2 32 (8x4S) coefficients relative to second start index
7214 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
7215 // a0 = v1 + v2 -- n.b. clobbers vqs
7216 vs_addv(vs3, __ T4S, vs1, vs2);
7217 // a1 = v1 - v2
7218 vs_subv(vs1, __ T4S, vs1, vs2);
7219 // save a1 relative to first start index
7220 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
7221 // load constants q, qinv each iteration as they get clobbered above
7222 vs_ldpq(vq, dilithiumConsts); // qInv, q
7223 // load b next 32 (8x4S) inputs
7224 vs_ldpq_post(vs2, zetas);
7225 // a = a1 montmul b
7226 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7227 // save a relative to second start index
7228 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
7229
7230 int k = 4 * level + i;
7231
7232 if (k < 24) {
7233 startIncr = 256;
7234 } else if (k == 25) {
7235 startIncr = 384;
7236 } else {
7237 startIncr = 128;
7238 }
7239
7240 c1Start += startIncr;
7241 c2Start += startIncr;
7242 }
7243
7244 c2 *= 2;
7245 }
7246 }
7247
7248 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
7249 // Implements the method
7250 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
7251 // the sun.security.provider.ML_DSA class.
7252 //
7253 // coeffs (int[256]) = c_rarg0
7254 // zetas (int[256]) = c_rarg1
7255 address generate_dilithiumAlmostInverseNtt() {
7256 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
7257 int entry_count = StubInfo::entry_count(stub_id);
7258 assert(entry_count == 1, "sanity check");
7259 address start = load_archive_data(stub_id);
7260 if (start != nullptr) {
7261 return start;
7262 }
7263 __ align(CodeEntryAlignment);
7264 StubCodeMark mark(this, stub_id);
7265 start = __ pc();
7266 __ enter();
7267
7268 const Register coeffs = c_rarg0;
7269 const Register zetas = c_rarg1;
7270
7271 const Register tmpAddr = r9;
7272 const Register dilithiumConsts = r10;
7273 const Register result = r11;
7274 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7275 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7276 VSeq<2> vq(30); // n.b. constants overlap vs3
7277 int offsets[4] = { 0, 32, 64, 96 };
7278 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
7279 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
7280
7281 __ add(result, coeffs, 0);
7282 __ lea(dilithiumConsts,
7283 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7284
7285 // Each level represents one iteration of the outer for loop of the Java version
7286
7287 // level 0
7288 // At level 0 we need to interleave adjacent quartets of
7289 // coefficients before we multiply and add/sub by the next 16
7290 // zetas just as we did for level 7 in the multiply code. So we
7291 // load and store the values using an ld2/st2 with arrangement 4S.
7292 for (int i = 0; i < 1024; i += 128) {
7293 // load constants q, qinv
7294 // n.b. this can be moved out of the loop as they do not get
7295 // clobbered by first two loops
7296 vs_ldpq(vq, dilithiumConsts); // qInv, q
7297 // a0/a1 load interleaved 32 (8x4S) coefficients
7298 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7299 // b load next 32 (8x4S) inputs
7300 vs_ldpq_post(vs_front(vs2), zetas);
7301 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
7302 // n.b. second half of vs2 provides temporary register storage
7303 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
7304 vs_front(vs2), vs_back(vs2), vtmp, vq);
7305 // a0/a1 store interleaved 32 (8x4S) coefficients
7306 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7307 }
7308
7309 // level 1
7310 // At level 1 we need to interleave pairs of adjacent pairs of
7311 // coefficients before we multiply by the next 16 zetas just as we
7312 // did for level 6 in the multiply code. So we load and store the
7313 // values an ld2/st2 with arrangement 2D.
7314 for (int i = 0; i < 1024; i += 128) {
7315 // a0/a1 load interleaved 32 (8x2D) coefficients
7316 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7317 // b load next 16 (4x4S) inputs
7318 vs_ldpq_post(vs_front(vs2), zetas);
7319 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
7320 // n.b. second half of vs2 provides temporary register storage
7321 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
7322 vs_front(vs2), vs_back(vs2), vtmp, vq);
7323 // a0/a1 store interleaved 32 (8x2D) coefficients
7324 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7325 }
7326
7327 // level 2
7328 // At level 2 coefficients come in blocks of 4. So, we load 4
7329 // adjacent coefficients at 8 distinct offsets for both the first
7330 // and second coefficient sequences, using an ldr with register
7331 // variant Q then combine them with next set of 32 zetas. Likewise
7332 // we store the results using an str with register variant Q.
7333 for (int i = 0; i < 1024; i += 256) {
7334 // c0 load 32 (8x4S) coefficients via first offsets
7335 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
7336 // c1 load 32 (8x4S) coefficients via second offsets
7337 vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2);
7338 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
7339 vs_addv(vs3, __ T4S, vs1, vs2);
7340 // c = c0 - c1
7341 vs_subv(vs1, __ T4S, vs1, vs2);
7342 // store a0 32 (8x4S) coefficients via first offsets
7343 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
7344 // b load 32 (8x4S) next inputs
7345 vs_ldpq_post(vs2, zetas);
7346 // reload constants q, qinv -- they were clobbered earlier
7347 vs_ldpq(vq, dilithiumConsts); // qInv, q
7348 // compute a1 = b montmul c
7349 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7350 // store a1 32 (8x4S) coefficients via second offsets
7351 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
7352 }
7353
7354 // level 3-7
7355 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
7356
7357 __ leave(); // required for proper stackwalking of RuntimeStub frame
7358 __ mov(r0, zr); // return 0
7359 __ ret(lr);
7360
7361 // record the stub entry and end
7362 store_archive_data(stub_id, start, __ pc());
7363
7364 return start;
7365 }
7366
7367 // Dilithium multiply polynomials in the NTT domain.
// Straightforward implementation of the method
// static int implDilithiumNttMult(
//              int[] result, int[] ntta, int[] nttb) {} of
// the sun.security.provider.ML_DSA class.
7372 //
7373 // result (int[256]) = c_rarg0
7374 // poly1 (int[256]) = c_rarg1
7375 // poly2 (int[256]) = c_rarg2
7376 address generate_dilithiumNttMult() {
7377 StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
7378 int entry_count = StubInfo::entry_count(stub_id);
7379 assert(entry_count == 1, "sanity check");
7380 address start = load_archive_data(stub_id);
7381 if (start != nullptr) {
7382 return start;
7383 }
7384 __ align(CodeEntryAlignment);
7385 StubCodeMark mark(this, stub_id);
7386 start = __ pc();
7387 __ enter();
7388
7389 Label L_loop;
7390
7391 const Register result = c_rarg0;
7392 const Register poly1 = c_rarg1;
7393 const Register poly2 = c_rarg2;
7394
7395 const Register dilithiumConsts = r10;
7396 const Register len = r11;
7397
7398 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7399 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7400 VSeq<2> vq(30); // n.b. constants overlap vs3
7401 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
7402
7403 __ lea(dilithiumConsts,
7404 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7405
7406 // load constants q, qinv
7407 vs_ldpq(vq, dilithiumConsts); // qInv, q
7408 // load constant rSquare into v29
7409 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
7410
7411 __ mov(len, zr);
7412 __ add(len, len, 1024);
7413
7414 __ BIND(L_loop);
7415
7416 // b load 32 (8x4S) next inputs from poly1
7417 vs_ldpq_post(vs1, poly1);
7418 // c load 32 (8x4S) next inputs from poly2
7419 vs_ldpq_post(vs2, poly2);
7420 // compute a = b montmul c
7421 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7422 // compute a = rsquare montmul a
7423 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
7424 // save a 32 (8x4S) results
7425 vs_stpq_post(vs2, result);
7426
7427 __ sub(len, len, 128);
7428 __ cmp(len, (u1)128);
7429 __ br(Assembler::GE, L_loop);
7430
7431 __ leave(); // required for proper stackwalking of RuntimeStub frame
7432 __ mov(r0, zr); // return 0
7433 __ ret(lr);
7434
7435 // record the stub entry and end
7436 store_archive_data(stub_id, start, __ pc());
7437
7438 return start;
7439 }
7440
// Dilithium Montgomery multiply an array by a constant.
// A straightforward implementation of the method
// static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
// of the sun.security.provider.ML_DSA class
7445 //
7446 // coeffs (int[256]) = c_rarg0
7447 // constant (int) = c_rarg1
7448 address generate_dilithiumMontMulByConstant() {
7449 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
7450 int entry_count = StubInfo::entry_count(stub_id);
7451 assert(entry_count == 1, "sanity check");
7452 address start = load_archive_data(stub_id);
7453 if (start != nullptr) {
7454 return start;
7455 }
7456 __ align(CodeEntryAlignment);
7457 StubCodeMark mark(this, stub_id);
7458 start = __ pc();
7459 __ enter();
7460
7461 Label L_loop;
7462
7463 const Register coeffs = c_rarg0;
7464 const Register constant = c_rarg1;
7465
7466 const Register dilithiumConsts = r10;
7467 const Register result = r11;
7468 const Register len = r12;
7469
7470 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7471 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7472 VSeq<2> vq(30); // n.b. constants overlap vs3
7473 VSeq<8> vconst(29, 0); // for montmul by constant
7474
7475 // results track inputs
7476 __ add(result, coeffs, 0);
7477 __ lea(dilithiumConsts,
7478 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7479
7480 // load constants q, qinv -- they do not get clobbered by first two loops
7481 vs_ldpq(vq, dilithiumConsts); // qInv, q
7482 // copy caller supplied constant across vconst
7483 __ dup(vconst[0], __ T4S, constant);
7484 __ mov(len, zr);
7485 __ add(len, len, 1024);
7486
7487 __ BIND(L_loop);
7488
7489 // load next 32 inputs
7490 vs_ldpq_post(vs2, coeffs);
7491 // mont mul by constant
7492 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
7493 // write next 32 results
7494 vs_stpq_post(vs2, result);
7495
7496 __ sub(len, len, 128);
7497 __ cmp(len, (u1)128);
7498 __ br(Assembler::GE, L_loop);
7499
7500 __ leave(); // required for proper stackwalking of RuntimeStub frame
7501 __ mov(r0, zr); // return 0
7502 __ ret(lr);
7503
7504 // record the stub entry and end
7505 store_archive_data(stub_id, start, __ pc());
7506
7507 return start;
7508 }
7509
7510 // Dilithium decompose poly.
// Implements the method
// static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
//              int[] highPart, int twoGamma2, int multiplier) {}
// of the sun.security.provider.ML_DSA class
7514 //
7515 // input (int[256]) = c_rarg0
7516 // lowPart (int[256]) = c_rarg1
7517 // highPart (int[256]) = c_rarg2
7518 // twoGamma2 (int) = c_rarg3
7519 // multiplier (int) = c_rarg4
7520 address generate_dilithiumDecomposePoly() {
7521 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
7522 int entry_count = StubInfo::entry_count(stub_id);
7523 assert(entry_count == 1, "sanity check");
7524 address start = load_archive_data(stub_id);
7525 if (start != nullptr) {
7526 return start;
7527 }
7528 __ align(CodeEntryAlignment);
7529 StubCodeMark mark(this, stub_id);
7530 start = __ pc();
7531 Label L_loop;
7532
7533 const Register input = c_rarg0;
7534 const Register lowPart = c_rarg1;
7535 const Register highPart = c_rarg2;
7536 const Register twoGamma2 = c_rarg3;
7537 const Register multiplier = c_rarg4;
7538
7539 const Register len = r9;
7540 const Register dilithiumConsts = r10;
7541 const Register tmp = r11;
7542
7543 // 6 independent sets of 4x4s values
7544 VSeq<4> vs1(0), vs2(4), vs3(8);
7545 VSeq<4> vs4(12), vs5(16), vtmp(20);
7546
7547 // 7 constants for cross-multiplying
7548 VSeq<4> one(25, 0);
7549 VSeq<4> qminus1(26, 0);
7550 VSeq<4> g2(27, 0);
7551 VSeq<4> twog2(28, 0);
7552 VSeq<4> mult(29, 0);
7553 VSeq<4> q(30, 0);
7554 VSeq<4> qadd(31, 0);
7555
7556 __ enter();
7557
7558 __ lea(dilithiumConsts,
7559 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7560
7561 // save callee-saved registers
7562 __ stpd(v8, v9, __ pre(sp, -64));
7563 __ stpd(v10, v11, Address(sp, 16));
7564 __ stpd(v12, v13, Address(sp, 32));
7565 __ stpd(v14, v15, Address(sp, 48));
7566
7567 // populate constant registers
7568 __ mov(tmp, zr);
7569 __ add(tmp, tmp, 1);
7570 __ dup(one[0], __ T4S, tmp); // 1
7571 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
7572 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
7573 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
7574 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
7575 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
7576 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
7577
7578 __ mov(len, zr);
7579 __ add(len, len, 1024);
7580
7581 __ BIND(L_loop);
7582
7583 // load next 4x4S inputs interleaved: rplus --> vs1
7584 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
7585
7586 // rplus = rplus - ((rplus + qadd) >> 23) * q
7587 vs_addv(vtmp, __ T4S, vs1, qadd);
7588 vs_sshr(vtmp, __ T4S, vtmp, 23);
7589 vs_mulv(vtmp, __ T4S, vtmp, q);
7590 vs_subv(vs1, __ T4S, vs1, vtmp);
7591
7592 // rplus = rplus + ((rplus >> 31) & dilithium_q);
7593 vs_sshr(vtmp, __ T4S, vs1, 31);
7594 vs_andr(vtmp, vtmp, q);
7595 vs_addv(vs1, __ T4S, vs1, vtmp);
7596
7597 // quotient --> vs2
7598 // int quotient = (rplus * multiplier) >> 22;
7599 vs_mulv(vtmp, __ T4S, vs1, mult);
7600 vs_sshr(vs2, __ T4S, vtmp, 22);
7601
7602 // r0 --> vs3
7603 // int r0 = rplus - quotient * twoGamma2;
7604 vs_mulv(vtmp, __ T4S, vs2, twog2);
7605 vs_subv(vs3, __ T4S, vs1, vtmp);
7606
7607 // mask --> vs4
7608 // int mask = (twoGamma2 - r0) >> 22;
7609 vs_subv(vtmp, __ T4S, twog2, vs3);
7610 vs_sshr(vs4, __ T4S, vtmp, 22);
7611
7612 // r0 -= (mask & twoGamma2);
7613 vs_andr(vtmp, vs4, twog2);
7614 vs_subv(vs3, __ T4S, vs3, vtmp);
7615
7616 // quotient += (mask & 1);
7617 vs_andr(vtmp, vs4, one);
7618 vs_addv(vs2, __ T4S, vs2, vtmp);
7619
7620 // mask = (twoGamma2 / 2 - r0) >> 31;
7621 vs_subv(vtmp, __ T4S, g2, vs3);
7622 vs_sshr(vs4, __ T4S, vtmp, 31);
7623
7624 // r0 -= (mask & twoGamma2);
7625 vs_andr(vtmp, vs4, twog2);
7626 vs_subv(vs3, __ T4S, vs3, vtmp);
7627
7628 // quotient += (mask & 1);
7629 vs_andr(vtmp, vs4, one);
7630 vs_addv(vs2, __ T4S, vs2, vtmp);
7631
7632 // r1 --> vs5
7633 // int r1 = rplus - r0 - (dilithium_q - 1);
7634 vs_subv(vtmp, __ T4S, vs1, vs3);
7635 vs_subv(vs5, __ T4S, vtmp, qminus1);
7636
7637 // r1 --> vs1 (overwriting rplus)
7638 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
7639 vs_negr(vtmp, __ T4S, vs5);
7640 vs_orr(vtmp, vs5, vtmp);
7641 vs_sshr(vs1, __ T4S, vtmp, 31);
7642
7643 // r0 += ~r1;
7644 vs_notr(vtmp, vs1);
7645 vs_addv(vs3, __ T4S, vs3, vtmp);
7646
7647 // r1 = r1 & quotient;
7648 vs_andr(vs1, vs2, vs1);
7649
7650 // store results inteleaved
7651 // lowPart[m] = r0;
7652 // highPart[m] = r1;
7653 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
7654 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
7655
7656 __ sub(len, len, 64);
7657 __ cmp(len, (u1)64);
7658 __ br(Assembler::GE, L_loop);
7659
7660 // restore callee-saved vector registers
7661 __ ldpd(v14, v15, Address(sp, 48));
7662 __ ldpd(v12, v13, Address(sp, 32));
7663 __ ldpd(v10, v11, Address(sp, 16));
7664 __ ldpd(v8, v9, __ post(sp, 64));
7665
7666 __ leave(); // required for proper stackwalking of RuntimeStub frame
7667 __ mov(r0, zr); // return 0
7668 __ ret(lr);
7669
7670 // record the stub entry and end
7671 store_archive_data(stub_id, start, __ pc());
7672
7673 return start;
7674 }
7675
// Emit code that updates five general purpose registers a0..a4 in
// place so that each a[i] becomes a[i] ^ (a[i+2] & ~a[i+1]), with the
// indices taken modulo 5. tmp0, tmp1 and tmp2 are clobbered.
//
// n.b. this reading assumes bic(d, n, m) emits the A64 BIC
// instruction, i.e. d = n & ~m.
//
// The order of the instructions matters: every bic has to read the
// original value of its inputs, so a register is only overwritten
// once no later bic still needs its old value.
void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
           Register tmp0, Register tmp1, Register tmp2) {
  __ bic(tmp0, a2, a1); // for a0
  __ bic(tmp1, a3, a2); // for a1
  __ bic(tmp2, a4, a3); // for a2
  // a2 has now been read by the two bics that need it, so it can be
  // updated, which also frees tmp2 for reuse
  __ eor(a2, a2, tmp2);
  __ bic(tmp2, a0, a4); // for a3
  __ eor(a3, a3, tmp2);
  __ bic(tmp2, a1, a0); // for a4
  // the updates of a0 and a1 were held back until here because the
  // two bics above still read their original values
  __ eor(a0, a0, tmp0);
  __ eor(a1, a1, tmp1);
  __ eor(a4, a4, tmp2);
}
7689
// Emits one Keccak round over the 25 lanes held in registers a0..a24.
//
// rc          - register holding the address of the current round constant;
//               it is post-incremented by 8 once the constant has been loaded.
// tmp0..tmp2  - scratch registers, clobbered.
// can_use_fp,
// can_use_r18 - when both are true, rfp and r18_tls are used as two further
//               scratch registers (and are clobbered). Otherwise a4 and a9 are
//               borrowed: their values are pushed with a 16-byte pre-decrement
//               of sp and popped again before they are updated.
//
// NOTE(review): eor3 and rax1 are invoked on general-purpose registers here;
// going by the inline comments they act as a three-way XOR and as
// "n ^ rol(m, 1)" respectively -- confirm against the MacroAssembler.
7690 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
7691 Register a0, Register a1, Register a2, Register a3, Register a4,
7692 Register a5, Register a6, Register a7, Register a8, Register a9,
7693 Register a10, Register a11, Register a12, Register a13, Register a14,
7694 Register a15, Register a16, Register a17, Register a18, Register a19,
7695 Register a20, Register a21, Register a22, Register a23, Register a24,
7696 Register tmp0, Register tmp1, Register tmp2) {
     // Column parities c0..c4 and the derived values d0..d4; each d<k> is
     // XORed into the five lanes a<k>, a<k+5>, ..., a<k+20>.
7697 __ eor3(tmp1, a4, a9, a14);
7698 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
7699 __ eor3(tmp2, a1, a6, a11);
7700 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
7701 __ rax1(tmp2, tmp0, tmp1); // d0
7702 {
7703
7704 Register tmp3, tmp4;
7705 if (can_use_fp && can_use_r18) {
7706 tmp3 = rfp;
7707 tmp4 = r18_tls;
7708 } else {
     // a4 and a9 have already been folded into c4 above, so they can serve
     // as scratch until d4 is applied; keep their lane values on the stack.
7709 tmp3 = a4;
7710 tmp4 = a9;
7711 __ stp(tmp3, tmp4, __ pre(sp, -16));
7712 }
7713
7714 __ eor3(tmp3, a0, a5, a10);
7715 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
7716 __ eor(a0, a0, tmp2);
7717 __ eor(a5, a5, tmp2);
7718 __ eor(a10, a10, tmp2);
7719 __ eor(a15, a15, tmp2);
7720 __ eor(a20, a20, tmp2); // d0(tmp2)
7721 __ eor3(tmp3, a2, a7, a12);
7722 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
7723 __ rax1(tmp3, tmp4, tmp2); // d1
7724 __ eor(a1, a1, tmp3);
7725 __ eor(a6, a6, tmp3);
7726 __ eor(a11, a11, tmp3);
7727 __ eor(a16, a16, tmp3);
7728 __ eor(a21, a21, tmp3); // d1(tmp3)
7729 __ rax1(tmp3, tmp2, tmp0); // d3
7730 __ eor3(tmp2, a3, a8, a13);
7731 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
7732 __ eor(a3, a3, tmp3);
7733 __ eor(a8, a8, tmp3);
7734 __ eor(a13, a13, tmp3);
7735 __ eor(a18, a18, tmp3);
7736 __ eor(a23, a23, tmp3); // d3(tmp3)
7737 __ rax1(tmp2, tmp1, tmp0); // d2
7738 __ eor(a2, a2, tmp2);
7739 __ eor(a7, a7, tmp2);
7740 __ eor(a12, a12, tmp2);
7741 __ rax1(tmp0, tmp0, tmp4); // d4
     // tmp3/tmp4 are dead from here on; bring back the borrowed a4/a9 lane
     // values before d4 is XORed into them below.
7742 if (!can_use_fp || !can_use_r18) {
7743 __ ldp(tmp3, tmp4, __ post(sp, 16));
7744 }
7745 __ eor(a17, a17, tmp2);
7746 __ eor(a22, a22, tmp2); // d2(tmp2)
7747 __ eor(a4, a4, tmp0);
7748 __ eor(a9, a9, tmp0);
7749 __ eor(a14, a14, tmp0);
7750 __ eor(a19, a19, tmp0);
7751 __ eor(a24, a24, tmp0); // d4(tmp0)
7752 }
7753
     // Rotate and permute the lanes. The sequence forms a chain: every
     // destination register was the source of the preceding rol, so no lane is
     // overwritten before it has been read. a10 is rotated into tmp0 first and
     // lands in a7 at the very end. a0 is not touched by this step.
7754 __ rol(tmp0, a10, 3);
7755 __ rol(a10, a1, 1);
7756 __ rol(a1, a6, 44);
7757 __ rol(a6, a9, 20);
7758 __ rol(a9, a22, 61);
7759 __ rol(a22, a14, 39);
7760 __ rol(a14, a20, 18);
7761 __ rol(a20, a2, 62);
7762 __ rol(a2, a12, 43);
7763 __ rol(a12, a13, 25);
7764 __ rol(a13, a19, 8) ;
7765 __ rol(a19, a23, 56);
7766 __ rol(a23, a15, 41);
7767 __ rol(a15, a4, 27);
7768 __ rol(a4, a24, 14);
7769 __ rol(a24, a21, 2);
7770 __ rol(a21, a8, 55);
7771 __ rol(a8, a16, 45);
7772 __ rol(a16, a5, 36);
7773 __ rol(a5, a3, 28);
7774 __ rol(a3, a18, 21);
7775 __ rol(a18, a17, 15);
7776 __ rol(a17, a11, 10);
7777 __ rol(a11, a7, 6);
7778 __ mov(a7, tmp0);
7779
     // Non-linear step, applied to each group of five consecutive lanes.
7780 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
7781 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
7782 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
7783 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
7784 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
7785
     // XOR the next 8-byte round constant into a0 and advance rc past it.
7786 __ ldr(tmp1, __ post(rc, 8));
7787 __ eor(a0, a0, tmp1);
7788
7789 }
7790
7791 // Arguments:
7792 //
7793 // Inputs:
7794 // c_rarg0 - byte[] source+offset
7795 // c_rarg1 - byte[] SHA.state
7796 // c_rarg2 - int block_size
7797 // c_rarg3 - int offset
7798 // c_rarg4 - int limit
7799 //
// Generates the SHA3 compress stub variant that keeps all 25 state lanes in
// general-purpose registers. stub_id selects the single-block
// (stubgen_sha3_implCompress_id) or the multi-block
// (stubgen_sha3_implCompressMB_id) flavour; offset and limit are only used by
// the multi-block flavour, which returns the updated offset in c_rarg0.
// Returns the stub entry address, taken from the archive when
// load_archive_data() supplies one.
7800 address generate_sha3_implCompress_gpr(StubId stub_id) {
7801 bool multi_block;
7802 switch (stub_id) {
7803 case StubId::stubgen_sha3_implCompress_id:
7804 multi_block = false;
7805 break;
7806 case StubId::stubgen_sha3_implCompressMB_id:
7807 multi_block = true;
7808 break;
7809 default:
7810 ShouldNotReachHere();
7811 }
7812 int entry_count = StubInfo::entry_count(stub_id);
7813 assert(entry_count == 1, "sanity check");
7814 address start = load_archive_data(stub_id);
7815 if (start != nullptr) {
7816 return start;
7817 }
7818 __ align(CodeEntryAlignment);
7819 StubCodeMark mark(this, stub_id);
7820 start = __ pc();
7821
7822 Register buf = c_rarg0;
7823 Register state = c_rarg1;
7824 Register block_size = c_rarg2;
7825 Register ofs = c_rarg3;
7826 Register limit = c_rarg4;
7827
7828 // use r3..r17, r19..r28 to keep a0..a24.
7829 // a0..a24 are respective locals from SHA3.java
7830 Register a0 = r25,
7831 a1 = r26,
7832 a2 = r27,
7833 a3 = r3,
7834 a4 = r4,
7835 a5 = r5,
7836 a6 = r6,
7837 a7 = r7,
7838 a8 = rscratch1, // r8
7839 a9 = rscratch2, // r9
7840 a10 = r10,
7841 a11 = r11,
7842 a12 = r12,
7843 a13 = r13,
7844 a14 = r14,
7845 a15 = r15,
7846 a16 = r16,
7847 a17 = r17,
7848 a18 = r28,
7849 a19 = r19,
7850 a20 = r20,
7851 a21 = r21,
7852 a22 = r22,
7853 a23 = r23,
7854 a24 = r24;
7855
     // The scratch registers alias the argument registers, so block_size, buf
     // and state get clobbered below and are reloaded from the stack when they
     // are needed again. a3 and a4 likewise occupy the registers of ofs and
     // limit, which is why those two are stored before the state is loaded.
7856 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
7857
7858 Label sha3_loop, rounds24_preloop, loop_body;
7859 Label sha3_512_or_sha3_384, shake128;
7860
7861 bool can_use_r18 = false;
7862 #ifndef R18_RESERVED
7863 can_use_r18 = true;
7864 #endif
7865 bool can_use_fp = !PreserveFramePointer;
7866
7867 __ enter();
7868
7869 // save almost all yet unsaved gpr registers on stack
     // Layout of the 128-byte area allocated by the pre-indexed store:
     //   sp+0    block_size
     //   sp+8    ofs, sp+12 limit (32 bits each, multi_block only)
     //   sp+16   buf (stored right before the rounds start)
     //   sp+32   r19..r28, five pairs
     //   sp+112  r18_tls and state when both r18 and rfp may be used,
     //           otherwise only state
7870 __ str(block_size, __ pre(sp, -128));
7871 if (multi_block) {
7872 __ stpw(ofs, limit, Address(sp, 8));
7873 }
7874 // 8 bytes at sp+16 will be used to keep buf
7875 __ stp(r19, r20, Address(sp, 32));
7876 __ stp(r21, r22, Address(sp, 48));
7877 __ stp(r23, r24, Address(sp, 64));
7878 __ stp(r25, r26, Address(sp, 80));
7879 __ stp(r27, r28, Address(sp, 96));
7880 if (can_use_r18 && can_use_fp) {
7881 __ stp(r18_tls, state, Address(sp, 112));
7882 } else {
7883 __ str(state, Address(sp, 112));
7884 }
7885
7886 // begin sha3 calculations: loading a0..a24 from state array
7887 __ ldp(a0, a1, state);
7888 __ ldp(a2, a3, Address(state, 16));
7889 __ ldp(a4, a5, Address(state, 32));
7890 __ ldp(a6, a7, Address(state, 48));
7891 __ ldp(a8, a9, Address(state, 64));
7892 __ ldp(a10, a11, Address(state, 80));
7893 __ ldp(a12, a13, Address(state, 96));
7894 __ ldp(a14, a15, Address(state, 112));
7895 __ ldp(a16, a17, Address(state, 128));
7896 __ ldp(a18, a19, Address(state, 144));
7897 __ ldp(a20, a21, Address(state, 160));
7898 __ ldp(a22, a23, Address(state, 176));
7899 __ ldr(a24, Address(state, 192));
7900
7901 __ BIND(sha3_loop);
7902
7903 // load input
     // XOR one input block into the leading lanes; the first 7 lanes (56 bytes)
     // are absorbed for every block size, the rest depends on block_size.
7904 __ ldp(tmp3, tmp2, __ post(buf, 16));
7905 __ eor(a0, a0, tmp3);
7906 __ eor(a1, a1, tmp2);
7907 __ ldp(tmp3, tmp2, __ post(buf, 16));
7908 __ eor(a2, a2, tmp3);
7909 __ eor(a3, a3, tmp2);
7910 __ ldp(tmp3, tmp2, __ post(buf, 16));
7911 __ eor(a4, a4, tmp3);
7912 __ eor(a5, a5, tmp2);
7913 __ ldr(tmp3, __ post(buf, 8));
7914 __ eor(a6, a6, tmp3);
7915
7916 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
     // (bit 7 is clear only for these two block sizes)
7917 __ tbz(block_size, 7, sha3_512_or_sha3_384);
7918
7919 __ ldp(tmp3, tmp2, __ post(buf, 16));
7920 __ eor(a7, a7, tmp3);
7921 __ eor(a8, a8, tmp2);
7922 __ ldp(tmp3, tmp2, __ post(buf, 16));
7923 __ eor(a9, a9, tmp3);
7924 __ eor(a10, a10, tmp2);
7925 __ ldp(tmp3, tmp2, __ post(buf, 16));
7926 __ eor(a11, a11, tmp3);
7927 __ eor(a12, a12, tmp2);
7928 __ ldp(tmp3, tmp2, __ post(buf, 16));
7929 __ eor(a13, a13, tmp3);
7930 __ eor(a14, a14, tmp2);
7931 __ ldp(tmp3, tmp2, __ post(buf, 16));
7932 __ eor(a15, a15, tmp3);
7933 __ eor(a16, a16, tmp2);
7934
7935 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
7936 __ andw(tmp2, block_size, 48);
7937 __ cbzw(tmp2, rounds24_preloop);
7938 __ tbnz(block_size, 5, shake128);
7939 // block_size == 144, bit5 == 0, SHA3-224
7940 __ ldr(tmp3, __ post(buf, 8));
7941 __ eor(a17, a17, tmp3);
7942 __ b(rounds24_preloop);
7943
7944 __ BIND(shake128);
7945 __ ldp(tmp3, tmp2, __ post(buf, 16));
7946 __ eor(a17, a17, tmp3);
7947 __ eor(a18, a18, tmp2);
7948 __ ldp(tmp3, tmp2, __ post(buf, 16));
7949 __ eor(a19, a19, tmp3);
7950 __ eor(a20, a20, tmp2);
7951 __ b(rounds24_preloop); // block_size == 168, SHAKE128
7952
7953 __ BIND(sha3_512_or_sha3_384);
7954 __ ldp(tmp3, tmp2, __ post(buf, 16));
7955 __ eor(a7, a7, tmp3);
7956 __ eor(a8, a8, tmp2);
7957 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
7958
7959 // SHA3-384
7960 __ ldp(tmp3, tmp2, __ post(buf, 16));
7961 __ eor(a9, a9, tmp3);
7962 __ eor(a10, a10, tmp2);
7963 __ ldp(tmp3, tmp2, __ post(buf, 16));
7964 __ eor(a11, a11, tmp3);
7965 __ eor(a12, a12, tmp2);
7966
7967 __ BIND(rounds24_preloop);
     // The round counter lives in FP registers (v0 counts 24 down to 0 in
     // steps of v1 == 1.0), leaving every general-purpose register to the state.
7968 __ fmovs(v0, 24.0); // float loop counter,
7969 __ fmovs(v1, 1.0); // exact representation
7970
     // buf doubles as tmp1 inside the rounds, so park the advanced pointer.
7971 __ str(buf, Address(sp, 16));
     // tmp3 walks the round constant table; keccak_round_gpr advances it.
7972 __ lea(tmp3, ExternalAddress((address) _sha3_round_consts));
7973
7974 __ BIND(loop_body);
7975 keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
7976 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
7977 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
7978 tmp0, tmp1, tmp2);
7979 __ fsubs(v0, v0, v1);
7980 __ fcmps(v0, 0.0);
7981 __ br(__ NE, loop_body);
7982
7983 if (multi_block) {
     // offset += block_size; keep absorbing blocks while offset <= limit
7984 __ ldrw(block_size, sp); // block_size
7985 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
7986 __ addw(tmp2, tmp2, block_size);
7987 __ cmpw(tmp2, tmp1);
7988 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
7989 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
7990 __ br(Assembler::LE, sha3_loop);
7991 __ movw(c_rarg0, tmp2); // return offset
7992 }
     // state was clobbered as tmp2; reload it (and r18_tls if it was used as
     // scratch) from the save area.
7993 if (can_use_fp && can_use_r18) {
7994 __ ldp(r18_tls, state, Address(sp, 112));
7995 } else {
7996 __ ldr(state, Address(sp, 112));
7997 }
7998 // save calculated sha3 state
7999 __ stp(a0, a1, Address(state));
8000 __ stp(a2, a3, Address(state, 16));
8001 __ stp(a4, a5, Address(state, 32));
8002 __ stp(a6, a7, Address(state, 48));
8003 __ stp(a8, a9, Address(state, 64));
8004 __ stp(a10, a11, Address(state, 80));
8005 __ stp(a12, a13, Address(state, 96));
8006 __ stp(a14, a15, Address(state, 112));
8007 __ stp(a16, a17, Address(state, 128));
8008 __ stp(a18, a19, Address(state, 144));
8009 __ stp(a20, a21, Address(state, 160));
8010 __ stp(a22, a23, Address(state, 176));
8011 __ str(a24, Address(state, 192));
8012
8013 // restore required registers from stack
8014 __ ldp(r19, r20, Address(sp, 32));
8015 __ ldp(r21, r22, Address(sp, 48));
8016 __ ldp(r23, r24, Address(sp, 64));
8017 __ ldp(r25, r26, Address(sp, 80));
8018 __ ldp(r27, r28, Address(sp, 96));
     // rfp was used as a scratch register by keccak_round_gpr in this
     // configuration, so rebuild it from sp and the 128-byte save area size.
8019 if (can_use_fp && can_use_r18) {
8020 __ add(rfp, sp, 128); // leave() will copy rfp to sp below
8021 } // else no need to recalculate rfp, since it wasn't changed
8022
8023 __ leave();
8024
8025 __ ret(lr);
8026
8027 // record the stub entry and end
8028 store_archive_data(stub_id, start, __ pc());
8029
8030 return start;
8031 }
8032
8033 /**
 * Generates the stub for stubgen_updateBytesCRC32_id: a frame set up with
 * enter()/leave() around a single kernel_crc32() expansion. Returns the
 * stub entry address, taken from the archive when load_archive_data()
 * supplies one.
 *
8034 * Arguments:
8035 *
8036 * Inputs:
8037 * c_rarg0 - int crc
8038 * c_rarg1 - byte* buf
8039 * c_rarg2 - int length
8040 *
8041 * Output:
8042 * r0 - int crc result
8043 */
8044 address generate_updateBytesCRC32() {
8045 assert(UseCRC32Intrinsics, "what are we doing here?");
8046 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
8047 int entry_count = StubInfo::entry_count(stub_id);
8048 assert(entry_count == 1, "sanity check");
8049 address start = load_archive_data(stub_id);
8050 if (start != nullptr) {
8051 return start;
8052 }
8053 __ align(CodeEntryAlignment);
8054 StubCodeMark mark(this, stub_id);
8055
8056 start = __ pc();
8057
     // c_rarg3..c_rarg7 are not stub inputs; they are handed to kernel_crc32
     // together with rscratch1/rscratch2 as its table and temp registers.
8058 const Register crc = c_rarg0; // crc
8059 const Register buf = c_rarg1; // source java byte array address
8060 const Register len = c_rarg2; // length
8061 const Register table0 = c_rarg3; // crc_table address
8062 const Register table1 = c_rarg4;
8063 const Register table2 = c_rarg5;
8064 const Register table3 = c_rarg6;
8065 const Register tmp3 = c_rarg7;
8066
8067 BLOCK_COMMENT("Entry:");
8068 __ enter(); // required for proper stackwalking of RuntimeStub frame
8069
8070 __ kernel_crc32(crc, buf, len,
8071 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
8072
8073 __ leave(); // required for proper stackwalking of RuntimeStub frame
8074 __ ret(lr);
8075
8076 // record the stub entry and end
8077 store_archive_data(stub_id, start, __ pc());
8078
8079 return start;
8080 }
8081
8082 /**
 * Generates the stub for stubgen_updateBytesCRC32C_id: a frame set up with
 * enter()/leave() around a single kernel_crc32c() expansion. Returns the
 * stub entry address, taken from the archive when load_archive_data()
 * supplies one.
 *
8083 * Arguments:
8084 *
8085 * Inputs:
8086 * c_rarg0 - int crc
8087 * c_rarg1 - byte* buf
8088 * c_rarg2 - int length
8089 * c_rarg3 - int* table
8090 *
8091 * Output:
8092 * r0 - int crc result
8093 */
8094 address generate_updateBytesCRC32C() {
8095 assert(UseCRC32CIntrinsics, "what are we doing here?");
8096 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
8097 int entry_count = StubInfo::entry_count(stub_id);
8098 assert(entry_count == 1, "sanity check");
8099 address start = load_archive_data(stub_id);
8100 if (start != nullptr) {
8101 return start;
8102 }
8103 __ align(CodeEntryAlignment);
8104 StubCodeMark mark(this, stub_id);
8105
8106 start = __ pc();
8107
     // Same register assignment as generate_updateBytesCRC32; c_rarg3..c_rarg7
     // and rscratch1/rscratch2 are passed on to kernel_crc32c.
8108 const Register crc = c_rarg0; // crc
8109 const Register buf = c_rarg1; // source java byte array address
8110 const Register len = c_rarg2; // length
8111 const Register table0 = c_rarg3; // crc_table address
8112 const Register table1 = c_rarg4;
8113 const Register table2 = c_rarg5;
8114 const Register table3 = c_rarg6;
8115 const Register tmp3 = c_rarg7;
8116
8117 BLOCK_COMMENT("Entry:");
8118 __ enter(); // required for proper stackwalking of RuntimeStub frame
8119
8120 __ kernel_crc32c(crc, buf, len,
8121 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
8122
8123 __ leave(); // required for proper stackwalking of RuntimeStub frame
8124 __ ret(lr);
8125
8126 // record the stub entry and end
8127 store_archive_data(stub_id, start, __ pc());
8128
8129 return start;
8130 }
8131
8132 /***
 * Generates the stub for stubgen_updateBytesAdler32_id and returns its entry
 * address (taken from the archive when load_archive_data() supplies one).
 * The stub handles inputs shorter than 16 bytes with a byte loop, and longer
 * inputs in chunks of NMAX bytes processed 16 bytes at a time, reducing
 * s1 and s2 modulo BASE after each chunk.
 *
8133 * Arguments:
8134 *
8135 * Inputs:
8136 * c_rarg0 - int adler
8137 * c_rarg1 - byte* buff
8138 * c_rarg2 - int len
8139 *
8140 * Output:
8141 * c_rarg0 - int adler result
8142 */
8143 address generate_updateBytesAdler32() {
8144 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
8145 int entry_count = StubInfo::entry_count(stub_id);
8146 assert(entry_count == 1, "sanity check");
8147 address start = load_archive_data(stub_id);
8148 if (start != nullptr) {
8149 return start;
8150 }
8151 __ align(CodeEntryAlignment);
8152 StubCodeMark mark(this, stub_id);
8153 start = __ pc();
8154
8155 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
8156
8157 // Aliases
     // adler and s1 share c_rarg0, so the result is assembled in place.
8158 Register adler = c_rarg0;
8159 Register s1 = c_rarg0;
8160 Register s2 = c_rarg3;
8161 Register buff = c_rarg1;
8162 Register len = c_rarg2;
8163 Register nmax = r4;
8164 Register base = r5;
8165 Register count = r6;
8166 Register temp0 = rscratch1;
8167 Register temp1 = rscratch2;
8168 FloatRegister vbytes = v0;
8169 FloatRegister vs1acc = v1;
8170 FloatRegister vs2acc = v2;
8171 FloatRegister vtable = v3;
8172
8173 // Max number of bytes we can process before having to take the mod
8174 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
8175 uint64_t BASE = 0xfff1;
8176 uint64_t NMAX = 0x15B0;
8177
8178 __ mov(base, BASE);
8179 __ mov(nmax, NMAX);
8180
8181 // Load accumulation coefficients for the upper 16 bits
8182 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
8183 __ ld1(vtable, __ T16B, Address(temp0));
8184
8185 // s1 is initialized to the lower 16 bits of adler
8186 // s2 is initialized to the upper 16 bits of adler
8187 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
8188 __ uxth(s1, adler); // s1 = (adler & 0xffff)
8189
8190 // The pipelined loop needs at least 16 elements for 1 iteration
8191 // It does check this, but it is more effective to skip to the cleanup loop
8192 __ cmp(len, (u1)16);
8193 __ br(Assembler::HS, L_nmax);
8194 __ cbz(len, L_combine);
8195
     // 1..15 bytes: plain byte-at-a-time accumulation
8196 __ bind(L_simple_by1_loop);
8197 __ ldrb(temp0, Address(__ post(buff, 1)));
8198 __ add(s1, s1, temp0);
8199 __ add(s2, s2, s1);
8200 __ subs(len, len, 1);
8201 __ br(Assembler::HI, L_simple_by1_loop);
8202
8203 // s1 = s1 % BASE
     // (a single conditional subtract of BASE is emitted on this short path)
8204 __ subs(temp0, s1, base);
8205 __ csel(s1, temp0, s1, Assembler::HS);
8206
8207 // s2 = s2 % BASE
     // 2^16 mod BASE == 15, so s2 is congruent to 15 * (s2 >> 16) + (s2 & 0xffff);
     // (t << 4) - t forms the 15 * t term. One fold plus a conditional
     // subtract is emitted here.
8208 __ lsr(temp0, s2, 16);
8209 __ lsl(temp1, temp0, 4);
8210 __ sub(temp1, temp1, temp0);
8211 __ add(s2, temp1, s2, ext::uxth);
8212
8213 __ subs(temp0, s2, base);
8214 __ csel(s2, temp0, s2, Assembler::HS);
8215
8216 __ b(L_combine);
8217
     // len -= NMAX; a borrow means fewer than NMAX bytes remain, so skip the
     // full-chunk loop. count is preset to NMAX - 16 either way.
8218 __ bind(L_nmax);
8219 __ subs(len, len, nmax);
8220 __ sub(count, nmax, 16);
8221 __ br(Assembler::LO, L_by16);
8222
     // One full chunk of NMAX bytes, 16 bytes per iteration
8223 __ bind(L_nmax_loop);
8224
8225 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
8226 vbytes, vs1acc, vs2acc, vtable);
8227
8228 __ subs(count, count, 16);
8229 __ br(Assembler::HS, L_nmax_loop);
8230
8231 // s1 = s1 % BASE
     // Two folds of the form 15 * (x >> 16) + (x & 0xffff), then a conditional
     // subtract of BASE.
8232 __ lsr(temp0, s1, 16);
8233 __ lsl(temp1, temp0, 4);
8234 __ sub(temp1, temp1, temp0);
8235 __ add(temp1, temp1, s1, ext::uxth);
8236
8237 __ lsr(temp0, temp1, 16);
8238 __ lsl(s1, temp0, 4);
8239 __ sub(s1, s1, temp0);
8240 __ add(s1, s1, temp1, ext:: uxth);
8241
8242 __ subs(temp0, s1, base);
8243 __ csel(s1, temp0, s1, Assembler::HS);
8244
8245 // s2 = s2 % BASE
8246 __ lsr(temp0, s2, 16);
8247 __ lsl(temp1, temp0, 4);
8248 __ sub(temp1, temp1, temp0);
8249 __ add(temp1, temp1, s2, ext::uxth);
8250
8251 __ lsr(temp0, temp1, 16);
8252 __ lsl(s2, temp0, 4);
8253 __ sub(s2, s2, temp0);
8254 __ add(s2, s2, temp1, ext:: uxth);
8255
8256 __ subs(temp0, s2, base);
8257 __ csel(s2, temp0, s2, Assembler::HS);
8258
     // Another full chunk available? Re-arm count and loop.
8259 __ subs(len, len, nmax);
8260 __ sub(count, nmax, 16);
8261 __ br(Assembler::HS, L_nmax_loop);
8262
     // Here len holds (remaining - NMAX) and count holds NMAX - 16, so the
     // sum is (remaining - 16); no carry means fewer than 16 bytes are left.
8263 __ bind(L_by16);
8264 __ adds(len, len, count);
8265 __ br(Assembler::LO, L_by1);
8266
8267 __ bind(L_by16_loop);
8268
8269 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
8270 vbytes, vs1acc, vs2acc, vtable);
8271
8272 __ subs(len, len, 16);
8273 __ br(Assembler::HS, L_by16_loop);
8274
     // len is (remaining - 16) here; adding 15 gives (remaining - 1), and no
     // carry means there are no tail bytes at all.
8275 __ bind(L_by1);
8276 __ adds(len, len, 15);
8277 __ br(Assembler::LO, L_do_mod);
8278
8279 __ bind(L_by1_loop);
8280 __ ldrb(temp0, Address(__ post(buff, 1)));
8281 __ add(s1, temp0, s1);
8282 __ add(s2, s2, s1);
8283 __ subs(len, len, 1);
8284 __ br(Assembler::HS, L_by1_loop);
8285
8286 __ bind(L_do_mod);
8287 // s1 = s1 % BASE
8288 __ lsr(temp0, s1, 16);
8289 __ lsl(temp1, temp0, 4);
8290 __ sub(temp1, temp1, temp0);
8291 __ add(temp1, temp1, s1, ext::uxth);
8292
8293 __ lsr(temp0, temp1, 16);
8294 __ lsl(s1, temp0, 4);
8295 __ sub(s1, s1, temp0);
8296 __ add(s1, s1, temp1, ext:: uxth);
8297
8298 __ subs(temp0, s1, base);
8299 __ csel(s1, temp0, s1, Assembler::HS);
8300
8301 // s2 = s2 % BASE
8302 __ lsr(temp0, s2, 16);
8303 __ lsl(temp1, temp0, 4);
8304 __ sub(temp1, temp1, temp0);
8305 __ add(temp1, temp1, s2, ext::uxth);
8306
8307 __ lsr(temp0, temp1, 16);
8308 __ lsl(s2, temp0, 4);
8309 __ sub(s2, s2, temp0);
8310 __ add(s2, s2, temp1, ext:: uxth);
8311
8312 __ subs(temp0, s2, base);
8313 __ csel(s2, temp0, s2, Assembler::HS);
8314
8315 // Combine lower bits and higher bits
8316 __ bind(L_combine);
8317 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
8318
8319 __ ret(lr);
8320
8321 // record the stub entry and end
8322 store_archive_data(stub_id, start, __ pc());
8323
8324 return start;
8325 }
8326
// Emits the Adler-32 update of s1 and s2 for the next 16 input bytes.
//
// s1, s2         - running sums, updated in place (no modulo is taken here)
// buff           - input pointer, post-incremented by 16
// temp0, temp1   - general-purpose scratch, clobbered
// vbytes, vs1acc,
// vs2acc         - vector scratch, clobbered
// vtable         - per-byte weights; per the derivation below these are
//                  expected to be (16, 15, ..., 1) -- the table itself is
//                  loaded by the caller
8327 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
8328 Register temp0, Register temp1, FloatRegister vbytes,
8329 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
8330 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
8331 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
8332 // In non-vectorized code, we update s1 and s2 as:
8333 // s1 <- s1 + b1
8334 // s2 <- s2 + s1
8335 // s1 <- s1 + b2
8336 // s2 <- s2 + s1
8337 // ...
8338 // s1 <- s1 + b16
8339 // s2 <- s2 + s1
8340 // Putting above assignments together, we have:
8341 // s1_new = s1 + b1 + b2 + ... + b16
8342 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
8343 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
8344 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
8345 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
8346
8347 // s2 = s2 + s1 * 16
     // (must use the old s1, so this precedes the s1 update below)
8348 __ add(s2, s2, s1, Assembler::LSL, 4);
8349
8350 // vs1acc = b1 + b2 + b3 + ... + b16
8351 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
     // The widening multiply covers the low 8 bytes (T8B) and the
     // multiply-accumulate the high 8 bytes (T16B); the two uaddlv
     // instructions then sum across lanes.
8352 __ umullv(vs2acc, __ T8B, vtable, vbytes);
8353 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
8354 __ uaddlv(vs1acc, __ T16B, vbytes);
8355 __ uaddlv(vs2acc, __ T8H, vs2acc);
8356
8357 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
8358 __ fmovd(temp0, vs1acc);
8359 __ fmovd(temp1, vs2acc);
8360 __ add(s1, s1, temp0);
8361 __ add(s2, s2, temp1);
8362 }
8363
8364 /**
 * Generates the stub for stubgen_multiplyToLen_id: a frame set up with
 * enter()/leave() around a single multiply_to_len() expansion. Returns the
 * stub entry address, taken from the archive when load_archive_data()
 * supplies one.
 *
8365 * Arguments:
8366 *
8367 * Input:
8368 * c_rarg0 - x address
8369 * c_rarg1 - x length
8370 * c_rarg2 - y address
8371 * c_rarg3 - y length
8372 * c_rarg4 - z address
8373 */
8374 address generate_multiplyToLen() {
8375 StubId stub_id = StubId::stubgen_multiplyToLen_id;
8376 int entry_count = StubInfo::entry_count(stub_id);
8377 assert(entry_count == 1, "sanity check");
8378 address start = load_archive_data(stub_id);
8379 if (start != nullptr) {
8380 return start;
8381 }
8382 __ align(CodeEntryAlignment);
8383 StubCodeMark mark(this, stub_id);
8384
8385 start = __ pc();
     // The five arguments arrive in r0..r4
8386 const Register x = r0;
8387 const Register xlen = r1;
8388 const Register y = r2;
8389 const Register ylen = r3;
8390 const Register z = r4;
8391
     // Scratch registers handed to multiply_to_len
8392 const Register tmp0 = r5;
8393 const Register tmp1 = r10;
8394 const Register tmp2 = r11;
8395 const Register tmp3 = r12;
8396 const Register tmp4 = r13;
8397 const Register tmp5 = r14;
8398 const Register tmp6 = r15;
8399 const Register tmp7 = r16;
8400
8401 BLOCK_COMMENT("Entry:");
8402 __ enter(); // required for proper stackwalking of RuntimeStub frame
8403 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
8404 __ leave(); // required for proper stackwalking of RuntimeStub frame
8405 __ ret(lr);
8406
8407 // record the stub entry and end
8408 store_archive_data(stub_id, start, __ pc());
8409
8410 return start;
8411 }
8412
// Generates the stub for stubgen_squareToLen_id. The square is computed by
// reusing multiply_to_len() with y == x and ylen == xlen. Inputs are x in r0,
// xlen in r1 and z in r2. Returns the stub entry address, taken from the
// archive when load_archive_data() supplies one.
8413 address generate_squareToLen() {
8414 // squareToLen algorithm for sizes 1..127 described in java code works
8415 // faster than multiply_to_len on some CPUs and slower on others, but
8416 // multiply_to_len shows a bit better overall results
8417 StubId stub_id = StubId::stubgen_squareToLen_id;
8418 int entry_count = StubInfo::entry_count(stub_id);
8419 assert(entry_count == 1, "sanity check");
8420 address start = load_archive_data(stub_id);
8421 if (start != nullptr) {
8422 return start;
8423 }
8424 __ align(CodeEntryAlignment);
8425 StubCodeMark mark(this, stub_id);
8426 start = __ pc();
8427
8428 const Register x = r0;
8429 const Register xlen = r1;
8430 const Register z = r2;
8431 const Register y = r4; // == x
8432 const Register ylen = r5; // == xlen
8433
8434 const Register tmp0 = r3;
8435 const Register tmp1 = r10;
8436 const Register tmp2 = r11;
8437 const Register tmp3 = r12;
8438 const Register tmp4 = r13;
8439 const Register tmp5 = r14;
8440 const Register tmp6 = r15;
8441 const Register tmp7 = r16;
8442
     // r4 and r5 are overwritten with copies of x and xlen below, so their
     // previous contents are saved and restored around the multiply.
8443 RegSet spilled_regs = RegSet::of(y, ylen);
8444 BLOCK_COMMENT("Entry:");
8445 __ enter();
8446 __ push(spilled_regs, sp);
8447 __ mov(y, x);
8448 __ mov(ylen, xlen);
8449 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
8450 __ pop(spilled_regs, sp);
8451 __ leave();
8452 __ ret(lr);
8453
8454 // record the stub entry and end
8455 store_archive_data(stub_id, start, __ pc());
8456
8457 return start;
8458 }
8459
// Generates the stub for stubgen_mulAdd_id: a frame set up with
// enter()/leave() around a single mul_add() expansion. Inputs are taken from
// r0..r4 in the order out, in, offset, len, k. Returns the stub entry
// address, taken from the archive when load_archive_data() supplies one.
8460 address generate_mulAdd() {
8461 StubId stub_id = StubId::stubgen_mulAdd_id;
8462 int entry_count = StubInfo::entry_count(stub_id);
8463 assert(entry_count == 1, "sanity check");
8464 address start = load_archive_data(stub_id);
8465 if (start != nullptr) {
8466 return start;
8467 }
8468 __ align(CodeEntryAlignment);
8469 StubCodeMark mark(this, stub_id);
8470
8471 start = __ pc();
8472
8473 const Register out = r0;
8474 const Register in = r1;
8475 const Register offset = r2;
8476 const Register len = r3;
8477 const Register k = r4;
8478
8479 BLOCK_COMMENT("Entry:");
8480 __ enter();
8481 __ mul_add(out, in, offset, len, k);
8482 __ leave();
8483 __ ret(lr);
8484
8485 // record the stub entry and end
8486 store_archive_data(stub_id, start, __ pc());
8487
8488 return start;
8489 }
8490
8491 // Arguments:
8492 //
8493 // Input:
8494 // c_rarg0 - newArr address
8495 // c_rarg1 - oldArr address
8496 // c_rarg2 - newIdx
8497 // c_rarg3 - shiftCount
8498 // c_rarg4 - numIter
8499 //
// Generates the stub for stubgen_bigIntegerRightShiftWorker_id. For each of
// the numIter 32-bit words the emitted code stores
//   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount) | (oldArr[i] << (32 - shiftCount))
// working from the highest index downwards: four words per SIMD iteration,
// then two at a time, then a single word. Returns the stub entry address,
// taken from the archive when load_archive_data() supplies one.
8500 address generate_bigIntegerRightShift() {
8501 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
8502 int entry_count = StubInfo::entry_count(stub_id);
8503 assert(entry_count == 1, "sanity check");
8504 address start = load_archive_data(stub_id);
8505 if (start != nullptr) {
8506 return start;
8507 }
8508 __ align(CodeEntryAlignment);
8509 StubCodeMark mark(this, stub_id);
8510 start = __ pc();
8511
8512 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
8513
8514 Register newArr = c_rarg0;
8515 Register oldArr = c_rarg1;
8516 Register newIdx = c_rarg2;
8517 Register shiftCount = c_rarg3;
8518 Register numIter = c_rarg4;
     // idx aliases numIter: the count of words still to process doubles as
     // the index of the next (lower) group.
8519 Register idx = numIter;
8520
8521 Register newArrCur = rscratch1;
8522 Register shiftRevCount = rscratch2;
8523 Register oldArrCur = r13;
8524 Register oldArrNext = r14;
8525
8526 FloatRegister oldElem0 = v0;
8527 FloatRegister oldElem1 = v1;
8528 FloatRegister newElem = v2;
8529 FloatRegister shiftVCount = v3;
8530 FloatRegister shiftVRevCount = v4;
8531
8532 __ cbz(idx, Exit);
8533
     // newArr now points at element newIdx
8534 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8535
8536 // left shift count
8537 __ movw(shiftRevCount, 32);
8538 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8539
8540 // numIter too small to allow a 4-words SIMD loop, rolling back
8541 __ cmp(numIter, (u1)4);
8542 __ br(Assembler::LT, ShiftThree);
8543
     // USHL shifts right when the per-lane count is negative, hence the
     // negated copy of shiftCount; shiftVRevCount stays positive (left shift).
8544 __ dup(shiftVCount, __ T4S, shiftCount);
8545 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8546 __ negr(shiftVCount, __ T4S, shiftVCount);
8547
8548 __ BIND(ShiftSIMDLoop);
8549
8550 // Calculate the load addresses
     // oldArrNext = &oldArr[idx], oldArrCur = &oldArr[idx + 1]
8551 __ sub(idx, idx, 4);
8552 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
8553 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
8554 __ add(oldArrCur, oldArrNext, 4);
8555
8556 // Load 4 words and process
8557 __ ld1(oldElem0, __ T4S, Address(oldArrCur));
8558 __ ld1(oldElem1, __ T4S, Address(oldArrNext));
8559 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8560 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8561 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8562 __ st1(newElem, __ T4S, Address(newArrCur));
8563
8564 __ cmp(idx, (u1)4);
8565 __ br(Assembler::LT, ShiftTwoLoop);
8566 __ b(ShiftSIMDLoop);
8567
     // Fewer than four words left after the SIMD loop: two at a time, and a
     // final single word through ShiftOne when one remains.
8568 __ BIND(ShiftTwoLoop);
8569 __ cbz(idx, Exit);
8570 __ cmp(idx, (u1)1);
8571 __ br(Assembler::EQ, ShiftOne);
8572
8573 // Calculate the load addresses
8574 __ sub(idx, idx, 2);
8575 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
8576 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
8577 __ add(oldArrCur, oldArrNext, 4);
8578
8579 // Load 2 words and process
8580 __ ld1(oldElem0, __ T2S, Address(oldArrCur));
8581 __ ld1(oldElem1, __ T2S, Address(oldArrNext));
8582 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8583 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8584 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8585 __ st1(newElem, __ T2S, Address(newArrCur));
8586 __ b(ShiftTwoLoop);
8587
     // Scalar path for 1..3 words in total (numIter < 4 on entry): bit 1 clear
     // means one word, bit 0 clear means two, otherwise all three; each case
     // falls through into the next lower one.
8588 __ BIND(ShiftThree);
8589 __ tbz(idx, 1, ShiftOne);
8590 __ tbz(idx, 0, ShiftTwo);
8591 __ ldrw(r10, Address(oldArr, 12));
8592 __ ldrw(r11, Address(oldArr, 8));
8593 __ lsrvw(r10, r10, shiftCount);
8594 __ lslvw(r11, r11, shiftRevCount);
8595 __ orrw(r12, r10, r11);
8596 __ strw(r12, Address(newArr, 8));
8597
8598 __ BIND(ShiftTwo);
8599 __ ldrw(r10, Address(oldArr, 8));
8600 __ ldrw(r11, Address(oldArr, 4));
8601 __ lsrvw(r10, r10, shiftCount);
8602 __ lslvw(r11, r11, shiftRevCount);
8603 __ orrw(r12, r10, r11);
8604 __ strw(r12, Address(newArr, 4));
8605
8606 __ BIND(ShiftOne);
8607 __ ldrw(r10, Address(oldArr, 4));
8608 __ ldrw(r11, Address(oldArr));
8609 __ lsrvw(r10, r10, shiftCount);
8610 __ lslvw(r11, r11, shiftRevCount);
8611 __ orrw(r12, r10, r11);
8612 __ strw(r12, Address(newArr));
8613
8614 __ BIND(Exit);
8615 __ ret(lr);
8616
8617 // record the stub entry and end
8618 store_archive_data(stub_id, start, __ pc());
8619
8620 return start;
8621 }
8622
8623 // Arguments:
8624 //
8625 // Input:
8626 // c_rarg0 - newArr address
8627 // c_rarg1 - oldArr address
8628 // c_rarg2 - newIdx
8629 // c_rarg3 - shiftCount
8630 // c_rarg4 - numIter
8631 //
// Generates the stub for stubgen_bigIntegerLeftShiftWorker_id. For each of
// the numIter 32-bit words the emitted code stores
//   newArr[newIdx + i] = (oldArr[i] << shiftCount) | (oldArr[i + 1] >>> (32 - shiftCount))
// walking upwards with post-incremented pointers: four words per SIMD
// iteration, then two at a time, then a single word. Returns the stub entry
// address, taken from the archive when load_archive_data() supplies one.
8632 address generate_bigIntegerLeftShift() {
8633 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
8634 int entry_count = StubInfo::entry_count(stub_id);
8635 assert(entry_count == 1, "sanity check");
8636 address start = load_archive_data(stub_id);
8637 if (start != nullptr) {
8638 return start;
8639 }
8640 __ align(CodeEntryAlignment);
8641 StubCodeMark mark(this, stub_id);
8642 start = __ pc();
8643
8644 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
8645
8646 Register newArr = c_rarg0;
8647 Register oldArr = c_rarg1;
8648 Register newIdx = c_rarg2;
8649 Register shiftCount = c_rarg3;
8650 Register numIter = c_rarg4;
8651
8652 Register shiftRevCount = rscratch1;
8653 Register oldArrNext = rscratch2;
8654
8655 FloatRegister oldElem0 = v0;
8656 FloatRegister oldElem1 = v1;
8657 FloatRegister newElem = v2;
8658 FloatRegister shiftVCount = v3;
8659 FloatRegister shiftVRevCount = v4;
8660
8661 __ cbz(numIter, Exit);
8662
     // oldArrNext = &oldArr[1]; newArr now points at element newIdx
8663 __ add(oldArrNext, oldArr, 4);
8664 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8665
8666 // right shift count
8667 __ movw(shiftRevCount, 32);
8668 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8669
8670 // numIter too small to allow a 4-words SIMD loop, rolling back
8671 __ cmp(numIter, (u1)4);
8672 __ br(Assembler::LT, ShiftThree);
8673
     // USHL shifts right when the per-lane count is negative, hence the
     // negated copy of shiftRevCount; shiftVCount stays positive (left shift).
8674 __ dup(shiftVCount, __ T4S, shiftCount);
8675 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8676 __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
8677
8678 __ BIND(ShiftSIMDLoop);
8679
8680 // load 4 words and process
8681 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
8682 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
8683 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8684 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8685 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8686 __ st1(newElem, __ T4S, __ post(newArr, 16));
8687 __ sub(numIter, numIter, 4);
8688
8689 __ cmp(numIter, (u1)4);
8690 __ br(Assembler::LT, ShiftTwoLoop);
8691 __ b(ShiftSIMDLoop);
8692
     // Fewer than four words left after the SIMD loop: two at a time, and a
     // final single word through ShiftOne when one remains.
8693 __ BIND(ShiftTwoLoop);
8694 __ cbz(numIter, Exit);
8695 __ cmp(numIter, (u1)1);
8696 __ br(Assembler::EQ, ShiftOne);
8697
8698 // load 2 words and process
8699 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
8700 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
8701 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8702 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8703 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8704 __ st1(newElem, __ T2S, __ post(newArr, 8));
8705 __ sub(numIter, numIter, 2);
8706 __ b(ShiftTwoLoop);
8707
     // Scalar path for 1..3 words in total (numIter < 4 on entry). The first
     // word is always processed; numIter itself is not decremented, its low
     // two bits decide how many more follow: bit 1 clear -> done (1 word),
     // bit 0 clear -> one more (2 words), otherwise two more (3 words).
8708 __ BIND(ShiftThree);
8709 __ ldrw(r10, __ post(oldArr, 4));
8710 __ ldrw(r11, __ post(oldArrNext, 4));
8711 __ lslvw(r10, r10, shiftCount);
8712 __ lsrvw(r11, r11, shiftRevCount);
8713 __ orrw(r12, r10, r11);
8714 __ strw(r12, __ post(newArr, 4));
8715 __ tbz(numIter, 1, Exit);
8716 __ tbz(numIter, 0, ShiftOne);
8717
8718 __ BIND(ShiftTwo);
8719 __ ldrw(r10, __ post(oldArr, 4));
8720 __ ldrw(r11, __ post(oldArrNext, 4));
8721 __ lslvw(r10, r10, shiftCount);
8722 __ lsrvw(r11, r11, shiftRevCount);
8723 __ orrw(r12, r10, r11);
8724 __ strw(r12, __ post(newArr, 4));
8725
8726 __ BIND(ShiftOne);
8727 __ ldrw(r10, Address(oldArr));
8728 __ ldrw(r11, Address(oldArrNext));
8729 __ lslvw(r10, r10, shiftCount);
8730 __ lsrvw(r11, r11, shiftRevCount);
8731 __ orrw(r12, r10, r11);
8732 __ strw(r12, Address(newArr));
8733
8734 __ BIND(Exit);
8735 __ ret(lr);
8736
8737 // record the stub entry and end
8738 store_archive_data(stub_id, start, __ pc());
8739
8740 return start;
8741 }
8742
8743 address generate_count_positives(address &count_positives_long) {
8744 StubId stub_id = StubId::stubgen_count_positives_id;
8745 GrowableArray<address> entries;
8746 int entry_count = StubInfo::entry_count(stub_id);
8747 // We have an extra entry for count_positives_long.
8748 assert(entry_count == 2, "sanity check");
8749 address start = load_archive_data(stub_id, &entries);
8750 if (start != nullptr) {
8751 assert(entries.length() == 1,
8752 "unexpected extra entry count %d", entries.length());
8753 count_positives_long = entries.at(0);
8754 return start;
8755 }
8756 const u1 large_loop_size = 64;
8757 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
8758 int dcache_line = VM_Version::dcache_line_size();
8759
8760 Register ary1 = r1, len = r2, result = r0;
8761
8762 __ align(CodeEntryAlignment);
8763 StubCodeMark mark(this, stub_id);
8764
8765 address entry = __ pc();
8766
8767 __ enter();
8768 // precondition: a copy of len is already in result
8769 // __ mov(result, len);
8770
8771 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
8772 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
8773
8774 __ cmp(len, (u1)15);
8775 __ br(Assembler::GT, LEN_OVER_15);
8776 // The only case when execution falls into this code is when pointer is near
8777 // the end of memory page and we have to avoid reading next page
8778 __ add(ary1, ary1, len);
8779 __ subs(len, len, 8);
8780 __ br(Assembler::GT, LEN_OVER_8);
8781 __ ldr(rscratch2, Address(ary1, -8));
8782 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
8783 __ lsrv(rscratch2, rscratch2, rscratch1);
8784 __ tst(rscratch2, UPPER_BIT_MASK);
8785 __ csel(result, zr, result, Assembler::NE);
8786 __ leave();
8787 __ ret(lr);
8788 __ bind(LEN_OVER_8);
8789 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
8790 __ sub(len, len, 8); // no data dep., then sub can be executed while loading
8791 __ tst(rscratch2, UPPER_BIT_MASK);
8792 __ br(Assembler::NE, RET_NO_POP);
8793 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
8794 __ lsrv(rscratch1, rscratch1, rscratch2);
8795 __ tst(rscratch1, UPPER_BIT_MASK);
8796 __ bind(RET_NO_POP);
8797 __ csel(result, zr, result, Assembler::NE);
8798 __ leave();
8799 __ ret(lr);
8800
8801 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
8802 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
8803
8804 count_positives_long = __ pc(); // 2nd entry point
8805 entries.append(count_positives_long);
8806
8807 __ enter();
8808
8809 __ bind(LEN_OVER_15);
8810 __ push(spilled_regs, sp);
8811 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
8812 __ cbz(rscratch2, ALIGNED);
8813 __ ldp(tmp6, tmp1, Address(ary1));
8814 __ mov(tmp5, 16);
8815 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
8816 __ add(ary1, ary1, rscratch1);
8817 __ orr(tmp6, tmp6, tmp1);
8818 __ tst(tmp6, UPPER_BIT_MASK);
8819 __ br(Assembler::NE, RET_ADJUST);
8820 __ sub(len, len, rscratch1);
8821
8822 __ bind(ALIGNED);
8823 __ cmp(len, large_loop_size);
8824 __ br(Assembler::LT, CHECK_16);
8825 // Perform 16-byte load as early return in pre-loop to handle situation
8826 // when initially aligned large array has negative values at starting bytes,
8827 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
8828 // slower. Cases with negative bytes further ahead won't be affected that
8829 // much. In fact, it'll be faster due to early loads, less instructions and
8830 // less branches in LARGE_LOOP.
8831 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
8832 __ sub(len, len, 16);
8833 __ orr(tmp6, tmp6, tmp1);
8834 __ tst(tmp6, UPPER_BIT_MASK);
8835 __ br(Assembler::NE, RET_ADJUST_16);
8836 __ cmp(len, large_loop_size);
8837 __ br(Assembler::LT, CHECK_16);
8838
8839 if (SoftwarePrefetchHintDistance >= 0
8840 && SoftwarePrefetchHintDistance >= dcache_line) {
8841 // initial prefetch
8842 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
8843 }
8844 __ bind(LARGE_LOOP);
8845 if (SoftwarePrefetchHintDistance >= 0) {
8846 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
8847 }
8848 // Issue load instructions first, since it can save few CPU/MEM cycles, also
8849 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
8850 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
8851 // instructions per cycle and have less branches, but this approach disables
8852 // early return, thus, all 64 bytes are loaded and checked every time.
8853 __ ldp(tmp2, tmp3, Address(ary1));
8854 __ ldp(tmp4, tmp5, Address(ary1, 16));
8855 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
8856 __ ldp(tmp6, tmp1, Address(ary1, 48));
8857 __ add(ary1, ary1, large_loop_size);
8858 __ sub(len, len, large_loop_size);
8859 __ orr(tmp2, tmp2, tmp3);
8860 __ orr(tmp4, tmp4, tmp5);
8861 __ orr(rscratch1, rscratch1, rscratch2);
8862 __ orr(tmp6, tmp6, tmp1);
8863 __ orr(tmp2, tmp2, tmp4);
8864 __ orr(rscratch1, rscratch1, tmp6);
8865 __ orr(tmp2, tmp2, rscratch1);
8866 __ tst(tmp2, UPPER_BIT_MASK);
8867 __ br(Assembler::NE, RET_ADJUST_LONG);
8868 __ cmp(len, large_loop_size);
8869 __ br(Assembler::GE, LARGE_LOOP);
8870
8871 __ bind(CHECK_16); // small 16-byte load pre-loop
8872 __ cmp(len, (u1)16);
8873 __ br(Assembler::LT, POST_LOOP16);
8874
8875 __ bind(LOOP16); // small 16-byte load loop
8876 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
8877 __ sub(len, len, 16);
8878 __ orr(tmp2, tmp2, tmp3);
8879 __ tst(tmp2, UPPER_BIT_MASK);
8880 __ br(Assembler::NE, RET_ADJUST_16);
8881 __ cmp(len, (u1)16);
8882 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
8883
8884 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
8885 __ cmp(len, (u1)8);
8886 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
8887 __ ldr(tmp3, Address(__ post(ary1, 8)));
8888 __ tst(tmp3, UPPER_BIT_MASK);
8889 __ br(Assembler::NE, RET_ADJUST);
8890 __ sub(len, len, 8);
8891
8892 __ bind(POST_LOOP16_LOAD_TAIL);
8893 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
8894 __ ldr(tmp1, Address(ary1));
8895 __ mov(tmp2, 64);
8896 __ sub(tmp4, tmp2, len, __ LSL, 3);
8897 __ lslv(tmp1, tmp1, tmp4);
8898 __ tst(tmp1, UPPER_BIT_MASK);
8899 __ br(Assembler::NE, RET_ADJUST);
8900 // Fallthrough
8901
8902 __ bind(RET_LEN);
8903 __ pop(spilled_regs, sp);
8904 __ leave();
8905 __ ret(lr);
8906
8907 // difference result - len is the count of guaranteed to be
8908 // positive bytes
8909
8910 __ bind(RET_ADJUST_LONG);
8911 __ add(len, len, (u1)(large_loop_size - 16));
8912 __ bind(RET_ADJUST_16);
8913 __ add(len, len, 16);
8914 __ bind(RET_ADJUST);
8915 __ pop(spilled_regs, sp);
8916 __ leave();
8917 __ sub(result, result, len);
8918 __ ret(lr);
8919
8920 // record the stub entry and end plus the extra entry
8921 store_archive_data(stub_id, entry, __ pc(), &entries);
8922
8923 return entry;
8924 }
8925
8926 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
8927 bool usePrefetch, Label &NOT_EQUAL) {
8928 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8929 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8930 tmp7 = r12, tmp8 = r13;
8931 Label LOOP;
8932
8933 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8934 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8935 __ bind(LOOP);
8936 if (usePrefetch) {
8937 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8938 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8939 }
8940 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8941 __ eor(tmp1, tmp1, tmp2);
8942 __ eor(tmp3, tmp3, tmp4);
8943 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8944 __ orr(tmp1, tmp1, tmp3);
8945 __ cbnz(tmp1, NOT_EQUAL);
8946 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8947 __ eor(tmp5, tmp5, tmp6);
8948 __ eor(tmp7, tmp7, tmp8);
8949 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8950 __ orr(tmp5, tmp5, tmp7);
8951 __ cbnz(tmp5, NOT_EQUAL);
8952 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8953 __ eor(tmp1, tmp1, tmp2);
8954 __ eor(tmp3, tmp3, tmp4);
8955 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8956 __ orr(tmp1, tmp1, tmp3);
8957 __ cbnz(tmp1, NOT_EQUAL);
8958 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8959 __ eor(tmp5, tmp5, tmp6);
8960 __ sub(cnt1, cnt1, 8 * wordSize);
8961 __ eor(tmp7, tmp7, tmp8);
8962 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8963 // tmp6 is not used. MacroAssembler::subs is used here (rather than
8964 // cmp) because subs allows an unlimited range of immediate operand.
8965 __ subs(tmp6, cnt1, loopThreshold);
8966 __ orr(tmp5, tmp5, tmp7);
8967 __ cbnz(tmp5, NOT_EQUAL);
8968 __ br(__ GE, LOOP);
8969 // post-loop
8970 __ eor(tmp1, tmp1, tmp2);
8971 __ eor(tmp3, tmp3, tmp4);
8972 __ orr(tmp1, tmp1, tmp3);
8973 __ sub(cnt1, cnt1, 2 * wordSize);
8974 __ cbnz(tmp1, NOT_EQUAL);
8975 }
8976
8977 void generate_large_array_equals_loop_simd(int loopThreshold,
8978 bool usePrefetch, Label &NOT_EQUAL) {
8979 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8980 tmp2 = rscratch2;
8981 Label LOOP;
8982
8983 __ bind(LOOP);
8984 if (usePrefetch) {
8985 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8986 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8987 }
8988 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
8989 __ sub(cnt1, cnt1, 8 * wordSize);
8990 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
8991 __ subs(tmp1, cnt1, loopThreshold);
8992 __ eor(v0, __ T16B, v0, v4);
8993 __ eor(v1, __ T16B, v1, v5);
8994 __ eor(v2, __ T16B, v2, v6);
8995 __ eor(v3, __ T16B, v3, v7);
8996 __ orr(v0, __ T16B, v0, v1);
8997 __ orr(v1, __ T16B, v2, v3);
8998 __ orr(v0, __ T16B, v0, v1);
8999 __ umov(tmp1, v0, __ D, 0);
9000 __ umov(tmp2, v0, __ D, 1);
9001 __ orr(tmp1, tmp1, tmp2);
9002 __ cbnz(tmp1, NOT_EQUAL);
9003 __ br(__ GE, LOOP);
9004 }
9005
9006 // a1 = r1 - array1 address
9007 // a2 = r2 - array2 address
9008 // result = r0 - return value. Already contains "false"
9009 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
9010 // r3-r5 are reserved temporary registers
9011 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
9012 address generate_large_array_equals() {
9013 StubId stub_id = StubId::stubgen_large_array_equals_id;
9014 int entry_count = StubInfo::entry_count(stub_id);
9015 assert(entry_count == 1, "sanity check");
9016 address start = load_archive_data(stub_id);
9017 if (start != nullptr) {
9018 return start;
9019 }
9020 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
9021 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
9022 tmp7 = r12, tmp8 = r13;
9023 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
9024 SMALL_LOOP, POST_LOOP;
9025 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
9026 // calculate if at least 32 prefetched bytes are used
9027 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
9028 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
9029 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
9030 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
9031 tmp5, tmp6, tmp7, tmp8);
9032
9033 __ align(CodeEntryAlignment);
9034
9035 StubCodeMark mark(this, stub_id);
9036
9037 address entry = __ pc();
9038 __ enter();
9039 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
9040 // also advance pointers to use post-increment instead of pre-increment
9041 __ add(a1, a1, wordSize);
9042 __ add(a2, a2, wordSize);
9043 if (AvoidUnalignedAccesses) {
9044 // both implementations (SIMD/nonSIMD) are using relatively large load
9045 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
9046 // on some CPUs in case of address is not at least 16-byte aligned.
9047 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
9048 // load if needed at least for 1st address and make if 16-byte aligned.
9049 Label ALIGNED16;
9050 __ tbz(a1, 3, ALIGNED16);
9051 __ ldr(tmp1, Address(__ post(a1, wordSize)));
9052 __ ldr(tmp2, Address(__ post(a2, wordSize)));
9053 __ sub(cnt1, cnt1, wordSize);
9054 __ eor(tmp1, tmp1, tmp2);
9055 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
9056 __ bind(ALIGNED16);
9057 }
9058 if (UseSIMDForArrayEquals) {
9059 if (SoftwarePrefetchHintDistance >= 0) {
9060 __ subs(tmp1, cnt1, prefetchLoopThreshold);
9061 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
9062 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
9063 /* prfm = */ true, NOT_EQUAL);
9064 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
9065 __ br(__ LT, TAIL);
9066 }
9067 __ bind(NO_PREFETCH_LARGE_LOOP);
9068 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
9069 /* prfm = */ false, NOT_EQUAL);
9070 } else {
9071 __ push(spilled_regs, sp);
9072 if (SoftwarePrefetchHintDistance >= 0) {
9073 __ subs(tmp1, cnt1, prefetchLoopThreshold);
9074 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
9075 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
9076 /* prfm = */ true, NOT_EQUAL);
9077 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
9078 __ br(__ LT, TAIL);
9079 }
9080 __ bind(NO_PREFETCH_LARGE_LOOP);
9081 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
9082 /* prfm = */ false, NOT_EQUAL);
9083 }
9084 __ bind(TAIL);
9085 __ cbz(cnt1, EQUAL);
9086 __ subs(cnt1, cnt1, wordSize);
9087 __ br(__ LE, POST_LOOP);
9088 __ bind(SMALL_LOOP);
9089 __ ldr(tmp1, Address(__ post(a1, wordSize)));
9090 __ ldr(tmp2, Address(__ post(a2, wordSize)));
9091 __ subs(cnt1, cnt1, wordSize);
9092 __ eor(tmp1, tmp1, tmp2);
9093 __ cbnz(tmp1, NOT_EQUAL);
9094 __ br(__ GT, SMALL_LOOP);
9095 __ bind(POST_LOOP);
9096 __ ldr(tmp1, Address(a1, cnt1));
9097 __ ldr(tmp2, Address(a2, cnt1));
9098 __ eor(tmp1, tmp1, tmp2);
9099 __ cbnz(tmp1, NOT_EQUAL);
9100 __ bind(EQUAL);
9101 __ mov(result, true);
9102 __ bind(NOT_EQUAL);
9103 if (!UseSIMDForArrayEquals) {
9104 __ pop(spilled_regs, sp);
9105 }
9106 __ bind(NOT_EQUAL_NO_POP);
9107 __ leave();
9108 __ ret(lr);
9109
9110 // record the stub entry and end
9111 store_archive_data(stub_id, entry, __ pc());
9112
9113 return entry;
9114 }
9115
9116 // result = r0 - return value. Contains initial hashcode value on entry.
9117 // ary = r1 - array address
9118 // cnt = r2 - elements count
9119 // Clobbers: v0-v13, rscratch1, rscratch2
9120 address generate_large_arrays_hashcode(BasicType eltype) {
9121 StubId stub_id;
9122 switch (eltype) {
9123 case T_BOOLEAN:
9124 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
9125 break;
9126 case T_BYTE:
9127 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
9128 break;
9129 case T_CHAR:
9130 stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
9131 break;
9132 case T_SHORT:
9133 stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
9134 break;
9135 case T_INT:
9136 stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
9137 break;
9138 default:
9139 stub_id = StubId::NO_STUBID;
9140 ShouldNotReachHere();
9141 };
9142 int entry_count = StubInfo::entry_count(stub_id);
9143 assert(entry_count == 1, "sanity check");
9144 address start = load_archive_data(stub_id);
9145 if (start != nullptr) {
9146 return start;
9147 }
9148 const Register result = r0, ary = r1, cnt = r2;
9149 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
9150 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
9151 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
9152 const FloatRegister vpowm = v13;
9153
9154 ARRAYS_HASHCODE_REGISTERS;
9155
9156 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
9157
9158 unsigned int vf; // vectorization factor
9159 bool multiply_by_halves;
9160 Assembler::SIMD_Arrangement load_arrangement;
9161 switch (eltype) {
9162 case T_BOOLEAN:
9163 case T_BYTE:
9164 load_arrangement = Assembler::T8B;
9165 multiply_by_halves = true;
9166 vf = 8;
9167 break;
9168 case T_CHAR:
9169 case T_SHORT:
9170 load_arrangement = Assembler::T8H;
9171 multiply_by_halves = true;
9172 vf = 8;
9173 break;
9174 case T_INT:
9175 load_arrangement = Assembler::T4S;
9176 multiply_by_halves = false;
9177 vf = 4;
9178 break;
9179 default:
9180 ShouldNotReachHere();
9181 }
9182
9183 // Unroll factor
9184 const unsigned uf = 4;
9185
9186 // Effective vectorization factor
9187 const unsigned evf = vf * uf;
9188
9189 __ align(CodeEntryAlignment);
9190
9191 StubCodeMark mark(this, stub_id);
9192
9193 address entry = __ pc();
9194 __ enter();
9195
9196 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
9197 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
9198 // value shouldn't change throughout both loops.
9199 __ movw(rscratch1, intpow(31U, 3));
9200 __ mov(vpow, Assembler::S, 0, rscratch1);
9201 __ movw(rscratch1, intpow(31U, 2));
9202 __ mov(vpow, Assembler::S, 1, rscratch1);
9203 __ movw(rscratch1, intpow(31U, 1));
9204 __ mov(vpow, Assembler::S, 2, rscratch1);
9205 __ movw(rscratch1, intpow(31U, 0));
9206 __ mov(vpow, Assembler::S, 3, rscratch1);
9207
9208 __ mov(vmul0, Assembler::T16B, 0);
9209 __ mov(vmul0, Assembler::S, 3, result);
9210
9211 __ andr(rscratch2, cnt, (uf - 1) * vf);
9212 __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
9213
9214 __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
9215 __ mov(vpowm, Assembler::S, 0, rscratch1);
9216
9217 // SMALL LOOP
9218 __ bind(SMALL_LOOP);
9219
9220 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
9221 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
9222 __ subsw(rscratch2, rscratch2, vf);
9223
9224 if (load_arrangement == Assembler::T8B) {
9225 // Extend 8B to 8H to be able to use vector multiply
9226 // instructions
9227 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
9228 if (is_signed_subword_type(eltype)) {
9229 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9230 } else {
9231 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9232 }
9233 }
9234
9235 switch (load_arrangement) {
9236 case Assembler::T4S:
9237 __ addv(vmul0, load_arrangement, vmul0, vdata0);
9238 break;
9239 case Assembler::T8B:
9240 case Assembler::T8H:
9241 assert(is_subword_type(eltype), "subword type expected");
9242 if (is_signed_subword_type(eltype)) {
9243 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9244 } else {
9245 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9246 }
9247 break;
9248 default:
9249 __ should_not_reach_here();
9250 }
9251
9252 // Process the upper half of a vector
9253 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
9254 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
9255 if (is_signed_subword_type(eltype)) {
9256 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9257 } else {
9258 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9259 }
9260 }
9261
9262 __ br(Assembler::HI, SMALL_LOOP);
9263
9264 // SMALL LOOP'S EPILOQUE
9265 __ lsr(rscratch2, cnt, exact_log2(evf));
9266 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
9267
9268 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
9269 __ addv(vmul0, Assembler::T4S, vmul0);
9270 __ umov(result, vmul0, Assembler::S, 0);
9271
9272 // TAIL
9273 __ bind(TAIL);
9274
9275 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
9276 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
9277 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
9278 __ andr(rscratch2, cnt, vf - 1);
9279 __ bind(TAIL_SHORTCUT);
9280 __ adr(rscratch1, BR_BASE);
9281 // For Cortex-A53 offset is 4 because 2 nops are generated.
9282 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
9283 __ movw(rscratch2, 0x1f);
9284 __ br(rscratch1);
9285
9286 for (size_t i = 0; i < vf - 1; ++i) {
9287 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
9288 eltype);
9289 __ maddw(result, result, rscratch2, rscratch1);
9290 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
9291 // Generate 2nd nop to have 4 instructions per iteration.
9292 if (VM_Version::supports_a53mac()) {
9293 __ nop();
9294 }
9295 }
9296 __ bind(BR_BASE);
9297
9298 __ leave();
9299 __ ret(lr);
9300
9301 // LARGE LOOP
9302 __ bind(LARGE_LOOP_PREHEADER);
9303
9304 __ lsr(rscratch2, cnt, exact_log2(evf));
9305
9306 if (multiply_by_halves) {
9307 // 31^4 - multiplier between lower and upper parts of a register
9308 __ movw(rscratch1, intpow(31U, vf / 2));
9309 __ mov(vpowm, Assembler::S, 1, rscratch1);
9310 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4
9311 __ movw(rscratch1, intpow(31U, evf - vf / 2));
9312 __ mov(vpowm, Assembler::S, 0, rscratch1);
9313 } else {
9314 // 31^16
9315 __ movw(rscratch1, intpow(31U, evf));
9316 __ mov(vpowm, Assembler::S, 0, rscratch1);
9317 }
9318
9319 __ mov(vmul3, Assembler::T16B, 0);
9320 __ mov(vmul2, Assembler::T16B, 0);
9321 __ mov(vmul1, Assembler::T16B, 0);
9322
9323 __ bind(LARGE_LOOP);
9324
9325 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
9326 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
9327 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
9328 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
9329
9330 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
9331 Address(__ post(ary, evf * type2aelembytes(eltype))));
9332
9333 if (load_arrangement == Assembler::T8B) {
9334 // Extend 8B to 8H to be able to use vector multiply
9335 // instructions
9336 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
9337 if (is_signed_subword_type(eltype)) {
9338 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
9339 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
9340 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
9341 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9342 } else {
9343 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
9344 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
9345 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
9346 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9347 }
9348 }
9349
9350 switch (load_arrangement) {
9351 case Assembler::T4S:
9352 __ addv(vmul3, load_arrangement, vmul3, vdata3);
9353 __ addv(vmul2, load_arrangement, vmul2, vdata2);
9354 __ addv(vmul1, load_arrangement, vmul1, vdata1);
9355 __ addv(vmul0, load_arrangement, vmul0, vdata0);
9356 break;
9357 case Assembler::T8B:
9358 case Assembler::T8H:
9359 assert(is_subword_type(eltype), "subword type expected");
9360 if (is_signed_subword_type(eltype)) {
9361 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
9362 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
9363 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
9364 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9365 } else {
9366 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
9367 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
9368 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
9369 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9370 }
9371 break;
9372 default:
9373 __ should_not_reach_here();
9374 }
9375
9376 // Process the upper half of a vector
9377 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
9378 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
9379 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
9380 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
9381 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
9382 if (is_signed_subword_type(eltype)) {
9383 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
9384 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
9385 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
9386 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9387 } else {
9388 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
9389 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
9390 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
9391 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9392 }
9393 }
9394
9395 __ subsw(rscratch2, rscratch2, 1);
9396 __ br(Assembler::HI, LARGE_LOOP);
9397
9398 __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
9399 __ addv(vmul3, Assembler::T4S, vmul3);
9400 __ umov(result, vmul3, Assembler::S, 0);
9401
9402 __ mov(rscratch2, intpow(31U, vf));
9403
9404 __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
9405 __ addv(vmul2, Assembler::T4S, vmul2);
9406 __ umov(rscratch1, vmul2, Assembler::S, 0);
9407 __ maddw(result, result, rscratch2, rscratch1);
9408
9409 __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
9410 __ addv(vmul1, Assembler::T4S, vmul1);
9411 __ umov(rscratch1, vmul1, Assembler::S, 0);
9412 __ maddw(result, result, rscratch2, rscratch1);
9413
9414 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
9415 __ addv(vmul0, Assembler::T4S, vmul0);
9416 __ umov(rscratch1, vmul0, Assembler::S, 0);
9417 __ maddw(result, result, rscratch2, rscratch1);
9418
9419 __ andr(rscratch2, cnt, vf - 1);
9420 __ cbnz(rscratch2, TAIL_SHORTCUT);
9421
9422 __ leave();
9423 __ ret(lr);
9424
9425 // record the stub entry and end
9426 store_archive_data(stub_id, entry, __ pc());
9427
9428 return entry;
9429 }
9430
9431 address generate_dsin_dcos(bool isCos) {
9432 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
9433 int entry_count = StubInfo::entry_count(stub_id);
9434 assert(entry_count == 1, "sanity check");
9435 address start = load_archive_data(stub_id);
9436 if (start != nullptr) {
9437 return start;
9438 }
9439 __ align(CodeEntryAlignment);
9440 StubCodeMark mark(this, stub_id);
9441 start = __ pc();
9442 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
9443 (address)StubRoutines::aarch64::_two_over_pi,
9444 (address)StubRoutines::aarch64::_pio2,
9445 (address)StubRoutines::aarch64::_dsin_coef,
9446 (address)StubRoutines::aarch64::_dcos_coef);
9447
9448 // record the stub entry and end
9449 store_archive_data(stub_id, start, __ pc());
9450
9451 return start;
9452 }
9453
9454 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
9455 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
9456 Label &DIFF2) {
9457 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
9458 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
9459
9460 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
9461 __ ldr(tmpU, Address(__ post(cnt1, 8)));
9462 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
9463 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
9464
9465 __ fmovd(tmpL, vtmp3);
9466 __ eor(rscratch2, tmp3, tmpL);
9467 __ cbnz(rscratch2, DIFF2);
9468
9469 __ ldr(tmp3, Address(__ post(cnt1, 8)));
9470 __ umov(tmpL, vtmp3, __ D, 1);
9471 __ eor(rscratch2, tmpU, tmpL);
9472 __ cbnz(rscratch2, DIFF1);
9473
9474 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
9475 __ ldr(tmpU, Address(__ post(cnt1, 8)));
9476 __ fmovd(tmpL, vtmp);
9477 __ eor(rscratch2, tmp3, tmpL);
9478 __ cbnz(rscratch2, DIFF2);
9479
9480 __ ldr(tmp3, Address(__ post(cnt1, 8)));
9481 __ umov(tmpL, vtmp, __ D, 1);
9482 __ eor(rscratch2, tmpU, tmpL);
9483 __ cbnz(rscratch2, DIFF1);
9484 }
9485
9486 // r0 = result
9487 // r1 = str1
9488 // r2 = cnt1
9489 // r3 = str2
9490 // r4 = cnt2
9491 // r10 = tmp1
9492 // r11 = tmp2
9493 address generate_compare_long_string_different_encoding(bool isLU) {
9494 StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
9495 int entry_count = StubInfo::entry_count(stub_id);
9496 assert(entry_count == 1, "sanity check");
9497 address start = load_archive_data(stub_id);
9498 if (start != nullptr) {
9499 return start;
9500 }
9501 __ align(CodeEntryAlignment);
9502 StubCodeMark mark(this, stub_id);
9503 address entry = __ pc();
9504 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
9505 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
9506 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
9507 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9508 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
9509 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
9510 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
9511
9512 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
9513
9514 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
9515 // cnt2 == amount of characters left to compare
9516 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
9517 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
9518 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
9519 __ add(str2, str2, isLU ? wordSize : wordSize/2);
9520 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
9521 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
9522 __ eor(rscratch2, tmp1, tmp2);
9523 __ mov(rscratch1, tmp2);
9524 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
9525 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
9526 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
9527 __ push(spilled_regs, sp);
9528 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
9529 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
9530
9531 __ ldr(tmp3, Address(__ post(cnt1, 8)));
9532
9533 if (SoftwarePrefetchHintDistance >= 0) {
9534 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
9535 __ br(__ LT, NO_PREFETCH);
9536 __ bind(LARGE_LOOP_PREFETCH);
9537 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
9538 __ mov(tmp4, 2);
9539 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
9540 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
9541 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
9542 __ subs(tmp4, tmp4, 1);
9543 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
9544 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
9545 __ mov(tmp4, 2);
9546 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
9547 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
9548 __ subs(tmp4, tmp4, 1);
9549 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
9550 __ sub(cnt2, cnt2, 64);
9551 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
9552 __ br(__ GE, LARGE_LOOP_PREFETCH);
9553 }
9554 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
9555 __ bind(NO_PREFETCH);
9556 __ subs(cnt2, cnt2, 16);
9557 __ br(__ LT, TAIL);
9558 __ align(OptoLoopAlignment);
9559 __ bind(SMALL_LOOP); // smaller loop
9560 __ subs(cnt2, cnt2, 16);
9561 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
9562 __ br(__ GE, SMALL_LOOP);
9563 __ cmn(cnt2, (u1)16);
9564 __ br(__ EQ, LOAD_LAST);
9565 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
9566 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
9567 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
9568 __ ldr(tmp3, Address(cnt1, -8));
9569 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
9570 __ b(LOAD_LAST);
9571 __ bind(DIFF2);
9572 __ mov(tmpU, tmp3);
9573 __ bind(DIFF1);
9574 __ pop(spilled_regs, sp);
9575 __ b(CALCULATE_DIFFERENCE);
9576 __ bind(LOAD_LAST);
9577 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
9578 // No need to load it again
9579 __ mov(tmpU, tmp3);
9580 __ pop(spilled_regs, sp);
9581
9582 // tmp2 points to the address of the last 4 Latin1 characters right now
9583 __ ldrs(vtmp, Address(tmp2));
9584 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
9585 __ fmovd(tmpL, vtmp);
9586
9587 __ eor(rscratch2, tmpU, tmpL);
9588 __ cbz(rscratch2, DONE);
9589
9590 // Find the first different characters in the longwords and
9591 // compute their difference.
9592 __ bind(CALCULATE_DIFFERENCE);
9593 __ rev(rscratch2, rscratch2);
9594 __ clz(rscratch2, rscratch2);
9595 __ andr(rscratch2, rscratch2, -16);
9596 __ lsrv(tmp1, tmp1, rscratch2);
9597 __ uxthw(tmp1, tmp1);
9598 __ lsrv(rscratch1, rscratch1, rscratch2);
9599 __ uxthw(rscratch1, rscratch1);
9600 __ subw(result, tmp1, rscratch1);
9601 __ bind(DONE);
9602 __ ret(lr);
9603
9604 // record the stub entry and end
9605 store_archive_data(stub_id, entry, __ pc());
9606
9607 return entry;
9608 }
9609
9610 // r0 = input (float16)
9611 // v0 = result (float)
9612 // v1 = temporary float register
9613 address generate_float16ToFloat() {
9614 StubId stub_id = StubId::stubgen_hf2f_id;
9615 int entry_count = StubInfo::entry_count(stub_id);
9616 assert(entry_count == 1, "sanity check");
9617 address start = load_archive_data(stub_id);
9618 if (start != nullptr) {
9619 return start;
9620 }
9621 __ align(CodeEntryAlignment);
9622 StubCodeMark mark(this, stub_id);
9623 address entry = __ pc();
9624 BLOCK_COMMENT("Entry:");
9625 __ flt16_to_flt(v0, r0, v1);
9626 __ ret(lr);
9627
9628 // record the stub entry and end
9629 store_archive_data(stub_id, entry, __ pc());
9630
9631 return entry;
9632 }
9633
9634 // v0 = input (float)
9635 // r0 = result (float16)
9636 // v1 = temporary float register
9637 address generate_floatToFloat16() {
9638 StubId stub_id = StubId::stubgen_f2hf_id;
9639 int entry_count = StubInfo::entry_count(stub_id);
9640 assert(entry_count == 1, "sanity check");
9641 address start = load_archive_data(stub_id);
9642 if (start != nullptr) {
9643 return start;
9644 }
9645 __ align(CodeEntryAlignment);
9646 StubCodeMark mark(this, stub_id);
9647 address entry = __ pc();
9648 BLOCK_COMMENT("Entry:");
9649 __ flt_to_flt16(r0, v0, v1);
9650 __ ret(lr);
9651
9652 // record the stub entry and end
9653 store_archive_data(stub_id, entry, __ pc());
9654
9655 return entry;
9656 }
9657
// Generates the nmethod entry barrier stub.
//
// The emitted code calls BarrierSetNMethod::nmethod_stub_entry_barrier as a
// leaf call, passing the address of the stack slot that holds the saved lr.
// If the call returns zero, the stub tears down its frame and returns to its
// caller. Otherwise it reloads {sp, fp, lr, pc} from the four words reserved
// on the stack and branches to the new pc.
//
// Returns the address handed back by load_archive_data() when that is
// non-null; otherwise the entry address of the freshly emitted stub.
address generate_method_entry_barrier() {
  StubId stub_id = StubId::stubgen_method_entry_barrier_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);

  Label deoptimize_label;

  start = __ pc();

  BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();

  if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    // We can get here despite the nmethod being good, if we have not
    // yet applied our cross modification fence (or data fence).
    // Copy the 32-bit value at patching_epoch_addr() into the thread-local
    // word located 4 bytes past the thread's disarmed guard value, then fence.
    Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
    __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
    __ ldrw(rscratch2, rscratch2);
    __ strw(rscratch2, thread_epoch_addr);
    __ isb();
    __ membar(__ LoadLoad);
  }

  __ set_last_Java_frame(sp, rfp, lr, rscratch1);

  __ enter();
  __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr

  __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}

  __ push_call_clobbered_registers();

  // Single argument: the address of the saved lr slot computed above.
  __ mov(c_rarg0, rscratch2);
  __ call_VM_leaf
    (CAST_FROM_FN_PTR
     (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);

  __ reset_last_Java_frame(true);

  // Keep the call's return value in rscratch1 before the registers are popped.
  __ mov(rscratch1, r0);

  __ pop_call_clobbered_registers();

  // Non-zero return value: take the deoptimization path below.
  __ cbnz(rscratch1, deoptimize_label);

  __ leave();
  __ ret(lr);

  __ BIND(deoptimize_label);

  // Reload the replacement frame state from the four reserved stack words.
  __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
  __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));

  __ mov(sp, rscratch1);
  __ br(rscratch2);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
9725
// Generates the long-string compare stub used when both strings share the
// same encoding. isLL selects 1-byte characters; otherwise characters are
// 2 bytes wide (see the isLL ? ... : ... scaling of every count below).
//
// Register contract of the generated code:
// r0 = result
// r1 = str1
// r2 = cnt1
// r3 = str2
// r4 = cnt2
// r10 = tmp1
// r11 = tmp2
//
// cnt2 is maintained in characters while str1/str2 advance in bytes. When a
// difference is found, result is set to the difference of the first
// mismatching characters (each zero-extended). When none is found the stub
// returns without writing result.
// NOTE(review): presumably the caller has already placed the length
// difference in result (cf. the LENGTH_DIFF label) -- confirm at the call site.
address generate_compare_long_string_same_encoding(bool isLL) {
  StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address entry = __ pc();
  // tmp1h/tmp2h hold the second register of each ldp pair.
  Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
      tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;

  Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;

  // exit from large loop when less than 64 bytes left to read or we're about
  // to prefetch memory behind array border
  int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);

  // before jumping to stub, pre-load 8 bytes already, so do comparison directly
  __ eor(rscratch2, tmp1, tmp2);
  __ cbnz(rscratch2, CAL_DIFFERENCE);

  // Account for the 8 bytes already compared (wordSize bytes == 8 or 4 chars).
  __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
  // update pointers, because of previous read
  __ add(str1, str1, wordSize);
  __ add(str2, str2, wordSize);
  if (SoftwarePrefetchHintDistance >= 0) {
    __ align(OptoLoopAlignment);
    __ bind(LARGE_LOOP_PREFETCH);
    __ prfm(Address(str1, SoftwarePrefetchHintDistance));
    __ prfm(Address(str2, SoftwarePrefetchHintDistance));

    // Compare 64 bytes per iteration as four 16-byte pairs. ccmp only
    // compares the high registers when the low registers were equal.
    for (int i = 0; i < 4; i++) {
      __ ldp(tmp1, tmp1h, Address(str1, i * 16));
      __ ldp(tmp2, tmp2h, Address(str2, i * 16));
      __ cmp(tmp1, tmp2);
      __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
      __ br(Assembler::NE, DIFF);
    }
    __ sub(cnt2, cnt2, isLL ? 64 : 32);
    __ add(str1, str1, 64);
    __ add(str2, str2, 64);
    __ subs(rscratch2, cnt2, largeLoopExitCondition);
    __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
    __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
  }

  // 16 bytes (16 or 8 chars) per step; the loop body is unrolled twice.
  __ subs(rscratch1, cnt2, isLL ? 16 : 8);
  __ br(Assembler::LE, LESS16);
  __ align(OptoLoopAlignment);
  __ bind(LOOP_COMPARE16);
  __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
  __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
  __ cmp(tmp1, tmp2);
  __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
  __ br(Assembler::NE, DIFF);
  __ sub(cnt2, cnt2, isLL ? 16 : 8);
  __ subs(rscratch2, cnt2, isLL ? 16 : 8);
  __ br(Assembler::LT, LESS16);

  __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
  __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
  __ cmp(tmp1, tmp2);
  __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
  __ br(Assembler::NE, DIFF);
  __ sub(cnt2, cnt2, isLL ? 16 : 8);
  __ subs(rscratch2, cnt2, isLL ? 16 : 8);
  __ br(Assembler::GE, LOOP_COMPARE16);
  __ cbz(cnt2, LENGTH_DIFF);

  __ bind(LESS16);
  // each 8 compare
  __ subs(cnt2, cnt2, isLL ? 8 : 4);
  __ br(Assembler::LE, LESS8);
  __ ldr(tmp1, Address(__ post(str1, 8)));
  __ ldr(tmp2, Address(__ post(str2, 8)));
  __ eor(rscratch2, tmp1, tmp2);
  __ cbnz(rscratch2, CAL_DIFFERENCE);
  __ sub(cnt2, cnt2, isLL ? 8 : 4);

  __ bind(LESS8); // directly load last 8 bytes
  if (!isLL) {
    // Double the character count to get a byte offset for 2-byte characters.
    __ add(cnt2, cnt2, cnt2);
  }
  __ ldr(tmp1, Address(str1, cnt2));
  __ ldr(tmp2, Address(str2, cnt2));
  __ eor(rscratch2, tmp1, tmp2);
  __ cbz(rscratch2, LENGTH_DIFF);
  __ b(CAL_DIFFERENCE);

  __ bind(DIFF);
  // A 16-byte pair differed: keep the low registers if they differ,
  // otherwise switch to the high registers of the pair.
  __ cmp(tmp1, tmp2);
  __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
  __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
  // reuse rscratch2 register for the result of eor instruction
  __ eor(rscratch2, tmp1, tmp2);

  __ bind(CAL_DIFFERENCE);
  // rscratch2 holds tmp1 ^ tmp2. Byte-reverse it and count leading zeros,
  // then round the bit position down to a character boundary (8 or 16 bits).
  __ rev(rscratch2, rscratch2);
  __ clz(rscratch2, rscratch2);
  __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
  // Shift the first differing character of each word into the low bits,
  // zero-extend it and subtract.
  __ lsrv(tmp1, tmp1, rscratch2);
  __ lsrv(tmp2, tmp2, rscratch2);
  if (isLL) {
    __ uxtbw(tmp1, tmp1);
    __ uxtbw(tmp2, tmp2);
  } else {
    __ uxthw(tmp1, tmp1);
    __ uxthw(tmp2, tmp2);
  }
  __ subw(result, tmp1, tmp2);

  __ bind(LENGTH_DIFF);
  __ ret(lr);

  // record the stub entry and end
  store_archive_data(stub_id, entry, __ pc());

  return entry;
}
9855
// Encoding combination handled by a long-string compare stub. The first
// letter describes str1 and the second str2; in the SVE stub's loads, L uses
// byte loads (1-byte characters) and U uses halfword loads (2-byte characters).
enum string_compare_mode { LL = 0, LU = 1, UL = 2, UU = 3 };
9862
9863 // The following registers are declared in aarch64.ad
9864 // r0 = result
9865 // r1 = str1
9866 // r2 = cnt1
9867 // r3 = str2
9868 // r4 = cnt2
9869 // r10 = tmp1
9870 // r11 = tmp2
9871 // z0 = ztmp1
9872 // z1 = ztmp2
9873 // p0 = pgtmp1
9874 // p1 = pgtmp2
9875 address generate_compare_long_string_sve(string_compare_mode mode) {
9876 StubId stub_id;
9877 switch (mode) {
9878 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
9879 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
9880 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
9881 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
9882 default: ShouldNotReachHere();
9883 }
9884 int entry_count = StubInfo::entry_count(stub_id);
9885 assert(entry_count == 1, "sanity check");
9886 address start = load_archive_data(stub_id);
9887 if (start != nullptr) {
9888 return start;
9889 }
9890 __ align(CodeEntryAlignment);
9891 StubCodeMark mark(this, stub_id);
9892 address entry = __ pc();
9893 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9894 tmp1 = r10, tmp2 = r11;
9895
9896 Label LOOP, DONE, MISMATCH;
9897 Register vec_len = tmp1;
9898 Register idx = tmp2;
9899 // The minimum of the string lengths has been stored in cnt2.
9900 Register cnt = cnt2;
9901 FloatRegister ztmp1 = z0, ztmp2 = z1;
9902 PRegister pgtmp1 = p0, pgtmp2 = p1;
9903
9904 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
9905 switch (mode) { \
9906 case LL: \
9907 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
9908 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
9909 break; \
9910 case LU: \
9911 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
9912 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9913 break; \
9914 case UL: \
9915 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9916 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
9917 break; \
9918 case UU: \
9919 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9920 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9921 break; \
9922 default: \
9923 ShouldNotReachHere(); \
9924 }
9925
9926 __ mov(idx, 0);
9927 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9928
9929 if (mode == LL) {
9930 __ sve_cntb(vec_len);
9931 } else {
9932 __ sve_cnth(vec_len);
9933 }
9934
9935 __ sub(rscratch1, cnt, vec_len);
9936
9937 __ bind(LOOP);
9938
9939 // main loop
9940 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9941 __ add(idx, idx, vec_len);
9942 // Compare strings.
9943 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9944 __ br(__ NE, MISMATCH);
9945 __ cmp(idx, rscratch1);
9946 __ br(__ LT, LOOP);
9947
9948 // post loop, last iteration
9949 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9950
9951 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9952 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9953 __ br(__ EQ, DONE);
9954
9955 __ bind(MISMATCH);
9956
9957 // Crop the vector to find its location.
9958 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
9959 // Extract the first different characters of each string.
9960 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
9961 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
9962
9963 // Compute the difference of the first different characters.
9964 __ sub(result, rscratch1, rscratch2);
9965
9966 __ bind(DONE);
9967 __ ret(lr);
9968 #undef LOAD_PAIR
9969
9970 // record the stub entry and end
9971 store_archive_data(stub_id, entry, __ pc());
9972
9973 return entry;
9974 }
9975
9976 void generate_compare_long_strings() {
9977 if (UseSVE == 0) {
9978 StubRoutines::aarch64::_compare_long_string_LL
9979 = generate_compare_long_string_same_encoding(true);
9980 StubRoutines::aarch64::_compare_long_string_UU
9981 = generate_compare_long_string_same_encoding(false);
9982 StubRoutines::aarch64::_compare_long_string_LU
9983 = generate_compare_long_string_different_encoding(true);
9984 StubRoutines::aarch64::_compare_long_string_UL
9985 = generate_compare_long_string_different_encoding(false);
9986 } else {
9987 StubRoutines::aarch64::_compare_long_string_LL
9988 = generate_compare_long_string_sve(LL);
9989 StubRoutines::aarch64::_compare_long_string_UU
9990 = generate_compare_long_string_sve(UU);
9991 StubRoutines::aarch64::_compare_long_string_LU
9992 = generate_compare_long_string_sve(LU);
9993 StubRoutines::aarch64::_compare_long_string_UL
9994 = generate_compare_long_string_sve(UL);
9995 }
9996 }
9997
// Generates the linear-scan indexOf stub for one encoding combination.
//
// str1_isL / str2_isL select 1-byte characters for str1 / str2 respectively;
// otherwise that string uses 2-byte characters. The combination
// (!str1_isL && str2_isL) is not supported and hits ShouldNotReachHere().
// Below, str1 is the pattern being searched for and str2 is the string being
// scanned. On a miss the emitted code sets result to -1.
//
// R0 = result
// R1 = str2
// R2 = cnt1
// R3 = str1
// R4 = cnt2
// Clobbers: rscratch1, rscratch2, v0, v1, rflags
//
// This generic linear code uses a few additional ideas, which make it faster:
// 1) we can safely keep at least the 1st register of the pattern (since
// length >= 8) in order to skip initial loading (helps on systems with
// 1 ld pipeline)
// 2) we can use a "fast" algorithm of finding a single character to search
// for the first symbol with fewer branches (1 branch per loaded register
// instead of a branch for each symbol); this is where constants like
// 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
// 3) after loading and analyzing the 1st register of the source string, it
// can be used to search for every 1st character entry, saving a few loads in
// comparison with a "simpler-but-slower" implementation
// 4) in order to avoid lots of push/pop operations, the code below heavily
// re-uses/re-initializes/compresses register values, which makes the code
// larger and a bit less readable; however, most of the extra operations are
// issued during loads or branches, so the penalty is minimal
address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
  StubId stub_id;
  if (str1_isL) {
    if (str2_isL) {
      stub_id = StubId::stubgen_string_indexof_linear_ll_id;
    } else {
      stub_id = StubId::stubgen_string_indexof_linear_ul_id;
    }
  } else {
    if (str2_isL) {
      ShouldNotReachHere();
    } else {
      stub_id = StubId::stubgen_string_indexof_linear_uu_id;
    }
  }
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address entry = __ pc();

  int str1_chr_size = str1_isL ? 1 : 2;
  int str2_chr_size = str2_isL ? 1 : 2;
  int str1_chr_shift = str1_isL ? 0 : 1;
  int str2_chr_shift = str2_isL ? 0 : 1;
  bool isL = str1_isL && str2_isL;
  // parameters
  Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
  // temporary registers
  Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
  RegSet spilled_regs = RegSet::range(tmp1, tmp4);
  // redefinitions
  Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;

  // tmp1..tmp4 are saved here and restored at DONE.
  __ push(spilled_regs, sp);
  Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
    L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
    L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
    L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
    L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
    L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
  // Read whole register from str1. It is safe, because length >=8 here
  __ ldr(ch1, Address(str1));
  // Read whole register from str2. It is safe, because length >=8 here
  __ ldr(ch2, Address(str2));
  __ sub(cnt2, cnt2, cnt1);
  // Isolate the first pattern character; the multiply below replicates it
  // into every character slot of the register.
  __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
  if (str1_isL != str2_isL) {
    // Mixed encodings: v0 = 0 is interleaved with the pattern bytes (zip1)
    // to widen the 1-byte pattern characters to 2 bytes.
    __ eor(v0, __ T16B, v0, v0);
  }
  __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
  __ mul(first, first, tmp1);
  // check if we have less than 1 register to check
  __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
  if (str1_isL != str2_isL) {
    __ fmovd(v1, ch1);
  }
  __ br(__ LE, L_SMALL);
  // Zero-element detection on x = first ^ loaded word:
  // (x - 0x01..01) & ~(x | 0x7f..7f) is non-zero iff some element of x is 0,
  // i.e. some character of the loaded word equals the first pattern character.
  __ eor(ch2, first, ch2);
  if (str1_isL != str2_isL) {
    __ zip1(v1, __ T16B, v1, v0);
  }
  __ sub(tmp2, ch2, tmp1);
  __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
  __ bics(tmp2, tmp2, ch2);
  if (str1_isL != str2_isL) {
    __ fmovd(ch1, v1);
  }
  __ br(__ NE, L_HAS_ZERO);
  __ subs(cnt2, cnt2, wordSize/str2_chr_size);
  __ add(result, result, wordSize/str2_chr_size);
  __ add(str2, str2, wordSize);
  __ br(__ LT, L_POST_LOOP);
  __ BIND(L_LOOP);
  // Main loop: one 8-byte word of str2 per iteration, same detection as above.
  __ ldr(ch2, Address(str2));
  __ eor(ch2, first, ch2);
  __ sub(tmp2, ch2, tmp1);
  __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
  __ bics(tmp2, tmp2, ch2);
  __ br(__ NE, L_HAS_ZERO);
  __ BIND(L_LOOP_PROCEED);
  __ subs(cnt2, cnt2, wordSize/str2_chr_size);
  __ add(str2, str2, wordSize);
  __ add(result, result, wordSize/str2_chr_size);
  __ br(__ GE, L_LOOP);
  __ BIND(L_POST_LOOP);
  __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
  __ br(__ LE, NOMATCH);
  __ ldr(ch2, Address(str2));
  // cnt2 = -(cnt2 << log2(bits per character)); used below as the shift
  // amount that builds the mask of valid bits.
  __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
  __ eor(ch2, first, ch2);
  __ sub(tmp2, ch2, tmp1);
  __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
  __ mov(tmp4, -1); // all bits set
  __ b(L_SMALL_PROCEED);
  __ align(OptoLoopAlignment);
  __ BIND(L_SMALL);
  __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
  __ eor(ch2, first, ch2);
  if (str1_isL != str2_isL) {
    __ zip1(v1, __ T16B, v1, v0);
  }
  __ sub(tmp2, ch2, tmp1);
  __ mov(tmp4, -1); // all bits set
  __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
  if (str1_isL != str2_isL) {
    __ fmovd(ch1, v1); // move converted 4 symbols
  }
  __ BIND(L_SMALL_PROCEED);
  __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
  __ bic(tmp2, tmp2, ch2);
  __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
  // Bit-reverse the match info so that clz below finds the lowest set bit.
  __ rbit(tmp2, tmp2);
  __ br(__ EQ, NOMATCH);
  __ BIND(L_SMALL_HAS_ZERO_LOOP);
  __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
  __ cmp(cnt1, u1(wordSize/str2_chr_size));
  __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
  if (str2_isL) { // LL
    __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
    __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
    __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
    __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
    __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
  } else {
    __ mov(ch2, 0xE); // all bits in byte set except last one
    __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
    __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
    __ lslv(tmp2, tmp2, tmp4);
    __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
    __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
  }
  // Compare the first 8 bytes of the pattern with the candidate position.
  __ cmp(ch1, ch2);
  __ mov(tmp4, wordSize/str2_chr_size);
  __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
  __ BIND(L_SMALL_CMP_LOOP);
  // Compare the remaining characters one at a time; tmp4 is the char index.
  str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
           : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
  str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
           : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
  __ add(tmp4, tmp4, 1);
  __ cmp(tmp4, cnt1);
  __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
  __ cmp(first, ch2);
  __ br(__ EQ, L_SMALL_CMP_LOOP);
  __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
  __ cbz(tmp2, NOMATCH); // no more matches. exit
  __ clz(tmp4, tmp2);
  __ add(result, result, 1); // advance index
  __ add(str2, str2, str2_chr_size); // advance pointer
  __ b(L_SMALL_HAS_ZERO_LOOP);
  __ align(OptoLoopAlignment);
  __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
  __ cmp(first, ch2);
  __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
  __ b(DONE);
  __ align(OptoLoopAlignment);
  __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
  if (str2_isL) { // LL
    __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
    __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
    __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
    __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
    __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
  } else {
    __ mov(ch2, 0xE); // all bits in byte set except last one
    __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
    __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
    __ lslv(tmp2, tmp2, tmp4);
    __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
    __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
  }
  __ cmp(ch1, ch2);
  __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
  __ b(DONE);
  __ align(OptoLoopAlignment);
  __ BIND(L_HAS_ZERO);
  __ rbit(tmp2, tmp2);
  __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
  // Now, perform compression of counters(cnt2 and cnt1) into one register.
  // It's fine because both counters are 32bit and are not changed in this
  // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
  __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
  __ sub(result, result, 1);
  __ BIND(L_HAS_ZERO_LOOP);
  __ mov(cnt1, wordSize/str2_chr_size);
  // The upper half of cnt2 holds the original cnt1 (see compression above).
  __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
  __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
  if (str2_isL) {
    __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
    __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
    __ lslv(tmp2, tmp2, tmp4);
    __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    __ add(tmp4, tmp4, 1);
    __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    __ lsl(tmp2, tmp2, 1);
    __ mov(tmp4, wordSize/str2_chr_size);
  } else {
    __ mov(ch2, 0xE);
    __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
    __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
    __ lslv(tmp2, tmp2, tmp4);
    __ add(tmp4, tmp4, 1);
    __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
    __ lsl(tmp2, tmp2, 1);
    __ mov(tmp4, wordSize/str2_chr_size);
    __ sub(str2, str2, str2_chr_size);
  }
  __ cmp(ch1, ch2);
  // NOTE(review): tmp4 was already set to this value at the end of both
  // branches above, so this mov looks redundant -- confirm before removing.
  __ mov(tmp4, wordSize/str2_chr_size);
  __ br(__ NE, L_CMP_LOOP_NOMATCH);
  __ BIND(L_CMP_LOOP);
  // cnt1 is reused here as the scratch register for the pattern character.
  str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
           : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
  str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
           : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
  __ add(tmp4, tmp4, 1);
  __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
  __ br(__ GE, L_CMP_LOOP_LAST_CMP);
  __ cmp(cnt1, ch2);
  __ br(__ EQ, L_CMP_LOOP);
  __ BIND(L_CMP_LOOP_NOMATCH);
  // here we're not matched
  __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
  __ clz(tmp4, tmp2);
  __ add(str2, str2, str2_chr_size); // advance pointer
  __ b(L_HAS_ZERO_LOOP);
  __ align(OptoLoopAlignment);
  __ BIND(L_CMP_LOOP_LAST_CMP);
  __ cmp(cnt1, ch2);
  __ br(__ NE, L_CMP_LOOP_NOMATCH);
  __ b(DONE);
  __ align(OptoLoopAlignment);
  __ BIND(L_CMP_LOOP_LAST_CMP2);
  if (str2_isL) {
    __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
    __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
    __ lslv(tmp2, tmp2, tmp4);
    __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    __ add(tmp4, tmp4, 1);
    __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    __ lsl(tmp2, tmp2, 1);
  } else {
    __ mov(ch2, 0xE);
    __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
    __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
    __ lslv(tmp2, tmp2, tmp4);
    __ add(tmp4, tmp4, 1);
    __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
    __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
    __ lsl(tmp2, tmp2, 1);
    __ sub(str2, str2, str2_chr_size);
  }
  __ cmp(ch1, ch2);
  __ br(__ NE, L_CMP_LOOP_NOMATCH);
  __ b(DONE);
  __ align(OptoLoopAlignment);
  __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
  // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
  // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
  // so, result was increased at max by wordSize/str2_chr_size - 1, so,
  // respective high bit wasn't changed. L_LOOP_PROCEED will increase
  // result by analyzed characters value, so, we can just reset lower bits
  // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
  // 2) restore cnt1 and cnt2 values from "compressed" cnt2
  // 3) advance str2 value to represent next str2 octet. result & 7/3 is
  // index of last analyzed substring inside current octet. So, str2 is at
  // the respective start address. We need to advance it to the next octet
  __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
  __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
  __ bfm(result, zr, 0, 2 - str2_chr_shift);
  __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
  __ movw(cnt2, cnt2);
  __ b(L_LOOP_PROCEED);
  __ align(OptoLoopAlignment);
  __ BIND(NOMATCH);
  __ mov(result, -1);
  __ BIND(DONE);
  __ pop(spilled_regs, sp);
  __ ret(lr);

  // record the stub entry and end
  store_archive_data(stub_id, entry, __ pc());

  return entry;
}
10314
10315 void generate_string_indexof_stubs() {
10316 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
10317 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
10318 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
10319 }
10320
10321 void inflate_and_store_2_fp_registers(bool generatePrfm,
10322 FloatRegister src1, FloatRegister src2) {
10323 Register dst = r1;
10324 __ zip1(v1, __ T16B, src1, v0);
10325 __ zip2(v2, __ T16B, src1, v0);
10326 if (generatePrfm) {
10327 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
10328 }
10329 __ zip1(v3, __ T16B, src2, v0);
10330 __ zip2(v4, __ T16B, src2, v0);
10331 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
10332 }
10333
// Generates the stub that inflates a large byte array: every source byte is
// interleaved with a byte from v0 (documented below as zero) and the widened
// data is stored to dst.
//
// R0 = src
// R1 = dst
// R2 = len
// R3 = len >> 3
// V0 = 0
// v1 = loaded 8 bytes
// Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
//
// Returns the address handed back by load_archive_data() when that is
// non-null; otherwise the entry address of the freshly emitted stub.
address generate_large_byte_array_inflate() {
  StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  address entry = __ pc();
  Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
  // octetCounter counts the remaining 8-byte groups; len is not referenced
  // by the code emitted here.
  Register src = r0, dst = r1, len = r2, octetCounter = r3;
  // Octet count above which the prefetching loop is used.
  const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;

  // do one more 8-byte read to have address 16-byte aligned in most cases
  // also use single store instruction
  __ ldrd(v2, __ post(src, 8));
  // Two octets are consumed here: the one preloaded in v1 and the one in v2.
  __ sub(octetCounter, octetCounter, 2);
  __ zip1(v1, __ T16B, v1, v0);
  __ zip1(v2, __ T16B, v2, v0);
  __ st1(v1, v2, __ T16B, __ post(dst, 32));
  // Preload the first 64 source bytes for whichever loop is entered.
  __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
  __ subs(rscratch1, octetCounter, large_loop_threshold);
  __ br(__ LE, LOOP_START);
  __ b(LOOP_PRFM_START);
  __ bind(LOOP_PRFM);
  __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
  __ bind(LOOP_PRFM_START);
  // Prefetching loop: 64 source bytes (8 octets) per iteration. The flags
  // set by subs are consumed by the branch after the two helper calls.
  __ prfm(Address(src, SoftwarePrefetchHintDistance));
  __ sub(octetCounter, octetCounter, 8);
  __ subs(rscratch1, octetCounter, large_loop_threshold);
  inflate_and_store_2_fp_registers(true, v3, v4);
  inflate_and_store_2_fp_registers(true, v5, v6);
  __ br(__ GT, LOOP_PRFM);
  __ cmp(octetCounter, (u1)8);
  __ br(__ LT, DONE);
  __ bind(LOOP);
  __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
  __ bind(LOOP_START);
  // Non-prefetching loop: same 64-byte step; continues while at least
  // 8 octets remain after this iteration.
  __ sub(octetCounter, octetCounter, 8);
  __ cmp(octetCounter, (u1)8);
  inflate_and_store_2_fp_registers(false, v3, v4);
  inflate_and_store_2_fp_registers(false, v5, v6);
  __ br(__ GE, LOOP);
  __ bind(DONE);
  __ ret(lr);

  // record the stub entry and end
  store_archive_data(stub_id, entry, __ pc());

  return entry;
}
10394
/**
 * Emits the GHASH stub that consumes one 16-byte block per loop iteration.
 *
 * Arguments:
 *
 * Input:
 *   c_rarg0   - current state address
 *   c_rarg1   - H key address
 *   c_rarg2   - data address
 *   c_rarg3   - number of blocks
 *
 * Output:
 *   Updated state at c_rarg0
 *
 * Returns the stub entry address: either the one supplied by
 * load_archive_data() or the address of the freshly emitted code.
 *
 * NOTE(review): the emitted loop tests the block count only after a block
 * has been processed, so a count of zero is not handled by this stub;
 * presumably callers never pass zero -- confirm.
 */
address generate_ghash_processBlocks_small() {
  // Bafflingly, GCM uses little-endian for the byte order, but
  // big-endian for the bit order.  For example, the polynomial 1 is
  // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
  //
  // So, we must either reverse the bytes in each word and do
  // everything big-endian or reverse the bits in each byte and do
  // it little-endian.  On AArch64 it's more idiomatic to reverse
  // the bits in each byte (we have an instruction, RBIT, to do
  // that) and keep the data in little-endian bit order through the
  // calculation, bit-reversing the inputs and outputs.

  StubId stub_id = StubId::stubgen_ghash_processBlocks_small_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // If load_archive_data() supplies an entry address, use it and emit nothing.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  Label polynomial; // local data generated at end of stub
  start = __ pc();

  Register state   = c_rarg0;
  Register subkeyH = c_rarg1;
  Register data    = c_rarg2;
  Register blocks  = c_rarg3;

  FloatRegister vzr = v30;
  __ eor(vzr, __ T16B, vzr, vzr); // zero register

  // v24 <- the 16-byte constant emitted after the ret at the end of this stub
  __ adr(rscratch1, polynomial);
  __ ldrq(v24, rscratch1);    // The field polynomial

  __ ldrq(v0, Address(state));
  __ ldrq(v1, Address(subkeyH));

  __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
  __ rbit(v0, __ T16B, v0);
  __ rev64(v1, __ T16B, v1);
  __ rbit(v1, __ T16B, v1);

  __ ext(v4, __ T16B, v1, v1, 0x08); // v4 <- subkeyH (v1) with its two 64-bit halves swapped
  __ eor(v4, __ T16B, v4, v1);       // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

  {
    // do { ... } while (--blocks != 0): the body always runs at least once.
    Label L_ghash_loop;
    __ bind(L_ghash_loop);

    __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                               // reversing each byte
    __ rbit(v2, __ T16B, v2);
    __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

    // Multiply state in v2 by subkey in v1
    __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
                      /*temps*/v6, v3, /*reuse/clobber b*/v2);
    // Reduce v7:v5 by the field polynomial
    __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);

    __ sub(blocks, blocks, 1);
    __ cbnz(blocks, L_ghash_loop);
  }

  // The bit-reversed result is at this point in v0
  // Undo the input transformation before writing the state back.
  __ rev64(v0, __ T16B, v0);
  __ rbit(v0, __ T16B, v0);

  __ st1(v0, __ T16B, state);
  __ ret(lr);

  // bind label and generate local polynomial data
  // (aligned to 2 * wordSize before the two 64-bit words are emitted)
  __ align(wordSize * 2);
  __ bind(polynomial);
  __ emit_int64(0x87);  // The low-order bits of the field
                        // polynomial (i.e. p = z^7+z^2+z+1)
                        // repeated in the low and high parts of a
                        // 128-bit vector
  __ emit_int64(0x87);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
10494
// Emits the wide (unrolled) GHASH stub.
//
// The stub uses the same argument registers as the small stub (c_rarg0 =
// state, c_rarg1 = subkeyH, c_rarg2 = data, c_rarg3 = blocks).
//
// `small` is the address the emitted code branches to, both when fewer
// than unroll * 2 blocks are requested on entry and when blocks remain
// after the wide routine. Those are branches, not calls, so the code at
// `small` is what returns to the original caller in those cases.
//
// Returns the stub entry address: either the one supplied by
// load_archive_data() or the address of the freshly emitted code.
address generate_ghash_processBlocks(address small) {
  StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // If load_archive_data() supplies an entry address, use it and emit nothing.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  Label polynomial;           // local data generated after stub
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  Register state   = c_rarg0;
  Register subkeyH = c_rarg1;
  Register data    = c_rarg2;
  Register blocks  = c_rarg3;

  const int unroll = 4;

  // Fewer than unroll * 2 blocks: hand the whole request to `small`.
  __ cmp(blocks, (unsigned char)(unroll * 2));
  __ br(__ LT, small);

  if (unroll > 1) {
    // Save state before entering routine
    // NOTE(review): v8-v15 are presumably saved because the wide routine
    // clobbers them and the platform ABI treats them as callee-saved -- confirm.
    __ sub(sp, sp, 4 * 16);
    __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
    __ sub(sp, sp, 4 * 16);
    __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
  }

  __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);

  if (unroll > 1) {
    // And restore state
    // (popped in the reverse order of the pushes above)
    __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
    __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
  }

  // Any blocks still outstanding are finished by `small`.
  __ cmp(blocks, (unsigned char)0);
  __ br(__ GT, small);

  __ ret(lr);

  // bind label and generate polynomial data
  __ align(wordSize * 2);
  __ bind(polynomial);
  __ emit_int64(0x87);  // The low-order bits of the field
                        // polynomial (i.e. p = z^7+z^2+z+1)
                        // repeated in the low and high parts of a
                        // 128-bit vector
  __ emit_int64(0x87);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
10553
// Emits one SIMD Base64 encoding round.
//
// The round loads 3 * size source bytes from src (post-incremented),
// splits every 3-byte group into four 6-bit indices, translates the
// indices through the 4-register table starting at `codec`, and stores
// 4 * size bytes to dst (post-incremented).
//
// `size` selects the vector arrangement: 16 -> T16B, anything else -> T8B.
// Clobbers v4-v6 and v16-v23.
void generate_base64_encode_simdround(Register src, Register dst,
      FloatRegister codec, u8 size) {

  FloatRegister in0  = v4,  in1  = v5,  in2  = v6;
  FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
  FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;

  Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

  // De-interleave: in0/in1/in2 receive the 1st/2nd/3rd byte of each group.
  __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));

  // ind0 = top 6 bits of byte 0
  __ ushr(ind0, arrangement, in0, 2);

  // ind1 = low 2 bits of byte 0 : top 4 bits of byte 1
  __ ushr(ind1, arrangement, in1, 2);
  __ shl(in0, arrangement, in0, 6);
  __ orr(ind1, arrangement, ind1, in0);
  __ ushr(ind1, arrangement, ind1, 2);

  // ind2 = low 4 bits of byte 1 : top 2 bits of byte 2
  __ ushr(ind2, arrangement, in2, 4);
  __ shl(in1, arrangement, in1, 4);
  __ orr(ind2, arrangement, in1, ind2);
  __ ushr(ind2, arrangement, ind2, 2);

  // ind3 = low 6 bits of byte 2
  __ shl(ind3, arrangement, in2, 2);
  __ ushr(ind3, arrangement, ind3, 2);

  // Translate each 6-bit index through the 4-register table at codec.
  __ tbl(out0, arrangement, codec, 4, ind0);
  __ tbl(out1, arrangement, codec, 4, ind1);
  __ tbl(out2, arrangement, codec, 4, ind2);
  __ tbl(out3, arrangement, codec, 4, ind3);

  // Re-interleave the four output characters of each group.
  __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
}
10587
/**
 * Emits the Base64 encodeBlock stub.
 *
 *  Arguments:
 *
 *  Input:
 *  c_rarg0   - src_start
 *  c_rarg1   - src_offset
 *  c_rarg2   - source end offset (the number of bytes to encode is
 *              computed as c_rarg2 - c_rarg1)
 *  c_rarg3   - dest_start
 *  c_rarg4   - dest_offset
 *  c_rarg5   - isURL
 *
 * Returns the stub entry address: either the one supplied by
 * load_archive_data() or the address of the freshly emitted code.
 *
 * NOTE(review): the scalar loop subtracts 3 and repeats until the length
 * is exactly zero, and is entered unconditionally when the length is
 * below 24. A length of zero or one that is not a multiple of 3 would
 * therefore not terminate as intended; presumably callers guarantee a
 * positive multiple of 3 -- confirm.
 */
address generate_base64_encodeBlock() {

  StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // If load_archive_data() supplies an entry address, use it and emit nothing.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  Register src   = c_rarg0;  // source array
  Register soff  = c_rarg1;  // source start offset
  Register send  = c_rarg2;  // source end offset
  Register dst   = c_rarg3;  // dest array
  Register doff  = c_rarg4;  // position for writing to dest array
  Register isURL = c_rarg5;  // Base64 or URL character set

  // c_rarg6 and c_rarg7 are free to use as temps
  Register codec  = c_rarg6;
  Register length = c_rarg7;

  Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;

  // Turn (array, offset) pairs into running pointers; length = bytes to encode.
  __ add(src, src, soff);
  __ add(dst, dst, doff);
  __ sub(length, send, soff);

  // load the codec base address
  __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64));
  __ cbz(isURL, ProcessData);
  __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64URL));

  __ BIND(ProcessData);

  // too short to form a SIMD loop, fall back to the scalar loop
  __ cmp(length, (u1)24);
  __ br(Assembler::LT, Process3B);

  // Load the 64-byte translation table into v0-v3 for the SIMD rounds.
  __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));

  // 48 source bytes per iteration while at least 48 remain.
  __ BIND(Process48B);
  __ cmp(length, (u1)48);
  __ br(Assembler::LT, Process24B);
  generate_base64_encode_simdround(src, dst, v0, 16);
  __ sub(length, length, 48);
  __ b(Process48B);

  // At most one 24-byte round (fewer than 48 bytes remain here).
  __ BIND(Process24B);
  __ cmp(length, (u1)24);
  __ br(Assembler::LT, SIMDExit);
  generate_base64_encode_simdround(src, dst, v0, 8);
  __ sub(length, length, 24);

  __ BIND(SIMDExit);
  __ cbz(length, Exit);

  // Scalar loop: 3 source bytes -> 4 output characters per iteration.
  __ BIND(Process3B);
  // 3 src bytes, 24 bits
  __ ldrb(r10, __ post(src, 1));
  __ ldrb(r11, __ post(src, 1));
  __ ldrb(r12, __ post(src, 1));
  // r12 = byte0 << 16 | byte1 << 8 | byte2
  __ orrw(r11, r11, r10, Assembler::LSL, 8);
  __ orrw(r12, r12, r11, Assembler::LSL, 8);
  // codec index
  __ ubfmw(r15, r12, 18, 23);
  __ ubfmw(r14, r12, 12, 17);
  __ ubfmw(r13, r12, 6, 11);
  __ andw(r12, r12, 63);
  // get the code based on the codec
  __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
  __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
  __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
  __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
  __ strb(r15, __ post(dst, 1));
  __ strb(r14, __ post(dst, 1));
  __ strb(r13, __ post(dst, 1));
  __ strb(r12, __ post(dst, 1));
  __ sub(length, length, 3);
  __ cbnz(length, Process3B);

  __ BIND(Exit);
  __ ret(lr);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
10691
// Emits one SIMD Base64 decoding round.
//
// The round loads 4 * size input characters from src (post-incremented),
// translates them through the two 4-register tables starting at codecL
// and codecH, and packs every 4 decoded 6-bit values into 3 output bytes.
//
// If no illegal character is found, 3 * size bytes are stored to dst
// (post-incremented) and execution falls through to the code following
// this round. If an illegal character is found, the output bytes that
// precede it are stored and the emitted code branches to `Exit`.
//
// `size` selects the vector arrangement: 16 -> T16B, anything else -> T8B.
// v27 is read as the per-lane limit 63; this function does not set it
// (generate_base64_decodeBlock fills it with dup before its SIMD loops).
// Clobbers v16-v26, v28-v31, rscratch2 and r10-r13.
void generate_base64_decode_simdround(Register src, Register dst,
      FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {

  FloatRegister in0  = v16, in1  = v17,  in2 = v18,  in3 = v19;
  FloatRegister out0 = v20, out1 = v21, out2 = v22;

  FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
  FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;

  Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;

  Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;

  // De-interleave: in0..in3 receive the 1st..4th character of each group.
  __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));

  // we need unsigned saturating subtract, to make sure all input values
  // in range [0, 63] will have 0U value in the higher half lookup
  // (these four always use T16B, independent of `arrangement`)
  __ uqsubv(decH0, __ T16B, in0, v27);
  __ uqsubv(decH1, __ T16B, in1, v27);
  __ uqsubv(decH2, __ T16B, in2, v27);
  __ uqsubv(decH3, __ T16B, in3, v27);

  // lower half lookup
  __ tbl(decL0, arrangement, codecL, 4, in0);
  __ tbl(decL1, arrangement, codecL, 4, in1);
  __ tbl(decL2, arrangement, codecL, 4, in2);
  __ tbl(decL3, arrangement, codecL, 4, in3);

  // higher half lookup
  __ tbx(decH0, arrangement, codecH, 4, decH0);
  __ tbx(decH1, arrangement, codecH, 4, decH1);
  __ tbx(decH2, arrangement, codecH, 4, decH2);
  __ tbx(decH3, arrangement, codecH, 4, decH3);

  // combine lower and higher
  __ orr(decL0, arrangement, decL0, decH0);
  __ orr(decL1, arrangement, decL1, decH1);
  __ orr(decL2, arrangement, decL2, decH2);
  __ orr(decL3, arrangement, decL3, decH3);

  // check illegal inputs, value larger than 63 (maximum of 6 bits)
  // decH0..decH3 are reused as per-lane masks; in2 becomes their union
  // and rscratch2 is nonzero iff any lane was flagged.
  __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
  __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
  __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
  __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
  __ orr(in0, arrangement, decH0, decH1);
  __ orr(in1, arrangement, decH2, decH3);
  __ orr(in2, arrangement, in0, in1);
  __ umaxv(in3, arrangement, in2);
  __ umov(rscratch2, in3, __ B, 0);

  // get the data to output
  // out0 = d0 << 2 | d1 >> 4, out1 = d1 << 4 | d2 >> 2, out2 = d2 << 6 | d3
  __ shl(out0, arrangement, decL0, 2);
  __ ushr(out1, arrangement, decL1, 4);
  __ orr(out0, arrangement, out0, out1);
  __ shl(out1, arrangement, decL1, 4);
  __ ushr(out2, arrangement, decL2, 2);
  __ orr(out1, arrangement, out1, out2);
  __ shl(out2, arrangement, decL2, 6);
  __ orr(out2, arrangement, out2, decL3);

  __ cbz(rscratch2, NoIllegalData);

  // handle illegal input
  // r10 = error mask for lanes 0-7
  __ umov(r10, in2, __ D, 0);
  if (size == 16) {
    __ cbnz(r10, ErrorInLowerHalf);

    // illegal input is in higher half, store the lower half now.
    __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));

    // Continue with the mask and output bytes of lanes 8-15.
    __ umov(r10, in2, __ D, 1);
    __ umov(r11, out0, __ D, 1);
    __ umov(r12, out1, __ D, 1);
    __ umov(r13, out2, __ D, 1);
    __ b(StoreLegalData);

    __ BIND(ErrorInLowerHalf);
  }
  __ umov(r11, out0, __ D, 0);
  __ umov(r12, out1, __ D, 0);
  __ umov(r13, out2, __ D, 0);

  // Store one 3-byte group per iteration until the first flagged lane.
  // r10 is nonzero on every path that reaches this loop, so the branch
  // to Exit is eventually taken.
  __ BIND(StoreLegalData);
  __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
  __ strb(r11, __ post(dst, 1));
  __ strb(r12, __ post(dst, 1));
  __ strb(r13, __ post(dst, 1));
  __ lsr(r10, r10, 8);
  __ lsr(r11, r11, 8);
  __ lsr(r12, r12, 8);
  __ lsr(r13, r13, 8);
  __ b(StoreLegalData);

  __ BIND(NoIllegalData);
  __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
}
10789
10790
/**
 * Emits the Base64 decodeBlock stub.
 *
 *  Arguments:
 *
 *  Input:
 *  c_rarg0   - src_start
 *  c_rarg1   - src_offset
 *  c_rarg2   - source end offset (the number of bytes to decode is
 *              computed as c_rarg2 - c_rarg1, rounded down to a
 *              multiple of 4)
 *  c_rarg3   - dest_start
 *  c_rarg4   - dest_offset
 *  c_rarg5   - isURL
 *  c_rarg6   - isMIME
 *
 *  Output:
 *  c_rarg0   - number of bytes written to the destination (final dst
 *              pointer minus the initial dst pointer)
 *
 * Returns the stub entry address: either the one supplied by
 * load_archive_data() or the address of the freshly emitted code.
 */
address generate_base64_decodeBlock() {

  // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
  // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
  // titled "Base64 decoding".

  StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
  int entry_count = StubInfo::entry_count(stub_id);
  assert(entry_count == 1, "sanity check");
  // If load_archive_data() supplies an entry address, use it and emit nothing.
  address start = load_archive_data(stub_id);
  if (start != nullptr) {
    return start;
  }
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  Register src    = c_rarg0;  // source array
  Register soff   = c_rarg1;  // source start offset
  Register send   = c_rarg2;  // source end offset
  Register dst    = c_rarg3;  // dest array
  Register doff   = c_rarg4;  // position for writing to dest array
  Register isURL  = c_rarg5;  // Base64 or URL character set
  Register isMIME = c_rarg6;  // Decoding MIME block - unused in this implementation

  Register length = send;    // reuse send as length of source data to process

  // c_rarg6 is reused below because isMIME is never read.
  Register simd_codec   = c_rarg6;
  Register nosimd_codec = c_rarg7;

  Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;

  __ enter();

  __ add(src, src, soff);
  __ add(dst, dst, doff);

  // doff now remembers the initial dst pointer; used at Exit to compute
  // the number of bytes written.
  __ mov(doff, dst);

  __ sub(length, send, soff);
  // clear the low two bits: round length down to a multiple of 4
  __ bfm(length, zr, 0, 1);

  __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForNoSIMD));
  __ cbz(isURL, ProcessData);
  __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForNoSIMD));

  // rscratch1 is the byte counter of the scalar loop: the whole length
  // when it is below 144, otherwise 79 (see below).
  __ BIND(ProcessData);
  __ mov(rscratch1, length);
  __ cmp(length, (u1)144); // 144 = 80 + 64
  __ br(Assembler::LT, Process4B);

  // In the MIME case, the line length cannot be more than 76
  // bytes (see RFC 2045). This is too short a block for SIMD
  // to be worthwhile, so we use non-SIMD here.
  // Starting at 79 makes the scalar loop consume exactly 80 bytes and
  // leave rscratch1 == -1, which distinguishes this pass from a plain
  // scalar run (which ends with rscratch1 == 0).
  __ movw(rscratch1, 79);

  // Scalar loop: 4 input characters -> 3 output bytes per iteration.
  __ BIND(Process4B);
  __ ldrw(r14, __ post(src, 4));
  __ ubfxw(r10, r14, 0, 8);
  __ ubfxw(r11, r14, 8, 8);
  __ ubfxw(r12, r14, 16, 8);
  __ ubfxw(r13, r14, 24, 8);
  // get the de-code
  __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
  __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
  __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
  __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
  // error detection, 255u indicates an illegal input
  __ orrw(r14, r10, r11);
  __ orrw(r15, r12, r13);
  __ orrw(r14, r14, r15);
  __ tbnz(r14, 7, Exit);
  // recover the data
  __ lslw(r14, r10, 10);
  __ bfiw(r14, r11, 4, 6);
  __ bfmw(r14, r12, 2, 5);
  __ rev16w(r14, r14);
  __ bfiw(r13, r12, 6, 2);
  __ strh(r14, __ post(dst, 2));
  __ strb(r13, __ post(dst, 1));
  // non-simd loop
  __ subsw(rscratch1, rscratch1, 4);
  __ br(Assembler::GT, Process4B);

  // if exiting from the 80-byte pre-processing pass (counter started
  // at 79), rscratch1 == -1;
  // otherwise, rscratch1 == 0.
  __ cbzw(rscratch1, Exit);
  __ sub(length, length, 80);

  __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForSIMD));
  __ cbz(isURL, SIMDEnter);
  __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForSIMD));

  // Load the two 64-byte lookup tables into v0-v3 and v4-v7, and fill
  // every byte lane of v27 with 63 for the SIMD rounds.
  __ BIND(SIMDEnter);
  __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
  __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
  __ mov(rscratch1, 63);
  __ dup(v27, __ T16B, rscratch1);

  // 64 input characters per iteration while at least 64 remain.
  __ BIND(Process64B);
  __ cmp(length, (u1)64);
  __ br(Assembler::LT, Process32B);
  generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
  __ sub(length, length, 64);
  __ b(Process64B);

  // 32 input characters per iteration while at least 32 remain.
  __ BIND(Process32B);
  __ cmp(length, (u1)32);
  __ br(Assembler::LT, SIMDExit);
  generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
  __ sub(length, length, 32);
  __ b(Process32B);

  // Whatever is left goes back through the scalar loop.
  __ BIND(SIMDExit);
  __ cbz(length, Exit);
  __ movw(rscratch1, length);
  __ b(Process4B);

  __ BIND(Exit);
  // result = current dst - initial dst
  __ sub(c_rarg0, dst, doff);

  __ leave();
  __ ret(lr);

  // record the stub entry and end
  store_archive_data(stub_id, start, __ pc());

  return start;
}
10933
10934 // Support for spin waits.
10935 address generate_spin_wait() {
10936 StubId stub_id = StubId::stubgen_spin_wait_id;
10937 int entry_count = StubInfo::entry_count(stub_id);
10938 assert(entry_count == 1, "sanity check");
10939 address start = load_archive_data(stub_id);
10940 if (start != nullptr) {
10941 return start;
10942 }
10943 __ align(CodeEntryAlignment);
10944 StubCodeMark mark(this, stub_id);
10945 start = __ pc();
10946
10947 __ spin_wait();
10948 __ ret(lr);
10949
10950 // record the stub entry and end
10951 store_archive_data(stub_id, start, __ pc());
10952
10953 return start;
10954 }
10955
10956 void generate_lookup_secondary_supers_table_stub() {
10957 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10958 GrowableArray<address> entries;
10959 int entry_count = StubInfo::entry_count(stub_id);
10960 assert(entry_count == Klass::SECONDARY_SUPERS_TABLE_SIZE, "sanity check");
10961 address start = load_archive_data(stub_id, &entries);
10962 if (start != nullptr) {
10963 assert(entries.length() == Klass::SECONDARY_SUPERS_TABLE_SIZE - 1,
10964 "unexpected extra entry count %d", entries.length());
10965 StubRoutines::_lookup_secondary_supers_table_stubs[0] = start;
10966 for (int slot = 1; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10967 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = entries.at(slot - 1);
10968 }
10969 return;
10970 }
10971
10972 StubCodeMark mark(this, stub_id);
10973
10974 const Register
10975 r_super_klass = r0,
10976 r_array_base = r1,
10977 r_array_length = r2,
10978 r_array_index = r3,
10979 r_sub_klass = r4,
10980 r_bitmap = rscratch2,
10981 result = r5;
10982 const FloatRegister
10983 vtemp = v0;
10984
10985 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10986 address next_entry = __ pc();
10987 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = next_entry;
10988 if (slot == 0) {
10989 start = next_entry;
10990 } else {
10991 entries.append(next_entry);
10992 }
10993 Label L_success;
10994 __ enter();
10995 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10996 r_array_base, r_array_length, r_array_index,
10997 vtemp, result, slot,
10998 /*stub_is_near*/true);
10999 __ leave();
11000 __ ret(lr);
11001 }
11002 // record the stub entry and end plus all the auxiliary entries
11003 store_archive_data(stub_id, start, __ pc(), &entries);
11004 }
11005
11006 // Slow path implementation for UseSecondarySupersTable.
11007 address generate_lookup_secondary_supers_table_slow_path_stub() {
11008 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
11009 int entry_count = StubInfo::entry_count(stub_id);
11010 assert(entry_count == 1, "sanity check");
11011 address start = load_archive_data(stub_id);
11012 if (start != nullptr) {
11013 return start;
11014 }
11015 StubCodeMark mark(this, stub_id);
11016 start = __ pc();
11017 const Register
11018 r_super_klass = r0, // argument
11019 r_array_base = r1, // argument
11020 temp1 = r2, // temp
11021 r_array_index = r3, // argument
11022 r_bitmap = rscratch2, // argument
11023 result = r5; // argument
11024
11025 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
11026 __ ret(lr);
11027
11028 // record the stub entry and end
11029 store_archive_data(stub_id, start, __ pc());
11030
11031 return start;
11032 }
11033
11034 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11035
11036 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
11037 //
11038 // If LSE is in use, generate LSE versions of all the stubs. The
11039 // non-LSE versions are in atomic_aarch64.S.
11040
11041 // class AtomicStubMark records the entry point of a stub and the
11042 // stub pointer which will point to it. The stub pointer is set to
11043 // the entry point when ~AtomicStubMark() is called, which must be
11044 // after ICache::invalidate_range. This ensures safe publication of
11045 // the generated code.
11046 class AtomicStubMark {
11047 address _entry_point;
11048 aarch64_atomic_stub_t *_stub;
11049 MacroAssembler *_masm;
11050 public:
11051 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
11052 _masm = masm;
11053 __ align(32);
11054 _entry_point = __ pc();
11055 _stub = stub;
11056 }
11057 ~AtomicStubMark() {
11058 *_stub = (aarch64_atomic_stub_t)_entry_point;
11059 }
11060 };
11061
11062 // NB: For memory_order_conservative we need a trailing membar after
11063 // LSE atomic operations but not a leading membar.
11064 //
11065 // We don't need a leading membar because a clause in the Arm ARM
11066 // says:
11067 //
11068 // Barrier-ordered-before
11069 //
11070 // Barrier instructions order prior Memory effects before subsequent
11071 // Memory effects generated by the same Observer. A read or a write
// RW1 is Barrier-ordered-before a read or a write RW2 from the same
// Observer if and only if RW1 appears in program order before RW2
// and [ ... ] at least one of RW1 and RW2 is generated by an atomic
11075 // instruction with both Acquire and Release semantics.
11076 //
11077 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
11078 // and Release semantics, therefore we don't need a leading
11079 // barrier. However, there is no corresponding Barrier-ordered-after
11080 // relationship, therefore we need a trailing membar to prevent a
11081 // later store or load from being reordered with the store in an
11082 // atomic instruction.
11083 //
11084 // This was checked by using the herd7 consistency model simulator
11085 // (http://diy.inria.fr/) with this test case:
11086 //
11087 // AArch64 LseCas
11088 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
11089 // P0 | P1;
11090 // LDR W4, [X2] | MOV W3, #0;
11091 // DMB LD | MOV W4, #1;
11092 // LDR W3, [X1] | CASAL W3, W4, [X1];
11093 // | DMB ISH;
11094 // | STR W4, [X2];
11095 // exists
11096 // (0:X3=0 /\ 0:X4=1)
11097 //
11098 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
11099 // with the store to x in P1. Without the DMB in P1 this may happen.
11100 //
11101 // At the time of writing we don't know of any AArch64 hardware that
11102 // reorders stores in this way, but the Reference Manual permits it.
11103
11104 void gen_cas_entry(Assembler::operand_size size,
11105 atomic_memory_order order) {
11106 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
11107 exchange_val = c_rarg2;
11108 bool acquire, release;
11109 switch (order) {
11110 case memory_order_relaxed:
11111 acquire = false;
11112 release = false;
11113 break;
11114 case memory_order_release:
11115 acquire = false;
11116 release = true;
11117 break;
11118 default:
11119 acquire = true;
11120 release = true;
11121 break;
11122 }
11123 __ mov(prev, compare_val);
11124 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
11125 if (order == memory_order_conservative) {
11126 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11127 }
11128 if (size == Assembler::xword) {
11129 __ mov(r0, prev);
11130 } else {
11131 __ movw(r0, prev);
11132 }
11133 __ ret(lr);
11134 }
11135
11136 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
11137 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11138 // If not relaxed, then default to conservative. Relaxed is the only
11139 // case we use enough to be worth specializing.
11140 if (order == memory_order_relaxed) {
11141 __ ldadd(size, incr, prev, addr);
11142 } else {
11143 __ ldaddal(size, incr, prev, addr);
11144 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11145 }
11146 if (size == Assembler::xword) {
11147 __ mov(r0, prev);
11148 } else {
11149 __ movw(r0, prev);
11150 }
11151 __ ret(lr);
11152 }
11153
11154 void gen_swpal_entry(Assembler::operand_size size) {
11155 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11156 __ swpal(size, incr, prev, addr);
11157 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11158 if (size == Assembler::xword) {
11159 __ mov(r0, prev);
11160 } else {
11161 __ movw(r0, prev);
11162 }
11163 __ ret(lr);
11164 }
11165
// Emits the LSE versions of all atomic stubs and points the
// aarch64_atomic_*_impl function pointers at them.
//
// Nothing is done when UseLSE is false.
//
// aarch64_atomic_fetch_add_4_impl is the stub's primary entry; the other
// fifteen entry points are exchanged with the archive helpers through the
// auxiliary `entries` list. The order in which entries are read back in
// the load path below must match the order in which they are appended at
// the end of this function.
void generate_atomic_entry_points() {
  if (! UseLSE) {
    return;
  }
  StubId stub_id = StubId::stubgen_atomic_entry_points_id;
  GrowableArray<address> entries;
  int entry_count = StubInfo::entry_count(stub_id);
  address start = load_archive_data(stub_id, &entries);
  if (start != nullptr) {
    // Entries were supplied by load_archive_data(): install them and
    // emit no code.
    assert(entries.length() == entry_count - 1,
           "unexpected extra entry count %d", entries.length());
    aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)start;
    int idx = 0;
    aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_fetch_add_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_fetch_add_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_1_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_4_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_8_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_4_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    aarch64_atomic_cmpxchg_8_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
    assert(idx == entries.length(), "sanity!");
    return;
  }

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();
  address end;
  {
    // Each AtomicStubMark below records the entry point of the stub body
    // emitted right after it; the function pointers themselves are only
    // written when the marks are destroyed at the end of this scope.

    // ADD, memory_order_conservative
    AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
    gen_ldadd_entry(Assembler::word, memory_order_conservative);

    AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
    gen_ldadd_entry(Assembler::xword, memory_order_conservative);

    // ADD, memory_order_relaxed
    AtomicStubMark mark_fetch_add_4_relaxed
      (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
    gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);

    AtomicStubMark mark_fetch_add_8_relaxed
      (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
    gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);

    // XCHG, memory_order_conservative
    AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
    gen_swpal_entry(Assembler::word);

    AtomicStubMark mark_xchg_8(_masm, &aarch64_atomic_xchg_8_impl);
    gen_swpal_entry(Assembler::xword);

    // CAS, memory_order_conservative
    AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
    gen_cas_entry(MacroAssembler::byte, memory_order_conservative);

    AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_conservative);

    AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_conservative);

    // CAS, memory_order_relaxed
    AtomicStubMark mark_cmpxchg_1_relaxed
      (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
    gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);

    AtomicStubMark mark_cmpxchg_4_relaxed
      (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_relaxed);

    AtomicStubMark mark_cmpxchg_8_relaxed
      (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);

    // CAS, memory_order_release
    AtomicStubMark mark_cmpxchg_4_release
      (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_release);

    AtomicStubMark mark_cmpxchg_8_release
      (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_release);

    // CAS, memory_order_seq_cst
    AtomicStubMark mark_cmpxchg_4_seq_cst
      (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
    gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);

    AtomicStubMark mark_cmpxchg_8_seq_cst
      (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
    gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);

    end = __ pc();

    // Must precede the AtomicStubMark destructors that run at the
    // closing brace.
    ICache::invalidate_range(start, end - start);
    // exit block to force update of AtomicStubMark targets
  }

  assert(start == (address)aarch64_atomic_fetch_add_4_impl,
         "atomic stub should be at start of buffer");
  // record the stub start and end plus all the entries saved by the
  // AtomicStubMark destructor
  // (same order as the load path at the top of this function)
  entries.append((address)aarch64_atomic_fetch_add_8_impl);
  entries.append((address)aarch64_atomic_fetch_add_4_relaxed_impl);
  entries.append((address)aarch64_atomic_fetch_add_8_relaxed_impl);
  entries.append((address)aarch64_atomic_xchg_4_impl);
  entries.append((address)aarch64_atomic_xchg_8_impl);
  entries.append((address)aarch64_atomic_cmpxchg_1_impl);
  entries.append((address)aarch64_atomic_cmpxchg_4_impl);
  entries.append((address)aarch64_atomic_cmpxchg_8_impl);
  entries.append((address)aarch64_atomic_cmpxchg_1_relaxed_impl);
  entries.append((address)aarch64_atomic_cmpxchg_4_relaxed_impl);
  entries.append((address)aarch64_atomic_cmpxchg_8_relaxed_impl);
  entries.append((address)aarch64_atomic_cmpxchg_4_release_impl);
  entries.append((address)aarch64_atomic_cmpxchg_8_release_impl);
  entries.append((address)aarch64_atomic_cmpxchg_4_seq_cst_impl);
  entries.append((address)aarch64_atomic_cmpxchg_8_seq_cst_impl);

  assert(entries.length() == entry_count - 1,
         "unexpected extra entry count %d", entries.length());

  store_archive_data(stub_id, start, end, &entries);
}
11296 #endif // LINUX
11297
11298 address generate_cont_thaw(Continuation::thaw_kind kind) {
11299 bool return_barrier = Continuation::is_thaw_return_barrier(kind);
11300 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
11301
11302 address start = __ pc();
11303
11304 if (return_barrier) {
11305 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
11306 __ mov(sp, rscratch1);
11307 }
11308 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11309
11310 if (return_barrier) {
11311 // preserve possible return value from a method returning to the return barrier
11312 __ fmovd(rscratch1, v0);
11313 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
11314 }
11315
11316 __ movw(c_rarg1, (return_barrier ? 1 : 0));
11317 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
11318 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
11319
11320 if (return_barrier) {
11321 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11322 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
11323 __ fmovd(v0, rscratch1);
11324 }
11325 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11326
11327
11328 Label thaw_success;
11329 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
11330 __ cbnz(rscratch2, thaw_success);
11331 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
11332 __ br(rscratch1);
11333 __ bind(thaw_success);
11334
11335 // make room for the thawed frames
11336 __ sub(rscratch1, sp, rscratch2);
11337 __ andr(rscratch1, rscratch1, -16); // align
11338 __ mov(sp, rscratch1);
11339
11340 if (return_barrier) {
11341 // save original return value -- again
11342 __ fmovd(rscratch1, v0);
11343 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
11344 }
11345
11346 // If we want, we can templatize thaw by kind, and have three different entries
11347 __ movw(c_rarg1, (uint32_t)kind);
11348
11349 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
11350 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
11351
11352 if (return_barrier) {
11353 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11354 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
11355 __ fmovd(v0, rscratch1);
11356 } else {
11357 __ mov(r0, zr); // return 0 (success) from doYield
11358 }
11359
11360 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
11361 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
11362 __ mov(rfp, sp);
11363
11364 if (return_barrier_exception) {
11365 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
11366 __ authenticate_return_address(c_rarg1);
11367 __ verify_oop(r0);
11368 // save return value containing the exception oop in callee-saved R19
11369 __ mov(r19, r0);
11370
11371 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
11372
11373 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
11374 // __ reinitialize_ptrue();
11375
11376 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
11377
11378 __ mov(r1, r0); // the exception handler
11379 __ mov(r0, r19); // restore return value containing the exception oop
11380 __ verify_oop(r0);
11381
11382 __ leave();
11383 __ mov(r3, lr);
11384 __ br(r1); // the exception handler
11385 } else {
11386 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
11387 __ leave();
11388 __ ret(lr);
11389 }
11390
11391 return start;
11392 }
11393
11394 address generate_cont_thaw() {
11395 if (!Continuations::enabled()) return nullptr;
11396
11397 StubId stub_id = StubId::stubgen_cont_thaw_id;
11398 int entry_count = StubInfo::entry_count(stub_id);
11399 assert(entry_count == 1, "sanity check");
11400 address start = load_archive_data(stub_id);
11401 if (start != nullptr) {
11402 return start;
11403 }
11404 StubCodeMark mark(this, stub_id);
11405 start = __ pc();
11406 generate_cont_thaw(Continuation::thaw_top);
11407
11408 // record the stub start and end
11409 store_archive_data(stub_id, start, __ pc());
11410
11411 return start;
11412 }
11413
11414 address generate_cont_returnBarrier() {
11415 if (!Continuations::enabled()) return nullptr;
11416
11417 // TODO: will probably need multiple return barriers depending on return type
11418 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
11419 int entry_count = StubInfo::entry_count(stub_id);
11420 assert(entry_count == 1, "sanity check");
11421 address start = load_archive_data(stub_id);
11422 if (start != nullptr) {
11423 return start;
11424 }
11425 StubCodeMark mark(this, stub_id);
11426 start = __ pc();
11427
11428 generate_cont_thaw(Continuation::thaw_return_barrier);
11429
11430 // record the stub start and end
11431 store_archive_data(stub_id, start, __ pc());
11432
11433 return start;
11434 }
11435
11436 address generate_cont_returnBarrier_exception() {
11437 if (!Continuations::enabled()) return nullptr;
11438
11439 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
11440 int entry_count = StubInfo::entry_count(stub_id);
11441 assert(entry_count == 1, "sanity check");
11442 address start = load_archive_data(stub_id);
11443 if (start != nullptr) {
11444 return start;
11445 }
11446 StubCodeMark mark(this, stub_id);
11447 start = __ pc();
11448
11449 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
11450
11451 // record the stub start and end
11452 store_archive_data(stub_id, start, __ pc());
11453
11454 return start;
11455 }
11456
11457 address generate_cont_preempt_stub() {
11458 if (!Continuations::enabled()) return nullptr;
11459 StubId stub_id = StubId::stubgen_cont_preempt_id;
11460 int entry_count = StubInfo::entry_count(stub_id);
11461 assert(entry_count == 1, "sanity check");
11462 address start = load_archive_data(stub_id);
11463 if (start != nullptr) {
11464 return start;
11465 }
11466 StubCodeMark mark(this, stub_id);
11467 start = __ pc();
11468
11469 __ reset_last_Java_frame(true);
11470
11471 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
11472 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
11473 __ mov(sp, rscratch2);
11474
11475 Label preemption_cancelled;
11476 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
11477 __ cbnz(rscratch1, preemption_cancelled);
11478
11479 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
11480 SharedRuntime::continuation_enter_cleanup(_masm);
11481 __ leave();
11482 __ ret(lr);
11483
11484 // We acquired the monitor after freezing the frames so call thaw to continue execution.
11485 __ bind(preemption_cancelled);
11486 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
11487 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
11488 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
11489 __ ldr(rscratch1, Address(rscratch1));
11490 __ br(rscratch1);
11491
11492 // record the stub start and end
11493 store_archive_data(stub_id, start, __ pc());
11494
11495 return start;
11496 }
11497
11498 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
11499 // are represented as long[5], with BITS_PER_LIMB = 26.
11500 // Pack five 26-bit limbs into three 64-bit registers.
11501 void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
11502 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
11503 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
11504 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
11505 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
11506
11507 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
11508 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
11509 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
11510 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
11511
11512 if (dest2->is_valid()) {
11513 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
11514 } else {
11515 #ifdef ASSERT
11516 Label OK;
11517 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
11518 __ br(__ EQ, OK);
11519 __ stop("high bits of Poly1305 integer should be zero");
11520 __ should_not_reach_here();
11521 __ bind(OK);
11522 #endif
11523 }
11524 }
11525
11526 // As above, but return only a 128-bit integer, packed into two
11527 // 64-bit registers.
11528 void pack_26(Register dest0, Register dest1, Register src) {
11529 pack_26(dest0, dest1, noreg, src);
11530 }
11531
11532 // Multiply and multiply-accumulate unsigned 64-bit registers.
11533 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
11534 __ mul(prod_lo, n, m);
11535 __ umulh(prod_hi, n, m);
11536 }
11537 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
11538 wide_mul(rscratch1, rscratch2, n, m);
11539 __ adds(sum_lo, sum_lo, rscratch1);
11540 __ adc(sum_hi, sum_hi, rscratch2);
11541 }
11542
11543 // Poly1305, RFC 7539
11544
11545 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
11546 // description of the tricks used to simplify and accelerate this
11547 // computation.
11548
11549 address generate_poly1305_processBlocks() {
11550 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
11551 int entry_count = StubInfo::entry_count(stub_id);
11552 assert(entry_count == 1, "sanity check");
11553 address start = load_archive_data(stub_id);
11554 if (start != nullptr) {
11555 return start;
11556 }
11557 __ align(CodeEntryAlignment);
11558 StubCodeMark mark(this, stub_id);
11559 start = __ pc();
11560 Label here;
11561 __ enter();
11562 RegSet callee_saved = RegSet::range(r19, r28);
11563 __ push(callee_saved, sp);
11564
11565 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
11566
11567 // Arguments
11568 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
11569
11570 // R_n is the 128-bit randomly-generated key, packed into two
11571 // registers. The caller passes this key to us as long[5], with
11572 // BITS_PER_LIMB = 26.
11573 const Register R_0 = *++regs, R_1 = *++regs;
11574 pack_26(R_0, R_1, r_start);
11575
11576 // RR_n is (R_n >> 2) * 5
11577 const Register RR_0 = *++regs, RR_1 = *++regs;
11578 __ lsr(RR_0, R_0, 2);
11579 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
11580 __ lsr(RR_1, R_1, 2);
11581 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
11582
11583 // U_n is the current checksum
11584 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
11585 pack_26(U_0, U_1, U_2, acc_start);
11586
11587 static constexpr int BLOCK_LENGTH = 16;
11588 Label DONE, LOOP;
11589
11590 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
11591 __ br(Assembler::LT, DONE); {
11592 __ bind(LOOP);
11593
11594 // S_n is to be the sum of U_n and the next block of data
11595 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
11596 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
11597 __ adds(S_0, U_0, S_0);
11598 __ adcs(S_1, U_1, S_1);
11599 __ adc(S_2, U_2, zr);
11600 __ add(S_2, S_2, 1);
11601
11602 const Register U_0HI = *++regs, U_1HI = *++regs;
11603
11604 // NB: this logic depends on some of the special properties of
11605 // Poly1305 keys. In particular, because we know that the top
11606 // four bits of R_0 and R_1 are zero, we can add together
11607 // partial products without any risk of needing to propagate a
11608 // carry out.
11609 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
11610 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
11611 __ andr(U_2, R_0, 3);
11612 __ mul(U_2, S_2, U_2);
11613
11614 // Recycle registers S_0, S_1, S_2
11615 regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
11616
11617 // Partial reduction mod 2**130 - 5
11618 __ adds(U_1, U_0HI, U_1);
11619 __ adc(U_2, U_1HI, U_2);
11620 // Sum now in U_2:U_1:U_0.
11621 // Dead: U_0HI, U_1HI.
11622 regs = (regs.remaining() + U_0HI + U_1HI).begin();
11623
11624 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
11625
11626 // First, U_2:U_1:U_0 += (U_2 >> 2)
11627 __ lsr(rscratch1, U_2, 2);
11628 __ andr(U_2, U_2, (u8)3);
11629 __ adds(U_0, U_0, rscratch1);
11630 __ adcs(U_1, U_1, zr);
11631 __ adc(U_2, U_2, zr);
11632 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
11633 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
11634 __ adcs(U_1, U_1, zr);
11635 __ adc(U_2, U_2, zr);
11636
11637 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
11638 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
11639 __ br(~ Assembler::LT, LOOP);
11640 }
11641
11642 // Further reduce modulo 2^130 - 5
11643 __ lsr(rscratch1, U_2, 2);
11644 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
11645 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
11646 __ adcs(U_1, U_1, zr);
11647 __ andr(U_2, U_2, (u1)3);
11648 __ adc(U_2, U_2, zr);
11649
11650 // Unpack the sum into five 26-bit limbs and write to memory.
11651 __ ubfiz(rscratch1, U_0, 0, 26);
11652 __ ubfx(rscratch2, U_0, 26, 26);
11653 __ stp(rscratch1, rscratch2, Address(acc_start));
11654 __ ubfx(rscratch1, U_0, 52, 12);
11655 __ bfi(rscratch1, U_1, 12, 14);
11656 __ ubfx(rscratch2, U_1, 14, 26);
11657 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
11658 __ ubfx(rscratch1, U_1, 40, 24);
11659 __ bfi(rscratch1, U_2, 24, 3);
11660 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
11661
11662 __ bind(DONE);
11663 __ pop(callee_saved, sp);
11664 __ leave();
11665 __ ret(lr);
11666
11667 // record the stub start and end
11668 store_archive_data(stub_id, start, __ pc());
11669
11670 return start;
11671 }
11672
11673 // exception handler for upcall stubs
11674 address generate_upcall_stub_exception_handler() {
11675 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
11676 int entry_count = StubInfo::entry_count(stub_id);
11677 assert(entry_count == 1, "sanity check");
11678 address start = load_archive_data(stub_id);
11679 if (start != nullptr) {
11680 return start;
11681 }
11682 StubCodeMark mark(this, stub_id);
11683 start = __ pc();
11684
11685 // Native caller has no idea how to handle exceptions,
11686 // so we just crash here. Up to callee to catch exceptions.
11687 __ verify_oop(r0);
11688 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
11689 __ blr(rscratch1);
11690 __ should_not_reach_here();
11691
11692 // record the stub start and end
11693 store_archive_data(stub_id, start, __ pc());
11694
11695 return start;
11696 }
11697
11698 // load Method* target of MethodHandle
11699 // j_rarg0 = jobject receiver
11700 // rmethod = result
11701 address generate_upcall_stub_load_target() {
11702 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
11703 int entry_count = StubInfo::entry_count(stub_id);
11704 assert(entry_count == 1, "sanity check");
11705 address start = load_archive_data(stub_id);
11706 if (start != nullptr) {
11707 return start;
11708 }
11709 StubCodeMark mark(this, stub_id);
11710 start = __ pc();
11711
11712 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
11713 // Load target method from receiver
11714 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
11715 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
11716 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
11717 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
11718 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
11719 noreg, noreg);
11720 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
11721
11722 __ ret(lr);
11723
11724 // record the stub start and end
11725 store_archive_data(stub_id, start, __ pc());
11726
11727 return start;
11728 }
11729
11730 #undef __
11731 #define __ masm->
11732
11733 class MontgomeryMultiplyGenerator : public MacroAssembler {
11734
11735 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
11736 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
11737
11738 RegSet _toSave;
11739 bool _squaring;
11740
11741 public:
11742 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
11743 : MacroAssembler(as->code()), _squaring(squaring) {
11744
11745 // Register allocation
11746
11747 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
11748 Pa_base = *regs; // Argument registers
11749 if (squaring)
11750 Pb_base = Pa_base;
11751 else
11752 Pb_base = *++regs;
11753 Pn_base = *++regs;
11754 Rlen= *++regs;
11755 inv = *++regs;
11756 Pm_base = *++regs;
11757
11758 // Working registers:
11759 Ra = *++regs; // The current digit of a, b, n, and m.
11760 Rb = *++regs;
11761 Rm = *++regs;
11762 Rn = *++regs;
11763
11764 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
11765 Pb = *++regs;
11766 Pm = *++regs;
11767 Pn = *++regs;
11768
11769 t0 = *++regs; // Three registers which form a
11770 t1 = *++regs; // triple-precision accumuator.
11771 t2 = *++regs;
11772
11773 Ri = *++regs; // Inner and outer loop indexes.
11774 Rj = *++regs;
11775
11776 Rhi_ab = *++regs; // Product registers: low and high parts
11777 Rlo_ab = *++regs; // of a*b and m*n.
11778 Rhi_mn = *++regs;
11779 Rlo_mn = *++regs;
11780
11781 // r19 and up are callee-saved.
11782 _toSave = RegSet::range(r19, *regs) + Pm_base;
11783 }
11784
11785 private:
11786 void save_regs() {
11787 push(_toSave, sp);
11788 }
11789
11790 void restore_regs() {
11791 pop(_toSave, sp);
11792 }
11793
11794 template <typename T>
11795 void unroll_2(Register count, T block) {
11796 Label loop, end, odd;
11797 tbnz(count, 0, odd);
11798 cbz(count, end);
11799 align(16);
11800 bind(loop);
11801 (this->*block)();
11802 bind(odd);
11803 (this->*block)();
11804 subs(count, count, 2);
11805 br(Assembler::GT, loop);
11806 bind(end);
11807 }
11808
11809 template <typename T>
11810 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
11811 Label loop, end, odd;
11812 tbnz(count, 0, odd);
11813 cbz(count, end);
11814 align(16);
11815 bind(loop);
11816 (this->*block)(d, s, tmp);
11817 bind(odd);
11818 (this->*block)(d, s, tmp);
11819 subs(count, count, 2);
11820 br(Assembler::GT, loop);
11821 bind(end);
11822 }
11823
11824 void pre1(RegisterOrConstant i) {
11825 block_comment("pre1");
11826 // Pa = Pa_base;
11827 // Pb = Pb_base + i;
11828 // Pm = Pm_base;
11829 // Pn = Pn_base + i;
11830 // Ra = *Pa;
11831 // Rb = *Pb;
11832 // Rm = *Pm;
11833 // Rn = *Pn;
11834 ldr(Ra, Address(Pa_base));
11835 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11836 ldr(Rm, Address(Pm_base));
11837 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11838 lea(Pa, Address(Pa_base));
11839 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11840 lea(Pm, Address(Pm_base));
11841 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11842
11843 // Zero the m*n result.
11844 mov(Rhi_mn, zr);
11845 mov(Rlo_mn, zr);
11846 }
11847
11848 // The core multiply-accumulate step of a Montgomery
11849 // multiplication. The idea is to schedule operations as a
11850 // pipeline so that instructions with long latencies (loads and
11851 // multiplies) have time to complete before their results are
11852 // used. This most benefits in-order implementations of the
11853 // architecture but out-of-order ones also benefit.
11854 void step() {
11855 block_comment("step");
11856 // MACC(Ra, Rb, t0, t1, t2);
11857 // Ra = *++Pa;
11858 // Rb = *--Pb;
11859 umulh(Rhi_ab, Ra, Rb);
11860 mul(Rlo_ab, Ra, Rb);
11861 ldr(Ra, pre(Pa, wordSize));
11862 ldr(Rb, pre(Pb, -wordSize));
11863 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11864 // previous iteration.
11865 // MACC(Rm, Rn, t0, t1, t2);
11866 // Rm = *++Pm;
11867 // Rn = *--Pn;
11868 umulh(Rhi_mn, Rm, Rn);
11869 mul(Rlo_mn, Rm, Rn);
11870 ldr(Rm, pre(Pm, wordSize));
11871 ldr(Rn, pre(Pn, -wordSize));
11872 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11873 }
11874
11875 void post1() {
11876 block_comment("post1");
11877
11878 // MACC(Ra, Rb, t0, t1, t2);
11879 // Ra = *++Pa;
11880 // Rb = *--Pb;
11881 umulh(Rhi_ab, Ra, Rb);
11882 mul(Rlo_ab, Ra, Rb);
11883 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11884 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11885
11886 // *Pm = Rm = t0 * inv;
11887 mul(Rm, t0, inv);
11888 str(Rm, Address(Pm));
11889
11890 // MACC(Rm, Rn, t0, t1, t2);
11891 // t0 = t1; t1 = t2; t2 = 0;
11892 umulh(Rhi_mn, Rm, Rn);
11893
11894 #ifndef PRODUCT
11895 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11896 {
11897 mul(Rlo_mn, Rm, Rn);
11898 add(Rlo_mn, t0, Rlo_mn);
11899 Label ok;
11900 cbz(Rlo_mn, ok); {
11901 stop("broken Montgomery multiply");
11902 } bind(ok);
11903 }
11904 #endif
11905 // We have very carefully set things up so that
11906 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11907 // the lower half of Rm * Rn because we know the result already:
11908 // it must be -t0. t0 + (-t0) must generate a carry iff
11909 // t0 != 0. So, rather than do a mul and an adds we just set
11910 // the carry flag iff t0 is nonzero.
11911 //
11912 // mul(Rlo_mn, Rm, Rn);
11913 // adds(zr, t0, Rlo_mn);
11914 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11915 adcs(t0, t1, Rhi_mn);
11916 adc(t1, t2, zr);
11917 mov(t2, zr);
11918 }
11919
11920 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11921 block_comment("pre2");
11922 // Pa = Pa_base + i-len;
11923 // Pb = Pb_base + len;
11924 // Pm = Pm_base + i-len;
11925 // Pn = Pn_base + len;
11926
11927 if (i.is_register()) {
11928 sub(Rj, i.as_register(), len);
11929 } else {
11930 mov(Rj, i.as_constant());
11931 sub(Rj, Rj, len);
11932 }
11933 // Rj == i-len
11934
11935 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11936 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11937 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11938 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11939
11940 // Ra = *++Pa;
11941 // Rb = *--Pb;
11942 // Rm = *++Pm;
11943 // Rn = *--Pn;
11944 ldr(Ra, pre(Pa, wordSize));
11945 ldr(Rb, pre(Pb, -wordSize));
11946 ldr(Rm, pre(Pm, wordSize));
11947 ldr(Rn, pre(Pn, -wordSize));
11948
11949 mov(Rhi_mn, zr);
11950 mov(Rlo_mn, zr);
11951 }
11952
11953 void post2(RegisterOrConstant i, RegisterOrConstant len) {
11954 block_comment("post2");
11955 if (i.is_constant()) {
11956 mov(Rj, i.as_constant()-len.as_constant());
11957 } else {
11958 sub(Rj, i.as_register(), len);
11959 }
11960
11961 adds(t0, t0, Rlo_mn); // The pending m*n, low part
11962
11963 // As soon as we know the least significant digit of our result,
11964 // store it.
11965 // Pm_base[i-len] = t0;
11966 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11967
11968 // t0 = t1; t1 = t2; t2 = 0;
11969 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11970 adc(t1, t2, zr);
11971 mov(t2, zr);
11972 }
11973
11974 // A carry in t0 after Montgomery multiplication means that we
11975 // should subtract multiples of n from our result in m. We'll
11976 // keep doing that until there is no carry.
11977 void normalize(RegisterOrConstant len) {
11978 block_comment("normalize");
11979 // while (t0)
11980 // t0 = sub(Pm_base, Pn_base, t0, len);
11981 Label loop, post, again;
11982 Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11983 cbz(t0, post); {
11984 bind(again); {
11985 mov(i, zr);
11986 mov(cnt, len);
11987 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11988 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11989 subs(zr, zr, zr); // set carry flag, i.e. no borrow
11990 align(16);
11991 bind(loop); {
11992 sbcs(Rm, Rm, Rn);
11993 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11994 add(i, i, 1);
11995 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11996 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11997 sub(cnt, cnt, 1);
11998 } cbnz(cnt, loop);
11999 sbc(t0, t0, zr);
12000 } cbnz(t0, again);
12001 } bind(post);
12002 }
12003
12004 // Move memory at s to d, reversing words.
12005 // Increments d to end of copied memory
12006 // Destroys tmp1, tmp2
12007 // Preserves len
12008 // Leaves s pointing to the address which was in d at start
12009 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
12010 assert(tmp1->encoding() < r19->encoding(), "register corruption");
12011 assert(tmp2->encoding() < r19->encoding(), "register corruption");
12012
12013 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
12014 mov(tmp1, len);
12015 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
12016 sub(s, d, len, ext::uxtw, LogBytesPerWord);
12017 }
12018 // where
12019 void reverse1(Register d, Register s, Register tmp) {
12020 ldr(tmp, pre(s, -wordSize));
12021 ror(tmp, tmp, 32);
12022 str(tmp, post(d, wordSize));
12023 }
12024
12025 void step_squaring() {
12026 // An extra ACC
12027 step();
12028 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12029 }
12030
12031 void last_squaring(RegisterOrConstant i) {
12032 Label dont;
12033 // if ((i & 1) == 0) {
12034 tbnz(i.as_register(), 0, dont); {
12035 // MACC(Ra, Rb, t0, t1, t2);
12036 // Ra = *++Pa;
12037 // Rb = *--Pb;
12038 umulh(Rhi_ab, Ra, Rb);
12039 mul(Rlo_ab, Ra, Rb);
12040 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12041 } bind(dont);
12042 }
12043
12044 void extra_step_squaring() {
12045 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
12046
12047 // MACC(Rm, Rn, t0, t1, t2);
12048 // Rm = *++Pm;
12049 // Rn = *--Pn;
12050 umulh(Rhi_mn, Rm, Rn);
12051 mul(Rlo_mn, Rm, Rn);
12052 ldr(Rm, pre(Pm, wordSize));
12053 ldr(Rn, pre(Pn, -wordSize));
12054 }
12055
12056 void post1_squaring() {
12057 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
12058
12059 // *Pm = Rm = t0 * inv;
12060 mul(Rm, t0, inv);
12061 str(Rm, Address(Pm));
12062
12063 // MACC(Rm, Rn, t0, t1, t2);
12064 // t0 = t1; t1 = t2; t2 = 0;
12065 umulh(Rhi_mn, Rm, Rn);
12066
12067 #ifndef PRODUCT
12068 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
12069 {
12070 mul(Rlo_mn, Rm, Rn);
12071 add(Rlo_mn, t0, Rlo_mn);
12072 Label ok;
12073 cbz(Rlo_mn, ok); {
12074 stop("broken Montgomery multiply");
12075 } bind(ok);
12076 }
12077 #endif
12078 // We have very carefully set things up so that
12079 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
12080 // the lower half of Rm * Rn because we know the result already:
12081 // it must be -t0. t0 + (-t0) must generate a carry iff
12082 // t0 != 0. So, rather than do a mul and an adds we just set
12083 // the carry flag iff t0 is nonzero.
12084 //
12085 // mul(Rlo_mn, Rm, Rn);
12086 // adds(zr, t0, Rlo_mn);
12087 subs(zr, t0, 1); // Set carry iff t0 is nonzero
12088 adcs(t0, t1, Rhi_mn);
12089 adc(t1, t2, zr);
12090 mov(t2, zr);
12091 }
12092
12093 void acc(Register Rhi, Register Rlo,
12094 Register t0, Register t1, Register t2) {
12095 adds(t0, t0, Rlo);
12096 adcs(t1, t1, Rhi);
12097 adc(t2, t2, zr);
12098 }
12099
12100 public:
12101 /**
12102 * Fast Montgomery multiplication. The derivation of the
12103 * algorithm is in A Cryptographic Library for the Motorola
12104 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
12105 *
12106 * Arguments:
12107 *
12108 * Inputs for multiplication:
12109 * c_rarg0 - int array elements a
12110 * c_rarg1 - int array elements b
12111 * c_rarg2 - int array elements n (the modulus)
12112 * c_rarg3 - int length
12113 * c_rarg4 - int inv
12114 * c_rarg5 - int array elements m (the result)
12115 *
12116 * Inputs for squaring:
12117 * c_rarg0 - int array elements a
12118 * c_rarg1 - int array elements n (the modulus)
12119 * c_rarg2 - int length
12120 * c_rarg3 - int inv
12121 * c_rarg4 - int array elements m (the result)
12122 *
12123 */
12124 address generate_multiply() {
12125 Label argh, nothing;
12126
12127 align(CodeEntryAlignment);
12128 address entry = pc();
12129
12130 cbzw(Rlen, nothing);
12131
12132 enter();
12133
12134 // Make room.
12135 cmpw(Rlen, 512);
12136 br(Assembler::HI, argh);
12137 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
12138 andr(sp, Ra, -2 * wordSize);
12139
12140 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
12141
12142 {
12143 // Copy input args, reversing as we go. We use Ra as a
12144 // temporary variable.
12145 reverse(Ra, Pa_base, Rlen, t0, t1);
12146 if (!_squaring)
12147 reverse(Ra, Pb_base, Rlen, t0, t1);
12148 reverse(Ra, Pn_base, Rlen, t0, t1);
12149 }
12150
12151 // Push all call-saved registers and also Pm_base which we'll need
12152 // at the end.
12153 save_regs();
12154
12155 #ifndef PRODUCT
12156 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
12157 {
12158 ldr(Rn, Address(Pn_base, 0));
12159 mul(Rlo_mn, Rn, inv);
12160 subs(zr, Rlo_mn, -1);
12161 Label ok;
12162 br(EQ, ok); {
12163 stop("broken inverse in Montgomery multiply");
12164 } bind(ok);
12165 }
12166 #endif
12167
12168 mov(Pm_base, Ra);
12169
12170 mov(t0, zr);
12171 mov(t1, zr);
12172 mov(t2, zr);
12173
12174 block_comment("for (int i = 0; i < len; i++) {");
12175 mov(Ri, zr); {
12176 Label loop, end;
12177 cmpw(Ri, Rlen);
12178 br(Assembler::GE, end);
12179
12180 bind(loop);
12181 pre1(Ri);
12182
12183 block_comment(" for (j = i; j; j--) {"); {
12184 movw(Rj, Ri);
12185 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12186 } block_comment(" } // j");
12187
12188 post1();
12189 addw(Ri, Ri, 1);
12190 cmpw(Ri, Rlen);
12191 br(Assembler::LT, loop);
12192 bind(end);
12193 block_comment("} // i");
12194 }
12195
12196 block_comment("for (int i = len; i < 2*len; i++) {");
12197 mov(Ri, Rlen); {
12198 Label loop, end;
12199 cmpw(Ri, Rlen, Assembler::LSL, 1);
12200 br(Assembler::GE, end);
12201
12202 bind(loop);
12203 pre2(Ri, Rlen);
12204
12205 block_comment(" for (j = len*2-i-1; j; j--) {"); {
12206 lslw(Rj, Rlen, 1);
12207 subw(Rj, Rj, Ri);
12208 subw(Rj, Rj, 1);
12209 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12210 } block_comment(" } // j");
12211
12212 post2(Ri, Rlen);
12213 addw(Ri, Ri, 1);
12214 cmpw(Ri, Rlen, Assembler::LSL, 1);
12215 br(Assembler::LT, loop);
12216 bind(end);
12217 }
12218 block_comment("} // i");
12219
12220 normalize(Rlen);
12221
12222 mov(Ra, Pm_base); // Save Pm_base in Ra
12223 restore_regs(); // Restore caller's Pm_base
12224
12225 // Copy our result into caller's Pm_base
12226 reverse(Pm_base, Ra, Rlen, t0, t1);
12227
12228 leave();
12229 bind(nothing);
12230 ret(lr);
12231
12232 // handler for error case
12233 bind(argh);
12234 stop("MontgomeryMultiply total_allocation must be <= 8192");
12235
12236 return entry;
12237 }
12238 // In C, approximately:
12239
12240 // void
12241 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
12242 // julong Pn_base[], julong Pm_base[],
12243 // julong inv, int len) {
12244 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12245 // julong *Pa, *Pb, *Pn, *Pm;
12246 // julong Ra, Rb, Rn, Rm;
12247
12248 // int i;
12249
12250 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12251
12252 // for (i = 0; i < len; i++) {
12253 // int j;
12254
12255 // Pa = Pa_base;
12256 // Pb = Pb_base + i;
12257 // Pm = Pm_base;
12258 // Pn = Pn_base + i;
12259
12260 // Ra = *Pa;
12261 // Rb = *Pb;
12262 // Rm = *Pm;
12263 // Rn = *Pn;
12264
12265 // int iters = i;
12266 // for (j = 0; iters--; j++) {
12267 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12268 // MACC(Ra, Rb, t0, t1, t2);
12269 // Ra = *++Pa;
12270 // Rb = *--Pb;
12271 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12272 // MACC(Rm, Rn, t0, t1, t2);
12273 // Rm = *++Pm;
12274 // Rn = *--Pn;
12275 // }
12276
12277 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
12278 // MACC(Ra, Rb, t0, t1, t2);
12279 // *Pm = Rm = t0 * inv;
12280 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12281 // MACC(Rm, Rn, t0, t1, t2);
12282
12283 // assert(t0 == 0, "broken Montgomery multiply");
12284
12285 // t0 = t1; t1 = t2; t2 = 0;
12286 // }
12287
12288 // for (i = len; i < 2*len; i++) {
12289 // int j;
12290
12291 // Pa = Pa_base + i-len;
12292 // Pb = Pb_base + len;
12293 // Pm = Pm_base + i-len;
12294 // Pn = Pn_base + len;
12295
12296 // Ra = *++Pa;
12297 // Rb = *--Pb;
12298 // Rm = *++Pm;
12299 // Rn = *--Pn;
12300
12301 // int iters = len*2-i-1;
12302 // for (j = i-len+1; iters--; j++) {
12303 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12304 // MACC(Ra, Rb, t0, t1, t2);
12305 // Ra = *++Pa;
12306 // Rb = *--Pb;
12307 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12308 // MACC(Rm, Rn, t0, t1, t2);
12309 // Rm = *++Pm;
12310 // Rn = *--Pn;
12311 // }
12312
12313 // Pm_base[i-len] = t0;
12314 // t0 = t1; t1 = t2; t2 = 0;
12315 // }
12316
12317 // while (t0)
12318 // t0 = sub(Pm_base, Pn_base, t0, len);
12319 // }
12320
12321 /**
12322 * Fast Montgomery squaring. This uses asymptotically 25% fewer
12323 * multiplies than Montgomery multiplication so it should be up to
12324 * 25% faster. However, its loop control is more complex and it
12325 * may actually run slower on some machines.
12326 *
12327 * Arguments:
12328 *
12329 * Inputs:
12330 * c_rarg0 - int array elements a
12331 * c_rarg1 - int array elements n (the modulus)
12332 * c_rarg2 - int length
12333 * c_rarg3 - int inv
12334 * c_rarg4 - int array elements m (the result)
12335 *
12336 */
12337 address generate_square() {
12338 Label argh;
12339
12340 align(CodeEntryAlignment);
12341 address entry = pc();
12342
12343 enter();
12344
12345 // Make room.
12346 cmpw(Rlen, 512);
12347 br(Assembler::HI, argh);
12348 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
12349 andr(sp, Ra, -2 * wordSize);
12350
12351 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
12352
12353 {
12354 // Copy input args, reversing as we go. We use Ra as a
12355 // temporary variable.
12356 reverse(Ra, Pa_base, Rlen, t0, t1);
12357 reverse(Ra, Pn_base, Rlen, t0, t1);
12358 }
12359
12360 // Push all call-saved registers and also Pm_base which we'll need
12361 // at the end.
12362 save_regs();
12363
12364 mov(Pm_base, Ra);
12365
12366 mov(t0, zr);
12367 mov(t1, zr);
12368 mov(t2, zr);
12369
12370 block_comment("for (int i = 0; i < len; i++) {");
12371 mov(Ri, zr); {
12372 Label loop, end;
12373 bind(loop);
12374 cmp(Ri, Rlen);
12375 br(Assembler::GE, end);
12376
12377 pre1(Ri);
12378
12379 block_comment("for (j = (i+1)/2; j; j--) {"); {
12380 add(Rj, Ri, 1);
12381 lsr(Rj, Rj, 1);
12382 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12383 } block_comment(" } // j");
12384
12385 last_squaring(Ri);
12386
12387 block_comment(" for (j = i/2; j; j--) {"); {
12388 lsr(Rj, Ri, 1);
12389 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12390 } block_comment(" } // j");
12391
12392 post1_squaring();
12393 add(Ri, Ri, 1);
12394 cmp(Ri, Rlen);
12395 br(Assembler::LT, loop);
12396
12397 bind(end);
12398 block_comment("} // i");
12399 }
12400
12401 block_comment("for (int i = len; i < 2*len; i++) {");
12402 mov(Ri, Rlen); {
12403 Label loop, end;
12404 bind(loop);
12405 cmp(Ri, Rlen, Assembler::LSL, 1);
12406 br(Assembler::GE, end);
12407
12408 pre2(Ri, Rlen);
12409
12410 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
12411 lsl(Rj, Rlen, 1);
12412 sub(Rj, Rj, Ri);
12413 sub(Rj, Rj, 1);
12414 lsr(Rj, Rj, 1);
12415 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12416 } block_comment(" } // j");
12417
12418 last_squaring(Ri);
12419
12420 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
12421 lsl(Rj, Rlen, 1);
12422 sub(Rj, Rj, Ri);
12423 lsr(Rj, Rj, 1);
12424 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12425 } block_comment(" } // j");
12426
12427 post2(Ri, Rlen);
12428 add(Ri, Ri, 1);
12429 cmp(Ri, Rlen, Assembler::LSL, 1);
12430
12431 br(Assembler::LT, loop);
12432 bind(end);
12433 block_comment("} // i");
12434 }
12435
12436 normalize(Rlen);
12437
12438 mov(Ra, Pm_base); // Save Pm_base in Ra
12439 restore_regs(); // Restore caller's Pm_base
12440
12441 // Copy our result into caller's Pm_base
12442 reverse(Pm_base, Ra, Rlen, t0, t1);
12443
12444 leave();
12445 ret(lr);
12446
12447 // handler for error case
12448 bind(argh);
12449 stop("MontgomeryMultiply total_allocation must be <= 8192");
12450
12451 return entry;
12452 }
12453 // In C, approximately:
12454
12455 // void
12456 // montgomery_square(julong Pa_base[], julong Pn_base[],
12457 // julong Pm_base[], julong inv, int len) {
12458 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12459 // julong *Pa, *Pb, *Pn, *Pm;
12460 // julong Ra, Rb, Rn, Rm;
12461
12462 // int i;
12463
12464 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12465
12466 // for (i = 0; i < len; i++) {
12467 // int j;
12468
12469 // Pa = Pa_base;
12470 // Pb = Pa_base + i;
12471 // Pm = Pm_base;
12472 // Pn = Pn_base + i;
12473
12474 // Ra = *Pa;
12475 // Rb = *Pb;
12476 // Rm = *Pm;
12477 // Rn = *Pn;
12478
12479 // int iters = (i+1)/2;
12480 // for (j = 0; iters--; j++) {
12481 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12482 // MACC2(Ra, Rb, t0, t1, t2);
12483 // Ra = *++Pa;
12484 // Rb = *--Pb;
12485 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12486 // MACC(Rm, Rn, t0, t1, t2);
12487 // Rm = *++Pm;
12488 // Rn = *--Pn;
12489 // }
12490 // if ((i & 1) == 0) {
12491 // assert(Ra == Pa_base[j], "must be");
12492 // MACC(Ra, Ra, t0, t1, t2);
12493 // }
12494 // iters = i/2;
12495 // assert(iters == i-j, "must be");
12496 // for (; iters--; j++) {
12497 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12498 // MACC(Rm, Rn, t0, t1, t2);
12499 // Rm = *++Pm;
12500 // Rn = *--Pn;
12501 // }
12502
12503 // *Pm = Rm = t0 * inv;
12504 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12505 // MACC(Rm, Rn, t0, t1, t2);
12506
12507 // assert(t0 == 0, "broken Montgomery multiply");
12508
12509 // t0 = t1; t1 = t2; t2 = 0;
12510 // }
12511
12512 // for (i = len; i < 2*len; i++) {
12513 // int start = i-len+1;
12514 // int end = start + (len - start)/2;
12515 // int j;
12516
12517 // Pa = Pa_base + i-len;
12518 // Pb = Pa_base + len;
12519 // Pm = Pm_base + i-len;
12520 // Pn = Pn_base + len;
12521
12522 // Ra = *++Pa;
12523 // Rb = *--Pb;
12524 // Rm = *++Pm;
12525 // Rn = *--Pn;
12526
12527 // int iters = (2*len-i-1)/2;
12528 // assert(iters == end-start, "must be");
12529 // for (j = start; iters--; j++) {
12530 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12531 // MACC2(Ra, Rb, t0, t1, t2);
12532 // Ra = *++Pa;
12533 // Rb = *--Pb;
12534 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12535 // MACC(Rm, Rn, t0, t1, t2);
12536 // Rm = *++Pm;
12537 // Rn = *--Pn;
12538 // }
12539 // if ((i & 1) == 0) {
12540 // assert(Ra == Pa_base[j], "must be");
12541 // MACC(Ra, Ra, t0, t1, t2);
12542 // }
12543 // iters = (2*len-i)/2;
12544 // assert(iters == len-j, "must be");
12545 // for (; iters--; j++) {
12546 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12547 // MACC(Rm, Rn, t0, t1, t2);
12548 // Rm = *++Pm;
12549 // Rn = *--Pn;
12550 // }
12551 // Pm_base[i-len] = t0;
12552 // t0 = t1; t1 = t2; t2 = 0;
12553 // }
12554
12555 // while (t0)
12556 // t0 = sub(Pm_base, Pn_base, t0, len);
12557 // }
12558 };
12559
12560 // Initialization
void generate_preuniverse_stubs() {
  // Nothing to emit: aarch64 does not need any preuniverse stubs.
}
12564
12565 void generate_initial_stubs() {
12566 // Generate initial stubs and initializes the entry points
12567
12568 // entry points that exist in all platforms Note: This is code
12569 // that could be shared among different platforms - however the
12570 // benefit seems to be smaller than the disadvantage of having a
12571 // much more complicated generator structure. See also comment in
12572 // stubRoutines.hpp.
12573
12574 StubRoutines::_forward_exception_entry = generate_forward_exception();
12575
12576 StubRoutines::_call_stub_entry =
12577 generate_call_stub(StubRoutines::_call_stub_return_address);
12578
12579 // is referenced by megamorphic call
12580 StubRoutines::_catch_exception_entry = generate_catch_exception();
12581
12582 // Initialize table for copy memory (arraycopy) check.
12583 if (UnsafeMemoryAccess::_table == nullptr) {
12584 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
12585 }
12586
12587 if (UseCRC32Intrinsics) {
12588 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
12589 }
12590
12591 if (UseCRC32CIntrinsics) {
12592 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
12593 }
12594
12595 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
12596 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
12597 }
12598
12599 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
12600 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
12601 }
12602
12603 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
12604 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
12605 StubRoutines::_hf2f = generate_float16ToFloat();
12606 StubRoutines::_f2hf = generate_floatToFloat16();
12607 }
12608 }
12609
12610 void generate_continuation_stubs() {
12611 // Continuation stubs:
12612 StubRoutines::_cont_thaw = generate_cont_thaw();
12613 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
12614 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
12615 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
12616 }
12617
12618 void generate_final_stubs() {
12619 // support for verify_oop (must happen after universe_init)
12620 if (VerifyOops) {
12621 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
12622 }
12623
12624 // arraycopy stubs used by compilers
12625 generate_arraycopy_stubs();
12626
12627 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
12628
12629 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
12630
12631 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
12632 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
12633
12634 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
12635
12636 generate_atomic_entry_points();
12637
12638 #endif // LINUX
12639
12640 #ifdef COMPILER2
12641 if (UseSecondarySupersTable) {
12642 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
12643 if (! InlineSecondarySupersTest) {
12644 generate_lookup_secondary_supers_table_stub();
12645 }
12646 }
12647 #endif
12648
12649 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
12650 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
12651 }
12652
12653 StubRoutines::aarch64::set_completed(); // Inidicate that arraycopy and zero_blocks stubs are generated
12654 }
12655
void generate_compiler_stubs() {
  // Generates the stubs that back compiler intrinsics. The whole body
  // is compiled only when COMPILER2 is defined; most stubs are further
  // gated by their Use*Intrinsic(s) flag.
#ifdef COMPILER2

  if (UseSVE == 0) {
    generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
  }

  // Array equals stub for large arrays.
  if (!UseSimpleArrayEquals) {
    StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
  }

  // arrays_hashcode stubs for large arrays, one per element type.
  StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
  StubRoutines::aarch64::_large_arrays_hashcode_byte    = generate_large_arrays_hashcode(T_BYTE);
  StubRoutines::aarch64::_large_arrays_hashcode_char    = generate_large_arrays_hashcode(T_CHAR);
  StubRoutines::aarch64::_large_arrays_hashcode_int     = generate_large_arrays_hashcode(T_INT);
  StubRoutines::aarch64::_large_arrays_hashcode_short   = generate_large_arrays_hashcode(T_SHORT);

  // byte_array_inflate stub for large arrays.
  StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

  // countPositives stub for large arrays.
  StubRoutines::aarch64::_count_positives =
    generate_count_positives(StubRoutines::aarch64::_count_positives_long);

  generate_compare_long_strings();

  generate_string_indexof_stubs();

  // BigInteger arithmetic.
  if (UseMultiplyToLenIntrinsic) {
    StubRoutines::_multiplyToLen = generate_multiplyToLen();
  }

  if (UseSquareToLenIntrinsic) {
    StubRoutines::_squareToLen = generate_squareToLen();
  }

  if (UseMulAddIntrinsic) {
    StubRoutines::_mulAdd = generate_mulAdd();
  }

  if (UseSIMDForBigIntegerShiftIntrinsics) {
    StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
    StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
  }

  if (UseMontgomeryMultiplyIntrinsic) {
    StubId multiply_id = StubId::stubgen_montgomeryMultiply_id;
    address multiply_entry = load_archive_data(multiply_id);
    if (multiply_entry == nullptr) {
      // No archived copy is available, so generate the stub now.
      StubCodeMark mark(this, multiply_id);
      MontgomeryMultiplyGenerator generator(_masm, /*squaring*/false);
      multiply_entry = generator.generate_multiply();
      // Record where the stub starts and ends.
      store_archive_data(multiply_id, multiply_entry, _masm->pc());
    }
    StubRoutines::_montgomeryMultiply = multiply_entry;
  }

  if (UseMontgomerySquareIntrinsic) {
    StubId square_id = StubId::stubgen_montgomerySquare_id;
    address square_entry = load_archive_data(square_id);
    if (square_entry == nullptr) {
      // No archived copy is available, so generate the stub now.
      StubCodeMark mark(this, square_id);
      MontgomeryMultiplyGenerator generator(_masm, /*squaring*/true);
      // generate_multiply() is used in preference to generate_square()
      // because it's faster for the sizes of modulus we care about.
      square_entry = generator.generate_multiply();
      // Record where the stub starts and ends.
      store_archive_data(square_id, square_entry, _masm->pc());
    }
    StubRoutines::_montgomerySquare = square_entry;
  }

  if (UseChaCha20Intrinsics) {
    StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
  }

  if (UseKyberIntrinsics) {
    StubRoutines::_kyberNtt           = generate_kyberNtt();
    StubRoutines::_kyberInverseNtt    = generate_kyberInverseNtt();
    StubRoutines::_kyberNttMult       = generate_kyberNttMult();
    StubRoutines::_kyberAddPoly_2     = generate_kyberAddPoly_2();
    StubRoutines::_kyberAddPoly_3     = generate_kyberAddPoly_3();
    StubRoutines::_kyber12To16        = generate_kyber12To16();
    StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
  }

  if (UseDilithiumIntrinsics) {
    StubRoutines::_dilithiumAlmostNtt         = generate_dilithiumAlmostNtt();
    StubRoutines::_dilithiumAlmostInverseNtt  = generate_dilithiumAlmostInverseNtt();
    StubRoutines::_dilithiumNttMult           = generate_dilithiumNttMult();
    StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
    StubRoutines::_dilithiumDecomposePoly     = generate_dilithiumDecomposePoly();
  }

  if (UseBASE64Intrinsics) {
    StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
  }

  // Data cache line writeback.
  StubRoutines::_data_cache_writeback      = generate_data_cache_writeback();
  StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

  if (UseAESIntrinsics) {
    StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
    StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
    StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
    StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
  }

  if (UseGHASHIntrinsics) {
    // The small-input stub is generated first because its address is
    // passed to the generator of the main stub.
    StubRoutines::aarch64::_ghash_processBlocks_small = generate_ghash_processBlocks_small();
    StubRoutines::_ghash_processBlocks =
      generate_ghash_processBlocks(StubRoutines::aarch64::_ghash_processBlocks_small);
  }

  if (UseAESIntrinsics && UseGHASHIntrinsics) {
    StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
  }

  // Message digests: each one gets a single-block stub and a
  // multi-block (MB) stub from the same generator.
  if (UseMD5Intrinsics) {
    StubRoutines::_md5_implCompress   = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
    StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
  }

  if (UseSHA1Intrinsics) {
    StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
    StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
  }

  if (UseSHA256Intrinsics) {
    StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
    StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
  }

  if (UseSHA512Intrinsics) {
    StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
    StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
  }

  if (UseSHA3Intrinsics) {
    if (UseSIMDForSHA3Intrinsic) {
      StubRoutines::_double_keccak       = generate_double_keccak();
      StubRoutines::_sha3_implCompress   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
    } else {
      StubRoutines::_sha3_implCompress   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
      StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
    }
  }

  if (UsePoly1305Intrinsics) {
    StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
  }

  // Adler32 intrinsics code.
  if (UseAdler32Intrinsics) {
    StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
  }

#endif // COMPILER2
}
12815
12816 public:
12817 StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
12818 switch(blob_id) {
12819 case BlobId::stubgen_preuniverse_id:
12820 generate_preuniverse_stubs();
12821 break;
12822 case BlobId::stubgen_initial_id:
12823 generate_initial_stubs();
12824 break;
12825 case BlobId::stubgen_continuation_id:
12826 generate_continuation_stubs();
12827 break;
12828 case BlobId::stubgen_compiler_id:
12829 generate_compiler_stubs();
12830 break;
12831 case BlobId::stubgen_final_id:
12832 generate_final_stubs();
12833 break;
12834 default:
12835 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12836 break;
12837 };
12838 }
12839
12840 #if INCLUDE_CDS
12841 static void init_AOTAddressTable(GrowableArray<address>& external_addresses) {
12842 // external data defined in this file
12843 #define ADD(addr) external_addresses.append((address)(addr));
12844 ADD(_sha256_round_consts);
12845 ADD(_sha512_round_consts);
12846 ADD(_sha3_round_consts);
12847 ADD(_double_keccak_round_consts);
12848 ADD(_encodeBlock_toBase64);
12849 ADD(_encodeBlock_toBase64URL);
12850 ADD(_decodeBlock_fromBase64ForNoSIMD);
12851 ADD(_decodeBlock_fromBase64URLForNoSIMD);
12852 ADD(_decodeBlock_fromBase64ForSIMD);
12853 ADD(_decodeBlock_fromBase64URLForSIMD);
12854 #undef ADD
12855 }
12856 #endif // INCLUDE_CDS
12857 }; // end class declaration
12858
12859 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
12860 StubGenerator g(code, blob_id, stub_data);
12861 }
12862
12863 #if INCLUDE_CDS
12864 void StubGenerator_init_AOTAddressTable(GrowableArray<address>& addresses) {
12865 StubGenerator::init_AOTAddressTable(addresses);
12866 }
12867 #endif // INCLUDE_CDS
12868
#if defined (LINUX)

// Pointers to the atomic stubs, initialized to point to the code in
// atomic_aarch64.S.
//
// DEFAULT_ATOMIC_OP(OP, WIDTH, ORDER) does two things for the
// operation named aarch64_atomic_<OP>_<WIDTH><ORDER>:
//   - declares the extern "C" function ..._default_impl, and
//   - defines the stub pointer ..._impl, initialized to that function.
// ORDER may be left empty.

#define DEFAULT_ATOMIC_OP(OP, WIDTH, ORDER)                                         \
  extern "C" uint64_t aarch64_atomic_ ## OP ## _ ## WIDTH ## ORDER ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                             \
  aarch64_atomic_stub_t aarch64_atomic_ ## OP ## _ ## WIDTH ## ORDER ## _impl       \
    = aarch64_atomic_ ## OP ## _ ## WIDTH ## ORDER ## _default_impl;

// fetch_add
DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)

// xchg
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )

// cmpxchg
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX