1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "compiler/oopMap.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/interpreter.hpp"
36 #include "memory/universe.hpp"
37 #include "nativeInst_aarch64.hpp"
38 #include "oops/instanceOop.hpp"
39 #include "oops/method.hpp"
40 #include "oops/objArrayKlass.hpp"
41 #include "oops/oop.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "prims/upcallLinker.hpp"
44 #include "runtime/arguments.hpp"
45 #include "runtime/atomicAccess.hpp"
46 #include "runtime/continuation.hpp"
47 #include "runtime/continuationEntry.inline.hpp"
48 #include "runtime/frame.inline.hpp"
49 #include "runtime/handles.inline.hpp"
50 #include "runtime/javaThread.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubCodeGenerator.hpp"
53 #include "runtime/stubRoutines.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/checkedCast.hpp"
56 #include "utilities/debug.hpp"
57 #include "utilities/globalDefinitions.hpp"
58 #include "utilities/intpow.hpp"
59 #include "utilities/powerOfTwo.hpp"
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_ZGC
64 #include "gc/z/zThreadLocalData.hpp"
65 #endif
66
67 // Declaration and definition of StubGenerator (no .hpp file).
68 // For a more detailed description of the stub routine structure
69 // see the comment in stubRoutines.hpp
70
71 #undef __
72 #define __ _masm->
73
74 #ifdef PRODUCT
75 #define BLOCK_COMMENT(str) /* nothing */
76 #else
77 #define BLOCK_COMMENT(str) __ block_comment(str)
78 #endif
79
80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
81
82 // Constant data definitions
83
84 static const uint32_t _sha256_round_consts[64] = {
85 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
86 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
87 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
88 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
89 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
90 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
91 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
92 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
93 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
94 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
95 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
96 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
97 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
98 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
99 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
100 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
101 };
102
103 static const uint64_t _sha512_round_consts[80] = {
104 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
105 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
106 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
107 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
108 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
109 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
110 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
111 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
112 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
113 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
114 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
115 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
116 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
117 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
118 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
119 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
120 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
121 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
122 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
123 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
124 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
125 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
126 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
127 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
128 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
129 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
130 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
131 };
132
133 static const uint64_t _sha3_round_consts[24] = {
134 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
135 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
136 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
137 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
138 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
139 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
140 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
141 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
142 };
143
144 static const uint64_t _double_keccak_round_consts[24] = {
145 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
146 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
147 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
148 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
149 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
150 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
151 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
152 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
153 };
154
155 static const char _encodeBlock_toBase64[64] = {
156 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
157 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
158 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
159 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
160 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
161 };
162
163 static const char _encodeBlock_toBase64URL[64] = {
164 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
165 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
166 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
167 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
168 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
169 };
170
// The non-SIMD lookup tables are mostly dumped from the fromBase64 array used in
// java.util.Base64, except that the trailing character '=' is also treated as an
// illegal value in this intrinsic. That is, java.util.Base64.fromBase64['='] == -2,
// while fromBase(URL)64ForNoSIMD['='] == 255 here.
174 static const uint8_t _decodeBlock_fromBase64ForNoSIMD[256] = {
175 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
176 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
177 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
178 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
179 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
180 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
181 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
182 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
183 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
184 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
185 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
186 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
187 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
188 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
189 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
190 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
191 };
192
193 static const uint8_t _decodeBlock_fromBase64URLForNoSIMD[256] = {
194 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
195 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
196 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
197 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
198 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
199 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
200 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
201 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
202 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
203 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
204 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
205 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
206 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
207 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
208 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
209 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
210 };
211
// A legal base64 code value is in the range [0, 127]. We need two table
// lookups with tbl/tbx and combine them to get the decoded data. The 1st
// table vector lookup uses tbl; out-of-range indices are set to 0 in the
// destination. The 2nd table vector lookup uses tbx; out-of-range indices
// are left unchanged in the destination. Input [64..126] is mapped to index
// [65, 127] in the second lookup. The value at index 64 is set to 0, so that
// we know we already got the decoded data with the 1st lookup.
219 static const uint8_t _decodeBlock_fromBase64ForSIMD[128] = {
220 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
221 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
222 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
223 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
224 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
225 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
226 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
227 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
228 };
229
230 static const uint8_t _decodeBlock_fromBase64URLForSIMD[128] = {
231 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
232 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
233 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
234 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
235 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
236 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
237 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
238 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
239 };
240
241
242 // Stub Code definitions
243
244 class StubGenerator: public StubCodeGenerator {
245 private:
246
247 #ifdef PRODUCT
248 #define inc_counter_np(counter) ((void)0)
249 #else
250 void inc_counter_np_(uint& counter) {
251 __ incrementw(ExternalAddress((address)&counter));
252 }
253 #define inc_counter_np(counter) \
254 BLOCK_COMMENT("inc_counter " #counter); \
255 inc_counter_np_(counter);
256 #endif
257
258 // Call stubs are used to call Java from C
259 //
260 // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
269 //
270 // There is no return from the stub itself as any Java result
271 // is written to result
272 //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, installing sp (r31)
  // into fp.
276 //
277 // we save r0-r7, which accounts for all the c arguments.
278 //
279 // TODO: strictly do we need to save them all? they are treated as
280 // volatile by C so could we omit saving the ones we are going to
281 // place in global registers (thread? method?) or those we only use
282 // during setup of the Java call?
283 //
284 // we don't need to save r8 which C uses as an indirect result location
285 // return register.
286 //
287 // we don't need to save r9-r15 which both C and Java treat as
288 // volatile
289 //
290 // we don't need to save r16-18 because Java does not use them
291 //
292 // we save r19-r28 which Java uses as scratch registers and C
293 // expects to be callee-save
294 //
295 // we save the bottom 64 bits of each value stored in v8-v15; it is
296 // the responsibility of the caller to preserve larger values.
297 //
298 // so the stub frame looks like this when we enter Java code
299 //
300 // [ return_from_Java ] <--- sp
301 // [ argument word n ]
302 // ...
303 // -29 [ argument word 1 ]
304 // -28 [ saved Floating-point Control Register ]
305 // -26 [ saved v15 ] <--- sp_after_call
306 // -25 [ saved v14 ]
307 // -24 [ saved v13 ]
308 // -23 [ saved v12 ]
309 // -22 [ saved v11 ]
310 // -21 [ saved v10 ]
311 // -20 [ saved v9 ]
312 // -19 [ saved v8 ]
313 // -18 [ saved r28 ]
314 // -17 [ saved r27 ]
315 // -16 [ saved r26 ]
316 // -15 [ saved r25 ]
317 // -14 [ saved r24 ]
318 // -13 [ saved r23 ]
319 // -12 [ saved r22 ]
320 // -11 [ saved r21 ]
321 // -10 [ saved r20 ]
322 // -9 [ saved r19 ]
323 // -8 [ call wrapper (r0) ]
324 // -7 [ result (r1) ]
325 // -6 [ result type (r2) ]
326 // -5 [ method (r3) ]
327 // -4 [ entry point (r4) ]
328 // -3 [ parameters (r5) ]
329 // -2 [ parameter size (r6) ]
330 // -1 [ thread (r7) ]
331 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
332 // 1 [ saved lr (r30) ]
333
334 // Call stub stack layout word offsets from fp
335 enum call_stub_layout {
336 sp_after_call_off = -28,
337
338 fpcr_off = sp_after_call_off,
339 d15_off = -26,
340 d13_off = -24,
341 d11_off = -22,
342 d9_off = -20,
343
344 r28_off = -18,
345 r26_off = -16,
346 r24_off = -14,
347 r22_off = -12,
348 r20_off = -10,
349 call_wrapper_off = -8,
350 result_off = -7,
351 result_type_off = -6,
352 method_off = -5,
353 entry_point_off = -4,
354 parameter_size_off = -2,
355 thread_off = -1,
356 fp_f = 0,
357 retaddr_off = 1,
358 };
359
360 address generate_call_stub(address& return_address) {
361 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
362 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
363 "adjust this code");
364
365 StubId stub_id = StubId::stubgen_call_stub_id;
366 GrowableArray<address> entries;
367 int entry_count = StubInfo::entry_count(stub_id);
368 assert(entry_count == 2, "sanity check");
369 address start = load_archive_data(stub_id, &entries);
370 if (start != nullptr) {
371 assert(entries.length() == 1, "expected 1 extra entry");
372 return_address = entries.at(0);
373 return start;
374 }
375 StubCodeMark mark(this, stub_id);
376 start = __ pc();
377
378 const Address sp_after_call (rfp, sp_after_call_off * wordSize);
379
380 const Address fpcr_save (rfp, fpcr_off * wordSize);
381 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
382 const Address result (rfp, result_off * wordSize);
383 const Address result_type (rfp, result_type_off * wordSize);
384 const Address method (rfp, method_off * wordSize);
385 const Address entry_point (rfp, entry_point_off * wordSize);
386 const Address parameter_size(rfp, parameter_size_off * wordSize);
387
388 const Address thread (rfp, thread_off * wordSize);
389
390 const Address d15_save (rfp, d15_off * wordSize);
391 const Address d13_save (rfp, d13_off * wordSize);
392 const Address d11_save (rfp, d11_off * wordSize);
393 const Address d9_save (rfp, d9_off * wordSize);
394
395 const Address r28_save (rfp, r28_off * wordSize);
396 const Address r26_save (rfp, r26_off * wordSize);
397 const Address r24_save (rfp, r24_off * wordSize);
398 const Address r22_save (rfp, r22_off * wordSize);
399 const Address r20_save (rfp, r20_off * wordSize);
400
401 // stub code
402
403 address aarch64_entry = __ pc();
404
405 // set up frame and move sp to end of save area
406 __ enter();
407 __ sub(sp, rfp, -sp_after_call_off * wordSize);
408
409 // save register parameters and Java scratch/global registers
410 // n.b. we save thread even though it gets installed in
411 // rthread because we want to sanity check rthread later
412 __ str(c_rarg7, thread);
413 __ strw(c_rarg6, parameter_size);
414 __ stp(c_rarg4, c_rarg5, entry_point);
415 __ stp(c_rarg2, c_rarg3, result_type);
416 __ stp(c_rarg0, c_rarg1, call_wrapper);
417
418 __ stp(r20, r19, r20_save);
419 __ stp(r22, r21, r22_save);
420 __ stp(r24, r23, r24_save);
421 __ stp(r26, r25, r26_save);
422 __ stp(r28, r27, r28_save);
423
424 __ stpd(v9, v8, d9_save);
425 __ stpd(v11, v10, d11_save);
426 __ stpd(v13, v12, d13_save);
427 __ stpd(v15, v14, d15_save);
428
429 __ get_fpcr(rscratch1);
430 __ str(rscratch1, fpcr_save);
431 // Set FPCR to the state we need. We do want Round to Nearest. We
432 // don't want non-IEEE rounding modes or floating-point traps.
433 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
434 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
435 __ set_fpcr(rscratch1);
436
437 // install Java thread in global register now we have saved
438 // whatever value it held
439 __ mov(rthread, c_rarg7);
440 // And method
441 __ mov(rmethod, c_rarg3);
442
443 // set up the heapbase register
444 __ reinit_heapbase();
445
446 #ifdef ASSERT
447 // make sure we have no pending exceptions
448 {
449 Label L;
450 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
451 __ cmp(rscratch1, (u1)NULL_WORD);
452 __ br(Assembler::EQ, L);
453 __ stop("StubRoutines::call_stub: entered with pending exception");
454 __ BIND(L);
455 }
456 #endif
457 // pass parameters if any
458 __ mov(esp, sp);
459 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
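    // round the new sp down to a 16-byte boundary (sp must stay 16-byte aligned)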
460 __ andr(sp, rscratch1, -2 * wordSize);
461
462 BLOCK_COMMENT("pass parameters if any");
463 Label parameters_done;
464 // parameter count is still in c_rarg6
465 // and parameter pointer identifying param 1 is in c_rarg5
466 __ cbzw(c_rarg6, parameters_done);
467
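    // copy the parameters one word at a time from the parameter array onto the
    // stack area carved out above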
468 address loop = __ pc();
469 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
470 __ subsw(c_rarg6, c_rarg6, 1);
471 __ push(rscratch1);
472 __ br(Assembler::GT, loop);
473
474 __ BIND(parameters_done);
475
    // call Java entry -- passing Method* and current sp
477 // rmethod: Method*
478 // r19_sender_sp: sender sp
479 BLOCK_COMMENT("call Java function");
480 __ mov(r19_sender_sp, sp);
481 __ blr(c_rarg4);
482
483 // we do this here because the notify will already have been done
484 // if we get to the next instruction via an exception
485 //
486 // n.b. adding this instruction here affects the calculation of
487 // whether or not a routine returns to the call stub (used when
488 // doing stack walks) since the normal test is to check the return
489 // pc against the address saved below. so we may need to allow for
490 // this extra instruction in the check.
491
492 // save current address for use by exception handling code
493
494 return_address = __ pc();
495 entries.append(return_address);
496
497 // store result depending on type (everything that is not
498 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
499 // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0.
501 // All of j_rargN may be used to return inline type fields so be careful
502 // not to clobber those.
503 // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
504 // assignment of Rresult below.
505 Register Rresult = r14, Rresult_type = r15;
506 __ ldr(Rresult, result);
507 Label is_long, is_float, is_double, check_prim, exit;
508 __ ldr(Rresult_type, result_type);
509 __ cmp(Rresult_type, (u1)T_OBJECT);
510 __ br(Assembler::EQ, check_prim);
511 __ cmp(Rresult_type, (u1)T_LONG);
512 __ br(Assembler::EQ, is_long);
513 __ cmp(Rresult_type, (u1)T_FLOAT);
514 __ br(Assembler::EQ, is_float);
515 __ cmp(Rresult_type, (u1)T_DOUBLE);
516 __ br(Assembler::EQ, is_double);
517
518 // handle T_INT case
519 __ strw(r0, Address(Rresult));
520
521 __ BIND(exit);
522
523 // pop parameters
524 __ sub(esp, rfp, -sp_after_call_off * wordSize);
525
526 #ifdef ASSERT
527 // verify that threads correspond
528 {
529 Label L, S;
530 __ ldr(rscratch1, thread);
531 __ cmp(rthread, rscratch1);
532 __ br(Assembler::NE, S);
533 __ get_thread(rscratch1);
534 __ cmp(rthread, rscratch1);
535 __ br(Assembler::EQ, L);
536 __ BIND(S);
537 __ stop("StubRoutines::call_stub: threads must correspond");
538 __ BIND(L);
539 }
540 #endif
541
542 __ pop_cont_fastpath(rthread);
543
544 // restore callee-save registers
545 __ ldpd(v15, v14, d15_save);
546 __ ldpd(v13, v12, d13_save);
547 __ ldpd(v11, v10, d11_save);
548 __ ldpd(v9, v8, d9_save);
549
550 __ ldp(r28, r27, r28_save);
551 __ ldp(r26, r25, r26_save);
552 __ ldp(r24, r23, r24_save);
553 __ ldp(r22, r21, r22_save);
554 __ ldp(r20, r19, r20_save);
555
556 // restore fpcr
557 __ ldr(rscratch1, fpcr_save);
558 __ set_fpcr(rscratch1);
559
560 __ ldp(c_rarg0, c_rarg1, call_wrapper);
561 __ ldrw(c_rarg2, result_type);
562 __ ldr(c_rarg3, method);
563 __ ldp(c_rarg4, c_rarg5, entry_point);
564 __ ldp(c_rarg6, c_rarg7, parameter_size);
565
566 // leave frame and return to caller
567 __ leave();
568 __ ret(lr);
569
570 // handle return types different from T_INT
571 __ BIND(check_prim);
572 if (InlineTypeReturnedAsFields) {
573 // Check for scalarized return value
574 __ tbz(r0, 0, is_long);
575 // Load pack handler address
576 __ andr(rscratch1, r0, -2);
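      // the andr above clears the low tag bit to recover the klass pointer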
577 __ ldr(rscratch1, Address(rscratch1, InlineKlass::adr_members_offset()));
578 __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
579 __ blr(rscratch1);
580 __ b(exit);
581 }
582
583 __ BIND(is_long);
584 __ str(r0, Address(Rresult, 0));
585 __ br(Assembler::AL, exit);
586
587 __ BIND(is_float);
588 __ strs(j_farg0, Address(Rresult, 0));
589 __ br(Assembler::AL, exit);
590
591 __ BIND(is_double);
592 __ strd(j_farg0, Address(Rresult, 0));
593 __ br(Assembler::AL, exit);
594
595 // record the stub entry and end plus the auxiliary entry
596 store_archive_data(stub_id, start, __ pc(), &entries);
597
598 return start;
599 }
600
601 // Return point for a Java call if there's an exception thrown in
602 // Java code. The exception is caught and transformed into a
603 // pending exception stored in JavaThread that can be tested from
604 // within the VM.
605 //
606 // Note: Usually the parameters are removed by the callee. In case
607 // of an exception crossing an activation frame boundary, that is
608 // not the case if the callee is compiled code => need to setup the
609 // rsp.
610 //
611 // r0: exception oop
612
613 address generate_catch_exception() {
614 StubId stub_id = StubId::stubgen_catch_exception_id;
615 int entry_count = StubInfo::entry_count(stub_id);
616 assert(entry_count == 1, "sanity check");
617 address start = load_archive_data(stub_id);
618 if (start != nullptr) {
619 return start;
620 }
621 StubCodeMark mark(this, stub_id);
622 start = __ pc();
623
624 // same as in generate_call_stub():
625 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
626 const Address thread (rfp, thread_off * wordSize);
627
628 #ifdef ASSERT
629 // verify that threads correspond
630 {
631 Label L, S;
632 __ ldr(rscratch1, thread);
633 __ cmp(rthread, rscratch1);
634 __ br(Assembler::NE, S);
635 __ get_thread(rscratch1);
636 __ cmp(rthread, rscratch1);
637 __ br(Assembler::EQ, L);
638 __ bind(S);
639 __ stop("StubRoutines::catch_exception: threads must correspond");
640 __ bind(L);
641 }
642 #endif
643
644 // set pending exception
645 __ verify_oop(r0);
646
647 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
648 // special case -- add file name string to AOT address table
649 address file = (address)AOTCodeCache::add_C_string(__FILE__);
650 __ lea(rscratch1, ExternalAddress(file));
651 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
652 __ movw(rscratch1, (int)__LINE__);
653 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
654
655 // complete return to VM
656 assert(StubRoutines::_call_stub_return_address != nullptr,
657 "_call_stub_return_address must have been generated before");
658 __ b(RuntimeAddress(StubRoutines::_call_stub_return_address));
659
660 // record the stub entry and end
661 store_archive_data(stub_id, start, __ pc());
662
663 return start;
664 }
665
666 // Continuation point for runtime calls returning with a pending
667 // exception. The pending exception check happened in the runtime
668 // or native call stub. The pending exception in Thread is
669 // converted into a Java-level exception.
670 //
671 // Contract with Java-level exception handlers:
672 // r0: exception
673 // r3: throwing pc
674 //
675 // NOTE: At entry of this stub, exception-pc must be in LR !!
676
677 // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
679
680 address generate_forward_exception() {
681 StubId stub_id = StubId::stubgen_forward_exception_id;
682 int entry_count = StubInfo::entry_count(stub_id);
683 assert(entry_count == 1, "sanity check");
684 address start = load_archive_data(stub_id);
685 if (start != nullptr) {
686 return start;
687 }
688 StubCodeMark mark(this, stub_id);
689 start = __ pc();
690
691 // Upon entry, LR points to the return address returning into
692 // Java (interpreted or compiled) code; i.e., the return address
693 // becomes the throwing pc.
694 //
695 // Arguments pushed before the runtime call are still on the stack
696 // but the exception handler will reset the stack pointer ->
697 // ignore them. A potential result in registers can be ignored as
698 // well.
699
700 #ifdef ASSERT
701 // make sure this code is only executed if there is a pending exception
702 {
703 Label L;
704 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
705 __ cbnz(rscratch1, L);
706 __ stop("StubRoutines::forward exception: no pending exception (1)");
707 __ bind(L);
708 }
709 #endif
710
711 // compute exception handler into r19
712
713 // call the VM to find the handler address associated with the
714 // caller address. pass thread in r0 and caller pc (ret address)
715 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
716 // the stack.
717 __ mov(c_rarg1, lr);
718 // lr will be trashed by the VM call so we move it to R19
719 // (callee-saved) because we also need to pass it to the handler
720 // returned by this call.
721 __ mov(r19, lr);
722 BLOCK_COMMENT("call exception_handler_for_return_address");
723 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
724 SharedRuntime::exception_handler_for_return_address),
725 rthread, c_rarg1);
726 // Reinitialize the ptrue predicate register, in case the external runtime
727 // call clobbers ptrue reg, as we may return to SVE compiled code.
728 __ reinitialize_ptrue();
729
730 // we should not really care that lr is no longer the callee
731 // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
734 // the PC for the frame above the handler belongs to a compiled
735 // Java method. So, we restore lr here to satisfy that assert.
736 __ mov(lr, r19);
737 // setup r0 & r3 & clear pending exception
738 __ mov(r3, r19);
739 __ mov(r19, r0);
740 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
741 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
742
743 #ifdef ASSERT
744 // make sure exception is set
745 {
746 Label L;
747 __ cbnz(r0, L);
748 __ stop("StubRoutines::forward exception: no pending exception (2)");
749 __ bind(L);
750 }
751 #endif
752
753 // continue at exception handler
754 // r0: exception
755 // r3: throwing pc
756 // r19: exception handler
757 __ verify_oop(r0);
758 __ br(r19);
759
760 // record the stub entry and end
761 store_archive_data(stub_id, start, __ pc());
762
763 return start;
764 }
765
766 // Non-destructive plausibility checks for oops
767 //
768 // Arguments:
769 // r0: oop to verify
770 // rscratch1: error message
771 //
772 // Stack after saving c_rarg3:
773 // [tos + 0]: saved c_rarg3
774 // [tos + 1]: saved c_rarg2
775 // [tos + 2]: saved lr
776 // [tos + 3]: saved rscratch2
777 // [tos + 4]: saved r0
778 // [tos + 5]: saved rscratch1
779 address generate_verify_oop() {
780 StubId stub_id = StubId::stubgen_verify_oop_id;
781 int entry_count = StubInfo::entry_count(stub_id);
782 assert(entry_count == 1, "sanity check");
783 address start = load_archive_data(stub_id);
784 if (start != nullptr) {
785 return start;
786 }
787 StubCodeMark mark(this, stub_id);
788 start = __ pc();
789
790 Label exit, error;
791
792 // save c_rarg2 and c_rarg3
793 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
794
795 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
796 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
797 __ ldr(c_rarg3, Address(c_rarg2));
798 __ add(c_rarg3, c_rarg3, 1);
799 __ str(c_rarg3, Address(c_rarg2));
800
801 // object is in r0
802 // make sure object is 'reasonable'
803 __ cbz(r0, exit); // if obj is null it is OK
804
805 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
806 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
807
808 // return if everything seems ok
809 __ bind(exit);
810
811 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
812 __ ret(lr);
813
814 // handle errors
815 __ bind(error);
816 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
817
818 __ push(RegSet::range(r0, r29), sp);
819 // debug(char* msg, int64_t pc, int64_t regs[])
820 __ mov(c_rarg0, rscratch1); // pass address of error message
821 __ mov(c_rarg1, lr); // pass return address
822 __ mov(c_rarg2, sp); // pass address of regs on stack
823 #ifndef PRODUCT
824 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
825 #endif
826 BLOCK_COMMENT("call MacroAssembler::debug");
827 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
828 __ blr(rscratch1);
829 __ hlt(0);
830
831 // record the stub entry and end
832 store_archive_data(stub_id, start, __ pc());
833
834 return start;
835 }
836
837 // Generate indices for iota vector.
838 void generate_iota_indices(StubId stub_id) {
839 GrowableArray<address> entries;
840 int entry_count = StubInfo::entry_count(stub_id);
841 assert(entry_count == VECTOR_IOTA_COUNT, "sanity check");
842 address start = load_archive_data(stub_id, &entries);
843 if (start != nullptr) {
844 assert(entries.length() == entry_count - 1,
845 "unexpected entries count %d", entries.length());
846 StubRoutines::aarch64::_vector_iota_indices[0] = start;
847 for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
848 StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
849 }
850 return;
851 }
852 __ align(CodeEntryAlignment);
853 StubCodeMark mark(this, stub_id);
854 start = __ pc();
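    // each entry below is a 16-byte constant holding per-lane indices 0, 1, 2, ...
    // for one element size (B/H/S/D), or per-lane values 0.0, 1.0, ... for the FP
    // variants; the address of each entry is recorded for the entry table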
855 // B
856 __ emit_data64(0x0706050403020100, relocInfo::none);
857 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
858 entries.append(__ pc());
859 // H
860 __ emit_data64(0x0003000200010000, relocInfo::none);
861 __ emit_data64(0x0007000600050004, relocInfo::none);
862 entries.append(__ pc());
863 // S
864 __ emit_data64(0x0000000100000000, relocInfo::none);
865 __ emit_data64(0x0000000300000002, relocInfo::none);
866 entries.append(__ pc());
867 // D
868 __ emit_data64(0x0000000000000000, relocInfo::none);
869 __ emit_data64(0x0000000000000001, relocInfo::none);
870 entries.append(__ pc());
871 // S - FP
872 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
873 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
874 entries.append(__ pc());
875 // D - FP
876 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
877 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
878
879 // record the stub entry and end
880 store_archive_data(stub_id, start, __ pc(), &entries);
881
882 // install the entry addresses in the entry array
883 assert(entries.length() == entry_count - 1,
884 "unexpected entries count %d", entries.length());
885 StubRoutines::aarch64::_vector_iota_indices[0] = start;
886 for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
887 StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
888 }
889 }
890
891 // The inner part of zero_words(). This is the bulk operation,
892 // zeroing words in blocks, possibly using DC ZVA to do it. The
893 // caller is responsible for zeroing the last few words.
894 //
895 // Inputs:
896 // r10: the HeapWord-aligned base address of an array to zero.
897 // r11: the count in HeapWords, r11 > 0.
898 //
899 // Returns r10 and r11, adjusted for the caller to clear.
900 // r10: the base address of the tail of words left to clear.
901 // r11: the number of words in the tail.
902 // r11 < MacroAssembler::zero_words_block_size.
903
904 address generate_zero_blocks() {
905 StubId stub_id = StubId::stubgen_zero_blocks_id;
906 int entry_count = StubInfo::entry_count(stub_id);
907 assert(entry_count == 1, "sanity check");
908 address start = load_archive_data(stub_id);
909 if (start != nullptr) {
910 return start;
911 }
912 __ align(CodeEntryAlignment);
913 StubCodeMark mark(this, stub_id);
914 Label done;
915 Label base_aligned;
916
917 Register base = r10, cnt = r11;
918
919 start = __ pc();
920
921 if (UseBlockZeroing) {
922 int zva_length = VM_Version::zva_length();
923
924 // Ensure ZVA length can be divided by 16. This is required by
925 // the subsequent operations.
926 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
927
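      // if the base is only 8-byte aligned (bit 3 set), zero one word first so
      // that the block-zeroing code below starts from a 16-byte aligned address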
928 __ tbz(base, 3, base_aligned);
929 __ str(zr, Address(__ post(base, 8)));
930 __ sub(cnt, cnt, 1);
931 __ bind(base_aligned);
932
933 // Ensure count >= zva_length * 2 so that it still deserves a zva after
934 // alignment.
935 Label small;
936 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
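      // low_limit is in bytes while cnt is in HeapWords, hence the shift by 3 below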
937 __ subs(rscratch1, cnt, low_limit >> 3);
938 __ br(Assembler::LT, small);
939 __ zero_dcache_blocks(base, cnt);
940 __ bind(small);
941 }
942
943 {
944 // Number of stp instructions we'll unroll
945 const int unroll =
946 MacroAssembler::zero_words_block_size / 2;
947 // Clear the remaining blocks.
948 Label loop;
949 __ subs(cnt, cnt, unroll * 2);
950 __ br(Assembler::LT, done);
951 __ bind(loop);
952 for (int i = 0; i < unroll; i++)
953 __ stp(zr, zr, __ post(base, 16));
954 __ subs(cnt, cnt, unroll * 2);
955 __ br(Assembler::GE, loop);
956 __ bind(done);
957 __ add(cnt, cnt, unroll * 2);
958 }
959
960 __ ret(lr);
961
962 // record the stub entry and end
963 store_archive_data(stub_id, start, __ pc());
964
965 return start;
966 }
967
968
969 typedef enum {
970 copy_forwards = 1,
971 copy_backwards = -1
972 } copy_direction;
973
974 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
975 // for arraycopy stubs.
976 class ArrayCopyBarrierSetHelper : StackObj {
977 BarrierSetAssembler* _bs_asm;
978 MacroAssembler* _masm;
979 DecoratorSet _decorators;
980 BasicType _type;
981 Register _gct1;
982 Register _gct2;
983 Register _gct3;
984 FloatRegister _gcvt1;
985 FloatRegister _gcvt2;
986 FloatRegister _gcvt3;
987
988 public:
989 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
990 DecoratorSet decorators,
991 BasicType type,
992 Register gct1,
993 Register gct2,
994 Register gct3,
995 FloatRegister gcvt1,
996 FloatRegister gcvt2,
997 FloatRegister gcvt3)
998 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
999 _masm(masm),
1000 _decorators(decorators),
1001 _type(type),
1002 _gct1(gct1),
1003 _gct2(gct2),
1004 _gct3(gct3),
1005 _gcvt1(gcvt1),
1006 _gcvt2(gcvt2),
1007 _gcvt3(gcvt3) {
1008 }
1009
1010 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
1011 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
1012 dst1, dst2, src,
1013 _gct1, _gct2, _gcvt1);
1014 }
1015
1016 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
1017 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
1018 dst, src1, src2,
1019 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
1020 }
1021
1022 void copy_load_at_16(Register dst1, Register dst2, Address src) {
1023 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
1024 dst1, dst2, src,
1025 _gct1);
1026 }
1027
1028 void copy_store_at_16(Address dst, Register src1, Register src2) {
1029 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
1030 dst, src1, src2,
1031 _gct1, _gct2, _gct3);
1032 }
1033
1034 void copy_load_at_8(Register dst, Address src) {
1035 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
1036 dst, noreg, src,
1037 _gct1);
1038 }
1039
1040 void copy_store_at_8(Address dst, Register src) {
1041 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
1042 dst, src, noreg,
1043 _gct1, _gct2, _gct3);
1044 }
1045 };
1046
1047 // Bulk copy of blocks of 8 words.
1048 //
1049 // count is a count of words.
1050 //
1051 // Precondition: count >= 8
1052 //
1053 // Postconditions:
1054 //
1055 // The least significant bit of count contains the remaining count
1056 // of words to copy. The rest of count is trash.
1057 //
1058 // s and d are adjusted to point to the remaining words to copy
1059 //
1060 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
1061 int entry_count = StubInfo::entry_count(stub_id);
1062 assert(entry_count == 1, "sanity check");
1063 address start = load_archive_data(stub_id);
1064 if (start != nullptr) {
1065 return start;
1066 }
1067 BasicType type;
1068 copy_direction direction;
1069
1070 switch (stub_id) {
1071 case StubId::stubgen_copy_byte_f_id:
1072 direction = copy_forwards;
1073 type = T_BYTE;
1074 break;
1075 case StubId::stubgen_copy_byte_b_id:
1076 direction = copy_backwards;
1077 type = T_BYTE;
1078 break;
1079 case StubId::stubgen_copy_oop_f_id:
1080 direction = copy_forwards;
1081 type = T_OBJECT;
1082 break;
1083 case StubId::stubgen_copy_oop_b_id:
1084 direction = copy_backwards;
1085 type = T_OBJECT;
1086 break;
1087 case StubId::stubgen_copy_oop_uninit_f_id:
1088 direction = copy_forwards;
1089 type = T_OBJECT;
1090 break;
1091 case StubId::stubgen_copy_oop_uninit_b_id:
1092 direction = copy_backwards;
1093 type = T_OBJECT;
1094 break;
1095 default:
1096 ShouldNotReachHere();
1097 }
1098
1099 int unit = wordSize * direction;
1100 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
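    // bias is the pre-adjustment applied to s and d for a forwards copy so that
    // the main loop can use the same pre-indexed, 8-word-stride addressing
    // pattern in both copy directions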
1101
1102 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
1103 t4 = r7, t5 = r11, t6 = r12, t7 = r13;
1104 const Register stride = r14;
1105 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1106 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1107 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1108
1109 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
1110 assert_different_registers(s, d, count, rscratch1, rscratch2);
1111
1112 Label again, drain;
1113
1114 __ align(CodeEntryAlignment);
1115
1116 StubCodeMark mark(this, stub_id);
1117
1118 start = __ pc();
1119
1120 Label unaligned_copy_long;
1121 if (AvoidUnalignedAccesses) {
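      // take the unaligned path when the destination is not 16-byte aligned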
1122 __ tbnz(d, 3, unaligned_copy_long);
1123 }
1124
1125 if (direction == copy_forwards) {
1126 __ sub(s, s, bias);
1127 __ sub(d, d, bias);
1128 }
1129
1130 #ifdef ASSERT
1131 // Make sure we are never given < 8 words
1132 {
1133 Label L;
1134 __ cmp(count, (u1)8);
1135 __ br(Assembler::GE, L);
1136 __ stop("genrate_copy_longs called with < 8 words");
1137 __ bind(L);
1138 }
1139 #endif
1140
1141 // Fill 8 registers
1142 if (UseSIMDForMemoryOps) {
1143 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
1144 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
1145 } else {
1146 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1147 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1148 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1149 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1150 }
1151
1152 __ subs(count, count, 16);
1153 __ br(Assembler::LO, drain);
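    // 8 words are already loaded into registers; if fewer than 8 more words
    // remain, skip the main loop and just store what we have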
1154
1155 int prefetch = PrefetchCopyIntervalInBytes;
1156 bool use_stride = false;
1157 if (direction == copy_backwards) {
1158 use_stride = prefetch > 256;
1159 prefetch = -prefetch;
1160 if (use_stride) __ mov(stride, prefetch);
1161 }
1162
1163 __ bind(again);
1164
1165 if (PrefetchCopyIntervalInBytes > 0)
1166 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1167
1168 if (UseSIMDForMemoryOps) {
1169 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
1170 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
1171 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
1172 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
1173 } else {
1174 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1175 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1176 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
1177 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1178 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
1179 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1180 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
1181 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1182 }
1183
1184 __ subs(count, count, 8);
1185 __ br(Assembler::HS, again);
1186
1187 // Drain
1188 __ bind(drain);
1189 if (UseSIMDForMemoryOps) {
1190 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
1191 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
1192 } else {
1193 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1194 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
1195 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
1196 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
1197 }
1198
1199 {
1200 Label L1, L2;
1201 __ tbz(count, exact_log2(4), L1);
1202 if (UseSIMDForMemoryOps) {
1203 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
1204 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
1205 } else {
1206 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1207 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1208 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1209 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
1210 }
1211 __ bind(L1);
1212
1213 if (direction == copy_forwards) {
1214 __ add(s, s, bias);
1215 __ add(d, d, bias);
1216 }
1217
1218 __ tbz(count, 1, L2);
1219 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
1220 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
1221 __ bind(L2);
1222 }
1223
1224 __ ret(lr);
1225
1226 if (AvoidUnalignedAccesses) {
1227 Label drain, again;
1228 // Register order for storing. Order is different for backward copy.
1229
1230 __ bind(unaligned_copy_long);
1231
1232 // source address is even aligned, target odd aligned
1233 //
1234 // when forward copying word pairs we read long pairs at offsets
1235 // {0, 2, 4, 6} (in long words). when backwards copying we read
1236 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
1237 // address by -2 in the forwards case so we can compute the
1238 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
1239 // or -1.
1240 //
1241 // when forward copying we need to store 1 word, 3 pairs and
1242 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
1244 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
1245 //
      // When backwards copying we need to store 1 word, 3 pairs and
1247 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
1248 // offsets {1, 3, 5, 7, 8} * unit.
1249
1250 if (direction == copy_forwards) {
1251 __ sub(s, s, 16);
1252 __ sub(d, d, 8);
1253 }
1254
1255 // Fill 8 registers
1256 //
1257 // for forwards copy s was offset by -16 from the original input
1258 // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
1260 // and so on for each successive 64 byte block when s is updated
1261 //
1262 // t0 at offset 0, t1 at offset 8
1263 // t2 at offset 16, t3 at offset 24
1264 // t4 at offset 32, t5 at offset 40
1265 // t6 at offset 48, t7 at offset 56
1266
1267 // for backwards copy s was not offset so the register contents
1268 // are at these offsets into the preceding 64 byte block
1269 // relative to that original input and so on for each successive
1270 // preceding 64 byte block when s is updated. this explains the
1271 // slightly counter-intuitive looking pattern of register usage
1272 // in the stp instructions for backwards copy.
1273 //
1274 // t0 at offset -16, t1 at offset -8
1275 // t2 at offset -32, t3 at offset -24
1276 // t4 at offset -48, t5 at offset -40
1277 // t6 at offset -64, t7 at offset -56
1278
1279 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1280 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1281 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1282 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1283
1284 __ subs(count, count, 16);
1285 __ br(Assembler::LO, drain);
1286
1287 int prefetch = PrefetchCopyIntervalInBytes;
1288 bool use_stride = false;
1289 if (direction == copy_backwards) {
1290 use_stride = prefetch > 256;
1291 prefetch = -prefetch;
1292 if (use_stride) __ mov(stride, prefetch);
1293 }
1294
1295 __ bind(again);
1296
1297 if (PrefetchCopyIntervalInBytes > 0)
1298 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1299
1300 if (direction == copy_forwards) {
1301 // allowing for the offset of -8 the store instructions place
        // registers into the target 64 byte block at the following
1303 // offsets
1304 //
1305 // t0 at offset 0
1306 // t1 at offset 8, t2 at offset 16
1307 // t3 at offset 24, t4 at offset 32
1308 // t5 at offset 40, t6 at offset 48
1309 // t7 at offset 56
1310
1311 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1312 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1313 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1314 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1315 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1316 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1317 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1318 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1319 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1320 } else {
1321 // d was not offset when we started so the registers are
        // written into the 64 byte block preceding d with the following
1323 // offsets
1324 //
1325 // t1 at offset -8
1326 // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
1328 // t7 at offset -56, t4 at offset -48
1329 // t6 at offset -64
1330 //
1331 // note that this matches the offsets previously noted for the
1332 // loads
1333
1334 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1335 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1336 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1337 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1338 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1339 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1340 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1341 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1342 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1343 }
1344
1345 __ subs(count, count, 8);
1346 __ br(Assembler::HS, again);
1347
1348 // Drain
1349 //
1350 // this uses the same pattern of offsets and register arguments
1351 // as above
1352 __ bind(drain);
1353 if (direction == copy_forwards) {
1354 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1355 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1356 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1357 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1358 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1359 } else {
1360 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1361 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1362 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1363 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1364 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1365 }
1366 // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
1368 // bits 2 and 1 in the count are the tell-tale for whether we
1369 // have each such subblock
1370 {
1371 Label L1, L2;
1372 __ tbz(count, exact_log2(4), L1);
1373 // this is the same as above but copying only 4 longs hence
1374 // with only one intervening stp between the str instructions
1375 // but note that the offsets and registers still follow the
1376 // same pattern
1377 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1378 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1379 if (direction == copy_forwards) {
1380 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1381 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1382 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1383 } else {
1384 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1385 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1386 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1387 }
1388 __ bind(L1);
1389
1390 __ tbz(count, 1, L2);
1391 // this is the same as above but copying only 2 longs hence
1392 // there is no intervening stp between the str instructions
1393 // but note that the offset and register patterns are still
1394 // the same
1395 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1396 if (direction == copy_forwards) {
1397 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1398 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1399 } else {
1400 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1401 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1402 }
1403 __ bind(L2);
1404
1405 // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written
1407
1408 if (direction == copy_forwards) {
1409 __ add(s, s, 16);
1410 __ add(d, d, 8);
1411 }
1412
1413 }
1414
1415 __ ret(lr);
1416 }
1417
1418 // record the stub entry and end
1419 store_archive_data(stub_id, start, __ pc());
1420
1421 return start;
1422 }
1423
1424 // Small copy: less than 16 bytes.
1425 //
1426 // NB: Ignores all of the bits of count which represent more than 15
1427 // bytes, so a caller doesn't have to mask them.
1428
1429 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1430 bool is_backwards = step < 0;
1431 size_t granularity = g_uabs(step);
1432 int direction = is_backwards ? -1 : 1;
1433
1434 Label Lword, Lint, Lshort, Lbyte;
1435
1436 assert(granularity
1437 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1438
1439 const Register t0 = r3;
1440 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1441 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1442
1443 // ??? I don't know if this bit-test-and-branch is the right thing
1444 // to do. It does a lot of jumping, resulting in several
1445 // mispredicted branches. It might make more sense to do this
1446 // with something like Duff's device with a single computed branch.
1447
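    // each tbz below tests one bit of the residual count (scaled by granularity):
    // copy 8 bytes, then 4, then 2, then 1, for whichever bits are set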
1448 __ tbz(count, 3 - exact_log2(granularity), Lword);
1449 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1450 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1451 __ bind(Lword);
1452
1453 if (granularity <= sizeof (jint)) {
1454 __ tbz(count, 2 - exact_log2(granularity), Lint);
1455 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1456 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1457 __ bind(Lint);
1458 }
1459
1460 if (granularity <= sizeof (jshort)) {
1461 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1462 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1463 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1464 __ bind(Lshort);
1465 }
1466
1467 if (granularity <= sizeof (jbyte)) {
1468 __ tbz(count, 0, Lbyte);
1469 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1470 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1471 __ bind(Lbyte);
1472 }
1473 }
1474
1475 // All-singing all-dancing memory copy.
1476 //
1477 // Copy count units of memory from s to d. The size of a unit is
1478 // step, which can be positive or negative depending on the direction
1479 // of copy. If is_aligned is false, we align the source address.
1480 //
1481
1482 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1483 Register s, Register d, Register count, int step) {
1484 copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1485 bool is_backwards = step < 0;
1486 unsigned int granularity = g_uabs(step);
1487 const Register t0 = r3, t1 = r4;
1488
1489 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
1490 // load all the data before writing anything
1491 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1492 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1493 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1494 const Register send = r17, dend = r16;
1495 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1496 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1497 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1498
1499 if (PrefetchCopyIntervalInBytes > 0)
1500 __ prfm(Address(s, 0), PLDL1KEEP);
1501 __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1502 __ br(Assembler::HI, copy_big);
1503
1504 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1505 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1506
1507 __ cmp(count, u1(16/granularity));
1508 __ br(Assembler::LS, copy16);
1509
1510 __ cmp(count, u1(64/granularity));
1511 __ br(Assembler::HI, copy80);
1512
1513 __ cmp(count, u1(32/granularity));
1514 __ br(Assembler::LS, copy32);
1515
1516 // 33..64 bytes
1517 if (UseSIMDForMemoryOps) {
1518 bs.copy_load_at_32(v0, v1, Address(s, 0));
1519 bs.copy_load_at_32(v2, v3, Address(send, -32));
1520 bs.copy_store_at_32(Address(d, 0), v0, v1);
1521 bs.copy_store_at_32(Address(dend, -32), v2, v3);
1522 } else {
1523 bs.copy_load_at_16(t0, t1, Address(s, 0));
1524 bs.copy_load_at_16(t2, t3, Address(s, 16));
1525 bs.copy_load_at_16(t4, t5, Address(send, -32));
1526 bs.copy_load_at_16(t6, t7, Address(send, -16));
1527
1528 bs.copy_store_at_16(Address(d, 0), t0, t1);
1529 bs.copy_store_at_16(Address(d, 16), t2, t3);
1530 bs.copy_store_at_16(Address(dend, -32), t4, t5);
1531 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1532 }
1533 __ b(finish);
1534
1535 // 17..32 bytes
1536 __ bind(copy32);
1537 bs.copy_load_at_16(t0, t1, Address(s, 0));
1538 bs.copy_load_at_16(t6, t7, Address(send, -16));
1539
1540 bs.copy_store_at_16(Address(d, 0), t0, t1);
1541 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1542 __ b(finish);
1543
1544 // 65..80/96 bytes
1545 // (96 bytes if SIMD because we do 32 bytes per instruction)
1546 __ bind(copy80);
1547 if (UseSIMDForMemoryOps) {
1548 bs.copy_load_at_32(v0, v1, Address(s, 0));
1549 bs.copy_load_at_32(v2, v3, Address(s, 32));
1550 // Unaligned pointers can be an issue for copying.
1551 // The problem is more likely to occur when the granularity of the data is
1552 // less than 4 (sizeof(jint)). Pointers to arrays of jint are at least
1553 // 4 byte aligned. Pointers to arrays of jlong are 8 byte aligned.
1554 // The largest performance drop has been seen for the range 65-80 bytes.
1555 // For such cases, using a pair of ldp/stp instead of a third pair of
1556 // ldpq/stpq fixes the performance issue.
1557 if (granularity < sizeof (jint)) {
1558 Label copy96;
1559 __ cmp(count, u1(80/granularity));
1560 __ br(Assembler::HI, copy96);
1561 bs.copy_load_at_16(t0, t1, Address(send, -16));
1562
1563 bs.copy_store_at_32(Address(d, 0), v0, v1);
1564 bs.copy_store_at_32(Address(d, 32), v2, v3);
1565
1566 bs.copy_store_at_16(Address(dend, -16), t0, t1);
1567 __ b(finish);
1568
1569 __ bind(copy96);
1570 }
1571 bs.copy_load_at_32(v4, v5, Address(send, -32));
1572
1573 bs.copy_store_at_32(Address(d, 0), v0, v1);
1574 bs.copy_store_at_32(Address(d, 32), v2, v3);
1575
1576 bs.copy_store_at_32(Address(dend, -32), v4, v5);
1577 } else {
1578 bs.copy_load_at_16(t0, t1, Address(s, 0));
1579 bs.copy_load_at_16(t2, t3, Address(s, 16));
1580 bs.copy_load_at_16(t4, t5, Address(s, 32));
1581 bs.copy_load_at_16(t6, t7, Address(s, 48));
1582 bs.copy_load_at_16(t8, t9, Address(send, -16));
1583
1584 bs.copy_store_at_16(Address(d, 0), t0, t1);
1585 bs.copy_store_at_16(Address(d, 16), t2, t3);
1586 bs.copy_store_at_16(Address(d, 32), t4, t5);
1587 bs.copy_store_at_16(Address(d, 48), t6, t7);
1588 bs.copy_store_at_16(Address(dend, -16), t8, t9);
1589 }
1590 __ b(finish);
1591
1592 // 0..16 bytes
1593 __ bind(copy16);
1594 __ cmp(count, u1(8/granularity));
1595 __ br(Assembler::LO, copy8);
1596
1597 // 8..16 bytes
1598 bs.copy_load_at_8(t0, Address(s, 0));
1599 bs.copy_load_at_8(t1, Address(send, -8));
1600 bs.copy_store_at_8(Address(d, 0), t0);
1601 bs.copy_store_at_8(Address(dend, -8), t1);
1602 __ b(finish);
1603
1604 if (granularity < 8) {
1605 // 4..7 bytes
1606 __ bind(copy8);
1607 __ tbz(count, 2 - exact_log2(granularity), copy4);
1608 __ ldrw(t0, Address(s, 0));
1609 __ ldrw(t1, Address(send, -4));
1610 __ strw(t0, Address(d, 0));
1611 __ strw(t1, Address(dend, -4));
1612 __ b(finish);
1613 if (granularity < 4) {
1614 // 0..3 bytes
1615 __ bind(copy4);
1616 __ cbz(count, finish); // get rid of 0 case
1617 if (granularity == 2) {
1618 __ ldrh(t0, Address(s, 0));
1619 __ strh(t0, Address(d, 0));
1620 } else { // granularity == 1
1621 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1622 // the first and last byte.
1623 // Handle the 3 byte case by loading and storing base + count/2
1624 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1625 // This does mean that in the 1 byte case we load/store the same
1626 // byte 3 times.
1627 __ lsr(count, count, 1);
1628 __ ldrb(t0, Address(s, 0));
1629 __ ldrb(t1, Address(send, -1));
1630 __ ldrb(t2, Address(s, count));
1631 __ strb(t0, Address(d, 0));
1632 __ strb(t1, Address(dend, -1));
1633 __ strb(t2, Address(d, count));
1634 }
1635 __ b(finish);
1636 }
1637 }
1638
1639 __ bind(copy_big);
1640 if (is_backwards) {
1641 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1642 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1643 }
1644
1645 // Now that we've got the small case out of the way, we can align the
1646 // source address on a 2-word boundary.
1647
1648 // Here we will materialize a count in r15, which is used by copy_memory_small
1649 // and the various generate_copy_longs stubs that we use for 2-word-aligned bytes.
1650 // Up until here, we have used t9, which aliases r15, but from here on, that register
1651 // cannot be used as a temp register, as it contains the count.
1652
1653 Label aligned;
1654
1655 if (is_aligned) {
1656 // We may have to adjust by 1 word to get s 2-word-aligned.
1657 __ tbz(s, exact_log2(wordSize), aligned);
1658 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1659 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1660 __ sub(count, count, wordSize/granularity);
1661 } else {
1662 if (is_backwards) {
1663 __ andr(r15, s, 2 * wordSize - 1);
1664 } else {
1665 __ neg(r15, s);
1666 __ andr(r15, r15, 2 * wordSize - 1);
1667 }
1668 // r15 is the byte adjustment needed to align s.
1669 __ cbz(r15, aligned);
1670 int shift = exact_log2(granularity);
1671 if (shift > 0) {
1672 __ lsr(r15, r15, shift);
1673 }
1674 __ sub(count, count, r15);
1675
1676 #if 0
1677 // ?? This code is only correct for a disjoint copy. It may or
1678 // may not make sense to use it in that case.
1679
1680 // Copy the first pair; s and d may not be aligned.
1681 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1682 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1683
1684 // Align s and d, adjust count
1685 if (is_backwards) {
1686 __ sub(s, s, r15);
1687 __ sub(d, d, r15);
1688 } else {
1689 __ add(s, s, r15);
1690 __ add(d, d, r15);
1691 }
1692 #else
1693 copy_memory_small(decorators, type, s, d, r15, step);
1694 #endif
1695 }
1696
1697 __ bind(aligned);
1698
1699 // s is now 2-word-aligned.
1700
1701 // We have a count of units and some trailing bytes. Adjust the
1702 // count and do a bulk copy of words. If the shift is zero
1703 // perform a move instead to benefit from zero latency moves.
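// For example, for a jint copy (granularity == 4) shift == 1, so r15 = count >> 1
// is the number of 8-byte words to bulk-copy; for a jlong copy shift == 0 and a
// plain mov is used instead.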
1704 int shift = exact_log2(wordSize/granularity);
1705 if (shift > 0) {
1706 __ lsr(r15, count, shift);
1707 } else {
1708 __ mov(r15, count);
1709 }
1710 if (direction == copy_forwards) {
1711 if (type != T_OBJECT) {
1712 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_f()));
1713 __ blr(rscratch1);
1714 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1715 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_f()));
1716 __ blr(rscratch1);
1717 } else {
1718 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_f()));
1719 __ blr(rscratch1);
1720 }
1721 } else {
1722 if (type != T_OBJECT) {
1723 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_b()));
1724 __ blr(rscratch1);
1725 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1726 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_b()));
1727 __ blr(rscratch1);
1728 } else {
1729 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_b()));
1730 __ blr(rscratch1);
1731 }
1732 }
1733
1734 // And the tail.
1735 copy_memory_small(decorators, type, s, d, count, step);
1736
1737 if (granularity >= 8) __ bind(copy8);
1738 if (granularity >= 4) __ bind(copy4);
1739 __ bind(finish);
1740 }
1741
1742
1743 void clobber_registers() {
1744 #ifdef ASSERT
1745 RegSet clobbered
1746 = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1747 __ mov(rscratch1, (uint64_t)0xdeadbeef);
1748 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1749 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1750 __ mov(*it, rscratch1);
1751 }
1752 #endif
1753
1754 }
1755
1756 // Scan over array at a for count oops, verifying each one.
1757 // Preserves a and count, clobbers rscratch1 and rscratch2.
1758 void verify_oop_array (int size, Register a, Register count, Register temp) {
1759 Label loop, end;
1760 __ mov(rscratch1, a);
1761 __ mov(rscratch2, zr);
1762 __ bind(loop);
1763 __ cmp(rscratch2, count);
1764 __ br(Assembler::HS, end);
1765 if (size == wordSize) {
1766 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1767 __ verify_oop(temp);
1768 } else {
1769 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1770 __ decode_heap_oop(temp); // calls verify_oop
1771 }
1772 __ add(rscratch2, rscratch2, 1);
1773 __ b(loop);
1774 __ bind(end);
1775 }
1776
1777 // Arguments:
1778 // stub_id - is used to name the stub and identify all details of
1779 // how to perform the copy.
1780 //
1781 // nopush_entry - is assigned to the stub's post push entry point
1782 // unless it is null
1783 //
1784 // Inputs:
1785 // c_rarg0 - source array address
1786 // c_rarg1 - destination array address
1787 // c_rarg2 - element count, treated as ssize_t, can be zero
1788 //
1789 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1790 // the hardware handle it. The two dwords within qwords that span
1791 // cache line boundaries will still be loaded and stored atomically.
1792 //
1793 // Side Effects: nopush_entry is set to the (post push) entry point
1794 // so it can be used by the corresponding conjoint
1795 // copy method
1796 //
1797 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
1798 int size;
1799 bool aligned;
1800 bool is_oop;
1801 bool dest_uninitialized;
1802 switch (stub_id) {
1803 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1804 size = sizeof(jbyte);
1805 aligned = false;
1806 is_oop = false;
1807 dest_uninitialized = false;
1808 break;
1809 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1810 size = sizeof(jbyte);
1811 aligned = true;
1812 is_oop = false;
1813 dest_uninitialized = false;
1814 break;
1815 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1816 size = sizeof(jshort);
1817 aligned = false;
1818 is_oop = false;
1819 dest_uninitialized = false;
1820 break;
1821 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1822 size = sizeof(jshort);
1823 aligned = true;
1824 is_oop = false;
1825 dest_uninitialized = false;
1826 break;
1827 case StubId::stubgen_jint_disjoint_arraycopy_id:
1828 size = sizeof(jint);
1829 aligned = false;
1830 is_oop = false;
1831 dest_uninitialized = false;
1832 break;
1833 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1834 size = sizeof(jint);
1835 aligned = true;
1836 is_oop = false;
1837 dest_uninitialized = false;
1838 break;
1839 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1840 // since this is always aligned we can (should!) use the same
1841 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
1842 ShouldNotReachHere();
1843 break;
1844 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1845 size = sizeof(jlong);
1846 aligned = true;
1847 is_oop = false;
1848 dest_uninitialized = false;
1849 break;
1850 case StubId::stubgen_oop_disjoint_arraycopy_id:
1851 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1852 aligned = !UseCompressedOops;
1853 is_oop = true;
1854 dest_uninitialized = false;
1855 break;
1856 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1857 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1858 aligned = !UseCompressedOops;
1859 is_oop = true;
1860 dest_uninitialized = false;
1861 break;
1862 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1863 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1864 aligned = !UseCompressedOops;
1865 is_oop = true;
1866 dest_uninitialized = true;
1867 break;
1868 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1869 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1870 aligned = !UseCompressedOops;
1871 is_oop = true;
1872 dest_uninitialized = true;
1873 break;
1874 default:
1875 ShouldNotReachHere();
1876 break;
1877 }
1878 // all stubs provide a 2nd entry which omits the frame push for
1879 // use when bailing out from a conjoint copy. However we may also
1880 // need some extra addresses for memory access protection.
1881 int entry_count = StubInfo::entry_count(stub_id);
1882 assert(entry_count == 2, "sanity check");
1883 assert(nopush_entry != nullptr, "all disjoint copy stubs export a nopush entry");
1884
1885 bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
1886 int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
1887 GrowableArray<address> entries;
1888 GrowableArray<address> extras;
1889 GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
1890 address start = load_archive_data(stub_id, &entries, extras_ptr);
1891 if (start != nullptr) {
1892 assert(entries.length() == entry_count - 1,
1893 "unexpected entries count %d", entries.length());
1894 *nopush_entry = entries.at(0);
1895 assert(extras.length() == extra_count,
1896 "unexpected extra count %d", extras.length());
1897 if (add_extras) {
1898 // register one handler at offset 0
1899 register_unsafe_access_handlers(extras, 0, 1);
1900 }
1901 return start;
1902 }
1903
1904 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1905 RegSet saved_reg = RegSet::of(s, d, count);
1906
1907 __ align(CodeEntryAlignment);
1908 StubCodeMark mark(this, stub_id);
1909 start = __ pc();
1910 __ enter();
1911
1912 *nopush_entry = __ pc();
1913 entries.append(*nopush_entry);
1914
1915 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1916 BLOCK_COMMENT("Post-Push Entry:");
1917
1918 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1919 if (dest_uninitialized) {
1920 decorators |= IS_DEST_UNINITIALIZED;
1921 }
1922 if (aligned) {
1923 decorators |= ARRAYCOPY_ALIGNED;
1924 }
1925
1926 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1927 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1928
1929 if (is_oop) {
1930 // save regs before copy_memory
1931 __ push(RegSet::of(d, count), sp);
1932 }
1933 {
1934 // UnsafeMemoryAccess page error: continue after unsafe access
1935 UnsafeMemoryAccessMark umam(this, add_extras, true);
1936 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1937 }
1938
1939 if (is_oop) {
1940 __ pop(RegSet::of(d, count), sp);
1941 if (VerifyOops)
1942 verify_oop_array(size, d, count, r16);
1943 }
1944
1945 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1946
1947 __ leave();
1948 __ mov(r0, zr); // return 0
1949 __ ret(lr);
1950
1951 address end = __ pc();
1952
1953 if (add_extras) {
1954 // retrieve the registered handler addresses
1955 retrieve_unsafe_access_handlers(start, end, extras);
1956 assert(extras.length() == extra_count
1957 , "incorrect handlers count %d", extras.length());
1958 }
1959
1960 // record the stub entry and end plus the no_push entry and any
1961 // extra handler addresses
1962 store_archive_data(stub_id, start, end, &entries, extras_ptr);
1963
1964 return start;
1965 }
1966
1967 // Arguments:
1968 // stub_id - is used to name the stub and identify all details of
1969 // how to perform the copy.
1970 //
1971 // nooverlap_target - identifies the (post push) entry for the
1972 // corresponding disjoint copy routine which can be
1973 // jumped to if the ranges do not actually overlap
1974 //
1975 // nopush_entry - is assigned to the stub's post push entry point
1976 // unless it is null
1977 //
1978 //
1979 // Inputs:
1980 // c_rarg0 - source array address
1981 // c_rarg1 - destination array address
1982 // c_rarg2 - element count, treated as ssize_t, can be zero
1983 //
1984 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1985 // the hardware handle it. The two dwords within qwords that span
1986 // cache line boundaries will still be loaded and stored atomically.
1987 //
1988 // Side Effects:
1989 // nopush_entry is set to the no-overlap entry point so it can be
1990 // used by some other conjoint copy method
1991 //
1992 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1993 int size;
1994 bool aligned;
1995 bool is_oop;
1996 bool dest_uninitialized;
1997 switch (stub_id) {
1998 case StubId::stubgen_jbyte_arraycopy_id:
1999 size = sizeof(jbyte);
2000 aligned = false;
2001 is_oop = false;
2002 dest_uninitialized = false;
2003 break;
2004 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
2005 size = sizeof(jbyte);
2006 aligned = true;
2007 is_oop = false;
2008 dest_uninitialized = false;
2009 break;
2010 case StubId::stubgen_jshort_arraycopy_id:
2011 size = sizeof(jshort);
2012 aligned = false;
2013 is_oop = false;
2014 dest_uninitialized = false;
2015 break;
2016 case StubId::stubgen_arrayof_jshort_arraycopy_id:
2017 size = sizeof(jshort);
2018 aligned = true;
2019 is_oop = false;
2020 dest_uninitialized = false;
2021 break;
2022 case StubId::stubgen_jint_arraycopy_id:
2023 size = sizeof(jint);
2024 aligned = false;
2025 is_oop = false;
2026 dest_uninitialized = false;
2027 break;
2028 case StubId::stubgen_arrayof_jint_arraycopy_id:
2029 size = sizeof(jint);
2030 aligned = true;
2031 is_oop = false;
2032 dest_uninitialized = false;
2033 break;
2034 case StubId::stubgen_jlong_arraycopy_id:
2035 // since this is always aligned we can (should!) use the same
2036 // stub as for case StubId::stubgen_arrayof_jlong_arraycopy
2037 ShouldNotReachHere();
2038 break;
2039 case StubId::stubgen_arrayof_jlong_arraycopy_id:
2040 size = sizeof(jlong);
2041 aligned = true;
2042 is_oop = false;
2043 dest_uninitialized = false;
2044 break;
2045 case StubId::stubgen_oop_arraycopy_id:
2046 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2047 aligned = !UseCompressedOops;
2048 is_oop = true;
2049 dest_uninitialized = false;
2050 break;
2051 case StubId::stubgen_arrayof_oop_arraycopy_id:
2052 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2053 aligned = !UseCompressedOops;
2054 is_oop = true;
2055 dest_uninitialized = false;
2056 break;
2057 case StubId::stubgen_oop_arraycopy_uninit_id:
2058 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2059 aligned = !UseCompressedOops;
2060 is_oop = true;
2061 dest_uninitialized = true;
2062 break;
2063 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
2064 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2065 aligned = !UseCompressedOops;
2066 is_oop = true;
2067 dest_uninitialized = true;
2068 break;
2069 default:
2070 ShouldNotReachHere();
2071 }
2072 // only some conjoint stubs generate a 2nd entry
2073 int entry_count = StubInfo::entry_count(stub_id);
2074 int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
2075 assert(entry_count == expected_entry_count,
2076 "expected entry count %d does not match declared entry count %d for stub %s",
2077 expected_entry_count, entry_count, StubInfo::name(stub_id));
2078
2079 // We need to protect memory accesses in certain cases
2080 bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
2081 int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
2082 GrowableArray<address> entries;
2083 GrowableArray<address> extras;
2084 GrowableArray<address> *entries_ptr = (nopush_entry != nullptr ? &entries : nullptr);
2085 GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
2086 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2087 if (start != nullptr) {
2088 assert(entries.length() == expected_entry_count - 1,
2089 "unexpected entries count %d", entries.length());
2090 assert(extras.length() == extra_count,
2091 "unexpected extra count %d", extras.length());
2092 if (nopush_entry != nullptr) {
2093 *nopush_entry = entries.at(0);
2094 }
2095 if (add_extras) {
2096 // register one handler at offset 0
2097 register_unsafe_access_handlers(extras, 0, 1);
2098 }
2099 return start;
2100 }
2101
2102 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2103 RegSet saved_regs = RegSet::of(s, d, count);
2104 StubCodeMark mark(this, stub_id);
2105 start = __ pc();
2106 __ enter();
2107
2108 if (nopush_entry != nullptr) {
2109 *nopush_entry = __ pc();
2110 entries.append(*nopush_entry);
2111 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2112 BLOCK_COMMENT("Post-Push Entry:");
2113 }
2114
2115 // use fwd copy when (d-s) above_equal (count*size)
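// If d < s the unsigned subtraction wraps to a large value, so the above_equal
// test also routes that case to the forward (disjoint) stub, which is safe:
// a forward copy cannot clobber unread source bytes when the destination lies
// below the source.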
2116 Label L_overlapping;
2117 __ sub(rscratch1, d, s);
2118 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
2119 __ br(Assembler::LO, L_overlapping);
2120 __ b(RuntimeAddress(nooverlap_target));
2121 __ bind(L_overlapping);
2122
2123 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2124 if (dest_uninitialized) {
2125 decorators |= IS_DEST_UNINITIALIZED;
2126 }
2127 if (aligned) {
2128 decorators |= ARRAYCOPY_ALIGNED;
2129 }
2130
2131 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2132 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
2133
2134 if (is_oop) {
2135 // save regs before copy_memory
2136 __ push(RegSet::of(d, count), sp);
2137 }
2138 {
2139 // UnsafeMemoryAccess page error: continue after unsafe access
2140 UnsafeMemoryAccessMark umam(this, add_extras, true);
2141 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
2142 }
2143 if (is_oop) {
2144 __ pop(RegSet::of(d, count), sp);
2145 if (VerifyOops)
2146 verify_oop_array(size, d, count, r16);
2147 }
2148 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
2149 __ leave();
2150 __ mov(r0, zr); // return 0
2151 __ ret(lr);
2152
2153 assert(entries.length() == expected_entry_count - 1,
2154 "unexpected entries count %d", entries.length());
2155
2156 address end = __ pc();
2157
2158 if (add_extras) {
2159 // retrieve the registered handler addresses
2160 retrieve_unsafe_access_handlers(start, end, extras);
2161 assert(extras.length() == extra_count,
2162 "incorrect handlers count %d", extras.length());
2163 }
2164
2165 // record the stub entry and end plus any no_push entry and/or
2166 // extra handler addresses
2167 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2168
2169 return start;
2170 }
2171
2172 // Helper for generating a dynamic type check.
2173 // Smashes rscratch1, rscratch2.
2174 void generate_type_check(Register sub_klass,
2175 Register super_check_offset,
2176 Register super_klass,
2177 Register temp1,
2178 Register temp2,
2179 Register result,
2180 Label& L_success) {
2181 assert_different_registers(sub_klass, super_check_offset, super_klass);
2182
2183 BLOCK_COMMENT("type_check:");
2184
2185 Label L_miss;
2186
2187 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
2188 super_check_offset);
2189 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
2190
2191 // Fall through on failure!
2192 __ BIND(L_miss);
2193 }
2194
2195 //
2196 // Generate checkcasting array copy stub
2197 //
2198 // Input:
2199 // c_rarg0 - source array address
2200 // c_rarg1 - destination array address
2201 // c_rarg2 - element count, treated as ssize_t, can be zero
2202 // c_rarg3 - size_t ckoff (super_check_offset)
2203 // c_rarg4 - oop ckval (super_klass)
2204 //
2205 // Output:
2206 // r0 == 0 - success
2207 // r0 == -1^K - failure, where K is partial transfer count
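//              (e.g. if 3 elements were copied before a type-check failure,
//               r0 == -1^3 == -4; the caller recovers K as ~r0)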
2208 //
2209 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
2210 bool dest_uninitialized;
2211 switch (stub_id) {
2212 case StubId::stubgen_checkcast_arraycopy_id:
2213 dest_uninitialized = false;
2214 break;
2215 case StubId::stubgen_checkcast_arraycopy_uninit_id:
2216 dest_uninitialized = true;
2217 break;
2218 default:
2219 ShouldNotReachHere();
2220 }
2221
2222 // The normal stub provides a 2nd entry which omits the frame push
2223 // for use when bailing out from a disjoint copy.
2224 // Only some checkcast copy stubs export a 2nd entry
2225 int entry_count = StubInfo::entry_count(stub_id);
2226 int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
2227 GrowableArray<address> entries;
2228 GrowableArray<address> *entries_ptr = (expected_entry_count == 1 ? nullptr : &entries);
2229 assert(entry_count == expected_entry_count,
2230 "expected entry count %d does not match declared entry count %d for stub %s",
2231 expected_entry_count, entry_count, StubInfo::name(stub_id));
2232 address start = load_archive_data(stub_id, entries_ptr);
2233 if (start != nullptr) {
2234 assert(entries.length() + 1 == expected_entry_count,
2235 "expected entry count %d does not match return entry count %d for stub %s",
2236 expected_entry_count, entries.length() + 1, StubInfo::name(stub_id));
2237 if (nopush_entry != nullptr) {
2238 *nopush_entry = entries.at(0);
2239 }
2240 return start;
2241 }
2242
2243 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
2244
2245 // Input registers (after setup_arg_regs)
2246 const Register from = c_rarg0; // source array address
2247 const Register to = c_rarg1; // destination array address
2248 const Register count = c_rarg2; // elements count
2249 const Register ckoff = c_rarg3; // super_check_offset
2250 const Register ckval = c_rarg4; // super_klass
2251
2252 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
2253
2254 // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
2255 const Register copied_oop = r22; // actual oop copied
2256 const Register count_save = r21; // orig elements count
2257 const Register start_to = r20; // destination array start address
2258 const Register r19_klass = r19; // oop._klass
2259
2260 // Registers used as gc temps (r5, r6, r7 are save-on-call)
2261 const Register gct1 = r5, gct2 = r6, gct3 = r7;
2262
2263 //---------------------------------------------------------------
2264 // Assembler stub will be used for this call to arraycopy
2265 // if the two arrays are subtypes of Object[] but the
2266 // destination array type is not equal to or a supertype
2267 // of the source type. Each element must be separately
2268 // checked.
2269
2270 assert_different_registers(from, to, count, ckoff, ckval, start_to,
2271 copied_oop, r19_klass, count_save);
2272
2273 __ align(CodeEntryAlignment);
2274 StubCodeMark mark(this, stub_id);
2275 start = __ pc();
2276
2277 __ enter(); // required for proper stackwalking of RuntimeStub frame
2278
2279 #ifdef ASSERT
2280 // caller guarantees that the arrays really are different
2281 // otherwise, we would have to make conjoint checks
2282 { Label L;
2283 __ b(L); // conjoint check not yet implemented
2284 __ stop("checkcast_copy within a single array");
2285 __ bind(L);
2286 }
2287 #endif //ASSERT
2288
2289 // Caller of this entry point must set up the argument registers.
2290 if (nopush_entry != nullptr) {
2291 *nopush_entry = __ pc();
2292 entries.append(*nopush_entry);
2293 BLOCK_COMMENT("Entry:");
2294 }
2295
2296 // Empty array: Nothing to do.
2297 __ cbz(count, L_done);
2298 __ push(RegSet::of(r19, r20, r21, r22), sp);
2299
2300 #ifdef ASSERT
2301 BLOCK_COMMENT("assert consistent ckoff/ckval");
2302 // The ckoff and ckval must be mutually consistent,
2303 // even though caller generates both.
2304 { Label L;
2305 int sco_offset = in_bytes(Klass::super_check_offset_offset());
2306 __ ldrw(start_to, Address(ckval, sco_offset));
2307 __ cmpw(ckoff, start_to);
2308 __ br(Assembler::EQ, L);
2309 __ stop("super_check_offset inconsistent");
2310 __ bind(L);
2311 }
2312 #endif //ASSERT
2313
2314 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2315 bool is_oop = true;
2316 int element_size = UseCompressedOops ? 4 : 8;
2317 if (dest_uninitialized) {
2318 decorators |= IS_DEST_UNINITIALIZED;
2319 }
2320
2321 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2322 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
2323
2324 // save the original count
2325 __ mov(count_save, count);
2326
2327 // Copy from low to high addresses
2328 __ mov(start_to, to); // Save destination array start address
2329 __ b(L_load_element);
2330
2331 // ======== begin loop ========
2332 // (Loop is rotated; its entry is L_load_element.)
2333 // Loop control:
2334 // for (; count != 0; count--) {
2335 // copied_oop = load_heap_oop(from++);
2336 // ... generate_type_check ...;
2337 // store_heap_oop(to++, copied_oop);
2338 // }
2339 __ align(OptoLoopAlignment);
2340
2341 __ BIND(L_store_element);
2342 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
2343 __ post(to, element_size), copied_oop, noreg,
2344 gct1, gct2, gct3);
2345 __ sub(count, count, 1);
2346 __ cbz(count, L_do_card_marks);
2347
2348 // ======== loop entry is here ========
2349 __ BIND(L_load_element);
2350 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
2351 copied_oop, noreg, __ post(from, element_size),
2352 gct1);
2353 __ cbz(copied_oop, L_store_element);
2354
2355 __ load_klass(r19_klass, copied_oop);// query the object klass
2356
2357 BLOCK_COMMENT("type_check:");
2358 generate_type_check(/*sub_klass*/r19_klass,
2359 /*super_check_offset*/ckoff,
2360 /*super_klass*/ckval,
2361 /*r_array_base*/gct1,
2362 /*temp2*/gct2,
2363 /*result*/r10, L_store_element);
2364
2365 // Fall through on failure!
2366
2367 // ======== end loop ========
2368
2369 // It was a real error; we must depend on the caller to finish the job.
2370 // Register count = remaining oops, count_orig = total oops.
2371 // Emit GC store barriers for the oops we have copied and report
2372 // their number to the caller.
2373
2374 __ subs(count, count_save, count); // K = partially copied oop count
2375 __ eon(count, count, zr); // report (-1^K) to caller
2376 __ br(Assembler::EQ, L_done_pop);
2377
2378 __ BIND(L_do_card_marks);
2379 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
2380
2381 __ bind(L_done_pop);
2382 __ pop(RegSet::of(r19, r20, r21, r22), sp);
2383 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2384
2385 __ bind(L_done);
2386 __ mov(r0, count);
2387 __ leave();
2388 __ ret(lr);
2389
2390 // record the stub entry and end plus any no_push entry
2391 store_archive_data(stub_id, start, __ pc(), entries_ptr);
2392 return start;
2393 }
2394
2395 // Perform range checks on the proposed arraycopy.
2396 // Kills temp, but nothing else.
2397 // Also, clean the sign bits of src_pos and dst_pos.
2398 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2399 Register src_pos, // source position (c_rarg1)
2400 Register dst, // destination array oop (c_rarg2)
2401 Register dst_pos, // destination position (c_rarg3)
2402 Register length,
2403 Register temp,
2404 Label& L_failed) {
2405 BLOCK_COMMENT("arraycopy_range_checks:");
2406
2407 assert_different_registers(rscratch1, temp);
2408
2409 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2410 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2411 __ addw(temp, length, src_pos);
2412 __ cmpw(temp, rscratch1);
2413 __ br(Assembler::HI, L_failed);
2414
2415 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2416 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2417 __ addw(temp, length, dst_pos);
2418 __ cmpw(temp, rscratch1);
2419 __ br(Assembler::HI, L_failed);
2420
2421 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
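// (movw with the same source and destination writes the 32-bit view of the
// register, which zero-extends into the upper half.)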
2422 __ movw(src_pos, src_pos);
2423 __ movw(dst_pos, dst_pos);
2424
2425 BLOCK_COMMENT("arraycopy_range_checks done");
2426 }
2427
2428 // These stubs get called from some dumb test routine.
2429 // I'll write them properly when they're called from
2430 // something that's actually doing something.
2431 static void fake_arraycopy_stub(address src, address dst, int count) {
2432 assert(count == 0, "huh?");
2433 }
2434
2435
2436 //
2437 // Generate 'unsafe' array copy stub
2438 // Though just as safe as the other stubs, it takes an unscaled
2439 // size_t argument instead of an element count.
2440 //
2441 // Input:
2442 // c_rarg0 - source array address
2443 // c_rarg1 - destination array address
2444 // c_rarg2 - byte count, treated as ssize_t, can be zero
2445 //
2446 // Examines the alignment of the operands and dispatches
2447 // to a long, int, short, or byte copy loop.
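// For example, if s, d and count are all multiples of 8 the call is dispatched
// to the long copy loop; if any of them is odd it falls back to the byte copy.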
2448 //
2449 address generate_unsafe_copy(address byte_copy_entry,
2450 address short_copy_entry,
2451 address int_copy_entry,
2452 address long_copy_entry) {
2453 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2454 int entry_count = StubInfo::entry_count(stub_id);
2455 assert(entry_count == 1, "sanity check");
2456 address start = load_archive_data(stub_id);
2457 if (start != nullptr) {
2458 return start;
2459 }
2460 Label L_long_aligned, L_int_aligned, L_short_aligned;
2461 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2462
2463 __ align(CodeEntryAlignment);
2464 StubCodeMark mark(this, stub_id);
2465 start = __ pc();
2466 __ enter(); // required for proper stackwalking of RuntimeStub frame
2467
2468 // bump this on entry, not on exit:
2469 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2470
2471 __ orr(rscratch1, s, d);
2472 __ orr(rscratch1, rscratch1, count);
2473
2474 __ andr(rscratch1, rscratch1, BytesPerLong-1);
2475 __ cbz(rscratch1, L_long_aligned);
2476 __ andr(rscratch1, rscratch1, BytesPerInt-1);
2477 __ cbz(rscratch1, L_int_aligned);
2478 __ tbz(rscratch1, 0, L_short_aligned);
2479 __ b(RuntimeAddress(byte_copy_entry));
2480
2481 __ BIND(L_short_aligned);
2482 __ lsr(count, count, LogBytesPerShort); // size => short_count
2483 __ b(RuntimeAddress(short_copy_entry));
2484 __ BIND(L_int_aligned);
2485 __ lsr(count, count, LogBytesPerInt); // size => int_count
2486 __ b(RuntimeAddress(int_copy_entry));
2487 __ BIND(L_long_aligned);
2488 __ lsr(count, count, LogBytesPerLong); // size => long_count
2489 __ b(RuntimeAddress(long_copy_entry));
2490
2491 // record the stub entry and end
2492 store_archive_data(stub_id, start, __ pc());
2493
2494 return start;
2495 }
2496
2497 //
2498 // Generate generic array copy stubs
2499 //
2500 // Input:
2501 // c_rarg0 - src oop
2502 // c_rarg1 - src_pos (32-bits)
2503 // c_rarg2 - dst oop
2504 // c_rarg3 - dst_pos (32-bits)
2505 // c_rarg4 - element count (32-bits)
2506 //
2507 // Output:
2508 // r0 == 0 - success
2509 // r0 == -1^K - failure, where K is partial transfer count
2510 //
2511 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2512 address int_copy_entry, address oop_copy_entry,
2513 address long_copy_entry, address checkcast_copy_entry) {
2514 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2515 int entry_count = StubInfo::entry_count(stub_id);
2516 assert(entry_count == 1, "sanity check");
2517 address start = load_archive_data(stub_id);
2518 if (start != nullptr) {
2519 return start;
2520 }
2521 Label L_failed, L_objArray;
2522 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2523
2524 // Input registers
2525 const Register src = c_rarg0; // source array oop
2526 const Register src_pos = c_rarg1; // source position
2527 const Register dst = c_rarg2; // destination array oop
2528 const Register dst_pos = c_rarg3; // destination position
2529 const Register length = c_rarg4;
2530
2531
2532 // Registers used as temps
2533 const Register dst_klass = c_rarg5;
2534
2535 __ align(CodeEntryAlignment);
2536
2537 StubCodeMark mark(this, stub_id);
2538
2539 start = __ pc();
2540
2541 __ enter(); // required for proper stackwalking of RuntimeStub frame
2542
2543 // bump this on entry, not on exit:
2544 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2545
2546 //-----------------------------------------------------------------------
2547 // Assembler stub will be used for this call to arraycopy
2548 // if the following conditions are met:
2549 //
2550 // (1) src and dst must not be null.
2551 // (2) src_pos must not be negative.
2552 // (3) dst_pos must not be negative.
2553 // (4) length must not be negative.
2554 // (5) src klass and dst klass should be the same and not null.
2555 // (6) src and dst should be arrays.
2556 // (7) src_pos + length must not exceed length of src.
2557 // (8) dst_pos + length must not exceed length of dst.
2558 //
2559
2560 // if (src == nullptr) return -1;
2561 __ cbz(src, L_failed);
2562
2563 // if (src_pos < 0) return -1;
2564 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2565
2566 // if (dst == nullptr) return -1;
2567 __ cbz(dst, L_failed);
2568
2569 // if (dst_pos < 0) return -1;
2570 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2571
2572 // registers used as temp
2573 const Register scratch_length = r16; // elements count to copy
2574 const Register scratch_src_klass = r17; // array klass
2575 const Register lh = r15; // layout helper
2576
2577 // if (length < 0) return -1;
2578 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2579 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2580
2581 __ load_klass(scratch_src_klass, src);
2582 #ifdef ASSERT
2583 // assert(src->klass() != nullptr);
2584 {
2585 BLOCK_COMMENT("assert klasses not null {");
2586 Label L1, L2;
2587 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2588 __ bind(L1);
2589 __ stop("broken null klass");
2590 __ bind(L2);
2591 __ load_klass(rscratch1, dst);
2592 __ cbz(rscratch1, L1); // this would be broken also
2593 BLOCK_COMMENT("} assert klasses not null done");
2594 }
2595 #endif
2596
2597 // Load layout helper (32-bits)
2598 //
2599 // |array_tag| | header_size | element_type | |log2_element_size|
2600 // 32 30 24 16 8 2 0
2601 //
2602 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2603 //
2604
2605 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2606
2607 // Handle objArrays completely differently...
2608 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2609 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2610 __ movw(rscratch1, objArray_lh);
2611 __ eorw(rscratch2, lh, rscratch1);
2612 __ cbzw(rscratch2, L_objArray);
2613
2614 // if (src->klass() != dst->klass()) return -1;
2615 __ load_klass(rscratch2, dst);
2616 __ eor(rscratch2, rscratch2, scratch_src_klass);
2617 __ cbnz(rscratch2, L_failed);
2618
2619 // Check for flat inline type array -> return -1
2620 __ test_flat_array_oop(src, rscratch2, L_failed);
2621
2622 // Check for null-free (non-flat) inline type array -> handle as object array
2623 __ test_null_free_array_oop(src, rscratch2, L_objArray);
2624
2625 // if (!src->is_Array()) return -1;
2626 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2627
2628 // At this point, it is known to be a typeArray (array_tag 0x3).
2629 #ifdef ASSERT
2630 {
2631 BLOCK_COMMENT("assert primitive array {");
2632 Label L;
2633 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2634 __ cmpw(lh, rscratch2);
2635 __ br(Assembler::GE, L);
2636 __ stop("must be a primitive array");
2637 __ bind(L);
2638 BLOCK_COMMENT("} assert primitive array done");
2639 }
2640 #endif
2641
2642 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2643 rscratch2, L_failed);
2644
2645 // TypeArrayKlass
2646 //
2647 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2648 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2649 //
2650
2651 const Register rscratch1_offset = rscratch1; // array offset
2652 const Register r15_elsize = lh; // element size
2653
2654 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2655 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2656 __ add(src, src, rscratch1_offset); // src array offset
2657 __ add(dst, dst, rscratch1_offset); // dst array offset
2658 BLOCK_COMMENT("choose copy loop based on element size");
2659
2660 // next registers should be set before the jump to corresponding stub
2661 const Register from = c_rarg0; // source array address
2662 const Register to = c_rarg1; // destination array address
2663 const Register count = c_rarg2; // elements count
2664
2665 // The 'from', 'to' and 'count' registers should be set in this order
2666 // since they are the same as 'src', 'src_pos' and 'dst'.
2667
2668 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2669
2670 // The possible values of elsize are 0-3, i.e. exact_log2(element
2671 // size in bytes). We do a simple bitwise binary search.
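// For example, with elsize == 2 (a jint array) bit 1 is set, so we branch to
// L_copy_ints; there bit 0 is clear, so we stay on the int path rather than
// falling through to the long copy.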
2672 __ BIND(L_copy_bytes);
2673 __ tbnz(r15_elsize, 1, L_copy_ints);
2674 __ tbnz(r15_elsize, 0, L_copy_shorts);
2675 __ lea(from, Address(src, src_pos));// src_addr
2676 __ lea(to, Address(dst, dst_pos));// dst_addr
2677 __ movw(count, scratch_length); // length
2678 __ b(RuntimeAddress(byte_copy_entry));
2679
2680 __ BIND(L_copy_shorts);
2681 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2682 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2683 __ movw(count, scratch_length); // length
2684 __ b(RuntimeAddress(short_copy_entry));
2685
2686 __ BIND(L_copy_ints);
2687 __ tbnz(r15_elsize, 0, L_copy_longs);
2688 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2689 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2690 __ movw(count, scratch_length); // length
2691 __ b(RuntimeAddress(int_copy_entry));
2692
2693 __ BIND(L_copy_longs);
2694 #ifdef ASSERT
2695 {
2696 BLOCK_COMMENT("assert long copy {");
2697 Label L;
2698 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2699 __ cmpw(r15_elsize, LogBytesPerLong);
2700 __ br(Assembler::EQ, L);
2701 __ stop("must be long copy, but elsize is wrong");
2702 __ bind(L);
2703 BLOCK_COMMENT("} assert long copy done");
2704 }
2705 #endif
2706 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2707 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2708 __ movw(count, scratch_length); // length
2709 __ b(RuntimeAddress(long_copy_entry));
2710
2711 // ObjArrayKlass
2712 __ BIND(L_objArray);
2713 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2714
2715 Label L_plain_copy, L_checkcast_copy;
2716 // test array classes for subtyping
2717 __ load_klass(r15, dst);
2718 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2719 __ br(Assembler::NE, L_checkcast_copy);
2720
2721 // Identically typed arrays can be copied without element-wise checks.
2722 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2723 rscratch2, L_failed);
2724
2725 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2726 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2727 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2728 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2729 __ movw(count, scratch_length); // length
2730 __ BIND(L_plain_copy);
2731 __ b(RuntimeAddress(oop_copy_entry));
2732
2733 __ BIND(L_checkcast_copy);
2734 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2735 {
2736 // Before looking at dst.length, make sure dst is also an objArray.
2737 __ ldrw(rscratch1, Address(r15, lh_offset));
2738 __ movw(rscratch2, objArray_lh);
2739 __ eorw(rscratch1, rscratch1, rscratch2);
2740 __ cbnzw(rscratch1, L_failed);
2741
2742 // It is safe to examine both src.length and dst.length.
2743 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2744 r15, L_failed);
2745
2746 __ load_klass(dst_klass, dst); // reload
2747
2748 // Marshal the base address arguments now, freeing registers.
2749 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2750 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2751 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2752 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2753 __ movw(count, length); // length (reloaded)
2754 Register sco_temp = c_rarg3; // this register is free now
2755 assert_different_registers(from, to, count, sco_temp,
2756 dst_klass, scratch_src_klass);
2757 // assert_clean_int(count, sco_temp);
2758
2759 // Generate the type check.
2760 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2761 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2762
2763 // Smashes rscratch1, rscratch2
2764 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2765 L_plain_copy);
2766
2767 // Fetch destination element klass from the ObjArrayKlass header.
2768 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2769 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2770 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2771
2772 // the checkcast_copy loop needs two extra arguments:
2773 assert(c_rarg3 == sco_temp, "#3 already in place");
2774 // Set up arguments for checkcast_copy_entry.
2775 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2776 __ b(RuntimeAddress(checkcast_copy_entry));
2777 }
2778
2779 __ BIND(L_failed);
2780 __ mov(r0, -1);
2781 __ leave(); // required for proper stackwalking of RuntimeStub frame
2782 __ ret(lr);
2783
2784 // record the stub entry and end
2785 store_archive_data(stub_id, start, __ pc());
2786
2787 return start;
2788 }
2789
2790 //
2791 // Generate stub for array fill. If "aligned" is true, the
2792 // "to" address is assumed to be heapword aligned.
2793 //
2794 // Arguments for generated stub:
2795 // to: c_rarg0
2796 // value: c_rarg1
2797 // count: c_rarg2 treated as signed
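// (For a T_SHORT fill the 16-bit value is widened by the bfi sequence below,
//  e.g. 0x1234 becomes 0x12341234 and then 0x1234123412341234 before the
//  word-at-a-time fill.)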
2798 //
2799 address generate_fill(StubId stub_id) {
2800 BasicType t;
2801 bool aligned;
2802
2803 switch (stub_id) {
2804 case StubId::stubgen_jbyte_fill_id:
2805 t = T_BYTE;
2806 aligned = false;
2807 break;
2808 case StubId::stubgen_jshort_fill_id:
2809 t = T_SHORT;
2810 aligned = false;
2811 break;
2812 case StubId::stubgen_jint_fill_id:
2813 t = T_INT;
2814 aligned = false;
2815 break;
2816 case StubId::stubgen_arrayof_jbyte_fill_id:
2817 t = T_BYTE;
2818 aligned = true;
2819 break;
2820 case StubId::stubgen_arrayof_jshort_fill_id:
2821 t = T_SHORT;
2822 aligned = true;
2823 break;
2824 case StubId::stubgen_arrayof_jint_fill_id:
2825 t = T_INT;
2826 aligned = true;
2827 break;
2828 default:
2829 ShouldNotReachHere();
2830 };
2831 int entry_count = StubInfo::entry_count(stub_id);
2832 assert(entry_count == 1, "sanity check");
2833 address start = load_archive_data(stub_id);
2834 if (start != nullptr) {
2835 return start;
2836 }
2837 __ align(CodeEntryAlignment);
2838 StubCodeMark mark(this, stub_id);
2839 start = __ pc();
2840
2841 BLOCK_COMMENT("Entry:");
2842
2843 const Register to = c_rarg0; // source array address
2844 const Register value = c_rarg1; // value
2845 const Register count = c_rarg2; // elements count
2846
2847 const Register bz_base = r10; // base for block_zero routine
2848 const Register cnt_words = r11; // temp register
2849
2850 __ enter();
2851
2852 Label L_fill_elements, L_exit1;
2853
2854 int shift = -1;
2855 switch (t) {
2856 case T_BYTE:
2857 shift = 0;
2858 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2859 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2860 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2861 __ br(Assembler::LO, L_fill_elements);
2862 break;
2863 case T_SHORT:
2864 shift = 1;
2865 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2866 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2867 __ br(Assembler::LO, L_fill_elements);
2868 break;
2869 case T_INT:
2870 shift = 2;
2871 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2872 __ br(Assembler::LO, L_fill_elements);
2873 break;
2874 default: ShouldNotReachHere();
2875 }
2876
2877 // Align source address at 8 bytes address boundary.
2878 Label L_skip_align1, L_skip_align2, L_skip_align4;
2879 if (!aligned) {
2880 switch (t) {
2881 case T_BYTE:
2882 // One byte misalignment happens only for byte arrays.
2883 __ tbz(to, 0, L_skip_align1);
2884 __ strb(value, Address(__ post(to, 1)));
2885 __ subw(count, count, 1);
2886 __ bind(L_skip_align1);
2887 // Fallthrough
2888 case T_SHORT:
2889 // Two bytes misalignment happens only for byte and short (char) arrays.
2890 __ tbz(to, 1, L_skip_align2);
2891 __ strh(value, Address(__ post(to, 2)));
2892 __ subw(count, count, 2 >> shift);
2893 __ bind(L_skip_align2);
2894 // Fallthrough
2895 case T_INT:
2896 // Align to 8 bytes, we know we are 4 byte aligned to start.
2897 __ tbz(to, 2, L_skip_align4);
2898 __ strw(value, Address(__ post(to, 4)));
2899 __ subw(count, count, 4 >> shift);
2900 __ bind(L_skip_align4);
2901 break;
2902 default: ShouldNotReachHere();
2903 }
2904 }
2905
2906 //
2907 // Fill large chunks
2908 //
2909 __ lsrw(cnt_words, count, 3 - shift); // number of words
2910 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2911 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2912 if (UseBlockZeroing) {
2913 Label non_block_zeroing, rest;
2914 // If the fill value is zero we can use the fast zero_words().
2915 __ cbnz(value, non_block_zeroing);
2916 __ mov(bz_base, to);
2917 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2918 address tpc = __ zero_words(bz_base, cnt_words);
2919 if (tpc == nullptr) {
2920 fatal("CodeCache is full at generate_fill");
2921 }
2922 __ b(rest);
2923 __ bind(non_block_zeroing);
2924 __ fill_words(to, cnt_words, value);
2925 __ bind(rest);
2926 } else {
2927 __ fill_words(to, cnt_words, value);
2928 }
2929
2930 // Remaining count is less than 8 bytes. Fill it by a single store.
2931 // Note that the total length is no less than 8 bytes.
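// For example, 3 trailing bytes of a byte fill are covered by an 8-byte store
// that overlaps the last 5 bytes already written.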
2932 if (t == T_BYTE || t == T_SHORT) {
2933 Label L_exit1;
2934 __ cbzw(count, L_exit1);
2935 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2936 __ str(value, Address(to, -8)); // overwrite some elements
2937 __ bind(L_exit1);
2938 __ leave();
2939 __ ret(lr);
2940 }
2941
2942 // Handle copies less than 8 bytes.
2943 Label L_fill_2, L_fill_4, L_exit2;
2944 __ bind(L_fill_elements);
2945 switch (t) {
2946 case T_BYTE:
2947 __ tbz(count, 0, L_fill_2);
2948 __ strb(value, Address(__ post(to, 1)));
2949 __ bind(L_fill_2);
2950 __ tbz(count, 1, L_fill_4);
2951 __ strh(value, Address(__ post(to, 2)));
2952 __ bind(L_fill_4);
2953 __ tbz(count, 2, L_exit2);
2954 __ strw(value, Address(to));
2955 break;
2956 case T_SHORT:
2957 __ tbz(count, 0, L_fill_4);
2958 __ strh(value, Address(__ post(to, 2)));
2959 __ bind(L_fill_4);
2960 __ tbz(count, 1, L_exit2);
2961 __ strw(value, Address(to));
2962 break;
2963 case T_INT:
2964 __ cbzw(count, L_exit2);
2965 __ strw(value, Address(to));
2966 break;
2967 default: ShouldNotReachHere();
2968 }
2969 __ bind(L_exit2);
2970 __ leave();
2971 __ ret(lr);
2972
2973 // record the stub entry and end
2974 store_archive_data(stub_id, start, __ pc());
2975
2976 return start;
2977 }
2978
2979 address generate_unsafecopy_common_error_exit() {
2980 StubId stub_id = StubId::stubgen_unsafecopy_common_id;
2981 int entry_count = StubInfo::entry_count(stub_id);
2982 assert(entry_count == 1, "sanity check");
2983 address start = load_archive_data(stub_id);
2984 if (start != nullptr) {
2985 return start;
2986 }
2987 __ align(CodeEntryAlignment);
2988 StubCodeMark mark(this, stub_id);
2989 start = __ pc();
2990 __ leave();
2991 __ mov(r0, 0);
2992 __ ret(lr);
2993
2994 // record the stub entry and end
2995 store_archive_data(stub_id, start, __ pc());
2996
2997 return start;
2998 }
2999
3000 //
3001 // Generate 'unsafe' set memory stub
3002 // Though just as safe as the other stubs, it takes an unscaled
3003 // size_t (# bytes) argument instead of an element count.
3004 //
3005 // This fill operation is atomicity preserving: as long as the
3006 // address supplied is sufficiently aligned, all writes of up to 64
3007 // bits in size are single-copy atomic.
3008 //
3009 // Input:
3010 // c_rarg0 - destination array address
3011 // c_rarg1 - byte count (size_t)
3012 // c_rarg2 - byte value
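// (The byte value is replicated into all 16 lanes of v0 with dup, so the
//  strq/stpq stores below write the repeated pattern 16 or 32 bytes at a time.)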
3013 //
3014 address generate_unsafe_setmemory() {
3015 StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
3016 int entry_count = StubInfo::entry_count(stub_id);
3017 assert(entry_count == 1, "sanity check");
3018 // we expect one set of extra unsafememory access handler entries
3019 GrowableArray<address> extras;
3020 int extra_count = 1 * UnsafeMemoryAccess::COLUMN_COUNT;
3021 address start = load_archive_data(stub_id, nullptr, &extras);
3022 if (start != nullptr) {
3023 assert(extras.length() == extra_count,
3024 "unexpected extra entry count %d", extras.length());
3025 register_unsafe_access_handlers(extras, 0, 1);
3026 return start;
3027 }
3028
3029 __ align(CodeEntryAlignment);
3030 StubCodeMark mark(this, stub_id);
3031 start = __ pc();
3032
3033 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
3034 Label tail;
3035
3036 {
3037 UnsafeMemoryAccessMark umam(this, true, false);
3038
3039 __ enter(); // required for proper stackwalking of RuntimeStub frame
3040
3041 __ dup(v0, __ T16B, value);
3042
3043 if (AvoidUnalignedAccesses) {
3044 __ cmp(count, (u1)16);
3045 __ br(__ LO, tail);
3046
3047 __ mov(rscratch1, 16);
3048 __ andr(rscratch2, dest, 15);
3049 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
3050 __ strq(v0, Address(dest));
3051 __ sub(count, count, rscratch1);
3052 __ add(dest, dest, rscratch1);
3053 }
3054
3055 __ subs(count, count, (u1)64);
3056 __ br(__ LO, tail);
3057 {
3058 Label again;
3059 __ bind(again);
3060 __ stpq(v0, v0, Address(dest));
3061 __ stpq(v0, v0, Address(dest, 32));
3062
3063 __ subs(count, count, 64);
3064 __ add(dest, dest, 64);
3065 __ br(__ HS, again);
3066 }
3067
3068 __ bind(tail);
3069 // The count of bytes is off by 64, but we don't need to correct
3070 // it because we're only going to use the least-significant few
3071 // count bits from here on.
3072 // __ add(count, count, 64);
3073
3074 {
3075 Label dont;
3076 __ tbz(count, exact_log2(32), dont);
3077 __ stpq(v0, v0, __ post(dest, 32));
3078 __ bind(dont);
3079 }
3080 {
3081 Label dont;
3082 __ tbz(count, exact_log2(16), dont);
3083 __ strq(v0, __ post(dest, 16));
3084 __ bind(dont);
3085 }
3086 {
3087 Label dont;
3088 __ tbz(count, exact_log2(8), dont);
3089 __ strd(v0, __ post(dest, 8));
3090 __ bind(dont);
3091 }
3092
3093 Label finished;
3094 __ tst(count, 7);
3095 __ br(__ EQ, finished);
3096
3097 {
3098 Label dont;
3099 __ tbz(count, exact_log2(4), dont);
3100 __ strs(v0, __ post(dest, 4));
3101 __ bind(dont);
3102 }
3103 {
3104 Label dont;
3105 __ tbz(count, exact_log2(2), dont);
3106 __ bfi(value, value, 8, 8);
3107 __ strh(value, __ post(dest, 2));
3108 __ bind(dont);
3109 }
3110 {
3111 Label dont;
3112 __ tbz(count, exact_log2(1), dont);
3113 __ strb(value, Address(dest));
3114 __ bind(dont);
3115 }
3116
3117 __ bind(finished);
3118 __ leave();
3119 __ ret(lr);
3120 // have to exit the block and destroy the UnsafeMemoryAccessMark
3121 // in order to retrieve the handler end address
3122 }
3123
3124 // install saved handler addresses in extras
3125 address end = __ pc();
3126 retrieve_unsafe_access_handlers(start, end, extras);
3127 assert(extras.length() == extra_count,
3128 "incorrect handlers count %d", extras.length());
3129 // record the stub entry and end plus the extras
3130 store_archive_data(stub_id, start, end, nullptr, &extras);
3131
3132 return start;
3133 }
3134
3135 address generate_data_cache_writeback() {
3136 const Register line = c_rarg0; // address of line to write back
3137
3138 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
3139 int entry_count = StubInfo::entry_count(stub_id);
3140 assert(entry_count == 1, "sanity check");
3141 address start = load_archive_data(stub_id);
3142 if (start != nullptr) {
3143 return start;
3144 }
3145 __ align(CodeEntryAlignment);
3146 StubCodeMark mark(this, stub_id);
3147
3148 start = __ pc();
3149 __ enter();
3150 __ cache_wb(Address(line, 0));
3151 __ leave();
3152 __ ret(lr);
3153
3154 // record the stub entry and end
3155 store_archive_data(stub_id, start, __ pc());
3156
3157 return start;
3158 }
3159
3160 address generate_data_cache_writeback_sync() {
3161 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
3162 int entry_count = StubInfo::entry_count(stub_id);
3163 assert(entry_count == 1, "sanity check");
3164 address start = load_archive_data(stub_id);
3165 if (start != nullptr) {
3166 return start;
3167 }
3168 const Register is_pre = c_rarg0; // pre or post sync
3169 __ align(CodeEntryAlignment);
3170 StubCodeMark mark(this, stub_id);
3171
3172     // pre wbsync is a no-op
3173     // post wbsync translates to a store-ordering barrier (the AArch64 analogue of an sfence)
3174
3175 Label skip;
3176 start = __ pc();
3177 __ enter();
3178 __ cbnz(is_pre, skip);
3179 __ cache_wbsync(false);
3180 __ bind(skip);
3181 __ leave();
3182 __ ret(lr);
3183
3184 // record the stub entry and end
3185 store_archive_data(stub_id, start, __ pc());
3186
3187 return start;
3188 }
3189
3190 void generate_arraycopy_stubs() {
3191 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
3192 // entry immediately following their stack push. This can be used
3193 // as a post-push branch target for compatible stubs when they
3194 // identify a special case that can be handled by the fallback
3195     // stub, e.g. a disjoint copy stub may be used as a special-case
3196 // fallback for its compatible conjoint copy stub.
3197 //
3198     // A no-push entry is always returned in the following local and
3199 // then published by assigning to the appropriate entry field in
3200 // class StubRoutines. The entry value is then passed to the
3201 // generator for the compatible stub. That means the entry must be
3202 // listed when saving to/restoring from the AOT cache, ensuring
3203 // that the inter-stub jumps are noted at AOT-cache save and
3204 // relocated at AOT cache load.
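       // For example, the jbyte disjoint nopush entry captured below is
       // passed as the fallback entry when generating the jbyte conjoint
       // copy stub.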
3205 address nopush_entry;
3206
3207 // generate the common exit first so later stubs can rely on it if
3208 // they want an UnsafeMemoryAccess exit non-local to the stub
3209 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
3210 // register the stub as the default exit with class UnsafeMemoryAccess
3211 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
3212
3213     // generate and publish aarch64-specific bulk copy routines first
3214 // so we can call them from other copy stubs
3215 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3216 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3217
3218 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3219 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3220
3221 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
3222 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
3223
3224 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
3225
3226 //*** jbyte
3227 // Always need aligned and unaligned versions
3228 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
3229 // disjoint nopush entry is needed by conjoint copy
3230 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
3231 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
3232 // conjoint nopush entry is needed by generic/unsafe copy
3233 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
3234 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
3235 // disjoint arrayof nopush entry is needed by conjoint copy
3236 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
3237 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
3238
3239 //*** jshort
3240 // Always need aligned and unaligned versions
3241 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
3242 // disjoint nopush entry is needed by conjoint copy
3243 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
3244 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
3245 // conjoint nopush entry is used by generic/unsafe copy
3246 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
3247 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
3248 // disjoint arrayof nopush entry is needed by conjoint copy
3249 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
3250 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
3251
3252 //*** jint
3253 // Aligned versions
3254 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
3255 // disjoint arrayof nopush entry is needed by conjoint copy
3256 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
3257 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
3258 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
3259 // jint_arraycopy_nopush always points to the unaligned version
3260 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
3261 // disjoint nopush entry is needed by conjoint copy
3262 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
3263 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
3264 // conjoint nopush entry is needed by generic/unsafe copy
3265 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
3266
3267 //*** jlong
3268 // It is always aligned
3269 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
3270 // disjoint arrayof nopush entry is needed by conjoint copy
3271 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
3272 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
3273 // conjoint nopush entry is needed by generic/unsafe copy
3274 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
3275 // disjoint normal/nopush and conjoint normal entries are not
3276 // generated since the arrayof versions are the same
3277 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3278 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
3279 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
3280
3281 //*** oops
3282 {
3283 StubRoutines::_arrayof_oop_disjoint_arraycopy
3284 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
3285 // disjoint arrayof nopush entry is needed by conjoint copy
3286 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
3287 StubRoutines::_arrayof_oop_arraycopy
3288 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
3289 // conjoint arrayof nopush entry is needed by generic/unsafe copy
3290 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
3291 // Aligned versions without pre-barriers
3292 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
3293 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
3294 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
3295 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
3296 // note that we don't need a returned nopush entry because the
3297 // generic/unsafe copy does not cater for uninit arrays.
3298 StubRoutines::_arrayof_oop_arraycopy_uninit
3299 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
3300 }
3301
3302 // for oop copies reuse arrayof entries for non-arrayof cases
3303 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3304 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
3305 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
3306 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3307 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
3308 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
3309
3310 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
3311 // checkcast nopush entry is needed by generic copy
3312 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
3313 // note that we don't need a returned nopush entry because the
3314 // generic copy does not cater for uninit arrays.
3315 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
3316
3317 // unsafe arraycopy may fallback on conjoint stubs
3318 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
3319 StubRoutines::_jshort_arraycopy_nopush,
3320 StubRoutines::_jint_arraycopy_nopush,
3321 StubRoutines::_jlong_arraycopy_nopush);
3322
3323 // generic arraycopy may fallback on conjoint stubs
3324 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
3325 StubRoutines::_jshort_arraycopy_nopush,
3326 StubRoutines::_jint_arraycopy_nopush,
3327 StubRoutines::_oop_arraycopy_nopush,
3328 StubRoutines::_jlong_arraycopy_nopush,
3329 StubRoutines::_checkcast_arraycopy_nopush);
3330
3331 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
3332 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
3333 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
3334 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
3335 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
3336 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
3337 }
3338
3339 void generate_math_stubs() { Unimplemented(); }
3340
3341 // Arguments:
3342 //
3343 // Inputs:
3344 // c_rarg0 - source byte array address
3345 // c_rarg1 - destination byte array address
3346 // c_rarg2 - sessionKe (key) in little endian int array
3347 //
3348 address generate_aescrypt_encryptBlock() {
3349 assert(UseAES, "need AES cryptographic extension support");
3350 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
3351 int entry_count = StubInfo::entry_count(stub_id);
3352 assert(entry_count == 1, "sanity check");
3353 address start = load_archive_data(stub_id);
3354 if (start != nullptr) {
3355 return start;
3356 }
3357 __ align(CodeEntryAlignment);
3358 StubCodeMark mark(this, stub_id);
3359
3360 const Register from = c_rarg0; // source array address
3361 const Register to = c_rarg1; // destination array address
3362 const Register key = c_rarg2; // key array address
3363 const Register keylen = rscratch1;
3364
3365 start = __ pc();
3366 __ enter();
3367
3368 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3369
3370 __ aesenc_loadkeys(key, keylen);
3371 __ aesecb_encrypt(from, to, keylen);
3372
3373 __ mov(r0, 0);
3374
3375 __ leave();
3376 __ ret(lr);
3377
3378 // record the stub entry and end
3379 store_archive_data(stub_id, start, __ pc());
3380
3381 return start;
3382 }
3383
3384 // Arguments:
3385 //
3386 // Inputs:
3387 // c_rarg0 - source byte array address
3388 // c_rarg1 - destination byte array address
3389 // c_rarg2 - sessionKd (key) in little endian int array
3390 //
3391 address generate_aescrypt_decryptBlock() {
3392 assert(UseAES, "need AES cryptographic extension support");
3393 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
3394 int entry_count = StubInfo::entry_count(stub_id);
3395 assert(entry_count == 1, "sanity check");
3396 address start = load_archive_data(stub_id);
3397 if (start != nullptr) {
3398 return start;
3399 }
3400 __ align(CodeEntryAlignment);
3401 StubCodeMark mark(this, stub_id);
3402 Label L_doLast;
3403
3404 const Register from = c_rarg0; // source array address
3405 const Register to = c_rarg1; // destination array address
3406 const Register key = c_rarg2; // key array address
3407 const Register keylen = rscratch1;
3408
3409 start = __ pc();
3410 __ enter(); // required for proper stackwalking of RuntimeStub frame
3411
3412 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3413
3414 __ aesecb_decrypt(from, to, key, keylen);
3415
3416 __ mov(r0, 0);
3417
3418 __ leave();
3419 __ ret(lr);
3420
3421 // record the stub entry and end
3422 store_archive_data(stub_id, start, __ pc());
3423
3424 return start;
3425 }
3426
3427 // Arguments:
3428 //
3429 // Inputs:
3430 // c_rarg0 - source byte array address
3431 // c_rarg1 - destination byte array address
3432 // c_rarg2 - sessionKe (key) in little endian int array
3433 // c_rarg3 - r vector byte array address
3434 // c_rarg4 - input length
3435 //
3436 // Output:
3437 // x0 - input length
3438 //
3439 address generate_cipherBlockChaining_encryptAESCrypt() {
3440 assert(UseAES, "need AES cryptographic extension support");
3441 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
3442 int entry_count = StubInfo::entry_count(stub_id);
3443 assert(entry_count == 1, "sanity check");
3444 address start = load_archive_data(stub_id);
3445 if (start != nullptr) {
3446 return start;
3447 }
3448 __ align(CodeEntryAlignment);
3449 StubCodeMark mark(this, stub_id);
3450
3451 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3452
3453 const Register from = c_rarg0; // source array address
3454 const Register to = c_rarg1; // destination array address
3455 const Register key = c_rarg2; // key array address
3456 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3457 // and left with the results of the last encryption block
3458 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3459 const Register keylen = rscratch1;
3460
3461 start = __ pc();
3462
3463 __ enter();
3464
3465 __ movw(rscratch2, len_reg);
3466
3467 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3468
3469 __ ld1(v0, __ T16B, rvec);
3470
3471 __ cmpw(keylen, 52);
3472 __ br(Assembler::CC, L_loadkeys_44);
3473 __ br(Assembler::EQ, L_loadkeys_52);
3474
3475 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3476 __ rev32(v17, __ T16B, v17);
3477 __ rev32(v18, __ T16B, v18);
3478 __ BIND(L_loadkeys_52);
3479 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3480 __ rev32(v19, __ T16B, v19);
3481 __ rev32(v20, __ T16B, v20);
3482 __ BIND(L_loadkeys_44);
3483 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3484 __ rev32(v21, __ T16B, v21);
3485 __ rev32(v22, __ T16B, v22);
3486 __ rev32(v23, __ T16B, v23);
3487 __ rev32(v24, __ T16B, v24);
3488 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3489 __ rev32(v25, __ T16B, v25);
3490 __ rev32(v26, __ T16B, v26);
3491 __ rev32(v27, __ T16B, v27);
3492 __ rev32(v28, __ T16B, v28);
3493 __ ld1(v29, v30, v31, __ T16B, key);
3494 __ rev32(v29, __ T16B, v29);
3495 __ rev32(v30, __ T16B, v30);
3496 __ rev32(v31, __ T16B, v31);
3497
3498 __ BIND(L_aes_loop);
3499 __ ld1(v1, __ T16B, __ post(from, 16));
3500 __ eor(v0, __ T16B, v0, v1);
3501
3502 __ br(Assembler::CC, L_rounds_44);
3503 __ br(Assembler::EQ, L_rounds_52);
3504
3505 __ aese(v0, v17); __ aesmc(v0, v0);
3506 __ aese(v0, v18); __ aesmc(v0, v0);
3507 __ BIND(L_rounds_52);
3508 __ aese(v0, v19); __ aesmc(v0, v0);
3509 __ aese(v0, v20); __ aesmc(v0, v0);
3510 __ BIND(L_rounds_44);
3511 __ aese(v0, v21); __ aesmc(v0, v0);
3512 __ aese(v0, v22); __ aesmc(v0, v0);
3513 __ aese(v0, v23); __ aesmc(v0, v0);
3514 __ aese(v0, v24); __ aesmc(v0, v0);
3515 __ aese(v0, v25); __ aesmc(v0, v0);
3516 __ aese(v0, v26); __ aesmc(v0, v0);
3517 __ aese(v0, v27); __ aesmc(v0, v0);
3518 __ aese(v0, v28); __ aesmc(v0, v0);
3519 __ aese(v0, v29); __ aesmc(v0, v0);
3520 __ aese(v0, v30);
3521 __ eor(v0, __ T16B, v0, v31);
3522
3523 __ st1(v0, __ T16B, __ post(to, 16));
3524
3525 __ subw(len_reg, len_reg, 16);
3526 __ cbnzw(len_reg, L_aes_loop);
3527
3528 __ st1(v0, __ T16B, rvec);
3529
3530 __ mov(r0, rscratch2);
3531
3532 __ leave();
3533 __ ret(lr);
3534
3535 // record the stub entry and end
3536 store_archive_data(stub_id, start, __ pc());
3537
3538 return start;
3539 }
3540
3541 // Arguments:
3542 //
3543 // Inputs:
3544 // c_rarg0 - source byte array address
3545 // c_rarg1 - destination byte array address
3546 // c_rarg2 - sessionKd (key) in little endian int array
3547 // c_rarg3 - r vector byte array address
3548 // c_rarg4 - input length
3549 //
3550 // Output:
3551 // r0 - input length
3552 //
3553 address generate_cipherBlockChaining_decryptAESCrypt() {
3554 assert(UseAES, "need AES cryptographic extension support");
3555 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
3556 int entry_count = StubInfo::entry_count(stub_id);
3557 assert(entry_count == 1, "sanity check");
3558 address start = load_archive_data(stub_id);
3559 if (start != nullptr) {
3560 return start;
3561 }
3562 __ align(CodeEntryAlignment);
3563 StubCodeMark mark(this, stub_id);
3564
3565 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3566
3567 const Register from = c_rarg0; // source array address
3568 const Register to = c_rarg1; // destination array address
3569 const Register key = c_rarg2; // key array address
3570 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3571 // and left with the results of the last encryption block
3572 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3573 const Register keylen = rscratch1;
3574
3575 start = __ pc();
3576
3577 __ enter();
3578
3579 __ movw(rscratch2, len_reg);
3580
3581 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3582
3583 __ ld1(v2, __ T16B, rvec);
3584
3585 __ ld1(v31, __ T16B, __ post(key, 16));
3586 __ rev32(v31, __ T16B, v31);
3587
3588 __ cmpw(keylen, 52);
3589 __ br(Assembler::CC, L_loadkeys_44);
3590 __ br(Assembler::EQ, L_loadkeys_52);
3591
3592 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3593 __ rev32(v17, __ T16B, v17);
3594 __ rev32(v18, __ T16B, v18);
3595 __ BIND(L_loadkeys_52);
3596 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3597 __ rev32(v19, __ T16B, v19);
3598 __ rev32(v20, __ T16B, v20);
3599 __ BIND(L_loadkeys_44);
3600 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3601 __ rev32(v21, __ T16B, v21);
3602 __ rev32(v22, __ T16B, v22);
3603 __ rev32(v23, __ T16B, v23);
3604 __ rev32(v24, __ T16B, v24);
3605 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3606 __ rev32(v25, __ T16B, v25);
3607 __ rev32(v26, __ T16B, v26);
3608 __ rev32(v27, __ T16B, v27);
3609 __ rev32(v28, __ T16B, v28);
3610 __ ld1(v29, v30, __ T16B, key);
3611 __ rev32(v29, __ T16B, v29);
3612 __ rev32(v30, __ T16B, v30);
3613
3614 __ BIND(L_aes_loop);
3615 __ ld1(v0, __ T16B, __ post(from, 16));
3616 __ orr(v1, __ T16B, v0, v0);
3617
3618 __ br(Assembler::CC, L_rounds_44);
3619 __ br(Assembler::EQ, L_rounds_52);
3620
3621 __ aesd(v0, v17); __ aesimc(v0, v0);
3622 __ aesd(v0, v18); __ aesimc(v0, v0);
3623 __ BIND(L_rounds_52);
3624 __ aesd(v0, v19); __ aesimc(v0, v0);
3625 __ aesd(v0, v20); __ aesimc(v0, v0);
3626 __ BIND(L_rounds_44);
3627 __ aesd(v0, v21); __ aesimc(v0, v0);
3628 __ aesd(v0, v22); __ aesimc(v0, v0);
3629 __ aesd(v0, v23); __ aesimc(v0, v0);
3630 __ aesd(v0, v24); __ aesimc(v0, v0);
3631 __ aesd(v0, v25); __ aesimc(v0, v0);
3632 __ aesd(v0, v26); __ aesimc(v0, v0);
3633 __ aesd(v0, v27); __ aesimc(v0, v0);
3634 __ aesd(v0, v28); __ aesimc(v0, v0);
3635 __ aesd(v0, v29); __ aesimc(v0, v0);
3636 __ aesd(v0, v30);
3637 __ eor(v0, __ T16B, v0, v31);
3638 __ eor(v0, __ T16B, v0, v2);
3639
3640 __ st1(v0, __ T16B, __ post(to, 16));
3641 __ orr(v2, __ T16B, v1, v1);
3642
3643 __ subw(len_reg, len_reg, 16);
3644 __ cbnzw(len_reg, L_aes_loop);
3645
3646 __ st1(v2, __ T16B, rvec);
3647
3648 __ mov(r0, rscratch2);
3649
3650 __ leave();
3651 __ ret(lr);
3652
3653 // record the stub entry and end
3654 store_archive_data(stub_id, start, __ pc());
3655
3656 return start;
3657 }
3658
3659 // Big-endian 128-bit + 64-bit -> 128-bit addition.
3660 // Inputs: 128-bits. in is preserved.
3661 // The least-significant 64-bit word is in the upper dword of each vector.
3662 // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
3663 // Output: result
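     // Illustration: if the 128-bit value has MSD = 1 and LSD = 0xffffffffffffffff
     // and inc = 1, then addv wraps the LSD to 0; cm(HI) sets the LSD lane of tmp
     // to all-ones (inc > 0) and leaves the MSD lane zero; ext swaps the lanes so
     // the all-ones word lines up with the MSD; and subv computes MSD - (-1) = 2,
     // propagating the carry.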
3664 void be_add_128_64(FloatRegister result, FloatRegister in,
3665 FloatRegister inc, FloatRegister tmp) {
3666 assert_different_registers(result, tmp, inc);
3667
3668 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
3669 // input
3670     __ cm(__ HI, tmp, __ T2D, inc, result); // Check for result overflowing
3671 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
3672 // MSD == 0 (must be!) to LSD
3673 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
3674 }
3675
3676 // CTR AES crypt.
3677 // Arguments:
3678 //
3679 // Inputs:
3680 // c_rarg0 - source byte array address
3681 // c_rarg1 - destination byte array address
3682 // c_rarg2 - sessionKe (key) in little endian int array
3683 // c_rarg3 - counter vector byte array address
3684 // c_rarg4 - input length
3685 // c_rarg5 - saved encryptedCounter start
3686 // c_rarg6 - saved used length
3687 //
3688 // Output:
3689 // r0 - input length
3690 //
3691 address generate_counterMode_AESCrypt() {
3692 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
3693 int entry_count = StubInfo::entry_count(stub_id);
3694 assert(entry_count == 1, "sanity check");
3695 address start = load_archive_data(stub_id);
3696 if (start != nullptr) {
3697 return start;
3698 }
3699 const Register in = c_rarg0;
3700 const Register out = c_rarg1;
3701 const Register key = c_rarg2;
3702 const Register counter = c_rarg3;
3703 const Register saved_len = c_rarg4, len = r10;
3704 const Register saved_encrypted_ctr = c_rarg5;
3705 const Register used_ptr = c_rarg6, used = r12;
3706
3707 const Register offset = r7;
3708 const Register keylen = r11;
3709
3710 const unsigned char block_size = 16;
3711 const int bulk_width = 4;
3712 // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3713 // performance with larger data sizes, but it also means that the
3714 // fast path isn't used until you have at least 8 blocks, and up
3715     // to 127 bytes of data will be processed on the slow path. For
3716 // that reason, and also so as not to blow away too much icache, 4
3717 // blocks seems like a sensible compromise.
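     // With bulk_width == 4 the bulk path consumes 64 bytes per iteration;
     // any remaining data shorter than that is handled one block (or one
     // byte) at a time below.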
3718
3719 // Algorithm:
3720 //
3721 // if (len == 0) {
3722 // goto DONE;
3723 // }
3724 // int result = len;
3725 // do {
3726 // if (used >= blockSize) {
3727 // if (len >= bulk_width * blockSize) {
3728 // CTR_large_block();
3729 // if (len == 0)
3730 // goto DONE;
3731 // }
3732 // for (;;) {
3733 // 16ByteVector v0 = counter;
3734 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3735 // used = 0;
3736 // if (len < blockSize)
3737 // break; /* goto NEXT */
3738 // 16ByteVector v1 = load16Bytes(in, offset);
3739 // v1 = v1 ^ encryptedCounter;
3740 // store16Bytes(out, offset);
3741 // used = blockSize;
3742 // offset += blockSize;
3743 // len -= blockSize;
3744 // if (len == 0)
3745 // goto DONE;
3746 // }
3747 // }
3748 // NEXT:
3749 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3750 // len--;
3751 // } while (len != 0);
3752 // DONE:
3753 // return result;
3754 //
3755 // CTR_large_block()
3756 // Wide bulk encryption of whole blocks.
3757
3758 __ align(CodeEntryAlignment);
3759 StubCodeMark mark(this, stub_id);
3760 start = __ pc();
3761 __ enter();
3762
3763 Label DONE, CTR_large_block, large_block_return;
3764 __ ldrw(used, Address(used_ptr));
3765 __ cbzw(saved_len, DONE);
3766
3767 __ mov(len, saved_len);
3768 __ mov(offset, 0);
3769
3770 // Compute #rounds for AES based on the length of the key array
3771 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3772
3773 __ aesenc_loadkeys(key, keylen);
3774
3775 {
3776 Label L_CTR_loop, NEXT;
3777
3778 __ bind(L_CTR_loop);
3779
3780 __ cmp(used, block_size);
3781 __ br(__ LO, NEXT);
3782
3783 // Maybe we have a lot of data
3784 __ subsw(rscratch1, len, bulk_width * block_size);
3785 __ br(__ HS, CTR_large_block);
3786 __ BIND(large_block_return);
3787 __ cbzw(len, DONE);
3788
3789 // Setup the counter
3790 __ movi(v4, __ T4S, 0);
3791 __ movi(v5, __ T4S, 1);
3792 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3793
3794 // 128-bit big-endian increment
3795 __ ld1(v0, __ T16B, counter);
3796 __ rev64(v16, __ T16B, v0);
3797 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3798 __ rev64(v16, __ T16B, v16);
3799 __ st1(v16, __ T16B, counter);
3800 // Previous counter value is in v0
3801 // v4 contains { 0, 1 }
3802
3803 {
3804 // We have fewer than bulk_width blocks of data left. Encrypt
3805 // them one by one until there is less than a full block
3806 // remaining, being careful to save both the encrypted counter
3807 // and the counter.
3808
3809 Label inner_loop;
3810 __ bind(inner_loop);
3811 // Counter to encrypt is in v0
3812 __ aesecb_encrypt(noreg, noreg, keylen);
3813 __ st1(v0, __ T16B, saved_encrypted_ctr);
3814
3815 // Do we have a remaining full block?
3816
3817 __ mov(used, 0);
3818 __ cmp(len, block_size);
3819 __ br(__ LO, NEXT);
3820
3821 // Yes, we have a full block
3822 __ ldrq(v1, Address(in, offset));
3823 __ eor(v1, __ T16B, v1, v0);
3824 __ strq(v1, Address(out, offset));
3825 __ mov(used, block_size);
3826 __ add(offset, offset, block_size);
3827
3828 __ subw(len, len, block_size);
3829 __ cbzw(len, DONE);
3830
3831 // Increment the counter, store it back
3832 __ orr(v0, __ T16B, v16, v16);
3833 __ rev64(v16, __ T16B, v16);
3834 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3835 __ rev64(v16, __ T16B, v16);
3836 __ st1(v16, __ T16B, counter); // Save the incremented counter back
3837
3838 __ b(inner_loop);
3839 }
3840
3841 __ BIND(NEXT);
3842
3843 // Encrypt a single byte, and loop.
3844 // We expect this to be a rare event.
3845 __ ldrb(rscratch1, Address(in, offset));
3846 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3847 __ eor(rscratch1, rscratch1, rscratch2);
3848 __ strb(rscratch1, Address(out, offset));
3849 __ add(offset, offset, 1);
3850 __ add(used, used, 1);
3851       __ subw(len, len, 1);
3852 __ cbnzw(len, L_CTR_loop);
3853 }
3854
3855 __ bind(DONE);
3856 __ strw(used, Address(used_ptr));
3857 __ mov(r0, saved_len);
3858
3859 __ leave(); // required for proper stackwalking of RuntimeStub frame
3860 __ ret(lr);
3861
3862 // Bulk encryption
3863
3864 __ BIND (CTR_large_block);
3865 assert(bulk_width == 4 || bulk_width == 8, "must be");
3866
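       // v8..v15 are callee-saved SIMD registers under the AAPCS64 (at least
       // their low 64 bits), and the bulk loop below clobbers them, so save
       // and restore them around it.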
3867 if (bulk_width == 8) {
3868 __ sub(sp, sp, 4 * 16);
3869 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3870 }
3871 __ sub(sp, sp, 4 * 16);
3872 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3873 RegSet saved_regs = (RegSet::of(in, out, offset)
3874 + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3875 __ push(saved_regs, sp);
3876 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption
3877 __ add(in, in, offset);
3878 __ add(out, out, offset);
3879
3880 // Keys should already be loaded into the correct registers
3881
3882 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3883 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3884
3885 // AES/CTR loop
3886 {
3887 Label L_CTR_loop;
3888 __ BIND(L_CTR_loop);
3889
3890 // Setup the counters
3891 __ movi(v8, __ T4S, 0);
3892 __ movi(v9, __ T4S, 1);
3893 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3894
3895 for (int i = 0; i < bulk_width; i++) {
3896 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3897 __ rev64(v0_ofs, __ T16B, v16);
3898 be_add_128_64(v16, v16, v8, /*tmp*/v9);
3899 }
3900
3901 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3902
3903 // Encrypt the counters
3904 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3905
3906 if (bulk_width == 8) {
3907 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3908 }
3909
3910 // XOR the encrypted counters with the inputs
3911 for (int i = 0; i < bulk_width; i++) {
3912 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3913 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3914 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3915 }
3916
3917 // Write the encrypted data
3918 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3919 if (bulk_width == 8) {
3920 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3921 }
3922
3923 __ subw(len, len, 16 * bulk_width);
3924 __ cbnzw(len, L_CTR_loop);
3925 }
3926
3927 // Save the counter back where it goes
3928 __ rev64(v16, __ T16B, v16);
3929 __ st1(v16, __ T16B, counter);
3930
3931 __ pop(saved_regs, sp);
3932
3933 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3934 if (bulk_width == 8) {
3935 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3936 }
3937
3938 __ andr(rscratch1, len, -16 * bulk_width);
3939 __ sub(len, len, rscratch1);
3940 __ add(offset, offset, rscratch1);
3941 __ mov(used, 16);
3942 __ strw(used, Address(used_ptr));
3943 __ b(large_block_return);
3944
3945 // record the stub entry and end
3946 store_archive_data(stub_id, start, __ pc());
3947
3948 return start;
3949 }
3950
3951 // Vector AES Galois Counter Mode implementation. Parameters:
3952 //
3953 // in = c_rarg0
3954 // len = c_rarg1
3955 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3956 // out = c_rarg3
3957 // key = c_rarg4
3958 // state = c_rarg5 - GHASH.state
3959 // subkeyHtbl = c_rarg6 - powers of H
3960 // counter = c_rarg7 - 16 bytes of CTR
3961 // return - number of processed bytes
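       // Only whole multiples of 8 blocks (128 bytes) are handled here: len is
       // rounded down to a multiple of 128 below and the rounded value is what
       // is returned, so any tail is presumably left to the caller.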
3962 address generate_galoisCounterMode_AESCrypt() {
3963 Label ghash_polynomial; // local data generated after code
3964 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
3965 int entry_count = StubInfo::entry_count(stub_id);
3966 assert(entry_count == 1, "sanity check");
3967 address start = load_archive_data(stub_id);
3968 if (start != nullptr) {
3969 return start;
3970 }
3971 __ align(CodeEntryAlignment);
3972 StubCodeMark mark(this, stub_id);
3973 start = __ pc();
3974 __ enter();
3975
3976 const Register in = c_rarg0;
3977 const Register len = c_rarg1;
3978 const Register ct = c_rarg2;
3979 const Register out = c_rarg3;
3980     // (the counter at c_rarg7 is updated with the incremented value at the end)
3981
3982 const Register key = c_rarg4;
3983 const Register state = c_rarg5;
3984
3985 const Register subkeyHtbl = c_rarg6;
3986
3987 const Register counter = c_rarg7;
3988
3989 const Register keylen = r10;
3990 // Save state before entering routine
3991 __ sub(sp, sp, 4 * 16);
3992 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3993 __ sub(sp, sp, 4 * 16);
3994 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3995
3996 // __ andr(len, len, -512);
3997 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
3998 __ str(len, __ pre(sp, -2 * wordSize));
3999
4000 Label DONE;
4001 __ cbz(len, DONE);
4002
4003 // Compute #rounds for AES based on the length of the key array
4004 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4005
4006 __ aesenc_loadkeys(key, keylen);
4007 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
4008 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
4009
4010 // AES/CTR loop
4011 {
4012 Label L_CTR_loop;
4013 __ BIND(L_CTR_loop);
4014
4015 // Setup the counters
4016 __ movi(v8, __ T4S, 0);
4017 __ movi(v9, __ T4S, 1);
4018 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
4019
4020       assert(v0->encoding() < v8->encoding(), "counter blocks must be generated in v0..v7, below v8");
4021 for (int i = v0->encoding(); i < v8->encoding(); i++) {
4022 FloatRegister f = as_FloatRegister(i);
4023 __ rev32(f, __ T16B, v16);
4024 __ addv(v16, __ T4S, v16, v8);
4025 }
4026
4027 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
4028
4029 // Encrypt the counters
4030 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
4031
4032 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
4033
4034 // XOR the encrypted counters with the inputs
4035 for (int i = 0; i < 8; i++) {
4036 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
4037 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
4038 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
4039 }
4040 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
4041 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
4042
4043 __ subw(len, len, 16 * 8);
4044 __ cbnzw(len, L_CTR_loop);
4045 }
4046
4047 __ rev32(v16, __ T16B, v16);
4048 __ st1(v16, __ T16B, counter);
4049
4050 __ ldr(len, Address(sp));
4051 __ lsr(len, len, exact_log2(16)); // We want the count of blocks
4052
4053 // GHASH/CTR loop
4054 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
4055 len, /*unrolls*/4);
4056
4057 #ifdef ASSERT
4058 { Label L;
4059 __ cmp(len, (unsigned char)0);
4060 __ br(Assembler::EQ, L);
4061 __ stop("stubGenerator: abort");
4062 __ bind(L);
4063 }
4064 #endif
4065
4066 __ bind(DONE);
4067 // Return the number of bytes processed
4068 __ ldr(r0, __ post(sp, 2 * wordSize));
4069
4070 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
4071 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
4072
4073 __ leave(); // required for proper stackwalking of RuntimeStub frame
4074 __ ret(lr);
4075
4076 // bind label and generate polynomial data
4077 __ align(wordSize * 2);
4078 __ bind(ghash_polynomial);
4079 __ emit_int64(0x87); // The low-order bits of the field
4080 // polynomial (i.e. p = z^7+z^2+z+1)
4081 // repeated in the low and high parts of a
4082 // 128-bit vector
4083 __ emit_int64(0x87);
4084
4085 // record the stub entry and end
4086 store_archive_data(stub_id, start, __ pc());
4087
4088 return start;
4089 }
4090
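   // Caches one 64-byte input block in eight general-purpose registers so
   // that the MD5 round helpers below can extract 32-bit message words
   // without reloading them from memory.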
4091 class Cached64Bytes {
4092 private:
4093 MacroAssembler *_masm;
4094 Register _regs[8];
4095
4096 public:
4097 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
4098       assert(rs.size() == 8, "%u registers given; 8 are needed to cache 16 4-byte words", rs.size());
4099 auto it = rs.begin();
4100 for (auto &r: _regs) {
4101 r = *it;
4102 ++it;
4103 }
4104 }
4105
4106 void gen_loads(Register base) {
4107 for (int i = 0; i < 8; i += 2) {
4108 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
4109 }
4110 }
4111
4112 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
4113 void extract_u32(Register dest, int i) {
4114 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
4115 }
4116 };
4117
4118 // Utility routines for md5.
4119 // Clobbers r10 and r11.
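   // Each helper computes one MD5 step of the form
   //   r1 = r2 + rol32(r1 + f(r2, r3, r4) + x[k] + t, s)
   // where f is the round's auxiliary function from RFC 1321:
   //   FF: f = (b & c) | (~b & d), computed here as ((c ^ d) & b) ^ d
   //   GG: f = (b & d) | (c & ~d), whose two terms are disjoint and so can be added
   //   HH: f = b ^ c ^ d
   //   II: f = c ^ (b | ~d)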
4120 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4121 int k, int s, int t) {
4122 Register rscratch3 = r10;
4123 Register rscratch4 = r11;
4124
4125 __ eorw(rscratch3, r3, r4);
4126 __ movw(rscratch2, t);
4127 __ andw(rscratch3, rscratch3, r2);
4128 __ addw(rscratch4, r1, rscratch2);
4129 reg_cache.extract_u32(rscratch1, k);
4130 __ eorw(rscratch3, rscratch3, r4);
4131 __ addw(rscratch4, rscratch4, rscratch1);
4132 __ addw(rscratch3, rscratch3, rscratch4);
4133 __ rorw(rscratch2, rscratch3, 32 - s);
4134 __ addw(r1, rscratch2, r2);
4135 }
4136
4137 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4138 int k, int s, int t) {
4139 Register rscratch3 = r10;
4140 Register rscratch4 = r11;
4141
4142 reg_cache.extract_u32(rscratch1, k);
4143 __ movw(rscratch2, t);
4144 __ addw(rscratch4, r1, rscratch2);
4145 __ addw(rscratch4, rscratch4, rscratch1);
4146 __ bicw(rscratch2, r3, r4);
4147 __ andw(rscratch3, r2, r4);
4148 __ addw(rscratch2, rscratch2, rscratch4);
4149 __ addw(rscratch2, rscratch2, rscratch3);
4150 __ rorw(rscratch2, rscratch2, 32 - s);
4151 __ addw(r1, rscratch2, r2);
4152 }
4153
4154 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4155 int k, int s, int t) {
4156 Register rscratch3 = r10;
4157 Register rscratch4 = r11;
4158
4159 __ eorw(rscratch3, r3, r4);
4160 __ movw(rscratch2, t);
4161 __ addw(rscratch4, r1, rscratch2);
4162 reg_cache.extract_u32(rscratch1, k);
4163 __ eorw(rscratch3, rscratch3, r2);
4164 __ addw(rscratch4, rscratch4, rscratch1);
4165 __ addw(rscratch3, rscratch3, rscratch4);
4166 __ rorw(rscratch2, rscratch3, 32 - s);
4167 __ addw(r1, rscratch2, r2);
4168 }
4169
4170 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4171 int k, int s, int t) {
4172 Register rscratch3 = r10;
4173 Register rscratch4 = r11;
4174
4175 __ movw(rscratch3, t);
4176 __ ornw(rscratch2, r2, r4);
4177 __ addw(rscratch4, r1, rscratch3);
4178 reg_cache.extract_u32(rscratch1, k);
4179 __ eorw(rscratch3, rscratch2, r3);
4180 __ addw(rscratch4, rscratch4, rscratch1);
4181 __ addw(rscratch3, rscratch3, rscratch4);
4182 __ rorw(rscratch2, rscratch3, 32 - s);
4183 __ addw(r1, rscratch2, r2);
4184 }
4185
4186 // Arguments:
4187 //
4188 // Inputs:
4189 // c_rarg0 - byte[] source+offset
4190 // c_rarg1 - int[] SHA.state
4191 // c_rarg2 - int offset
4192 // c_rarg3 - int limit
4193 //
4194 address generate_md5_implCompress(StubId stub_id) {
4195 bool multi_block;
4196 switch (stub_id) {
4197 case StubId::stubgen_md5_implCompress_id:
4198 multi_block = false;
4199 break;
4200 case StubId::stubgen_md5_implCompressMB_id:
4201 multi_block = true;
4202 break;
4203 default:
4204 ShouldNotReachHere();
4205 }
4206 int entry_count = StubInfo::entry_count(stub_id);
4207 assert(entry_count == 1, "sanity check");
4208 address start = load_archive_data(stub_id);
4209 if (start != nullptr) {
4210 return start;
4211 }
4212 __ align(CodeEntryAlignment);
4213
4214 StubCodeMark mark(this, stub_id);
4215 start = __ pc();
4216
4217 Register buf = c_rarg0;
4218 Register state = c_rarg1;
4219 Register ofs = c_rarg2;
4220 Register limit = c_rarg3;
4221 Register a = r4;
4222 Register b = r5;
4223 Register c = r6;
4224 Register d = r7;
4225 Register rscratch3 = r10;
4226 Register rscratch4 = r11;
4227
4228 Register state_regs[2] = { r12, r13 };
4229 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
4230 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
4231
4232 __ push(saved_regs, sp);
4233
4234 __ ldp(state_regs[0], state_regs[1], Address(state));
4235 __ ubfx(a, state_regs[0], 0, 32);
4236 __ ubfx(b, state_regs[0], 32, 32);
4237 __ ubfx(c, state_regs[1], 0, 32);
4238 __ ubfx(d, state_regs[1], 32, 32);
4239
4240 Label md5_loop;
4241 __ BIND(md5_loop);
4242
4243 reg_cache.gen_loads(buf);
4244
4245 // Round 1
4246 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
4247 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
4248 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
4249 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
4250 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
4251 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
4252 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
4253 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
4254 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
4255 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
4256 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
4257 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
4258 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
4259 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
4260 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
4261 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
4262
4263 // Round 2
4264 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
4265 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
4266 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
4267 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
4268 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
4269 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
4270 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
4271 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
4272 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
4273 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
4274 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
4275 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
4276 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
4277 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
4278 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
4279 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
4280
4281 // Round 3
4282 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
4283 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
4284 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
4285 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
4286 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
4287 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
4288 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
4289 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
4290 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
4291 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
4292 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
4293 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
4294 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
4295 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
4296 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
4297 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
4298
4299 // Round 4
4300 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
4301 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
4302 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
4303 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
4304 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
4305 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
4306 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
4307 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
4308 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
4309 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
4310 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
4311 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
4312 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
4313 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
4314 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
4315 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
4316
4317 __ addw(a, state_regs[0], a);
4318 __ ubfx(rscratch2, state_regs[0], 32, 32);
4319 __ addw(b, rscratch2, b);
4320 __ addw(c, state_regs[1], c);
4321 __ ubfx(rscratch4, state_regs[1], 32, 32);
4322 __ addw(d, rscratch4, d);
4323
4324 __ orr(state_regs[0], a, b, Assembler::LSL, 32);
4325 __ orr(state_regs[1], c, d, Assembler::LSL, 32);
4326
4327 if (multi_block) {
4328 __ add(buf, buf, 64);
4329 __ add(ofs, ofs, 64);
4330 __ cmp(ofs, limit);
4331 __ br(Assembler::LE, md5_loop);
4332 __ mov(c_rarg0, ofs); // return ofs
4333 }
4334
4335 // write hash values back in the correct order
4336 __ stp(state_regs[0], state_regs[1], Address(state));
4337
4338 __ pop(saved_regs, sp);
4339
4340 __ ret(lr);
4341
4342 // record the stub entry and end
4343 store_archive_data(stub_id, start, __ pc());
4344
4345 return start;
4346 }
4347
4348 // Arguments:
4349 //
4350 // Inputs:
4351 // c_rarg0 - byte[] source+offset
4352 // c_rarg1 - int[] SHA.state
4353 // c_rarg2 - int offset
4354 // c_rarg3 - int limit
4355 //
4356 address generate_sha1_implCompress(StubId stub_id) {
4357 bool multi_block;
4358 switch (stub_id) {
4359 case StubId::stubgen_sha1_implCompress_id:
4360 multi_block = false;
4361 break;
4362 case StubId::stubgen_sha1_implCompressMB_id:
4363 multi_block = true;
4364 break;
4365 default:
4366 ShouldNotReachHere();
4367 }
4368 int entry_count = StubInfo::entry_count(stub_id);
4369 assert(entry_count == 1, "sanity check");
4370 address start = load_archive_data(stub_id);
4371 if (start != nullptr) {
4372 return start;
4373 }
4374 __ align(CodeEntryAlignment);
4375
4376 StubCodeMark mark(this, stub_id);
4377 start = __ pc();
4378
4379 Register buf = c_rarg0;
4380 Register state = c_rarg1;
4381 Register ofs = c_rarg2;
4382 Register limit = c_rarg3;
4383
4384 Label keys;
4385 Label sha1_loop;
4386
4387 // load the keys into v0..v3
4388 __ adr(rscratch1, keys);
4389 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
4390 // load 5 words state into v6, v7
4391 __ ldrq(v6, Address(state, 0));
4392 __ ldrs(v7, Address(state, 16));
4393
4394
4395 __ BIND(sha1_loop);
4396 // load 64 bytes of data into v16..v19
4397 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
4398 __ rev32(v16, __ T16B, v16);
4399 __ rev32(v17, __ T16B, v17);
4400 __ rev32(v18, __ T16B, v18);
4401 __ rev32(v19, __ T16B, v19);
4402
4403 // do the sha1
4404 __ addv(v4, __ T4S, v16, v0);
4405 __ orr(v20, __ T16B, v6, v6);
4406
4407 FloatRegister d0 = v16;
4408 FloatRegister d1 = v17;
4409 FloatRegister d2 = v18;
4410 FloatRegister d3 = v19;
4411
4412 for (int round = 0; round < 20; round++) {
4413 FloatRegister tmp1 = (round & 1) ? v4 : v5;
4414 FloatRegister tmp2 = (round & 1) ? v21 : v22;
4415 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
4416 FloatRegister tmp4 = (round & 1) ? v5 : v4;
4417 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
4418
4419 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
4420 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
4421 __ sha1h(tmp2, __ T4S, v20);
4422 if (round < 5)
4423 __ sha1c(v20, __ T4S, tmp3, tmp4);
4424 else if (round < 10 || round >= 15)
4425 __ sha1p(v20, __ T4S, tmp3, tmp4);
4426 else
4427 __ sha1m(v20, __ T4S, tmp3, tmp4);
4428 if (round < 16) __ sha1su1(d0, __ T4S, d3);
4429
4430 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4431 }
4432
4433 __ addv(v7, __ T2S, v7, v21);
4434 __ addv(v6, __ T4S, v6, v20);
4435
4436 if (multi_block) {
4437 __ add(ofs, ofs, 64);
4438 __ cmp(ofs, limit);
4439 __ br(Assembler::LE, sha1_loop);
4440 __ mov(c_rarg0, ofs); // return ofs
4441 }
4442
4443 __ strq(v6, Address(state, 0));
4444 __ strs(v7, Address(state, 16));
4445
4446 __ ret(lr);
4447
4448 __ bind(keys);
4449 __ emit_int32(0x5a827999);
4450 __ emit_int32(0x6ed9eba1);
4451 __ emit_int32(0x8f1bbcdc);
4452 __ emit_int32(0xca62c1d6);
4453
4454 // record the stub entry and end
4455 store_archive_data(stub_id, start, __ pc());
4456
4457 return start;
4458 }
4459
4460
4461 // Arguments:
4462 //
4463 // Inputs:
4464 // c_rarg0 - byte[] source+offset
4465 // c_rarg1 - int[] SHA.state
4466 // c_rarg2 - int offset
4467 // c_rarg3 - int limit
4468 //
4469 address generate_sha256_implCompress(StubId stub_id) {
4470 bool multi_block;
4471 switch (stub_id) {
4472 case StubId::stubgen_sha256_implCompress_id:
4473 multi_block = false;
4474 break;
4475 case StubId::stubgen_sha256_implCompressMB_id:
4476 multi_block = true;
4477 break;
4478 default:
4479 ShouldNotReachHere();
4480 }
4481 int entry_count = StubInfo::entry_count(stub_id);
4482 assert(entry_count == 1, "sanity check");
4483 address start = load_archive_data(stub_id);
4484 if (start != nullptr) {
4485 return start;
4486 }
4487 __ align(CodeEntryAlignment);
4488 StubCodeMark mark(this, stub_id);
4489 start = __ pc();
4490
4491 Register buf = c_rarg0;
4492 Register state = c_rarg1;
4493 Register ofs = c_rarg2;
4494 Register limit = c_rarg3;
4495
4496 Label sha1_loop;
4497
4498 __ stpd(v8, v9, __ pre(sp, -32));
4499 __ stpd(v10, v11, Address(sp, 16));
4500
4501 // dga == v0
4502 // dgb == v1
4503 // dg0 == v2
4504 // dg1 == v3
4505 // dg2 == v4
4506 // t0 == v6
4507 // t1 == v7
4508
4509 // load 16 keys to v16..v31
4510 __ lea(rscratch1, ExternalAddress((address)_sha256_round_consts));
4511 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
4512 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
4513 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
4514 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
4515
4516 // load 8 words (256 bits) state
4517 __ ldpq(v0, v1, state);
4518
4519 __ BIND(sha1_loop);
4520 // load 64 bytes of data into v8..v11
4521 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
4522 __ rev32(v8, __ T16B, v8);
4523 __ rev32(v9, __ T16B, v9);
4524 __ rev32(v10, __ T16B, v10);
4525 __ rev32(v11, __ T16B, v11);
4526
4527 __ addv(v6, __ T4S, v8, v16);
4528 __ orr(v2, __ T16B, v0, v0);
4529 __ orr(v3, __ T16B, v1, v1);
4530
4531 FloatRegister d0 = v8;
4532 FloatRegister d1 = v9;
4533 FloatRegister d2 = v10;
4534 FloatRegister d3 = v11;
4535
4536
4537 for (int round = 0; round < 16; round++) {
4538 FloatRegister tmp1 = (round & 1) ? v6 : v7;
4539 FloatRegister tmp2 = (round & 1) ? v7 : v6;
4540 FloatRegister tmp3 = (round & 1) ? v2 : v4;
4541 FloatRegister tmp4 = (round & 1) ? v4 : v2;
4542
4543 if (round < 12) __ sha256su0(d0, __ T4S, d1);
4544 __ orr(v4, __ T16B, v2, v2);
4545 if (round < 15)
4546 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
4547 __ sha256h(v2, __ T4S, v3, tmp2);
4548 __ sha256h2(v3, __ T4S, v4, tmp2);
4549 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
4550
4551 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4552 }
4553
4554 __ addv(v0, __ T4S, v0, v2);
4555 __ addv(v1, __ T4S, v1, v3);
4556
4557 if (multi_block) {
4558 __ add(ofs, ofs, 64);
4559 __ cmp(ofs, limit);
4560 __ br(Assembler::LE, sha1_loop);
4561 __ mov(c_rarg0, ofs); // return ofs
4562 }
4563
4564 __ ldpd(v10, v11, Address(sp, 16));
4565 __ ldpd(v8, v9, __ post(sp, 32));
4566
4567 __ stpq(v0, v1, state);
4568
4569 __ ret(lr);
4570
4571 // record the stub entry and end
4572 store_archive_data(stub_id, start, __ pc());
4573
4574 return start;
4575 }
4576
4577 // Double rounds for sha512.
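   // vi0..vi4 are the working-state vectors for this double round; vrc0/vrc1
   // hold the current and next round constants (the next one is loaded from
   // the pointer in rscratch2 while dr < 36); vin0..vin4 are message schedule
   // words, and the schedule is only extended (sha512su0/su1) while dr < 32.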
4578 void sha512_dround(int dr,
4579 FloatRegister vi0, FloatRegister vi1,
4580 FloatRegister vi2, FloatRegister vi3,
4581 FloatRegister vi4, FloatRegister vrc0,
4582 FloatRegister vrc1, FloatRegister vin0,
4583 FloatRegister vin1, FloatRegister vin2,
4584 FloatRegister vin3, FloatRegister vin4) {
4585 if (dr < 36) {
4586 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
4587 }
4588 __ addv(v5, __ T2D, vrc0, vin0);
4589 __ ext(v6, __ T16B, vi2, vi3, 8);
4590 __ ext(v5, __ T16B, v5, v5, 8);
4591 __ ext(v7, __ T16B, vi1, vi2, 8);
4592 __ addv(vi3, __ T2D, vi3, v5);
4593 if (dr < 32) {
4594 __ ext(v5, __ T16B, vin3, vin4, 8);
4595 __ sha512su0(vin0, __ T2D, vin1);
4596 }
4597 __ sha512h(vi3, __ T2D, v6, v7);
4598 if (dr < 32) {
4599 __ sha512su1(vin0, __ T2D, vin2, v5);
4600 }
4601 __ addv(vi4, __ T2D, vi1, vi3);
4602 __ sha512h2(vi3, __ T2D, vi1, vi0);
4603 }
4604
4605 // Arguments:
4606 //
4607 // Inputs:
4608 // c_rarg0 - byte[] source+offset
4609 // c_rarg1 - int[] SHA.state
4610 // c_rarg2 - int offset
4611 // c_rarg3 - int limit
4612 //
4613 address generate_sha512_implCompress(StubId stub_id) {
4614 bool multi_block;
4615 switch (stub_id) {
4616 case StubId::stubgen_sha512_implCompress_id:
4617 multi_block = false;
4618 break;
4619 case StubId::stubgen_sha512_implCompressMB_id:
4620 multi_block = true;
4621 break;
4622 default:
4623 ShouldNotReachHere();
4624 }
4625 int entry_count = StubInfo::entry_count(stub_id);
4626 assert(entry_count == 1, "sanity check");
4627 address start = load_archive_data(stub_id);
4628 if (start != nullptr) {
4629 return start;
4630 }
4631 __ align(CodeEntryAlignment);
4632 StubCodeMark mark(this, stub_id);
4633 start = __ pc();
4634
4635 Register buf = c_rarg0;
4636 Register state = c_rarg1;
4637 Register ofs = c_rarg2;
4638 Register limit = c_rarg3;
4639
4640 __ stpd(v8, v9, __ pre(sp, -64));
4641 __ stpd(v10, v11, Address(sp, 16));
4642 __ stpd(v12, v13, Address(sp, 32));
4643 __ stpd(v14, v15, Address(sp, 48));
4644
4645 Label sha512_loop;
4646
4647 // load state
4648 __ ld1(v8, v9, v10, v11, __ T2D, state);
4649
4650 // load first 4 round constants
4651 __ lea(rscratch1, ExternalAddress((address)_sha512_round_consts));
4652 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
4653
4654 __ BIND(sha512_loop);
4655 // load 128B of data into v12..v19
4656 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
4657 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
4658 __ rev64(v12, __ T16B, v12);
4659 __ rev64(v13, __ T16B, v13);
4660 __ rev64(v14, __ T16B, v14);
4661 __ rev64(v15, __ T16B, v15);
4662 __ rev64(v16, __ T16B, v16);
4663 __ rev64(v17, __ T16B, v17);
4664 __ rev64(v18, __ T16B, v18);
4665 __ rev64(v19, __ T16B, v19);
4666
4667 __ mov(rscratch2, rscratch1);
4668
4669 __ mov(v0, __ T16B, v8);
4670 __ mov(v1, __ T16B, v9);
4671 __ mov(v2, __ T16B, v10);
4672 __ mov(v3, __ T16B, v11);
4673
4674 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4675 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4676 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4677 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4678 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4679 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4680 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4681 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4682 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4683 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4684 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4685 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4686 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4687 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4688 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4689 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4690 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4691 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4692 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4693 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4694 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4695 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4696 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4697 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4698 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4699 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4700 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4701 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4702 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4703 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4704 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4705 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4706 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
4707 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4708 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4709 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4710 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4711 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4712 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4713 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4714
4715 __ addv(v8, __ T2D, v8, v0);
4716 __ addv(v9, __ T2D, v9, v1);
4717 __ addv(v10, __ T2D, v10, v2);
4718 __ addv(v11, __ T2D, v11, v3);
4719
4720 if (multi_block) {
4721 __ add(ofs, ofs, 128);
4722 __ cmp(ofs, limit);
4723 __ br(Assembler::LE, sha512_loop);
4724 __ mov(c_rarg0, ofs); // return ofs
4725 }
4726
4727 __ st1(v8, v9, v10, v11, __ T2D, state);
4728
4729 __ ldpd(v14, v15, Address(sp, 48));
4730 __ ldpd(v12, v13, Address(sp, 32));
4731 __ ldpd(v10, v11, Address(sp, 16));
4732 __ ldpd(v8, v9, __ post(sp, 64));
4733
4734 __ ret(lr);
4735
4736 // record the stub entry and end
4737 store_archive_data(stub_id, start, __ pc());
4738
4739 return start;
4740 }
4741
4742 // Execute one round of Keccak for two computations in parallel.
4743 // One of the states should be loaded into the lower halves of
4744 // the vector registers v0-v24, the other should be loaded into
4745 // the upper halves of those registers. The ld1r instruction loads
4746 // the round constant into both halves of register v31.
4747 // Intermediate results c0...c4 and d0...d4 are computed
4748 // in registers v25...v30.
4749 // All vector instructions that are used operate on both register
4750 // halves in parallel.
4751 // If only a single computation is needed, one can load just the lower halves.
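// As a rough scalar reference of what one round computes (illustrative
// sketch only, not emitted by this stub; a[] is one 25-lane state indexed
// as in the aN names used below, rotl64 is a 64-bit rotate-left, and
// RHO/PI denote the standard Keccak rotation offsets and lane permutation):
//
//   uint64_t c[5], d[5], b[25];
//   for (int x = 0; x < 5; x++)                                       // theta
//     c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
//   for (int x = 0; x < 5; x++)
//     d[x] = c[(x + 4) % 5] ^ rotl64(c[(x + 1) % 5], 1);
//   for (int i = 0; i < 25; i++)                                      // rho + pi
//     b[PI[i]] = rotl64(a[i] ^ d[i % 5], RHO[i]);
//   for (int y = 0; y < 25; y += 5)                                   // chi
//     for (int x = 0; x < 5; x++)
//       a[y + x] = b[y + x] ^ (~b[y + (x + 1) % 5] & b[y + (x + 2) % 5]);
//   a[0] ^= round_constants[round];                                   // iota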
4752 void keccak_round(Register rscratch1) {
4753 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4754 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
4755 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4756 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4757 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4758 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4759 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4760 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4761 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4762 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4763
4764 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4765 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4766 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4767 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4768 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4769
4770 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4771 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4772 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4773 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4774 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4775 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4776 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4777 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4778 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4779 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4780 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4781 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4782 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4783 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4784 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4785 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4786 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4787 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4788 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4789 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4790 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4791 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4792 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4793 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4794 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4795
4796 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22')
4797 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4798 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4799 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4800 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4801
4802 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4803
4804 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4805 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4806 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4807 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4808 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
4809
4810 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
4811 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
4812 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
4813 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
4814 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
4815
4816 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
4817 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
4818 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
4819 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7)
4820 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8')
4821
4822 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
4823 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
4824 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
4825 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3)
4826 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4')
4827
4828 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
4829 }
4830
4831 // Arguments:
4832 //
4833 // Inputs:
4834 // c_rarg0 - byte[] source+offset
4835 // c_rarg1 - byte[] SHA.state
4836 // c_rarg2 - int block_size
4837 // c_rarg3 - int offset
4838 // c_rarg4 - int limit
4839 //
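// For reference, block_size is the sponge rate in bytes (per FIPS 202).
// The rates handled by this stub and the bit tests used below to tell
// them apart are:
//
//   block_size   binary        digest               distinguishing bits
//       72       0b01001000    SHA3-512             bit 7 clear, bit 5 clear
//      104       0b01101000    SHA3-384             bit 7 clear, bit 5 set
//      136       0b10001000    SHA3-256/SHAKE256    bit 7 set, bits 4,5 clear
//      144       0b10010000    SHA3-224             bit 7 set, bit 4 set
//      168       0b10101000    SHAKE128             bit 7 set, bit 5 set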
4840 address generate_sha3_implCompress(StubId stub_id) {
4841 bool multi_block;
4842 switch (stub_id) {
4843 case StubId::stubgen_sha3_implCompress_id:
4844 multi_block = false;
4845 break;
4846 case StubId::stubgen_sha3_implCompressMB_id:
4847 multi_block = true;
4848 break;
4849 default:
4850 ShouldNotReachHere();
4851 }
4852 int entry_count = StubInfo::entry_count(stub_id);
4853 assert(entry_count == 1, "sanity check");
4854 address start = load_archive_data(stub_id);
4855 if (start != nullptr) {
4856 return start;
4857 }
4858 __ align(CodeEntryAlignment);
4859 StubCodeMark mark(this, stub_id);
4860 start = __ pc();
4861
4862 Register buf = c_rarg0;
4863 Register state = c_rarg1;
4864 Register block_size = c_rarg2;
4865 Register ofs = c_rarg3;
4866 Register limit = c_rarg4;
4867
4868 Label sha3_loop, rounds24_loop;
4869 Label sha3_512_or_sha3_384, shake128;
4870
4871 __ stpd(v8, v9, __ pre(sp, -64));
4872 __ stpd(v10, v11, Address(sp, 16));
4873 __ stpd(v12, v13, Address(sp, 32));
4874 __ stpd(v14, v15, Address(sp, 48));
4875
4876 // load state
4877 __ add(rscratch1, state, 32);
4878 __ ld1(v0, v1, v2, v3, __ T1D, state);
4879 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4880 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4881 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4882 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4883 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4884 __ ld1(v24, __ T1D, rscratch1);
4885
4886 __ BIND(sha3_loop);
4887
4888 // 24 keccak rounds
4889 __ movw(rscratch2, 24);
4890
4891 // load round_constants base
4892 __ lea(rscratch1, ExternalAddress((address) _sha3_round_consts));
4893
4894 // load input
4895 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4896 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4897 __ eor(v0, __ T8B, v0, v25);
4898 __ eor(v1, __ T8B, v1, v26);
4899 __ eor(v2, __ T8B, v2, v27);
4900 __ eor(v3, __ T8B, v3, v28);
4901 __ eor(v4, __ T8B, v4, v29);
4902 __ eor(v5, __ T8B, v5, v30);
4903 __ eor(v6, __ T8B, v6, v31);
4904
4905 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4906 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4907
4908 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4909 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4910 __ eor(v7, __ T8B, v7, v25);
4911 __ eor(v8, __ T8B, v8, v26);
4912 __ eor(v9, __ T8B, v9, v27);
4913 __ eor(v10, __ T8B, v10, v28);
4914 __ eor(v11, __ T8B, v11, v29);
4915 __ eor(v12, __ T8B, v12, v30);
4916 __ eor(v13, __ T8B, v13, v31);
4917
4918 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4919 __ eor(v14, __ T8B, v14, v25);
4920 __ eor(v15, __ T8B, v15, v26);
4921 __ eor(v16, __ T8B, v16, v27);
4922
4923 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4924 __ andw(c_rarg5, block_size, 48);
4925 __ cbzw(c_rarg5, rounds24_loop);
4926
4927 __ tbnz(block_size, 5, shake128);
4928 // block_size == 144, bit5 == 0, SHA3-224
4929 __ ldrd(v28, __ post(buf, 8));
4930 __ eor(v17, __ T8B, v17, v28);
4931 __ b(rounds24_loop);
4932
4933 __ BIND(shake128);
4934 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4935 __ eor(v17, __ T8B, v17, v28);
4936 __ eor(v18, __ T8B, v18, v29);
4937 __ eor(v19, __ T8B, v19, v30);
4938 __ eor(v20, __ T8B, v20, v31);
4939 __ b(rounds24_loop); // block_size == 168, SHAKE128
4940
4941 __ BIND(sha3_512_or_sha3_384);
4942 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4943 __ eor(v7, __ T8B, v7, v25);
4944 __ eor(v8, __ T8B, v8, v26);
4945 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4946
4947 // SHA3-384
4948 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4949 __ eor(v9, __ T8B, v9, v27);
4950 __ eor(v10, __ T8B, v10, v28);
4951 __ eor(v11, __ T8B, v11, v29);
4952 __ eor(v12, __ T8B, v12, v30);
4953
4954 __ BIND(rounds24_loop);
4955 __ subw(rscratch2, rscratch2, 1);
4956
4957 keccak_round(rscratch1);
4958
4959 __ cbnzw(rscratch2, rounds24_loop);
4960
4961 if (multi_block) {
4962 __ add(ofs, ofs, block_size);
4963 __ cmp(ofs, limit);
4964 __ br(Assembler::LE, sha3_loop);
4965 __ mov(c_rarg0, ofs); // return ofs
4966 }
4967
4968 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
4969 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4970 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4971 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4972 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4973 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4974 __ st1(v24, __ T1D, state);
4975
4976 // restore callee-saved registers
4977 __ ldpd(v14, v15, Address(sp, 48));
4978 __ ldpd(v12, v13, Address(sp, 32));
4979 __ ldpd(v10, v11, Address(sp, 16));
4980 __ ldpd(v8, v9, __ post(sp, 64));
4981
4982 __ ret(lr);
4983
4984 // record the stub entry and end
4985 store_archive_data(stub_id, start, __ pc());
4986
4987 return start;
4988 }
4989
4990 // Inputs:
4991 // c_rarg0 - long[] state0
4992 // c_rarg1 - long[] state1
4993 address generate_double_keccak() {
4994 StubId stub_id = StubId::stubgen_double_keccak_id;
4995 int entry_count = StubInfo::entry_count(stub_id);
4996 assert(entry_count == 1, "sanity check");
4997 address start = load_archive_data(stub_id);
4998 if (start != nullptr) {
4999 return start;
5000 }
5001 // Implements the double_keccak() method of the
5002 // sun.security.provider.SHA3Parallel class
5003 __ align(CodeEntryAlignment);
5004 StubCodeMark mark(this, stub_id);
5005 start = __ pc();
5006 __ enter();
5007
5008 Register state0 = c_rarg0;
5009 Register state1 = c_rarg1;
5010
5011 Label rounds24_loop;
5012
5013 // save callee-saved registers
5014 __ stpd(v8, v9, __ pre(sp, -64));
5015 __ stpd(v10, v11, Address(sp, 16));
5016 __ stpd(v12, v13, Address(sp, 32));
5017 __ stpd(v14, v15, Address(sp, 48));
5018
5019 // load states
5020 __ add(rscratch1, state0, 32);
5021 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
5022 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
5023 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
5024 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
5025 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
5026 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
5027 __ ld1(v24, __ D, 0, rscratch1);
5028 __ add(rscratch1, state1, 32);
5029 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
5030 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
5031 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
5032 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
5033 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
5034 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
5035 __ ld1(v24, __ D, 1, rscratch1);
5036
5037 // 24 keccak rounds
5038 __ movw(rscratch2, 24);
5039
5040 // load round_constants base
5041 __ lea(rscratch1, ExternalAddress((address) _double_keccak_round_consts));
5042
5043 __ BIND(rounds24_loop);
5044 __ subw(rscratch2, rscratch2, 1);
5045 keccak_round(rscratch1);
5046 __ cbnzw(rscratch2, rounds24_loop);
5047
5048 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
5049 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
5050 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
5051 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
5052 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
5053 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
5054 __ st1(v24, __ D, 0, state0);
5055 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
5056 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
5057 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
5058 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
5059 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
5060 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
5061 __ st1(v24, __ D, 1, state1);
5062
5063 // restore callee-saved vector registers
5064 __ ldpd(v14, v15, Address(sp, 48));
5065 __ ldpd(v12, v13, Address(sp, 32));
5066 __ ldpd(v10, v11, Address(sp, 16));
5067 __ ldpd(v8, v9, __ post(sp, 64));
5068
5069 __ leave(); // required for proper stackwalking of RuntimeStub frame
5070 __ mov(r0, zr); // return 0
5071 __ ret(lr);
5072
5073 // record the stub entry and end
5074 store_archive_data(stub_id, start, __ pc());
5075
5076 return start;
5077 }
5078
5079 // ChaCha20 block function. This version parallelizes the 32-bit
5080 // state elements on each of 16 vectors, producing 4 blocks of
5081 // keystream at a time.
5082 //
5083 // state (int[16]) = c_rarg0
5084 // keystream (byte[256]) = c_rarg1
5085 // return - number of bytes of keystream produced (always 256)
5086 //
5087 // This implementation takes each 32-bit integer from the state
5088 // array and broadcasts it across all 4 32-bit lanes of a vector register
5089 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
5090 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
5091 // the quarter round schedule is implemented as outlined in RFC 7539 section
5092 // 2.3. However, instead of sequentially processing the 3 quarter round
5093 // operations represented by one QUARTERROUND function, we instead stack all
5094 // the adds, xors and left-rotations from the first 4 quarter rounds together
5095 // and then do the same for the second set of 4 quarter rounds. This removes
5096 // some latency that would otherwise be incurred by waiting for an add to
5097 // complete before performing an xor (which depends on the result of the
5098 // add), etc. An adjustment happens between the first and second groups of 4
5099 // quarter rounds, but this is done only in the inputs to the macro functions
5100 // that generate the assembly instructions - these adjustments themselves are
5101 // not part of the resulting assembly.
5102 // The 4 registers v0-v3 are used during the quarter round operations as
5103 // scratch registers. Once the 20 rounds are complete, these 4 scratch
5104 // registers become the vectors involved in adding the start state back onto
5105 // the post-QR working state. After the adds are complete, each of the 16
5106 // vectors writes its first lane back to the keystream buffer, followed
5107 // by the second lane from all vectors, and so on.
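// As a scalar reference, one quarter round as defined in RFC 7539
// (illustrative sketch only; in this stub each of a/b/c/d names a whole
// 4S vector, so every group of vector instructions below performs four
// such quarter rounds at once):
//
//   static inline uint32_t rotl32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }
//   static void quarter_round(uint32_t st[16], int a, int b, int c, int d) {
//     st[a] += st[b]; st[d] ^= st[a]; st[d] = rotl32(st[d], 16);
//     st[c] += st[d]; st[b] ^= st[c]; st[b] = rotl32(st[b], 12);
//     st[a] += st[b]; st[d] ^= st[a]; st[d] = rotl32(st[d],  8);
//     st[c] += st[d]; st[b] ^= st[c]; st[b] = rotl32(st[b],  7);
//   }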
5108 address generate_chacha20Block_blockpar() {
5109 StubId stub_id = StubId::stubgen_chacha20Block_id;
5110 int entry_count = StubInfo::entry_count(stub_id);
5111 assert(entry_count == 1, "sanity check");
5112 address start = load_archive_data(stub_id);
5113 if (start != nullptr) {
5114 return start;
5115 }
5116 Label L_twoRounds, L_cc20_const;
5117 __ align(CodeEntryAlignment);
5118 StubCodeMark mark(this, stub_id);
5119 start = __ pc();
5120 __ enter();
5121
5122 int i, j;
5123 const Register state = c_rarg0;
5124 const Register keystream = c_rarg1;
5125 const Register loopCtr = r10;
5126 const Register tmpAddr = r11;
5127 const FloatRegister ctrAddOverlay = v28;
5128 const FloatRegister lrot8Tbl = v29;
5129
5130 // Organize SIMD registers in an array that facilitates
5131 // putting repetitive opcodes into loop structures. It is
5132 // important that each grouping of 4 registers is monotonically
5133 // increasing to support the requirements of multi-register
5134 // instructions (e.g. ld4r, st4, etc.)
5135 const FloatRegister workSt[16] = {
5136 v4, v5, v6, v7, v16, v17, v18, v19,
5137 v20, v21, v22, v23, v24, v25, v26, v27
5138 };
5139
5140 // Pull in constant data. The first 16 bytes are the add overlay
5141 // which is applied to the vector holding the counter (state[12]).
5142 // The second 16 bytes is the index register for the 8-bit left
5143 // rotation tbl instruction.
5144 __ adr(tmpAddr, L_cc20_const);
5145 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
5146
5147 // Load from memory and interlace across 16 SIMD registers,
5148 // with each word from memory broadcast to all lanes of
5149 // each successive SIMD register.
5150 // Addr(0) -> All lanes in workSt[i]
5151 // Addr(4) -> All lanes workSt[i + 1], etc.
5152 __ mov(tmpAddr, state);
5153 for (i = 0; i < 16; i += 4) {
5154 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
5155 __ post(tmpAddr, 16));
5156 }
5157 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
5158
5159 // Before entering the loop, create 5 4-register arrays. These
5160 // will hold the 4 registers that represent the a/b/c/d fields
5161 // in the quarter round operation. For instance the "b" field
5162 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
5163 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
5164 // since it is part of a diagonal organization. The aSet and scratch
5165 // register sets are defined at declaration time because they do not change
5166 // organization at any point during the 20-round processing.
5167 FloatRegister aSet[4] = { v4, v5, v6, v7 };
5168 FloatRegister bSet[4];
5169 FloatRegister cSet[4];
5170 FloatRegister dSet[4];
5171 FloatRegister scratch[4] = { v0, v1, v2, v3 };
5172
5173 // Set up the 10 iteration loop and perform all 8 quarter round ops
5174 __ mov(loopCtr, 10);
5175 __ BIND(L_twoRounds);
5176
5177 // Set to columnar organization and do the following 4 quarter-rounds:
5178 // QUARTERROUND(0, 4, 8, 12)
5179 // QUARTERROUND(1, 5, 9, 13)
5180 // QUARTERROUND(2, 6, 10, 14)
5181 // QUARTERROUND(3, 7, 11, 15)
5182 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
5183 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
5184 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
5185
5186 __ cc20_qr_add4(aSet, bSet); // a += b
5187 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5188 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
5189
5190 __ cc20_qr_add4(cSet, dSet); // c += d
5191 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5192 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
5193
5194 __ cc20_qr_add4(aSet, bSet); // a += b
5195 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5196 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
5197
5198 __ cc20_qr_add4(cSet, dSet); // c += d
5199 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5200 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
5201
5202 // Set to diagonal organization and do the next 4 quarter-rounds:
5203 // QUARTERROUND(0, 5, 10, 15)
5204 // QUARTERROUND(1, 6, 11, 12)
5205 // QUARTERROUND(2, 7, 8, 13)
5206 // QUARTERROUND(3, 4, 9, 14)
5207 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
5208 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
5209 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
5210
5211 __ cc20_qr_add4(aSet, bSet); // a += b
5212 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5213 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
5214
5215 __ cc20_qr_add4(cSet, dSet); // c += d
5216 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5217 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
5218
5219 __ cc20_qr_add4(aSet, bSet); // a += b
5220 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5221 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
5222
5223 __ cc20_qr_add4(cSet, dSet); // c += d
5224 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5225 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
5226
5227 // Decrement and iterate
5228 __ sub(loopCtr, loopCtr, 1);
5229 __ cbnz(loopCtr, L_twoRounds);
5230
5231 __ mov(tmpAddr, state);
5232
5233 // Add the starting state back to the post-loop keystream
5234 // state. We read/interlace the state array from memory into
5235 // 4 registers similar to what we did in the beginning. Then
5236 // add the counter overlay onto workSt[12] at the end.
5237 for (i = 0; i < 16; i += 4) {
5238 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
5239 __ addv(workSt[i], __ T4S, workSt[i], v0);
5240 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
5241 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
5242 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
5243 }
5244 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
5245
5246 // Write working state into the keystream buffer. This is accomplished
5247 // by taking the lane "i" from each of the four vectors and writing
5248 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
5249 // repeating with the next 4 vectors until all 16 vectors have been used.
5250 // Then move to the next lane and repeat the process until all lanes have
5251 // been written.
5252 for (i = 0; i < 4; i++) {
5253 for (j = 0; j < 16; j += 4) {
5254 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
5255 __ post(keystream, 16));
5256 }
5257 }
5258
5259 __ mov(r0, 256); // Return length of output keystream
5260 __ leave();
5261 __ ret(lr);
5262
5263 // bind label and generate local constant data used by this stub
5264 // The constant data is broken into two 128-bit segments to be loaded
5265 // onto FloatRegisters. The first 128 bits are a counter add overlay
5266 // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
5267 // The second 128-bits is a table constant used for 8-bit left rotations.
5268 __ BIND(L_cc20_const);
5269 __ emit_int64(0x0000000100000000UL);
5270 __ emit_int64(0x0000000300000002UL);
5271 __ emit_int64(0x0605040702010003UL);
5272 __ emit_int64(0x0E0D0C0F0A09080BUL);
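// n.b. the second 128-bit constant above is the tbl index vector for the
// 8-bit rotation: its first four bytes in memory are 03 00 01 02, so each
// little-endian 32-bit lane [b0 b1 b2 b3] is permuted to [b3 b0 b1 b2],
// i.e. rotl32(x, 8).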
5273
5274 // record the stub entry and end
5275 store_archive_data(stub_id, start, __ pc());
5276
5277 return start;
5278 }
5279
5280 // Helpers to schedule parallel operation bundles across vector
5281 // register sequences of size 2, 4 or 8.
5282
5283 // Implement various primitive computations across vector sequences
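// For example (illustrative only), with the default register delta of 1,
// three sequences declared as VSeq<4> va(16), vb(20), vc(24) name
// v16..v19, v20..v23 and v24..v27, and
//
//   vs_addv(va, __ T8H, vb, vc);
//
// expands to four addv instructions, va[i] = vb[i] + vc[i]. A sequence
// built with an explicit delta of 0 (e.g. VSeq<8>(vq[0], 0), as used for
// the Kyber constants further below) names the same register in every
// position and serves to replicate a constant operand.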
5284
5285 template<int N>
5286 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5287 const VSeq<N>& v1, const VSeq<N>& v2) {
5288 // output must not be constant
5289 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5290 // output cannot overwrite pending inputs
5291 assert(!vs_write_before_read(v, v1), "output overwrites input");
5292 assert(!vs_write_before_read(v, v2), "output overwrites input");
5293 for (int i = 0; i < N; i++) {
5294 __ addv(v[i], T, v1[i], v2[i]);
5295 }
5296 }
5297
5298 template<int N>
5299 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5300 const VSeq<N>& v1, const VSeq<N>& v2) {
5301 // output must not be constant
5302 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5303 // output cannot overwrite pending inputs
5304 assert(!vs_write_before_read(v, v1), "output overwrites input");
5305 assert(!vs_write_before_read(v, v2), "output overwrites input");
5306 for (int i = 0; i < N; i++) {
5307 __ subv(v[i], T, v1[i], v2[i]);
5308 }
5309 }
5310
5311 template<int N>
5312 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5313 const VSeq<N>& v1, const VSeq<N>& v2) {
5314 // output must not be constant
5315 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5316 // output cannot overwrite pending inputs
5317 assert(!vs_write_before_read(v, v1), "output overwrites input");
5318 assert(!vs_write_before_read(v, v2), "output overwrites input");
5319 for (int i = 0; i < N; i++) {
5320 __ mulv(v[i], T, v1[i], v2[i]);
5321 }
5322 }
5323
5324 template<int N>
5325 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
5326 // output must not be constant
5327 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5328 // output cannot overwrite pending inputs
5329 assert(!vs_write_before_read(v, v1), "output overwrites input");
5330 for (int i = 0; i < N; i++) {
5331 __ negr(v[i], T, v1[i]);
5332 }
5333 }
5334
5335 template<int N>
5336 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5337 const VSeq<N>& v1, int shift) {
5338 // output must not be constant
5339 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5340 // output cannot overwrite pending inputs
5341 assert(!vs_write_before_read(v, v1), "output overwrites input");
5342 for (int i = 0; i < N; i++) {
5343 __ sshr(v[i], T, v1[i], shift);
5344 }
5345 }
5346
5347 template<int N>
5348 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5349 // output must not be constant
5350 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5351 // output cannot overwrite pending inputs
5352 assert(!vs_write_before_read(v, v1), "output overwrites input");
5353 assert(!vs_write_before_read(v, v2), "output overwrites input");
5354 for (int i = 0; i < N; i++) {
5355 __ andr(v[i], __ T16B, v1[i], v2[i]);
5356 }
5357 }
5358
5359 template<int N>
5360 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5361 // output must not be constant
5362 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5363 // output cannot overwrite pending inputs
5364 assert(!vs_write_before_read(v, v1), "output overwrites input");
5365 assert(!vs_write_before_read(v, v2), "output overwrites input");
5366 for (int i = 0; i < N; i++) {
5367 __ orr(v[i], __ T16B, v1[i], v2[i]);
5368 }
5369 }
5370
5371 template<int N>
5372 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
5373 // output must not be constant
5374 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5375 // output cannot overwrite pending inputs
5376 assert(!vs_write_before_read(v, v1), "output overwrites input");
5377 for (int i = 0; i < N; i++) {
5378 __ notr(v[i], __ T16B, v1[i]);
5379 }
5380 }
5381
5382 template<int N>
5383 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
5384 // output must not be constant
5385 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5386 // output cannot overwrite pending inputs
5387 assert(!vs_write_before_read(v, v1), "output overwrites input");
5388 assert(!vs_write_before_read(v, v2), "output overwrites input");
5389 for (int i = 0; i < N; i++) {
5390 __ sqdmulh(v[i], T, v1[i], v2[i]);
5391 }
5392 }
5393
5394 template<int N>
5395 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
5396 // output must not be constant
5397 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5398 // output cannot overwrite pending inputs
5399 assert(!vs_write_before_read(v, v1), "output overwrites input");
5400 assert(!vs_write_before_read(v, v2), "output overwrites input");
5401 for (int i = 0; i < N; i++) {
5402 __ mlsv(v[i], T, v1[i], v2[i]);
5403 }
5404 }
5405
5406 // load N/2 successive pairs of quadword values from memory in order
5407 // into N successive vector registers of the sequence via the
5408 // address supplied in base.
5409 template<int N>
5410 void vs_ldpq(const VSeq<N>& v, Register base) {
5411 for (int i = 0; i < N; i += 2) {
5412 __ ldpq(v[i], v[i+1], Address(base, 16 * i));
5413 }
5414 }
5415
5416 // load N/2 successive pairs of quadword values from memory in order
5417 // into N vector registers of the sequence via the address supplied
5418 // in base using post-increment addressing
5419 template<int N>
5420 void vs_ldpq_post(const VSeq<N>& v, Register base) {
5421 static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
5422 for (int i = 0; i < N; i += 2) {
5423 __ ldpq(v[i], v[i+1], __ post(base, 32));
5424 }
5425 }
5426
5427 // store N successive vector registers of the sequence into N/2
5428 // successive pairs of quadword memory locations via the address
5429 // supplied in base using post-increment addressing
5430 template<int N>
5431 void vs_stpq_post(const VSeq<N>& v, Register base) {
5432 static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
5433 for (int i = 0; i < N; i += 2) {
5434 __ stpq(v[i], v[i+1], __ post(base, 32));
5435 }
5436 }
5437
5438 // load N/2 pairs of quadword values from memory de-interleaved into
5439 // N vector registers 2 at a time via the address supplied in base
5440 // using post-increment addressing.
5441 template<int N>
5442 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5443 static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
5444 for (int i = 0; i < N; i += 2) {
5445 __ ld2(v[i], v[i+1], T, __ post(base, 32));
5446 }
5447 }
5448
5449 // store N vector registers interleaved into N/2 pairs of quadword
5450 // memory locations via the address supplied in base using
5451 // post-increment addressing.
5452 template<int N>
5453 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5454 static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
5455 for (int i = 0; i < N; i += 2) {
5456 __ st2(v[i], v[i+1], T, __ post(base, 32));
5457 }
5458 }
5459
5460 // load N quadword values from memory de-interleaved into N vector
5461 // registers 3 elements at a time via the address supplied in base.
5462 template<int N>
5463 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5464 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
5465 for (int i = 0; i < N; i += 3) {
5466 __ ld3(v[i], v[i+1], v[i+2], T, base);
5467 }
5468 }
5469
5470 // load N quadword values from memory de-interleaved into N vector
5471 // registers 3 elements at a time via the address supplied in base
5472 // using post-increment addressing.
5473 template<int N>
5474 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5475 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
5476 for (int i = 0; i < N; i += 3) {
5477 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
5478 }
5479 }
5480
5481 // load N/2 pairs of quadword values from memory into N vector
5482 // registers via the address supplied in base with each pair indexed
5483 // using the start offset plus the corresponding entry in the
5484 // offsets array
5485 template<int N>
5486 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
5487 for (int i = 0; i < N/2; i++) {
5488 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5489 }
5490 }
5491
5492 // store N vector registers into N/2 pairs of quadword memory
5493 // locations via the address supplied in base with each pair indexed
5494 // using the start offset plus the corresponding entry in the
5495 // offsets array
5496 template<int N>
5497 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
5498 for (int i = 0; i < N/2; i++) {
5499 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5500 }
5501 }
5502
5503 // load N single quadword values from memory into N vector registers
5504 // via the address supplied in base with each value indexed using
5505 // the start offset plus the corresponding entry in the offsets
5506 // array
5507 template<int N>
5508 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5509 int start, int (&offsets)[N]) {
5510 for (int i = 0; i < N; i++) {
5511 __ ldr(v[i], T, Address(base, start + offsets[i]));
5512 }
5513 }
5514
5515 // store N vector registers into N single quadword memory locations
5516 // via the address supplied in base with each value indexed using
5517 // the start offset plus the corresponding entry in the offsets
5518 // array
5519 template<int N>
5520 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5521 int start, int (&offsets)[N]) {
5522 for (int i = 0; i < N; i++) {
5523 __ str(v[i], T, Address(base, start + offsets[i]));
5524 }
5525 }
5526
5527 // load N/2 pairs of quadword values from memory de-interleaved into
5528 // N vector registers 2 at a time via the address supplied in base
5529 // with each pair indexed using the start offset plus the
5530 // corresponding entry in the offsets array
5531 template<int N>
5532 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5533 Register tmp, int start, int (&offsets)[N/2]) {
5534 for (int i = 0; i < N/2; i++) {
5535 __ add(tmp, base, start + offsets[i]);
5536 __ ld2(v[2*i], v[2*i+1], T, tmp);
5537 }
5538 }
5539
5540 // store N vector registers 2 at a time interleaved into N/2 pairs
5541 // of quadword memory locations via the address supplied in base
5542 // with each pair indexed using the start offset plus the
5543 // corresponding entry in the offsets array
5544 template<int N>
5545 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5546 Register tmp, int start, int (&offsets)[N/2]) {
5547 for (int i = 0; i < N/2; i++) {
5548 __ add(tmp, base, start + offsets[i]);
5549 __ st2(v[2*i], v[2*i+1], T, tmp);
5550 }
5551 }
5552
5553 // Helper routines for various flavours of Montgomery multiply
5554
5555 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
5556 // multiplications in parallel
5557 //
5558
5559 // See the montMul() method of the sun.security.provider.ML_DSA
5560 // class.
5561 //
5562 // Computes 4x4S results or 8x8H results
5563 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5564 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5565 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5566 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5567 // Outputs: va - 4x4S or 4x8H vector register sequences
5568 // vb, vc, vtmp and vq must all be disjoint
5569 // va must be disjoint from all other inputs/temps or must equal vc
5570 // va must have a non-zero delta i.e. it must not be a constant vseq.
5571 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
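// As a scalar reference for the instruction sequence scheduled below, in
// the 16-bit (8H) case used by the Kyber code with R = 2^16 (illustrative
// sketch only; q and qinv stand for the modulus and its inverse mod R as
// loaded into vq):
//
//   static int16_t montmul16(int16_t b, int16_t c, int16_t q, int16_t qinv) {
//     int16_t a_hi = (int16_t)(((int32_t)2 * b * c) >> 16);   // sqdmulh
//     int16_t a_lo = (int16_t)(b * c);                        // mulv
//     int16_t m    = (int16_t)(a_lo * qinv);                  // mulv
//     int16_t n    = (int16_t)(((int32_t)2 * m * q) >> 16);   // sqdmulh
//     return (int16_t)((a_hi - n) >> 1);                      // shsubv
//   }
//
// The halving subtract at the end yields (b * c - m * q) / 2^16, i.e. the
// Montgomery reduction of the product, up to the rounding of the final
// signed halving.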
5572 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5573 Assembler::SIMD_Arrangement T,
5574 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5575 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5576 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5577 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5578 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5579
5580 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5581 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5582
5583 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5584
5585 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5586 assert(vs_disjoint(va, vb), "va and vb overlap");
5587 assert(vs_disjoint(va, vq), "va and vq overlap");
5588 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5589 assert(!va.is_constant(), "output vector must identify 4 different registers");
5590
5591 // schedule 4 streams of instructions across the vector sequences
5592 for (int i = 0; i < 4; i++) {
5593 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5594 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5595 }
5596
5597 for (int i = 0; i < 4; i++) {
5598 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5599 }
5600
5601 for (int i = 0; i < 4; i++) {
5602 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5603 }
5604
5605 for (int i = 0; i < 4; i++) {
5606 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5607 }
5608 }
5609
5610 // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
5611 // multiplications in parallel
5612 //
5613 
5614 // See the montMul() method of the sun.security.provider.ML_DSA
5615 // class.
5616 //
5617 // Computes 2x4S results or 2x8H results
5618 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5619 // Inputs: vb, vc - 2x4S or 2x8H vector register sequences
5620 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5621 // Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
5622 // Outputs: va - 2x4S or 2x8H vector register sequences
5623 // vb, vc, vtmp and vq must all be disjoint
5624 // va must be disjoint from all other inputs/temps or must equal vc
5625 // va must have a non-zero delta i.e. it must not be a constant vseq.
5626 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5627 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5628 Assembler::SIMD_Arrangement T,
5629 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5630 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5631 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5632 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5633 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5634
5635 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5636 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5637
5638 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5639
5640 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5641 assert(vs_disjoint(va, vb), "va and vb overlap");
5642 assert(vs_disjoint(va, vq), "va and vq overlap");
5643 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5644 assert(!va.is_constant(), "output vector must identify 2 different registers");
5645
5646 // schedule 2 streams of instructions across the vector sequences
5647 for (int i = 0; i < 2; i++) {
5648 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5649 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5650 }
5651
5652 for (int i = 0; i < 2; i++) {
5653 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5654 }
5655
5656 for (int i = 0; i < 2; i++) {
5657 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5658 }
5659
5660 for (int i = 0; i < 2; i++) {
5661 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5662 }
5663 }
5664
5665 // Perform 16 16-bit Montgomery multiplications in parallel.
5666 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5667 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5668 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5669 // It will assert that the register use is valid
5670 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5671 }
5672
5673 // Perform 32 16-bit Montgomery multiplications in parallel.
5674 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5675 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5676 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5677 // It will assert that the register use is valid
5678 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5679 }
5680
5681 // Perform 64 16-bit Montgomery multiplications in parallel.
5682 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
5683 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5684 // Schedule two successive 4x8H multiplies via the montmul helper
5685 // on the front and back halves of va, vb and vc. The helper will
5686 // assert that the register use has no overlap conflicts on each
5687 // individual call but we also need to ensure that the necessary
5688 // disjoint/equality constraints are met across both calls.
5689
5690 // vb, vc, vtmp and vq must be disjoint. va must either be
5691 // disjoint from all other registers or equal vc
5692
5693 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5694 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5695 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5696
5697 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5698 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5699
5700 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5701
5702 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5703 assert(vs_disjoint(va, vb), "va and vb overlap");
5704 assert(vs_disjoint(va, vq), "va and vq overlap");
5705 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5706
5707 // we multiply the front and back halves of each sequence 4 at a
5708 // time because
5709 //
5710 // 1) we are currently only able to get 4-way instruction
5711 // parallelism at best
5712 //
5713 // 2) we need registers for the constants in vq and temporary
5714 // scratch registers to hold intermediate results so vtmp can only
5715 // be a VSeq<4> which means we only have 4 scratch slots
5716
5717 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5718 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5719 }
5720
5721 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5722 const VSeq<4>& vc,
5723 const VSeq<4>& vtmp,
5724 const VSeq<2>& vq) {
5725 // compute a = montmul(a1, c)
5726 kyber_montmul32(vc, va1, vc, vtmp, vq);
5727 // output a1 = a0 - a
5728 vs_subv(va1, __ T8H, va0, vc);
5729 // and a0 = a0 + a
5730 vs_addv(va0, __ T8H, va0, vc);
5731 }
5732
5733 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5734 const VSeq<4>& vb,
5735 const VSeq<4>& vtmp1,
5736 const VSeq<4>& vtmp2,
5737 const VSeq<2>& vq) {
5738 // compute c = a0 - a1
5739 vs_subv(vtmp1, __ T8H, va0, va1);
5740 // output a0 = a0 + a1
5741 vs_addv(va0, __ T8H, va0, va1);
5742 // output a1 = b montmul c
5743 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5744 }
5745
5746 void load64shorts(const VSeq<8>& v, Register shorts) {
5747 vs_ldpq_post(v, shorts);
5748 }
5749
5750 void load32shorts(const VSeq<4>& v, Register shorts) {
5751 vs_ldpq_post(v, shorts);
5752 }
5753
5754 void store64shorts(const VSeq<8>& v, Register tmpAddr) {
5755 vs_stpq_post(v, tmpAddr);
5756 }
5757
5758 // Kyber NTT function.
5759 // Implements
5760 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5761 //
5762 // coeffs (short[256]) = c_rarg0
5763 // ntt_zetas (short[256]) = c_rarg1
5764 address generate_kyberNtt() {
5765 StubId stub_id = StubId::stubgen_kyberNtt_id;
5766 int entry_count = StubInfo::entry_count(stub_id);
5767 assert(entry_count == 1, "sanity check");
5768 address start = load_archive_data(stub_id);
5769 if (start != nullptr) {
5770 return start;
5771 }
5772 __ align(CodeEntryAlignment);
5773 StubCodeMark mark(this, stub_id);
5774 start = __ pc();
5775 __ enter();
5776
5777 const Register coeffs = c_rarg0;
5778 const Register zetas = c_rarg1;
5779
5780 const Register kyberConsts = r10;
5781 const Register tmpAddr = r11;
5782
5783 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5784 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5785 VSeq<2> vq(30); // n.b. constants overlap vs3
5786
5787 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5788 // load the montmul constants
5789 vs_ldpq(vq, kyberConsts);
5790
5791 // Each level corresponds to an iteration of the outermost loop of the
5792 // Java method seilerNTT(int[] coeffs). There are some differences
5793 // from what is done in the seilerNTT() method, though:
5794 // 1. The computation uses 16-bit signed values; we do not convert them
5795 // to ints here.
5796 // 2. The zetas are delivered in a bigger array: 128 zetas are stored in
5797 // this array for each level, which makes it easier to fill up the vector
5798 // registers.
5799 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5800 // multiplications (this is because that way there should not be any
5801 // overflow during the inverse NTT computation); here we use R = 2^16 so
5802 // that we can use 16-bit arithmetic in the vector unit.
5803 //
5804 // On each level, we fill up the vector registers in such a way that the
5805 // array elements that need to be multiplied by the zetas go into one
5806 // set of vector registers while the corresponding ones that don't need to
5807 // be multiplied, go into another set.
5808 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5809 // registers interleaving the steps of 4 identical computations,
5810 // each done on 8 16-bit values per register.
5811
5812 // At levels 0-3 the coefficients multiplied by or added/subtracted
5813 // to the zetas occur in discrete blocks whose size is some multiple
5814 // of 32.
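// As a scalar sketch, each level applies the following butterfly to pairs
// of coefficients lying `l` elements apart, where `l` halves from 128 down
// to 2 as the levels proceed (illustrative only; montmul16 is the 16-bit
// Montgomery multiply sketched above the montmul helpers):
//
//   int16_t t     = montmul16(zeta, coeffs[j + l], q, qinv);
//   coeffs[j + l] = (int16_t)(coeffs[j] - t);
//   coeffs[j]     = (int16_t)(coeffs[j] + t);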
5815
5816 // level 0
5817 __ add(tmpAddr, coeffs, 256);
5818 load64shorts(vs1, tmpAddr);
5819 load64shorts(vs2, zetas);
5820 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5821 __ add(tmpAddr, coeffs, 0);
5822 load64shorts(vs1, tmpAddr);
5823 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5824 vs_addv(vs1, __ T8H, vs1, vs2);
5825 __ add(tmpAddr, coeffs, 0);
5826 vs_stpq_post(vs1, tmpAddr);
5827 __ add(tmpAddr, coeffs, 256);
5828 vs_stpq_post(vs3, tmpAddr);
5829 // restore montmul constants
5830 vs_ldpq(vq, kyberConsts);
5831 load64shorts(vs1, tmpAddr);
5832 load64shorts(vs2, zetas);
5833 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5834 __ add(tmpAddr, coeffs, 128);
5835 load64shorts(vs1, tmpAddr);
5836 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5837 vs_addv(vs1, __ T8H, vs1, vs2);
5838 __ add(tmpAddr, coeffs, 128);
5839 store64shorts(vs1, tmpAddr);
5840 __ add(tmpAddr, coeffs, 384);
5841 store64shorts(vs3, tmpAddr);
5842
5843 // level 1
5844 // restore montmul constants
5845 vs_ldpq(vq, kyberConsts);
5846 __ add(tmpAddr, coeffs, 128);
5847 load64shorts(vs1, tmpAddr);
5848 load64shorts(vs2, zetas);
5849 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5850 __ add(tmpAddr, coeffs, 0);
5851 load64shorts(vs1, tmpAddr);
5852 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5853 vs_addv(vs1, __ T8H, vs1, vs2);
5854 __ add(tmpAddr, coeffs, 0);
5855 store64shorts(vs1, tmpAddr);
5856 store64shorts(vs3, tmpAddr);
5857 vs_ldpq(vq, kyberConsts);
5858 __ add(tmpAddr, coeffs, 384);
5859 load64shorts(vs1, tmpAddr);
5860 load64shorts(vs2, zetas);
5861 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5862 __ add(tmpAddr, coeffs, 256);
5863 load64shorts(vs1, tmpAddr);
5864 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5865 vs_addv(vs1, __ T8H, vs1, vs2);
5866 __ add(tmpAddr, coeffs, 256);
5867 store64shorts(vs1, tmpAddr);
5868 store64shorts(vs3, tmpAddr);
5869
5870 // level 2
5871 vs_ldpq(vq, kyberConsts);
5872 int offsets1[4] = { 0, 32, 128, 160 };
5873 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5874 load64shorts(vs2, zetas);
5875 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5876 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5877 // kyber_subv_addv64();
5878 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5879 vs_addv(vs1, __ T8H, vs1, vs2);
5880 __ add(tmpAddr, coeffs, 0);
5881 vs_stpq_post(vs_front(vs1), tmpAddr);
5882 vs_stpq_post(vs_front(vs3), tmpAddr);
5883 vs_stpq_post(vs_back(vs1), tmpAddr);
5884 vs_stpq_post(vs_back(vs3), tmpAddr);
5885 vs_ldpq(vq, kyberConsts);
5886 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5887 load64shorts(vs2, zetas);
5888 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5889 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5890 // kyber_subv_addv64();
5891 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5892 vs_addv(vs1, __ T8H, vs1, vs2);
5893 __ add(tmpAddr, coeffs, 256);
5894 vs_stpq_post(vs_front(vs1), tmpAddr);
5895 vs_stpq_post(vs_front(vs3), tmpAddr);
5896 vs_stpq_post(vs_back(vs1), tmpAddr);
5897 vs_stpq_post(vs_back(vs3), tmpAddr);
5898
5899 // level 3
5900 vs_ldpq(vq, kyberConsts);
5901 int offsets2[4] = { 0, 64, 128, 192 };
5902 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5903 load64shorts(vs2, zetas);
5904 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5905 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5906 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5907 vs_addv(vs1, __ T8H, vs1, vs2);
5908 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5909 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5910
5911 vs_ldpq(vq, kyberConsts);
5912 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5913 load64shorts(vs2, zetas);
5914 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5915 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5916 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5917 vs_addv(vs1, __ T8H, vs1, vs2);
5918 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5919 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5920
5921 // level 4
5922 // At level 4 coefficients occur in 8 discrete blocks of size 16
5923 // so they are loaded using an ldr at 8 distinct offsets.
5924
5925 vs_ldpq(vq, kyberConsts);
5926 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5927 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5928 load64shorts(vs2, zetas);
5929 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5930 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5931 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5932 vs_addv(vs1, __ T8H, vs1, vs2);
5933 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5934 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5935
5936 vs_ldpq(vq, kyberConsts);
5937 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5938 load64shorts(vs2, zetas);
5939 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5940 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5941 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5942 vs_addv(vs1, __ T8H, vs1, vs2);
5943 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5944 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5945
5946 // level 5
5947 // At level 5 related coefficients occur in discrete blocks of size 8 so
5948 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
5949
5950 vs_ldpq(vq, kyberConsts);
5951 int offsets4[4] = { 0, 32, 64, 96 };
5952 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5953 load32shorts(vs_front(vs2), zetas);
5954 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5955 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5956 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5957 load32shorts(vs_front(vs2), zetas);
5958 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5959 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5960 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5961 load32shorts(vs_front(vs2), zetas);
5962 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5963 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5964
5965 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5966 load32shorts(vs_front(vs2), zetas);
5967 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5968 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5969
5970 // level 6
5971 // At level 6 related coefficients occur in discrete blocks of size 4 so
5972 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
5973
5974 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5975 load32shorts(vs_front(vs2), zetas);
5976 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5977 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5978 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5979 // __ ldpq(v18, v19, __ post(zetas, 32));
5980 load32shorts(vs_front(vs2), zetas);
5981 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5982 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5983
5984 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5985 load32shorts(vs_front(vs2), zetas);
5986 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5987 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5988
5989 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5990 load32shorts(vs_front(vs2), zetas);
5991 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5992 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5993
5994 __ leave(); // required for proper stackwalking of RuntimeStub frame
5995 __ mov(r0, zr); // return 0
5996 __ ret(lr);
5997
5998 // record the stub entry and end
5999 store_archive_data(stub_id, start, __ pc());
6000
6001 return start;
6002 }
6003
6004 // Kyber Inverse NTT function
6005 // Implements
6006 // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
6007 //
6008 // coeffs (short[256]) = c_rarg0
6009 // ntt_zetas (short[256]) = c_rarg1
6010 address generate_kyberInverseNtt() {
6011 StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
6012 int entry_count = StubInfo::entry_count(stub_id);
6013 assert(entry_count == 1, "sanity check");
6014 address start = load_archive_data(stub_id);
6015 if (start != nullptr) {
6016 return start;
6017 }
6018 __ align(CodeEntryAlignment);
6019 StubCodeMark mark(this, stub_id);
6020 start = __ pc();
6021 __ enter();
6022
6023 const Register coeffs = c_rarg0;
6024 const Register zetas = c_rarg1;
6025
6026 const Register kyberConsts = r10;
6027 const Register tmpAddr = r11;
6028 const Register tmpAddr2 = c_rarg2;
6029
6030 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
6031 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6032 VSeq<2> vq(30); // n.b. constants overlap vs3
6033
6034 __ lea(kyberConsts,
6035 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6036
6037 // level 0
6038 // At level 0 related coefficients occur in discrete blocks of size 4 so
6039 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
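// As a scalar sketch, the inverse levels apply the Gentleman-Sande style
// butterfly below to coefficient pairs `l` elements apart, with `l`
// doubling level by level (illustrative only; montmul16 is the 16-bit
// Montgomery multiply sketched before the forward NTT):
//
//   int16_t t     = coeffs[j];
//   coeffs[j]     = (int16_t)(t + coeffs[j + l]);
//   coeffs[j + l] = montmul16((int16_t)(t - coeffs[j + l]), zeta, q, qinv);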
6040
6041 vs_ldpq(vq, kyberConsts);
6042 int offsets4[4] = { 0, 32, 64, 96 };
6043 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6044 load32shorts(vs_front(vs2), zetas);
6045 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6046 vs_front(vs2), vs_back(vs2), vtmp, vq);
6047 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6048 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6049 load32shorts(vs_front(vs2), zetas);
6050 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6051 vs_front(vs2), vs_back(vs2), vtmp, vq);
6052 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6053 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6054 load32shorts(vs_front(vs2), zetas);
6055 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6056 vs_front(vs2), vs_back(vs2), vtmp, vq);
6057 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6058 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6059 load32shorts(vs_front(vs2), zetas);
6060 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6061 vs_front(vs2), vs_back(vs2), vtmp, vq);
6062 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6063
6064 // level 1
6065 // At level 1 related coefficients occur in discrete blocks of size 8 so
6066 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
6067
6068 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6069 load32shorts(vs_front(vs2), zetas);
6070 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6071 vs_front(vs2), vs_back(vs2), vtmp, vq);
6072 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6073 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6074 load32shorts(vs_front(vs2), zetas);
6075 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6076 vs_front(vs2), vs_back(vs2), vtmp, vq);
6077 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6078
6079 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6080 load32shorts(vs_front(vs2), zetas);
6081 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6082 vs_front(vs2), vs_back(vs2), vtmp, vq);
6083 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6084 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6085 load32shorts(vs_front(vs2), zetas);
6086 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6087 vs_front(vs2), vs_back(vs2), vtmp, vq);
6088 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6089
6090 // level 2
6091 // At level 2 coefficients occur in 8 discrete blocks of size 16
6092 // so they are loaded using an ldr at 8 distinct offsets.
6093
6094 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6095 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
6096 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
6097 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6098 vs_subv(vs1, __ T8H, vs1, vs2);
6099 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
6100 load64shorts(vs2, zetas);
6101 vs_ldpq(vq, kyberConsts);
6102 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6103 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
6104
6105 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
6106 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
6107 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6108 vs_subv(vs1, __ T8H, vs1, vs2);
6109 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
6110 load64shorts(vs2, zetas);
6111 vs_ldpq(vq, kyberConsts);
6112 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6113 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
6114
6115 // Barrett reduction at indexes where overflow may happen
6116
6117 // load q and the multiplier for the Barrett reduction
6118 __ add(tmpAddr, kyberConsts, 16);
6119 vs_ldpq(vq, tmpAddr);
6120
6121 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
6122 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
6123 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
6124 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
6125 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6126 vs_sshr(vs2, __ T8H, vs2, 11);
6127 vs_mlsv(vs1, __ T8H, vs2, vq1);
6128 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
6129 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
6130 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6131 vs_sshr(vs2, __ T8H, vs2, 11);
6132 vs_mlsv(vs1, __ T8H, vs2, vq1);
6133 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
6134
6135 // level 3
6136 // From level 3 upwards coefficients occur in discrete blocks whose size is
6137 // some multiple of 32, so they can be loaded using ldpq and suitable indexes.
6138
6139 int offsets2[4] = { 0, 64, 128, 192 };
6140 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
6141 vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
6142 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6143 vs_subv(vs1, __ T8H, vs1, vs2);
6144 vs_stpq_indexed(vs3, coeffs, 0, offsets2);
6145 load64shorts(vs2, zetas);
6146 vs_ldpq(vq, kyberConsts);
6147 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6148 vs_stpq_indexed(vs2, coeffs, 32, offsets2);
6149
6150 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
6151 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
6152 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6153 vs_subv(vs1, __ T8H, vs1, vs2);
6154 vs_stpq_indexed(vs3, coeffs, 256, offsets2);
6155 load64shorts(vs2, zetas);
6156 vs_ldpq(vq, kyberConsts);
6157 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6158 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
6159
6160 // level 4
6161
6162 int offsets1[4] = { 0, 32, 128, 160 };
6163 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
6164 vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
6165 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6166 vs_subv(vs1, __ T8H, vs1, vs2);
6167 vs_stpq_indexed(vs3, coeffs, 0, offsets1);
6168 load64shorts(vs2, zetas);
6169 vs_ldpq(vq, kyberConsts);
6170 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6171 vs_stpq_indexed(vs2, coeffs, 64, offsets1);
6172
6173 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
6174 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
6175 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6176 vs_subv(vs1, __ T8H, vs1, vs2);
6177 vs_stpq_indexed(vs3, coeffs, 256, offsets1);
6178 load64shorts(vs2, zetas);
6179 vs_ldpq(vq, kyberConsts);
6180 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6181 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
6182
6183 // level 5
6184
6185 __ add(tmpAddr, coeffs, 0);
6186 load64shorts(vs1, tmpAddr);
6187 __ add(tmpAddr, coeffs, 128);
6188 load64shorts(vs2, tmpAddr);
6189 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6190 vs_subv(vs1, __ T8H, vs1, vs2);
6191 __ add(tmpAddr, coeffs, 0);
6192 store64shorts(vs3, tmpAddr);
6193 load64shorts(vs2, zetas);
6194 vs_ldpq(vq, kyberConsts);
6195 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6196 __ add(tmpAddr, coeffs, 128);
6197 store64shorts(vs2, tmpAddr);
6198
6199 load64shorts(vs1, tmpAddr);
6200 __ add(tmpAddr, coeffs, 384);
6201 load64shorts(vs2, tmpAddr);
6202 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6203 vs_subv(vs1, __ T8H, vs1, vs2);
6204 __ add(tmpAddr, coeffs, 256);
6205 store64shorts(vs3, tmpAddr);
6206 load64shorts(vs2, zetas);
6207 vs_ldpq(vq, kyberConsts);
6208 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6209 __ add(tmpAddr, coeffs, 384);
6210 store64shorts(vs2, tmpAddr);
6211
6212 // Barrett reduction at indexes where overflow may happen
6213
6214 // load q and the multiplier for the Barrett reduction
6215 __ add(tmpAddr, kyberConsts, 16);
6216 vs_ldpq(vq, tmpAddr);
6217
6218 int offsets0[2] = { 0, 256 };
6219 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
6220 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6221 vs_sshr(vs2, __ T8H, vs2, 11);
6222 vs_mlsv(vs1, __ T8H, vs2, vq1);
6223 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
6224
6225 // level 6
6226
6227 __ add(tmpAddr, coeffs, 0);
6228 load64shorts(vs1, tmpAddr);
6229 __ add(tmpAddr, coeffs, 256);
6230 load64shorts(vs2, tmpAddr);
6231 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6232 vs_subv(vs1, __ T8H, vs1, vs2);
6233 __ add(tmpAddr, coeffs, 0);
6234 store64shorts(vs3, tmpAddr);
6235 load64shorts(vs2, zetas);
6236 vs_ldpq(vq, kyberConsts);
6237 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6238 __ add(tmpAddr, coeffs, 256);
6239 store64shorts(vs2, tmpAddr);
6240
6241 __ add(tmpAddr, coeffs, 128);
6242 load64shorts(vs1, tmpAddr);
6243 __ add(tmpAddr, coeffs, 384);
6244 load64shorts(vs2, tmpAddr);
6245 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6246 vs_subv(vs1, __ T8H, vs1, vs2);
6247 __ add(tmpAddr, coeffs, 128);
6248 store64shorts(vs3, tmpAddr);
6249 load64shorts(vs2, zetas);
6250 vs_ldpq(vq, kyberConsts);
6251 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6252 __ add(tmpAddr, coeffs, 384);
6253 store64shorts(vs2, tmpAddr);
6254
6255 // multiply by 2^-n
6256
6257 // load toMont(2^-n mod q)
6258 __ add(tmpAddr, kyberConsts, 48);
6259 __ ldr(v29, __ Q, tmpAddr);
6260
6261 vs_ldpq(vq, kyberConsts);
6262 __ add(tmpAddr, coeffs, 0);
6263 load64shorts(vs1, tmpAddr);
6264 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6265 __ add(tmpAddr, coeffs, 0);
6266 store64shorts(vs2, tmpAddr);
6267
6268 // now tmpAddr contains coeffs + 128 because store64shorts advanced it
6269 load64shorts(vs1, tmpAddr);
6270 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6271 __ add(tmpAddr, coeffs, 128);
6272 store64shorts(vs2, tmpAddr);
6273
6274 // now tmpAddr contains coeffs + 256
6275 load64shorts(vs1, tmpAddr);
6276 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6277 __ add(tmpAddr, coeffs, 256);
6278 store64shorts(vs2, tmpAddr);
6279
6280 // now tmpAddr contains coeffs + 384
6281 load64shorts(vs1, tmpAddr);
6282 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6283 __ add(tmpAddr, coeffs, 384);
6284 store64shorts(vs2, tmpAddr);
6285
6286 __ leave(); // required for proper stackwalking of RuntimeStub frame
6287 __ mov(r0, zr); // return 0
6288 __ ret(lr);
6289
6290 // record the stub entry and end
6291 store_archive_data(stub_id, start, __ pc());
6292
6293 return start;
6294 }
6295
6296 // Kyber multiply polynomials in the NTT domain.
6297 // Implements
6298 // static int implKyberNttMult(
6299 // short[] result, short[] ntta, short[] nttb, short[] zetas) {}
6300 //
6301 // result (short[256]) = c_rarg0
6302 // ntta (short[256]) = c_rarg1
6303 // nttb (short[256]) = c_rarg2
6304 // zetas (short[128]) = c_rarg3
6305 address generate_kyberNttMult() {
6306 StubId stub_id = StubId::stubgen_kyberNttMult_id;
6307 int entry_count = StubInfo::entry_count(stub_id);
6308 assert(entry_count == 1, "sanity check");
6309 address start = load_archive_data(stub_id);
6310 if (start != nullptr) {
6311 return start;
6312 }
6313 __ align(CodeEntryAlignment);
6314 StubCodeMark mark(this, stub_id);
6315 start = __ pc();
6316 __ enter();
6317
6318 const Register result = c_rarg0;
6319 const Register ntta = c_rarg1;
6320 const Register nttb = c_rarg2;
6321 const Register zetas = c_rarg3;
6322
6323 const Register kyberConsts = r10;
6324 const Register limit = r11;
6325
6326 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
6327 VSeq<4> vs3(16), vs4(20);
6328 VSeq<2> vq(30); // pair of constants for montmul: q, qinv
6329 VSeq<2> vz(28); // pair of zetas
6330 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
6331
6332 __ lea(kyberConsts,
6333 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6334
6335 Label kyberNttMult_loop;
6336
6337 __ add(limit, result, 512);
6338
6339 // load q and qinv
6340 vs_ldpq(vq, kyberConsts);
6341
6342 // load R^2 mod q (to convert back from Montgomery representation)
6343 __ add(kyberConsts, kyberConsts, 64);
6344 __ ldr(v27, __ Q, kyberConsts);
6345
6346 __ BIND(kyberNttMult_loop);
6347
6348 // load 16 zetas
6349 vs_ldpq_post(vz, zetas);
6350
6351 // load 2 sets of 32 coefficients from the two input arrays
6352 // interleaved as shorts. i.e. pairs of shorts adjacent in memory
6353 // are striped across pairs of vector registers
6354 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
6355 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
6356 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
6357 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
6358
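// For reference, each iteration multiplies adjacent coefficient pairs
// modulo (X^2 - zeta): treating adjacent shorts as (a0 + a1*X) and
// (b0 + b1*X), the product is
//   (a0*b0 + a1*b1*zeta) + (a0*b1 + a1*b0)*X   (mod q)
// with every product computed as a Montgomery multiply. The steps below
// build the four cross-products, fold in the zetas and finally apply the
// montRSquareModQ correction.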
6359 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
6360 // i.e. montmul the first and second halves of vs1 in order and
6361 // then with one sequence reversed storing the two results in vs3
6362 //
6363 // vs3[0] <- montmul(a0, b0)
6364 // vs3[1] <- montmul(a1, b1)
6365 // vs3[2] <- montmul(a0, b1)
6366 // vs3[3] <- montmul(a1, b0)
6367 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
6368 kyber_montmul16(vs_back(vs3),
6369 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
6370
6371 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
6372 // i.e. montmul the first and second halves of vs4 in order and
6373 // then with one sequence reversed storing the two results in vs1
6374 //
6375 // vs1[0] <- montmul(a2, b2)
6376 // vs1[1] <- montmul(a3, b3)
6377 // vs1[2] <- montmul(a2, b3)
6378 // vs1[3] <- montmul(a3, b2)
6379 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
6380 kyber_montmul16(vs_back(vs1),
6381 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
6382
6383 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
6384 // We can schedule two montmuls at a time if we use a suitable vector
6385 // sequence <vs3[1], vs1[1]>.
6386 int delta = vs1[1]->encoding() - vs3[1]->encoding();
6387 VSeq<2> vs5(vs3[1], delta);
6388
6389 // vs3[1] <- montmul(montmul(a1, b1), z0)
6390 // vs1[1] <- montmul(montmul(a3, b3), z1)
6391 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
6392
6393 // add results in pairs storing in vs3
6394 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
6395 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
6396 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
6397
6398 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
6399 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
6400 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
6401
6402 // vs1 <- montmul(vs3, montRSquareModQ)
6403 kyber_montmul32(vs1, vs3, vc, vs2, vq);
6404
6405 // store back the two pairs of result vectors de-interleaved as 8H elements
6406 // i.e. storing each pair of shorts striped across a register pair adjacent
6407 // in memory
6408 vs_st2_post(vs1, __ T8H, result);
6409
6410 __ cmp(result, limit);
6411 __ br(Assembler::NE, kyberNttMult_loop);
6412
6413 __ leave(); // required for proper stackwalking of RuntimeStub frame
6414 __ mov(r0, zr); // return 0
6415 __ ret(lr);
6416
6417 // record the stub entry and end
6418 store_archive_data(stub_id, start, __ pc());
6419
6420 return start;
6421 }
6422
6423 // Kyber add 2 polynomials.
6424 // Implements
6425 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
6426 //
6427 // result (short[256]) = c_rarg0
6428 // a (short[256]) = c_rarg1
6429 // b (short[256]) = c_rarg2
6430 address generate_kyberAddPoly_2() {
6431 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
6432 int entry_count = StubInfo::entry_count(stub_id);
6433 assert(entry_count == 1, "sanity check");
6434 address start = load_archive_data(stub_id);
6435 if (start != nullptr) {
6436 return start;
6437 }
6438 __ align(CodeEntryAlignment);
6439 StubCodeMark mark(this, stub_id);
6440 start = __ pc();
6441 __ enter();
6442
6443 const Register result = c_rarg0;
6444 const Register a = c_rarg1;
6445 const Register b = c_rarg2;
6446
6447 const Register kyberConsts = r11;
6448
6449 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
6450 // So, we can load, add and store the data in 3 groups of 11,
6451 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
6452 // registers. A further constraint is that the mapping needs
6453 // to skip callee saves. So, we allocate the register
6454 // sequences using two 8 sequences, two 2 sequences and two
6455 // single registers.
6456 VSeq<8> vs1_1(0);
6457 VSeq<2> vs1_2(16);
6458 FloatRegister vs1_3 = v28;
6459 VSeq<8> vs2_1(18);
6460 VSeq<2> vs2_2(26);
6461 FloatRegister vs2_3 = v29;
6462
6463 // two constant vector sequences
6464 VSeq<8> vc_1(31, 0);
6465 VSeq<2> vc_2(31, 0);
6466
6467 FloatRegister vc_3 = v31;
6468 __ lea(kyberConsts,
6469 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6470
6471 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6472 for (int i = 0; i < 3; i++) {
6473 // load 80 or 88 values from a into vs1_1/2/3
6474 vs_ldpq_post(vs1_1, a);
6475 vs_ldpq_post(vs1_2, a);
6476 if (i < 2) {
6477 __ ldr(vs1_3, __ Q, __ post(a, 16));
6478 }
6479 // load 80 or 88 values from b into vs2_1/2/3
6480 vs_ldpq_post(vs2_1, b);
6481 vs_ldpq_post(vs2_2, b);
6482 if (i < 2) {
6483 __ ldr(vs2_3, __ Q, __ post(b, 16));
6484 }
6485 // sum 80 or 88 values across vs1 and vs2 into vs1
6486 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6487 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6488 if (i < 2) {
6489 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6490 }
6491 // add constant to all 80 or 88 results
6492 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6493 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6494 if (i < 2) {
6495 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6496 }
6497 // store 80 or 88 values
6498 vs_stpq_post(vs1_1, result);
6499 vs_stpq_post(vs1_2, result);
6500 if (i < 2) {
6501 __ str(vs1_3, __ Q, __ post(result, 16));
6502 }
6503 }
6504
6505 __ leave(); // required for proper stackwalking of RuntimeStub frame
6506 __ mov(r0, zr); // return 0
6507 __ ret(lr);
6508
6509 // record the stub entry and end
6510 store_archive_data(stub_id, start, __ pc());
6511
6512 return start;
6513 }
6514
6515 // Kyber add 3 polynomials.
6516 // Implements
6517 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
6518 //
6519 // result (short[256]) = c_rarg0
6520 // a (short[256]) = c_rarg1
6521 // b (short[256]) = c_rarg2
6522 // c (short[256]) = c_rarg3
6523 address generate_kyberAddPoly_3() {
6524 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
6525 int entry_count = StubInfo::entry_count(stub_id);
6526 assert(entry_count == 1, "sanity check");
6527 address start = load_archive_data(stub_id);
6528 if (start != nullptr) {
6529 return start;
6530 }
6531 __ align(CodeEntryAlignment);
6532 StubCodeMark mark(this, stub_id);
6533 start = __ pc();
6534 __ enter();
6535
6536 const Register result = c_rarg0;
6537 const Register a = c_rarg1;
6538 const Register b = c_rarg2;
6539 const Register c = c_rarg3;
6540
6541 const Register kyberConsts = r11;
6542
6543 // As above we sum 256 sets of values in total i.e. 32 x 8H
6544 // quadwords. So, we can load, add and store the data in 3
6545 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6546 // of 10 or 11 registers. A further constraint is that the
6547 // mapping needs to skip callee saves. So, we allocate the
6548 // register sequences using two 8 sequences, two 2 sequences
6549 // and two single registers.
6550 VSeq<8> vs1_1(0);
6551 VSeq<2> vs1_2(16);
6552 FloatRegister vs1_3 = v28;
6553 VSeq<8> vs2_1(18);
6554 VSeq<2> vs2_2(26);
6555 FloatRegister vs2_3 = v29;
6556
6557 // two constant vector sequences
6558 VSeq<8> vc_1(31, 0);
6559 VSeq<2> vc_2(31, 0);
6560
6561 FloatRegister vc_3 = v31;
6562
6563 __ lea(kyberConsts,
6564 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6565
6566 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6567 for (int i = 0; i < 3; i++) {
6568 // load 80 or 88 values from a into vs1_1/2/3
6569 vs_ldpq_post(vs1_1, a);
6570 vs_ldpq_post(vs1_2, a);
6571 if (i < 2) {
6572 __ ldr(vs1_3, __ Q, __ post(a, 16));
6573 }
6574 // load 80 or 88 values from b into vs2_1/2/3
6575 vs_ldpq_post(vs2_1, b);
6576 vs_ldpq_post(vs2_2, b);
6577 if (i < 2) {
6578 __ ldr(vs2_3, __ Q, __ post(b, 16));
6579 }
6580 // sum 80 or 88 values across vs1 and vs2 into vs1
6581 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6582 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6583 if (i < 2) {
6584 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6585 }
6586 // load 80 or 88 values from c into vs2_1/2/3
6587 vs_ldpq_post(vs2_1, c);
6588 vs_ldpq_post(vs2_2, c);
6589 if (i < 2) {
6590 __ ldr(vs2_3, __ Q, __ post(c, 16));
6591 }
6592 // sum 80 or 88 values across vs1 and vs2 into vs1
6593 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6594 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6595 if (i < 2) {
6596 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6597 }
6598 // add constant to all 80 or 88 results
6599 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6600 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6601 if (i < 2) {
6602 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6603 }
6604 // store 80 or 88 values
6605 vs_stpq_post(vs1_1, result);
6606 vs_stpq_post(vs1_2, result);
6607 if (i < 2) {
6608 __ str(vs1_3, __ Q, __ post(result, 16));
6609 }
6610 }
6611
6612 __ leave(); // required for proper stackwalking of RuntimeStub frame
6613 __ mov(r0, zr); // return 0
6614 __ ret(lr);
6615
6616 // record the stub entry and end
6617 store_archive_data(stub_id, start, __ pc());
6618
6619 return start;
6620 }
6621
6622 // Kyber parse XOF output to polynomial coefficient candidates
6623 // or decodePoly(12, ...).
6624 // Implements
6625 // static int implKyber12To16(
6626 // byte[] condensed, int index, short[] parsed, int parsedLength) {}
6627 //
6628 // we assume that parsed and condensed are allocated such that for
6629 // n = (parsedLength + 63) / 64
6630 // n blocks of 96 bytes of input can be processed, i.e.
6631 // index + n * 96 <= condensed.length and
6632 // n * 64 <= parsed.length
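// As a worked example of the sizing assumption above: parsedLength == 256
// gives n == 4, so the stub reads 4 * 96 == 384 bytes starting at
// condensed[index] and writes 4 * 64 == 256 shorts into parsed.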
6633 //
6634 // condensed (byte[]) = c_rarg0
6635 // condensedIndex = c_rarg1
6636 // parsed (short[]) = c_rarg2
6637 // parsedLength = c_rarg3
6638 address generate_kyber12To16() {
6639 StubId stub_id = StubId::stubgen_kyber12To16_id;
6640 int entry_count = StubInfo::entry_count(stub_id);
6641 assert(entry_count == 1, "sanity check");
6642 address start = load_archive_data(stub_id);
6643 if (start != nullptr) {
6644 return start;
6645 }
6646 Label L_F00, L_loop;
6647
6648 __ align(CodeEntryAlignment);
6649 StubCodeMark mark(this, stub_id);
6650 start = __ pc();
6651 __ enter();
6652
6653 const Register condensed = c_rarg0;
6654 const Register condensedOffs = c_rarg1;
6655 const Register parsed = c_rarg2;
6656 const Register parsedLength = c_rarg3;
6657
6658 const Register tmpAddr = r11;
6659
6660 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
6661 // quadwords so we need a 6 vector sequence for the inputs.
6662 // Parsing produces 64 shorts, employing two 8 vector
6663 // sequences to store and combine the intermediate data.
6664 VSeq<6> vin(24);
6665 VSeq<8> va(0), vb(16);
6666
6667 __ adr(tmpAddr, L_F00);
6668 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6669 __ add(condensed, condensed, condensedOffs);
6670
6671 __ BIND(L_loop);
6672 // load 96 (6 x 16B) byte values
6673 vs_ld3_post(vin, __ T16B, condensed);
6674
6675 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6676 // holds 48 (16x3) contiguous bytes from memory striped
6677 // horizontally across each of the 16 byte lanes. Equivalently,
6678 // that is 16 pairs of 12-bit integers. Likewise the back half
6679 // holds the next 48 bytes in the same arrangement.
6680
6681 // Each vector in the front half can also be viewed as a vertical
6682 // strip across the 16 pairs of 12 bit integers. Each byte in
6683 // vin[0] stores the low 8 bits of the first int in a pair. Each
6684 // byte in vin[1] stores the high 4 bits of the first int and the
6685 // low 4 bits of the second int. Each byte in vin[2] stores the
6686 // high 8 bits of the second int. Likewise for the vectors in the second
6687 // half.
6688
6689 // Converting the data to 16-bit shorts requires first of all
6690 // expanding each of the 6 x 16B vectors into 6 corresponding
6691 // pairs of 8H vectors. Mask, shift and add operations on the
6692 // resulting vector pairs can be used to combine 4 and 8 bit
6693 // parts of related 8H vector elements.
6694 //
6695 // The middle vectors (vin[2] and vin[5]) are actually expanded
6696 // twice, one copy manipulated to provide the lower 4 bits
6697 // belonging to the first short in a pair and another copy
6698 // manipulated to provide the higher 4 bits belonging to the
6699 // second short in a pair. This is why the vector sequences va
6700 // and vb used to hold the expanded 8H elements are of length 8.
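// As a scalar sketch (for reference only, not generated code), each
// 3-byte group b0, b1, b2 decodes to two 12-bit values
//   s0 = b0 | ((b1 & 0xf) << 8);   // low 8 bits from b0, high 4 from b1
//   s1 = (b1 >> 4) | (b2 << 4);    // low 4 bits from b1, high 8 from b2
// The mask in v31 (8H x 0x0f00) and the shifts below implement exactly
// these two combinations across 16 lanes at a time.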
6701
6702 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6703 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6704 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6705 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6706 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6707 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6708 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6709 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6710
6711 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6712 // and vb[4:5]
6713 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6714 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6715 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6716 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6717 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6718 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6719
6720 // shift lo byte of copy 1 of the middle stripe into the high byte
6721 __ shl(va[2], __ T8H, va[2], 8);
6722 __ shl(va[3], __ T8H, va[3], 8);
6723 __ shl(vb[2], __ T8H, vb[2], 8);
6724 __ shl(vb[3], __ T8H, vb[3], 8);
6725
6726 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6727 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6728 // are in bit positions [4..11].
6729 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6730 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6731 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6732 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
6733
6734 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
6735 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
6736 // copy2
6737 __ andr(va[2], __ T16B, va[2], v31);
6738 __ andr(va[3], __ T16B, va[3], v31);
6739 __ ushr(va[4], __ T8H, va[4], 4);
6740 __ ushr(va[5], __ T8H, va[5], 4);
6741 __ andr(vb[2], __ T16B, vb[2], v31);
6742 __ andr(vb[3], __ T16B, vb[3], v31);
6743 __ ushr(vb[4], __ T8H, vb[4], 4);
6744 __ ushr(vb[5], __ T8H, vb[5], 4);
6745
6746 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
6747 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
6748 // n.b. the ordering ensures: i) inputs are consumed before they
6749 // are overwritten ii) the order of 16-bit results across successive
6750 // pairs of vectors in va and then vb reflects the order of the
6751 // corresponding 12-bit inputs
6752 __ addv(va[0], __ T8H, va[0], va[2]);
6753 __ addv(va[2], __ T8H, va[1], va[3]);
6754 __ addv(va[1], __ T8H, va[4], va[6]);
6755 __ addv(va[3], __ T8H, va[5], va[7]);
6756 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6757 __ addv(vb[2], __ T8H, vb[1], vb[3]);
6758 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6759 __ addv(vb[3], __ T8H, vb[5], vb[7]);
6760
6761 // store 64 results interleaved as shorts
6762 vs_st2_post(vs_front(va), __ T8H, parsed);
6763 vs_st2_post(vs_front(vb), __ T8H, parsed);
6764
6765 __ sub(parsedLength, parsedLength, 64);
6766 __ cmp(parsedLength, (u1)0);
6767 __ br(Assembler::GT, L_loop);
6768
6769 __ leave(); // required for proper stackwalking of RuntimeStub frame
6770 __ mov(r0, zr); // return 0
6771 __ ret(lr);
6772
6773 // bind label and generate constant data used by this stub
6774 __ BIND(L_F00);
6775 __ emit_int64(0x0f000f000f000f00);
6776 __ emit_int64(0x0f000f000f000f00);
6777
6778 // record the stub entry and end
6779 store_archive_data(stub_id, start, __ pc());
6780
6781 return start;
6782 }
6783
6784 // Kyber Barrett reduce function.
6785 // Implements
6786 // static int implKyberBarrettReduce(short[] coeffs) {}
6787 //
6788 // coeffs (short[256]) = c_rarg0
6789 address generate_kyberBarrettReduce() {
6790 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6791 int entry_count = StubInfo::entry_count(stub_id);
6792 assert(entry_count == 1, "sanity check");
6793 address start = load_archive_data(stub_id);
6794 if (start != nullptr) {
6795 return start;
6796 }
6797 __ align(CodeEntryAlignment);
6798 StubCodeMark mark(this, stub_id);
6799 start = __ pc();
6800 __ enter();
6801
6802 const Register coeffs = c_rarg0;
6803
6804 const Register kyberConsts = r10;
6805 const Register result = r11;
6806
6807 // As above we process 256 sets of values in total i.e. 32 x
6808 // 8H quadwords. So, we can load, add and store the data in 3
6809 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6810 // of 10 or 11 registers. A further constraint is that the
6811 // mapping needs to skip callee saves. So, we allocate the
6812 // register sequences using two 8 sequences, two 2 sequences
6813 // and two single registers.
6814 VSeq<8> vs1_1(0);
6815 VSeq<2> vs1_2(16);
6816 FloatRegister vs1_3 = v28;
6817 VSeq<8> vs2_1(18);
6818 VSeq<2> vs2_2(26);
6819 FloatRegister vs2_3 = v29;
6820
6821 // we also need a pair of corresponding constant sequences
6822
6823 VSeq<8> vc1_1(30, 0);
6824 VSeq<2> vc1_2(30, 0);
6825 FloatRegister vc1_3 = v30; // for kyber_q
6826
6827 VSeq<8> vc2_1(31, 0);
6828 VSeq<2> vc2_2(31, 0);
6829 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
6830
6831 __ add(result, coeffs, 0);
6832 __ lea(kyberConsts,
6833 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6834
6835 // load q and the multiplier for the Barrett reduction
6836 __ add(kyberConsts, kyberConsts, 16);
6837 __ ldpq(vc1_3, vc2_3, kyberConsts);
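// The loop below applies a signed Barrett reduction per 16-bit lane. As a
// scalar sketch, each lane x is reduced as
//   t = (x * kyberBarrettMultiplier) >> 26;  // sqdmulh (doubling, >> 16) then sshr #11
//   x = x - t * kyber_q;
// which leaves x congruent to the input mod kyber_q in a reduced range.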
6838
6839 for (int i = 0; i < 3; i++) {
6840 // load 80 or 88 coefficients
6841 vs_ldpq_post(vs1_1, coeffs);
6842 vs_ldpq_post(vs1_2, coeffs);
6843 if (i < 2) {
6844 __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
6845 }
6846
6847 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
6848 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
6849 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
6850 if (i < 2) {
6851 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
6852 }
6853
6854 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
6855 vs_sshr(vs2_1, __ T8H, vs2_1, 11);
6856 vs_sshr(vs2_2, __ T8H, vs2_2, 11);
6857 if (i < 2) {
6858 __ sshr(vs2_3, __ T8H, vs2_3, 11);
6859 }
6860
6861 // vs1 <- vs1 - vs2 * kyber_q
6862 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
6863 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
6864 if (i < 2) {
6865 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
6866 }
6867
6868 vs_stpq_post(vs1_1, result);
6869 vs_stpq_post(vs1_2, result);
6870 if (i < 2) {
6871 __ str(vs1_3, __ Q, __ post(result, 16));
6872 }
6873 }
6874
6875 __ leave(); // required for proper stackwalking of RuntimeStub frame
6876 __ mov(r0, zr); // return 0
6877 __ ret(lr);
6878
6879 // record the stub entry and end
6880 store_archive_data(stub_id, start, __ pc());
6881
6882 return start;
6883 }
6884
6885
6886 // Dilithium-specific montmul helper routines that generate parallel
6887 // code for, respectively, a single 4x4s vector sequence montmul or
6888 // two such multiplies in a row.
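// For reference, each lane of these helpers computes, in effect,
//   montmul(b, c) = b * c * R^-1 (mod q)   with R = 2^32 for Dilithium
// so every montmul introduces a factor of R^-1; callers either feed in
// operands that already carry a compensating Montgomery factor (e.g. the
// precomputed zetas) or montmul by R^2 mod q afterwards.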
6889
6890 // Perform 16 32-bit Montgomery multiplications in parallel
6891 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
6892 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6893 // Use the helper routine to schedule a 4x4S Montgomery multiply.
6894 // It will assert that the register use is valid
6895 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
6896 }
6897
6898 // Perform 2x16 32-bit Montgomery multiplications in parallel
6899 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
6900 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6901 // Schedule two successive 4x4S multiplies via the montmul helper
6902 // on the front and back halves of va, vb and vc. The helper will
6903 // assert that the register use has no overlap conflicts on each
6904 // individual call but we also need to ensure that the necessary
6905 // disjoint/equality constraints are met across both calls.
6906
6907 // vb, vc, vtmp and vq must be disjoint. va must either be
6908 // disjoint from all other registers or equal vc
6909
6910 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6911 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6912 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6913
6914 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6915 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6916
6917 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6918
6919 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6920 assert(vs_disjoint(va, vb), "va and vb overlap");
6921 assert(vs_disjoint(va, vq), "va and vq overlap");
6922 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6923
6924 // We multiply the front and back halves of each sequence 4 at a
6925 // time because
6926 //
6927 // 1) we are currently only able to get 4-way instruction
6928 // parallelism at best
6929 //
6930 // 2) we need registers for the constants in vq and temporary
6931 // scratch registers to hold intermediate results so vtmp can only
6932 // be a VSeq<4> which means we only have 4 scratch slots.
6933
6934 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6935 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6936 }
6937
6938 // Perform combined montmul then add/sub on 4x4S vectors.
6939 void dilithium_montmul16_sub_add(
6940 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6941 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6942 // compute a = montmul(a1, c)
6943 dilithium_montmul16(vc, va1, vc, vtmp, vq);
6944 // output a1 = a0 - a
6945 vs_subv(va1, __ T4S, va0, vc);
6946 // and a0 = a0 + a
6947 vs_addv(va0, __ T4S, va0, vc);
6948 }
6949
6950 // Perform combined add/sub then montmul on 4x4S vectors.
6951 void dilithium_sub_add_montmul16(
6952 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6953 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6954 // compute c = a0 - a1
6955 vs_subv(vtmp1, __ T4S, va0, va1);
6956 // output a0 = a0 + a1
6957 vs_addv(va0, __ T4S, va0, va1);
6958 // output a1 = b montmul c
6959 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6960 }
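// Taken together these two helpers implement the forward (Cooley-Tukey)
// and inverse (Gentleman-Sande) NTT butterflies on 16 lanes at a time:
//   forward:  (a0, a1) <- (a0 + zeta * a1, a0 - zeta * a1)
//   inverse:  (a0, a1) <- (a0 + a1, (a0 - a1) * zeta)
// where each product is a Montgomery multiply as described above.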
6961
6962 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6963 // in the Java implementation come in sequences of at least 8, so we
6964 // can use ldpq to collect the corresponding data into pairs of vector
6965 // registers.
6966 // We collect the coefficients corresponding to the 'j+l' indexes into
6967 // the vector registers v0-v7, the zetas into the vector registers v16-v23
6968 // then we do the (Montgomery) multiplications by the zetas in parallel
6969 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6970 // v0-v7, then do the additions into v24-v31 and the subtractions into
6971 // v0-v7 and finally save the results back to the coeffs array.
6972 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6973 const Register coeffs, const Register zetas) {
6974 int c1 = 0;
6975 int c2 = 512;
6976 int startIncr;
6977 // don't use callee save registers v8 - v15
6978 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6979 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6980 VSeq<2> vq(30); // n.b. constants overlap vs3
6981 int offsets[4] = { 0, 32, 64, 96 };
6982
6983 for (int level = 0; level < 5; level++) {
6984 int c1Start = c1;
6985 int c2Start = c2;
6986 if (level == 3) {
6987 offsets[1] = 32;
6988 offsets[2] = 128;
6989 offsets[3] = 160;
6990 } else if (level == 4) {
6991 offsets[1] = 64;
6992 offsets[2] = 128;
6993 offsets[3] = 192;
6994 }
6995
6996 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
6997 // time at 4 different offsets and multiply them in order by the
6998 // next set of input values. So we employ indexed load and store
6999 // pair instructions with arrangement 4S.
7000 for (int i = 0; i < 4; i++) {
7001 // reload q and qinv
7002 vs_ldpq(vq, dilithiumConsts); // qInv, q
7003 // load 8x4S coefficients via second start pos == c2
7004 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
7005 // load next 8x4S inputs == b
7006 vs_ldpq_post(vs2, zetas);
7007 // compute a == c2 * b mod MONT_Q
7008 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7009 // load 8x4s coefficients via first start pos == c1
7010 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
7011 // compute a1 = c1 + a
7012 vs_addv(vs3, __ T4S, vs1, vs2);
7013 // compute a2 = c1 - a
7014 vs_subv(vs1, __ T4S, vs1, vs2);
7015 // output a1 and a2
7016 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
7017 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
7018
7019 int k = 4 * level + i;
7020
7021 if (k > 7) {
7022 startIncr = 256;
7023 } else if (k == 5) {
7024 startIncr = 384;
7025 } else {
7026 startIncr = 128;
7027 }
7028
7029 c1Start += startIncr;
7030 c2Start += startIncr;
7031 }
7032
7033 c2 /= 2;
7034 }
7035 }
7036
7037 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
7038 // Implements the method
7039 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
7040 // of the Java class sun.security.provider.ML_DSA
7041 //
7042 // coeffs (int[256]) = c_rarg0
7043 // zetas (int[256]) = c_rarg1
7044 address generate_dilithiumAlmostNtt() {
7045 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
7046 int entry_count = StubInfo::entry_count(stub_id);
7047 assert(entry_count == 1, "sanity check");
7048 address start = load_archive_data(stub_id);
7049 if (start != nullptr) {
7050 return start;
7051 }
7052 __ align(CodeEntryAlignment);
7053 StubCodeMark mark(this, stub_id);
7054 start = __ pc();
7055 __ enter();
7056
7057 const Register coeffs = c_rarg0;
7058 const Register zetas = c_rarg1;
7059
7060 const Register tmpAddr = r9;
7061 const Register dilithiumConsts = r10;
7062 const Register result = r11;
7063 // don't use callee save registers v8 - v15
7064 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7065 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7066 VSeq<2> vq(30); // n.b. constants overlap vs3
7067 int offsets[4] = { 0, 32, 64, 96 };
7068 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
7069 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
7070 __ add(result, coeffs, 0);
7071 __ lea(dilithiumConsts,
7072 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7073
7074 // Each level represents one iteration of the outer for loop of the Java version.
7075
7076 // level 0-4
7077 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
7078
7079 // level 5
7080
7081 // At level 5 the coefficients we need to combine with the zetas
7082 // are grouped in memory in blocks of size 4. So, for both sets of
7083 // coefficients we load 4 adjacent values at 8 different offsets
7084 // using an indexed ldr with register variant Q and multiply them
7085 // in sequence order by the next set of inputs. Likewise we store
7086 // the results using an indexed str with register variant Q.
7087 for (int i = 0; i < 1024; i += 256) {
7088 // reload constants q, qinv each iteration as they get clobbered later
7089 vs_ldpq(vq, dilithiumConsts); // qInv, q
7090 // load 32 (8x4S) coefficients via first offsets = c1
7091 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
7092 // load next 32 (8x4S) inputs = b
7093 vs_ldpq_post(vs2, zetas);
7094 // a = b montmul c1
7095 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7096 // load 32 (8x4S) coefficients via second offsets = c2
7097 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
7098 // add/sub with result of multiply
7099 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a
7100 vs_subv(vs1, __ T4S, vs1, vs2); // a0 = c2 - a
7101 // write back new coefficients using same offsets
7102 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
7103 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
7104 }
7105
7106 // level 6
7107 // At level 6 the coefficients we need to combine with the zetas
7108 // are grouped in memory in pairs, the first two being montmul
7109 // inputs and the second add/sub inputs. We can still implement
7110 // the montmul+sub+add using 4-way parallelism but only if we
7111 // combine the coefficients with the zetas 16 at a time. We load 8
7112 // adjacent values at 4 different offsets using an ld2 load with
7113 // arrangement 2D. That interleaves the lower and upper halves of
7114 // each pair of quadwords into successive vector registers. We
7115 // then need to montmul the 4 even elements of the coefficients
7116 // register sequence by the zetas in order and then add/sub the 4
7117 // odd elements of the coefficients register sequence. We use an
7118 // equivalent st2 operation to store the results back into memory
7119 // de-interleaved.
7120 for (int i = 0; i < 1024; i += 128) {
7121 // reload constants q, qinv each iteration as they get clobbered later
7122 vs_ldpq(vq, dilithiumConsts); // qInv, q
7123 // load interleaved 16 (4x2D) coefficients via offsets
7124 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7125 // load next 16 (4x4S) inputs
7126 vs_ldpq_post(vs_front(vs2), zetas);
7127 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
7128 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
7129 vs_front(vs2), vtmp, vq);
7130 // store interleaved 16 (4x2D) coefficients via offsets
7131 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7132 }
7133
7134 // level 7
7135 // At level 7 the coefficients we need to combine with the zetas
7136 // occur singly with montmul inputs alternating with add/sub
7137 // inputs. Once again we can use 4-way parallelism to combine 16
7138 // zetas at a time. However, we have to load 8 adjacent values at
7139 // 4 different offsets using an ld2 load with arrangement 4S. That
7140 // interleaves the odd words of each pair into one
7141 // coefficients vector register and the even words of the pair
7142 // into the next register. We then need to montmul the 4 even
7143 // elements of the coefficients register sequence by the zetas in
7144 // order and then add/sub the 4 odd elements of the coefficients
7145 // register sequence. We use an equivalent st2 operation to store
7146 // the results back into memory de-interleaved.
7147
7148 for (int i = 0; i < 1024; i += 128) {
7149 // reload constants q, qinv each iteration as they get clobbered later
7150 vs_ldpq(vq, dilithiumConsts); // qInv, q
7151 // load interleaved 16 (4x4S) coefficients via offsets
7152 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7153 // load next 16 (4x4S) inputs
7154 vs_ldpq_post(vs_front(vs2), zetas);
7155 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
7156 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
7157 vs_front(vs2), vtmp, vq);
7158 // store interleaved 16 (4x4S) coefficients via offsets
7159 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7160 }
7161 __ leave(); // required for proper stackwalking of RuntimeStub frame
7162 __ mov(r0, zr); // return 0
7163 __ ret(lr);
7164
7165 // record the stub entry and end
7166 store_archive_data(stub_id, start, __ pc());
7167
7168 return start;
7169 }
7170
7171 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
7172 // in the Java implementation come in sequences of at least 8, so we
7173 // can use ldpq to collect the corresponding data into pairs of vector
7174 // registers.
7175 // We collect the coefficients that correspond to the 'j's into vs1 and
7176 // the coefficients that correspond to the 'j+l's into vs2, then
7177 // do the additions into vs3 and the subtractions into vs1. We then
7178 // save the result of the additions, load the zetas into vs2,
7179 // do the (Montgomery) multiplications by zeta in parallel into vs2 and
7180 // finally save the results back to the coeffs array.
7181 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
7182 const Register coeffs, const Register zetas) {
7183 int c1 = 0;
7184 int c2 = 32;
7185 int startIncr;
7186 int offsets[4];
7187 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7188 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7189 VSeq<2> vq(30); // n.b. constants overlap vs3
7190
7191 offsets[0] = 0;
7192
7193 for (int level = 3; level < 8; level++) {
7194 int c1Start = c1;
7195 int c2Start = c2;
7196 if (level == 3) {
7197 offsets[1] = 64;
7198 offsets[2] = 128;
7199 offsets[3] = 192;
7200 } else if (level == 4) {
7201 offsets[1] = 32;
7202 offsets[2] = 128;
7203 offsets[3] = 160;
7204 } else {
7205 offsets[1] = 32;
7206 offsets[2] = 64;
7207 offsets[3] = 96;
7208 }
7209
7210 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
7211 // time at 4 different offsets and multiply them in order by the
7212 // next set of input values. So we employ indexed load and store
7213 // pair instructions with arrangement 4S.
7214 for (int i = 0; i < 4; i++) {
7215 // load v1 32 (8x4S) coefficients relative to first start index
7216 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
7217 // load v2 32 (8x4S) coefficients relative to second start index
7218 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
7219 // a0 = v1 + v2 -- n.b. clobbers vq
7220 vs_addv(vs3, __ T4S, vs1, vs2);
7221 // a1 = v1 - v2
7222 vs_subv(vs1, __ T4S, vs1, vs2);
7223 // save a1 relative to first start index
7224 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
7225 // load constants q, qinv each iteration as they get clobbered above
7226 vs_ldpq(vq, dilithiumConsts); // qInv, q
7227 // load b next 32 (8x4S) inputs
7228 vs_ldpq_post(vs2, zetas);
7229 // a = a1 montmul b
7230 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7231 // save a relative to second start index
7232 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
7233
7234 int k = 4 * level + i;
7235
7236 if (k < 24) {
7237 startIncr = 256;
7238 } else if (k == 25) {
7239 startIncr = 384;
7240 } else {
7241 startIncr = 128;
7242 }
7243
7244 c1Start += startIncr;
7245 c2Start += startIncr;
7246 }
7247
7248 c2 *= 2;
7249 }
7250 }
7251
7252 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
7253 // Implements the method
7254 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
7255 // the sun.security.provider.ML_DSA class.
7256 //
7257 // coeffs (int[256]) = c_rarg0
7258 // zetas (int[256]) = c_rarg1
7259 address generate_dilithiumAlmostInverseNtt() {
7260 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
7261 int entry_count = StubInfo::entry_count(stub_id);
7262 assert(entry_count == 1, "sanity check");
7263 address start = load_archive_data(stub_id);
7264 if (start != nullptr) {
7265 return start;
7266 }
7267 __ align(CodeEntryAlignment);
7268 StubCodeMark mark(this, stub_id);
7269 start = __ pc();
7270 __ enter();
7271
7272 const Register coeffs = c_rarg0;
7273 const Register zetas = c_rarg1;
7274
7275 const Register tmpAddr = r9;
7276 const Register dilithiumConsts = r10;
7277 const Register result = r11;
7278 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7279 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7280 VSeq<2> vq(30); // n.b. constants overlap vs3
7281 int offsets[4] = { 0, 32, 64, 96 };
7282 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
7283 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
7284
7285 __ add(result, coeffs, 0);
7286 __ lea(dilithiumConsts,
7287 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7288
7289 // Each level represents one iteration of the outer for loop of the Java version
7290
7291 // level 0
7292 // At level 0 we need to interleave adjacent quartets of
7293 // coefficients before we multiply and add/sub by the next 16
7294 // zetas just as we did for level 7 in the multiply code. So we
7295 // load and store the values using an ld2/st2 with arrangement 4S.
7296 for (int i = 0; i < 1024; i += 128) {
7297 // load constants q, qinv
7298 // n.b. this can be moved out of the loop as they do not get
7299 // clobbered by first two loops
7300 vs_ldpq(vq, dilithiumConsts); // qInv, q
7301 // a0/a1 load interleaved 32 (8x4S) coefficients
7302 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7303 // b load next 32 (8x4S) inputs
7304 vs_ldpq_post(vs_front(vs2), zetas);
7305 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
7306 // n.b. second half of vs2 provides temporary register storage
7307 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
7308 vs_front(vs2), vs_back(vs2), vtmp, vq);
7309 // a0/a1 store interleaved 32 (8x4S) coefficients
7310 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7311 }
7312
7313 // level 1
7314 // At level 1 we need to interleave pairs of adjacent pairs of
7315 // coefficients before we multiply by the next 16 zetas just as we
7316 // did for level 6 in the multiply code. So we load and store the
7317 // values an ld2/st2 with arrangement 2D.
7318 for (int i = 0; i < 1024; i += 128) {
7319 // a0/a1 load interleaved 32 (8x2D) coefficients
7320 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7321 // b load next 16 (4x4S) inputs
7322 vs_ldpq_post(vs_front(vs2), zetas);
7323 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
7324 // n.b. second half of vs2 provides temporary register storage
7325 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
7326 vs_front(vs2), vs_back(vs2), vtmp, vq);
7327 // a0/a1 store interleaved 32 (8x2D) coefficients
7328 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7329 }
7330
7331 // level 2
7332 // At level 2 coefficients come in blocks of 4. So, we load 4
7333 // adjacent coefficients at 8 distinct offsets for both the first
7334 // and second coefficient sequences, using an ldr with register
7335 // variant Q then combine them with next set of 32 zetas. Likewise
7336 // we store the results using an str with register variant Q.
7337 for (int i = 0; i < 1024; i += 256) {
7338 // c0 load 32 (8x4S) coefficients via first offsets
7339 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
7340 // c1 load 32 (8x4S) coefficients via second offsets
7341 vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
7342 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
7343 vs_addv(vs3, __ T4S, vs1, vs2);
7344 // c = c0 - c1
7345 vs_subv(vs1, __ T4S, vs1, vs2);
7346 // store a0 32 (8x4S) coefficients via first offsets
7347 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
7348 // b load 32 (8x4S) next inputs
7349 vs_ldpq_post(vs2, zetas);
7350 // reload constants q, qinv -- they were clobbered earlier
7351 vs_ldpq(vq, dilithiumConsts); // qInv, q
7352 // compute a1 = b montmul c
7353 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7354 // store a1 32 (8x4S) coefficients via second offsets
7355 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
7356 }
7357
7358 // level 3-7
7359 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
7360
7361 __ leave(); // required for proper stackwalking of RuntimeStub frame
7362 __ mov(r0, zr); // return 0
7363 __ ret(lr);
7364
7365 // record the stub entry and end
7366 store_archive_data(stub_id, start, __ pc());
7367
7368 return start;
7369 }
7370
7371 // Dilithium multiply polynomials in the NTT domain.
7372 // Straightforward implementation of the method
7373 // static int implDilithiumNttMult(
7374 // int[] result, int[] ntta, int[] nttb) {} of
7375 // the sun.security.provider.ML_DSA class.
7376 //
7377 // result (int[256]) = c_rarg0
7378 // poly1 (int[256]) = c_rarg1
7379 // poly2 (int[256]) = c_rarg2
7380 address generate_dilithiumNttMult() {
7381 StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
7382 int entry_count = StubInfo::entry_count(stub_id);
7383 assert(entry_count == 1, "sanity check");
7384 address start = load_archive_data(stub_id);
7385 if (start != nullptr) {
7386 return start;
7387 }
7388 __ align(CodeEntryAlignment);
7389 StubCodeMark mark(this, stub_id);
7390 start = __ pc();
7391 __ enter();
7392
7393 Label L_loop;
7394
7395 const Register result = c_rarg0;
7396 const Register poly1 = c_rarg1;
7397 const Register poly2 = c_rarg2;
7398
7399 const Register dilithiumConsts = r10;
7400 const Register len = r11;
7401
7402 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7403 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7404 VSeq<2> vq(30); // n.b. constants overlap vs3
7405 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
7406
7407 __ lea(dilithiumConsts,
7408 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7409
7410 // load constants q, qinv
7411 vs_ldpq(vq, dilithiumConsts); // qInv, q
7412 // load constant rSquare into v29
7413 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
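// n.b. every montmul introduces a factor of R^-1 (R = 2^32 here), so the
// second montmul by rSquare = R^2 mod q in the loop below cancels both
// R^-1 factors:  rsquare montmul (b montmul c) == b * c (mod q).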
7414
7415 __ mov(len, zr);
7416 __ add(len, len, 1024);
7417
7418 __ BIND(L_loop);
7419
7420 // b load 32 (8x4S) next inputs from poly1
7421 vs_ldpq_post(vs1, poly1);
7422 // c load 32 (8x4S) next inputs from poly2
7423 vs_ldpq_post(vs2, poly2);
7424 // compute a = b montmul c
7425 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7426 // compute a = rsquare montmul a
7427 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
7428 // save a 32 (8x4S) results
7429 vs_stpq_post(vs2, result);
7430
7431 __ sub(len, len, 128);
7432 __ cmp(len, (u1)128);
7433 __ br(Assembler::GE, L_loop);
7434
7435 __ leave(); // required for proper stackwalking of RuntimeStub frame
7436 __ mov(r0, zr); // return 0
7437 __ ret(lr);
7438
7439 // record the stub entry and end
7440 store_archive_data(stub_id, start, __ pc());
7441
7442 return start;
7443 }
7444
7445 // Dilithium Montgomery multiply an array by a constant.
7446 // A straightforward implementation of the method
7447 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
7448 // of the sun.security.provider.ML_DSA class
7449 //
7450 // coeffs (int[256]) = c_rarg0
7451 // constant (int) = c_rarg1
7452 address generate_dilithiumMontMulByConstant() {
7453 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
7454 int entry_count = StubInfo::entry_count(stub_id);
7455 assert(entry_count == 1, "sanity check");
7456 address start = load_archive_data(stub_id);
7457 if (start != nullptr) {
7458 return start;
7459 }
7460 __ align(CodeEntryAlignment);
7461 StubCodeMark mark(this, stub_id);
7462 start = __ pc();
7463 __ enter();
7464
7465 Label L_loop;
7466
7467 const Register coeffs = c_rarg0;
7468 const Register constant = c_rarg1;
7469
7470 const Register dilithiumConsts = r10;
7471 const Register result = r11;
7472 const Register len = r12;
7473
7474 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7475 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7476 VSeq<2> vq(30); // n.b. constants overlap vs3
7477 VSeq<8> vconst(29, 0); // for montmul by constant
7478
7479 // results track inputs
7480 __ add(result, coeffs, 0);
7481 __ lea(dilithiumConsts,
7482 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7483
7484 // load constants q, qinv -- they do not get clobbered by first two loops
7485 vs_ldpq(vq, dilithiumConsts); // qInv, q
7486 // copy caller supplied constant across vconst
7487 __ dup(vconst[0], __ T4S, constant);
7488 __ mov(len, zr);
7489 __ add(len, len, 1024);
7490
7491 __ BIND(L_loop);
7492
7493 // load next 32 inputs
7494 vs_ldpq_post(vs2, coeffs);
7495 // mont mul by constant
7496 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
7497 // write next 32 results
7498 vs_stpq_post(vs2, result);
7499
7500 __ sub(len, len, 128);
7501 __ cmp(len, (u1)128);
7502 __ br(Assembler::GE, L_loop);
7503
7504 __ leave(); // required for proper stackwalking of RuntimeStub frame
7505 __ mov(r0, zr); // return 0
7506 __ ret(lr);
7507
7508 // record the stub entry and end
7509 store_archive_data(stub_id, start, __ pc());
7510
7511 return start;
7512 }
7513
7514 // Dilithium decompose poly.
7515 // Implements the method
7516 // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart, int twoGamma2, int multiplier) {}
7517 // of the sun.security.provider.ML_DSA class
7518 //
7519 // input (int[256]) = c_rarg0
7520 // lowPart (int[256]) = c_rarg1
7521 // highPart (int[256]) = c_rarg2
7522 // twoGamma2 (int) = c_rarg3
7523 // multiplier (int) = c_rarg4
7524 address generate_dilithiumDecomposePoly() {
7525 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
7526 int entry_count = StubInfo::entry_count(stub_id);
7527 assert(entry_count == 1, "sanity check");
7528 address start = load_archive_data(stub_id);
7529 if (start != nullptr) {
7530 return start;
7531 }
7532 __ align(CodeEntryAlignment);
7533 StubCodeMark mark(this, stub_id);
7534 start = __ pc();
7535 Label L_loop;
7536
7537 const Register input = c_rarg0;
7538 const Register lowPart = c_rarg1;
7539 const Register highPart = c_rarg2;
7540 const Register twoGamma2 = c_rarg3;
7541 const Register multiplier = c_rarg4;
7542
7543 const Register len = r9;
7544 const Register dilithiumConsts = r10;
7545 const Register tmp = r11;
7546
7547 // 6 independent sets of 4x4s values
7548 VSeq<4> vs1(0), vs2(4), vs3(8);
7549 VSeq<4> vs4(12), vs5(16), vtmp(20);
7550
7551 // 7 constants for cross-multiplying
7552 VSeq<4> one(25, 0);
7553 VSeq<4> qminus1(26, 0);
7554 VSeq<4> g2(27, 0);
7555 VSeq<4> twog2(28, 0);
7556 VSeq<4> mult(29, 0);
7557 VSeq<4> q(30, 0);
7558 VSeq<4> qadd(31, 0);
7559
7560 __ enter();
7561
7562 __ lea(dilithiumConsts,
7563 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7564
7565 // save callee-saved registers
7566 __ stpd(v8, v9, __ pre(sp, -64));
7567 __ stpd(v10, v11, Address(sp, 16));
7568 __ stpd(v12, v13, Address(sp, 32));
7569 __ stpd(v14, v15, Address(sp, 48));
7570
7571 // populate constant registers
7572 __ mov(tmp, zr);
7573 __ add(tmp, tmp, 1);
7574 __ dup(one[0], __ T4S, tmp); // 1
7575 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
7576 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
7577 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
7578 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
7579 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
7580 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
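// The loop below vectorizes the ML_DSA decomposition (a sketch of the
// intent, not the exact Java source): after reducing rplus mod q each
// coefficient is split as
//   rplus = r1 * twoGamma2 + r0   with  -gamma2 < r0 <= gamma2
// and the special case rplus - r0 == q - 1 forces r1 = 0, r0 = r0 - 1.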
7581
7582 __ mov(len, zr);
7583 __ add(len, len, 1024);
7584
7585 __ BIND(L_loop);
7586
7587 // load next 4x4S inputs interleaved: rplus --> vs1
7588 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
7589
7590 // rplus = rplus - ((rplus + qadd) >> 23) * q
7591 vs_addv(vtmp, __ T4S, vs1, qadd);
7592 vs_sshr(vtmp, __ T4S, vtmp, 23);
7593 vs_mulv(vtmp, __ T4S, vtmp, q);
7594 vs_subv(vs1, __ T4S, vs1, vtmp);
7595
7596 // rplus = rplus + ((rplus >> 31) & dilithium_q);
7597 vs_sshr(vtmp, __ T4S, vs1, 31);
7598 vs_andr(vtmp, vtmp, q);
7599 vs_addv(vs1, __ T4S, vs1, vtmp);
7600
7601 // quotient --> vs2
7602 // int quotient = (rplus * multiplier) >> 22;
7603 vs_mulv(vtmp, __ T4S, vs1, mult);
7604 vs_sshr(vs2, __ T4S, vtmp, 22);
7605
7606 // r0 --> vs3
7607 // int r0 = rplus - quotient * twoGamma2;
7608 vs_mulv(vtmp, __ T4S, vs2, twog2);
7609 vs_subv(vs3, __ T4S, vs1, vtmp);
7610
7611 // mask --> vs4
7612 // int mask = (twoGamma2 - r0) >> 22;
7613 vs_subv(vtmp, __ T4S, twog2, vs3);
7614 vs_sshr(vs4, __ T4S, vtmp, 22);
7615
7616 // r0 -= (mask & twoGamma2);
7617 vs_andr(vtmp, vs4, twog2);
7618 vs_subv(vs3, __ T4S, vs3, vtmp);
7619
7620 // quotient += (mask & 1);
7621 vs_andr(vtmp, vs4, one);
7622 vs_addv(vs2, __ T4S, vs2, vtmp);
7623
7624 // mask = (twoGamma2 / 2 - r0) >> 31;
7625 vs_subv(vtmp, __ T4S, g2, vs3);
7626 vs_sshr(vs4, __ T4S, vtmp, 31);
7627
7628 // r0 -= (mask & twoGamma2);
7629 vs_andr(vtmp, vs4, twog2);
7630 vs_subv(vs3, __ T4S, vs3, vtmp);
7631
7632 // quotient += (mask & 1);
7633 vs_andr(vtmp, vs4, one);
7634 vs_addv(vs2, __ T4S, vs2, vtmp);
7635
7636 // r1 --> vs5
7637 // int r1 = rplus - r0 - (dilithium_q - 1);
7638 vs_subv(vtmp, __ T4S, vs1, vs3);
7639 vs_subv(vs5, __ T4S, vtmp, qminus1);
7640
7641 // r1 --> vs1 (overwriting rplus)
7642 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
7643 vs_negr(vtmp, __ T4S, vs5);
7644 vs_orr(vtmp, vs5, vtmp);
7645 vs_sshr(vs1, __ T4S, vtmp, 31);
7646
7647 // r0 += ~r1;
7648 vs_notr(vtmp, vs1);
7649 vs_addv(vs3, __ T4S, vs3, vtmp);
7650
7651 // r1 = r1 & quotient;
7652 vs_andr(vs1, vs2, vs1);
7653
7654 // store results interleaved
7655 // lowPart[m] = r0;
7656 // highPart[m] = r1;
7657 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
7658 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
7659
7660 __ sub(len, len, 64);
7661 __ cmp(len, (u1)64);
7662 __ br(Assembler::GE, L_loop);
7663
7664 // restore callee-saved vector registers
7665 __ ldpd(v14, v15, Address(sp, 48));
7666 __ ldpd(v12, v13, Address(sp, 32));
7667 __ ldpd(v10, v11, Address(sp, 16));
7668 __ ldpd(v8, v9, __ post(sp, 64));
7669
7670 __ leave(); // required for proper stackwalking of RuntimeStub frame
7671 __ mov(r0, zr); // return 0
7672 __ ret(lr);
7673
7674 // record the stub entry and end
7675 store_archive_data(stub_id, start, __ pc());
7676
7677 return start;
7678 }
7679
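// Keccak chi step over one row of five lanes, computed with bic/eor so that
// each lane becomes a[i] ^ (~a[i+1] & a[i+2]) (indices mod 5). tmp0..tmp2
// hold the bic results so inputs are not overwritten before they are consumed.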
7680 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
7681 Register tmp0, Register tmp1, Register tmp2) {
7682 __ bic(tmp0, a2, a1); // for a0
7683 __ bic(tmp1, a3, a2); // for a1
7684 __ bic(tmp2, a4, a3); // for a2
7685 __ eor(a2, a2, tmp2);
7686 __ bic(tmp2, a0, a4); // for a3
7687 __ eor(a3, a3, tmp2);
7688 __ bic(tmp2, a1, a0); // for a4
7689 __ eor(a0, a0, tmp0);
7690 __ eor(a1, a1, tmp1);
7691 __ eor(a4, a4, tmp2);
7692 }
7693
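// One round of Keccak-f[1600] carried out entirely in general-purpose
// registers: the first block computes the theta column parities (c0..c4),
// derives d0..d4 with rax1 and xors them into the lanes; the rol sequence is
// the combined rho/pi rotation and lane permutation; bcax5 applies chi to
// each row; and the final load/eor adds the round constant (iota) read from
// the table addressed by rc.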
7694 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
7695 Register a0, Register a1, Register a2, Register a3, Register a4,
7696 Register a5, Register a6, Register a7, Register a8, Register a9,
7697 Register a10, Register a11, Register a12, Register a13, Register a14,
7698 Register a15, Register a16, Register a17, Register a18, Register a19,
7699 Register a20, Register a21, Register a22, Register a23, Register a24,
7700 Register tmp0, Register tmp1, Register tmp2) {
7701 __ eor3(tmp1, a4, a9, a14);
7702 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
7703 __ eor3(tmp2, a1, a6, a11);
7704 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
7705 __ rax1(tmp2, tmp0, tmp1); // d0
7706 {
7707
7708 Register tmp3, tmp4;
7709 if (can_use_fp && can_use_r18) {
7710 tmp3 = rfp;
7711 tmp4 = r18_tls;
7712 } else {
7713 tmp3 = a4;
7714 tmp4 = a9;
7715 __ stp(tmp3, tmp4, __ pre(sp, -16));
7716 }
7717
7718 __ eor3(tmp3, a0, a5, a10);
7719 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
7720 __ eor(a0, a0, tmp2);
7721 __ eor(a5, a5, tmp2);
7722 __ eor(a10, a10, tmp2);
7723 __ eor(a15, a15, tmp2);
7724 __ eor(a20, a20, tmp2); // d0(tmp2)
7725 __ eor3(tmp3, a2, a7, a12);
7726 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
7727 __ rax1(tmp3, tmp4, tmp2); // d1
7728 __ eor(a1, a1, tmp3);
7729 __ eor(a6, a6, tmp3);
7730 __ eor(a11, a11, tmp3);
7731 __ eor(a16, a16, tmp3);
7732 __ eor(a21, a21, tmp3); // d1(tmp3)
7733 __ rax1(tmp3, tmp2, tmp0); // d3
7734 __ eor3(tmp2, a3, a8, a13);
7735 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
7736 __ eor(a3, a3, tmp3);
7737 __ eor(a8, a8, tmp3);
7738 __ eor(a13, a13, tmp3);
7739 __ eor(a18, a18, tmp3);
7740 __ eor(a23, a23, tmp3);
7741 __ rax1(tmp2, tmp1, tmp0); // d2
7742 __ eor(a2, a2, tmp2);
7743 __ eor(a7, a7, tmp2);
7744 __ eor(a12, a12, tmp2);
7745 __ rax1(tmp0, tmp0, tmp4); // d4
7746 if (!can_use_fp || !can_use_r18) {
7747 __ ldp(tmp3, tmp4, __ post(sp, 16));
7748 }
7749 __ eor(a17, a17, tmp2);
7750 __ eor(a22, a22, tmp2);
7751 __ eor(a4, a4, tmp0);
7752 __ eor(a9, a9, tmp0);
7753 __ eor(a14, a14, tmp0);
7754 __ eor(a19, a19, tmp0);
7755 __ eor(a24, a24, tmp0);
7756 }
7757
7758 __ rol(tmp0, a10, 3);
7759 __ rol(a10, a1, 1);
7760 __ rol(a1, a6, 44);
7761 __ rol(a6, a9, 20);
7762 __ rol(a9, a22, 61);
7763 __ rol(a22, a14, 39);
7764 __ rol(a14, a20, 18);
7765 __ rol(a20, a2, 62);
7766 __ rol(a2, a12, 43);
7767 __ rol(a12, a13, 25);
7768 __ rol(a13, a19, 8);
7769 __ rol(a19, a23, 56);
7770 __ rol(a23, a15, 41);
7771 __ rol(a15, a4, 27);
7772 __ rol(a4, a24, 14);
7773 __ rol(a24, a21, 2);
7774 __ rol(a21, a8, 55);
7775 __ rol(a8, a16, 45);
7776 __ rol(a16, a5, 36);
7777 __ rol(a5, a3, 28);
7778 __ rol(a3, a18, 21);
7779 __ rol(a18, a17, 15);
7780 __ rol(a17, a11, 10);
7781 __ rol(a11, a7, 6);
7782 __ mov(a7, tmp0);
7783
7784 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
7785 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
7786 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
7787 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
7788 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
7789
7790 __ ldr(tmp1, __ post(rc, 8));
7791 __ eor(a0, a0, tmp1);
7792
7793 }
7794
7795 // Arguments:
7796 //
7797 // Inputs:
7798 // c_rarg0 - byte[] source+offset
7799 // c_rarg1 - byte[] SHA.state
7800 // c_rarg2 - int block_size
7801 // c_rarg3 - int offset
7802 // c_rarg4 - int limit
7803 //
7804 address generate_sha3_implCompress_gpr(StubId stub_id) {
7805 bool multi_block;
7806 switch (stub_id) {
7807 case StubId::stubgen_sha3_implCompress_id:
7808 multi_block = false;
7809 break;
7810 case StubId::stubgen_sha3_implCompressMB_id:
7811 multi_block = true;
7812 break;
7813 default:
7814 ShouldNotReachHere();
7815 }
7816 int entry_count = StubInfo::entry_count(stub_id);
7817 assert(entry_count == 1, "sanity check");
7818 address start = load_archive_data(stub_id);
7819 if (start != nullptr) {
7820 return start;
7821 }
7822 __ align(CodeEntryAlignment);
7823 StubCodeMark mark(this, stub_id);
7824 start = __ pc();
7825
7826 Register buf = c_rarg0;
7827 Register state = c_rarg1;
7828 Register block_size = c_rarg2;
7829 Register ofs = c_rarg3;
7830 Register limit = c_rarg4;
7831
7832 // use r3..r17, r19..r28 to keep a0..a24.
7833 // a0..a24 are respective locals from SHA3.java
7834 Register a0 = r25,
7835 a1 = r26,
7836 a2 = r27,
7837 a3 = r3,
7838 a4 = r4,
7839 a5 = r5,
7840 a6 = r6,
7841 a7 = r7,
7842 a8 = rscratch1, // r8
7843 a9 = rscratch2, // r9
7844 a10 = r10,
7845 a11 = r11,
7846 a12 = r12,
7847 a13 = r13,
7848 a14 = r14,
7849 a15 = r15,
7850 a16 = r16,
7851 a17 = r17,
7852 a18 = r28,
7853 a19 = r19,
7854 a20 = r20,
7855 a21 = r21,
7856 a22 = r22,
7857 a23 = r23,
7858 a24 = r24;
7859
7860 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
7861
7862 Label sha3_loop, rounds24_preloop, loop_body;
7863 Label sha3_512_or_sha3_384, shake128;
7864
7865 bool can_use_r18 = false;
7866 #ifndef R18_RESERVED
7867 can_use_r18 = true;
7868 #endif
7869 bool can_use_fp = !PreserveFramePointer;
7870
7871 __ enter();
7872
7873 // save almost all of the as-yet-unsaved GPRs on the stack
7874 __ str(block_size, __ pre(sp, -128));
7875 if (multi_block) {
7876 __ stpw(ofs, limit, Address(sp, 8));
7877 }
7878 // 8 bytes at sp+16 will be used to keep buf
7879 __ stp(r19, r20, Address(sp, 32));
7880 __ stp(r21, r22, Address(sp, 48));
7881 __ stp(r23, r24, Address(sp, 64));
7882 __ stp(r25, r26, Address(sp, 80));
7883 __ stp(r27, r28, Address(sp, 96));
7884 if (can_use_r18 && can_use_fp) {
7885 __ stp(r18_tls, state, Address(sp, 112));
7886 } else {
7887 __ str(state, Address(sp, 112));
7888 }
7889
7890 // begin sha3 calculations: load a0..a24 from the state array
7891 __ ldp(a0, a1, state);
7892 __ ldp(a2, a3, Address(state, 16));
7893 __ ldp(a4, a5, Address(state, 32));
7894 __ ldp(a6, a7, Address(state, 48));
7895 __ ldp(a8, a9, Address(state, 64));
7896 __ ldp(a10, a11, Address(state, 80));
7897 __ ldp(a12, a13, Address(state, 96));
7898 __ ldp(a14, a15, Address(state, 112));
7899 __ ldp(a16, a17, Address(state, 128));
7900 __ ldp(a18, a19, Address(state, 144));
7901 __ ldp(a20, a21, Address(state, 160));
7902 __ ldp(a22, a23, Address(state, 176));
7903 __ ldr(a24, Address(state, 192));
7904
7905 __ BIND(sha3_loop);
7906
7907 // load input
7908 __ ldp(tmp3, tmp2, __ post(buf, 16));
7909 __ eor(a0, a0, tmp3);
7910 __ eor(a1, a1, tmp2);
7911 __ ldp(tmp3, tmp2, __ post(buf, 16));
7912 __ eor(a2, a2, tmp3);
7913 __ eor(a3, a3, tmp2);
7914 __ ldp(tmp3, tmp2, __ post(buf, 16));
7915 __ eor(a4, a4, tmp3);
7916 __ eor(a5, a5, tmp2);
7917 __ ldr(tmp3, __ post(buf, 8));
7918 __ eor(a6, a6, tmp3);
7919
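// The block size identifies the variant, so the absorb code below dispatches
// on individual bits of block_size instead of comparing against each value:
//    72 = 0b01001000  SHA3-512              (bit 7 clear, bit 5 clear)
//   104 = 0b01101000  SHA3-384              (bit 7 clear, bit 5 set)
//   136 = 0b10001000  SHA3-256 / SHAKE256   (bit 7 set, bits 4 and 5 clear)
//   144 = 0b10010000  SHA3-224              (bit 7 set, bit 4 set)
//   168 = 0b10101000  SHAKE128              (bit 7 set, bit 5 set)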
7920 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
7921 __ tbz(block_size, 7, sha3_512_or_sha3_384);
7922
7923 __ ldp(tmp3, tmp2, __ post(buf, 16));
7924 __ eor(a7, a7, tmp3);
7925 __ eor(a8, a8, tmp2);
7926 __ ldp(tmp3, tmp2, __ post(buf, 16));
7927 __ eor(a9, a9, tmp3);
7928 __ eor(a10, a10, tmp2);
7929 __ ldp(tmp3, tmp2, __ post(buf, 16));
7930 __ eor(a11, a11, tmp3);
7931 __ eor(a12, a12, tmp2);
7932 __ ldp(tmp3, tmp2, __ post(buf, 16));
7933 __ eor(a13, a13, tmp3);
7934 __ eor(a14, a14, tmp2);
7935 __ ldp(tmp3, tmp2, __ post(buf, 16));
7936 __ eor(a15, a15, tmp3);
7937 __ eor(a16, a16, tmp2);
7938
7939 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
7940 __ andw(tmp2, block_size, 48);
7941 __ cbzw(tmp2, rounds24_preloop);
7942 __ tbnz(block_size, 5, shake128);
7943 // block_size == 144, bit5 == 0, SHA3-224
7944 __ ldr(tmp3, __ post(buf, 8));
7945 __ eor(a17, a17, tmp3);
7946 __ b(rounds24_preloop);
7947
7948 __ BIND(shake128);
7949 __ ldp(tmp3, tmp2, __ post(buf, 16));
7950 __ eor(a17, a17, tmp3);
7951 __ eor(a18, a18, tmp2);
7952 __ ldp(tmp3, tmp2, __ post(buf, 16));
7953 __ eor(a19, a19, tmp3);
7954 __ eor(a20, a20, tmp2);
7955 __ b(rounds24_preloop); // block_size == 168, SHAKE128
7956
7957 __ BIND(sha3_512_or_sha3_384);
7958 __ ldp(tmp3, tmp2, __ post(buf, 16));
7959 __ eor(a7, a7, tmp3);
7960 __ eor(a8, a8, tmp2);
7961 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
7962
7963 // SHA3-384
7964 __ ldp(tmp3, tmp2, __ post(buf, 16));
7965 __ eor(a9, a9, tmp3);
7966 __ eor(a10, a10, tmp2);
7967 __ ldp(tmp3, tmp2, __ post(buf, 16));
7968 __ eor(a11, a11, tmp3);
7969 __ eor(a12, a12, tmp2);
7970
7971 __ BIND(rounds24_preloop);
7972 __ fmovs(v0, 24.0); // float loop counter,
7973 __ fmovs(v1, 1.0); // exact representation
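// The 24-round counter is kept in FP registers (24.0 counted down by 1.0,
// both exactly representable in single precision) since, with a0..a24 plus
// the temporaries, essentially every general-purpose register is already in use.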
7974
7975 __ str(buf, Address(sp, 16));
7976 __ lea(tmp3, ExternalAddress((address) _sha3_round_consts));
7977
7978 __ BIND(loop_body);
7979 keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
7980 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
7981 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
7982 tmp0, tmp1, tmp2);
7983 __ fsubs(v0, v0, v1);
7984 __ fcmps(v0, 0.0);
7985 __ br(__ NE, loop_body);
7986
7987 if (multi_block) {
7988 __ ldrw(block_size, sp); // block_size
7989 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
7990 __ addw(tmp2, tmp2, block_size);
7991 __ cmpw(tmp2, tmp1);
7992 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
7993 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
7994 __ br(Assembler::LE, sha3_loop);
7995 __ movw(c_rarg0, tmp2); // return offset
7996 }
7997 if (can_use_fp && can_use_r18) {
7998 __ ldp(r18_tls, state, Address(sp, 112));
7999 } else {
8000 __ ldr(state, Address(sp, 112));
8001 }
8002 // save calculated sha3 state
8003 __ stp(a0, a1, Address(state));
8004 __ stp(a2, a3, Address(state, 16));
8005 __ stp(a4, a5, Address(state, 32));
8006 __ stp(a6, a7, Address(state, 48));
8007 __ stp(a8, a9, Address(state, 64));
8008 __ stp(a10, a11, Address(state, 80));
8009 __ stp(a12, a13, Address(state, 96));
8010 __ stp(a14, a15, Address(state, 112));
8011 __ stp(a16, a17, Address(state, 128));
8012 __ stp(a18, a19, Address(state, 144));
8013 __ stp(a20, a21, Address(state, 160));
8014 __ stp(a22, a23, Address(state, 176));
8015 __ str(a24, Address(state, 192));
8016
8017 // restore required registers from stack
8018 __ ldp(r19, r20, Address(sp, 32));
8019 __ ldp(r21, r22, Address(sp, 48));
8020 __ ldp(r23, r24, Address(sp, 64));
8021 __ ldp(r25, r26, Address(sp, 80));
8022 __ ldp(r27, r28, Address(sp, 96));
8023 if (can_use_fp && can_use_r18) {
8024 __ add(rfp, sp, 128); // leave() will copy rfp to sp below
8025 } // else no need to recalculate rfp, since it wasn't changed
8026
8027 __ leave();
8028
8029 __ ret(lr);
8030
8031 // record the stub entry and end
8032 store_archive_data(stub_id, start, __ pc());
8033
8034 return start;
8035 }
8036
8037 /**
8038 * Arguments:
8039 *
8040 * Inputs:
8041 * c_rarg0 - int crc
8042 * c_rarg1 - byte* buf
8043 * c_rarg2 - int length
8044 *
8045 * Output:
8046 * r0 - int crc result
8047 */
8048 address generate_updateBytesCRC32() {
8049 assert(UseCRC32Intrinsics, "what are we doing here?");
8050 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
8051 int entry_count = StubInfo::entry_count(stub_id);
8052 assert(entry_count == 1, "sanity check");
8053 address start = load_archive_data(stub_id);
8054 if (start != nullptr) {
8055 return start;
8056 }
8057 __ align(CodeEntryAlignment);
8058 StubCodeMark mark(this, stub_id);
8059
8060 start = __ pc();
8061
8062 const Register crc = c_rarg0; // crc
8063 const Register buf = c_rarg1; // source java byte array address
8064 const Register len = c_rarg2; // length
8065 const Register table0 = c_rarg3; // crc_table address
8066 const Register table1 = c_rarg4;
8067 const Register table2 = c_rarg5;
8068 const Register table3 = c_rarg6;
8069 const Register tmp3 = c_rarg7;
8070
8071 BLOCK_COMMENT("Entry:");
8072 __ enter(); // required for proper stackwalking of RuntimeStub frame
8073
8074 __ kernel_crc32(crc, buf, len,
8075 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
8076
8077 __ leave(); // required for proper stackwalking of RuntimeStub frame
8078 __ ret(lr);
8079
8080 // record the stub entry and end
8081 store_archive_data(stub_id, start, __ pc());
8082
8083 return start;
8084 }
8085
8086 /**
8087 * Arguments:
8088 *
8089 * Inputs:
8090 * c_rarg0 - int crc
8091 * c_rarg1 - byte* buf
8092 * c_rarg2 - int length
8093 * c_rarg3 - int* table
8094 *
8095 * Output:
8096 * r0 - int crc result
8097 */
8098 address generate_updateBytesCRC32C() {
8099 assert(UseCRC32CIntrinsics, "what are we doing here?");
8100 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
8101 int entry_count = StubInfo::entry_count(stub_id);
8102 assert(entry_count == 1, "sanity check");
8103 address start = load_archive_data(stub_id);
8104 if (start != nullptr) {
8105 return start;
8106 }
8107 __ align(CodeEntryAlignment);
8108 StubCodeMark mark(this, stub_id);
8109
8110 start = __ pc();
8111
8112 const Register crc = c_rarg0; // crc
8113 const Register buf = c_rarg1; // source java byte array address
8114 const Register len = c_rarg2; // length
8115 const Register table0 = c_rarg3; // crc_table address
8116 const Register table1 = c_rarg4;
8117 const Register table2 = c_rarg5;
8118 const Register table3 = c_rarg6;
8119 const Register tmp3 = c_rarg7;
8120
8121 BLOCK_COMMENT("Entry:");
8122 __ enter(); // required for proper stackwalking of RuntimeStub frame
8123
8124 __ kernel_crc32c(crc, buf, len,
8125 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
8126
8127 __ leave(); // required for proper stackwalking of RuntimeStub frame
8128 __ ret(lr);
8129
8130 // record the stub entry and end
8131 store_archive_data(stub_id, start, __ pc());
8132
8133 return start;
8134 }
8135
8136 /***
8137 * Arguments:
8138 *
8139 * Inputs:
8140 * c_rarg0 - int adler
8141 * c_rarg1 - byte* buff
8142 * c_rarg2 - int len
8143 *
8144 * Output:
8145 * c_rarg0 - int adler result
8146 */
8147 address generate_updateBytesAdler32() {
8148 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
8149 int entry_count = StubInfo::entry_count(stub_id);
8150 assert(entry_count == 1, "sanity check");
8151 address start = load_archive_data(stub_id);
8152 if (start != nullptr) {
8153 return start;
8154 }
8155 __ align(CodeEntryAlignment);
8156 StubCodeMark mark(this, stub_id);
8157 start = __ pc();
8158
8159 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
8160
8161 // Aliases
8162 Register adler = c_rarg0;
8163 Register s1 = c_rarg0;
8164 Register s2 = c_rarg3;
8165 Register buff = c_rarg1;
8166 Register len = c_rarg2;
8167 Register nmax = r4;
8168 Register base = r5;
8169 Register count = r6;
8170 Register temp0 = rscratch1;
8171 Register temp1 = rscratch2;
8172 FloatRegister vbytes = v0;
8173 FloatRegister vs1acc = v1;
8174 FloatRegister vs2acc = v2;
8175 FloatRegister vtable = v3;
8176
8177 // Max number of bytes we can process before having to take the mod
8178 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
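// (for reference: n = 5552 gives 255*5552*5553/2 + 5553*65520 = 4,294,690,200
//  <= 2^32-1, whereas n = 5553 would already exceed it)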
8179 uint64_t BASE = 0xfff1;
8180 uint64_t NMAX = 0x15B0;
8181
8182 __ mov(base, BASE);
8183 __ mov(nmax, NMAX);
8184
8185 // Load accumulation coefficients for the upper 16 bits
8186 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
8187 __ ld1(vtable, __ T16B, Address(temp0));
8188
8189 // s1 is initialized to the lower 16 bits of adler
8190 // s2 is initialized to the upper 16 bits of adler
8191 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
8192 __ uxth(s1, adler); // s1 = (adler & 0xffff)
8193
8194 // The pipelined loop needs at least 16 elements for one iteration
8195 // It does check this, but it is more efficient to branch straight to the cleanup loop
8196 __ cmp(len, (u1)16);
8197 __ br(Assembler::HS, L_nmax);
8198 __ cbz(len, L_combine);
8199
8200 __ bind(L_simple_by1_loop);
8201 __ ldrb(temp0, Address(__ post(buff, 1)));
8202 __ add(s1, s1, temp0);
8203 __ add(s2, s2, s1);
8204 __ subs(len, len, 1);
8205 __ br(Assembler::HI, L_simple_by1_loop);
8206
8207 // s1 = s1 % BASE
8208 __ subs(temp0, s1, base);
8209 __ csel(s1, temp0, s1, Assembler::HS);
8210
8211 // s2 = s2 % BASE
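// Since 2^16 == 15 (mod 65521), the high half can be folded in as
// s2 = (s2 >> 16) * 15 + (s2 & 0xffff); the lsl/sub pair below computes the
// multiply by 15 as (x << 4) - x, and the subs/csel afterwards performs the
// final conditional subtract of BASE.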
8212 __ lsr(temp0, s2, 16);
8213 __ lsl(temp1, temp0, 4);
8214 __ sub(temp1, temp1, temp0);
8215 __ add(s2, temp1, s2, ext::uxth);
8216
8217 __ subs(temp0, s2, base);
8218 __ csel(s2, temp0, s2, Assembler::HS);
8219
8220 __ b(L_combine);
8221
8222 __ bind(L_nmax);
8223 __ subs(len, len, nmax);
8224 __ sub(count, nmax, 16);
8225 __ br(Assembler::LO, L_by16);
8226
8227 __ bind(L_nmax_loop);
8228
8229 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
8230 vbytes, vs1acc, vs2acc, vtable);
8231
8232 __ subs(count, count, 16);
8233 __ br(Assembler::HS, L_nmax_loop);
8234
8235 // s1 = s1 % BASE
8236 __ lsr(temp0, s1, 16);
8237 __ lsl(temp1, temp0, 4);
8238 __ sub(temp1, temp1, temp0);
8239 __ add(temp1, temp1, s1, ext::uxth);
8240
8241 __ lsr(temp0, temp1, 16);
8242 __ lsl(s1, temp0, 4);
8243 __ sub(s1, s1, temp0);
8244 __ add(s1, s1, temp1, ext::uxth);
8245
8246 __ subs(temp0, s1, base);
8247 __ csel(s1, temp0, s1, Assembler::HS);
8248
8249 // s2 = s2 % BASE
8250 __ lsr(temp0, s2, 16);
8251 __ lsl(temp1, temp0, 4);
8252 __ sub(temp1, temp1, temp0);
8253 __ add(temp1, temp1, s2, ext::uxth);
8254
8255 __ lsr(temp0, temp1, 16);
8256 __ lsl(s2, temp0, 4);
8257 __ sub(s2, s2, temp0);
8258 __ add(s2, s2, temp1, ext::uxth);
8259
8260 __ subs(temp0, s2, base);
8261 __ csel(s2, temp0, s2, Assembler::HS);
8262
8263 __ subs(len, len, nmax);
8264 __ sub(count, nmax, 16);
8265 __ br(Assembler::HS, L_nmax_loop);
8266
8267 __ bind(L_by16);
8268 __ adds(len, len, count);
8269 __ br(Assembler::LO, L_by1);
8270
8271 __ bind(L_by16_loop);
8272
8273 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
8274 vbytes, vs1acc, vs2acc, vtable);
8275
8276 __ subs(len, len, 16);
8277 __ br(Assembler::HS, L_by16_loop);
8278
8279 __ bind(L_by1);
8280 __ adds(len, len, 15);
8281 __ br(Assembler::LO, L_do_mod);
8282
8283 __ bind(L_by1_loop);
8284 __ ldrb(temp0, Address(__ post(buff, 1)));
8285 __ add(s1, temp0, s1);
8286 __ add(s2, s2, s1);
8287 __ subs(len, len, 1);
8288 __ br(Assembler::HS, L_by1_loop);
8289
8290 __ bind(L_do_mod);
8291 // s1 = s1 % BASE
8292 __ lsr(temp0, s1, 16);
8293 __ lsl(temp1, temp0, 4);
8294 __ sub(temp1, temp1, temp0);
8295 __ add(temp1, temp1, s1, ext::uxth);
8296
8297 __ lsr(temp0, temp1, 16);
8298 __ lsl(s1, temp0, 4);
8299 __ sub(s1, s1, temp0);
8300 __ add(s1, s1, temp1, ext::uxth);
8301
8302 __ subs(temp0, s1, base);
8303 __ csel(s1, temp0, s1, Assembler::HS);
8304
8305 // s2 = s2 % BASE
8306 __ lsr(temp0, s2, 16);
8307 __ lsl(temp1, temp0, 4);
8308 __ sub(temp1, temp1, temp0);
8309 __ add(temp1, temp1, s2, ext::uxth);
8310
8311 __ lsr(temp0, temp1, 16);
8312 __ lsl(s2, temp0, 4);
8313 __ sub(s2, s2, temp0);
8314 __ add(s2, s2, temp1, ext::uxth);
8315
8316 __ subs(temp0, s2, base);
8317 __ csel(s2, temp0, s2, Assembler::HS);
8318
8319 // Combine lower bits and higher bits
8320 __ bind(L_combine);
8321 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
8322
8323 __ ret(lr);
8324
8325 // record the stub entry and end
8326 store_archive_data(stub_id, start, __ pc());
8327
8328 return start;
8329 }
8330
8331 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
8332 Register temp0, Register temp1, FloatRegister vbytes,
8333 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
8334 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
8335 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
8336 // In non-vectorized code, we update s1 and s2 as:
8337 // s1 <- s1 + b1
8338 // s2 <- s2 + s1
8339 // s1 <- s1 + b2
8340 // s2 <- s2 + s1
8341 // ...
8342 // s1 <- s1 + b16
8343 // s2 <- s2 + s1
8344 // Putting above assignments together, we have:
8345 // s1_new = s1 + b1 + b2 + ... + b16
8346 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
8347 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
8348 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
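// As a plain scalar sketch (assuming b[0..15] are the 16 bytes just loaded),
// the SIMD sequence below is equivalent to:
//   uint32_t sum = 0, dot = 0;
//   for (int i = 0; i < 16; i++) { sum += b[i]; dot += (16 - i) * b[i]; }
//   s2 += 16 * s1 + dot;
//   s1 += sum;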
8349 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
8350
8351 // s2 = s2 + s1 * 16
8352 __ add(s2, s2, s1, Assembler::LSL, 4);
8353
8354 // vs1acc = b1 + b2 + b3 + ... + b16
8355 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
8356 __ umullv(vs2acc, __ T8B, vtable, vbytes);
8357 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
8358 __ uaddlv(vs1acc, __ T16B, vbytes);
8359 __ uaddlv(vs2acc, __ T8H, vs2acc);
8360
8361 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
8362 __ fmovd(temp0, vs1acc);
8363 __ fmovd(temp1, vs2acc);
8364 __ add(s1, s1, temp0);
8365 __ add(s2, s2, temp1);
8366 }
8367
8368 /**
8369 * Arguments:
8370 *
8371 * Input:
8372 * c_rarg0 - x address
8373 * c_rarg1 - x length
8374 * c_rarg2 - y address
8375 * c_rarg3 - y length
8376 * c_rarg4 - z address
8377 */
8378 address generate_multiplyToLen() {
8379 StubId stub_id = StubId::stubgen_multiplyToLen_id;
8380 int entry_count = StubInfo::entry_count(stub_id);
8381 assert(entry_count == 1, "sanity check");
8382 address start = load_archive_data(stub_id);
8383 if (start != nullptr) {
8384 return start;
8385 }
8386 __ align(CodeEntryAlignment);
8387 StubCodeMark mark(this, stub_id);
8388
8389 start = __ pc();
8390 const Register x = r0;
8391 const Register xlen = r1;
8392 const Register y = r2;
8393 const Register ylen = r3;
8394 const Register z = r4;
8395
8396 const Register tmp0 = r5;
8397 const Register tmp1 = r10;
8398 const Register tmp2 = r11;
8399 const Register tmp3 = r12;
8400 const Register tmp4 = r13;
8401 const Register tmp5 = r14;
8402 const Register tmp6 = r15;
8403 const Register tmp7 = r16;
8404
8405 BLOCK_COMMENT("Entry:");
8406 __ enter(); // required for proper stackwalking of RuntimeStub frame
8407 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
8408 __ leave(); // required for proper stackwalking of RuntimeStub frame
8409 __ ret(lr);
8410
8411 // record the stub entry and end
8412 store_archive_data(stub_id, start, __ pc());
8413
8414 return start;
8415 }
8416
8417 address generate_squareToLen() {
8418 // The squareToLen algorithm for sizes 1..127 described in the Java code runs
8419 // faster than multiply_to_len on some CPUs and slower on others, but
8420 // multiply_to_len shows slightly better results overall
8421 StubId stub_id = StubId::stubgen_squareToLen_id;
8422 int entry_count = StubInfo::entry_count(stub_id);
8423 assert(entry_count == 1, "sanity check");
8424 address start = load_archive_data(stub_id);
8425 if (start != nullptr) {
8426 return start;
8427 }
8428 __ align(CodeEntryAlignment);
8429 StubCodeMark mark(this, stub_id);
8430 start = __ pc();
8431
8432 const Register x = r0;
8433 const Register xlen = r1;
8434 const Register z = r2;
8435 const Register y = r4; // == x
8436 const Register ylen = r5; // == xlen
8437
8438 const Register tmp0 = r3;
8439 const Register tmp1 = r10;
8440 const Register tmp2 = r11;
8441 const Register tmp3 = r12;
8442 const Register tmp4 = r13;
8443 const Register tmp5 = r14;
8444 const Register tmp6 = r15;
8445 const Register tmp7 = r16;
8446
8447 RegSet spilled_regs = RegSet::of(y, ylen);
8448 BLOCK_COMMENT("Entry:");
8449 __ enter();
8450 __ push(spilled_regs, sp);
8451 __ mov(y, x);
8452 __ mov(ylen, xlen);
8453 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
8454 __ pop(spilled_regs, sp);
8455 __ leave();
8456 __ ret(lr);
8457
8458 // record the stub entry and end
8459 store_archive_data(stub_id, start, __ pc());
8460
8461 return start;
8462 }
8463
8464 address generate_mulAdd() {
8465 StubId stub_id = StubId::stubgen_mulAdd_id;
8466 int entry_count = StubInfo::entry_count(stub_id);
8467 assert(entry_count == 1, "sanity check");
8468 address start = load_archive_data(stub_id);
8469 if (start != nullptr) {
8470 return start;
8471 }
8472 __ align(CodeEntryAlignment);
8473 StubCodeMark mark(this, stub_id);
8474
8475 start = __ pc();
8476
8477 const Register out = r0;
8478 const Register in = r1;
8479 const Register offset = r2;
8480 const Register len = r3;
8481 const Register k = r4;
8482
8483 BLOCK_COMMENT("Entry:");
8484 __ enter();
8485 __ mul_add(out, in, offset, len, k);
8486 __ leave();
8487 __ ret(lr);
8488
8489 // record the stub entry and end
8490 store_archive_data(stub_id, start, __ pc());
8491
8492 return start;
8493 }
8494
8495 // Arguments:
8496 //
8497 // Input:
8498 // c_rarg0 - newArr address
8499 // c_rarg1 - oldArr address
8500 // c_rarg2 - newIdx
8501 // c_rarg3 - shiftCount
8502 // c_rarg4 - numIter
8503 //
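// In scalar terms (a sketch derived from the address arithmetic below, not
// taken from the Java worker itself) each destination word is
//   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
//                      | (oldArr[i]     <<  (32 - shiftCount));
// the SIMD loop handles four such words per iteration, walking from the high
// end of the array downwards.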
8504 address generate_bigIntegerRightShift() {
8505 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
8506 int entry_count = StubInfo::entry_count(stub_id);
8507 assert(entry_count == 1, "sanity check");
8508 address start = load_archive_data(stub_id);
8509 if (start != nullptr) {
8510 return start;
8511 }
8512 __ align(CodeEntryAlignment);
8513 StubCodeMark mark(this, stub_id);
8514 start = __ pc();
8515
8516 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
8517
8518 Register newArr = c_rarg0;
8519 Register oldArr = c_rarg1;
8520 Register newIdx = c_rarg2;
8521 Register shiftCount = c_rarg3;
8522 Register numIter = c_rarg4;
8523 Register idx = numIter;
8524
8525 Register newArrCur = rscratch1;
8526 Register shiftRevCount = rscratch2;
8527 Register oldArrCur = r13;
8528 Register oldArrNext = r14;
8529
8530 FloatRegister oldElem0 = v0;
8531 FloatRegister oldElem1 = v1;
8532 FloatRegister newElem = v2;
8533 FloatRegister shiftVCount = v3;
8534 FloatRegister shiftVRevCount = v4;
8535
8536 __ cbz(idx, Exit);
8537
8538 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8539
8540 // left shift count
8541 __ movw(shiftRevCount, 32);
8542 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8543
8544 // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
8545 __ cmp(numIter, (u1)4);
8546 __ br(Assembler::LT, ShiftThree);
8547
8548 __ dup(shiftVCount, __ T4S, shiftCount);
8549 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8550 __ negr(shiftVCount, __ T4S, shiftVCount);
8551
8552 __ BIND(ShiftSIMDLoop);
8553
8554 // Calculate the load addresses
8555 __ sub(idx, idx, 4);
8556 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
8557 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
8558 __ add(oldArrCur, oldArrNext, 4);
8559
8560 // Load 4 words and process
8561 __ ld1(oldElem0, __ T4S, Address(oldArrCur));
8562 __ ld1(oldElem1, __ T4S, Address(oldArrNext));
8563 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8564 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8565 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8566 __ st1(newElem, __ T4S, Address(newArrCur));
8567
8568 __ cmp(idx, (u1)4);
8569 __ br(Assembler::LT, ShiftTwoLoop);
8570 __ b(ShiftSIMDLoop);
8571
8572 __ BIND(ShiftTwoLoop);
8573 __ cbz(idx, Exit);
8574 __ cmp(idx, (u1)1);
8575 __ br(Assembler::EQ, ShiftOne);
8576
8577 // Calculate the load addresses
8578 __ sub(idx, idx, 2);
8579 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
8580 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
8581 __ add(oldArrCur, oldArrNext, 4);
8582
8583 // Load 2 words and process
8584 __ ld1(oldElem0, __ T2S, Address(oldArrCur));
8585 __ ld1(oldElem1, __ T2S, Address(oldArrNext));
8586 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8587 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8588 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8589 __ st1(newElem, __ T2S, Address(newArrCur));
8590 __ b(ShiftTwoLoop);
8591
8592 __ BIND(ShiftThree);
8593 __ tbz(idx, 1, ShiftOne);
8594 __ tbz(idx, 0, ShiftTwo);
8595 __ ldrw(r10, Address(oldArr, 12));
8596 __ ldrw(r11, Address(oldArr, 8));
8597 __ lsrvw(r10, r10, shiftCount);
8598 __ lslvw(r11, r11, shiftRevCount);
8599 __ orrw(r12, r10, r11);
8600 __ strw(r12, Address(newArr, 8));
8601
8602 __ BIND(ShiftTwo);
8603 __ ldrw(r10, Address(oldArr, 8));
8604 __ ldrw(r11, Address(oldArr, 4));
8605 __ lsrvw(r10, r10, shiftCount);
8606 __ lslvw(r11, r11, shiftRevCount);
8607 __ orrw(r12, r10, r11);
8608 __ strw(r12, Address(newArr, 4));
8609
8610 __ BIND(ShiftOne);
8611 __ ldrw(r10, Address(oldArr, 4));
8612 __ ldrw(r11, Address(oldArr));
8613 __ lsrvw(r10, r10, shiftCount);
8614 __ lslvw(r11, r11, shiftRevCount);
8615 __ orrw(r12, r10, r11);
8616 __ strw(r12, Address(newArr));
8617
8618 __ BIND(Exit);
8619 __ ret(lr);
8620
8621 // record the stub entry and end
8622 store_archive_data(stub_id, start, __ pc());
8623
8624 return start;
8625 }
8626
8627 // Arguments:
8628 //
8629 // Input:
8630 // c_rarg0 - newArr address
8631 // c_rarg1 - oldArr address
8632 // c_rarg2 - newIdx
8633 // c_rarg3 - shiftCount
8634 // c_rarg4 - numIter
8635 //
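// Scalar sketch of the loop below (processing from the low end upwards;
// derived from the address arithmetic, not taken from the Java worker):
//   newArr[newIdx + i] = (oldArr[i]     <<  shiftCount)
//                      | (oldArr[i + 1] >>> (32 - shiftCount));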
8636 address generate_bigIntegerLeftShift() {
8637 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
8638 int entry_count = StubInfo::entry_count(stub_id);
8639 assert(entry_count == 1, "sanity check");
8640 address start = load_archive_data(stub_id);
8641 if (start != nullptr) {
8642 return start;
8643 }
8644 __ align(CodeEntryAlignment);
8645 StubCodeMark mark(this, stub_id);
8646 start = __ pc();
8647
8648 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
8649
8650 Register newArr = c_rarg0;
8651 Register oldArr = c_rarg1;
8652 Register newIdx = c_rarg2;
8653 Register shiftCount = c_rarg3;
8654 Register numIter = c_rarg4;
8655
8656 Register shiftRevCount = rscratch1;
8657 Register oldArrNext = rscratch2;
8658
8659 FloatRegister oldElem0 = v0;
8660 FloatRegister oldElem1 = v1;
8661 FloatRegister newElem = v2;
8662 FloatRegister shiftVCount = v3;
8663 FloatRegister shiftVRevCount = v4;
8664
8665 __ cbz(numIter, Exit);
8666
8667 __ add(oldArrNext, oldArr, 4);
8668 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8669
8670 // right shift count
8671 __ movw(shiftRevCount, 32);
8672 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8673
8674 // numIter is too small for a 4-word SIMD loop; fall back to the scalar tail
8675 __ cmp(numIter, (u1)4);
8676 __ br(Assembler::LT, ShiftThree);
8677
8678 __ dup(shiftVCount, __ T4S, shiftCount);
8679 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8680 __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
8681
8682 __ BIND(ShiftSIMDLoop);
8683
8684 // load 4 words and process
8685 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
8686 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
8687 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8688 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8689 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8690 __ st1(newElem, __ T4S, __ post(newArr, 16));
8691 __ sub(numIter, numIter, 4);
8692
8693 __ cmp(numIter, (u1)4);
8694 __ br(Assembler::LT, ShiftTwoLoop);
8695 __ b(ShiftSIMDLoop);
8696
8697 __ BIND(ShiftTwoLoop);
8698 __ cbz(numIter, Exit);
8699 __ cmp(numIter, (u1)1);
8700 __ br(Assembler::EQ, ShiftOne);
8701
8702 // load 2 words and process
8703 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
8704 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
8705 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8706 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8707 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8708 __ st1(newElem, __ T2S, __ post(newArr, 8));
8709 __ sub(numIter, numIter, 2);
8710 __ b(ShiftTwoLoop);
8711
8712 __ BIND(ShiftThree);
8713 __ ldrw(r10, __ post(oldArr, 4));
8714 __ ldrw(r11, __ post(oldArrNext, 4));
8715 __ lslvw(r10, r10, shiftCount);
8716 __ lsrvw(r11, r11, shiftRevCount);
8717 __ orrw(r12, r10, r11);
8718 __ strw(r12, __ post(newArr, 4));
8719 __ tbz(numIter, 1, Exit);
8720 __ tbz(numIter, 0, ShiftOne);
8721
8722 __ BIND(ShiftTwo);
8723 __ ldrw(r10, __ post(oldArr, 4));
8724 __ ldrw(r11, __ post(oldArrNext, 4));
8725 __ lslvw(r10, r10, shiftCount);
8726 __ lsrvw(r11, r11, shiftRevCount);
8727 __ orrw(r12, r10, r11);
8728 __ strw(r12, __ post(newArr, 4));
8729
8730 __ BIND(ShiftOne);
8731 __ ldrw(r10, Address(oldArr));
8732 __ ldrw(r11, Address(oldArrNext));
8733 __ lslvw(r10, r10, shiftCount);
8734 __ lsrvw(r11, r11, shiftRevCount);
8735 __ orrw(r12, r10, r11);
8736 __ strw(r12, Address(newArr));
8737
8738 __ BIND(Exit);
8739 __ ret(lr);
8740
8741 // record the stub entry and end
8742 store_archive_data(stub_id, start, __ pc());
8743
8744 return start;
8745 }
8746
8747 address generate_count_positives(address &count_positives_long) {
8748 StubId stub_id = StubId::stubgen_count_positives_id;
8749 GrowableArray<address> entries;
8750 int entry_count = StubInfo::entry_count(stub_id);
8751 // We have an extra entry for count_positives_long.
8752 assert(entry_count == 2, "sanity check");
8753 address start = load_archive_data(stub_id, &entries);
8754 if (start != nullptr) {
8755 assert(entries.length() == 1,
8756 "unexpected extra entry count %d", entries.length());
8757 count_positives_long = entries.at(0);
8758 return start;
8759 }
8760 const u1 large_loop_size = 64;
8761 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
8762 int dcache_line = VM_Version::dcache_line_size();
8763
8764 Register ary1 = r1, len = r2, result = r0;
8765
8766 __ align(CodeEntryAlignment);
8767 StubCodeMark mark(this, stub_id);
8768
8769 address entry = __ pc();
8770
8771 __ enter();
8772 // precondition: a copy of len is already in result
8773 // __ mov(result, len);
8774
8775 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
8776 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
8777
8778 __ cmp(len, (u1)15);
8779 __ br(Assembler::GT, LEN_OVER_15);
8780 // The only case where execution falls into this code is when the pointer is near
8781 // the end of a memory page and we have to avoid reading the next page
8782 __ add(ary1, ary1, len);
8783 __ subs(len, len, 8);
8784 __ br(Assembler::GT, LEN_OVER_8);
8785 __ ldr(rscratch2, Address(ary1, -8));
8786 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
8787 __ lsrv(rscratch2, rscratch2, rscratch1);
8788 __ tst(rscratch2, UPPER_BIT_MASK);
8789 __ csel(result, zr, result, Assembler::NE);
8790 __ leave();
8791 __ ret(lr);
8792 __ bind(LEN_OVER_8);
8793 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
8794 __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
8795 __ tst(rscratch2, UPPER_BIT_MASK);
8796 __ br(Assembler::NE, RET_NO_POP);
8797 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
8798 __ lsrv(rscratch1, rscratch1, rscratch2);
8799 __ tst(rscratch1, UPPER_BIT_MASK);
8800 __ bind(RET_NO_POP);
8801 __ csel(result, zr, result, Assembler::NE);
8802 __ leave();
8803 __ ret(lr);
8804
8805 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
8806 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
8807
8808 count_positives_long = __ pc(); // 2nd entry point
8809 entries.append(count_positives_long);
8810
8811 __ enter();
8812
8813 __ bind(LEN_OVER_15);
8814 __ push(spilled_regs, sp);
8815 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
8816 __ cbz(rscratch2, ALIGNED);
8817 __ ldp(tmp6, tmp1, Address(ary1));
8818 __ mov(tmp5, 16);
8819 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
8820 __ add(ary1, ary1, rscratch1);
8821 __ orr(tmp6, tmp6, tmp1);
8822 __ tst(tmp6, UPPER_BIT_MASK);
8823 __ br(Assembler::NE, RET_ADJUST);
8824 __ sub(len, len, rscratch1);
8825
8826 __ bind(ALIGNED);
8827 __ cmp(len, large_loop_size);
8828 __ br(Assembler::LT, CHECK_16);
8829 // Perform a 16-byte load in the pre-loop as an early return to handle the case
8830 // where an initially aligned large array has negative values in its starting bytes,
8831 // since LARGE_LOOP would otherwise do 4 reads instead of 1 (in the worst case),
8832 // which is slower. Cases with negative bytes further ahead are not affected
8833 // much. In fact, they get faster due to the early loads, fewer instructions and
8834 // fewer branches in LARGE_LOOP.
8835 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
8836 __ sub(len, len, 16);
8837 __ orr(tmp6, tmp6, tmp1);
8838 __ tst(tmp6, UPPER_BIT_MASK);
8839 __ br(Assembler::NE, RET_ADJUST_16);
8840 __ cmp(len, large_loop_size);
8841 __ br(Assembler::LT, CHECK_16);
8842
8843 if (SoftwarePrefetchHintDistance >= 0
8844 && SoftwarePrefetchHintDistance >= dcache_line) {
8845 // initial prefetch
8846 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
8847 }
8848 __ bind(LARGE_LOOP);
8849 if (SoftwarePrefetchHintDistance >= 0) {
8850 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
8851 }
8852 // Issue the load instructions first, since that can save a few CPU/MEM cycles.
8853 // Also, instead of four "orr(...); andr(...); cbnz(...)" triples (one per ldp),
8854 // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3
8855 // instructions per iteration and has fewer branches, but this approach disables the
8856 // early return, so all 64 bytes are loaded and checked every time.
8857 __ ldp(tmp2, tmp3, Address(ary1));
8858 __ ldp(tmp4, tmp5, Address(ary1, 16));
8859 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
8860 __ ldp(tmp6, tmp1, Address(ary1, 48));
8861 __ add(ary1, ary1, large_loop_size);
8862 __ sub(len, len, large_loop_size);
8863 __ orr(tmp2, tmp2, tmp3);
8864 __ orr(tmp4, tmp4, tmp5);
8865 __ orr(rscratch1, rscratch1, rscratch2);
8866 __ orr(tmp6, tmp6, tmp1);
8867 __ orr(tmp2, tmp2, tmp4);
8868 __ orr(rscratch1, rscratch1, tmp6);
8869 __ orr(tmp2, tmp2, rscratch1);
8870 __ tst(tmp2, UPPER_BIT_MASK);
8871 __ br(Assembler::NE, RET_ADJUST_LONG);
8872 __ cmp(len, large_loop_size);
8873 __ br(Assembler::GE, LARGE_LOOP);
8874
8875 __ bind(CHECK_16); // small 16-byte load pre-loop
8876 __ cmp(len, (u1)16);
8877 __ br(Assembler::LT, POST_LOOP16);
8878
8879 __ bind(LOOP16); // small 16-byte load loop
8880 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
8881 __ sub(len, len, 16);
8882 __ orr(tmp2, tmp2, tmp3);
8883 __ tst(tmp2, UPPER_BIT_MASK);
8884 __ br(Assembler::NE, RET_ADJUST_16);
8885 __ cmp(len, (u1)16);
8886 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
8887
8888 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
8889 __ cmp(len, (u1)8);
8890 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
8891 __ ldr(tmp3, Address(__ post(ary1, 8)));
8892 __ tst(tmp3, UPPER_BIT_MASK);
8893 __ br(Assembler::NE, RET_ADJUST);
8894 __ sub(len, len, 8);
8895
8896 __ bind(POST_LOOP16_LOAD_TAIL);
8897 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
8898 __ ldr(tmp1, Address(ary1));
8899 __ mov(tmp2, 64);
8900 __ sub(tmp4, tmp2, len, __ LSL, 3);
8901 __ lslv(tmp1, tmp1, tmp4);
8902 __ tst(tmp1, UPPER_BIT_MASK);
8903 __ br(Assembler::NE, RET_ADJUST);
8904 // Fallthrough
8905
8906 __ bind(RET_LEN);
8907 __ pop(spilled_regs, sp);
8908 __ leave();
8909 __ ret(lr);
8910
8911 // the difference (result - len) is the count of bytes guaranteed to be
8912 // positive
8913
8914 __ bind(RET_ADJUST_LONG);
8915 __ add(len, len, (u1)(large_loop_size - 16));
8916 __ bind(RET_ADJUST_16);
8917 __ add(len, len, 16);
8918 __ bind(RET_ADJUST);
8919 __ pop(spilled_regs, sp);
8920 __ leave();
8921 __ sub(result, result, len);
8922 __ ret(lr);
8923
8924 // record the stub entry and end plus the extra entry
8925 store_archive_data(stub_id, entry, __ pc(), &entries);
8926
8927 return entry;
8928 }
8929
8930 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
8931 bool usePrefetch, Label &NOT_EQUAL) {
8932 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8933 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8934 tmp7 = r12, tmp8 = r13;
8935 Label LOOP;
8936
8937 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8938 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8939 __ bind(LOOP);
8940 if (usePrefetch) {
8941 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8942 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8943 }
8944 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8945 __ eor(tmp1, tmp1, tmp2);
8946 __ eor(tmp3, tmp3, tmp4);
8947 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8948 __ orr(tmp1, tmp1, tmp3);
8949 __ cbnz(tmp1, NOT_EQUAL);
8950 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8951 __ eor(tmp5, tmp5, tmp6);
8952 __ eor(tmp7, tmp7, tmp8);
8953 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8954 __ orr(tmp5, tmp5, tmp7);
8955 __ cbnz(tmp5, NOT_EQUAL);
8956 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8957 __ eor(tmp1, tmp1, tmp2);
8958 __ eor(tmp3, tmp3, tmp4);
8959 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8960 __ orr(tmp1, tmp1, tmp3);
8961 __ cbnz(tmp1, NOT_EQUAL);
8962 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8963 __ eor(tmp5, tmp5, tmp6);
8964 __ sub(cnt1, cnt1, 8 * wordSize);
8965 __ eor(tmp7, tmp7, tmp8);
8966 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8967 // tmp6 is not used. MacroAssembler::subs is used here (rather than
8968 // cmp) because subs allows an unlimited range of immediate operand.
8969 __ subs(tmp6, cnt1, loopThreshold);
8970 __ orr(tmp5, tmp5, tmp7);
8971 __ cbnz(tmp5, NOT_EQUAL);
8972 __ br(__ GE, LOOP);
8973 // post-loop
8974 __ eor(tmp1, tmp1, tmp2);
8975 __ eor(tmp3, tmp3, tmp4);
8976 __ orr(tmp1, tmp1, tmp3);
8977 __ sub(cnt1, cnt1, 2 * wordSize);
8978 __ cbnz(tmp1, NOT_EQUAL);
8979 }
8980
8981 void generate_large_array_equals_loop_simd(int loopThreshold,
8982 bool usePrefetch, Label &NOT_EQUAL) {
8983 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8984 tmp2 = rscratch2;
8985 Label LOOP;
8986
8987 __ bind(LOOP);
8988 if (usePrefetch) {
8989 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8990 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8991 }
8992 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
8993 __ sub(cnt1, cnt1, 8 * wordSize);
8994 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
8995 __ subs(tmp1, cnt1, loopThreshold);
8996 __ eor(v0, __ T16B, v0, v4);
8997 __ eor(v1, __ T16B, v1, v5);
8998 __ eor(v2, __ T16B, v2, v6);
8999 __ eor(v3, __ T16B, v3, v7);
9000 __ orr(v0, __ T16B, v0, v1);
9001 __ orr(v1, __ T16B, v2, v3);
9002 __ orr(v0, __ T16B, v0, v1);
9003 __ umov(tmp1, v0, __ D, 0);
9004 __ umov(tmp2, v0, __ D, 1);
9005 __ orr(tmp1, tmp1, tmp2);
9006 __ cbnz(tmp1, NOT_EQUAL);
9007 __ br(__ GE, LOOP);
9008 }
9009
9010 // a1 = r1 - array1 address
9011 // a2 = r2 - array2 address
9012 // result = r0 - return value. Already contains "false"
9013 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
9014 // r3-r5 are reserved temporary registers
9015 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
9016 address generate_large_array_equals() {
9017 StubId stub_id = StubId::stubgen_large_array_equals_id;
9018 int entry_count = StubInfo::entry_count(stub_id);
9019 assert(entry_count == 1, "sanity check");
9020 address start = load_archive_data(stub_id);
9021 if (start != nullptr) {
9022 return start;
9023 }
9024 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
9025 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
9026 tmp7 = r12, tmp8 = r13;
9027 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
9028 SMALL_LOOP, POST_LOOP;
9029 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
9030 // calculate if at least 32 prefetched bytes are used
9031 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
9032 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
9033 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
9034 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
9035 tmp5, tmp6, tmp7, tmp8);
9036
9037 __ align(CodeEntryAlignment);
9038
9039 StubCodeMark mark(this, stub_id);
9040
9041 address entry = __ pc();
9042 __ enter();
9043 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
9044 // also advance pointers to use post-increment instead of pre-increment
9045 __ add(a1, a1, wordSize);
9046 __ add(a2, a2, wordSize);
9047 if (AvoidUnalignedAccesses) {
9048 // both implementations (SIMD/non-SIMD) use relatively large load
9049 // instructions (ld1/ldp), which carry a heavy penalty (up to 2x execution
9050 // time) on some CPUs when the address is not at least 16-byte aligned.
9051 // Arrays are currently 8-byte aligned, so, if needed, we do one extra
9052 // 8-byte load for the first address to make it 16-byte aligned.
9053 Label ALIGNED16;
9054 __ tbz(a1, 3, ALIGNED16);
9055 __ ldr(tmp1, Address(__ post(a1, wordSize)));
9056 __ ldr(tmp2, Address(__ post(a2, wordSize)));
9057 __ sub(cnt1, cnt1, wordSize);
9058 __ eor(tmp1, tmp1, tmp2);
9059 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
9060 __ bind(ALIGNED16);
9061 }
9062 if (UseSIMDForArrayEquals) {
9063 if (SoftwarePrefetchHintDistance >= 0) {
9064 __ subs(tmp1, cnt1, prefetchLoopThreshold);
9065 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
9066 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
9067 /* prfm = */ true, NOT_EQUAL);
9068 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
9069 __ br(__ LT, TAIL);
9070 }
9071 __ bind(NO_PREFETCH_LARGE_LOOP);
9072 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
9073 /* prfm = */ false, NOT_EQUAL);
9074 } else {
9075 __ push(spilled_regs, sp);
9076 if (SoftwarePrefetchHintDistance >= 0) {
9077 __ subs(tmp1, cnt1, prefetchLoopThreshold);
9078 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
9079 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
9080 /* prfm = */ true, NOT_EQUAL);
9081 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
9082 __ br(__ LT, TAIL);
9083 }
9084 __ bind(NO_PREFETCH_LARGE_LOOP);
9085 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
9086 /* prfm = */ false, NOT_EQUAL);
9087 }
9088 __ bind(TAIL);
9089 __ cbz(cnt1, EQUAL);
9090 __ subs(cnt1, cnt1, wordSize);
9091 __ br(__ LE, POST_LOOP);
9092 __ bind(SMALL_LOOP);
9093 __ ldr(tmp1, Address(__ post(a1, wordSize)));
9094 __ ldr(tmp2, Address(__ post(a2, wordSize)));
9095 __ subs(cnt1, cnt1, wordSize);
9096 __ eor(tmp1, tmp1, tmp2);
9097 __ cbnz(tmp1, NOT_EQUAL);
9098 __ br(__ GT, SMALL_LOOP);
9099 __ bind(POST_LOOP);
9100 __ ldr(tmp1, Address(a1, cnt1));
9101 __ ldr(tmp2, Address(a2, cnt1));
9102 __ eor(tmp1, tmp1, tmp2);
9103 __ cbnz(tmp1, NOT_EQUAL);
9104 __ bind(EQUAL);
9105 __ mov(result, true);
9106 __ bind(NOT_EQUAL);
9107 if (!UseSIMDForArrayEquals) {
9108 __ pop(spilled_regs, sp);
9109 }
9110 __ bind(NOT_EQUAL_NO_POP);
9111 __ leave();
9112 __ ret(lr);
9113
9114 // record the stub entry and end
9115 store_archive_data(stub_id, entry, __ pc());
9116
9117 return entry;
9118 }
9119
9120 // result = r0 - return value. Contains initial hashcode value on entry.
9121 // ary = r1 - array address
9122 // cnt = r2 - elements count
9123 // Clobbers: v0-v13, rscratch1, rscratch2
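// Background on the arithmetic (a summary, not code): the scalar hash is
// h = 31*h + a[i] applied once per element, i.e.
//   h_n = 31^n * h_0 + 31^(n-1)*a[0] + ... + 31^0*a[n-1].
// The vector code keeps partial sums in SIMD lanes, multiplies each
// accumulator by the appropriate power of 31 every step, and combines the
// lanes with the <31^3, 31^2, 31^1, 31^0> vector in the loop epilogues.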
9124 address generate_large_arrays_hashcode(BasicType eltype) {
9125 StubId stub_id;
9126 switch (eltype) {
9127 case T_BOOLEAN:
9128 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
9129 break;
9130 case T_BYTE:
9131 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
9132 break;
9133 case T_CHAR:
9134 stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
9135 break;
9136 case T_SHORT:
9137 stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
9138 break;
9139 case T_INT:
9140 stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
9141 break;
9142 default:
9143 stub_id = StubId::NO_STUBID;
9144 ShouldNotReachHere();
9145 };
9146 int entry_count = StubInfo::entry_count(stub_id);
9147 assert(entry_count == 1, "sanity check");
9148 address start = load_archive_data(stub_id);
9149 if (start != nullptr) {
9150 return start;
9151 }
9152 const Register result = r0, ary = r1, cnt = r2;
9153 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
9154 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
9155 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
9156 const FloatRegister vpowm = v13;
9157
9158 ARRAYS_HASHCODE_REGISTERS;
9159
9160 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
9161
9162 unsigned int vf; // vectorization factor
9163 bool multiply_by_halves;
9164 Assembler::SIMD_Arrangement load_arrangement;
9165 switch (eltype) {
9166 case T_BOOLEAN:
9167 case T_BYTE:
9168 load_arrangement = Assembler::T8B;
9169 multiply_by_halves = true;
9170 vf = 8;
9171 break;
9172 case T_CHAR:
9173 case T_SHORT:
9174 load_arrangement = Assembler::T8H;
9175 multiply_by_halves = true;
9176 vf = 8;
9177 break;
9178 case T_INT:
9179 load_arrangement = Assembler::T4S;
9180 multiply_by_halves = false;
9181 vf = 4;
9182 break;
9183 default:
9184 ShouldNotReachHere();
9185 }
9186
9187 // Unroll factor
9188 const unsigned uf = 4;
9189
9190 // Effective vectorization factor
9191 const unsigned evf = vf * uf;
9192
9193 __ align(CodeEntryAlignment);
9194
9195 StubCodeMark mark(this, stub_id);
9196
9197 address entry = __ pc();
9198 __ enter();
9199
9200 // Put the 0th..3rd powers of 31 together into a single SIMD register. The register will be used in
9201 // the SMALL and LARGE LOOPs' epilogues. The initialization is hoisted here and the register's
9202 // value does not change throughout either loop.
9203 __ movw(rscratch1, intpow(31U, 3));
9204 __ mov(vpow, Assembler::S, 0, rscratch1);
9205 __ movw(rscratch1, intpow(31U, 2));
9206 __ mov(vpow, Assembler::S, 1, rscratch1);
9207 __ movw(rscratch1, intpow(31U, 1));
9208 __ mov(vpow, Assembler::S, 2, rscratch1);
9209 __ movw(rscratch1, intpow(31U, 0));
9210 __ mov(vpow, Assembler::S, 3, rscratch1);
9211
9212 __ mov(vmul0, Assembler::T16B, 0);
9213 __ mov(vmul0, Assembler::S, 3, result);
9214
9215 __ andr(rscratch2, cnt, (uf - 1) * vf);
9216 __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
9217
9218 __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
9219 __ mov(vpowm, Assembler::S, 0, rscratch1);
9220
9221 // SMALL LOOP
9222 __ bind(SMALL_LOOP);
9223
9224 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
9225 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
9226 __ subsw(rscratch2, rscratch2, vf);
9227
9228 if (load_arrangement == Assembler::T8B) {
9229 // Extend 8B to 8H to be able to use vector multiply
9230 // instructions
9231 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
9232 if (is_signed_subword_type(eltype)) {
9233 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9234 } else {
9235 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9236 }
9237 }
9238
9239 switch (load_arrangement) {
9240 case Assembler::T4S:
9241 __ addv(vmul0, load_arrangement, vmul0, vdata0);
9242 break;
9243 case Assembler::T8B:
9244 case Assembler::T8H:
9245 assert(is_subword_type(eltype), "subword type expected");
9246 if (is_signed_subword_type(eltype)) {
9247 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9248 } else {
9249 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9250 }
9251 break;
9252 default:
9253 __ should_not_reach_here();
9254 }
9255
9256 // Process the upper half of a vector
9257 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
9258 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
9259 if (is_signed_subword_type(eltype)) {
9260 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9261 } else {
9262 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9263 }
9264 }
9265
9266 __ br(Assembler::HI, SMALL_LOOP);
9267
9268 // SMALL LOOP'S EPILOGUE
9269 __ lsr(rscratch2, cnt, exact_log2(evf));
9270 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
9271
9272 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
9273 __ addv(vmul0, Assembler::T4S, vmul0);
9274 __ umov(result, vmul0, Assembler::S, 0);
9275
9276 // TAIL
9277 __ bind(TAIL);
9278
9279 // The andr computes cnt % vf. The subtract, with that value shifted by 3 (4 on Cortex-A53),
9280 // branches past vf - 1 - (cnt % vf) load + madd pairs, i.e. only cnt % vf load + madd pairs execute.
9281 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
9282 __ andr(rscratch2, cnt, vf - 1);
9283 __ bind(TAIL_SHORTCUT);
9284 __ adr(rscratch1, BR_BASE);
9285 // For Cortex-A53 offset is 4 because 2 nops are generated.
9286 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
9287 __ movw(rscratch2, 0x1f);
9288 __ br(rscratch1);
9289
9290 for (size_t i = 0; i < vf - 1; ++i) {
9291 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
9292 eltype);
9293 __ maddw(result, result, rscratch2, rscratch1);
9294 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
9295 // Generate 2nd nop to have 4 instructions per iteration.
9296 if (VM_Version::supports_a53mac()) {
9297 __ nop();
9298 }
9299 }
9300 __ bind(BR_BASE);
9301
9302 __ leave();
9303 __ ret(lr);
9304
9305 // LARGE LOOP
9306 __ bind(LARGE_LOOP_PREHEADER);
9307
9308 __ lsr(rscratch2, cnt, exact_log2(evf));
9309
9310 if (multiply_by_halves) {
9311 // 31^4 - multiplier between lower and upper parts of a register
9312 __ movw(rscratch1, intpow(31U, vf / 2));
9313 __ mov(vpowm, Assembler::S, 1, rscratch1);
9314 // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
9315 __ movw(rscratch1, intpow(31U, evf - vf / 2));
9316 __ mov(vpowm, Assembler::S, 0, rscratch1);
9317 } else {
9318 // 31^16
9319 __ movw(rscratch1, intpow(31U, evf));
9320 __ mov(vpowm, Assembler::S, 0, rscratch1);
9321 }
9322
9323 __ mov(vmul3, Assembler::T16B, 0);
9324 __ mov(vmul2, Assembler::T16B, 0);
9325 __ mov(vmul1, Assembler::T16B, 0);
9326
9327 __ bind(LARGE_LOOP);
9328
9329 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
9330 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
9331 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
9332 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
9333
9334 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
9335 Address(__ post(ary, evf * type2aelembytes(eltype))));
9336
9337 if (load_arrangement == Assembler::T8B) {
9338 // Extend 8B to 8H to be able to use vector multiply
9339 // instructions
9340 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
9341 if (is_signed_subword_type(eltype)) {
9342 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
9343 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
9344 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
9345 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9346 } else {
9347 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
9348 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
9349 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
9350 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9351 }
9352 }
9353
9354 switch (load_arrangement) {
9355 case Assembler::T4S:
9356 __ addv(vmul3, load_arrangement, vmul3, vdata3);
9357 __ addv(vmul2, load_arrangement, vmul2, vdata2);
9358 __ addv(vmul1, load_arrangement, vmul1, vdata1);
9359 __ addv(vmul0, load_arrangement, vmul0, vdata0);
9360 break;
9361 case Assembler::T8B:
9362 case Assembler::T8H:
9363 assert(is_subword_type(eltype), "subword type expected");
9364 if (is_signed_subword_type(eltype)) {
9365 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
9366 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
9367 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
9368 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9369 } else {
9370 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
9371 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
9372 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
9373 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9374 }
9375 break;
9376 default:
9377 __ should_not_reach_here();
9378 }
9379
9380 // Process the upper half of a vector
9381 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
9382 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
9383 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
9384 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
9385 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
9386 if (is_signed_subword_type(eltype)) {
9387 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
9388 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
9389 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
9390 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9391 } else {
9392 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
9393 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
9394 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
9395 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9396 }
9397 }
9398
9399 __ subsw(rscratch2, rscratch2, 1);
9400 __ br(Assembler::HI, LARGE_LOOP);
9401
9402 __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
9403 __ addv(vmul3, Assembler::T4S, vmul3);
9404 __ umov(result, vmul3, Assembler::S, 0);
9405
9406 __ mov(rscratch2, intpow(31U, vf));
9407
9408 __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
9409 __ addv(vmul2, Assembler::T4S, vmul2);
9410 __ umov(rscratch1, vmul2, Assembler::S, 0);
9411 __ maddw(result, result, rscratch2, rscratch1);
9412
9413 __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
9414 __ addv(vmul1, Assembler::T4S, vmul1);
9415 __ umov(rscratch1, vmul1, Assembler::S, 0);
9416 __ maddw(result, result, rscratch2, rscratch1);
9417
9418 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
9419 __ addv(vmul0, Assembler::T4S, vmul0);
9420 __ umov(rscratch1, vmul0, Assembler::S, 0);
9421 __ maddw(result, result, rscratch2, rscratch1);
9422
9423 __ andr(rscratch2, cnt, vf - 1);
9424 __ cbnz(rscratch2, TAIL_SHORTCUT);
9425
9426 __ leave();
9427 __ ret(lr);
9428
9429 // record the stub entry and end
9430 store_archive_data(stub_id, entry, __ pc());
9431
9432 return entry;
9433 }
9434
9435 address generate_dsin_dcos(bool isCos) {
9436 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
9437 int entry_count = StubInfo::entry_count(stub_id);
9438 assert(entry_count == 1, "sanity check");
9439 address start = load_archive_data(stub_id);
9440 if (start != nullptr) {
9441 return start;
9442 }
9443 __ align(CodeEntryAlignment);
9444 StubCodeMark mark(this, stub_id);
9445 start = __ pc();
9446 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
9447 (address)StubRoutines::aarch64::_two_over_pi,
9448 (address)StubRoutines::aarch64::_pio2,
9449 (address)StubRoutines::aarch64::_dsin_coef,
9450 (address)StubRoutines::aarch64::_dcos_coef);
9451
9452 // record the stub entry and end
9453 store_archive_data(stub_id, start, __ pc());
9454
9455 return start;
9456 }
9457
9458 // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
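// A zip1/zip2 with the zero register vtmpZ widens 16 Latin1 bytes to 16 UTF-16 characters,
// which are then compared 8 bytes at a time against the UTF-16 string; any non-zero XOR
// result branches out to DIFF1/DIFF2.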
9459 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
9460 Label &DIFF2) {
9461 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
9462 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
9463
9464 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
9465 __ ldr(tmpU, Address(__ post(cnt1, 8)));
9466 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
9467 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
9468
9469 __ fmovd(tmpL, vtmp3);
9470 __ eor(rscratch2, tmp3, tmpL);
9471 __ cbnz(rscratch2, DIFF2);
9472
9473 __ ldr(tmp3, Address(__ post(cnt1, 8)));
9474 __ umov(tmpL, vtmp3, __ D, 1);
9475 __ eor(rscratch2, tmpU, tmpL);
9476 __ cbnz(rscratch2, DIFF1);
9477
9478 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
9479 __ ldr(tmpU, Address(__ post(cnt1, 8)));
9480 __ fmovd(tmpL, vtmp);
9481 __ eor(rscratch2, tmp3, tmpL);
9482 __ cbnz(rscratch2, DIFF2);
9483
9484 __ ldr(tmp3, Address(__ post(cnt1, 8)));
9485 __ umov(tmpL, vtmp, __ D, 1);
9486 __ eor(rscratch2, tmpU, tmpL);
9487 __ cbnz(rscratch2, DIFF1);
9488 }
9489
9490 // r0 = result
9491 // r1 = str1
9492 // r2 = cnt1
9493 // r3 = str2
9494 // r4 = cnt2
9495 // r10 = tmp1
9496 // r11 = tmp2
9497 address generate_compare_long_string_different_encoding(bool isLU) {
9498 StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
9499 int entry_count = StubInfo::entry_count(stub_id);
9500 assert(entry_count == 1, "sanity check");
9501 address start = load_archive_data(stub_id);
9502 if (start != nullptr) {
9503 return start;
9504 }
9505 __ align(CodeEntryAlignment);
9506 StubCodeMark mark(this, stub_id);
9507 address entry = __ pc();
9508 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
9509 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
9510 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
9511 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9512 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
9513 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
9514 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
9515
9516 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
9517
9518 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
9519 // cnt2 == number of characters left to compare
9520 // Check the first 4 symbols that were already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
9521 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
9522 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
9523 __ add(str2, str2, isLU ? wordSize : wordSize/2);
9524 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
9525 __ subw(cnt2, cnt2, 8); // 4 symbols already loaded; the last 4 are a special case.
9526 __ eor(rscratch2, tmp1, tmp2);
9527 __ mov(rscratch1, tmp2);
9528 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
9529 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
9530 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
9531 __ push(spilled_regs, sp);
9532 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
9533 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
9534
9535 __ ldr(tmp3, Address(__ post(cnt1, 8)));
9536
9537 if (SoftwarePrefetchHintDistance >= 0) {
9538 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
9539 __ br(__ LT, NO_PREFETCH);
9540 __ bind(LARGE_LOOP_PREFETCH);
9541 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
9542 __ mov(tmp4, 2);
9543 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
9544 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
9545 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
9546 __ subs(tmp4, tmp4, 1);
9547 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
9548 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
9549 __ mov(tmp4, 2);
9550 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
9551 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
9552 __ subs(tmp4, tmp4, 1);
9553 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
9554 __ sub(cnt2, cnt2, 64);
9555 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
9556 __ br(__ GE, LARGE_LOOP_PREFETCH);
9557 }
9558 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
9559 __ bind(NO_PREFETCH);
9560 __ subs(cnt2, cnt2, 16);
9561 __ br(__ LT, TAIL);
9562 __ align(OptoLoopAlignment);
9563 __ bind(SMALL_LOOP); // smaller loop
9564 __ subs(cnt2, cnt2, 16);
9565 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
9566 __ br(__ GE, SMALL_LOOP);
9567 __ cmn(cnt2, (u1)16);
9568 __ br(__ EQ, LOAD_LAST);
9569 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
9570 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
9571 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
9572 __ ldr(tmp3, Address(cnt1, -8));
9573 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
9574 __ b(LOAD_LAST);
9575 __ bind(DIFF2);
9576 __ mov(tmpU, tmp3);
9577 __ bind(DIFF1);
9578 __ pop(spilled_regs, sp);
9579 __ b(CALCULATE_DIFFERENCE);
9580 __ bind(LOAD_LAST);
9581 // The last 4 UTF-16 characters were already pre-loaded into tmp3 by compare_string_16_x_LU,
9582 // so there is no need to load them again.
9583 __ mov(tmpU, tmp3);
9584 __ pop(spilled_regs, sp);
9585
9586 // tmp2 points to the address of the last 4 Latin1 characters right now
9587 __ ldrs(vtmp, Address(tmp2));
9588 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
9589 __ fmovd(tmpL, vtmp);
9590
9591 __ eor(rscratch2, tmpU, tmpL);
9592 __ cbz(rscratch2, DONE);
9593
9594 // Find the first different characters in the longwords and
9595 // compute their difference.
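// rscratch2 holds the XOR of the two 8-byte chunks. rev + clz give the bit offset of the
// first difference counted from the lowest-addressed byte, andr rounds it down to a 16-bit
// character boundary, and the shifts plus uxthw below extract the first differing character
// from each operand.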
9596 __ bind(CALCULATE_DIFFERENCE);
9597 __ rev(rscratch2, rscratch2);
9598 __ clz(rscratch2, rscratch2);
9599 __ andr(rscratch2, rscratch2, -16);
9600 __ lsrv(tmp1, tmp1, rscratch2);
9601 __ uxthw(tmp1, tmp1);
9602 __ lsrv(rscratch1, rscratch1, rscratch2);
9603 __ uxthw(rscratch1, rscratch1);
9604 __ subw(result, tmp1, rscratch1);
9605 __ bind(DONE);
9606 __ ret(lr);
9607
9608 // record the stub entry and end
9609 store_archive_data(stub_id, entry, __ pc());
9610
9611 return entry;
9612 }
9613
9614 // r0 = input (float16)
9615 // v0 = result (float)
9616 // v1 = temporary float register
9617 address generate_float16ToFloat() {
9618 StubId stub_id = StubId::stubgen_hf2f_id;
9619 int entry_count = StubInfo::entry_count(stub_id);
9620 assert(entry_count == 1, "sanity check");
9621 address start = load_archive_data(stub_id);
9622 if (start != nullptr) {
9623 return start;
9624 }
9625 __ align(CodeEntryAlignment);
9626 StubCodeMark mark(this, stub_id);
9627 address entry = __ pc();
9628 BLOCK_COMMENT("Entry:");
9629 __ flt16_to_flt(v0, r0, v1);
9630 __ ret(lr);
9631
9632 // record the stub entry and end
9633 store_archive_data(stub_id, entry, __ pc());
9634
9635 return entry;
9636 }
9637
9638 // v0 = input (float)
9639 // r0 = result (float16)
9640 // v1 = temporary float register
9641 address generate_floatToFloat16() {
9642 StubId stub_id = StubId::stubgen_f2hf_id;
9643 int entry_count = StubInfo::entry_count(stub_id);
9644 assert(entry_count == 1, "sanity check");
9645 address start = load_archive_data(stub_id);
9646 if (start != nullptr) {
9647 return start;
9648 }
9649 __ align(CodeEntryAlignment);
9650 StubCodeMark mark(this, stub_id);
9651 address entry = __ pc();
9652 BLOCK_COMMENT("Entry:");
9653 __ flt_to_flt16(r0, v0, v1);
9654 __ ret(lr);
9655
9656 // record the stub entry and end
9657 store_archive_data(stub_id, entry, __ pc());
9658
9659 return entry;
9660 }
9661
9662 address generate_method_entry_barrier() {
9663 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
9664 int entry_count = StubInfo::entry_count(stub_id);
9665 assert(entry_count == 1, "sanity check");
9666 address start = load_archive_data(stub_id);
9667 if (start != nullptr) {
9668 return start;
9669 }
9670 __ align(CodeEntryAlignment);
9671 StubCodeMark mark(this, stub_id);
9672
9673 Label deoptimize_label;
9674
9675 start = __ pc();
9676
9677 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
9678
9679 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
9680 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
9681 // We can get here despite the nmethod being good, if we have not
9682 // yet applied our cross modification fence (or data fence).
9683 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
9684 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
9685 __ ldrw(rscratch2, rscratch2);
9686 __ strw(rscratch2, thread_epoch_addr);
9687 __ isb();
9688 __ membar(__ LoadLoad);
9689 }
9690
9691 __ set_last_Java_frame(sp, rfp, lr, rscratch1);
9692
9693 __ enter();
9694 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
9695
9696 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
9697
9698 __ push_call_clobbered_registers();
9699
9700 __ mov(c_rarg0, rscratch2);
9701 __ call_VM_leaf
9702 (CAST_FROM_FN_PTR
9703 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
9704
9705 __ reset_last_Java_frame(true);
9706
9707 __ mov(rscratch1, r0);
9708
9709 __ pop_call_clobbered_registers();
9710
9711 __ cbnz(rscratch1, deoptimize_label);
9712
9713 __ leave();
9714 __ ret(lr);
9715
9716 __ BIND(deoptimize_label);
9717
9718 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
9719 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
9720
9721 __ mov(sp, rscratch1);
9722 __ br(rscratch2);
9723
9724 // record the stub entry and end
9725 store_archive_data(stub_id, start, __ pc());
9726
9727 return start;
9728 }
9729
9730 // r0 = result
9731 // r1 = str1
9732 // r2 = cnt1
9733 // r3 = str2
9734 // r4 = cnt2
9735 // r10 = tmp1
9736 // r11 = tmp2
9737 address generate_compare_long_string_same_encoding(bool isLL) {
9738 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
9739 int entry_count = StubInfo::entry_count(stub_id);
9740 assert(entry_count == 1, "sanity check");
9741 address start = load_archive_data(stub_id);
9742 if (start != nullptr) {
9743 return start;
9744 }
9745 __ align(CodeEntryAlignment);
9746 StubCodeMark mark(this, stub_id);
9747 address entry = __ pc();
9748 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9749 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
9750
9751 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
9752
9753 // exit from the large loop when fewer than 64 bytes are left to read or when we are
9754 // about to prefetch memory beyond the array bounds
9755 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
9756
9757 // The first 8 bytes were pre-loaded before jumping to this stub, so compare them directly.
9758 __ eor(rscratch2, tmp1, tmp2);
9759 __ cbnz(rscratch2, CAL_DIFFERENCE);
9760
9761 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
9762 // update pointers, because of previous read
9763 __ add(str1, str1, wordSize);
9764 __ add(str2, str2, wordSize);
9765 if (SoftwarePrefetchHintDistance >= 0) {
9766 __ align(OptoLoopAlignment);
9767 __ bind(LARGE_LOOP_PREFETCH);
9768 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
9769 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
9770
9771 for (int i = 0; i < 4; i++) {
9772 __ ldp(tmp1, tmp1h, Address(str1, i * 16));
9773 __ ldp(tmp2, tmp2h, Address(str2, i * 16));
9774 __ cmp(tmp1, tmp2);
9775 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9776 __ br(Assembler::NE, DIFF);
9777 }
9778 __ sub(cnt2, cnt2, isLL ? 64 : 32);
9779 __ add(str1, str1, 64);
9780 __ add(str2, str2, 64);
9781 __ subs(rscratch2, cnt2, largeLoopExitCondition);
9782 __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
9783 __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
9784 }
9785
9786 __ subs(rscratch1, cnt2, isLL ? 16 : 8);
9787 __ br(Assembler::LE, LESS16);
9788 __ align(OptoLoopAlignment);
9789 __ bind(LOOP_COMPARE16);
9790 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9791 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9792 __ cmp(tmp1, tmp2);
9793 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9794 __ br(Assembler::NE, DIFF);
9795 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9796 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9797 __ br(Assembler::LT, LESS16);
9798
9799 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9800 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9801 __ cmp(tmp1, tmp2);
9802 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9803 __ br(Assembler::NE, DIFF);
9804 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9805 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9806 __ br(Assembler::GE, LOOP_COMPARE16);
9807 __ cbz(cnt2, LENGTH_DIFF);
9808
9809 __ bind(LESS16);
9810 // compare 8 bytes (one register) at a time
9811 __ subs(cnt2, cnt2, isLL ? 8 : 4);
9812 __ br(Assembler::LE, LESS8);
9813 __ ldr(tmp1, Address(__ post(str1, 8)));
9814 __ ldr(tmp2, Address(__ post(str2, 8)));
9815 __ eor(rscratch2, tmp1, tmp2);
9816 __ cbnz(rscratch2, CAL_DIFFERENCE);
9817 __ sub(cnt2, cnt2, isLL ? 8 : 4);
9818
9819 __ bind(LESS8); // directly load last 8 bytes
9820 if (!isLL) {
9821 __ add(cnt2, cnt2, cnt2);
9822 }
9823 __ ldr(tmp1, Address(str1, cnt2));
9824 __ ldr(tmp2, Address(str2, cnt2));
9825 __ eor(rscratch2, tmp1, tmp2);
9826 __ cbz(rscratch2, LENGTH_DIFF);
9827 __ b(CAL_DIFFERENCE);
9828
9829 __ bind(DIFF);
9830 __ cmp(tmp1, tmp2);
9831 __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
9832 __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
9833 // reuse rscratch2 register for the result of eor instruction
9834 __ eor(rscratch2, tmp1, tmp2);
9835
9836 __ bind(CAL_DIFFERENCE);
9837 __ rev(rscratch2, rscratch2);
9838 __ clz(rscratch2, rscratch2);
9839 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
9840 __ lsrv(tmp1, tmp1, rscratch2);
9841 __ lsrv(tmp2, tmp2, rscratch2);
9842 if (isLL) {
9843 __ uxtbw(tmp1, tmp1);
9844 __ uxtbw(tmp2, tmp2);
9845 } else {
9846 __ uxthw(tmp1, tmp1);
9847 __ uxthw(tmp2, tmp2);
9848 }
9849 __ subw(result, tmp1, tmp2);
9850
9851 __ bind(LENGTH_DIFF);
9852 __ ret(lr);
9853
9854 // record the stub entry and end
9855 store_archive_data(stub_id, entry, __ pc());
9856
9857 return entry;
9858 }
9859
9860 enum string_compare_mode {
9861 LL,
9862 LU,
9863 UL,
9864 UU,
9865 };
9866
9867 // The following registers are declared in aarch64.ad
9868 // r0 = result
9869 // r1 = str1
9870 // r2 = cnt1
9871 // r3 = str2
9872 // r4 = cnt2
9873 // r10 = tmp1
9874 // r11 = tmp2
9875 // z0 = ztmp1
9876 // z1 = ztmp2
9877 // p0 = pgtmp1
9878 // p1 = pgtmp2
9879 address generate_compare_long_string_sve(string_compare_mode mode) {
9880 StubId stub_id;
9881 switch (mode) {
9882 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
9883 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
9884 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
9885 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
9886 default: ShouldNotReachHere();
9887 }
9888 int entry_count = StubInfo::entry_count(stub_id);
9889 assert(entry_count == 1, "sanity check");
9890 address start = load_archive_data(stub_id);
9891 if (start != nullptr) {
9892 return start;
9893 }
9894 __ align(CodeEntryAlignment);
9895 StubCodeMark mark(this, stub_id);
9896 address entry = __ pc();
9897 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9898 tmp1 = r10, tmp2 = r11;
9899
9900 Label LOOP, DONE, MISMATCH;
9901 Register vec_len = tmp1;
9902 Register idx = tmp2;
9903 // The minimum of the string lengths has been stored in cnt2.
9904 Register cnt = cnt2;
9905 FloatRegister ztmp1 = z0, ztmp2 = z1;
9906 PRegister pgtmp1 = p0, pgtmp2 = p1;
9907
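// LOAD_PAIR (a helper for the loop below) loads one vector's worth of characters from each
// string under predicate pgtmp1, zero-extending Latin1 bytes into halfword lanes when the
// encodings differ so that both operands compare lane-by-lane. Note that the macro uses the
// enclosing str1/str2 registers; its src1/src2 parameters are placeholders.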
9908 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
9909 switch (mode) { \
9910 case LL: \
9911 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
9912 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
9913 break; \
9914 case LU: \
9915 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
9916 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9917 break; \
9918 case UL: \
9919 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9920 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
9921 break; \
9922 case UU: \
9923 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9924 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9925 break; \
9926 default: \
9927 ShouldNotReachHere(); \
9928 }
9929
9930 __ mov(idx, 0);
9931 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9932
9933 if (mode == LL) {
9934 __ sve_cntb(vec_len);
9935 } else {
9936 __ sve_cnth(vec_len);
9937 }
9938
9939 __ sub(rscratch1, cnt, vec_len);
9940
9941 __ bind(LOOP);
9942
9943 // main loop
9944 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9945 __ add(idx, idx, vec_len);
9946 // Compare strings.
9947 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9948 __ br(__ NE, MISMATCH);
9949 __ cmp(idx, rscratch1);
9950 __ br(__ LT, LOOP);
9951
9952 // post loop, last iteration
9953 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9954
9955 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9956 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9957 __ br(__ EQ, DONE);
9958
9959 __ bind(MISMATCH);
9960
9961 // Crop the predicate so it covers only the elements before the first mismatch.
9962 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
9963 // Extract the first different characters of each string.
9964 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
9965 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
9966
9967 // Compute the difference of the first different characters.
9968 __ sub(result, rscratch1, rscratch2);
9969
9970 __ bind(DONE);
9971 __ ret(lr);
9972 #undef LOAD_PAIR
9973
9974 // record the stub entry and end
9975 store_archive_data(stub_id, entry, __ pc());
9976
9977 return entry;
9978 }
9979
9980 void generate_compare_long_strings() {
9981 if (UseSVE == 0) {
9982 StubRoutines::aarch64::_compare_long_string_LL
9983 = generate_compare_long_string_same_encoding(true);
9984 StubRoutines::aarch64::_compare_long_string_UU
9985 = generate_compare_long_string_same_encoding(false);
9986 StubRoutines::aarch64::_compare_long_string_LU
9987 = generate_compare_long_string_different_encoding(true);
9988 StubRoutines::aarch64::_compare_long_string_UL
9989 = generate_compare_long_string_different_encoding(false);
9990 } else {
9991 StubRoutines::aarch64::_compare_long_string_LL
9992 = generate_compare_long_string_sve(LL);
9993 StubRoutines::aarch64::_compare_long_string_UU
9994 = generate_compare_long_string_sve(UU);
9995 StubRoutines::aarch64::_compare_long_string_LU
9996 = generate_compare_long_string_sve(LU);
9997 StubRoutines::aarch64::_compare_long_string_UL
9998 = generate_compare_long_string_sve(UL);
9999 }
10000 }
10001
10002 // R0 = result
10003 // R1 = str2
10004 // R2 = cnt1
10005 // R3 = str1
10006 // R4 = cnt2
10007 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
10008 //
10009 // This generic linear code uses a few additional ideas which make it faster:
10010 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
10011 //    in order to skip the initial load (helps on systems with a single load pipeline)
10012 // 2) we can use a "fast" algorithm to find the first occurrence of the leading
10013 //    character with fewer branches (1 branch per loaded register instead of a
10014 //    branch per symbol); this is where constants like
10015 //    0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
10016 // 3) after loading and analyzing the 1st register of the source string, it can be
10017 //    reused to search for every occurrence of the first character, saving a few loads
10018 //    compared with a simpler-but-slower implementation
10019 // 4) in order to avoid lots of push/pop operations, the code below heavily
10020 //    re-uses/re-initializes/compresses register values, which makes the code
10021 //    larger and a bit less readable; however, most of the extra operations are
10022 //    issued during loads or branches, so the penalty is minimal
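// A sketch of the SWAR trick used below (Latin1 constants shown): with
//   first_rep = first_char * 0x0101010101010101  and  x = loaded_bytes ^ first_rep,
// the value (x - 0x0101010101010101) & ~(x | 0x7f7f7f7f7f7f7f7f) has its lowest set bit in
// the first byte where loaded_bytes equals first_char; candidate positions are then verified
// by a full comparison, so occasional false-positive bits in higher bytes are harmless.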
10023 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
10024 StubId stub_id;
10025 if (str1_isL) {
10026 if (str2_isL) {
10027 stub_id = StubId::stubgen_string_indexof_linear_ll_id;
10028 } else {
10029 stub_id = StubId::stubgen_string_indexof_linear_ul_id;
10030 }
10031 } else {
10032 if (str2_isL) {
10033 ShouldNotReachHere();
10034 } else {
10035 stub_id = StubId::stubgen_string_indexof_linear_uu_id;
10036 }
10037 }
10038 int entry_count = StubInfo::entry_count(stub_id);
10039 assert(entry_count == 1, "sanity check");
10040 address start = load_archive_data(stub_id);
10041 if (start != nullptr) {
10042 return start;
10043 }
10044 __ align(CodeEntryAlignment);
10045 StubCodeMark mark(this, stub_id);
10046 address entry = __ pc();
10047
10048 int str1_chr_size = str1_isL ? 1 : 2;
10049 int str2_chr_size = str2_isL ? 1 : 2;
10050 int str1_chr_shift = str1_isL ? 0 : 1;
10051 int str2_chr_shift = str2_isL ? 0 : 1;
10052 bool isL = str1_isL && str2_isL;
10053 // parameters
10054 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
10055 // temporary registers
10056 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
10057 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
10058 // redefinitions
10059 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
10060
10061 __ push(spilled_regs, sp);
10062 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
10063 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
10064 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
10065 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
10066 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
10067 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
10068 // Read whole register from str1. It is safe, because length >=8 here
10069 __ ldr(ch1, Address(str1));
10070 // Read whole register from str2. It is safe, because length >=8 here
10071 __ ldr(ch2, Address(str2));
10072 __ sub(cnt2, cnt2, cnt1);
10073 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
10074 if (str1_isL != str2_isL) {
10075 __ eor(v0, __ T16B, v0, v0);
10076 }
10077 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
10078 __ mul(first, first, tmp1);
10079 // check if we have less than 1 register to check
10080 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
10081 if (str1_isL != str2_isL) {
10082 __ fmovd(v1, ch1);
10083 }
10084 __ br(__ LE, L_SMALL);
10085 __ eor(ch2, first, ch2);
10086 if (str1_isL != str2_isL) {
10087 __ zip1(v1, __ T16B, v1, v0);
10088 }
10089 __ sub(tmp2, ch2, tmp1);
10090 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10091 __ bics(tmp2, tmp2, ch2);
10092 if (str1_isL != str2_isL) {
10093 __ fmovd(ch1, v1);
10094 }
10095 __ br(__ NE, L_HAS_ZERO);
10096 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10097 __ add(result, result, wordSize/str2_chr_size);
10098 __ add(str2, str2, wordSize);
10099 __ br(__ LT, L_POST_LOOP);
10100 __ BIND(L_LOOP);
10101 __ ldr(ch2, Address(str2));
10102 __ eor(ch2, first, ch2);
10103 __ sub(tmp2, ch2, tmp1);
10104 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10105 __ bics(tmp2, tmp2, ch2);
10106 __ br(__ NE, L_HAS_ZERO);
10107 __ BIND(L_LOOP_PROCEED);
10108 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10109 __ add(str2, str2, wordSize);
10110 __ add(result, result, wordSize/str2_chr_size);
10111 __ br(__ GE, L_LOOP);
10112 __ BIND(L_POST_LOOP);
10113 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
10114 __ br(__ LE, NOMATCH);
10115 __ ldr(ch2, Address(str2));
10116 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10117 __ eor(ch2, first, ch2);
10118 __ sub(tmp2, ch2, tmp1);
10119 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10120 __ mov(tmp4, -1); // all bits set
10121 __ b(L_SMALL_PROCEED);
10122 __ align(OptoLoopAlignment);
10123 __ BIND(L_SMALL);
10124 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10125 __ eor(ch2, first, ch2);
10126 if (str1_isL != str2_isL) {
10127 __ zip1(v1, __ T16B, v1, v0);
10128 }
10129 __ sub(tmp2, ch2, tmp1);
10130 __ mov(tmp4, -1); // all bits set
10131 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10132 if (str1_isL != str2_isL) {
10133 __ fmovd(ch1, v1); // move converted 4 symbols
10134 }
10135 __ BIND(L_SMALL_PROCEED);
10136 __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the bits beyond the valid data
10137 __ bic(tmp2, tmp2, ch2);
10138 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
10139 __ rbit(tmp2, tmp2);
10140 __ br(__ EQ, NOMATCH);
10141 __ BIND(L_SMALL_HAS_ZERO_LOOP);
10142 __ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
10143 __ cmp(cnt1, u1(wordSize/str2_chr_size));
10144 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
10145 if (str2_isL) { // LL
10146 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10147 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10148 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10149 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10150 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10151 } else {
10152 __ mov(ch2, 0xE); // all bits in byte set except last one
10153 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10154 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10155 __ lslv(tmp2, tmp2, tmp4);
10156 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10157 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10158 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10159 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10160 }
10161 __ cmp(ch1, ch2);
10162 __ mov(tmp4, wordSize/str2_chr_size);
10163 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10164 __ BIND(L_SMALL_CMP_LOOP);
10165 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10166 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10167 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10168 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10169 __ add(tmp4, tmp4, 1);
10170 __ cmp(tmp4, cnt1);
10171 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
10172 __ cmp(first, ch2);
10173 __ br(__ EQ, L_SMALL_CMP_LOOP);
10174 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
10175 __ cbz(tmp2, NOMATCH); // no more matches. exit
10176 __ clz(tmp4, tmp2);
10177 __ add(result, result, 1); // advance index
10178 __ add(str2, str2, str2_chr_size); // advance pointer
10179 __ b(L_SMALL_HAS_ZERO_LOOP);
10180 __ align(OptoLoopAlignment);
10181 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
10182 __ cmp(first, ch2);
10183 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10184 __ b(DONE);
10185 __ align(OptoLoopAlignment);
10186 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
10187 if (str2_isL) { // LL
10188 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10189 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10190 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10191 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10192 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10193 } else {
10194 __ mov(ch2, 0xE); // all bits in byte set except last one
10195 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10196 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10197 __ lslv(tmp2, tmp2, tmp4);
10198 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10199 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10200 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10201 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10202 }
10203 __ cmp(ch1, ch2);
10204 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10205 __ b(DONE);
10206 __ align(OptoLoopAlignment);
10207 __ BIND(L_HAS_ZERO);
10208 __ rbit(tmp2, tmp2);
10209 __ clz(tmp4, tmp2); // potentially slow: up to 4 cycles on some CPUs
10210 // Now, compress the counters (cnt2 and cnt1) into one register.
10211 // This is fine because both counters are 32-bit and are not changed in this
10212 // loop; just restore them on exit. So cnt1 can be re-used in this loop.
10213 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
10214 __ sub(result, result, 1);
10215 __ BIND(L_HAS_ZERO_LOOP);
10216 __ mov(cnt1, wordSize/str2_chr_size);
10217 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10218 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
10219 if (str2_isL) {
10220 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10221 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10222 __ lslv(tmp2, tmp2, tmp4);
10223 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10224 __ add(tmp4, tmp4, 1);
10225 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10226 __ lsl(tmp2, tmp2, 1);
10227 __ mov(tmp4, wordSize/str2_chr_size);
10228 } else {
10229 __ mov(ch2, 0xE);
10230 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10231 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10232 __ lslv(tmp2, tmp2, tmp4);
10233 __ add(tmp4, tmp4, 1);
10234 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10235 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10236 __ lsl(tmp2, tmp2, 1);
10237 __ mov(tmp4, wordSize/str2_chr_size);
10238 __ sub(str2, str2, str2_chr_size);
10239 }
10240 __ cmp(ch1, ch2);
10241 __ mov(tmp4, wordSize/str2_chr_size);
10242 __ br(__ NE, L_CMP_LOOP_NOMATCH);
10243 __ BIND(L_CMP_LOOP);
10244 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10245 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10246 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10247 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10248 __ add(tmp4, tmp4, 1);
10249 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10250 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
10251 __ cmp(cnt1, ch2);
10252 __ br(__ EQ, L_CMP_LOOP);
10253 __ BIND(L_CMP_LOOP_NOMATCH);
10254 // the current candidate did not match
10255 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
10256 __ clz(tmp4, tmp2);
10257 __ add(str2, str2, str2_chr_size); // advance pointer
10258 __ b(L_HAS_ZERO_LOOP);
10259 __ align(OptoLoopAlignment);
10260 __ BIND(L_CMP_LOOP_LAST_CMP);
10261 __ cmp(cnt1, ch2);
10262 __ br(__ NE, L_CMP_LOOP_NOMATCH);
10263 __ b(DONE);
10264 __ align(OptoLoopAlignment);
10265 __ BIND(L_CMP_LOOP_LAST_CMP2);
10266 if (str2_isL) {
10267 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10268 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10269 __ lslv(tmp2, tmp2, tmp4);
10270 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10271 __ add(tmp4, tmp4, 1);
10272 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10273 __ lsl(tmp2, tmp2, 1);
10274 } else {
10275 __ mov(ch2, 0xE);
10276 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10277 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10278 __ lslv(tmp2, tmp2, tmp4);
10279 __ add(tmp4, tmp4, 1);
10280 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10281 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10282 __ lsl(tmp2, tmp2, 1);
10283 __ sub(str2, str2, str2_chr_size);
10284 }
10285 __ cmp(ch1, ch2);
10286 __ br(__ NE, L_CMP_LOOP_NOMATCH);
10287 __ b(DONE);
10288 __ align(OptoLoopAlignment);
10289 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
10290 // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until the
10291 //    L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP, so result was
10292 //    increased by at most wordSize/str2_chr_size - 1 and the respective high bits were
10293 //    not changed. L_LOOP_PROCEED will increase result by the number of analyzed
10294 //    characters, so we can simply reset the lower bits of result here: clear the 2
10295 //    lower bits for UU/UL and the 3 lower bits for LL.
10296 // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
10297 // 3) Advance str2 to point at the next str2 octet. result & 7 (or & 3) is the index of
10298 //    the last analyzed substring inside the current octet, so str2 is at the
10299 //    respective start address; we need to advance it to the next octet.
10300 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
10301 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
10302 __ bfm(result, zr, 0, 2 - str2_chr_shift);
10303 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
10304 __ movw(cnt2, cnt2);
10305 __ b(L_LOOP_PROCEED);
10306 __ align(OptoLoopAlignment);
10307 __ BIND(NOMATCH);
10308 __ mov(result, -1);
10309 __ BIND(DONE);
10310 __ pop(spilled_regs, sp);
10311 __ ret(lr);
10312
10313 // record the stub entry and end
10314 store_archive_data(stub_id, entry, __ pc());
10315
10316 return entry;
10317 }
10318
10319 void generate_string_indexof_stubs() {
10320 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
10321 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
10322 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
10323 }
10324
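// Inflate 2 * 16 Latin1 bytes (src1, src2) to 32 UTF-16 characters and store them: zip1/zip2
// with v0 (which the caller keeps at zero) interleave each byte with a zero byte, and the
// four results are written with a single st1 of 64 bytes.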
10325 void inflate_and_store_2_fp_registers(bool generatePrfm,
10326 FloatRegister src1, FloatRegister src2) {
10327 Register dst = r1;
10328 __ zip1(v1, __ T16B, src1, v0);
10329 __ zip2(v2, __ T16B, src1, v0);
10330 if (generatePrfm) {
10331 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
10332 }
10333 __ zip1(v3, __ T16B, src2, v0);
10334 __ zip2(v4, __ T16B, src2, v0);
10335 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
10336 }
10337
10338 // R0 = src
10339 // R1 = dst
10340 // R2 = len
10341 // R3 = len >> 3
10342 // V0 = 0
10343 // v1 = loaded 8 bytes
10344 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
10345 address generate_large_byte_array_inflate() {
10346 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
10347 int entry_count = StubInfo::entry_count(stub_id);
10348 assert(entry_count == 1, "sanity check");
10349 address start = load_archive_data(stub_id);
10350 if (start != nullptr) {
10351 return start;
10352 }
10353 __ align(CodeEntryAlignment);
10354 StubCodeMark mark(this, stub_id);
10355 address entry = __ pc();
10356 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
10357 Register src = r0, dst = r1, len = r2, octetCounter = r3;
10358 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
10359
10360 // do one more 8-byte read so that the address is 16-byte aligned in most cases,
10361 // and so that a single store instruction can be used
10362 __ ldrd(v2, __ post(src, 8));
10363 __ sub(octetCounter, octetCounter, 2);
10364 __ zip1(v1, __ T16B, v1, v0);
10365 __ zip1(v2, __ T16B, v2, v0);
10366 __ st1(v1, v2, __ T16B, __ post(dst, 32));
10367 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10368 __ subs(rscratch1, octetCounter, large_loop_threshold);
10369 __ br(__ LE, LOOP_START);
10370 __ b(LOOP_PRFM_START);
10371 __ bind(LOOP_PRFM);
10372 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10373 __ bind(LOOP_PRFM_START);
10374 __ prfm(Address(src, SoftwarePrefetchHintDistance));
10375 __ sub(octetCounter, octetCounter, 8);
10376 __ subs(rscratch1, octetCounter, large_loop_threshold);
10377 inflate_and_store_2_fp_registers(true, v3, v4);
10378 inflate_and_store_2_fp_registers(true, v5, v6);
10379 __ br(__ GT, LOOP_PRFM);
10380 __ cmp(octetCounter, (u1)8);
10381 __ br(__ LT, DONE);
10382 __ bind(LOOP);
10383 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10384 __ bind(LOOP_START);
10385 __ sub(octetCounter, octetCounter, 8);
10386 __ cmp(octetCounter, (u1)8);
10387 inflate_and_store_2_fp_registers(false, v3, v4);
10388 inflate_and_store_2_fp_registers(false, v5, v6);
10389 __ br(__ GE, LOOP);
10390 __ bind(DONE);
10391 __ ret(lr);
10392
10393 // record the stub entry and end
10394 store_archive_data(stub_id, entry, __ pc());
10395
10396 return entry;
10397 }
10398
10399 /**
10400 * Arguments:
10401 *
10402 * Input:
10403 * c_rarg0 - current state address
10404 * c_rarg1 - H key address
10405 * c_rarg2 - data address
10406 * c_rarg3 - number of blocks
10407 *
10408 * Output:
10409 * Updated state at c_rarg0
10410 */
10411 address generate_ghash_processBlocks_small() {
10412 // Bafflingly, GCM uses little-endian for the byte order, but
10413 // big-endian for the bit order. For example, the polynomial 1 is
10414 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
10415 //
10416 // So, we must either reverse the bytes in each word and do
10417 // everything big-endian or reverse the bits in each byte and do
10418 // it little-endian. On AArch64 it's more idiomatic to reverse
10419 // the bits in each byte (we have an instruction, RBIT, to do
10420 // that) and keep the data in little-endian bit order through the
10421 // calculation, bit-reversing the inputs and outputs.
10422
10423 StubId stub_id = StubId::stubgen_ghash_processBlocks_small_id;
10424 int entry_count = StubInfo::entry_count(stub_id);
10425 assert(entry_count == 1, "sanity check");
10426 address start = load_archive_data(stub_id);
10427 if (start != nullptr) {
10428 return start;
10429 }
10430 __ align(CodeEntryAlignment);
10431 StubCodeMark mark(this, stub_id);
10432 Label polynomial; // local data generated at end of stub
10433 start = __ pc();
10434
10435 Register state = c_rarg0;
10436 Register subkeyH = c_rarg1;
10437 Register data = c_rarg2;
10438 Register blocks = c_rarg3;
10439
10440 FloatRegister vzr = v30;
10441 __ eor(vzr, __ T16B, vzr, vzr); // zero register
10442
10443 __ adr(rscratch1, polynomial);
10444 __ ldrq(v24, rscratch1); // The field polynomial
10445
10446 __ ldrq(v0, Address(state));
10447 __ ldrq(v1, Address(subkeyH));
10448
10449 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
10450 __ rbit(v0, __ T16B, v0);
10451 __ rev64(v1, __ T16B, v1);
10452 __ rbit(v1, __ T16B, v1);
10453
10454 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
10455 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
10456
10457 {
10458 Label L_ghash_loop;
10459 __ bind(L_ghash_loop);
10460
10461 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
10462 // reversing each byte
10463 __ rbit(v2, __ T16B, v2);
10464 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
10465
10466 // Multiply state in v2 by subkey in v1
10467 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
10468 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
10469 /*temps*/v6, v3, /*reuse/clobber b*/v2);
10470 // Reduce v7:v5 by the field polynomial
10471 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
10472
10473 __ sub(blocks, blocks, 1);
10474 __ cbnz(blocks, L_ghash_loop);
10475 }
10476
10477 // The bit-reversed result is at this point in v0
10478 __ rev64(v0, __ T16B, v0);
10479 __ rbit(v0, __ T16B, v0);
10480
10481 __ st1(v0, __ T16B, state);
10482 __ ret(lr);
10483
10484 // bind label and generate local polynomial data
10485 __ align(wordSize * 2);
10486 __ bind(polynomial);
10487 __ emit_int64(0x87); // The low-order bits of the field
10488 // polynomial (i.e. p = z^7+z^2+z+1)
10489 // repeated in the low and high parts of a
10490 // 128-bit vector
10491 __ emit_int64(0x87);
10492
10493 // record the stub entry and end
10494 store_archive_data(stub_id, start, __ pc());
10495
10496 return start;
10497 }
10498
10499 address generate_ghash_processBlocks(address small) {
10500 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
10501 int entry_count = StubInfo::entry_count(stub_id);
10502 assert(entry_count == 1, "sanity check");
10503 address start = load_archive_data(stub_id);
10504 if (start != nullptr) {
10505 return start;
10506 }
10507 Label polynomial; // local data generated after stub
10508 __ align(CodeEntryAlignment);
10509 StubCodeMark mark(this, stub_id);
10510 start = __ pc();
10511
10512 Register state = c_rarg0;
10513 Register subkeyH = c_rarg1;
10514 Register data = c_rarg2;
10515 Register blocks = c_rarg3;
10516
10517 const int unroll = 4;
10518
10519 __ cmp(blocks, (unsigned char)(unroll * 2));
10520 __ br(__ LT, small);
10521
10522 if (unroll > 1) {
10523 // Save state before entering routine
10524 __ sub(sp, sp, 4 * 16);
10525 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
10526 __ sub(sp, sp, 4 * 16);
10527 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
10528 }
10529
10530 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
10531
10532 if (unroll > 1) {
10533 // And restore state
10534 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
10535 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
10536 }
10537
10538 __ cmp(blocks, (unsigned char)0);
10539 __ br(__ GT, small);
10540
10541 __ ret(lr);
10542
10543 // bind label and generate polynomial data
10544 __ align(wordSize * 2);
10545 __ bind(polynomial);
10546 __ emit_int64(0x87); // The low-order bits of the field
10547 // polynomial (i.e. p = z^7+z^2+z+1)
10548 // repeated in the low and high parts of a
10549 // 128-bit vector
10550 __ emit_int64(0x87);
10551
10552 // record the stub entry and end
10553 store_archive_data(stub_id, start, __ pc());
10554
10555 return start;
10556 }
10557
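// One SIMD round of Base64 encoding: load 3 * size source bytes de-interleaved into three
// registers, split every 3-byte group into four 6-bit indices using shifts and ORs, translate
// the indices through the codec table (tbl over 4 table registers), and store 4 * size
// encoded bytes interleaved with st4.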
10558 void generate_base64_encode_simdround(Register src, Register dst,
10559 FloatRegister codec, u8 size) {
10560
10561 FloatRegister in0 = v4, in1 = v5, in2 = v6;
10562 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
10563 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
10564
10565 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
10566
10567 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
10568
10569 __ ushr(ind0, arrangement, in0, 2);
10570
10571 __ ushr(ind1, arrangement, in1, 2);
10572 __ shl(in0, arrangement, in0, 6);
10573 __ orr(ind1, arrangement, ind1, in0);
10574 __ ushr(ind1, arrangement, ind1, 2);
10575
10576 __ ushr(ind2, arrangement, in2, 4);
10577 __ shl(in1, arrangement, in1, 4);
10578 __ orr(ind2, arrangement, in1, ind2);
10579 __ ushr(ind2, arrangement, ind2, 2);
10580
10581 __ shl(ind3, arrangement, in2, 2);
10582 __ ushr(ind3, arrangement, ind3, 2);
10583
10584 __ tbl(out0, arrangement, codec, 4, ind0);
10585 __ tbl(out1, arrangement, codec, 4, ind1);
10586 __ tbl(out2, arrangement, codec, 4, ind2);
10587 __ tbl(out3, arrangement, codec, 4, ind3);
10588
10589 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
10590 }
10591
10592 /**
10593 * Arguments:
10594 *
10595 * Input:
10596 * c_rarg0 - src_start
10597 * c_rarg1 - src_offset
10598 * c_rarg2 - src_length
10599 * c_rarg3 - dest_start
10600 * c_rarg4 - dest_offset
10601 * c_rarg5 - isURL
10602 *
10603 */
10604 address generate_base64_encodeBlock() {
10605
10606 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
10607 int entry_count = StubInfo::entry_count(stub_id);
10608 assert(entry_count == 1, "sanity check");
10609 address start = load_archive_data(stub_id);
10610 if (start != nullptr) {
10611 return start;
10612 }
10613 __ align(CodeEntryAlignment);
10614 StubCodeMark mark(this, stub_id);
10615 start = __ pc();
10616
10617 Register src = c_rarg0; // source array
10618 Register soff = c_rarg1; // source start offset
10619 Register send = c_rarg2; // source end offset
10620 Register dst = c_rarg3; // dest array
10621 Register doff = c_rarg4; // position for writing to dest array
10622 Register isURL = c_rarg5; // Base64 or URL character set
10623
10624 // c_rarg6 and c_rarg7 are free to use as temps
10625 Register codec = c_rarg6;
10626 Register length = c_rarg7;
10627
10628 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
10629
10630 __ add(src, src, soff);
10631 __ add(dst, dst, doff);
10632 __ sub(length, send, soff);
10633
10634 // load the codec base address
10635 __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64));
10636 __ cbz(isURL, ProcessData);
10637 __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64URL));
10638
10639 __ BIND(ProcessData);
10640
10641 // too short to set up a SIMD loop; fall back to the scalar loop
10642 __ cmp(length, (u1)24);
10643 __ br(Assembler::LT, Process3B);
10644
10645 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
10646
10647 __ BIND(Process48B);
10648 __ cmp(length, (u1)48);
10649 __ br(Assembler::LT, Process24B);
10650 generate_base64_encode_simdround(src, dst, v0, 16);
10651 __ sub(length, length, 48);
10652 __ b(Process48B);
10653
10654 __ BIND(Process24B);
10655 __ cmp(length, (u1)24);
10656 __ br(Assembler::LT, SIMDExit);
10657 generate_base64_encode_simdround(src, dst, v0, 8);
10658 __ sub(length, length, 24);
10659
10660 __ BIND(SIMDExit);
10661 __ cbz(length, Exit);
10662
10663 __ BIND(Process3B);
10664 // 3 src bytes, 24 bits
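// For example, the 3 input bytes "Man" (0x4D 0x61 0x6E) form the 24-bit group 0x4D616E,
// which splits into the 6-bit indices 19, 22, 5, 46 and encodes to "TWFu" with the
// standard Base64 alphabet.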
10665 __ ldrb(r10, __ post(src, 1));
10666 __ ldrb(r11, __ post(src, 1));
10667 __ ldrb(r12, __ post(src, 1));
10668 __ orrw(r11, r11, r10, Assembler::LSL, 8);
10669 __ orrw(r12, r12, r11, Assembler::LSL, 8);
10670 // codec index
10671 __ ubfmw(r15, r12, 18, 23);
10672 __ ubfmw(r14, r12, 12, 17);
10673 __ ubfmw(r13, r12, 6, 11);
10674 __ andw(r12, r12, 63);
10675 // get the code based on the codec
10676 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
10677 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
10678 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
10679 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
10680 __ strb(r15, __ post(dst, 1));
10681 __ strb(r14, __ post(dst, 1));
10682 __ strb(r13, __ post(dst, 1));
10683 __ strb(r12, __ post(dst, 1));
10684 __ sub(length, length, 3);
10685 __ cbnz(length, Process3B);
10686
10687 __ BIND(Exit);
10688 __ ret(lr);
10689
10690 // record the stub entry and end
10691 store_archive_data(stub_id, start, __ pc());
10692
10693 return start;
10694 }
10695
10696 void generate_base64_decode_simdround(Register src, Register dst,
10697 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
10698
10699 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
10700 FloatRegister out0 = v20, out1 = v21, out2 = v22;
10701
10702 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
10703 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
10704
10705 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
10706
10707 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
10708
10709 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
10710
10711 // we need an unsigned saturating subtract to make sure that all input values
10712 // in the range [0, 63] map to 0 in the higher-half lookup
10713 __ uqsubv(decH0, __ T16B, in0, v27);
10714 __ uqsubv(decH1, __ T16B, in1, v27);
10715 __ uqsubv(decH2, __ T16B, in2, v27);
10716 __ uqsubv(decH3, __ T16B, in3, v27);
10717
10718 // lower half lookup
10719 __ tbl(decL0, arrangement, codecL, 4, in0);
10720 __ tbl(decL1, arrangement, codecL, 4, in1);
10721 __ tbl(decL2, arrangement, codecL, 4, in2);
10722 __ tbl(decL3, arrangement, codecL, 4, in3);
10723
10724 // higher half lookup
10725 __ tbx(decH0, arrangement, codecH, 4, decH0);
10726 __ tbx(decH1, arrangement, codecH, 4, decH1);
10727 __ tbx(decH2, arrangement, codecH, 4, decH2);
10728 __ tbx(decH3, arrangement, codecH, 4, decH3);
10729
10730 // combine lower and higher
10731 __ orr(decL0, arrangement, decL0, decH0);
10732 __ orr(decL1, arrangement, decL1, decH1);
10733 __ orr(decL2, arrangement, decL2, decH2);
10734 __ orr(decL3, arrangement, decL3, decH3);
10735
10736 // check illegal inputs, value larger than 63 (maximum of 6 bits)
10737 __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10738 __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10739 __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10740 __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10741 __ orr(in0, arrangement, decH0, decH1);
10742 __ orr(in1, arrangement, decH2, decH3);
10743 __ orr(in2, arrangement, in0, in1);
10744 __ umaxv(in3, arrangement, in2);
10745 __ umov(rscratch2, in3, __ B, 0);
10746
10747 // get the data to output
10748 __ shl(out0, arrangement, decL0, 2);
10749 __ ushr(out1, arrangement, decL1, 4);
10750 __ orr(out0, arrangement, out0, out1);
10751 __ shl(out1, arrangement, decL1, 4);
10752 __ ushr(out2, arrangement, decL2, 2);
10753 __ orr(out1, arrangement, out1, out2);
10754 __ shl(out2, arrangement, decL2, 6);
10755 __ orr(out2, arrangement, out2, decL3);
10756
10757 __ cbz(rscratch2, NoIllegalData);
10758
10759 // handle illegal input
10760 __ umov(r10, in2, __ D, 0);
10761 if (size == 16) {
10762 __ cbnz(r10, ErrorInLowerHalf);
10763
10764 // illegal input is in higher half, store the lower half now.
10765 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10766
10767 __ umov(r10, in2, __ D, 1);
10768 __ umov(r11, out0, __ D, 1);
10769 __ umov(r12, out1, __ D, 1);
10770 __ umov(r13, out2, __ D, 1);
10771 __ b(StoreLegalData);
10772
10773 __ BIND(ErrorInLowerHalf);
10774 }
10775 __ umov(r11, out0, __ D, 0);
10776 __ umov(r12, out1, __ D, 0);
10777 __ umov(r13, out2, __ D, 0);
10778
10779 __ BIND(StoreLegalData);
10780 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10781 __ strb(r11, __ post(dst, 1));
10782 __ strb(r12, __ post(dst, 1));
10783 __ strb(r13, __ post(dst, 1));
10784 __ lsr(r10, r10, 8);
10785 __ lsr(r11, r11, 8);
10786 __ lsr(r12, r12, 8);
10787 __ lsr(r13, r13, 8);
10788 __ b(StoreLegalData);
10789
10790 __ BIND(NoIllegalData);
10791 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10792 }
10793
10794
10795 /**
10796 * Arguments:
10797 *
10798 * Input:
10799 * c_rarg0 - src_start
10800 * c_rarg1 - src_offset
10801 * c_rarg2 - src_length
10802 * c_rarg3 - dest_start
10803 * c_rarg4 - dest_offset
10804 * c_rarg5 - isURL
10805 * c_rarg6 - isMIME
10806 *
10807 */
10808 address generate_base64_decodeBlock() {
10809
10810 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10811 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10812 // titled "Base64 decoding".
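// In short (a summary of the SIMD round below): the decode lookup is split across two
// 64-byte tables; each input byte is looked up with tbl in the lower table and, after an
// unsigned saturating subtract of 63, with tbx in the upper table. The two results are OR-ed
// together, and any lane whose combined value is still above 63 marks illegal input.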
10813
10814 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10815 int entry_count = StubInfo::entry_count(stub_id);
10816 assert(entry_count == 1, "sanity check");
10817 address start = load_archive_data(stub_id);
10818 if (start != nullptr) {
10819 return start;
10820 }
10821 __ align(CodeEntryAlignment);
10822 StubCodeMark mark(this, stub_id);
10823 start = __ pc();
10824
10825 Register src = c_rarg0; // source array
10826 Register soff = c_rarg1; // source start offset
10827 Register send = c_rarg2; // source end offset
10828 Register dst = c_rarg3; // dest array
10829 Register doff = c_rarg4; // position for writing to dest array
10830 Register isURL = c_rarg5; // Base64 or URL character set
10831 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation
10832
10833 Register length = send; // reuse send as length of source data to process
10834
10835 Register simd_codec = c_rarg6;
10836 Register nosimd_codec = c_rarg7;
10837
10838 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10839
10840 __ enter();
10841
10842 __ add(src, src, soff);
10843 __ add(dst, dst, doff);
10844
10845 __ mov(doff, dst);
10846
10847 __ sub(length, send, soff);
10848 __ bfm(length, zr, 0, 1);
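// bfm with the zero register clears the two low bits of length (length & ~3); the input is
// then consumed in whole 4-character groups by the loops below.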
10849
10850 __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForNoSIMD));
10851 __ cbz(isURL, ProcessData);
10852 __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForNoSIMD));
10853
10854 __ BIND(ProcessData);
10855 __ mov(rscratch1, length);
10856 __ cmp(length, (u1)144); // 144 = 80 + 64
10857 __ br(Assembler::LT, Process4B);
10858
10859 // In the MIME case, the line length cannot be more than 76
10860 // bytes (see RFC 2045). This is too short a block for SIMD
10861 // to be worthwhile, so we use non-SIMD here.
10862 __ movw(rscratch1, 79);
10863
10864 __ BIND(Process4B);
10865 __ ldrw(r14, __ post(src, 4));
10866 __ ubfxw(r10, r14, 0, 8);
10867 __ ubfxw(r11, r14, 8, 8);
10868 __ ubfxw(r12, r14, 16, 8);
10869 __ ubfxw(r13, r14, 24, 8);
10870     // look up the decoded 6-bit values
10871 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10872 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10873 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10874 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10875 // error detection, 255u indicates an illegal input
10876 __ orrw(r14, r10, r11);
10877 __ orrw(r15, r12, r13);
10878 __ orrw(r14, r14, r15);
10879 __ tbnz(r14, 7, Exit);
10880 // recover the data
10881 __ lslw(r14, r10, 10);
10882 __ bfiw(r14, r11, 4, 6);
10883 __ bfmw(r14, r12, 2, 5);
10884 __ rev16w(r14, r14);
10885 __ bfiw(r13, r12, 6, 2);
10886 __ strh(r14, __ post(dst, 2));
10887 __ strb(r13, __ post(dst, 1));
10888 // non-simd loop
10889 __ subsw(rscratch1, rscratch1, 4);
10890 __ br(Assembler::GT, Process4B);
10891
10892     // if exiting from the 80-byte pre-processing path above (rscratch1 started at 79),
10893     // rscratch1 == -1 here; otherwise, rscratch1 == 0.
10894 __ cbzw(rscratch1, Exit);
10895 __ sub(length, length, 80);
10896
10897 __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForSIMD));
10898 __ cbz(isURL, SIMDEnter);
10899 __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForSIMD));
10900
10901 __ BIND(SIMDEnter);
10902 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10903 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10904 __ mov(rscratch1, 63);
10905 __ dup(v27, __ T16B, rscratch1);
10906
10907 __ BIND(Process64B);
10908 __ cmp(length, (u1)64);
10909 __ br(Assembler::LT, Process32B);
10910 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10911 __ sub(length, length, 64);
10912 __ b(Process64B);
10913
10914 __ BIND(Process32B);
10915 __ cmp(length, (u1)32);
10916 __ br(Assembler::LT, SIMDExit);
10917 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10918 __ sub(length, length, 32);
10919 __ b(Process32B);
10920
10921 __ BIND(SIMDExit);
10922 __ cbz(length, Exit);
10923 __ movw(rscratch1, length);
10924 __ b(Process4B);
10925
10926 __ BIND(Exit);
10927 __ sub(c_rarg0, dst, doff);
10928
10929 __ leave();
10930 __ ret(lr);
10931
10932 // record the stub entry and end
10933 store_archive_data(stub_id, start, __ pc());
10934
10935 return start;
10936 }
10937
10938 // Support for spin waits.
10939 address generate_spin_wait() {
10940 StubId stub_id = StubId::stubgen_spin_wait_id;
10941 int entry_count = StubInfo::entry_count(stub_id);
10942 assert(entry_count == 1, "sanity check");
10943 address start = load_archive_data(stub_id);
10944 if (start != nullptr) {
10945 return start;
10946 }
10947 __ align(CodeEntryAlignment);
10948 StubCodeMark mark(this, stub_id);
10949 start = __ pc();
10950
10951 __ spin_wait();
10952 __ ret(lr);
10953
10954 // record the stub entry and end
10955 store_archive_data(stub_id, start, __ pc());
10956
10957 return start;
10958 }
10959
10960 void generate_lookup_secondary_supers_table_stub() {
10961 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10962 GrowableArray<address> entries;
10963 int entry_count = StubInfo::entry_count(stub_id);
10964 assert(entry_count == Klass::SECONDARY_SUPERS_TABLE_SIZE, "sanity check");
10965 address start = load_archive_data(stub_id, &entries);
10966 if (start != nullptr) {
10967 assert(entries.length() == Klass::SECONDARY_SUPERS_TABLE_SIZE - 1,
10968 "unexpected extra entry count %d", entries.length());
10969 StubRoutines::_lookup_secondary_supers_table_stubs[0] = start;
10970 for (int slot = 1; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10971 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = entries.at(slot - 1);
10972 }
10973 return;
10974 }
10975
10976 StubCodeMark mark(this, stub_id);
10977
10978 const Register
10979 r_super_klass = r0,
10980 r_array_base = r1,
10981 r_array_length = r2,
10982 r_array_index = r3,
10983 r_sub_klass = r4,
10984 r_bitmap = rscratch2,
10985 result = r5;
10986 const FloatRegister
10987 vtemp = v0;
10988
10989 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10990 address next_entry = __ pc();
10991 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = next_entry;
10992 if (slot == 0) {
10993 start = next_entry;
10994 } else {
10995 entries.append(next_entry);
10996 }
10997 Label L_success;
10998 __ enter();
10999 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
11000 r_array_base, r_array_length, r_array_index,
11001 vtemp, result, slot,
11002 /*stub_is_near*/true);
11003 __ leave();
11004 __ ret(lr);
11005 }
11006 // record the stub entry and end plus all the auxiliary entries
11007 store_archive_data(stub_id, start, __ pc(), &entries);
11008 }
11009
11010 // Slow path implementation for UseSecondarySupersTable.
11011 address generate_lookup_secondary_supers_table_slow_path_stub() {
11012 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
11013 int entry_count = StubInfo::entry_count(stub_id);
11014 assert(entry_count == 1, "sanity check");
11015 address start = load_archive_data(stub_id);
11016 if (start != nullptr) {
11017 return start;
11018 }
11019 StubCodeMark mark(this, stub_id);
11020 start = __ pc();
11021 const Register
11022 r_super_klass = r0, // argument
11023 r_array_base = r1, // argument
11024 temp1 = r2, // temp
11025 r_array_index = r3, // argument
11026 r_bitmap = rscratch2, // argument
11027 result = r5; // argument
11028
11029 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
11030 __ ret(lr);
11031
11032 // record the stub entry and end
11033 store_archive_data(stub_id, start, __ pc());
11034
11035 return start;
11036 }
11037
11038 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11039
11040 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
11041 //
11042 // If LSE is in use, generate LSE versions of all the stubs. The
11043 // non-LSE versions are in atomic_aarch64.S.
11044
11045 // class AtomicStubMark records the entry point of a stub and the
11046 // stub pointer which will point to it. The stub pointer is set to
11047 // the entry point when ~AtomicStubMark() is called, which must be
11048 // after ICache::invalidate_range. This ensures safe publication of
11049 // the generated code.
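  //
  // For example (as used in generate_atomic_entry_points() below):
  //
  //   AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
  //   gen_ldadd_entry(Assembler::word, memory_order_conservative);
  //
  // The stub pointer is only published after the ICache flush at the end
  // of the enclosing block, when the marks go out of scope.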
11050 class AtomicStubMark {
11051 address _entry_point;
11052 aarch64_atomic_stub_t *_stub;
11053 MacroAssembler *_masm;
11054 public:
11055 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
11056 _masm = masm;
11057 __ align(32);
11058 _entry_point = __ pc();
11059 _stub = stub;
11060 }
11061 ~AtomicStubMark() {
11062 *_stub = (aarch64_atomic_stub_t)_entry_point;
11063 }
11064 };
11065
11066 // NB: For memory_order_conservative we need a trailing membar after
11067 // LSE atomic operations but not a leading membar.
11068 //
11069 // We don't need a leading membar because a clause in the Arm ARM
11070 // says:
11071 //
11072 // Barrier-ordered-before
11073 //
11074 // Barrier instructions order prior Memory effects before subsequent
11075 // Memory effects generated by the same Observer. A read or a write
11076   // RW1 is Barrier-ordered-before a read or a write RW2 from the same
11077   // Observer if and only if RW1 appears in program order before RW2
11078   // and [ ... ] at least one of RW1 and RW2 is generated by an atomic
11079 // instruction with both Acquire and Release semantics.
11080 //
11081 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
11082 // and Release semantics, therefore we don't need a leading
11083 // barrier. However, there is no corresponding Barrier-ordered-after
11084 // relationship, therefore we need a trailing membar to prevent a
11085 // later store or load from being reordered with the store in an
11086 // atomic instruction.
11087 //
11088 // This was checked by using the herd7 consistency model simulator
11089 // (http://diy.inria.fr/) with this test case:
11090 //
11091 // AArch64 LseCas
11092 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
11093 // P0 | P1;
11094 // LDR W4, [X2] | MOV W3, #0;
11095 // DMB LD | MOV W4, #1;
11096 // LDR W3, [X1] | CASAL W3, W4, [X1];
11097 // | DMB ISH;
11098 // | STR W4, [X2];
11099 // exists
11100 // (0:X3=0 /\ 0:X4=1)
11101 //
11102 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
11103 // with the store to x in P1. Without the DMB in P1 this may happen.
11104 //
11105 // At the time of writing we don't know of any AArch64 hardware that
11106 // reorders stores in this way, but the Reference Manual permits it.
11107
11108 void gen_cas_entry(Assembler::operand_size size,
11109 atomic_memory_order order) {
11110 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
11111 exchange_val = c_rarg2;
11112 bool acquire, release;
11113 switch (order) {
11114 case memory_order_relaxed:
11115 acquire = false;
11116 release = false;
11117 break;
11118 case memory_order_release:
11119 acquire = false;
11120 release = true;
11121 break;
11122 default:
11123 acquire = true;
11124 release = true;
11125 break;
11126 }
11127 __ mov(prev, compare_val);
11128 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
11129 if (order == memory_order_conservative) {
11130 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11131 }
11132 if (size == Assembler::xword) {
11133 __ mov(r0, prev);
11134 } else {
11135 __ movw(r0, prev);
11136 }
11137 __ ret(lr);
11138 }
11139
11140 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
11141 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11142 // If not relaxed, then default to conservative. Relaxed is the only
11143 // case we use enough to be worth specializing.
11144 if (order == memory_order_relaxed) {
11145 __ ldadd(size, incr, prev, addr);
11146 } else {
11147 __ ldaddal(size, incr, prev, addr);
11148 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11149 }
11150 if (size == Assembler::xword) {
11151 __ mov(r0, prev);
11152 } else {
11153 __ movw(r0, prev);
11154 }
11155 __ ret(lr);
11156 }
11157
11158 void gen_swpal_entry(Assembler::operand_size size) {
11159 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11160 __ swpal(size, incr, prev, addr);
11161 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11162 if (size == Assembler::xword) {
11163 __ mov(r0, prev);
11164 } else {
11165 __ movw(r0, prev);
11166 }
11167 __ ret(lr);
11168 }
11169
11170 void generate_atomic_entry_points() {
11171 if (! UseLSE) {
11172 return;
11173 }
11174 StubId stub_id = StubId::stubgen_atomic_entry_points_id;
11175 GrowableArray<address> entries;
11176 int entry_count = StubInfo::entry_count(stub_id);
11177 address start = load_archive_data(stub_id, &entries);
11178 if (start != nullptr) {
11179 assert(entries.length() == entry_count - 1,
11180 "unexpected extra entry count %d", entries.length());
11181 aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)start;
11182 int idx = 0;
11183 aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11184 aarch64_atomic_fetch_add_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11185 aarch64_atomic_fetch_add_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11186 aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11187 aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11188 aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11189 aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11190 aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11191 aarch64_atomic_cmpxchg_1_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11192 aarch64_atomic_cmpxchg_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11193 aarch64_atomic_cmpxchg_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11194 aarch64_atomic_cmpxchg_4_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11195 aarch64_atomic_cmpxchg_8_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11196 aarch64_atomic_cmpxchg_4_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11197 aarch64_atomic_cmpxchg_8_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11198 assert(idx == entries.length(), "sanity!");
11199 return;
11200 }
11201
11202 __ align(CodeEntryAlignment);
11203 StubCodeMark mark(this, stub_id);
11204 start = __ pc();
11205 address end;
11206 {
11207 // ADD, memory_order_conservative
11208 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
11209 gen_ldadd_entry(Assembler::word, memory_order_conservative);
11210
11211 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
11212 gen_ldadd_entry(Assembler::xword, memory_order_conservative);
11213
11214 // ADD, memory_order_relaxed
11215 AtomicStubMark mark_fetch_add_4_relaxed
11216 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
11217 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
11218
11219 AtomicStubMark mark_fetch_add_8_relaxed
11220 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
11221 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
11222
11223 // XCHG, memory_order_conservative
11224 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
11225 gen_swpal_entry(Assembler::word);
11226
11227 AtomicStubMark mark_xchg_8(_masm, &aarch64_atomic_xchg_8_impl);
11228 gen_swpal_entry(Assembler::xword);
11229
11230 // CAS, memory_order_conservative
11231 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
11232 gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
11233
11234 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
11235 gen_cas_entry(MacroAssembler::word, memory_order_conservative);
11236
11237 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
11238 gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
11239
11240 // CAS, memory_order_relaxed
11241 AtomicStubMark mark_cmpxchg_1_relaxed
11242 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
11243 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
11244
11245 AtomicStubMark mark_cmpxchg_4_relaxed
11246 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
11247 gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
11248
11249 AtomicStubMark mark_cmpxchg_8_relaxed
11250 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
11251 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
11252
11253 AtomicStubMark mark_cmpxchg_4_release
11254 (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
11255 gen_cas_entry(MacroAssembler::word, memory_order_release);
11256
11257 AtomicStubMark mark_cmpxchg_8_release
11258 (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
11259 gen_cas_entry(MacroAssembler::xword, memory_order_release);
11260
11261 AtomicStubMark mark_cmpxchg_4_seq_cst
11262 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
11263 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
11264
11265 AtomicStubMark mark_cmpxchg_8_seq_cst
11266 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
11267 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
11268
11269 end = __ pc();
11270
11271 ICache::invalidate_range(start, end - start);
11272 // exit block to force update of AtomicStubMark targets
11273 }
11274
11275 assert(start == (address)aarch64_atomic_fetch_add_4_impl,
11276 "atomic stub should be at start of buffer");
11277 // record the stub start and end plus all the entries saved by the
11278 // AtomicStubMark destructor
11279 entries.append((address)aarch64_atomic_fetch_add_8_impl);
11280 entries.append((address)aarch64_atomic_fetch_add_4_relaxed_impl);
11281 entries.append((address)aarch64_atomic_fetch_add_8_relaxed_impl);
11282 entries.append((address)aarch64_atomic_xchg_4_impl);
11283 entries.append((address)aarch64_atomic_xchg_8_impl);
11284 entries.append((address)aarch64_atomic_cmpxchg_1_impl);
11285 entries.append((address)aarch64_atomic_cmpxchg_4_impl);
11286 entries.append((address)aarch64_atomic_cmpxchg_8_impl);
11287 entries.append((address)aarch64_atomic_cmpxchg_1_relaxed_impl);
11288 entries.append((address)aarch64_atomic_cmpxchg_4_relaxed_impl);
11289 entries.append((address)aarch64_atomic_cmpxchg_8_relaxed_impl);
11290 entries.append((address)aarch64_atomic_cmpxchg_4_release_impl);
11291 entries.append((address)aarch64_atomic_cmpxchg_8_release_impl);
11292 entries.append((address)aarch64_atomic_cmpxchg_4_seq_cst_impl);
11293 entries.append((address)aarch64_atomic_cmpxchg_8_seq_cst_impl);
11294
11295 assert(entries.length() == entry_count - 1,
11296 "unexpected extra entry count %d", entries.length());
11297
11298 store_archive_data(stub_id, start, end, &entries);
11299 }
11300 #endif // LINUX
11301
11302 static void save_return_registers(MacroAssembler* masm) {
11303 if (InlineTypeReturnedAsFields) {
11304 masm->push(RegSet::range(r0, r7), sp);
11305 masm->sub(sp, sp, 4 * wordSize);
11306 masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
11307 masm->sub(sp, sp, 4 * wordSize);
11308 masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
11309 } else {
11310 masm->fmovd(rscratch1, v0);
11311 masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
11312 }
11313 }
11314
11315 static void restore_return_registers(MacroAssembler* masm) {
11316 if (InlineTypeReturnedAsFields) {
11317 masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
11318 masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
11319 masm->pop(RegSet::range(r0, r7), sp);
11320 } else {
11321 masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
11322 masm->fmovd(v0, rscratch1);
11323 }
11324 }
11325
11326 address generate_cont_thaw(Continuation::thaw_kind kind) {
11327 bool return_barrier = Continuation::is_thaw_return_barrier(kind);
11328 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
11329
11330 address start = __ pc();
11331
11332 if (return_barrier) {
11333 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
11334 __ mov(sp, rscratch1);
11335 }
11336 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11337
11338 if (return_barrier) {
11339 // preserve possible return value from a method returning to the return barrier
11340 save_return_registers(_masm);
11341 }
11342
11343 __ movw(c_rarg1, (return_barrier ? 1 : 0));
11344 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
11345 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
11346
11347 if (return_barrier) {
11348 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11349 restore_return_registers(_masm);
11350 }
11351 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11352
11353
11354 Label thaw_success;
11355 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
11356 __ cbnz(rscratch2, thaw_success);
11357 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
11358 __ br(rscratch1);
11359 __ bind(thaw_success);
11360
11361 // make room for the thawed frames
11362 __ sub(rscratch1, sp, rscratch2);
11363 __ andr(rscratch1, rscratch1, -16); // align
11364 __ mov(sp, rscratch1);
11365
11366 if (return_barrier) {
11367 // save original return value -- again
11368 save_return_registers(_masm);
11369 }
11370
11371 // If we want, we can templatize thaw by kind, and have three different entries
11372 __ movw(c_rarg1, (uint32_t)kind);
11373
11374 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
11375 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
11376
11377 if (return_barrier) {
11378 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11379 restore_return_registers(_masm);
11380 } else {
11381 __ mov(r0, zr); // return 0 (success) from doYield
11382 }
11383
11384     // we're now on the yield frame (which is at an address above us b/c sp has been pushed down)
11385 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
11386 __ mov(rfp, sp);
11387
11388 if (return_barrier_exception) {
11389 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
11390 __ authenticate_return_address(c_rarg1);
11391 __ verify_oop(r0);
11392 // save return value containing the exception oop in callee-saved R19
11393 __ mov(r19, r0);
11394
11395 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
11396
11397 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
11398 // __ reinitialize_ptrue();
11399
11400 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
11401
11402 __ mov(r1, r0); // the exception handler
11403 __ mov(r0, r19); // restore return value containing the exception oop
11404 __ verify_oop(r0);
11405
11406 __ leave();
11407 __ mov(r3, lr);
11408 __ br(r1); // the exception handler
11409 } else {
11410 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
11411 __ leave();
11412 __ ret(lr);
11413 }
11414
11415 return start;
11416 }
11417
11418 address generate_cont_thaw() {
11419 if (!Continuations::enabled()) return nullptr;
11420
11421 StubId stub_id = StubId::stubgen_cont_thaw_id;
11422 int entry_count = StubInfo::entry_count(stub_id);
11423 assert(entry_count == 1, "sanity check");
11424 address start = load_archive_data(stub_id);
11425 if (start != nullptr) {
11426 return start;
11427 }
11428 StubCodeMark mark(this, stub_id);
11429 start = __ pc();
11430 generate_cont_thaw(Continuation::thaw_top);
11431
11432 // record the stub start and end
11433 store_archive_data(stub_id, start, __ pc());
11434
11435 return start;
11436 }
11437
11438 address generate_cont_returnBarrier() {
11439 if (!Continuations::enabled()) return nullptr;
11440
11441 // TODO: will probably need multiple return barriers depending on return type
11442 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
11443 int entry_count = StubInfo::entry_count(stub_id);
11444 assert(entry_count == 1, "sanity check");
11445 address start = load_archive_data(stub_id);
11446 if (start != nullptr) {
11447 return start;
11448 }
11449 StubCodeMark mark(this, stub_id);
11450 start = __ pc();
11451
11452 generate_cont_thaw(Continuation::thaw_return_barrier);
11453
11454 // record the stub start and end
11455 store_archive_data(stub_id, start, __ pc());
11456
11457 return start;
11458 }
11459
11460 address generate_cont_returnBarrier_exception() {
11461 if (!Continuations::enabled()) return nullptr;
11462
11463 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
11464 int entry_count = StubInfo::entry_count(stub_id);
11465 assert(entry_count == 1, "sanity check");
11466 address start = load_archive_data(stub_id);
11467 if (start != nullptr) {
11468 return start;
11469 }
11470 StubCodeMark mark(this, stub_id);
11471 start = __ pc();
11472
11473 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
11474
11475 // record the stub start and end
11476 store_archive_data(stub_id, start, __ pc());
11477
11478 return start;
11479 }
11480
11481 address generate_cont_preempt_stub() {
11482 if (!Continuations::enabled()) return nullptr;
11483 StubId stub_id = StubId::stubgen_cont_preempt_id;
11484 int entry_count = StubInfo::entry_count(stub_id);
11485 assert(entry_count == 1, "sanity check");
11486 address start = load_archive_data(stub_id);
11487 if (start != nullptr) {
11488 return start;
11489 }
11490 StubCodeMark mark(this, stub_id);
11491 start = __ pc();
11492
11493 __ reset_last_Java_frame(true);
11494
11495 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
11496 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
11497 __ mov(sp, rscratch2);
11498
11499 Label preemption_cancelled;
11500 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
11501 __ cbnz(rscratch1, preemption_cancelled);
11502
11503 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
11504 SharedRuntime::continuation_enter_cleanup(_masm);
11505 __ leave();
11506 __ ret(lr);
11507
11508 // We acquired the monitor after freezing the frames so call thaw to continue execution.
11509 __ bind(preemption_cancelled);
11510 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
11511 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
11512 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
11513 __ ldr(rscratch1, Address(rscratch1));
11514 __ br(rscratch1);
11515
11516 // record the stub start and end
11517 store_archive_data(stub_id, start, __ pc());
11518
11519 return start;
11520 }
11521
11522 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
11523 // are represented as long[5], with BITS_PER_LIMB = 26.
11524 // Pack five 26-bit limbs into three 64-bit registers.
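  //
  // As a sketch in C (illustrative only), with limbs[0..4] each holding the
  // 26 significant bits of the long[5] described above:
  //
  //   dest0 = limbs[0] | (limbs[1] << 26) | (limbs[2] << 52);
  //   dest1 = (limbs[2] >> 12) | (limbs[3] << 14) | (limbs[4] << 40);
  //   dest2 = limbs[4] >> 24;            // top two bits, when dest2 is valid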
11525 void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
11526 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
11527 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
11528 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
11529 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
11530
11531 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
11532 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
11533 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
11534 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
11535
11536 if (dest2->is_valid()) {
11537 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
11538 } else {
11539 #ifdef ASSERT
11540 Label OK;
11541 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
11542 __ br(__ EQ, OK);
11543 __ stop("high bits of Poly1305 integer should be zero");
11544 __ should_not_reach_here();
11545 __ bind(OK);
11546 #endif
11547 }
11548 }
11549
11550 // As above, but return only a 128-bit integer, packed into two
11551 // 64-bit registers.
11552 void pack_26(Register dest0, Register dest1, Register src) {
11553 pack_26(dest0, dest1, noreg, src);
11554 }
11555
11556 // Multiply and multiply-accumulate unsigned 64-bit registers.
11557 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
11558 __ mul(prod_lo, n, m);
11559 __ umulh(prod_hi, n, m);
11560 }
11561 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
11562 wide_mul(rscratch1, rscratch2, n, m);
11563 __ adds(sum_lo, sum_lo, rscratch1);
11564 __ adc(sum_hi, sum_hi, rscratch2);
11565 }
11566
11567 // Poly1305, RFC 7539
11568
11569 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
11570 // description of the tricks used to simplify and accelerate this
11571 // computation.
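  //
  // As an illustrative sketch only (C-like pseudocode, not the generated
  // code), each 16-byte block updates the 130-bit accumulator U with the
  // 128-bit key R as:
  //
  //   U = U + block + 2^128;          // the extra bit is the Poly1305 pad
  //   U = (U * R) mod (2^130 - 5);    // only partially reduced inside the loop
  //
  // A further reduction step runs once after the last block, before the
  // accumulator is written back to memory as five 26-bit limbs.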
11572
11573 address generate_poly1305_processBlocks() {
11574 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
11575 int entry_count = StubInfo::entry_count(stub_id);
11576 assert(entry_count == 1, "sanity check");
11577 address start = load_archive_data(stub_id);
11578 if (start != nullptr) {
11579 return start;
11580 }
11581 __ align(CodeEntryAlignment);
11582 StubCodeMark mark(this, stub_id);
11583 start = __ pc();
11584 Label here;
11585 __ enter();
11586 RegSet callee_saved = RegSet::range(r19, r28);
11587 __ push(callee_saved, sp);
11588
11589 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
11590
11591 // Arguments
11592 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
11593
11594 // R_n is the 128-bit randomly-generated key, packed into two
11595 // registers. The caller passes this key to us as long[5], with
11596 // BITS_PER_LIMB = 26.
11597 const Register R_0 = *++regs, R_1 = *++regs;
11598 pack_26(R_0, R_1, r_start);
11599
11600 // RR_n is (R_n >> 2) * 5
11601 const Register RR_0 = *++regs, RR_1 = *++regs;
11602 __ lsr(RR_0, R_0, 2);
11603 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
11604 __ lsr(RR_1, R_1, 2);
11605 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
11606
11607 // U_n is the current checksum
11608 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
11609 pack_26(U_0, U_1, U_2, acc_start);
11610
11611 static constexpr int BLOCK_LENGTH = 16;
11612 Label DONE, LOOP;
11613
11614 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
11615 __ br(Assembler::LT, DONE); {
11616 __ bind(LOOP);
11617
11618 // S_n is to be the sum of U_n and the next block of data
11619 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
11620 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
11621 __ adds(S_0, U_0, S_0);
11622 __ adcs(S_1, U_1, S_1);
11623 __ adc(S_2, U_2, zr);
11624 __ add(S_2, S_2, 1);
11625
11626 const Register U_0HI = *++regs, U_1HI = *++regs;
11627
11628 // NB: this logic depends on some of the special properties of
11629 // Poly1305 keys. In particular, because we know that the top
11630 // four bits of R_0 and R_1 are zero, we can add together
11631 // partial products without any risk of needing to propagate a
11632 // carry out.
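      // (A rough illustrative bound, not asserted by the code: with the top
      // four bits clear, R_n < 2^60 and RR_n < 2^61, so each 64x64 partial
      // product has a high half below 2^61; three such high halves plus
      // carries stay comfortably below 2^64.)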
11633 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
11634 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
11635 __ andr(U_2, R_0, 3);
11636 __ mul(U_2, S_2, U_2);
11637
11638 // Recycle registers S_0, S_1, S_2
11639 regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
11640
11641 // Partial reduction mod 2**130 - 5
11642 __ adds(U_1, U_0HI, U_1);
11643 __ adc(U_2, U_1HI, U_2);
11644 // Sum now in U_2:U_1:U_0.
11645 // Dead: U_0HI, U_1HI.
11646 regs = (regs.remaining() + U_0HI + U_1HI).begin();
11647
11648 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
11649
11650 // First, U_2:U_1:U_0 += (U_2 >> 2)
11651 __ lsr(rscratch1, U_2, 2);
11652 __ andr(U_2, U_2, (u8)3);
11653 __ adds(U_0, U_0, rscratch1);
11654 __ adcs(U_1, U_1, zr);
11655 __ adc(U_2, U_2, zr);
11656 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
11657 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
11658 __ adcs(U_1, U_1, zr);
11659 __ adc(U_2, U_2, zr);
11660
11661 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
11662 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
11663 __ br(~ Assembler::LT, LOOP);
11664 }
11665
11666 // Further reduce modulo 2^130 - 5
11667 __ lsr(rscratch1, U_2, 2);
11668 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
11669 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
11670 __ adcs(U_1, U_1, zr);
11671 __ andr(U_2, U_2, (u1)3);
11672 __ adc(U_2, U_2, zr);
11673
11674 // Unpack the sum into five 26-bit limbs and write to memory.
11675 __ ubfiz(rscratch1, U_0, 0, 26);
11676 __ ubfx(rscratch2, U_0, 26, 26);
11677 __ stp(rscratch1, rscratch2, Address(acc_start));
11678 __ ubfx(rscratch1, U_0, 52, 12);
11679 __ bfi(rscratch1, U_1, 12, 14);
11680 __ ubfx(rscratch2, U_1, 14, 26);
11681 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
11682 __ ubfx(rscratch1, U_1, 40, 24);
11683 __ bfi(rscratch1, U_2, 24, 3);
11684 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
11685
11686 __ bind(DONE);
11687 __ pop(callee_saved, sp);
11688 __ leave();
11689 __ ret(lr);
11690
11691 // record the stub start and end
11692 store_archive_data(stub_id, start, __ pc());
11693
11694 return start;
11695 }
11696
11697 // exception handler for upcall stubs
11698 address generate_upcall_stub_exception_handler() {
11699 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
11700 int entry_count = StubInfo::entry_count(stub_id);
11701 assert(entry_count == 1, "sanity check");
11702 address start = load_archive_data(stub_id);
11703 if (start != nullptr) {
11704 return start;
11705 }
11706 StubCodeMark mark(this, stub_id);
11707 start = __ pc();
11708
11709     // The native caller has no idea how to handle exceptions,
11710     // so we just crash here. It is up to the callee to catch exceptions.
11711 __ verify_oop(r0);
11712 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
11713 __ blr(rscratch1);
11714 __ should_not_reach_here();
11715
11716 // record the stub start and end
11717 store_archive_data(stub_id, start, __ pc());
11718
11719 return start;
11720 }
11721
11722 // load Method* target of MethodHandle
11723 // j_rarg0 = jobject receiver
11724 // rmethod = result
11725 address generate_upcall_stub_load_target() {
11726 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
11727 int entry_count = StubInfo::entry_count(stub_id);
11728 assert(entry_count == 1, "sanity check");
11729 address start = load_archive_data(stub_id);
11730 if (start != nullptr) {
11731 return start;
11732 }
11733 StubCodeMark mark(this, stub_id);
11734 start = __ pc();
11735
11736 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
11737 // Load target method from receiver
11738 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
11739 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
11740 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
11741 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
11742 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
11743 noreg, noreg);
11744 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
11745
11746 __ ret(lr);
11747
11748 // record the stub start and end
11749 store_archive_data(stub_id, start, __ pc());
11750
11751 return start;
11752 }
11753
11754 #undef __
11755 #define __ masm->
11756
11757 class MontgomeryMultiplyGenerator : public MacroAssembler {
11758
11759 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
11760 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
11761
11762 RegSet _toSave;
11763 bool _squaring;
11764
11765 public:
11766 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
11767 : MacroAssembler(as->code()), _squaring(squaring) {
11768
11769 // Register allocation
11770
11771 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
11772 Pa_base = *regs; // Argument registers
11773 if (squaring)
11774 Pb_base = Pa_base;
11775 else
11776 Pb_base = *++regs;
11777 Pn_base = *++regs;
11778 Rlen= *++regs;
11779 inv = *++regs;
11780 Pm_base = *++regs;
11781
11782 // Working registers:
11783 Ra = *++regs; // The current digit of a, b, n, and m.
11784 Rb = *++regs;
11785 Rm = *++regs;
11786 Rn = *++regs;
11787
11788 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
11789 Pb = *++regs;
11790 Pm = *++regs;
11791 Pn = *++regs;
11792
11793 t0 = *++regs; // Three registers which form a
11794       t1 = *++regs; // triple-precision accumulator.
11795 t2 = *++regs;
11796
11797 Ri = *++regs; // Inner and outer loop indexes.
11798 Rj = *++regs;
11799
11800 Rhi_ab = *++regs; // Product registers: low and high parts
11801 Rlo_ab = *++regs; // of a*b and m*n.
11802 Rhi_mn = *++regs;
11803 Rlo_mn = *++regs;
11804
11805 // r19 and up are callee-saved.
11806 _toSave = RegSet::range(r19, *regs) + Pm_base;
11807 }
11808
11809 private:
11810 void save_regs() {
11811 push(_toSave, sp);
11812 }
11813
11814 void restore_regs() {
11815 pop(_toSave, sp);
11816 }
11817
11818 template <typename T>
11819 void unroll_2(Register count, T block) {
11820 Label loop, end, odd;
11821 tbnz(count, 0, odd);
11822 cbz(count, end);
11823 align(16);
11824 bind(loop);
11825 (this->*block)();
11826 bind(odd);
11827 (this->*block)();
11828 subs(count, count, 2);
11829 br(Assembler::GT, loop);
11830 bind(end);
11831 }
11832
11833 template <typename T>
11834 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
11835 Label loop, end, odd;
11836 tbnz(count, 0, odd);
11837 cbz(count, end);
11838 align(16);
11839 bind(loop);
11840 (this->*block)(d, s, tmp);
11841 bind(odd);
11842 (this->*block)(d, s, tmp);
11843 subs(count, count, 2);
11844 br(Assembler::GT, loop);
11845 bind(end);
11846 }
11847
11848 void pre1(RegisterOrConstant i) {
11849 block_comment("pre1");
11850 // Pa = Pa_base;
11851 // Pb = Pb_base + i;
11852 // Pm = Pm_base;
11853 // Pn = Pn_base + i;
11854 // Ra = *Pa;
11855 // Rb = *Pb;
11856 // Rm = *Pm;
11857 // Rn = *Pn;
11858 ldr(Ra, Address(Pa_base));
11859 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11860 ldr(Rm, Address(Pm_base));
11861 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11862 lea(Pa, Address(Pa_base));
11863 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11864 lea(Pm, Address(Pm_base));
11865 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11866
11867 // Zero the m*n result.
11868 mov(Rhi_mn, zr);
11869 mov(Rlo_mn, zr);
11870 }
11871
11872 // The core multiply-accumulate step of a Montgomery
11873 // multiplication. The idea is to schedule operations as a
11874 // pipeline so that instructions with long latencies (loads and
11875 // multiplies) have time to complete before their results are
11876   // used. This mostly benefits in-order implementations of the
11877   // architecture, but out-of-order ones also benefit.
11878 void step() {
11879 block_comment("step");
11880 // MACC(Ra, Rb, t0, t1, t2);
11881 // Ra = *++Pa;
11882 // Rb = *--Pb;
11883 umulh(Rhi_ab, Ra, Rb);
11884 mul(Rlo_ab, Ra, Rb);
11885 ldr(Ra, pre(Pa, wordSize));
11886 ldr(Rb, pre(Pb, -wordSize));
11887 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11888 // previous iteration.
11889 // MACC(Rm, Rn, t0, t1, t2);
11890 // Rm = *++Pm;
11891 // Rn = *--Pn;
11892 umulh(Rhi_mn, Rm, Rn);
11893 mul(Rlo_mn, Rm, Rn);
11894 ldr(Rm, pre(Pm, wordSize));
11895 ldr(Rn, pre(Pn, -wordSize));
11896 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11897 }
11898
11899 void post1() {
11900 block_comment("post1");
11901
11902 // MACC(Ra, Rb, t0, t1, t2);
11903 // Ra = *++Pa;
11904 // Rb = *--Pb;
11905 umulh(Rhi_ab, Ra, Rb);
11906 mul(Rlo_ab, Ra, Rb);
11907 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11908 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11909
11910 // *Pm = Rm = t0 * inv;
11911 mul(Rm, t0, inv);
11912 str(Rm, Address(Pm));
11913
11914 // MACC(Rm, Rn, t0, t1, t2);
11915 // t0 = t1; t1 = t2; t2 = 0;
11916 umulh(Rhi_mn, Rm, Rn);
11917
11918 #ifndef PRODUCT
11919 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11920 {
11921 mul(Rlo_mn, Rm, Rn);
11922 add(Rlo_mn, t0, Rlo_mn);
11923 Label ok;
11924 cbz(Rlo_mn, ok); {
11925 stop("broken Montgomery multiply");
11926 } bind(ok);
11927 }
11928 #endif
11929 // We have very carefully set things up so that
11930 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11931 // the lower half of Rm * Rn because we know the result already:
11932 // it must be -t0. t0 + (-t0) must generate a carry iff
11933 // t0 != 0. So, rather than do a mul and an adds we just set
11934 // the carry flag iff t0 is nonzero.
11935 //
11936 // mul(Rlo_mn, Rm, Rn);
11937 // adds(zr, t0, Rlo_mn);
11938 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11939 adcs(t0, t1, Rhi_mn);
11940 adc(t1, t2, zr);
11941 mov(t2, zr);
11942 }
11943
11944 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11945 block_comment("pre2");
11946 // Pa = Pa_base + i-len;
11947 // Pb = Pb_base + len;
11948 // Pm = Pm_base + i-len;
11949 // Pn = Pn_base + len;
11950
11951 if (i.is_register()) {
11952 sub(Rj, i.as_register(), len);
11953 } else {
11954 mov(Rj, i.as_constant());
11955 sub(Rj, Rj, len);
11956 }
11957 // Rj == i-len
11958
11959 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11960 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11961 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11962 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11963
11964 // Ra = *++Pa;
11965 // Rb = *--Pb;
11966 // Rm = *++Pm;
11967 // Rn = *--Pn;
11968 ldr(Ra, pre(Pa, wordSize));
11969 ldr(Rb, pre(Pb, -wordSize));
11970 ldr(Rm, pre(Pm, wordSize));
11971 ldr(Rn, pre(Pn, -wordSize));
11972
11973 mov(Rhi_mn, zr);
11974 mov(Rlo_mn, zr);
11975 }
11976
11977 void post2(RegisterOrConstant i, RegisterOrConstant len) {
11978 block_comment("post2");
11979 if (i.is_constant()) {
11980 mov(Rj, i.as_constant()-len.as_constant());
11981 } else {
11982 sub(Rj, i.as_register(), len);
11983 }
11984
11985 adds(t0, t0, Rlo_mn); // The pending m*n, low part
11986
11987 // As soon as we know the least significant digit of our result,
11988 // store it.
11989 // Pm_base[i-len] = t0;
11990 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11991
11992 // t0 = t1; t1 = t2; t2 = 0;
11993 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11994 adc(t1, t2, zr);
11995 mov(t2, zr);
11996 }
11997
11998 // A carry in t0 after Montgomery multiplication means that we
11999 // should subtract multiples of n from our result in m. We'll
12000 // keep doing that until there is no carry.
12001 void normalize(RegisterOrConstant len) {
12002 block_comment("normalize");
12003 // while (t0)
12004 // t0 = sub(Pm_base, Pn_base, t0, len);
12005 Label loop, post, again;
12006 Register cnt = t1, i = t2; // Re-use registers; we're done with them now
12007 cbz(t0, post); {
12008 bind(again); {
12009 mov(i, zr);
12010 mov(cnt, len);
12011 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
12012 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
12013 subs(zr, zr, zr); // set carry flag, i.e. no borrow
12014 align(16);
12015 bind(loop); {
12016 sbcs(Rm, Rm, Rn);
12017 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
12018 add(i, i, 1);
12019 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
12020 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
12021 sub(cnt, cnt, 1);
12022 } cbnz(cnt, loop);
12023 sbc(t0, t0, zr);
12024 } cbnz(t0, again);
12025 } bind(post);
12026 }
12027
12028 // Move memory at s to d, reversing words.
12029 // Increments d to end of copied memory
12030 // Destroys tmp1, tmp2
12031 // Preserves len
12032 // Leaves s pointing to the address which was in d at start
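  // In C, approximately (illustrative only; swap32 stands for swapping the
  // two 32-bit halves of a 64-bit word):
  //
  //   for (int i = 0; i < len; i++)
  //     d[i] = swap32(s[len - 1 - i]);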
12033 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
12034 assert(tmp1->encoding() < r19->encoding(), "register corruption");
12035 assert(tmp2->encoding() < r19->encoding(), "register corruption");
12036
12037 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
12038 mov(tmp1, len);
12039 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
12040 sub(s, d, len, ext::uxtw, LogBytesPerWord);
12041 }
12042 // where
12043 void reverse1(Register d, Register s, Register tmp) {
12044 ldr(tmp, pre(s, -wordSize));
12045 ror(tmp, tmp, 32);
12046 str(tmp, post(d, wordSize));
12047 }
12048
12049 void step_squaring() {
12050 // An extra ACC
12051 step();
12052 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12053 }
12054
12055 void last_squaring(RegisterOrConstant i) {
12056 Label dont;
12057 // if ((i & 1) == 0) {
12058 tbnz(i.as_register(), 0, dont); {
12059 // MACC(Ra, Rb, t0, t1, t2);
12060 // Ra = *++Pa;
12061 // Rb = *--Pb;
12062 umulh(Rhi_ab, Ra, Rb);
12063 mul(Rlo_ab, Ra, Rb);
12064 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12065 } bind(dont);
12066 }
12067
12068 void extra_step_squaring() {
12069 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
12070
12071 // MACC(Rm, Rn, t0, t1, t2);
12072 // Rm = *++Pm;
12073 // Rn = *--Pn;
12074 umulh(Rhi_mn, Rm, Rn);
12075 mul(Rlo_mn, Rm, Rn);
12076 ldr(Rm, pre(Pm, wordSize));
12077 ldr(Rn, pre(Pn, -wordSize));
12078 }
12079
12080 void post1_squaring() {
12081 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
12082
12083 // *Pm = Rm = t0 * inv;
12084 mul(Rm, t0, inv);
12085 str(Rm, Address(Pm));
12086
12087 // MACC(Rm, Rn, t0, t1, t2);
12088 // t0 = t1; t1 = t2; t2 = 0;
12089 umulh(Rhi_mn, Rm, Rn);
12090
12091 #ifndef PRODUCT
12092 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
12093 {
12094 mul(Rlo_mn, Rm, Rn);
12095 add(Rlo_mn, t0, Rlo_mn);
12096 Label ok;
12097 cbz(Rlo_mn, ok); {
12098 stop("broken Montgomery multiply");
12099 } bind(ok);
12100 }
12101 #endif
12102 // We have very carefully set things up so that
12103 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
12104 // the lower half of Rm * Rn because we know the result already:
12105 // it must be -t0. t0 + (-t0) must generate a carry iff
12106 // t0 != 0. So, rather than do a mul and an adds we just set
12107 // the carry flag iff t0 is nonzero.
12108 //
12109 // mul(Rlo_mn, Rm, Rn);
12110 // adds(zr, t0, Rlo_mn);
12111 subs(zr, t0, 1); // Set carry iff t0 is nonzero
12112 adcs(t0, t1, Rhi_mn);
12113 adc(t1, t2, zr);
12114 mov(t2, zr);
12115 }
12116
12117 void acc(Register Rhi, Register Rlo,
12118 Register t0, Register t1, Register t2) {
12119 adds(t0, t0, Rlo);
12120 adcs(t1, t1, Rhi);
12121 adc(t2, t2, zr);
12122 }
12123
12124 public:
12125 /**
12126 * Fast Montgomery multiplication. The derivation of the
12127 * algorithm is in A Cryptographic Library for the Motorola
12128 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
12129 *
12130 * Arguments:
12131 *
12132 * Inputs for multiplication:
12133 * c_rarg0 - int array elements a
12134 * c_rarg1 - int array elements b
12135 * c_rarg2 - int array elements n (the modulus)
12136 * c_rarg3 - int length
12137 * c_rarg4 - int inv
12138 * c_rarg5 - int array elements m (the result)
12139 *
12140 * Inputs for squaring:
12141 * c_rarg0 - int array elements a
12142 * c_rarg1 - int array elements n (the modulus)
12143 * c_rarg2 - int length
12144 * c_rarg3 - int inv
12145 * c_rarg4 - int array elements m (the result)
12146 *
12147 */
12148 address generate_multiply() {
12149 Label argh, nothing;
12150
12151 align(CodeEntryAlignment);
12152 address entry = pc();
12153
12154 cbzw(Rlen, nothing);
12155
12156 enter();
12157
12158 // Make room.
12159 cmpw(Rlen, 512);
12160 br(Assembler::HI, argh);
12161 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
12162 andr(sp, Ra, -2 * wordSize);
12163
12164 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
12165
12166 {
12167 // Copy input args, reversing as we go. We use Ra as a
12168 // temporary variable.
12169 reverse(Ra, Pa_base, Rlen, t0, t1);
12170 if (!_squaring)
12171 reverse(Ra, Pb_base, Rlen, t0, t1);
12172 reverse(Ra, Pn_base, Rlen, t0, t1);
12173 }
12174
12175 // Push all call-saved registers and also Pm_base which we'll need
12176 // at the end.
12177 save_regs();
12178
12179 #ifndef PRODUCT
12180 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
12181 {
12182 ldr(Rn, Address(Pn_base, 0));
12183 mul(Rlo_mn, Rn, inv);
12184 subs(zr, Rlo_mn, -1);
12185 Label ok;
12186 br(EQ, ok); {
12187 stop("broken inverse in Montgomery multiply");
12188 } bind(ok);
12189 }
12190 #endif
12191
12192 mov(Pm_base, Ra);
12193
12194 mov(t0, zr);
12195 mov(t1, zr);
12196 mov(t2, zr);
12197
12198 block_comment("for (int i = 0; i < len; i++) {");
12199 mov(Ri, zr); {
12200 Label loop, end;
12201 cmpw(Ri, Rlen);
12202 br(Assembler::GE, end);
12203
12204 bind(loop);
12205 pre1(Ri);
12206
12207 block_comment(" for (j = i; j; j--) {"); {
12208 movw(Rj, Ri);
12209 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12210 } block_comment(" } // j");
12211
12212 post1();
12213 addw(Ri, Ri, 1);
12214 cmpw(Ri, Rlen);
12215 br(Assembler::LT, loop);
12216 bind(end);
12217 block_comment("} // i");
12218 }
12219
12220 block_comment("for (int i = len; i < 2*len; i++) {");
12221 mov(Ri, Rlen); {
12222 Label loop, end;
12223 cmpw(Ri, Rlen, Assembler::LSL, 1);
12224 br(Assembler::GE, end);
12225
12226 bind(loop);
12227 pre2(Ri, Rlen);
12228
12229 block_comment(" for (j = len*2-i-1; j; j--) {"); {
12230 lslw(Rj, Rlen, 1);
12231 subw(Rj, Rj, Ri);
12232 subw(Rj, Rj, 1);
12233 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12234 } block_comment(" } // j");
12235
12236 post2(Ri, Rlen);
12237 addw(Ri, Ri, 1);
12238 cmpw(Ri, Rlen, Assembler::LSL, 1);
12239 br(Assembler::LT, loop);
12240 bind(end);
12241 }
12242 block_comment("} // i");
12243
12244 normalize(Rlen);
12245
12246 mov(Ra, Pm_base); // Save Pm_base in Ra
12247 restore_regs(); // Restore caller's Pm_base
12248
12249 // Copy our result into caller's Pm_base
12250 reverse(Pm_base, Ra, Rlen, t0, t1);
12251
12252 leave();
12253 bind(nothing);
12254 ret(lr);
12255
12256 // handler for error case
12257 bind(argh);
12258 stop("MontgomeryMultiply total_allocation must be <= 8192");
12259
12260 return entry;
12261 }
12262 // In C, approximately:
12263
12264 // void
12265 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
12266 // julong Pn_base[], julong Pm_base[],
12267 // julong inv, int len) {
12268 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12269 // julong *Pa, *Pb, *Pn, *Pm;
12270 // julong Ra, Rb, Rn, Rm;
12271
12272 // int i;
12273
12274 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12275
12276 // for (i = 0; i < len; i++) {
12277 // int j;
12278
12279 // Pa = Pa_base;
12280 // Pb = Pb_base + i;
12281 // Pm = Pm_base;
12282 // Pn = Pn_base + i;
12283
12284 // Ra = *Pa;
12285 // Rb = *Pb;
12286 // Rm = *Pm;
12287 // Rn = *Pn;
12288
12289 // int iters = i;
12290 // for (j = 0; iters--; j++) {
12291 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12292 // MACC(Ra, Rb, t0, t1, t2);
12293 // Ra = *++Pa;
12294 // Rb = *--Pb;
12295 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12296 // MACC(Rm, Rn, t0, t1, t2);
12297 // Rm = *++Pm;
12298 // Rn = *--Pn;
12299 // }
12300
12301 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
12302 // MACC(Ra, Rb, t0, t1, t2);
12303 // *Pm = Rm = t0 * inv;
12304 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12305 // MACC(Rm, Rn, t0, t1, t2);
12306
12307 // assert(t0 == 0, "broken Montgomery multiply");
12308
12309 // t0 = t1; t1 = t2; t2 = 0;
12310 // }
12311
12312 // for (i = len; i < 2*len; i++) {
12313 // int j;
12314
12315 // Pa = Pa_base + i-len;
12316 // Pb = Pb_base + len;
12317 // Pm = Pm_base + i-len;
12318 // Pn = Pn_base + len;
12319
12320 // Ra = *++Pa;
12321 // Rb = *--Pb;
12322 // Rm = *++Pm;
12323 // Rn = *--Pn;
12324
12325 // int iters = len*2-i-1;
12326 // for (j = i-len+1; iters--; j++) {
12327 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12328 // MACC(Ra, Rb, t0, t1, t2);
12329 // Ra = *++Pa;
12330 // Rb = *--Pb;
12331 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12332 // MACC(Rm, Rn, t0, t1, t2);
12333 // Rm = *++Pm;
12334 // Rn = *--Pn;
12335 // }
12336
12337 // Pm_base[i-len] = t0;
12338 // t0 = t1; t1 = t2; t2 = 0;
12339 // }
12340
12341 // while (t0)
12342 // t0 = sub(Pm_base, Pn_base, t0, len);
12343 // }
12344
12345 /**
12346 * Fast Montgomery squaring. This uses asymptotically 25% fewer
12347 * multiplies than Montgomery multiplication so it should be up to
12348 * 25% faster. However, its loop control is more complex and it
12349 * may actually run slower on some machines.
12350 *
12351 * Arguments:
12352 *
12353 * Inputs:
12354 * c_rarg0 - int array elements a
12355 * c_rarg1 - int array elements n (the modulus)
12356 * c_rarg2 - int length
12357 * c_rarg3 - int inv
12358 * c_rarg4 - int array elements m (the result)
12359 *
12360 */
12361 address generate_square() {
12362 Label argh;
12363
12364 align(CodeEntryAlignment);
12365 address entry = pc();
12366
12367 enter();
12368
12369 // Make room.
12370 cmpw(Rlen, 512);
12371 br(Assembler::HI, argh);
12372 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
12373 andr(sp, Ra, -2 * wordSize);
12374
12375 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
12376
12377 {
12378 // Copy input args, reversing as we go. We use Ra as a
12379 // temporary variable.
12380 reverse(Ra, Pa_base, Rlen, t0, t1);
12381 reverse(Ra, Pn_base, Rlen, t0, t1);
12382 }
12383
12384 // Push all call-saved registers and also Pm_base which we'll need
12385 // at the end.
12386 save_regs();
12387
12388 mov(Pm_base, Ra);
12389
12390 mov(t0, zr);
12391 mov(t1, zr);
12392 mov(t2, zr);
12393
12394 block_comment("for (int i = 0; i < len; i++) {");
12395 mov(Ri, zr); {
12396 Label loop, end;
12397 bind(loop);
12398 cmp(Ri, Rlen);
12399 br(Assembler::GE, end);
12400
12401 pre1(Ri);
12402
12403 block_comment("for (j = (i+1)/2; j; j--) {"); {
12404 add(Rj, Ri, 1);
12405 lsr(Rj, Rj, 1);
12406 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12407 } block_comment(" } // j");
12408
12409 last_squaring(Ri);
12410
12411 block_comment(" for (j = i/2; j; j--) {"); {
12412 lsr(Rj, Ri, 1);
12413 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12414 } block_comment(" } // j");
12415
12416 post1_squaring();
12417 add(Ri, Ri, 1);
12418 cmp(Ri, Rlen);
12419 br(Assembler::LT, loop);
12420
12421 bind(end);
12422 block_comment("} // i");
12423 }
12424
12425 block_comment("for (int i = len; i < 2*len; i++) {");
12426 mov(Ri, Rlen); {
12427 Label loop, end;
12428 bind(loop);
12429 cmp(Ri, Rlen, Assembler::LSL, 1);
12430 br(Assembler::GE, end);
12431
12432 pre2(Ri, Rlen);
12433
12434 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
12435 lsl(Rj, Rlen, 1);
12436 sub(Rj, Rj, Ri);
12437 sub(Rj, Rj, 1);
12438 lsr(Rj, Rj, 1);
12439 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12440 } block_comment(" } // j");
12441
12442 last_squaring(Ri);
12443
12444 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
12445 lsl(Rj, Rlen, 1);
12446 sub(Rj, Rj, Ri);
12447 lsr(Rj, Rj, 1);
12448 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12449 } block_comment(" } // j");
12450
12451 post2(Ri, Rlen);
12452 add(Ri, Ri, 1);
12453 cmp(Ri, Rlen, Assembler::LSL, 1);
12454
12455 br(Assembler::LT, loop);
12456 bind(end);
12457 block_comment("} // i");
12458 }
12459
12460 normalize(Rlen);
12461
12462 mov(Ra, Pm_base); // Save Pm_base in Ra
12463 restore_regs(); // Restore caller's Pm_base
12464
12465 // Copy our result into caller's Pm_base
12466 reverse(Pm_base, Ra, Rlen, t0, t1);
12467
12468 leave();
12469 ret(lr);
12470
12471 // handler for error case
12472 bind(argh);
12473 stop("MontgomeryMultiply total_allocation must be <= 8192");
12474
12475 return entry;
12476 }
12477 // In C, approximately:
12478
12479 // void
12480 // montgomery_square(julong Pa_base[], julong Pn_base[],
12481 // julong Pm_base[], julong inv, int len) {
12482 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12483 // julong *Pa, *Pb, *Pn, *Pm;
12484 // julong Ra, Rb, Rn, Rm;
12485
12486 // int i;
12487
12488 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12489
12490 // for (i = 0; i < len; i++) {
12491 // int j;
12492
12493 // Pa = Pa_base;
12494 // Pb = Pa_base + i;
12495 // Pm = Pm_base;
12496 // Pn = Pn_base + i;
12497
12498 // Ra = *Pa;
12499 // Rb = *Pb;
12500 // Rm = *Pm;
12501 // Rn = *Pn;
12502
12503 // int iters = (i+1)/2;
12504 // for (j = 0; iters--; j++) {
12505 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12506 // MACC2(Ra, Rb, t0, t1, t2);
12507 // Ra = *++Pa;
12508 // Rb = *--Pb;
12509 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12510 // MACC(Rm, Rn, t0, t1, t2);
12511 // Rm = *++Pm;
12512 // Rn = *--Pn;
12513 // }
12514 // if ((i & 1) == 0) {
12515 // assert(Ra == Pa_base[j], "must be");
12516 // MACC(Ra, Ra, t0, t1, t2);
12517 // }
12518 // iters = i/2;
12519 // assert(iters == i-j, "must be");
12520 // for (; iters--; j++) {
12521 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12522 // MACC(Rm, Rn, t0, t1, t2);
12523 // Rm = *++Pm;
12524 // Rn = *--Pn;
12525 // }
12526
12527 // *Pm = Rm = t0 * inv;
12528 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12529 // MACC(Rm, Rn, t0, t1, t2);
12530
12531 // assert(t0 == 0, "broken Montgomery multiply");
12532
12533 // t0 = t1; t1 = t2; t2 = 0;
12534 // }
12535
12536 // for (i = len; i < 2*len; i++) {
12537 // int start = i-len+1;
12538 // int end = start + (len - start)/2;
12539 // int j;
12540
12541 // Pa = Pa_base + i-len;
12542 // Pb = Pa_base + len;
12543 // Pm = Pm_base + i-len;
12544 // Pn = Pn_base + len;
12545
12546 // Ra = *++Pa;
12547 // Rb = *--Pb;
12548 // Rm = *++Pm;
12549 // Rn = *--Pn;
12550
12551 // int iters = (2*len-i-1)/2;
12552 // assert(iters == end-start, "must be");
12553 // for (j = start; iters--; j++) {
12554 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12555 // MACC2(Ra, Rb, t0, t1, t2);
12556 // Ra = *++Pa;
12557 // Rb = *--Pb;
12558 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12559 // MACC(Rm, Rn, t0, t1, t2);
12560 // Rm = *++Pm;
12561 // Rn = *--Pn;
12562 // }
12563 // if ((i & 1) == 0) {
12564 // assert(Ra == Pa_base[j], "must be");
12565 // MACC(Ra, Ra, t0, t1, t2);
12566 // }
12567 // iters = (2*len-i)/2;
12568 // assert(iters == len-j, "must be");
12569 // for (; iters--; j++) {
12570 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12571 // MACC(Rm, Rn, t0, t1, t2);
12572 // Rm = *++Pm;
12573 // Rn = *--Pn;
12574 // }
12575 // Pm_base[i-len] = t0;
12576 // t0 = t1; t1 = t2; t2 = 0;
12577 // }
12578
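  //   // Final Montgomery correction: while a carry remains in t0,
  //   // subtract the modulus (Pn) once more from the result (Pm);
  //   // sub() is assumed to return the updated carry.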
12579 // while (t0)
12580 // t0 = sub(Pm_base, Pn_base, t0, len);
12581 // }
12582 };
12583
  // Called from the interpreter or from compiled code either to load the
  // multiple returned values from the inline type instance being returned
  // into registers, or to store the returned values into a newly
  // allocated inline type instance.
12588 address generate_return_value_stub(address destination, const char* name, bool has_res) {
    // We need to save all registers the calling convention may use so
    // that the runtime call can read or update those registers. This needs to
12591 // be in sync with SharedRuntime::java_return_convention().
12592 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
12593 enum layout {
12594 j_rarg7_off = 0, j_rarg7_2, // j_rarg7 is r0
12595 j_rarg6_off, j_rarg6_2,
12596 j_rarg5_off, j_rarg5_2,
12597 j_rarg4_off, j_rarg4_2,
12598 j_rarg3_off, j_rarg3_2,
12599 j_rarg2_off, j_rarg2_2,
12600 j_rarg1_off, j_rarg1_2,
12601 j_rarg0_off, j_rarg0_2,
12602
12603 j_farg7_off, j_farg7_2,
12604 j_farg6_off, j_farg6_2,
12605 j_farg5_off, j_farg5_2,
12606 j_farg4_off, j_farg4_2,
12607 j_farg3_off, j_farg3_2,
12608 j_farg2_off, j_farg2_2,
12609 j_farg1_off, j_farg1_2,
12610 j_farg0_off, j_farg0_2,
12611
12612 rfp_off, rfp_off2,
12613 return_off, return_off2,
12614
12615 framesize // inclusive of return address
12616 };
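    // Each *_off/*_2 pair above reserves two 32-bit stack slots (one
    // 64-bit word), so the layout runs, from the lowest slot upwards,
    // through the eight Java integer argument registers, the eight FP
    // argument registers, and finally the saved rfp and the return
    // address; framesize counts all of these slots.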
12617
12618 CodeBuffer code(name, 512, 64);
12619 MacroAssembler* masm = new MacroAssembler(&code);
12620
12621 int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
12622 assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
12623 int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
12624 int frame_size_in_words = frame_size_in_bytes / wordSize;
12625
12626 OopMapSet* oop_maps = new OopMapSet();
12627 OopMap* map = new OopMap(frame_size_in_slots, 0);
12628
12629 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
12630 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
12631 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
12632 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
12633 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
12634 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
12635 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
12636 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());
12637
12638 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
12639 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
12640 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
12641 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
12642 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
12643 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
12644 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
12645 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());
12646
12647 address start = __ pc();
12648
12649 __ enter(); // Save FP and LR before call
12650
12651 __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
12652 __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
12653 __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
12654 __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));
12655
12656 __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
12657 __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
12658 __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
12659 __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));
12660
12661 int frame_complete = __ offset();
12662
12663 // Set up last_Java_sp and last_Java_fp
12664 address the_pc = __ pc();
12665 __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
12666
    // Call runtime: pass the current thread in c_rarg0 and r0 (the Java
    // return value / inline type oop) in c_rarg1.
12668 __ mov(c_rarg1, r0);
12669 __ mov(c_rarg0, rthread);
12670
12671 __ mov(rscratch1, destination);
12672 __ blr(rscratch1);
12673
12674 oop_maps->add_gc_map(the_pc - start, map);
12675
12676 __ reset_last_Java_frame(false);
12677
12678 __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
12679 __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
12680 __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
12681 __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));
12682
12683 __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
12684 __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
12685 __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
12686 __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));
12687
12688 __ leave();
12689
12690 // check for pending exceptions
12691 Label pending;
12692 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
12693 __ cbnz(rscratch1, pending);
12694
12695 if (has_res) {
12696 __ get_vm_result_oop(r0, rthread);
12697 }
12698
12699 __ ret(lr);
12700
12701 __ bind(pending);
12702 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
12703
12704 // -------------
12705 // make sure all code is generated
12706 masm->flush();
12707
12708 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
12709 return stub->entry_point();
12710 }
12711
12712 // Initialization
12713 void generate_preuniverse_stubs() {
12714 // preuniverse stubs are not needed for aarch64
12715 }
12716
12717 void generate_initial_stubs() {
    // Generate initial stubs and initialize the entry points
12719
    // entry points that exist in all platforms. Note: This is code
12721 // that could be shared among different platforms - however the
12722 // benefit seems to be smaller than the disadvantage of having a
12723 // much more complicated generator structure. See also comment in
12724 // stubRoutines.hpp.
12725
12726 StubRoutines::_forward_exception_entry = generate_forward_exception();
12727
12728 StubRoutines::_call_stub_entry =
12729 generate_call_stub(StubRoutines::_call_stub_return_address);
12730
12731 // is referenced by megamorphic call
12732 StubRoutines::_catch_exception_entry = generate_catch_exception();
12733
12734 // Initialize table for copy memory (arraycopy) check.
12735 if (UnsafeMemoryAccess::_table == nullptr) {
12736 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
12737 }
12738
12739 if (UseCRC32Intrinsics) {
12740 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
12741 }
12742
12743 if (UseCRC32CIntrinsics) {
12744 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
12745 }
12746
12747 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
12748 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
12749 }
12750
12751 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
12752 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
12753 }
12754
12755 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
12756 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
12757 StubRoutines::_hf2f = generate_float16ToFloat();
12758 StubRoutines::_f2hf = generate_floatToFloat16();
12759 }
12760
12761 if (InlineTypeReturnedAsFields) {
12762 StubRoutines::_load_inline_type_fields_in_regs =
12763 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
12764 StubRoutines::_store_inline_type_fields_to_buf =
12765 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
12766 }
12767
12768 }
12769
12770 void generate_continuation_stubs() {
12771 // Continuation stubs:
12772 StubRoutines::_cont_thaw = generate_cont_thaw();
12773 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
12774 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
12775 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
12776 }
12777
12778 void generate_final_stubs() {
12779 // support for verify_oop (must happen after universe_init)
12780 if (VerifyOops) {
12781 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
12782 }
12783
12784 // arraycopy stubs used by compilers
12785 generate_arraycopy_stubs();
12786
12787 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
12788
12789 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
12790
12791 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
12792 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
12793
12794 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
12795
12796 generate_atomic_entry_points();
12797
#endif // LINUX && !__ARM_FEATURE_ATOMICS
12799
12800 #ifdef COMPILER2
12801 if (UseSecondarySupersTable) {
12802 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
12803 if (! InlineSecondarySupersTest) {
12804 generate_lookup_secondary_supers_table_stub();
12805 }
12806 }
#endif // COMPILER2
12808
12809 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
12810 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
12811 }
12812
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
12814 }
12815
12816 void generate_compiler_stubs() {
12817 #if COMPILER2_OR_JVMCI
12818
12819 if (UseSVE == 0) {
12820 generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
12821 }
12822
12823 // array equals stub for large arrays.
12824 if (!UseSimpleArrayEquals) {
12825 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
12826 }
12827
12828 // arrays_hascode stub for large arrays.
12829 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
12830 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
12831 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
12832 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
12833 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
12834
12835 // byte_array_inflate stub for large arrays.
12836 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
12837
12838 // countPositives stub for large arrays.
12839 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
12840
12841 generate_compare_long_strings();
12842
12843 generate_string_indexof_stubs();
12844
12845 #ifdef COMPILER2
12846 if (UseMultiplyToLenIntrinsic) {
12847 StubRoutines::_multiplyToLen = generate_multiplyToLen();
12848 }
12849
12850 if (UseSquareToLenIntrinsic) {
12851 StubRoutines::_squareToLen = generate_squareToLen();
12852 }
12853
12854 if (UseMulAddIntrinsic) {
12855 StubRoutines::_mulAdd = generate_mulAdd();
12856 }
12857
12858 if (UseSIMDForBigIntegerShiftIntrinsics) {
12859 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
12860 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
12861 }
12862
12863 if (UseMontgomeryMultiplyIntrinsic) {
12864 StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
12865 address start = load_archive_data(stub_id);
12866 if (start == nullptr) {
12867 // we have to generate it
12868 StubCodeMark mark(this, stub_id);
12869 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
12870 start = g.generate_multiply();
12871 // record the stub start and end
12872 store_archive_data(stub_id, start, _masm->pc());
12873 }
12874 StubRoutines::_montgomeryMultiply = start;
12875 }
12876
12877 if (UseMontgomerySquareIntrinsic) {
12878 StubId stub_id = StubId::stubgen_montgomerySquare_id;
12879 address start = load_archive_data(stub_id);
12880 if (start == nullptr) {
12881 // we have to generate it
12882 StubCodeMark mark(this, stub_id);
12883 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
12884 // We use generate_multiply() rather than generate_square()
12885 // because it's faster for the sizes of modulus we care about.
12886 start = g.generate_multiply();
12887 // record the stub start and end
12888 store_archive_data(stub_id, start, _masm->pc());
12889 }
12890 StubRoutines::_montgomerySquare = start;
12891 }
12892
12893 #endif // COMPILER2
12894
12895 if (UseChaCha20Intrinsics) {
12896 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
12897 }
12898
12899 if (UseKyberIntrinsics) {
12900 StubRoutines::_kyberNtt = generate_kyberNtt();
12901 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
12902 StubRoutines::_kyberNttMult = generate_kyberNttMult();
12903 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
12904 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
12905 StubRoutines::_kyber12To16 = generate_kyber12To16();
12906 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
12907 }
12908
12909 if (UseDilithiumIntrinsics) {
12910 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
12911 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
12912 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
12913 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
12914 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
12915 }
12916
12917 if (UseBASE64Intrinsics) {
12918 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
12919 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
12920 }
12921
12922 // data cache line writeback
12923 StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
12924 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
12925
12926 if (UseAESIntrinsics) {
12927 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
12928 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
12929 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
12930 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
12931 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
12932 }
12933 if (UseGHASHIntrinsics) {
12934 // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
12935 StubRoutines::aarch64::_ghash_processBlocks_small = generate_ghash_processBlocks_small();
12936 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(StubRoutines::aarch64::_ghash_processBlocks_small);
12937 }
12938 if (UseAESIntrinsics && UseGHASHIntrinsics) {
12939 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
12940 }
12941
12942 if (UseMD5Intrinsics) {
12943 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
12944 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
12945 }
12946 if (UseSHA1Intrinsics) {
12947 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
12948 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
12949 }
12950 if (UseSHA256Intrinsics) {
12951 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
12952 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
12953 }
12954 if (UseSHA512Intrinsics) {
12955 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
12956 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
12957 }
12958 if (UseSHA3Intrinsics && UseSIMDForSHA3Intrinsic) {
12959 StubRoutines::_double_keccak = generate_double_keccak();
12960 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
12961 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
12962 } else if (UseSHA3Intrinsics) {
12963 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
12964 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
12965 }
12966
12967 if (UsePoly1305Intrinsics) {
12968 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
12969 }
12970
12971 // generate Adler32 intrinsics code
12972 if (UseAdler32Intrinsics) {
12973 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
12974 }
12975
12976 #endif // COMPILER2_OR_JVMCI
12977 }
12978
12979 public:
12980 StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
12981 switch(blob_id) {
12982 case BlobId::stubgen_preuniverse_id:
12983 generate_preuniverse_stubs();
12984 break;
12985 case BlobId::stubgen_initial_id:
12986 generate_initial_stubs();
12987 break;
12988 case BlobId::stubgen_continuation_id:
12989 generate_continuation_stubs();
12990 break;
12991 case BlobId::stubgen_compiler_id:
12992 generate_compiler_stubs();
12993 break;
12994 case BlobId::stubgen_final_id:
12995 generate_final_stubs();
12996 break;
12997 default:
12998 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12999 break;
13000 };
13001 }
13002
13003 #if INCLUDE_CDS
13004 static void init_AOTAddressTable(GrowableArray<address>& external_addresses) {
13005 // external data defined in this file
13006 #define ADD(addr) external_addresses.append((address)(addr));
13007 ADD(_sha256_round_consts);
13008 ADD(_sha512_round_consts);
13009 ADD(_sha3_round_consts);
13010 ADD(_double_keccak_round_consts);
13011 ADD(_encodeBlock_toBase64);
13012 ADD(_encodeBlock_toBase64URL);
13013 ADD(_decodeBlock_fromBase64ForNoSIMD);
13014 ADD(_decodeBlock_fromBase64URLForNoSIMD);
13015 ADD(_decodeBlock_fromBase64ForSIMD);
13016 ADD(_decodeBlock_fromBase64URLForSIMD);
13017 #undef ADD
13018 }
13019 #endif // INCLUDE_CDS
13020 }; // end class declaration
13021
13022 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
13023 StubGenerator g(code, blob_id, stub_data);
13024 }
13025
13026 #if INCLUDE_CDS
13027 void StubGenerator_init_AOTAddressTable(GrowableArray<address>& addresses) {
13028 StubGenerator::init_AOTAddressTable(addresses);
13029 }
13030 #endif // INCLUDE_CDS
13031
13032 #if defined (LINUX)
13033
13034 // Define pointers to atomic stubs and initialize them to point to the
13035 // code in atomic_aarch64.S.
13036
13037 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \
13038 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
13039 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
13040 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
13041 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
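
// For example, DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed) expands (roughly) to:
//
//   extern "C" uint64_t aarch64_atomic_cmpxchg_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_cmpxchg_4_relaxed_impl
//     = aarch64_atomic_cmpxchg_4_relaxed_default_impl;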
13042
13043 DEFAULT_ATOMIC_OP(fetch_add, 4, )
13044 DEFAULT_ATOMIC_OP(fetch_add, 8, )
13045 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
13046 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
13047 DEFAULT_ATOMIC_OP(xchg, 4, )
13048 DEFAULT_ATOMIC_OP(xchg, 8, )
13049 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
13050 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
13051 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
13052 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
13053 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
13054 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
13055 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
13056 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
13057 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
13058 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
13059
13060 #undef DEFAULT_ATOMIC_OP
13061
13062 #endif // LINUX