1 /*
2 * Copyright (c) 2003, 2025, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "compiler/oopMap.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/interpreter.hpp"
36 #include "memory/universe.hpp"
37 #include "nativeInst_aarch64.hpp"
38 #include "oops/instanceOop.hpp"
39 #include "oops/method.hpp"
40 #include "oops/objArrayKlass.hpp"
41 #include "oops/oop.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "prims/upcallLinker.hpp"
44 #include "runtime/arguments.hpp"
45 #include "runtime/atomicAccess.hpp"
46 #include "runtime/continuation.hpp"
47 #include "runtime/continuationEntry.inline.hpp"
48 #include "runtime/frame.inline.hpp"
49 #include "runtime/handles.inline.hpp"
50 #include "runtime/javaThread.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubCodeGenerator.hpp"
53 #include "runtime/stubRoutines.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/checkedCast.hpp"
56 #include "utilities/debug.hpp"
57 #include "utilities/globalDefinitions.hpp"
58 #include "utilities/intpow.hpp"
59 #include "utilities/powerOfTwo.hpp"
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_ZGC
64 #include "gc/z/zThreadLocalData.hpp"
65 #endif
66
67 // Declaration and definition of StubGenerator (no .hpp file).
68 // For a more detailed description of the stub routine structure
69 // see the comment in stubRoutines.hpp
70
71 #undef __
72 #define __ _masm->
73
74 #ifdef PRODUCT
75 #define BLOCK_COMMENT(str) /* nothing */
76 #else
77 #define BLOCK_COMMENT(str) __ block_comment(str)
78 #endif
79
80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
81
82 // Constant data definitions
83
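// The SHA round constant tables below hold the standard values from FIPS 180-4
// (for SHA-256/SHA-512, fractional parts of the cube roots of the first primes)
// and from FIPS 202 (the Keccak iota round constants used by SHA3).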
84 static const uint32_t _sha256_round_consts[64] = {
85 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
86 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
87 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
88 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
89 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
90 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
91 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
92 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
93 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
94 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
95 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
96 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
97 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
98 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
99 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
100 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
101 };
102
103 static const uint64_t _sha512_round_consts[80] = {
104 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
105 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
106 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
107 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
108 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
109 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
110 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
111 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
112 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
113 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
114 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
115 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
116 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
117 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
118 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
119 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
120 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
121 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
122 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
123 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
124 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
125 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
126 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
127 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
128 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
129 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
130 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
131 };
132
133 static const uint64_t _sha3_round_consts[24] = {
134 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
135 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
136 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
137 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
138 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
139 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
140 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
141 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
142 };
143
144 static const uint64_t _double_keccak_round_consts[24] = {
145 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
146 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
147 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
148 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
149 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
150 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
151 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
152 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
153 };
154
155 static const char _encodeBlock_toBase64[64] = {
156 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
157 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
158 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
159 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
160 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
161 };
162
163 static const char _encodeBlock_toBase64URL[64] = {
164 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
165 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
166 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
167 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
168 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
169 };
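
// For reference, a plain C sketch of the mapping the encode tables above
// implement for one 3-byte group (illustrative only; the intrinsic itself
// performs this in registers, scalar or SIMD):
//
//   uint8_t b0 = src[0], b1 = src[1], b2 = src[2];
//   dst[0] = toBase64[ b0 >> 2 ];
//   dst[1] = toBase64[ ((b0 & 0x03) << 4) | (b1 >> 4) ];
//   dst[2] = toBase64[ ((b1 & 0x0f) << 2) | (b2 >> 6) ];
//   dst[3] = toBase64[ b2 & 0x3f ];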
170
// Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
// except that the padding character '=' is also treated as an illegal value in this intrinsic.
// That is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
174 static const uint8_t _decodeBlock_fromBase64ForNoSIMD[256] = {
175 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
176 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
177 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
178 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
179 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
180 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
181 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
182 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
183 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
184 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
185 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
186 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
187 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
188 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
189 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
190 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
191 };
192
193 static const uint8_t _decodeBlock_fromBase64URLForNoSIMD[256] = {
194 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
195 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
196 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
197 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
198 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
199 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
200 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
201 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
202 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
203 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
204 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
205 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
206 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
207 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
208 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
209 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
210 };
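
// And a plain C sketch of how the non-SIMD decode tables above are meant to be
// used on one 4-character group (illustrative only -- 255u marks an illegal
// character, which the intrinsic must reject):
//
//   uint8_t v0 = _decodeBlock_fromBase64ForNoSIMD[src[0]];
//   uint8_t v1 = _decodeBlock_fromBase64ForNoSIMD[src[1]];
//   uint8_t v2 = _decodeBlock_fromBase64ForNoSIMD[src[2]];
//   uint8_t v3 = _decodeBlock_fromBase64ForNoSIMD[src[3]];
//   if ((v0 | v1 | v2 | v3) > 63) { /* illegal input */ }
//   dst[0] = (uint8_t)((v0 << 2) | (v1 >> 4));
//   dst[1] = (uint8_t)((v1 << 4) | (v2 >> 2));
//   dst[2] = (uint8_t)((v2 << 6) | v3);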
211
// A legal Base64 code value lies in the range [0, 127]. We need two table
// lookups with tbl/tbx and combine their results to get the decoded data.
// The 1st table vector lookup uses tbl: out-of-range indices are set to 0 in
// the destination. The 2nd table vector lookup uses tbx: out-of-range indices
// leave the destination unchanged. Input [64..126] is mapped to index
// [65, 127] in the second lookup. The value at index 64 is set to 0, so that
// we know we already obtained the decoded data with the 1st lookup.
219 static const uint8_t _decodeBlock_fromBase64ForSIMD[128] = {
220 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
221 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
222 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
223 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
224 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
225 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
226 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
227 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
228 };
229
230 static const uint8_t _decodeBlock_fromBase64URLForSIMD[128] = {
231 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
232 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
233 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
234 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
235 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
236 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
237 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
238 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
239 };
240
241
242 // Stub Code definitions
243
244 class StubGenerator: public StubCodeGenerator {
245 private:
246
247 #ifdef PRODUCT
248 #define inc_counter_np(counter) ((void)0)
249 #else
250 void inc_counter_np_(uint& counter) {
251 __ incrementw(ExternalAddress((address)&counter));
252 }
253 #define inc_counter_np(counter) \
254 BLOCK_COMMENT("inc_counter " #counter); \
255 inc_counter_np_(counter);
256 #endif
257
258 // Call stubs are used to call Java from C
259 //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
269 //
270 // There is no return from the stub itself as any Java result
271 // is written to result
272 //
273 // we save r30 (lr) as the return PC at the base of the frame and
274 // link r29 (fp) below it as the frame pointer installing sp (r31)
275 // into fp.
276 //
277 // we save r0-r7, which accounts for all the c arguments.
278 //
279 // TODO: strictly do we need to save them all? they are treated as
280 // volatile by C so could we omit saving the ones we are going to
281 // place in global registers (thread? method?) or those we only use
282 // during setup of the Java call?
283 //
284 // we don't need to save r8 which C uses as an indirect result location
285 // return register.
286 //
287 // we don't need to save r9-r15 which both C and Java treat as
288 // volatile
289 //
290 // we don't need to save r16-18 because Java does not use them
291 //
292 // we save r19-r28 which Java uses as scratch registers and C
293 // expects to be callee-save
294 //
295 // we save the bottom 64 bits of each value stored in v8-v15; it is
296 // the responsibility of the caller to preserve larger values.
297 //
298 // so the stub frame looks like this when we enter Java code
299 //
300 // [ return_from_Java ] <--- sp
301 // [ argument word n ]
302 // ...
303 // -29 [ argument word 1 ]
304 // -28 [ saved Floating-point Control Register ]
305 // -26 [ saved v15 ] <--- sp_after_call
306 // -25 [ saved v14 ]
307 // -24 [ saved v13 ]
308 // -23 [ saved v12 ]
309 // -22 [ saved v11 ]
310 // -21 [ saved v10 ]
311 // -20 [ saved v9 ]
312 // -19 [ saved v8 ]
313 // -18 [ saved r28 ]
314 // -17 [ saved r27 ]
315 // -16 [ saved r26 ]
316 // -15 [ saved r25 ]
317 // -14 [ saved r24 ]
318 // -13 [ saved r23 ]
319 // -12 [ saved r22 ]
320 // -11 [ saved r21 ]
321 // -10 [ saved r20 ]
322 // -9 [ saved r19 ]
323 // -8 [ call wrapper (r0) ]
324 // -7 [ result (r1) ]
325 // -6 [ result type (r2) ]
326 // -5 [ method (r3) ]
327 // -4 [ entry point (r4) ]
328 // -3 [ parameters (r5) ]
329 // -2 [ parameter size (r6) ]
330 // -1 [ thread (r7) ]
331 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
332 // 1 [ saved lr (r30) ]
333
334 // Call stub stack layout word offsets from fp
335 enum call_stub_layout {
336 sp_after_call_off = -28,
337
338 fpcr_off = sp_after_call_off,
339 d15_off = -26,
340 d13_off = -24,
341 d11_off = -22,
342 d9_off = -20,
343
344 r28_off = -18,
345 r26_off = -16,
346 r24_off = -14,
347 r22_off = -12,
348 r20_off = -10,
349 call_wrapper_off = -8,
350 result_off = -7,
351 result_type_off = -6,
352 method_off = -5,
353 entry_point_off = -4,
354 parameter_size_off = -2,
355 thread_off = -1,
356 fp_f = 0,
357 retaddr_off = 1,
358 };
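
  // For example, the saved thread pointer can later be reloaded with
  //   __ ldr(rscratch1, Address(rfp, thread_off * wordSize));
  // which is what the `thread` Address constant in generate_call_stub expresses.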
359
360 address generate_call_stub(address& return_address) {
361 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
362 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
363 "adjust this code");
364
365 StubId stub_id = StubId::stubgen_call_stub_id;
366 GrowableArray<address> entries;
367 int entry_count = StubInfo::entry_count(stub_id);
368 assert(entry_count == 2, "sanity check");
369 address start = load_archive_data(stub_id, &entries);
370 if (start != nullptr) {
371 assert(entries.length() == 1, "expected 1 extra entry");
372 return_address = entries.at(0);
373 return start;
374 }
375 StubCodeMark mark(this, stub_id);
376 start = __ pc();
377
378 const Address sp_after_call (rfp, sp_after_call_off * wordSize);
379
380 const Address fpcr_save (rfp, fpcr_off * wordSize);
381 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
382 const Address result (rfp, result_off * wordSize);
383 const Address result_type (rfp, result_type_off * wordSize);
384 const Address method (rfp, method_off * wordSize);
385 const Address entry_point (rfp, entry_point_off * wordSize);
386 const Address parameter_size(rfp, parameter_size_off * wordSize);
387
388 const Address thread (rfp, thread_off * wordSize);
389
390 const Address d15_save (rfp, d15_off * wordSize);
391 const Address d13_save (rfp, d13_off * wordSize);
392 const Address d11_save (rfp, d11_off * wordSize);
393 const Address d9_save (rfp, d9_off * wordSize);
394
395 const Address r28_save (rfp, r28_off * wordSize);
396 const Address r26_save (rfp, r26_off * wordSize);
397 const Address r24_save (rfp, r24_off * wordSize);
398 const Address r22_save (rfp, r22_off * wordSize);
399 const Address r20_save (rfp, r20_off * wordSize);
400
401 // stub code
402
403 address aarch64_entry = __ pc();
404
405 // set up frame and move sp to end of save area
406 __ enter();
407 __ sub(sp, rfp, -sp_after_call_off * wordSize);
408
409 // save register parameters and Java scratch/global registers
410 // n.b. we save thread even though it gets installed in
411 // rthread because we want to sanity check rthread later
412 __ str(c_rarg7, thread);
413 __ strw(c_rarg6, parameter_size);
414 __ stp(c_rarg4, c_rarg5, entry_point);
415 __ stp(c_rarg2, c_rarg3, result_type);
416 __ stp(c_rarg0, c_rarg1, call_wrapper);
417
418 __ stp(r20, r19, r20_save);
419 __ stp(r22, r21, r22_save);
420 __ stp(r24, r23, r24_save);
421 __ stp(r26, r25, r26_save);
422 __ stp(r28, r27, r28_save);
423
424 __ stpd(v9, v8, d9_save);
425 __ stpd(v11, v10, d11_save);
426 __ stpd(v13, v12, d13_save);
427 __ stpd(v15, v14, d15_save);
428
429 __ get_fpcr(rscratch1);
430 __ str(rscratch1, fpcr_save);
431 // Set FPCR to the state we need. We do want Round to Nearest. We
432 // don't want non-IEEE rounding modes or floating-point traps.
433 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
434 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
435 __ set_fpcr(rscratch1);
436
437 // install Java thread in global register now we have saved
438 // whatever value it held
439 __ mov(rthread, c_rarg7);
440 // And method
441 __ mov(rmethod, c_rarg3);
442
443 // set up the heapbase register
444 __ reinit_heapbase();
445
446 #ifdef ASSERT
447 // make sure we have no pending exceptions
448 {
449 Label L;
450 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
451 __ cmp(rscratch1, (u1)NULL_WORD);
452 __ br(Assembler::EQ, L);
453 __ stop("StubRoutines::call_stub: entered with pending exception");
454 __ BIND(L);
455 }
456 #endif
457 // pass parameters if any
458 __ mov(esp, sp);
459 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
460 __ andr(sp, rscratch1, -2 * wordSize);
461
462 BLOCK_COMMENT("pass parameters if any");
463 Label parameters_done;
464 // parameter count is still in c_rarg6
465 // and parameter pointer identifying param 1 is in c_rarg5
466 __ cbzw(c_rarg6, parameters_done);
467
468 address loop = __ pc();
469 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
470 __ subsw(c_rarg6, c_rarg6, 1);
471 __ push(rscratch1);
472 __ br(Assembler::GT, loop);
473
474 __ BIND(parameters_done);
475
    // call Java entry -- passing Method* and current sp
477 // rmethod: Method*
478 // r19_sender_sp: sender sp
479 BLOCK_COMMENT("call Java function");
480 __ mov(r19_sender_sp, sp);
481 __ blr(c_rarg4);
482
483 // we do this here because the notify will already have been done
484 // if we get to the next instruction via an exception
485 //
486 // n.b. adding this instruction here affects the calculation of
487 // whether or not a routine returns to the call stub (used when
488 // doing stack walks) since the normal test is to check the return
489 // pc against the address saved below. so we may need to allow for
490 // this extra instruction in the check.
491
492 // save current address for use by exception handling code
493
494 return_address = __ pc();
495 entries.append(return_address);
496
497 // store result depending on type (everything that is not
498 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
499 // n.b. this assumes Java returns an integral result in r0
500 // and a floating result in j_farg0
501 __ ldr(j_rarg2, result);
502 Label is_long, is_float, is_double, exit;
503 __ ldr(j_rarg1, result_type);
504 __ cmp(j_rarg1, (u1)T_OBJECT);
505 __ br(Assembler::EQ, is_long);
506 __ cmp(j_rarg1, (u1)T_LONG);
507 __ br(Assembler::EQ, is_long);
508 __ cmp(j_rarg1, (u1)T_FLOAT);
509 __ br(Assembler::EQ, is_float);
510 __ cmp(j_rarg1, (u1)T_DOUBLE);
511 __ br(Assembler::EQ, is_double);
512
513 // handle T_INT case
514 __ strw(r0, Address(j_rarg2));
515
516 __ BIND(exit);
517
518 // pop parameters
519 __ sub(esp, rfp, -sp_after_call_off * wordSize);
520
521 #ifdef ASSERT
522 // verify that threads correspond
523 {
524 Label L, S;
525 __ ldr(rscratch1, thread);
526 __ cmp(rthread, rscratch1);
527 __ br(Assembler::NE, S);
528 __ get_thread(rscratch1);
529 __ cmp(rthread, rscratch1);
530 __ br(Assembler::EQ, L);
531 __ BIND(S);
532 __ stop("StubRoutines::call_stub: threads must correspond");
533 __ BIND(L);
534 }
535 #endif
536
537 __ pop_cont_fastpath(rthread);
538
539 // restore callee-save registers
540 __ ldpd(v15, v14, d15_save);
541 __ ldpd(v13, v12, d13_save);
542 __ ldpd(v11, v10, d11_save);
543 __ ldpd(v9, v8, d9_save);
544
545 __ ldp(r28, r27, r28_save);
546 __ ldp(r26, r25, r26_save);
547 __ ldp(r24, r23, r24_save);
548 __ ldp(r22, r21, r22_save);
549 __ ldp(r20, r19, r20_save);
550
551 // restore fpcr
552 __ ldr(rscratch1, fpcr_save);
553 __ set_fpcr(rscratch1);
554
555 __ ldp(c_rarg0, c_rarg1, call_wrapper);
556 __ ldrw(c_rarg2, result_type);
557 __ ldr(c_rarg3, method);
558 __ ldp(c_rarg4, c_rarg5, entry_point);
559 __ ldp(c_rarg6, c_rarg7, parameter_size);
560
561 // leave frame and return to caller
562 __ leave();
563 __ ret(lr);
564
565 // handle return types different from T_INT
566
567 __ BIND(is_long);
568 __ str(r0, Address(j_rarg2, 0));
569 __ br(Assembler::AL, exit);
570
571 __ BIND(is_float);
572 __ strs(j_farg0, Address(j_rarg2, 0));
573 __ br(Assembler::AL, exit);
574
575 __ BIND(is_double);
576 __ strd(j_farg0, Address(j_rarg2, 0));
577 __ br(Assembler::AL, exit);
578
579 // record the stub entry and end plus the auxiliary entry
580 store_archive_data(stub_id, start, __ pc(), &entries);
581
582 return start;
583 }
584
585 // Return point for a Java call if there's an exception thrown in
586 // Java code. The exception is caught and transformed into a
587 // pending exception stored in JavaThread that can be tested from
588 // within the VM.
589 //
590 // Note: Usually the parameters are removed by the callee. In case
591 // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
594 //
595 // r0: exception oop
596
597 address generate_catch_exception() {
598 StubId stub_id = StubId::stubgen_catch_exception_id;
599 int entry_count = StubInfo::entry_count(stub_id);
600 assert(entry_count == 1, "sanity check");
601 address start = load_archive_data(stub_id);
602 if (start != nullptr) {
603 return start;
604 }
605 StubCodeMark mark(this, stub_id);
606 start = __ pc();
607
608 // same as in generate_call_stub():
609 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
610 const Address thread (rfp, thread_off * wordSize);
611
612 #ifdef ASSERT
613 // verify that threads correspond
614 {
615 Label L, S;
616 __ ldr(rscratch1, thread);
617 __ cmp(rthread, rscratch1);
618 __ br(Assembler::NE, S);
619 __ get_thread(rscratch1);
620 __ cmp(rthread, rscratch1);
621 __ br(Assembler::EQ, L);
622 __ bind(S);
623 __ stop("StubRoutines::catch_exception: threads must correspond");
624 __ bind(L);
625 }
626 #endif
627
628 // set pending exception
629 __ verify_oop(r0);
630
631 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
632 // special case -- add file name string to AOT address table
633 address file = (address)AOTCodeCache::add_C_string(__FILE__);
634 __ lea(rscratch1, ExternalAddress(file));
635 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
636 __ movw(rscratch1, (int)__LINE__);
637 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
638
639 // complete return to VM
640 assert(StubRoutines::_call_stub_return_address != nullptr,
641 "_call_stub_return_address must have been generated before");
642 __ b(RuntimeAddress(StubRoutines::_call_stub_return_address));
643
644 // record the stub entry and end
645 store_archive_data(stub_id, start, __ pc());
646
647 return start;
648 }
649
650 // Continuation point for runtime calls returning with a pending
651 // exception. The pending exception check happened in the runtime
652 // or native call stub. The pending exception in Thread is
653 // converted into a Java-level exception.
654 //
655 // Contract with Java-level exception handlers:
656 // r0: exception
657 // r3: throwing pc
658 //
659 // NOTE: At entry of this stub, exception-pc must be in LR !!
660
661 // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no prolog
663
664 address generate_forward_exception() {
665 StubId stub_id = StubId::stubgen_forward_exception_id;
666 int entry_count = StubInfo::entry_count(stub_id);
667 assert(entry_count == 1, "sanity check");
668 address start = load_archive_data(stub_id);
669 if (start != nullptr) {
670 return start;
671 }
672 StubCodeMark mark(this, stub_id);
673 start = __ pc();
674
675 // Upon entry, LR points to the return address returning into
676 // Java (interpreted or compiled) code; i.e., the return address
677 // becomes the throwing pc.
678 //
679 // Arguments pushed before the runtime call are still on the stack
680 // but the exception handler will reset the stack pointer ->
681 // ignore them. A potential result in registers can be ignored as
682 // well.
683
684 #ifdef ASSERT
685 // make sure this code is only executed if there is a pending exception
686 {
687 Label L;
688 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
689 __ cbnz(rscratch1, L);
690 __ stop("StubRoutines::forward exception: no pending exception (1)");
691 __ bind(L);
692 }
693 #endif
694
695 // compute exception handler into r19
696
697 // call the VM to find the handler address associated with the
698 // caller address. pass thread in r0 and caller pc (ret address)
699 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
700 // the stack.
701 __ mov(c_rarg1, lr);
702 // lr will be trashed by the VM call so we move it to R19
703 // (callee-saved) because we also need to pass it to the handler
704 // returned by this call.
705 __ mov(r19, lr);
706 BLOCK_COMMENT("call exception_handler_for_return_address");
707 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
708 SharedRuntime::exception_handler_for_return_address),
709 rthread, c_rarg1);
710 // Reinitialize the ptrue predicate register, in case the external runtime
711 // call clobbers ptrue reg, as we may return to SVE compiled code.
712 __ reinitialize_ptrue();
713
714 // we should not really care that lr is no longer the callee
715 // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
718 // the PC for the frame above the handler belongs to a compiled
719 // Java method. So, we restore lr here to satisfy that assert.
720 __ mov(lr, r19);
721 // setup r0 & r3 & clear pending exception
722 __ mov(r3, r19);
723 __ mov(r19, r0);
724 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
725 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
726
727 #ifdef ASSERT
728 // make sure exception is set
729 {
730 Label L;
731 __ cbnz(r0, L);
732 __ stop("StubRoutines::forward exception: no pending exception (2)");
733 __ bind(L);
734 }
735 #endif
736
737 // continue at exception handler
738 // r0: exception
739 // r3: throwing pc
740 // r19: exception handler
741 __ verify_oop(r0);
742 __ br(r19);
743
744 // record the stub entry and end
745 store_archive_data(stub_id, start, __ pc());
746
747 return start;
748 }
749
750 // Non-destructive plausibility checks for oops
751 //
752 // Arguments:
753 // r0: oop to verify
754 // rscratch1: error message
755 //
756 // Stack after saving c_rarg3:
757 // [tos + 0]: saved c_rarg3
758 // [tos + 1]: saved c_rarg2
759 // [tos + 2]: saved lr
760 // [tos + 3]: saved rscratch2
761 // [tos + 4]: saved r0
762 // [tos + 5]: saved rscratch1
763 address generate_verify_oop() {
764 StubId stub_id = StubId::stubgen_verify_oop_id;
765 int entry_count = StubInfo::entry_count(stub_id);
766 assert(entry_count == 1, "sanity check");
767 address start = load_archive_data(stub_id);
768 if (start != nullptr) {
769 return start;
770 }
771 StubCodeMark mark(this, stub_id);
772 start = __ pc();
773
774 Label exit, error;
775
776 // save c_rarg2 and c_rarg3
777 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
778
779 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
780 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
781 __ ldr(c_rarg3, Address(c_rarg2));
782 __ add(c_rarg3, c_rarg3, 1);
783 __ str(c_rarg3, Address(c_rarg2));
784
785 // object is in r0
786 // make sure object is 'reasonable'
787 __ cbz(r0, exit); // if obj is null it is OK
788
789 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
790 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
791
792 // return if everything seems ok
793 __ bind(exit);
794
795 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
796 __ ret(lr);
797
798 // handle errors
799 __ bind(error);
800 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
801
802 __ push(RegSet::range(r0, r29), sp);
803 // debug(char* msg, int64_t pc, int64_t regs[])
804 __ mov(c_rarg0, rscratch1); // pass address of error message
805 __ mov(c_rarg1, lr); // pass return address
806 __ mov(c_rarg2, sp); // pass address of regs on stack
807 #ifndef PRODUCT
808 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
809 #endif
810 BLOCK_COMMENT("call MacroAssembler::debug");
811 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
812 __ blr(rscratch1);
813 __ hlt(0);
814
815 // record the stub entry and end
816 store_archive_data(stub_id, start, __ pc());
817
818 return start;
819 }
820
821 // Generate indices for iota vector.
822 void generate_iota_indices(StubId stub_id) {
823 GrowableArray<address> entries;
824 int entry_count = StubInfo::entry_count(stub_id);
825 assert(entry_count == VECTOR_IOTA_COUNT, "sanity check");
826 address start = load_archive_data(stub_id, &entries);
827 if (start != nullptr) {
828 assert(entries.length() == entry_count - 1,
829 "unexpected entries count %d", entries.length());
830 StubRoutines::aarch64::_vector_iota_indices[0] = start;
831 for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
832 StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
833 }
834 return;
835 }
836 __ align(CodeEntryAlignment);
837 StubCodeMark mark(this, stub_id);
838 start = __ pc();
839 // B
840 __ emit_data64(0x0706050403020100, relocInfo::none);
841 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
842 entries.append(__ pc());
843 // H
844 __ emit_data64(0x0003000200010000, relocInfo::none);
845 __ emit_data64(0x0007000600050004, relocInfo::none);
846 entries.append(__ pc());
847 // S
848 __ emit_data64(0x0000000100000000, relocInfo::none);
849 __ emit_data64(0x0000000300000002, relocInfo::none);
850 entries.append(__ pc());
851 // D
852 __ emit_data64(0x0000000000000000, relocInfo::none);
853 __ emit_data64(0x0000000000000001, relocInfo::none);
854 entries.append(__ pc());
855 // S - FP
856 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
857 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
858 entries.append(__ pc());
859 // D - FP
860 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
861 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
862
863 // record the stub entry and end
864 store_archive_data(stub_id, start, __ pc(), &entries);
865
866 // install the entry addresses in the entry array
867 assert(entries.length() == entry_count - 1,
868 "unexpected entries count %d", entries.length());
869 StubRoutines::aarch64::_vector_iota_indices[0] = start;
870 for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
871 StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
872 }
873 }
874
875 // The inner part of zero_words(). This is the bulk operation,
876 // zeroing words in blocks, possibly using DC ZVA to do it. The
877 // caller is responsible for zeroing the last few words.
878 //
879 // Inputs:
880 // r10: the HeapWord-aligned base address of an array to zero.
881 // r11: the count in HeapWords, r11 > 0.
882 //
883 // Returns r10 and r11, adjusted for the caller to clear.
884 // r10: the base address of the tail of words left to clear.
885 // r11: the number of words in the tail.
886 // r11 < MacroAssembler::zero_words_block_size.
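  //
  // Roughly, the split of work between this stub and its caller (zero_words)
  // can be pictured with the following C model (illustrative only):
  //
  //   while (cnt >= MacroAssembler::zero_words_block_size) {
  //     zero one block (possibly via DC ZVA); base += block; cnt -= block;
  //   }
  //   // the remaining cnt words are cleared by the caller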
887
888 address generate_zero_blocks() {
889 StubId stub_id = StubId::stubgen_zero_blocks_id;
890 int entry_count = StubInfo::entry_count(stub_id);
891 assert(entry_count == 1, "sanity check");
892 address start = load_archive_data(stub_id);
893 if (start != nullptr) {
894 return start;
895 }
896 __ align(CodeEntryAlignment);
897 StubCodeMark mark(this, stub_id);
898 Label done;
899 Label base_aligned;
900
901 Register base = r10, cnt = r11;
902
903 start = __ pc();
904
905 if (UseBlockZeroing) {
906 int zva_length = VM_Version::zva_length();
907
908 // Ensure ZVA length can be divided by 16. This is required by
909 // the subsequent operations.
910 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
911
912 __ tbz(base, 3, base_aligned);
913 __ str(zr, Address(__ post(base, 8)));
914 __ sub(cnt, cnt, 1);
915 __ bind(base_aligned);
916
917 // Ensure count >= zva_length * 2 so that it still deserves a zva after
918 // alignment.
919 Label small;
920 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
921 __ subs(rscratch1, cnt, low_limit >> 3);
922 __ br(Assembler::LT, small);
923 __ zero_dcache_blocks(base, cnt);
924 __ bind(small);
925 }
926
927 {
928 // Number of stp instructions we'll unroll
929 const int unroll =
930 MacroAssembler::zero_words_block_size / 2;
931 // Clear the remaining blocks.
932 Label loop;
933 __ subs(cnt, cnt, unroll * 2);
934 __ br(Assembler::LT, done);
935 __ bind(loop);
936 for (int i = 0; i < unroll; i++)
937 __ stp(zr, zr, __ post(base, 16));
938 __ subs(cnt, cnt, unroll * 2);
939 __ br(Assembler::GE, loop);
940 __ bind(done);
941 __ add(cnt, cnt, unroll * 2);
942 }
943
944 __ ret(lr);
945
946 // record the stub entry and end
947 store_archive_data(stub_id, start, __ pc());
948
949 return start;
950 }
951
952
953 typedef enum {
954 copy_forwards = 1,
955 copy_backwards = -1
956 } copy_direction;
957
958 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
959 // for arraycopy stubs.
960 class ArrayCopyBarrierSetHelper : StackObj {
961 BarrierSetAssembler* _bs_asm;
962 MacroAssembler* _masm;
963 DecoratorSet _decorators;
964 BasicType _type;
965 Register _gct1;
966 Register _gct2;
967 Register _gct3;
968 FloatRegister _gcvt1;
969 FloatRegister _gcvt2;
970 FloatRegister _gcvt3;
971
972 public:
973 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
974 DecoratorSet decorators,
975 BasicType type,
976 Register gct1,
977 Register gct2,
978 Register gct3,
979 FloatRegister gcvt1,
980 FloatRegister gcvt2,
981 FloatRegister gcvt3)
982 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
983 _masm(masm),
984 _decorators(decorators),
985 _type(type),
986 _gct1(gct1),
987 _gct2(gct2),
988 _gct3(gct3),
989 _gcvt1(gcvt1),
990 _gcvt2(gcvt2),
991 _gcvt3(gcvt3) {
992 }
993
994 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
995 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
996 dst1, dst2, src,
997 _gct1, _gct2, _gcvt1);
998 }
999
1000 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
1001 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
1002 dst, src1, src2,
1003 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
1004 }
1005
1006 void copy_load_at_16(Register dst1, Register dst2, Address src) {
1007 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
1008 dst1, dst2, src,
1009 _gct1);
1010 }
1011
1012 void copy_store_at_16(Address dst, Register src1, Register src2) {
1013 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
1014 dst, src1, src2,
1015 _gct1, _gct2, _gct3);
1016 }
1017
1018 void copy_load_at_8(Register dst, Address src) {
1019 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
1020 dst, noreg, src,
1021 _gct1);
1022 }
1023
1024 void copy_store_at_8(Address dst, Register src) {
1025 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
1026 dst, src, noreg,
1027 _gct1, _gct2, _gct3);
1028 }
1029 };
1030
1031 // Bulk copy of blocks of 8 words.
1032 //
1033 // count is a count of words.
1034 //
1035 // Precondition: count >= 8
1036 //
1037 // Postconditions:
1038 //
1039 // The least significant bit of count contains the remaining count
1040 // of words to copy. The rest of count is trash.
1041 //
1042 // s and d are adjusted to point to the remaining words to copy
1043 //
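  // Roughly (an illustrative model, not the generated code itself): the stub
  // copies 8-word blocks while at least 8 words remain, then a 4-word and/or
  // 2-word tail according to bits 2 and 1 of count, leaving at most one odd
  // word (bit 0 of count) for the caller.
  //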
1044 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
1045 int entry_count = StubInfo::entry_count(stub_id);
1046 assert(entry_count == 1, "sanity check");
1047 address start = load_archive_data(stub_id);
1048 if (start != nullptr) {
1049 return start;
1050 }
1051 BasicType type;
1052 copy_direction direction;
1053
1054 switch (stub_id) {
1055 case StubId::stubgen_copy_byte_f_id:
1056 direction = copy_forwards;
1057 type = T_BYTE;
1058 break;
1059 case StubId::stubgen_copy_byte_b_id:
1060 direction = copy_backwards;
1061 type = T_BYTE;
1062 break;
1063 case StubId::stubgen_copy_oop_f_id:
1064 direction = copy_forwards;
1065 type = T_OBJECT;
1066 break;
1067 case StubId::stubgen_copy_oop_b_id:
1068 direction = copy_backwards;
1069 type = T_OBJECT;
1070 break;
1071 case StubId::stubgen_copy_oop_uninit_f_id:
1072 direction = copy_forwards;
1073 type = T_OBJECT;
1074 break;
1075 case StubId::stubgen_copy_oop_uninit_b_id:
1076 direction = copy_backwards;
1077 type = T_OBJECT;
1078 break;
1079 default:
1080 ShouldNotReachHere();
1081 }
1082
1083 int unit = wordSize * direction;
1084 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
1085
1086 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
1087 t4 = r7, t5 = r11, t6 = r12, t7 = r13;
1088 const Register stride = r14;
1089 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1090 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1091 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1092
1093 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
1094 assert_different_registers(s, d, count, rscratch1, rscratch2);
1095
1096 Label again, drain;
1097
1098 __ align(CodeEntryAlignment);
1099
1100 StubCodeMark mark(this, stub_id);
1101
1102 start = __ pc();
1103
1104 Label unaligned_copy_long;
1105 if (AvoidUnalignedAccesses) {
1106 __ tbnz(d, 3, unaligned_copy_long);
1107 }
1108
1109 if (direction == copy_forwards) {
1110 __ sub(s, s, bias);
1111 __ sub(d, d, bias);
1112 }
1113
1114 #ifdef ASSERT
1115 // Make sure we are never given < 8 words
1116 {
1117 Label L;
1118 __ cmp(count, (u1)8);
1119 __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
1121 __ bind(L);
1122 }
1123 #endif
1124
1125 // Fill 8 registers
1126 if (UseSIMDForMemoryOps) {
1127 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
1128 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
1129 } else {
1130 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1131 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1132 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1133 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1134 }
1135
1136 __ subs(count, count, 16);
1137 __ br(Assembler::LO, drain);
1138
1139 int prefetch = PrefetchCopyIntervalInBytes;
1140 bool use_stride = false;
1141 if (direction == copy_backwards) {
1142 use_stride = prefetch > 256;
1143 prefetch = -prefetch;
1144 if (use_stride) __ mov(stride, prefetch);
1145 }
1146
1147 __ bind(again);
1148
1149 if (PrefetchCopyIntervalInBytes > 0)
1150 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1151
1152 if (UseSIMDForMemoryOps) {
1153 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
1154 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
1155 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
1156 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
1157 } else {
1158 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1159 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1160 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
1161 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1162 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
1163 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1164 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
1165 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1166 }
1167
1168 __ subs(count, count, 8);
1169 __ br(Assembler::HS, again);
1170
1171 // Drain
1172 __ bind(drain);
1173 if (UseSIMDForMemoryOps) {
1174 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
1175 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
1176 } else {
1177 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1178 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
1179 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
1180 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
1181 }
1182
1183 {
1184 Label L1, L2;
1185 __ tbz(count, exact_log2(4), L1);
1186 if (UseSIMDForMemoryOps) {
1187 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
1188 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
1189 } else {
1190 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1191 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1192 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1193 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
1194 }
1195 __ bind(L1);
1196
1197 if (direction == copy_forwards) {
1198 __ add(s, s, bias);
1199 __ add(d, d, bias);
1200 }
1201
1202 __ tbz(count, 1, L2);
1203 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
1204 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
1205 __ bind(L2);
1206 }
1207
1208 __ ret(lr);
1209
1210 if (AvoidUnalignedAccesses) {
1211 Label drain, again;
1212 // Register order for storing. Order is different for backward copy.
1213
1214 __ bind(unaligned_copy_long);
1215
1216 // source address is even aligned, target odd aligned
1217 //
1218 // when forward copying word pairs we read long pairs at offsets
1219 // {0, 2, 4, 6} (in long words). when backwards copying we read
1220 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
1221 // address by -2 in the forwards case so we can compute the
1222 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
1223 // or -1.
1224 //
1225 // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
1231 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
1232 // offsets {1, 3, 5, 7, 8} * unit.
1233
1234 if (direction == copy_forwards) {
1235 __ sub(s, s, 16);
1236 __ sub(d, d, 8);
1237 }
1238
1239 // Fill 8 registers
1240 //
1241 // for forwards copy s was offset by -16 from the original input
1242 // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
1244 // and so on for each successive 64 byte block when s is updated
1245 //
1246 // t0 at offset 0, t1 at offset 8
1247 // t2 at offset 16, t3 at offset 24
1248 // t4 at offset 32, t5 at offset 40
1249 // t6 at offset 48, t7 at offset 56
1250
1251 // for backwards copy s was not offset so the register contents
1252 // are at these offsets into the preceding 64 byte block
1253 // relative to that original input and so on for each successive
1254 // preceding 64 byte block when s is updated. this explains the
1255 // slightly counter-intuitive looking pattern of register usage
1256 // in the stp instructions for backwards copy.
1257 //
1258 // t0 at offset -16, t1 at offset -8
1259 // t2 at offset -32, t3 at offset -24
1260 // t4 at offset -48, t5 at offset -40
1261 // t6 at offset -64, t7 at offset -56
1262
1263 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1264 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1265 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1266 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1267
1268 __ subs(count, count, 16);
1269 __ br(Assembler::LO, drain);
1270
1271 int prefetch = PrefetchCopyIntervalInBytes;
1272 bool use_stride = false;
1273 if (direction == copy_backwards) {
1274 use_stride = prefetch > 256;
1275 prefetch = -prefetch;
1276 if (use_stride) __ mov(stride, prefetch);
1277 }
1278
1279 __ bind(again);
1280
1281 if (PrefetchCopyIntervalInBytes > 0)
1282 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1283
1284 if (direction == copy_forwards) {
1285 // allowing for the offset of -8 the store instructions place
1286 // registers into the target 64 bit block at the following
1287 // offsets
1288 //
1289 // t0 at offset 0
1290 // t1 at offset 8, t2 at offset 16
1291 // t3 at offset 24, t4 at offset 32
1292 // t5 at offset 40, t6 at offset 48
1293 // t7 at offset 56
1294
1295 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1296 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1297 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1298 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1299 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1300 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1301 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1302 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1303 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1304 } else {
1305 // d was not offset when we started so the registers are
1306 // written into the 64 bit block preceding d with the following
1307 // offsets
1308 //
1309 // t1 at offset -8
1310 // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
1312 // t7 at offset -56, t4 at offset -48
1313 // t6 at offset -64
1314 //
1315 // note that this matches the offsets previously noted for the
1316 // loads
1317
1318 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1319 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1320 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1321 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1322 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1323 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1324 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1325 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1326 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1327 }
1328
1329 __ subs(count, count, 8);
1330 __ br(Assembler::HS, again);
1331
1332 // Drain
1333 //
1334 // this uses the same pattern of offsets and register arguments
1335 // as above
1336 __ bind(drain);
1337 if (direction == copy_forwards) {
1338 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1339 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1340 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1341 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1342 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1343 } else {
1344 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1345 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1346 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1347 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1348 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1349 }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
1354 {
1355 Label L1, L2;
1356 __ tbz(count, exact_log2(4), L1);
1357 // this is the same as above but copying only 4 longs hence
1358 // with only one intervening stp between the str instructions
1359 // but note that the offsets and registers still follow the
1360 // same pattern
1361 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1362 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1363 if (direction == copy_forwards) {
1364 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1365 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1366 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1367 } else {
1368 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1369 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1370 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1371 }
1372 __ bind(L1);
1373
1374 __ tbz(count, 1, L2);
1375 // this is the same as above but copying only 2 longs hence
1376 // there is no intervening stp between the str instructions
1377 // but note that the offset and register patterns are still
1378 // the same
1379 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1380 if (direction == copy_forwards) {
1381 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1382 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1383 } else {
1384 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1385 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1386 }
1387 __ bind(L2);
1388
1389 // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written
1391
1392 if (direction == copy_forwards) {
1393 __ add(s, s, 16);
1394 __ add(d, d, 8);
1395 }
1396
1397 }
1398
1399 __ ret(lr);
1400 }
1401
1402 // record the stub entry and end
1403 store_archive_data(stub_id, start, __ pc());
1404
1405 return start;
1406 }
1407
1408 // Small copy: less than 16 bytes.
1409 //
1410 // NB: Ignores all of the bits of count which represent more than 15
1411 // bytes, so a caller doesn't have to mask them.
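  // (E.g. for a byte copy only count & 15 bytes are moved; for an int copy
  // only count & 3 elements are moved.)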
1412
1413 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1414 bool is_backwards = step < 0;
1415 size_t granularity = g_uabs(step);
1416 int direction = is_backwards ? -1 : 1;
1417
1418 Label Lword, Lint, Lshort, Lbyte;
1419
1420 assert(granularity
1421 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1422
1423 const Register t0 = r3;
1424 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1425 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1426
1427 // ??? I don't know if this bit-test-and-branch is the right thing
1428 // to do. It does a lot of jumping, resulting in several
1429 // mispredicted branches. It might make more sense to do this
1430 // with something like Duff's device with a single computed branch.
1431
1432 __ tbz(count, 3 - exact_log2(granularity), Lword);
1433 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1434 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1435 __ bind(Lword);
1436
1437 if (granularity <= sizeof (jint)) {
1438 __ tbz(count, 2 - exact_log2(granularity), Lint);
1439 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1440 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1441 __ bind(Lint);
1442 }
1443
1444 if (granularity <= sizeof (jshort)) {
1445 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1446 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1447 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1448 __ bind(Lshort);
1449 }
1450
1451 if (granularity <= sizeof (jbyte)) {
1452 __ tbz(count, 0, Lbyte);
1453 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1454 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1455 __ bind(Lbyte);
1456 }
1457 }
1458
1459 // All-singing all-dancing memory copy.
1460 //
1461 // Copy count units of memory from s to d. The size of a unit is
1462 // step, which can be positive or negative depending on the direction
1463 // of copy. If is_aligned is false, we align the source address.
1464 //
1465
1466 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1467 Register s, Register d, Register count, int step) {
1468 copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1469 bool is_backwards = step < 0;
1470 unsigned int granularity = g_uabs(step);
1471 const Register t0 = r3, t1 = r4;
1472
    // Copies of <= 80 (or 96 for SIMD) bytes are done inline. Direction doesn't matter
    // because we always load all the data before writing anything.
1475 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1476 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1477 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1478 const Register send = r17, dend = r16;
1479 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1480 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1481 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1482
1483 if (PrefetchCopyIntervalInBytes > 0)
1484 __ prfm(Address(s, 0), PLDL1KEEP);
1485 __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1486 __ br(Assembler::HI, copy_big);
1487
1488 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1489 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1490
1491 __ cmp(count, u1(16/granularity));
1492 __ br(Assembler::LS, copy16);
1493
1494 __ cmp(count, u1(64/granularity));
1495 __ br(Assembler::HI, copy80);
1496
1497 __ cmp(count, u1(32/granularity));
1498 __ br(Assembler::LS, copy32);
1499
1500 // 33..64 bytes
1501 if (UseSIMDForMemoryOps) {
1502 bs.copy_load_at_32(v0, v1, Address(s, 0));
1503 bs.copy_load_at_32(v2, v3, Address(send, -32));
1504 bs.copy_store_at_32(Address(d, 0), v0, v1);
1505 bs.copy_store_at_32(Address(dend, -32), v2, v3);
1506 } else {
1507 bs.copy_load_at_16(t0, t1, Address(s, 0));
1508 bs.copy_load_at_16(t2, t3, Address(s, 16));
1509 bs.copy_load_at_16(t4, t5, Address(send, -32));
1510 bs.copy_load_at_16(t6, t7, Address(send, -16));
1511
1512 bs.copy_store_at_16(Address(d, 0), t0, t1);
1513 bs.copy_store_at_16(Address(d, 16), t2, t3);
1514 bs.copy_store_at_16(Address(dend, -32), t4, t5);
1515 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1516 }
1517 __ b(finish);
1518
1519 // 17..32 bytes
1520 __ bind(copy32);
1521 bs.copy_load_at_16(t0, t1, Address(s, 0));
1522 bs.copy_load_at_16(t6, t7, Address(send, -16));
1523
1524 bs.copy_store_at_16(Address(d, 0), t0, t1);
1525 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1526 __ b(finish);
1527
1528 // 65..80/96 bytes
1529     // (96 bytes if SIMD because we do 32 bytes per instruction)
1530 __ bind(copy80);
1531 if (UseSIMDForMemoryOps) {
1532 bs.copy_load_at_32(v0, v1, Address(s, 0));
1533 bs.copy_load_at_32(v2, v3, Address(s, 32));
1534       // Unaligned pointers can be an issue for copying.
1535       // The issue is more likely when the granularity of the data is
1536       // less than 4 bytes (sizeof(jint)). Pointers for arrays of jint are
1537       // at least 4-byte aligned, and pointers for arrays of jlong are 8-byte
1538       // aligned. The largest performance drop has been seen for the range
1539       // 65-80 bytes. For such cases, using a pair of ldp/stp instead of a
1540       // third pair of ldpq/stpq fixes the performance issue.
1541 if (granularity < sizeof (jint)) {
1542 Label copy96;
1543 __ cmp(count, u1(80/granularity));
1544 __ br(Assembler::HI, copy96);
1545 bs.copy_load_at_16(t0, t1, Address(send, -16));
1546
1547 bs.copy_store_at_32(Address(d, 0), v0, v1);
1548 bs.copy_store_at_32(Address(d, 32), v2, v3);
1549
1550 bs.copy_store_at_16(Address(dend, -16), t0, t1);
1551 __ b(finish);
1552
1553 __ bind(copy96);
1554 }
1555 bs.copy_load_at_32(v4, v5, Address(send, -32));
1556
1557 bs.copy_store_at_32(Address(d, 0), v0, v1);
1558 bs.copy_store_at_32(Address(d, 32), v2, v3);
1559
1560 bs.copy_store_at_32(Address(dend, -32), v4, v5);
1561 } else {
1562 bs.copy_load_at_16(t0, t1, Address(s, 0));
1563 bs.copy_load_at_16(t2, t3, Address(s, 16));
1564 bs.copy_load_at_16(t4, t5, Address(s, 32));
1565 bs.copy_load_at_16(t6, t7, Address(s, 48));
1566 bs.copy_load_at_16(t8, t9, Address(send, -16));
1567
1568 bs.copy_store_at_16(Address(d, 0), t0, t1);
1569 bs.copy_store_at_16(Address(d, 16), t2, t3);
1570 bs.copy_store_at_16(Address(d, 32), t4, t5);
1571 bs.copy_store_at_16(Address(d, 48), t6, t7);
1572 bs.copy_store_at_16(Address(dend, -16), t8, t9);
1573 }
1574 __ b(finish);
1575
1576 // 0..16 bytes
1577 __ bind(copy16);
1578 __ cmp(count, u1(8/granularity));
1579 __ br(Assembler::LO, copy8);
1580
1581 // 8..16 bytes
1582 bs.copy_load_at_8(t0, Address(s, 0));
1583 bs.copy_load_at_8(t1, Address(send, -8));
1584 bs.copy_store_at_8(Address(d, 0), t0);
1585 bs.copy_store_at_8(Address(dend, -8), t1);
1586 __ b(finish);
1587
1588 if (granularity < 8) {
1589 // 4..7 bytes
1590 __ bind(copy8);
1591 __ tbz(count, 2 - exact_log2(granularity), copy4);
1592 __ ldrw(t0, Address(s, 0));
1593 __ ldrw(t1, Address(send, -4));
1594 __ strw(t0, Address(d, 0));
1595 __ strw(t1, Address(dend, -4));
1596 __ b(finish);
1597 if (granularity < 4) {
1598 // 0..3 bytes
1599 __ bind(copy4);
1600 __ cbz(count, finish); // get rid of 0 case
1601 if (granularity == 2) {
1602 __ ldrh(t0, Address(s, 0));
1603 __ strh(t0, Address(d, 0));
1604 } else { // granularity == 1
1605           // Now 1..3 bytes. Handle the 1 and 2 byte cases by copying
1606           // the first and last byte.
1607           // Handle the 3 byte case by also loading and storing base + count/2
1608           // (count == 1: (s+0)->(d+0), count == 2,3: (s+1)->(d+1)).
1609           // This does mean that in the 1 byte case we load/store the same
1610           // byte 3 times.
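          // For illustration:
          //   count == 1: t0 = s[0], t1 = s[0], t2 = s[0]; d[0] is written
          //               three times.
          //   count == 2: t0 = s[0], t1 = s[1], t2 = s[1]; d[1] is written
          //               twice.
          //   count == 3: t0 = s[0], t1 = s[2], t2 = s[1]; each byte is
          //               copied exactly once.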
1611 __ lsr(count, count, 1);
1612 __ ldrb(t0, Address(s, 0));
1613 __ ldrb(t1, Address(send, -1));
1614 __ ldrb(t2, Address(s, count));
1615 __ strb(t0, Address(d, 0));
1616 __ strb(t1, Address(dend, -1));
1617 __ strb(t2, Address(d, count));
1618 }
1619 __ b(finish);
1620 }
1621 }
1622
1623 __ bind(copy_big);
1624 if (is_backwards) {
1625 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1626 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1627 }
1628
1629 // Now we've got the small case out of the way we can align the
1630 // source address on a 2-word boundary.
1631
1632     // Here we materialize a count in r15, which is used both by copy_memory_small
1633     // and by the various generate_copy_longs stubs that we use for 2-word-aligned
1634     // copies. Up until here we have used t9, which aliases r15, but from here on
1635     // that register cannot be used as a temp register, as it contains the count.
1636
1637 Label aligned;
1638
1639 if (is_aligned) {
1640 // We may have to adjust by 1 word to get s 2-word-aligned.
1641 __ tbz(s, exact_log2(wordSize), aligned);
1642 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1643 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1644 __ sub(count, count, wordSize/granularity);
1645 } else {
1646 if (is_backwards) {
1647 __ andr(r15, s, 2 * wordSize - 1);
1648 } else {
1649 __ neg(r15, s);
1650 __ andr(r15, r15, 2 * wordSize - 1);
1651 }
1652 // r15 is the byte adjustment needed to align s.
1653 __ cbz(r15, aligned);
1654 int shift = exact_log2(granularity);
1655 if (shift > 0) {
1656 __ lsr(r15, r15, shift);
1657 }
1658 __ sub(count, count, r15);
1659
1660 #if 0
1661 // ?? This code is only correct for a disjoint copy. It may or
1662 // may not make sense to use it in that case.
1663
1664 // Copy the first pair; s and d may not be aligned.
1665 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1666 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1667
1668 // Align s and d, adjust count
1669 if (is_backwards) {
1670 __ sub(s, s, r15);
1671 __ sub(d, d, r15);
1672 } else {
1673 __ add(s, s, r15);
1674 __ add(d, d, r15);
1675 }
1676 #else
1677 copy_memory_small(decorators, type, s, d, r15, step);
1678 #endif
1679 }
1680
1681 __ bind(aligned);
1682
1683 // s is now 2-word-aligned.
1684
1685 // We have a count of units and some trailing bytes. Adjust the
1686 // count and do a bulk copy of words. If the shift is zero
1687 // perform a move instead to benefit from zero latency moves.
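    // For example, with jint elements (granularity 4) the shift is 1, so
    // r15 = count >> 1 is the number of whole 8-byte words to copy; with
    // jlong elements the shift is 0 and r15 is simply the element count.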
1688 int shift = exact_log2(wordSize/granularity);
1689 if (shift > 0) {
1690 __ lsr(r15, count, shift);
1691 } else {
1692 __ mov(r15, count);
1693 }
1694 if (direction == copy_forwards) {
1695 if (type != T_OBJECT) {
1696 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_f()));
1697 __ blr(rscratch1);
1698 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1699 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_f()));
1700 __ blr(rscratch1);
1701 } else {
1702 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_f()));
1703 __ blr(rscratch1);
1704 }
1705 } else {
1706 if (type != T_OBJECT) {
1707 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_b()));
1708 __ blr(rscratch1);
1709 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1710 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_b()));
1711 __ blr(rscratch1);
1712 } else {
1713 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_b()));
1714 __ blr(rscratch1);
1715 }
1716 }
1717
1718 // And the tail.
1719 copy_memory_small(decorators, type, s, d, count, step);
1720
1721 if (granularity >= 8) __ bind(copy8);
1722 if (granularity >= 4) __ bind(copy4);
1723 __ bind(finish);
1724 }
1725
1726
1727 void clobber_registers() {
1728 #ifdef ASSERT
1729 RegSet clobbered
1730 = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
1731 __ mov(rscratch1, (uint64_t)0xdeadbeef);
1732 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1733 for (RegSetIterator<Register> it = clobbered.begin(); *it != noreg; ++it) {
1734 __ mov(*it, rscratch1);
1735 }
1736 #endif
1737
1738 }
1739
1740 // Scan over array at a for count oops, verifying each one.
1741 // Preserves a and count, clobbers rscratch1 and rscratch2.
1742 void verify_oop_array (int size, Register a, Register count, Register temp) {
1743 Label loop, end;
1744 __ mov(rscratch1, a);
1745 __ mov(rscratch2, zr);
1746 __ bind(loop);
1747 __ cmp(rscratch2, count);
1748 __ br(Assembler::HS, end);
1749 if (size == wordSize) {
1750 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1751 __ verify_oop(temp);
1752 } else {
1753 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1754 __ decode_heap_oop(temp); // calls verify_oop
1755 }
1756 __ add(rscratch2, rscratch2, 1);
1757 __ b(loop);
1758 __ bind(end);
1759 }
1760
1761 // Arguments:
1762 // stub_id - is used to name the stub and identify all details of
1763 // how to perform the copy.
1764 //
1765 // nopush_entry - is assigned to the stub's post push entry point
1766 // unless it is null
1767 //
1768 // Inputs:
1769 // c_rarg0 - source array address
1770 // c_rarg1 - destination array address
1771 // c_rarg2 - element count, treated as ssize_t, can be zero
1772 //
1773 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1774 // the hardware handle it. The two dwords within qwords that span
1775 // cache line boundaries will still be loaded and stored atomically.
1776 //
1777 // Side Effects: nopush_entry is set to the (post push) entry point
1778 // so it can be used by the corresponding conjoint
1779 // copy method
1780 //
1781 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
1782 int size;
1783 bool aligned;
1784 bool is_oop;
1785 bool dest_uninitialized;
1786 switch (stub_id) {
1787 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1788 size = sizeof(jbyte);
1789 aligned = false;
1790 is_oop = false;
1791 dest_uninitialized = false;
1792 break;
1793 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1794 size = sizeof(jbyte);
1795 aligned = true;
1796 is_oop = false;
1797 dest_uninitialized = false;
1798 break;
1799 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1800 size = sizeof(jshort);
1801 aligned = false;
1802 is_oop = false;
1803 dest_uninitialized = false;
1804 break;
1805 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1806 size = sizeof(jshort);
1807 aligned = true;
1808 is_oop = false;
1809 dest_uninitialized = false;
1810 break;
1811 case StubId::stubgen_jint_disjoint_arraycopy_id:
1812 size = sizeof(jint);
1813 aligned = false;
1814 is_oop = false;
1815 dest_uninitialized = false;
1816 break;
1817 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1818 size = sizeof(jint);
1819 aligned = true;
1820 is_oop = false;
1821 dest_uninitialized = false;
1822 break;
1823 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1824 // since this is always aligned we can (should!) use the same
1825 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
1826 ShouldNotReachHere();
1827 break;
1828 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1829 size = sizeof(jlong);
1830 aligned = true;
1831 is_oop = false;
1832 dest_uninitialized = false;
1833 break;
1834 case StubId::stubgen_oop_disjoint_arraycopy_id:
1835 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1836 aligned = !UseCompressedOops;
1837 is_oop = true;
1838 dest_uninitialized = false;
1839 break;
1840 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1841 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1842 aligned = !UseCompressedOops;
1843 is_oop = true;
1844 dest_uninitialized = false;
1845 break;
1846 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1847 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1848 aligned = !UseCompressedOops;
1849 is_oop = true;
1850 dest_uninitialized = true;
1851 break;
1852 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1853 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1854 aligned = !UseCompressedOops;
1855 is_oop = true;
1856 dest_uninitialized = true;
1857 break;
1858 default:
1859 ShouldNotReachHere();
1860 break;
1861 }
1862     // all stubs provide a 2nd entry which omits the frame push for
1863     // use when bailing out from a conjoint copy. However, we may also
1864     // need some extra addresses for memory access protection.
1865 int entry_count = StubInfo::entry_count(stub_id);
1866 assert(entry_count == 2, "sanity check");
1867 assert(nopush_entry != nullptr, "all disjoint copy stubs export a nopush entry");
1868
1869 bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
1870 int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
1871 GrowableArray<address> entries;
1872 GrowableArray<address> extras;
1873 GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
1874 address start = load_archive_data(stub_id, &entries, extras_ptr);
1875 if (start != nullptr) {
1876 assert(entries.length() == entry_count - 1,
1877 "unexpected entries count %d", entries.length());
1878 *nopush_entry = entries.at(0);
1879 assert(extras.length() == extra_count,
1880 "unexpected extra count %d", extras.length());
1881 if (add_extras) {
1882 // register one handler at offset 0
1883 register_unsafe_access_handlers(extras, 0, 1);
1884 }
1885 return start;
1886 }
1887
1888 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1889 RegSet saved_reg = RegSet::of(s, d, count);
1890
1891 __ align(CodeEntryAlignment);
1892 StubCodeMark mark(this, stub_id);
1893 start = __ pc();
1894 __ enter();
1895
1896 *nopush_entry = __ pc();
1897 entries.append(*nopush_entry);
1898
1899 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1900 BLOCK_COMMENT("Post-Push Entry:");
1901
1902 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1903 if (dest_uninitialized) {
1904 decorators |= IS_DEST_UNINITIALIZED;
1905 }
1906 if (aligned) {
1907 decorators |= ARRAYCOPY_ALIGNED;
1908 }
1909
1910 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1911 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1912
1913 if (is_oop) {
1914 // save regs before copy_memory
1915 __ push(RegSet::of(d, count), sp);
1916 }
1917 {
1918 // UnsafeMemoryAccess page error: continue after unsafe access
1919 UnsafeMemoryAccessMark umam(this, add_extras, true);
1920 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1921 }
1922
1923 if (is_oop) {
1924 __ pop(RegSet::of(d, count), sp);
1925 if (VerifyOops)
1926 verify_oop_array(size, d, count, r16);
1927 }
1928
1929 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1930
1931 __ leave();
1932 __ mov(r0, zr); // return 0
1933 __ ret(lr);
1934
1935 address end = __ pc();
1936
1937 if (add_extras) {
1938 // retrieve the registered handler addresses
1939 retrieve_unsafe_access_handlers(start, end, extras);
1940       assert(extras.length() == extra_count,
1941              "incorrect handlers count %d", extras.length());
1942 }
1943
1944 // record the stub entry and end plus the no_push entry and any
1945 // extra handler addresses
1946 store_archive_data(stub_id, start, end, &entries, extras_ptr);
1947
1948 return start;
1949 }
1950
1951 // Arguments:
1952 // stub_id - is used to name the stub and identify all details of
1953 // how to perform the copy.
1954 //
1955   //   nooverlap_target - identifies the (post push) entry for the
1956 // corresponding disjoint copy routine which can be
1957 // jumped to if the ranges do not actually overlap
1958 //
1959 // nopush_entry - is assigned to the stub's post push entry point
1960 // unless it is null
1961 //
1962 //
1963 // Inputs:
1964 // c_rarg0 - source array address
1965 // c_rarg1 - destination array address
1966 // c_rarg2 - element count, treated as ssize_t, can be zero
1967 //
1968 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1969 // the hardware handle it. The two dwords within qwords that span
1970 // cache line boundaries will still be loaded and stored atomically.
1971 //
1972 // Side Effects:
1973 // nopush_entry is set to the no-overlap entry point so it can be
1974 // used by some other conjoint copy method
1975 //
1976 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1977 int size;
1978 bool aligned;
1979 bool is_oop;
1980 bool dest_uninitialized;
1981 switch (stub_id) {
1982 case StubId::stubgen_jbyte_arraycopy_id:
1983 size = sizeof(jbyte);
1984 aligned = false;
1985 is_oop = false;
1986 dest_uninitialized = false;
1987 break;
1988 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
1989 size = sizeof(jbyte);
1990 aligned = true;
1991 is_oop = false;
1992 dest_uninitialized = false;
1993 break;
1994 case StubId::stubgen_jshort_arraycopy_id:
1995 size = sizeof(jshort);
1996 aligned = false;
1997 is_oop = false;
1998 dest_uninitialized = false;
1999 break;
2000 case StubId::stubgen_arrayof_jshort_arraycopy_id:
2001 size = sizeof(jshort);
2002 aligned = true;
2003 is_oop = false;
2004 dest_uninitialized = false;
2005 break;
2006 case StubId::stubgen_jint_arraycopy_id:
2007 size = sizeof(jint);
2008 aligned = false;
2009 is_oop = false;
2010 dest_uninitialized = false;
2011 break;
2012 case StubId::stubgen_arrayof_jint_arraycopy_id:
2013 size = sizeof(jint);
2014 aligned = true;
2015 is_oop = false;
2016 dest_uninitialized = false;
2017 break;
2018 case StubId::stubgen_jlong_arraycopy_id:
2019 // since this is always aligned we can (should!) use the same
2020 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
2021 ShouldNotReachHere();
2022 break;
2023 case StubId::stubgen_arrayof_jlong_arraycopy_id:
2024 size = sizeof(jlong);
2025 aligned = true;
2026 is_oop = false;
2027 dest_uninitialized = false;
2028 break;
2029 case StubId::stubgen_oop_arraycopy_id:
2030 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2031 aligned = !UseCompressedOops;
2032 is_oop = true;
2033 dest_uninitialized = false;
2034 break;
2035 case StubId::stubgen_arrayof_oop_arraycopy_id:
2036 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2037 aligned = !UseCompressedOops;
2038 is_oop = true;
2039 dest_uninitialized = false;
2040 break;
2041 case StubId::stubgen_oop_arraycopy_uninit_id:
2042 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2043 aligned = !UseCompressedOops;
2044 is_oop = true;
2045 dest_uninitialized = true;
2046 break;
2047 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
2048 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2049 aligned = !UseCompressedOops;
2050 is_oop = true;
2051 dest_uninitialized = true;
2052 break;
2053 default:
2054 ShouldNotReachHere();
2055 }
2056 // only some conjoint stubs generate a 2nd entry
2057 int entry_count = StubInfo::entry_count(stub_id);
2058 int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
2059 assert(entry_count == expected_entry_count,
2060 "expected entry count %d does not match declared entry count %d for stub %s",
2061 expected_entry_count, entry_count, StubInfo::name(stub_id));
2062
2063 // We need to protect memory accesses in certain cases
2064 bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
2065 int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
2066 GrowableArray<address> entries;
2067 GrowableArray<address> extras;
2068 GrowableArray<address> *entries_ptr = (nopush_entry != nullptr ? &entries : nullptr);
2069 GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
2070 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2071 if (start != nullptr) {
2072 assert(entries.length() == expected_entry_count - 1,
2073 "unexpected entries count %d", entries.length());
2074 assert(extras.length() == extra_count,
2075 "unexpected extra count %d", extras.length());
2076 if (nopush_entry != nullptr) {
2077 *nopush_entry = entries.at(0);
2078 }
2079 if (add_extras) {
2080 // register one handler at offset 0
2081 register_unsafe_access_handlers(extras, 0, 1);
2082 }
2083 return start;
2084 }
2085
2086 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2087 RegSet saved_regs = RegSet::of(s, d, count);
2088 StubCodeMark mark(this, stub_id);
2089 start = __ pc();
2090 __ enter();
2091
2092 if (nopush_entry != nullptr) {
2093 *nopush_entry = __ pc();
2094 entries.append(*nopush_entry);
2095 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2096 BLOCK_COMMENT("Post-Push Entry:");
2097 }
2098
2099 // use fwd copy when (d-s) above_equal (count*size)
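    // (The single unsigned compare covers both cases in which a forward
    // copy is safe: if d < s the subtraction wraps to a very large value,
    // and if d >= s + count*size the two regions are disjoint.)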
2100 Label L_overlapping;
2101 __ sub(rscratch1, d, s);
2102 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
2103 __ br(Assembler::LO, L_overlapping);
2104 __ b(RuntimeAddress(nooverlap_target));
2105 __ bind(L_overlapping);
2106
2107 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2108 if (dest_uninitialized) {
2109 decorators |= IS_DEST_UNINITIALIZED;
2110 }
2111 if (aligned) {
2112 decorators |= ARRAYCOPY_ALIGNED;
2113 }
2114
2115 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2116 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
2117
2118 if (is_oop) {
2119 // save regs before copy_memory
2120 __ push(RegSet::of(d, count), sp);
2121 }
2122 {
2123 // UnsafeMemoryAccess page error: continue after unsafe access
2124 UnsafeMemoryAccessMark umam(this, add_extras, true);
2125 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
2126 }
2127 if (is_oop) {
2128 __ pop(RegSet::of(d, count), sp);
2129 if (VerifyOops)
2130 verify_oop_array(size, d, count, r16);
2131 }
2132 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
2133 __ leave();
2134 __ mov(r0, zr); // return 0
2135 __ ret(lr);
2136
2137 assert(entries.length() == expected_entry_count - 1,
2138 "unexpected entries count %d", entries.length());
2139
2140 address end = __ pc();
2141
2142 if (add_extras) {
2143 // retrieve the registered handler addresses
2144 retrieve_unsafe_access_handlers(start, end, extras);
2145 assert(extras.length() == extra_count,
2146 "incorrect handlers count %d", extras.length());
2147 }
2148
2149 // record the stub entry and end plus any no_push entry and/or
2150 // extra handler addresses
2151 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2152
2153 return start;
2154 }
2155
2156 // Helper for generating a dynamic type check.
2157 // Smashes rscratch1, rscratch2.
2158 void generate_type_check(Register sub_klass,
2159 Register super_check_offset,
2160 Register super_klass,
2161 Register temp1,
2162 Register temp2,
2163 Register result,
2164 Label& L_success) {
2165 assert_different_registers(sub_klass, super_check_offset, super_klass);
2166
2167 BLOCK_COMMENT("type_check:");
2168
2169 Label L_miss;
2170
2171 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
2172 super_check_offset);
2173 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
2174
2175 // Fall through on failure!
2176 __ BIND(L_miss);
2177 }
2178
2179 //
2180 // Generate checkcasting array copy stub
2181 //
2182 // Input:
2183 // c_rarg0 - source array address
2184 // c_rarg1 - destination array address
2185 // c_rarg2 - element count, treated as ssize_t, can be zero
2186 // c_rarg3 - size_t ckoff (super_check_offset)
2187 // c_rarg4 - oop ckval (super_klass)
2188 //
2189 // Output:
2190 // r0 == 0 - success
2191 // r0 == -1^K - failure, where K is partial transfer count
2192 //
2193 address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
2194 bool dest_uninitialized;
2195 switch (stub_id) {
2196 case StubId::stubgen_checkcast_arraycopy_id:
2197 dest_uninitialized = false;
2198 break;
2199 case StubId::stubgen_checkcast_arraycopy_uninit_id:
2200 dest_uninitialized = true;
2201 break;
2202 default:
2203 ShouldNotReachHere();
2204 }
2205
2206 // The normal stub provides a 2nd entry which omits the frame push
2207 // for use when bailing out from a disjoint copy.
2208     // A 2nd entry is only generated when the caller passes a non-null nopush_entry.
2209 int entry_count = StubInfo::entry_count(stub_id);
2210 int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
2211 GrowableArray<address> entries;
2212 GrowableArray<address> *entries_ptr = (expected_entry_count == 1 ? nullptr : &entries);
2213 assert(entry_count == expected_entry_count,
2214 "expected entry count %d does not match declared entry count %d for stub %s",
2215 expected_entry_count, entry_count, StubInfo::name(stub_id));
2216 address start = load_archive_data(stub_id, entries_ptr);
2217 if (start != nullptr) {
2218 assert(entries.length() + 1 == expected_entry_count,
2219 "expected entry count %d does not match return entry count %d for stub %s",
2220 expected_entry_count, entries.length() + 1, StubInfo::name(stub_id));
2221 if (nopush_entry != nullptr) {
2222 *nopush_entry = entries.at(0);
2223 }
2224 return start;
2225 }
2226
2227 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
2228
2229 // Input registers (after setup_arg_regs)
2230 const Register from = c_rarg0; // source array address
2231 const Register to = c_rarg1; // destination array address
2232     const Register count = c_rarg2; // elements count
2233 const Register ckoff = c_rarg3; // super_check_offset
2234 const Register ckval = c_rarg4; // super_klass
2235
2236 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
2237
2238 // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
2239 const Register copied_oop = r22; // actual oop copied
2240     const Register count_save = r21; // orig elements count
2241 const Register start_to = r20; // destination array start address
2242 const Register r19_klass = r19; // oop._klass
2243
2244 // Registers used as gc temps (r5, r6, r7 are save-on-call)
2245 const Register gct1 = r5, gct2 = r6, gct3 = r7;
2246
2247 //---------------------------------------------------------------
2248 // Assembler stub will be used for this call to arraycopy
2249 // if the two arrays are subtypes of Object[] but the
2250 // destination array type is not equal to or a supertype
2251 // of the source type. Each element must be separately
2252 // checked.
2253
2254 assert_different_registers(from, to, count, ckoff, ckval, start_to,
2255 copied_oop, r19_klass, count_save);
2256
2257 __ align(CodeEntryAlignment);
2258 StubCodeMark mark(this, stub_id);
2259 start = __ pc();
2260
2261 __ enter(); // required for proper stackwalking of RuntimeStub frame
2262
2263 #ifdef ASSERT
2264 // caller guarantees that the arrays really are different
2265 // otherwise, we would have to make conjoint checks
2266 { Label L;
2267 __ b(L); // conjoint check not yet implemented
2268 __ stop("checkcast_copy within a single array");
2269 __ bind(L);
2270 }
2271 #endif //ASSERT
2272
2273 // Caller of this entry point must set up the argument registers.
2274 if (nopush_entry != nullptr) {
2275 *nopush_entry = __ pc();
2276 entries.append(*nopush_entry);
2277 BLOCK_COMMENT("Entry:");
2278 }
2279
2280 // Empty array: Nothing to do.
2281 __ cbz(count, L_done);
2282 __ push(RegSet::of(r19, r20, r21, r22), sp);
2283
2284 #ifdef ASSERT
2285 BLOCK_COMMENT("assert consistent ckoff/ckval");
2286 // The ckoff and ckval must be mutually consistent,
2287 // even though caller generates both.
2288 { Label L;
2289 int sco_offset = in_bytes(Klass::super_check_offset_offset());
2290 __ ldrw(start_to, Address(ckval, sco_offset));
2291 __ cmpw(ckoff, start_to);
2292 __ br(Assembler::EQ, L);
2293 __ stop("super_check_offset inconsistent");
2294 __ bind(L);
2295 }
2296 #endif //ASSERT
2297
2298 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
2299 bool is_oop = true;
2300 int element_size = UseCompressedOops ? 4 : 8;
2301 if (dest_uninitialized) {
2302 decorators |= IS_DEST_UNINITIALIZED;
2303 }
2304
2305 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2306 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);
2307
2308 // save the original count
2309 __ mov(count_save, count);
2310
2311 // Copy from low to high addresses
2312 __ mov(start_to, to); // Save destination array start address
2313 __ b(L_load_element);
2314
2315 // ======== begin loop ========
2316 // (Loop is rotated; its entry is L_load_element.)
2317 // Loop control:
2318 // for (; count != 0; count--) {
2319 // copied_oop = load_heap_oop(from++);
2320 // ... generate_type_check ...;
2321 // store_heap_oop(to++, copied_oop);
2322 // }
2323 __ align(OptoLoopAlignment);
2324
2325 __ BIND(L_store_element);
2326 bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
2327 __ post(to, element_size), copied_oop, noreg,
2328 gct1, gct2, gct3);
2329 __ sub(count, count, 1);
2330 __ cbz(count, L_do_card_marks);
2331
2332 // ======== loop entry is here ========
2333 __ BIND(L_load_element);
2334 bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
2335 copied_oop, noreg, __ post(from, element_size),
2336 gct1);
2337 __ cbz(copied_oop, L_store_element);
2338
2339 __ load_klass(r19_klass, copied_oop);// query the object klass
2340
2341 BLOCK_COMMENT("type_check:");
2342 generate_type_check(/*sub_klass*/r19_klass,
2343 /*super_check_offset*/ckoff,
2344 /*super_klass*/ckval,
2345 /*r_array_base*/gct1,
2346 /*temp2*/gct2,
2347 /*result*/r10, L_store_element);
2348
2349 // Fall through on failure!
2350
2351 // ======== end loop ========
2352
2353 // It was a real error; we must depend on the caller to finish the job.
2354 // Register count = remaining oops, count_orig = total oops.
2355 // Emit GC store barriers for the oops we have copied and report
2356 // their number to the caller.
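    // (For example, if 3 oops were copied before the failing check, the
    // code below leaves r0 holding ~3 == -4, from which the caller can
    // recover the partial transfer count.)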
2357
2358 __ subs(count, count_save, count); // K = partially copied oop count
2359 __ eon(count, count, zr); // report (-1^K) to caller
2360 __ br(Assembler::EQ, L_done_pop);
2361
2362 __ BIND(L_do_card_marks);
2363 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);
2364
2365 __ bind(L_done_pop);
2366 __ pop(RegSet::of(r19, r20, r21, r22), sp);
2367 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2368
2369 __ bind(L_done);
2370 __ mov(r0, count);
2371 __ leave();
2372 __ ret(lr);
2373
2374 // record the stub entry and end plus any no_push entry
2375     store_archive_data(stub_id, start, __ pc(), entries_ptr);
2376 return start;
2377 }
2378
2379 // Perform range checks on the proposed arraycopy.
2380 // Kills temp, but nothing else.
2381 // Also, clean the sign bits of src_pos and dst_pos.
2382 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2383 Register src_pos, // source position (c_rarg1)
2384                                 Register dst, // destination array oop (c_rarg2)
2385 Register dst_pos, // destination position (c_rarg3)
2386 Register length,
2387 Register temp,
2388 Label& L_failed) {
2389 BLOCK_COMMENT("arraycopy_range_checks:");
2390
2391 assert_different_registers(rscratch1, temp);
2392
2393 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2394 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2395 __ addw(temp, length, src_pos);
2396 __ cmpw(temp, rscratch1);
2397 __ br(Assembler::HI, L_failed);
2398
2399 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2400 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2401 __ addw(temp, length, dst_pos);
2402 __ cmpw(temp, rscratch1);
2403 __ br(Assembler::HI, L_failed);
2404
2405 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2406 __ movw(src_pos, src_pos);
2407 __ movw(dst_pos, dst_pos);
2408
2409 BLOCK_COMMENT("arraycopy_range_checks done");
2410 }
2411
2412 // These stubs get called from some dumb test routine.
2413 // I'll write them properly when they're called from
2414 // something that's actually doing something.
2415 static void fake_arraycopy_stub(address src, address dst, int count) {
2416 assert(count == 0, "huh?");
2417 }
2418
2419
2420 //
2421 // Generate 'unsafe' array copy stub
2422 // Though just as safe as the other stubs, it takes an unscaled
2423 // size_t argument instead of an element count.
2424 //
2425 // Input:
2426 // c_rarg0 - source array address
2427 // c_rarg1 - destination array address
2428 // c_rarg2 - byte count, treated as ssize_t, can be zero
2429 //
2430 // Examines the alignment of the operands and dispatches
2431 // to a long, int, short, or byte copy loop.
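  //
  // (The dispatch works on the bitwise OR of the source address, the
  // destination address and the byte count: for example, if any of the
  // three has bit 0 set, the copy falls back to the byte loop.)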
2432 //
2433 address generate_unsafe_copy(address byte_copy_entry,
2434 address short_copy_entry,
2435 address int_copy_entry,
2436 address long_copy_entry) {
2437 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2438 int entry_count = StubInfo::entry_count(stub_id);
2439 assert(entry_count == 1, "sanity check");
2440 address start = load_archive_data(stub_id);
2441 if (start != nullptr) {
2442 return start;
2443 }
2444 Label L_long_aligned, L_int_aligned, L_short_aligned;
2445 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2446
2447 __ align(CodeEntryAlignment);
2448 StubCodeMark mark(this, stub_id);
2449 start = __ pc();
2450 __ enter(); // required for proper stackwalking of RuntimeStub frame
2451
2452 // bump this on entry, not on exit:
2453 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2454
2455 __ orr(rscratch1, s, d);
2456 __ orr(rscratch1, rscratch1, count);
2457
2458 __ andr(rscratch1, rscratch1, BytesPerLong-1);
2459 __ cbz(rscratch1, L_long_aligned);
2460 __ andr(rscratch1, rscratch1, BytesPerInt-1);
2461 __ cbz(rscratch1, L_int_aligned);
2462 __ tbz(rscratch1, 0, L_short_aligned);
2463 __ b(RuntimeAddress(byte_copy_entry));
2464
2465 __ BIND(L_short_aligned);
2466 __ lsr(count, count, LogBytesPerShort); // size => short_count
2467 __ b(RuntimeAddress(short_copy_entry));
2468 __ BIND(L_int_aligned);
2469 __ lsr(count, count, LogBytesPerInt); // size => int_count
2470 __ b(RuntimeAddress(int_copy_entry));
2471 __ BIND(L_long_aligned);
2472 __ lsr(count, count, LogBytesPerLong); // size => long_count
2473 __ b(RuntimeAddress(long_copy_entry));
2474
2475 // record the stub entry and end
2476 store_archive_data(stub_id, start, __ pc());
2477
2478 return start;
2479 }
2480
2481 //
2482 // Generate generic array copy stubs
2483 //
2484 // Input:
2485 // c_rarg0 - src oop
2486 // c_rarg1 - src_pos (32-bits)
2487 // c_rarg2 - dst oop
2488 // c_rarg3 - dst_pos (32-bits)
2489 // c_rarg4 - element count (32-bits)
2490 //
2491 // Output:
2492 // r0 == 0 - success
2493 // r0 == -1^K - failure, where K is partial transfer count
2494 //
2495 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2496 address int_copy_entry, address oop_copy_entry,
2497 address long_copy_entry, address checkcast_copy_entry) {
2498 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2499 int entry_count = StubInfo::entry_count(stub_id);
2500 assert(entry_count == 1, "sanity check");
2501 address start = load_archive_data(stub_id);
2502 if (start != nullptr) {
2503 return start;
2504 }
2505 Label L_failed, L_objArray;
2506 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2507
2508 // Input registers
2509 const Register src = c_rarg0; // source array oop
2510 const Register src_pos = c_rarg1; // source position
2511 const Register dst = c_rarg2; // destination array oop
2512 const Register dst_pos = c_rarg3; // destination position
2513 const Register length = c_rarg4;
2514
2515
2516 // Registers used as temps
2517 const Register dst_klass = c_rarg5;
2518
2519 __ align(CodeEntryAlignment);
2520
2521 StubCodeMark mark(this, stub_id);
2522
2523 start = __ pc();
2524
2525 __ enter(); // required for proper stackwalking of RuntimeStub frame
2526
2527 // bump this on entry, not on exit:
2528 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2529
2530 //-----------------------------------------------------------------------
2531 // Assembler stub will be used for this call to arraycopy
2532 // if the following conditions are met:
2533 //
2534 // (1) src and dst must not be null.
2535 // (2) src_pos must not be negative.
2536 // (3) dst_pos must not be negative.
2537 // (4) length must not be negative.
2538 // (5) src klass and dst klass should be the same and not null.
2539 // (6) src and dst should be arrays.
2540 // (7) src_pos + length must not exceed length of src.
2541 // (8) dst_pos + length must not exceed length of dst.
2542 //
2543
2544 // if (src == nullptr) return -1;
2545 __ cbz(src, L_failed);
2546
2547 // if (src_pos < 0) return -1;
2548 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2549
2550 // if (dst == nullptr) return -1;
2551 __ cbz(dst, L_failed);
2552
2553 // if (dst_pos < 0) return -1;
2554 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2555
2556 // registers used as temp
2557 const Register scratch_length = r16; // elements count to copy
2558 const Register scratch_src_klass = r17; // array klass
2559 const Register lh = r15; // layout helper
2560
2561 // if (length < 0) return -1;
2562 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2563 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2564
2565 __ load_klass(scratch_src_klass, src);
2566 #ifdef ASSERT
2567 // assert(src->klass() != nullptr);
2568 {
2569 BLOCK_COMMENT("assert klasses not null {");
2570 Label L1, L2;
2571 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2572 __ bind(L1);
2573 __ stop("broken null klass");
2574 __ bind(L2);
2575 __ load_klass(rscratch1, dst);
2576 __ cbz(rscratch1, L1); // this would be broken also
2577 BLOCK_COMMENT("} assert klasses not null done");
2578 }
2579 #endif
2580
2581 // Load layout helper (32-bits)
2582 //
2583 // |array_tag| | header_size | element_type | |log2_element_size|
2584 // 32 30 24 16 8 2 0
2585 //
2586 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2587 //
2588
2589 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2590
2591 // Handle objArrays completely differently...
2592 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2593 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2594 __ movw(rscratch1, objArray_lh);
2595 __ eorw(rscratch2, lh, rscratch1);
2596 __ cbzw(rscratch2, L_objArray);
2597
2598 // if (src->klass() != dst->klass()) return -1;
2599 __ load_klass(rscratch2, dst);
2600 __ eor(rscratch2, rscratch2, scratch_src_klass);
2601 __ cbnz(rscratch2, L_failed);
2602
2603 // if (!src->is_Array()) return -1;
2604 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2605
2606 // At this point, it is known to be a typeArray (array_tag 0x3).
2607 #ifdef ASSERT
2608 {
2609 BLOCK_COMMENT("assert primitive array {");
2610 Label L;
2611 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2612 __ cmpw(lh, rscratch2);
2613 __ br(Assembler::GE, L);
2614 __ stop("must be a primitive array");
2615 __ bind(L);
2616 BLOCK_COMMENT("} assert primitive array done");
2617 }
2618 #endif
2619
2620 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2621 rscratch2, L_failed);
2622
2623 // TypeArrayKlass
2624 //
2625 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2626 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2627 //
2628
2629 const Register rscratch1_offset = rscratch1; // array offset
2630 const Register r15_elsize = lh; // element size
2631
2632 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2633 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2634 __ add(src, src, rscratch1_offset); // src array offset
2635 __ add(dst, dst, rscratch1_offset); // dst array offset
2636 BLOCK_COMMENT("choose copy loop based on element size");
2637
2638 // next registers should be set before the jump to corresponding stub
2639 const Register from = c_rarg0; // source array address
2640 const Register to = c_rarg1; // destination array address
2641 const Register count = c_rarg2; // elements count
2642
2643 // 'from', 'to', 'count' registers should be set in such order
2644 // since they are the same as 'src', 'src_pos', 'dst'.
2645
2646 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2647
2648 // The possible values of elsize are 0-3, i.e. exact_log2(element
2649 // size in bytes). We do a simple bitwise binary search.
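    // (Element size exponents: 0 = byte, 1 = short, 2 = int, 3 = long.
    // Bit 1 of r15_elsize separates {byte, short} from {int, long}, and
    // bit 0 then picks within each pair.)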
2650 __ BIND(L_copy_bytes);
2651 __ tbnz(r15_elsize, 1, L_copy_ints);
2652 __ tbnz(r15_elsize, 0, L_copy_shorts);
2653 __ lea(from, Address(src, src_pos));// src_addr
2654 __ lea(to, Address(dst, dst_pos));// dst_addr
2655 __ movw(count, scratch_length); // length
2656 __ b(RuntimeAddress(byte_copy_entry));
2657
2658 __ BIND(L_copy_shorts);
2659 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2660 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2661 __ movw(count, scratch_length); // length
2662 __ b(RuntimeAddress(short_copy_entry));
2663
2664 __ BIND(L_copy_ints);
2665 __ tbnz(r15_elsize, 0, L_copy_longs);
2666 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2667 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2668 __ movw(count, scratch_length); // length
2669 __ b(RuntimeAddress(int_copy_entry));
2670
2671 __ BIND(L_copy_longs);
2672 #ifdef ASSERT
2673 {
2674 BLOCK_COMMENT("assert long copy {");
2675 Label L;
2676 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2677 __ cmpw(r15_elsize, LogBytesPerLong);
2678 __ br(Assembler::EQ, L);
2679 __ stop("must be long copy, but elsize is wrong");
2680 __ bind(L);
2681 BLOCK_COMMENT("} assert long copy done");
2682 }
2683 #endif
2684 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2685 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2686 __ movw(count, scratch_length); // length
2687 __ b(RuntimeAddress(long_copy_entry));
2688
2689 // ObjArrayKlass
2690 __ BIND(L_objArray);
2691 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2692
2693 Label L_plain_copy, L_checkcast_copy;
2694 // test array classes for subtyping
2695 __ load_klass(r15, dst);
2696 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2697 __ br(Assembler::NE, L_checkcast_copy);
2698
2699 // Identically typed arrays can be copied without element-wise checks.
2700 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2701 rscratch2, L_failed);
2702
2703 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2704 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2705 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2706 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2707 __ movw(count, scratch_length); // length
2708 __ BIND(L_plain_copy);
2709 __ b(RuntimeAddress(oop_copy_entry));
2710
2711 __ BIND(L_checkcast_copy);
2712 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2713 {
2714 // Before looking at dst.length, make sure dst is also an objArray.
2715 __ ldrw(rscratch1, Address(r15, lh_offset));
2716 __ movw(rscratch2, objArray_lh);
2717 __ eorw(rscratch1, rscratch1, rscratch2);
2718 __ cbnzw(rscratch1, L_failed);
2719
2720 // It is safe to examine both src.length and dst.length.
2721 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2722 r15, L_failed);
2723
2724 __ load_klass(dst_klass, dst); // reload
2725
2726 // Marshal the base address arguments now, freeing registers.
2727 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2728 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2729 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2730 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2731 __ movw(count, length); // length (reloaded)
2732 Register sco_temp = c_rarg3; // this register is free now
2733 assert_different_registers(from, to, count, sco_temp,
2734 dst_klass, scratch_src_klass);
2735 // assert_clean_int(count, sco_temp);
2736
2737 // Generate the type check.
2738 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2739 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2740
2741 // Smashes rscratch1, rscratch2
2742 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2743 L_plain_copy);
2744
2745 // Fetch destination element klass from the ObjArrayKlass header.
2746 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2747 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2748 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2749
2750 // the checkcast_copy loop needs two extra arguments:
2751 assert(c_rarg3 == sco_temp, "#3 already in place");
2752 // Set up arguments for checkcast_copy_entry.
2753 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2754 __ b(RuntimeAddress(checkcast_copy_entry));
2755 }
2756
2757 __ BIND(L_failed);
2758 __ mov(r0, -1);
2759 __ leave(); // required for proper stackwalking of RuntimeStub frame
2760 __ ret(lr);
2761
2762 // record the stub entry and end
2763 store_archive_data(stub_id, start, __ pc());
2764
2765 return start;
2766 }
2767
2768 //
2769 // Generate stub for array fill. If "aligned" is true, the
2770 // "to" address is assumed to be heapword aligned.
2771 //
2772 // Arguments for generated stub:
2773 // to: c_rarg0
2774 // value: c_rarg1
2775 // count: c_rarg2 treated as signed
2776 //
2777 address generate_fill(StubId stub_id) {
2778 BasicType t;
2779 bool aligned;
2780
2781 switch (stub_id) {
2782 case StubId::stubgen_jbyte_fill_id:
2783 t = T_BYTE;
2784 aligned = false;
2785 break;
2786 case StubId::stubgen_jshort_fill_id:
2787 t = T_SHORT;
2788 aligned = false;
2789 break;
2790 case StubId::stubgen_jint_fill_id:
2791 t = T_INT;
2792 aligned = false;
2793 break;
2794 case StubId::stubgen_arrayof_jbyte_fill_id:
2795 t = T_BYTE;
2796 aligned = true;
2797 break;
2798 case StubId::stubgen_arrayof_jshort_fill_id:
2799 t = T_SHORT;
2800 aligned = true;
2801 break;
2802 case StubId::stubgen_arrayof_jint_fill_id:
2803 t = T_INT;
2804 aligned = true;
2805 break;
2806 default:
2807 ShouldNotReachHere();
2808 };
2809 int entry_count = StubInfo::entry_count(stub_id);
2810 assert(entry_count == 1, "sanity check");
2811 address start = load_archive_data(stub_id);
2812 if (start != nullptr) {
2813 return start;
2814 }
2815 __ align(CodeEntryAlignment);
2816 StubCodeMark mark(this, stub_id);
2817 start = __ pc();
2818
2819 BLOCK_COMMENT("Entry:");
2820
2821     const Register to = c_rarg0; // destination array address
2822 const Register value = c_rarg1; // value
2823 const Register count = c_rarg2; // elements count
2824
2825 const Register bz_base = r10; // base for block_zero routine
2826 const Register cnt_words = r11; // temp register
2827
2828 __ enter();
2829
2830 Label L_fill_elements, L_exit1;
2831
2832 int shift = -1;
2833 switch (t) {
2834 case T_BYTE:
2835 shift = 0;
2836 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2837 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2838 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2839 __ br(Assembler::LO, L_fill_elements);
2840 break;
2841 case T_SHORT:
2842 shift = 1;
2843 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2844 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2845 __ br(Assembler::LO, L_fill_elements);
2846 break;
2847 case T_INT:
2848 shift = 2;
2849 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2850 __ br(Assembler::LO, L_fill_elements);
2851 break;
2852 default: ShouldNotReachHere();
2853 }
2854
2855     // Align the destination address to an 8-byte boundary.
2856 Label L_skip_align1, L_skip_align2, L_skip_align4;
2857 if (!aligned) {
2858 switch (t) {
2859 case T_BYTE:
2860 // One byte misalignment happens only for byte arrays.
2861 __ tbz(to, 0, L_skip_align1);
2862 __ strb(value, Address(__ post(to, 1)));
2863 __ subw(count, count, 1);
2864 __ bind(L_skip_align1);
2865 // Fallthrough
2866 case T_SHORT:
2867 // Two bytes misalignment happens only for byte and short (char) arrays.
2868 __ tbz(to, 1, L_skip_align2);
2869 __ strh(value, Address(__ post(to, 2)));
2870 __ subw(count, count, 2 >> shift);
2871 __ bind(L_skip_align2);
2872 // Fallthrough
2873 case T_INT:
2874 // Align to 8 bytes, we know we are 4 byte aligned to start.
2875 __ tbz(to, 2, L_skip_align4);
2876 __ strw(value, Address(__ post(to, 4)));
2877 __ subw(count, count, 4 >> shift);
2878 __ bind(L_skip_align4);
2879 break;
2880 default: ShouldNotReachHere();
2881 }
2882 }
2883
2884 //
2885 // Fill large chunks
2886 //
2887 __ lsrw(cnt_words, count, 3 - shift); // number of words
2888 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2889 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
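    // (For example, a byte fill value of 0xAB has by now been widened to
    // 0xABABABABABABABAB, so every 64-bit store below writes eight copies
    // of the original value.)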
2890 if (UseBlockZeroing) {
2891 Label non_block_zeroing, rest;
2892 // If the fill value is zero we can use the fast zero_words().
2893 __ cbnz(value, non_block_zeroing);
2894 __ mov(bz_base, to);
2895 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2896 address tpc = __ zero_words(bz_base, cnt_words);
2897 if (tpc == nullptr) {
2898 fatal("CodeCache is full at generate_fill");
2899 }
2900 __ b(rest);
2901 __ bind(non_block_zeroing);
2902 __ fill_words(to, cnt_words, value);
2903 __ bind(rest);
2904 } else {
2905 __ fill_words(to, cnt_words, value);
2906 }
2907
2908 // Remaining count is less than 8 bytes. Fill it by a single store.
2909 // Note that the total length is no less than 8 bytes.
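    // For example, in a byte fill with 5 bytes left over, 'to' is advanced
    // to the end of the region and the 8-byte store at (to, -8) rewrites the
    // last 3 already-filled bytes together with the 5 remaining ones.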
2910 if (t == T_BYTE || t == T_SHORT) {
2911 Label L_exit1;
2912 __ cbzw(count, L_exit1);
2913 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2914 __ str(value, Address(to, -8)); // overwrite some elements
2915 __ bind(L_exit1);
2916 __ leave();
2917 __ ret(lr);
2918 }
2919
2920     // Handle fills of less than 8 bytes.
2921 Label L_fill_2, L_fill_4, L_exit2;
2922 __ bind(L_fill_elements);
2923 switch (t) {
2924 case T_BYTE:
2925 __ tbz(count, 0, L_fill_2);
2926 __ strb(value, Address(__ post(to, 1)));
2927 __ bind(L_fill_2);
2928 __ tbz(count, 1, L_fill_4);
2929 __ strh(value, Address(__ post(to, 2)));
2930 __ bind(L_fill_4);
2931 __ tbz(count, 2, L_exit2);
2932 __ strw(value, Address(to));
2933 break;
2934 case T_SHORT:
2935 __ tbz(count, 0, L_fill_4);
2936 __ strh(value, Address(__ post(to, 2)));
2937 __ bind(L_fill_4);
2938 __ tbz(count, 1, L_exit2);
2939 __ strw(value, Address(to));
2940 break;
2941 case T_INT:
2942 __ cbzw(count, L_exit2);
2943 __ strw(value, Address(to));
2944 break;
2945 default: ShouldNotReachHere();
2946 }
2947 __ bind(L_exit2);
2948 __ leave();
2949 __ ret(lr);
2950
2951 // record the stub entry and end
2952 store_archive_data(stub_id, start, __ pc());
2953
2954 return start;
2955 }
2956
2957 address generate_unsafecopy_common_error_exit() {
2958 StubId stub_id = StubId::stubgen_unsafecopy_common_id;
2959 int entry_count = StubInfo::entry_count(stub_id);
2960 assert(entry_count == 1, "sanity check");
2961 address start = load_archive_data(stub_id);
2962 if (start != nullptr) {
2963 return start;
2964 }
2965 __ align(CodeEntryAlignment);
2966 StubCodeMark mark(this, stub_id);
2967 start = __ pc();
2968 __ leave();
2969 __ mov(r0, 0);
2970 __ ret(lr);
2971
2972 // record the stub entry and end
2973 store_archive_data(stub_id, start, __ pc());
2974
2975 return start;
2976 }
2977
2978 //
2979 // Generate 'unsafe' set memory stub
2980 // Though just as safe as the other stubs, it takes an unscaled
2981 // size_t (# bytes) argument instead of an element count.
2982 //
2983 // This fill operation is atomicity preserving: as long as the
2984 // address supplied is sufficiently aligned, all writes of up to 64
2985 // bits in size are single-copy atomic.
2986 //
2987 // Input:
2988 // c_rarg0 - destination array address
2989 // c_rarg1 - byte count (size_t)
2990 // c_rarg2 - byte value
2991 //
2992 address generate_unsafe_setmemory() {
2993 StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
2994 int entry_count = StubInfo::entry_count(stub_id);
2995 assert(entry_count == 1, "sanity check");
2996     // we expect one set of extra unsafe memory access handler entries
2997 GrowableArray<address> extras;
2998 int extra_count = 1 * UnsafeMemoryAccess::COLUMN_COUNT;
2999 address start = load_archive_data(stub_id, nullptr, &extras);
3000 if (start != nullptr) {
3001 assert(extras.length() == extra_count,
3002 "unexpected extra entry count %d", extras.length());
3003 register_unsafe_access_handlers(extras, 0, 1);
3004 return start;
3005 }
3006
3007 __ align(CodeEntryAlignment);
3008 StubCodeMark mark(this, stub_id);
3009 start = __ pc();
3010
3011 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
3012 Label tail;
3013
3014 {
3015 UnsafeMemoryAccessMark umam(this, true, false);
3016
3017 __ enter(); // required for proper stackwalking of RuntimeStub frame
3018
3019 __ dup(v0, __ T16B, value);
3020
3021 if (AvoidUnalignedAccesses) {
3022 __ cmp(count, (u1)16);
3023 __ br(__ LO, tail);
3024
3025 __ mov(rscratch1, 16);
3026 __ andr(rscratch2, dest, 15);
3027 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
3028 __ strq(v0, Address(dest));
3029 __ sub(count, count, rscratch1);
3030 __ add(dest, dest, rscratch1);
3031 }
3032
3033 __ subs(count, count, (u1)64);
3034 __ br(__ LO, tail);
3035 {
3036 Label again;
3037 __ bind(again);
3038 __ stpq(v0, v0, Address(dest));
3039 __ stpq(v0, v0, Address(dest, 32));
3040
3041 __ subs(count, count, 64);
3042 __ add(dest, dest, 64);
3043 __ br(__ HS, again);
3044 }
3045
3046 __ bind(tail);
3047 // The count of bytes is off by 64, but we don't need to correct
3048 // it because we're only going to use the least-significant few
3049 // count bits from here on.
3050 // __ add(count, count, 64);
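      // (For example, if 13 bytes remain, count now holds 13 - 64; its low
      // six bits are still 0b001101, which is all the tests below look at.)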
3051
3052 {
3053 Label dont;
3054 __ tbz(count, exact_log2(32), dont);
3055 __ stpq(v0, v0, __ post(dest, 32));
3056 __ bind(dont);
3057 }
3058 {
3059 Label dont;
3060 __ tbz(count, exact_log2(16), dont);
3061 __ strq(v0, __ post(dest, 16));
3062 __ bind(dont);
3063 }
3064 {
3065 Label dont;
3066 __ tbz(count, exact_log2(8), dont);
3067 __ strd(v0, __ post(dest, 8));
3068 __ bind(dont);
3069 }
3070
3071 Label finished;
3072 __ tst(count, 7);
3073 __ br(__ EQ, finished);
3074
3075 {
3076 Label dont;
3077 __ tbz(count, exact_log2(4), dont);
3078 __ strs(v0, __ post(dest, 4));
3079 __ bind(dont);
3080 }
3081 {
3082 Label dont;
3083 __ tbz(count, exact_log2(2), dont);
3084 __ bfi(value, value, 8, 8);
3085 __ strh(value, __ post(dest, 2));
3086 __ bind(dont);
3087 }
3088 {
3089 Label dont;
3090 __ tbz(count, exact_log2(1), dont);
3091 __ strb(value, Address(dest));
3092 __ bind(dont);
3093 }
3094
3095 __ bind(finished);
3096 __ leave();
3097 __ ret(lr);
3098 // have to exit the block and destroy the UnsafeMemoryAccessMark
3099 // in order to retrieve the handler end address
3100 }
3101
3102 // install saved handler addresses in extras
3103 address end = __ pc();
3104 retrieve_unsafe_access_handlers(start, end, extras);
3105 assert(extras.length() == extra_count,
3106 "incorrect handlers count %d", extras.length());
3107 // record the stub entry and end plus the extras
3108 store_archive_data(stub_id, start, end, nullptr, &extras);
3109
3110 return start;
3111 }
3112
3113 address generate_data_cache_writeback() {
3114 const Register line = c_rarg0; // address of line to write back
3115
3116 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
3117 int entry_count = StubInfo::entry_count(stub_id);
3118 assert(entry_count == 1, "sanity check");
3119 address start = load_archive_data(stub_id);
3120 if (start != nullptr) {
3121 return start;
3122 }
3123 __ align(CodeEntryAlignment);
3124 StubCodeMark mark(this, stub_id);
3125
3126 start = __ pc();
3127 __ enter();
3128 __ cache_wb(Address(line, 0));
3129 __ leave();
3130 __ ret(lr);
3131
3132 // record the stub entry and end
3133 store_archive_data(stub_id, start, __ pc());
3134
3135 return start;
3136 }
3137
3138 address generate_data_cache_writeback_sync() {
3139 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
3140 int entry_count = StubInfo::entry_count(stub_id);
3141 assert(entry_count == 1, "sanity check");
3142 address start = load_archive_data(stub_id);
3143 if (start != nullptr) {
3144 return start;
3145 }
3146 const Register is_pre = c_rarg0; // pre or post sync
3147 __ align(CodeEntryAlignment);
3148 StubCodeMark mark(this, stub_id);
3149
3150 // pre wbsync is a no-op
3151     // post wbsync is implemented as a memory barrier
3152
3153 Label skip;
3154 start = __ pc();
3155 __ enter();
3156 __ cbnz(is_pre, skip);
3157 __ cache_wbsync(false);
3158 __ bind(skip);
3159 __ leave();
3160 __ ret(lr);
3161
3162 // record the stub entry and end
3163 store_archive_data(stub_id, start, __ pc());
3164
3165 return start;
3166 }
3167
3168 void generate_arraycopy_stubs() {
3169 // Some copy stubs publish a normal entry and then a second 'fallback'
3170 // entry immediately following their stack push. This can be used
3171 // as a post-push branch target by compatible stubs when they
3172 // identify a special case that can be handled by the fallback
3173 // stub, e.g. a disjoint copy stub may be used as a special-case
3174 // fallback for its compatible conjoint copy stub.
3175 //
3176 // A nopush entry is always returned in the following local and
3177 // then published by assigning it to the appropriate entry field in
3178 // class StubRoutines. The entry value is then passed to the
3179 // generator of the compatible stub. That means the entry must be
3180 // listed when saving to/restoring from the AOT cache, ensuring
3181 // that the inter-stub jumps are noted at AOT-cache save and
3182 // relocated at AOT-cache load.
3183 address nopush_entry;
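// For example (illustrative): the jbyte disjoint stub below returns its
// post-push entry via nopush_entry, that value is published as
// StubRoutines::_jbyte_disjoint_arraycopy_nopush, and the matching
// conjoint stub is handed that entry so it can branch to it when it
// detects a non-overlapping copy.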
3184
3185 // generate the common exit first so later stubs can rely on it if
3186 // they want an UnsafeMemoryAccess exit non-local to the stub
3187 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
3188 // register the stub as the default exit with class UnsafeMemoryAccess
3189 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
3190
3191 // generate and publish aarch64-specific bulk copy routines first
3192 // so we can call them from other copy stubs
3193 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3194 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3195
3196 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3197 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3198
3199 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
3200 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
3201
3202 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
3203
3204 //*** jbyte
3205 // Always need aligned and unaligned versions
3206 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
3207 // disjoint nopush entry is needed by conjoint copy
3208 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
3209 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
3210 // conjoint nopush entry is needed by generic/unsafe copy
3211 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
3212 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
3213 // disjoint arrayof nopush entry is needed by conjoint copy
3214 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
3215 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
3216
3217 //*** jshort
3218 // Always need aligned and unaligned versions
3219 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
3220 // disjoint nopush entry is needed by conjoint copy
3221 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
3222 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
3223 // conjoint nopush entry is used by generic/unsafe copy
3224 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
3225 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
3226 // disjoint arrayof nopush entry is needed by conjoint copy
3227 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
3228 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
3229
3230 //*** jint
3231 // Aligned versions
3232 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
3233 // disjoint arrayof nopush entry is needed by conjoint copy
3234 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
3235 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
3236 // On 64-bit we need both aligned and unaligned versions of jint arraycopy.
3237 // jint_arraycopy_nopush always points to the unaligned version
3238 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
3239 // disjoint nopush entry is needed by conjoint copy
3240 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
3241 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
3242 // conjoint nopush entry is needed by generic/unsafe copy
3243 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
3244
3245 //*** jlong
3246 // It is always aligned
3247 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
3248 // disjoint arrayof nopush entry is needed by conjoint copy
3249 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
3250 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
3251 // conjoint nopush entry is needed by generic/unsafe copy
3252 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
3253 // disjoint normal/nopush and conjoint normal entries are not
3254 // generated since the arrayof versions are the same
3255 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3256 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
3257 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
3258
3259 //*** oops
3260 {
3261 StubRoutines::_arrayof_oop_disjoint_arraycopy
3262 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
3263 // disjoint arrayof nopush entry is needed by conjoint copy
3264 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
3265 StubRoutines::_arrayof_oop_arraycopy
3266 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
3267 // conjoint arrayof nopush entry is needed by generic/unsafe copy
3268 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
3269 // Aligned versions without pre-barriers
3270 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
3271 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
3272 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
3273 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
3274 // note that we don't need a returned nopush entry because the
3275 // generic/unsafe copy does not cater for uninit arrays.
3276 StubRoutines::_arrayof_oop_arraycopy_uninit
3277 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
3278 }
3279
3280 // for oop copies reuse arrayof entries for non-arrayof cases
3281 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3282 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
3283 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
3284 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3285 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
3286 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
3287
3288 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
3289 // checkcast nopush entry is needed by generic copy
3290 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
3291 // note that we don't need a returned nopush entry because the
3292 // generic copy does not cater for uninit arrays.
3293 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
3294
3295 // unsafe arraycopy may fall back on conjoint stubs
3296 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
3297 StubRoutines::_jshort_arraycopy_nopush,
3298 StubRoutines::_jint_arraycopy_nopush,
3299 StubRoutines::_jlong_arraycopy_nopush);
3300
3301 // generic arraycopy may fall back on conjoint stubs
3302 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
3303 StubRoutines::_jshort_arraycopy_nopush,
3304 StubRoutines::_jint_arraycopy_nopush,
3305 StubRoutines::_oop_arraycopy_nopush,
3306 StubRoutines::_jlong_arraycopy_nopush,
3307 StubRoutines::_checkcast_arraycopy_nopush);
3308
3309 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
3310 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
3311 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
3312 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
3313 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
3314 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
3315 }
3316
3317 void generate_math_stubs() { Unimplemented(); }
3318
3319 // Arguments:
3320 //
3321 // Inputs:
3322 // c_rarg0 - source byte array address
3323 // c_rarg1 - destination byte array address
3324 // c_rarg2 - sessionKe (key) in little endian int array
3325 //
3326 address generate_aescrypt_encryptBlock() {
3327 assert(UseAES, "need AES cryptographic extension support");
3328 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
3329 int entry_count = StubInfo::entry_count(stub_id);
3330 assert(entry_count == 1, "sanity check");
3331 address start = load_archive_data(stub_id);
3332 if (start != nullptr) {
3333 return start;
3334 }
3335 __ align(CodeEntryAlignment);
3336 StubCodeMark mark(this, stub_id);
3337
3338 const Register from = c_rarg0; // source array address
3339 const Register to = c_rarg1; // destination array address
3340 const Register key = c_rarg2; // key array address
3341 const Register keylen = rscratch1;
3342
3343 start = __ pc();
3344 __ enter();
3345
3346 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3347
3348 __ aesenc_loadkeys(key, keylen);
3349 __ aesecb_encrypt(from, to, keylen);
3350
3351 __ mov(r0, 0);
3352
3353 __ leave();
3354 __ ret(lr);
3355
3356 // record the stub entry and end
3357 store_archive_data(stub_id, start, __ pc());
3358
3359 return start;
3360 }
3361
3362 // Arguments:
3363 //
3364 // Inputs:
3365 // c_rarg0 - source byte array address
3366 // c_rarg1 - destination byte array address
3367 // c_rarg2 - sessionKd (key) in little endian int array
3368 //
3369 address generate_aescrypt_decryptBlock() {
3370 assert(UseAES, "need AES cryptographic extension support");
3371 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
3372 int entry_count = StubInfo::entry_count(stub_id);
3373 assert(entry_count == 1, "sanity check");
3374 address start = load_archive_data(stub_id);
3375 if (start != nullptr) {
3376 return start;
3377 }
3378 __ align(CodeEntryAlignment);
3379 StubCodeMark mark(this, stub_id);
3380 Label L_doLast;
3381
3382 const Register from = c_rarg0; // source array address
3383 const Register to = c_rarg1; // destination array address
3384 const Register key = c_rarg2; // key array address
3385 const Register keylen = rscratch1;
3386
3387 start = __ pc();
3388 __ enter(); // required for proper stackwalking of RuntimeStub frame
3389
3390 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3391
3392 __ aesecb_decrypt(from, to, key, keylen);
3393
3394 __ mov(r0, 0);
3395
3396 __ leave();
3397 __ ret(lr);
3398
3399 // record the stub entry and end
3400 store_archive_data(stub_id, start, __ pc());
3401
3402 return start;
3403 }
3404
3405 // Arguments:
3406 //
3407 // Inputs:
3408 // c_rarg0 - source byte array address
3409 // c_rarg1 - destination byte array address
3410 // c_rarg2 - sessionKe (key) in little endian int array
3411 // c_rarg3 - r vector byte array address
3412 // c_rarg4 - input length
3413 //
3414 // Output:
3415 // r0 - input length
3416 //
3417 address generate_cipherBlockChaining_encryptAESCrypt() {
3418 assert(UseAES, "need AES cryptographic extension support");
3419 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
3420 int entry_count = StubInfo::entry_count(stub_id);
3421 assert(entry_count == 1, "sanity check");
3422 address start = load_archive_data(stub_id);
3423 if (start != nullptr) {
3424 return start;
3425 }
3426 __ align(CodeEntryAlignment);
3427 StubCodeMark mark(this, stub_id);
3428
3429 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3430
3431 const Register from = c_rarg0; // source array address
3432 const Register to = c_rarg1; // destination array address
3433 const Register key = c_rarg2; // key array address
3434 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3435 // and left with the results of the last encryption block
3436 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3437 const Register keylen = rscratch1;
3438
3439 start = __ pc();
3440
3441 __ enter();
3442
3443 __ movw(rscratch2, len_reg);
3444
3445 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3446
3447 __ ld1(v0, __ T16B, rvec);
3448
3449 __ cmpw(keylen, 52);
3450 __ br(Assembler::CC, L_loadkeys_44);
3451 __ br(Assembler::EQ, L_loadkeys_52);
3452
3453 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3454 __ rev32(v17, __ T16B, v17);
3455 __ rev32(v18, __ T16B, v18);
3456 __ BIND(L_loadkeys_52);
3457 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3458 __ rev32(v19, __ T16B, v19);
3459 __ rev32(v20, __ T16B, v20);
3460 __ BIND(L_loadkeys_44);
3461 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3462 __ rev32(v21, __ T16B, v21);
3463 __ rev32(v22, __ T16B, v22);
3464 __ rev32(v23, __ T16B, v23);
3465 __ rev32(v24, __ T16B, v24);
3466 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3467 __ rev32(v25, __ T16B, v25);
3468 __ rev32(v26, __ T16B, v26);
3469 __ rev32(v27, __ T16B, v27);
3470 __ rev32(v28, __ T16B, v28);
3471 __ ld1(v29, v30, v31, __ T16B, key);
3472 __ rev32(v29, __ T16B, v29);
3473 __ rev32(v30, __ T16B, v30);
3474 __ rev32(v31, __ T16B, v31);
3475
3476 __ BIND(L_aes_loop);
3477 __ ld1(v1, __ T16B, __ post(from, 16));
3478 __ eor(v0, __ T16B, v0, v1);
3479
3480 __ br(Assembler::CC, L_rounds_44);
3481 __ br(Assembler::EQ, L_rounds_52);
3482
3483 __ aese(v0, v17); __ aesmc(v0, v0);
3484 __ aese(v0, v18); __ aesmc(v0, v0);
3485 __ BIND(L_rounds_52);
3486 __ aese(v0, v19); __ aesmc(v0, v0);
3487 __ aese(v0, v20); __ aesmc(v0, v0);
3488 __ BIND(L_rounds_44);
3489 __ aese(v0, v21); __ aesmc(v0, v0);
3490 __ aese(v0, v22); __ aesmc(v0, v0);
3491 __ aese(v0, v23); __ aesmc(v0, v0);
3492 __ aese(v0, v24); __ aesmc(v0, v0);
3493 __ aese(v0, v25); __ aesmc(v0, v0);
3494 __ aese(v0, v26); __ aesmc(v0, v0);
3495 __ aese(v0, v27); __ aesmc(v0, v0);
3496 __ aese(v0, v28); __ aesmc(v0, v0);
3497 __ aese(v0, v29); __ aesmc(v0, v0);
3498 __ aese(v0, v30);
3499 __ eor(v0, __ T16B, v0, v31);
3500
3501 __ st1(v0, __ T16B, __ post(to, 16));
3502
3503 __ subw(len_reg, len_reg, 16);
3504 __ cbnzw(len_reg, L_aes_loop);
3505
3506 __ st1(v0, __ T16B, rvec);
3507
3508 __ mov(r0, rscratch2);
3509
3510 __ leave();
3511 __ ret(lr);
3512
3513 // record the stub entry and end
3514 store_archive_data(stub_id, start, __ pc());
3515
3516 return start;
3517 }
3518
3519 // Arguments:
3520 //
3521 // Inputs:
3522 // c_rarg0 - source byte array address
3523 // c_rarg1 - destination byte array address
3524 // c_rarg2 - sessionKd (key) in little endian int array
3525 // c_rarg3 - r vector byte array address
3526 // c_rarg4 - input length
3527 //
3528 // Output:
3529 // r0 - input length
3530 //
3531 address generate_cipherBlockChaining_decryptAESCrypt() {
3532 assert(UseAES, "need AES cryptographic extension support");
3533 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
3534 int entry_count = StubInfo::entry_count(stub_id);
3535 assert(entry_count == 1, "sanity check");
3536 address start = load_archive_data(stub_id);
3537 if (start != nullptr) {
3538 return start;
3539 }
3540 __ align(CodeEntryAlignment);
3541 StubCodeMark mark(this, stub_id);
3542
3543 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3544
3545 const Register from = c_rarg0; // source array address
3546 const Register to = c_rarg1; // destination array address
3547 const Register key = c_rarg2; // key array address
3548 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3549 // and left with the results of the last encryption block
3550 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3551 const Register keylen = rscratch1;
3552
3553 start = __ pc();
3554
3555 __ enter();
3556
3557 __ movw(rscratch2, len_reg);
3558
3559 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3560
3561 __ ld1(v2, __ T16B, rvec);
3562
3563 __ ld1(v31, __ T16B, __ post(key, 16));
3564 __ rev32(v31, __ T16B, v31);
3565
3566 __ cmpw(keylen, 52);
3567 __ br(Assembler::CC, L_loadkeys_44);
3568 __ br(Assembler::EQ, L_loadkeys_52);
3569
3570 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3571 __ rev32(v17, __ T16B, v17);
3572 __ rev32(v18, __ T16B, v18);
3573 __ BIND(L_loadkeys_52);
3574 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3575 __ rev32(v19, __ T16B, v19);
3576 __ rev32(v20, __ T16B, v20);
3577 __ BIND(L_loadkeys_44);
3578 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3579 __ rev32(v21, __ T16B, v21);
3580 __ rev32(v22, __ T16B, v22);
3581 __ rev32(v23, __ T16B, v23);
3582 __ rev32(v24, __ T16B, v24);
3583 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3584 __ rev32(v25, __ T16B, v25);
3585 __ rev32(v26, __ T16B, v26);
3586 __ rev32(v27, __ T16B, v27);
3587 __ rev32(v28, __ T16B, v28);
3588 __ ld1(v29, v30, __ T16B, key);
3589 __ rev32(v29, __ T16B, v29);
3590 __ rev32(v30, __ T16B, v30);
3591
3592 __ BIND(L_aes_loop);
3593 __ ld1(v0, __ T16B, __ post(from, 16));
3594 __ orr(v1, __ T16B, v0, v0);
3595
3596 __ br(Assembler::CC, L_rounds_44);
3597 __ br(Assembler::EQ, L_rounds_52);
3598
3599 __ aesd(v0, v17); __ aesimc(v0, v0);
3600 __ aesd(v0, v18); __ aesimc(v0, v0);
3601 __ BIND(L_rounds_52);
3602 __ aesd(v0, v19); __ aesimc(v0, v0);
3603 __ aesd(v0, v20); __ aesimc(v0, v0);
3604 __ BIND(L_rounds_44);
3605 __ aesd(v0, v21); __ aesimc(v0, v0);
3606 __ aesd(v0, v22); __ aesimc(v0, v0);
3607 __ aesd(v0, v23); __ aesimc(v0, v0);
3608 __ aesd(v0, v24); __ aesimc(v0, v0);
3609 __ aesd(v0, v25); __ aesimc(v0, v0);
3610 __ aesd(v0, v26); __ aesimc(v0, v0);
3611 __ aesd(v0, v27); __ aesimc(v0, v0);
3612 __ aesd(v0, v28); __ aesimc(v0, v0);
3613 __ aesd(v0, v29); __ aesimc(v0, v0);
3614 __ aesd(v0, v30);
3615 __ eor(v0, __ T16B, v0, v31);
3616 __ eor(v0, __ T16B, v0, v2);
3617
3618 __ st1(v0, __ T16B, __ post(to, 16));
3619 __ orr(v2, __ T16B, v1, v1);
3620
3621 __ subw(len_reg, len_reg, 16);
3622 __ cbnzw(len_reg, L_aes_loop);
3623
3624 __ st1(v2, __ T16B, rvec);
3625
3626 __ mov(r0, rscratch2);
3627
3628 __ leave();
3629 __ ret(lr);
3630
3631 // record the stub entry and end
3632 store_archive_data(stub_id, start, __ pc());
3633
3634 return start;
3635 }
3636
3637 // Big-endian 128-bit + 64-bit -> 128-bit addition.
3638 // Inputs: 128-bits. in is preserved.
3639 // The least-significant 64-bit word is in the upper dword of each vector.
3640 // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
3641 // Output: result
3642 void be_add_128_64(FloatRegister result, FloatRegister in,
3643 FloatRegister inc, FloatRegister tmp) {
3644 assert_different_registers(result, tmp, inc);
3645
3646 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
3647 // input
3648 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
3649 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
3650 // MSD == 0 (must be!) to LSD
3651 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
3652 }
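// Illustrative scalar equivalent of be_add_128_64 (sketch only): with the
// 128-bit operand viewed as big-endian words (ms, ls),
//   ls' = ls + inc;
//   ms' = ms + (ls' < inc ? 1 : 0);   // carry out of the low word
// The cm(HI)/ext/subv sequence above materializes that carry in the
// most-significant lane and folds it in by subtracting -1.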
3653
3654 // CTR AES crypt.
3655 // Arguments:
3656 //
3657 // Inputs:
3658 // c_rarg0 - source byte array address
3659 // c_rarg1 - destination byte array address
3660 // c_rarg2 - sessionKe (key) in little endian int array
3661 // c_rarg3 - counter vector byte array address
3662 // c_rarg4 - input length
3663 // c_rarg5 - saved encryptedCounter start
3664 // c_rarg6 - saved used length
3665 //
3666 // Output:
3667 // r0 - input length
3668 //
3669 address generate_counterMode_AESCrypt() {
3670 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
3671 int entry_count = StubInfo::entry_count(stub_id);
3672 assert(entry_count == 1, "sanity check");
3673 address start = load_archive_data(stub_id);
3674 if (start != nullptr) {
3675 return start;
3676 }
3677 const Register in = c_rarg0;
3678 const Register out = c_rarg1;
3679 const Register key = c_rarg2;
3680 const Register counter = c_rarg3;
3681 const Register saved_len = c_rarg4, len = r10;
3682 const Register saved_encrypted_ctr = c_rarg5;
3683 const Register used_ptr = c_rarg6, used = r12;
3684
3685 const Register offset = r7;
3686 const Register keylen = r11;
3687
3688 const unsigned char block_size = 16;
3689 const int bulk_width = 4;
3690 // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3691 // performance with larger data sizes, but it also means that the
3692 // fast path isn't used until you have at least 8 blocks, and up
3693 // to 127 bytes of data will be executed on the slow path. For
3694 // that reason, and also so as not to blow away too much icache, 4
3695 // blocks seems like a sensible compromise.
3696
3697 // Algorithm:
3698 //
3699 // if (len == 0) {
3700 // goto DONE;
3701 // }
3702 // int result = len;
3703 // do {
3704 // if (used >= blockSize) {
3705 // if (len >= bulk_width * blockSize) {
3706 // CTR_large_block();
3707 // if (len == 0)
3708 // goto DONE;
3709 // }
3710 // for (;;) {
3711 // 16ByteVector v0 = counter;
3712 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3713 // used = 0;
3714 // if (len < blockSize)
3715 // break; /* goto NEXT */
3716 // 16ByteVector v1 = load16Bytes(in, offset);
3717 // v1 = v1 ^ encryptedCounter;
3718 // store16Bytes(out, offset);
3719 // used = blockSize;
3720 // offset += blockSize;
3721 // len -= blockSize;
3722 // if (len == 0)
3723 // goto DONE;
3724 // }
3725 // }
3726 // NEXT:
3727 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3728 // len--;
3729 // } while (len != 0);
3730 // DONE:
3731 // return result;
3732 //
3733 // CTR_large_block()
3734 // Wide bulk encryption of whole blocks.
3735
3736 __ align(CodeEntryAlignment);
3737 StubCodeMark mark(this, stub_id);
3738 start = __ pc();
3739 __ enter();
3740
3741 Label DONE, CTR_large_block, large_block_return;
3742 __ ldrw(used, Address(used_ptr));
3743 __ cbzw(saved_len, DONE);
3744
3745 __ mov(len, saved_len);
3746 __ mov(offset, 0);
3747
3748 // Compute #rounds for AES based on the length of the key array
3749 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3750
3751 __ aesenc_loadkeys(key, keylen);
3752
3753 {
3754 Label L_CTR_loop, NEXT;
3755
3756 __ bind(L_CTR_loop);
3757
3758 __ cmp(used, block_size);
3759 __ br(__ LO, NEXT);
3760
3761 // Maybe we have a lot of data
3762 __ subsw(rscratch1, len, bulk_width * block_size);
3763 __ br(__ HS, CTR_large_block);
3764 __ BIND(large_block_return);
3765 __ cbzw(len, DONE);
3766
3767 // Setup the counter
3768 __ movi(v4, __ T4S, 0);
3769 __ movi(v5, __ T4S, 1);
3770 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3771
3772 // 128-bit big-endian increment
3773 __ ld1(v0, __ T16B, counter);
3774 __ rev64(v16, __ T16B, v0);
3775 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3776 __ rev64(v16, __ T16B, v16);
3777 __ st1(v16, __ T16B, counter);
3778 // Previous counter value is in v0
3779 // v4 contains { 0, 1 }
3780
3781 {
3782 // We have fewer than bulk_width blocks of data left. Encrypt
3783 // them one by one until there is less than a full block
3784 // remaining, being careful to save both the encrypted counter
3785 // and the counter.
3786
3787 Label inner_loop;
3788 __ bind(inner_loop);
3789 // Counter to encrypt is in v0
3790 __ aesecb_encrypt(noreg, noreg, keylen);
3791 __ st1(v0, __ T16B, saved_encrypted_ctr);
3792
3793 // Do we have a remaining full block?
3794
3795 __ mov(used, 0);
3796 __ cmp(len, block_size);
3797 __ br(__ LO, NEXT);
3798
3799 // Yes, we have a full block
3800 __ ldrq(v1, Address(in, offset));
3801 __ eor(v1, __ T16B, v1, v0);
3802 __ strq(v1, Address(out, offset));
3803 __ mov(used, block_size);
3804 __ add(offset, offset, block_size);
3805
3806 __ subw(len, len, block_size);
3807 __ cbzw(len, DONE);
3808
3809 // Increment the counter, store it back
3810 __ orr(v0, __ T16B, v16, v16);
3811 __ rev64(v16, __ T16B, v16);
3812 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3813 __ rev64(v16, __ T16B, v16);
3814 __ st1(v16, __ T16B, counter); // Save the incremented counter back
3815
3816 __ b(inner_loop);
3817 }
3818
3819 __ BIND(NEXT);
3820
3821 // Encrypt a single byte, and loop.
3822 // We expect this to be a rare event.
3823 __ ldrb(rscratch1, Address(in, offset));
3824 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3825 __ eor(rscratch1, rscratch1, rscratch2);
3826 __ strb(rscratch1, Address(out, offset));
3827 __ add(offset, offset, 1);
3828 __ add(used, used, 1);
3829 __ subw(len, len, 1);
3830 __ cbnzw(len, L_CTR_loop);
3831 }
3832
3833 __ bind(DONE);
3834 __ strw(used, Address(used_ptr));
3835 __ mov(r0, saved_len);
3836
3837 __ leave(); // required for proper stackwalking of RuntimeStub frame
3838 __ ret(lr);
3839
3840 // Bulk encryption
3841
3842 __ BIND(CTR_large_block);
3843 assert(bulk_width == 4 || bulk_width == 8, "must be");
3844
3845 if (bulk_width == 8) {
3846 __ sub(sp, sp, 4 * 16);
3847 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3848 }
3849 __ sub(sp, sp, 4 * 16);
3850 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3851 RegSet saved_regs = (RegSet::of(in, out, offset)
3852 + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3853 __ push(saved_regs, sp);
3854 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption
3855 __ add(in, in, offset);
3856 __ add(out, out, offset);
3857
3858 // Keys should already be loaded into the correct registers
3859
3860 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3861 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3862
3863 // AES/CTR loop
3864 {
3865 Label L_CTR_loop;
3866 __ BIND(L_CTR_loop);
3867
3868 // Setup the counters
3869 __ movi(v8, __ T4S, 0);
3870 __ movi(v9, __ T4S, 1);
3871 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3872
3873 for (int i = 0; i < bulk_width; i++) {
3874 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3875 __ rev64(v0_ofs, __ T16B, v16);
3876 be_add_128_64(v16, v16, v8, /*tmp*/v9);
3877 }
3878
3879 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3880
3881 // Encrypt the counters
3882 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3883
3884 if (bulk_width == 8) {
3885 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3886 }
3887
3888 // XOR the encrypted counters with the inputs
3889 for (int i = 0; i < bulk_width; i++) {
3890 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3891 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3892 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3893 }
3894
3895 // Write the encrypted data
3896 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3897 if (bulk_width == 8) {
3898 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3899 }
3900
3901 __ subw(len, len, 16 * bulk_width);
3902 __ cbnzw(len, L_CTR_loop);
3903 }
3904
3905 // Save the counter back where it goes
3906 __ rev64(v16, __ T16B, v16);
3907 __ st1(v16, __ T16B, counter);
3908
3909 __ pop(saved_regs, sp);
3910
3911 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3912 if (bulk_width == 8) {
3913 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3914 }
3915
3916 __ andr(rscratch1, len, -16 * bulk_width);
3917 __ sub(len, len, rscratch1);
3918 __ add(offset, offset, rscratch1);
3919 __ mov(used, 16);
3920 __ strw(used, Address(used_ptr));
3921 __ b(large_block_return);
3922
3923 // record the stub entry and end
3924 store_archive_data(stub_id, start, __ pc());
3925
3926 return start;
3927 }
3928
3929 // Vector AES Galois Counter Mode implementation. Parameters:
3930 //
3931 // in = c_rarg0
3932 // len = c_rarg1
3933 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3934 // out = c_rarg3
3935 // key = c_rarg4
3936 // state = c_rarg5 - GHASH.state
3937 // subkeyHtbl = c_rarg6 - powers of H
3938 // counter = c_rarg7 - 16 bytes of CTR
3939 // return - number of processed bytes
3940 address generate_galoisCounterMode_AESCrypt() {
3941 Label ghash_polynomial; // local data generated after code
3942 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
3943 int entry_count = StubInfo::entry_count(stub_id);
3944 assert(entry_count == 1, "sanity check");
3945 address start = load_archive_data(stub_id);
3946 if (start != nullptr) {
3947 return start;
3948 }
3949 __ align(CodeEntryAlignment);
3950 StubCodeMark mark(this, stub_id);
3951 start = __ pc();
3952 __ enter();
3953
3954 const Register in = c_rarg0;
3955 const Register len = c_rarg1;
3956 const Register ct = c_rarg2;
3957 const Register out = c_rarg3;
3959
3960 const Register key = c_rarg4;
3961 const Register state = c_rarg5;
3962
3963 const Register subkeyHtbl = c_rarg6;
3964
3965 const Register counter = c_rarg7; // read at entry and updated with the incremented counter in the end
3966
3967 const Register keylen = r10;
3968 // Save state before entering routine
3969 __ sub(sp, sp, 4 * 16);
3970 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3971 __ sub(sp, sp, 4 * 16);
3972 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3973
3974 // __ andr(len, len, -512);
3975 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
3976 __ str(len, __ pre(sp, -2 * wordSize));
3977
3978 Label DONE;
3979 __ cbz(len, DONE);
3980
3981 // Compute #rounds for AES based on the length of the key array
3982 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3983
3984 __ aesenc_loadkeys(key, keylen);
3985 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3986 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
3987
3988 // AES/CTR loop
3989 {
3990 Label L_CTR_loop;
3991 __ BIND(L_CTR_loop);
3992
3993 // Setup the counters
3994 __ movi(v8, __ T4S, 0);
3995 __ movi(v9, __ T4S, 1);
3996 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
3997
3998 assert(v0->encoding() < v8->encoding(), "");
3999 for (int i = v0->encoding(); i < v8->encoding(); i++) {
4000 FloatRegister f = as_FloatRegister(i);
4001 __ rev32(f, __ T16B, v16);
4002 __ addv(v16, __ T4S, v16, v8);
4003 }
4004
4005 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
4006
4007 // Encrypt the counters
4008 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
4009
4010 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
4011
4012 // XOR the encrypted counters with the inputs
4013 for (int i = 0; i < 8; i++) {
4014 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
4015 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
4016 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
4017 }
4018 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
4019 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
4020
4021 __ subw(len, len, 16 * 8);
4022 __ cbnzw(len, L_CTR_loop);
4023 }
4024
4025 __ rev32(v16, __ T16B, v16);
4026 __ st1(v16, __ T16B, counter);
4027
4028 __ ldr(len, Address(sp));
4029 __ lsr(len, len, exact_log2(16)); // We want the count of blocks
4030
4031 // GHASH/CTR loop
4032 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
4033 len, /*unrolls*/4);
4034
4035 #ifdef ASSERT
4036 { Label L;
4037 __ cmp(len, (unsigned char)0);
4038 __ br(Assembler::EQ, L);
4039 __ stop("stubGenerator: abort");
4040 __ bind(L);
4041 }
4042 #endif
4043
4044 __ bind(DONE);
4045 // Return the number of bytes processed
4046 __ ldr(r0, __ post(sp, 2 * wordSize));
4047
4048 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
4049 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
4050
4051 __ leave(); // required for proper stackwalking of RuntimeStub frame
4052 __ ret(lr);
4053
4054 // bind label and generate polynomial data
4055 __ align(wordSize * 2);
4056 __ bind(ghash_polynomial);
4057 __ emit_int64(0x87); // The low-order bits of the field
4058 // polynomial (i.e. p = z^7+z^2+z+1)
4059 // repeated in the low and high parts of a
4060 // 128-bit vector
4061 __ emit_int64(0x87);
4062
4063 // record the stub entry and end
4064 store_archive_data(stub_id, start, __ pc());
4065
4066 return start;
4067 }
4068
4069 class Cached64Bytes {
4070 private:
4071 MacroAssembler *_masm;
4072 Register _regs[8];
4073
4074 public:
4075 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
4076 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte words", rs.size());
4077 auto it = rs.begin();
4078 for (auto &r: _regs) {
4079 r = *it;
4080 ++it;
4081 }
4082 }
4083
4084 void gen_loads(Register base) {
4085 for (int i = 0; i < 8; i += 2) {
4086 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
4087 }
4088 }
4089
4090 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
4091 void extract_u32(Register dest, int i) {
4092 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
4093 }
4094 };
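// Usage sketch (illustrative): gen_loads(buf) emits four ldp instructions
// that pull 64 bytes into the 8 cached registers; extract_u32(dest, i) then
// emits a single ubfx from _regs[i / 2] at bit offset 32 * (i % 2), so
// e.g. extract_u32(rscratch1, 5) extracts bits 32..63 of _regs[2].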
4095
4096 // Utility routines for md5.
4097 // Clobbers r10 and r11.
4098 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4099 int k, int s, int t) {
4100 Register rscratch3 = r10;
4101 Register rscratch4 = r11;
4102
4103 __ eorw(rscratch3, r3, r4);
4104 __ movw(rscratch2, t);
4105 __ andw(rscratch3, rscratch3, r2);
4106 __ addw(rscratch4, r1, rscratch2);
4107 reg_cache.extract_u32(rscratch1, k);
4108 __ eorw(rscratch3, rscratch3, r4);
4109 __ addw(rscratch4, rscratch4, rscratch1);
4110 __ addw(rscratch3, rscratch3, rscratch4);
4111 __ rorw(rscratch2, rscratch3, 32 - s);
4112 __ addw(r1, rscratch2, r2);
4113 }
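// The sequence above computes one round of MD5's first group:
//   r1 = r2 + rol(r1 + F(r2, r3, r4) + x[k] + t, s)
// where F(b, c, d) = (b & c) | (~b & d), evaluated here in the
// equivalent form ((c ^ d) & b) ^ d.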
4114
4115 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4116 int k, int s, int t) {
4117 Register rscratch3 = r10;
4118 Register rscratch4 = r11;
4119
4120 reg_cache.extract_u32(rscratch1, k);
4121 __ movw(rscratch2, t);
4122 __ addw(rscratch4, r1, rscratch2);
4123 __ addw(rscratch4, rscratch4, rscratch1);
4124 __ bicw(rscratch2, r3, r4);
4125 __ andw(rscratch3, r2, r4);
4126 __ addw(rscratch2, rscratch2, rscratch4);
4127 __ addw(rscratch2, rscratch2, rscratch3);
4128 __ rorw(rscratch2, rscratch2, 32 - s);
4129 __ addw(r1, rscratch2, r2);
4130 }
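// One round of the second group:
//   r1 = r2 + rol(r1 + G(r2, r3, r4) + x[k] + t, s)
// where G(b, c, d) = (b & d) | (c & ~d). The two masked terms cover
// disjoint bit positions (d set vs. d clear), so they are combined with
// adds rather than an orr.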
4131
4132 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4133 int k, int s, int t) {
4134 Register rscratch3 = r10;
4135 Register rscratch4 = r11;
4136
4137 __ eorw(rscratch3, r3, r4);
4138 __ movw(rscratch2, t);
4139 __ addw(rscratch4, r1, rscratch2);
4140 reg_cache.extract_u32(rscratch1, k);
4141 __ eorw(rscratch3, rscratch3, r2);
4142 __ addw(rscratch4, rscratch4, rscratch1);
4143 __ addw(rscratch3, rscratch3, rscratch4);
4144 __ rorw(rscratch2, rscratch3, 32 - s);
4145 __ addw(r1, rscratch2, r2);
4146 }
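// One round of the third group:
//   r1 = r2 + rol(r1 + H(r2, r3, r4) + x[k] + t, s)
// where H(b, c, d) = b ^ c ^ d, computed above as (c ^ d) ^ b.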
4147
4148 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4149 int k, int s, int t) {
4150 Register rscratch3 = r10;
4151 Register rscratch4 = r11;
4152
4153 __ movw(rscratch3, t);
4154 __ ornw(rscratch2, r2, r4);
4155 __ addw(rscratch4, r1, rscratch3);
4156 reg_cache.extract_u32(rscratch1, k);
4157 __ eorw(rscratch3, rscratch2, r3);
4158 __ addw(rscratch4, rscratch4, rscratch1);
4159 __ addw(rscratch3, rscratch3, rscratch4);
4160 __ rorw(rscratch2, rscratch3, 32 - s);
4161 __ addw(r1, rscratch2, r2);
4162 }
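// One round of the fourth group:
//   r1 = r2 + rol(r1 + I(r2, r3, r4) + x[k] + t, s)
// where I(b, c, d) = c ^ (b | ~d), computed above with ornw (b | ~d)
// followed by an eorw with c.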
4163
4164 // Arguments:
4165 //
4166 // Inputs:
4167 // c_rarg0 - byte[] source+offset
4168 // c_rarg1 - int[] MD5.state
4169 // c_rarg2 - int offset
4170 // c_rarg3 - int limit
4171 //
4172 address generate_md5_implCompress(StubId stub_id) {
4173 bool multi_block;
4174 switch (stub_id) {
4175 case StubId::stubgen_md5_implCompress_id:
4176 multi_block = false;
4177 break;
4178 case StubId::stubgen_md5_implCompressMB_id:
4179 multi_block = true;
4180 break;
4181 default:
4182 ShouldNotReachHere();
4183 }
4184 int entry_count = StubInfo::entry_count(stub_id);
4185 assert(entry_count == 1, "sanity check");
4186 address start = load_archive_data(stub_id);
4187 if (start != nullptr) {
4188 return start;
4189 }
4190 __ align(CodeEntryAlignment);
4191
4192 StubCodeMark mark(this, stub_id);
4193 start = __ pc();
4194
4195 Register buf = c_rarg0;
4196 Register state = c_rarg1;
4197 Register ofs = c_rarg2;
4198 Register limit = c_rarg3;
4199 Register a = r4;
4200 Register b = r5;
4201 Register c = r6;
4202 Register d = r7;
4203 Register rscratch3 = r10;
4204 Register rscratch4 = r11;
4205
4206 Register state_regs[2] = { r12, r13 };
4207 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
4208 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
4209
4210 __ push(saved_regs, sp);
4211
4212 __ ldp(state_regs[0], state_regs[1], Address(state));
4213 __ ubfx(a, state_regs[0], 0, 32);
4214 __ ubfx(b, state_regs[0], 32, 32);
4215 __ ubfx(c, state_regs[1], 0, 32);
4216 __ ubfx(d, state_regs[1], 32, 32);
4217
4218 Label md5_loop;
4219 __ BIND(md5_loop);
4220
4221 reg_cache.gen_loads(buf);
4222
4223 // Round 1
4224 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
4225 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
4226 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
4227 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
4228 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
4229 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
4230 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
4231 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
4232 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
4233 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
4234 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
4235 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
4236 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
4237 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
4238 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
4239 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
4240
4241 // Round 2
4242 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
4243 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
4244 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
4245 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
4246 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
4247 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
4248 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
4249 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
4250 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
4251 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
4252 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
4253 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
4254 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
4255 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
4256 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
4257 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
4258
4259 // Round 3
4260 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
4261 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
4262 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
4263 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
4264 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
4265 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
4266 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
4267 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
4268 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
4269 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
4270 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
4271 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
4272 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
4273 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
4274 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
4275 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
4276
4277 // Round 4
4278 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
4279 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
4280 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
4281 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
4282 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
4283 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
4284 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
4285 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
4286 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
4287 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
4288 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
4289 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
4290 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
4291 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
4292 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
4293 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
4294
4295 __ addw(a, state_regs[0], a);
4296 __ ubfx(rscratch2, state_regs[0], 32, 32);
4297 __ addw(b, rscratch2, b);
4298 __ addw(c, state_regs[1], c);
4299 __ ubfx(rscratch4, state_regs[1], 32, 32);
4300 __ addw(d, rscratch4, d);
4301
4302 __ orr(state_regs[0], a, b, Assembler::LSL, 32);
4303 __ orr(state_regs[1], c, d, Assembler::LSL, 32);
4304
4305 if (multi_block) {
4306 __ add(buf, buf, 64);
4307 __ add(ofs, ofs, 64);
4308 __ cmp(ofs, limit);
4309 __ br(Assembler::LE, md5_loop);
4310 __ mov(c_rarg0, ofs); // return ofs
4311 }
4312
4313 // write hash values back in the correct order
4314 __ stp(state_regs[0], state_regs[1], Address(state));
4315
4316 __ pop(saved_regs, sp);
4317
4318 __ ret(lr);
4319
4320 // record the stub entry and end
4321 store_archive_data(stub_id, start, __ pc());
4322
4323 return start;
4324 }
4325
4326 // Arguments:
4327 //
4328 // Inputs:
4329 // c_rarg0 - byte[] source+offset
4330 // c_rarg1 - int[] SHA.state
4331 // c_rarg2 - int offset
4332 // c_rarg3 - int limit
4333 //
4334 address generate_sha1_implCompress(StubId stub_id) {
4335 bool multi_block;
4336 switch (stub_id) {
4337 case StubId::stubgen_sha1_implCompress_id:
4338 multi_block = false;
4339 break;
4340 case StubId::stubgen_sha1_implCompressMB_id:
4341 multi_block = true;
4342 break;
4343 default:
4344 ShouldNotReachHere();
4345 }
4346 int entry_count = StubInfo::entry_count(stub_id);
4347 assert(entry_count == 1, "sanity check");
4348 address start = load_archive_data(stub_id);
4349 if (start != nullptr) {
4350 return start;
4351 }
4352 __ align(CodeEntryAlignment);
4353
4354 StubCodeMark mark(this, stub_id);
4355 start = __ pc();
4356
4357 Register buf = c_rarg0;
4358 Register state = c_rarg1;
4359 Register ofs = c_rarg2;
4360 Register limit = c_rarg3;
4361
4362 Label keys;
4363 Label sha1_loop;
4364
4365 // load the keys into v0..v3
4366 __ adr(rscratch1, keys);
4367 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
4368 // load 5 words state into v6, v7
4369 __ ldrq(v6, Address(state, 0));
4370 __ ldrs(v7, Address(state, 16));
4371
4372
4373 __ BIND(sha1_loop);
4374 // load 64 bytes of data into v16..v19
4375 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
4376 __ rev32(v16, __ T16B, v16);
4377 __ rev32(v17, __ T16B, v17);
4378 __ rev32(v18, __ T16B, v18);
4379 __ rev32(v19, __ T16B, v19);
4380
4381 // do the sha1
4382 __ addv(v4, __ T4S, v16, v0);
4383 __ orr(v20, __ T16B, v6, v6);
4384
4385 FloatRegister d0 = v16;
4386 FloatRegister d1 = v17;
4387 FloatRegister d2 = v18;
4388 FloatRegister d3 = v19;
4389
4390 for (int round = 0; round < 20; round++) {
4391 FloatRegister tmp1 = (round & 1) ? v4 : v5;
4392 FloatRegister tmp2 = (round & 1) ? v21 : v22;
4393 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
4394 FloatRegister tmp4 = (round & 1) ? v5 : v4;
4395 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
4396
4397 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
4398 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
4399 __ sha1h(tmp2, __ T4S, v20);
4400 if (round < 5)
4401 __ sha1c(v20, __ T4S, tmp3, tmp4);
4402 else if (round < 10 || round >= 15)
4403 __ sha1p(v20, __ T4S, tmp3, tmp4);
4404 else
4405 __ sha1m(v20, __ T4S, tmp3, tmp4);
4406 if (round < 16) __ sha1su1(d0, __ T4S, d3);
4407
4408 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4409 }
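// Each iteration of the loop above retires four of SHA-1's 80 rounds:
// sha1c, sha1p and sha1m apply the Ch, Parity and Maj round functions
// for the corresponding quarters while sha1su0/sha1su1 extend the
// 16-word message schedule, so 20 iterations consume one 64-byte block.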
4410
4411 __ addv(v7, __ T2S, v7, v21);
4412 __ addv(v6, __ T4S, v6, v20);
4413
4414 if (multi_block) {
4415 __ add(ofs, ofs, 64);
4416 __ cmp(ofs, limit);
4417 __ br(Assembler::LE, sha1_loop);
4418 __ mov(c_rarg0, ofs); // return ofs
4419 }
4420
4421 __ strq(v6, Address(state, 0));
4422 __ strs(v7, Address(state, 16));
4423
4424 __ ret(lr);
4425
4426 __ bind(keys);
4427 __ emit_int32(0x5a827999);
4428 __ emit_int32(0x6ed9eba1);
4429 __ emit_int32(0x8f1bbcdc);
4430 __ emit_int32(0xca62c1d6);
4431
4432 // record the stub entry and end
4433 store_archive_data(stub_id, start, __ pc());
4434
4435 return start;
4436 }
4437
4438
4439 // Arguments:
4440 //
4441 // Inputs:
4442 // c_rarg0 - byte[] source+offset
4443 // c_rarg1 - int[] SHA.state
4444 // c_rarg2 - int offset
4445 // c_rarg3 - int limit
4446 //
4447 address generate_sha256_implCompress(StubId stub_id) {
4448 bool multi_block;
4449 switch (stub_id) {
4450 case StubId::stubgen_sha256_implCompress_id:
4451 multi_block = false;
4452 break;
4453 case StubId::stubgen_sha256_implCompressMB_id:
4454 multi_block = true;
4455 break;
4456 default:
4457 ShouldNotReachHere();
4458 }
4459 int entry_count = StubInfo::entry_count(stub_id);
4460 assert(entry_count == 1, "sanity check");
4461 address start = load_archive_data(stub_id);
4462 if (start != nullptr) {
4463 return start;
4464 }
4465 __ align(CodeEntryAlignment);
4466 StubCodeMark mark(this, stub_id);
4467 start = __ pc();
4468
4469 Register buf = c_rarg0;
4470 Register state = c_rarg1;
4471 Register ofs = c_rarg2;
4472 Register limit = c_rarg3;
4473
4474 Label sha1_loop;
4475
4476 __ stpd(v8, v9, __ pre(sp, -32));
4477 __ stpd(v10, v11, Address(sp, 16));
4478
4479 // dga == v0
4480 // dgb == v1
4481 // dg0 == v2
4482 // dg1 == v3
4483 // dg2 == v4
4484 // t0 == v6
4485 // t1 == v7
4486
4487 // load 16 keys to v16..v31
4488 __ lea(rscratch1, ExternalAddress((address)_sha256_round_consts));
4489 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
4490 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
4491 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
4492 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
4493
4494 // load 8 words (256 bits) state
4495 __ ldpq(v0, v1, state);
4496
4497 __ BIND(sha1_loop);
4498 // load 64 bytes of data into v8..v11
4499 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
4500 __ rev32(v8, __ T16B, v8);
4501 __ rev32(v9, __ T16B, v9);
4502 __ rev32(v10, __ T16B, v10);
4503 __ rev32(v11, __ T16B, v11);
4504
4505 __ addv(v6, __ T4S, v8, v16);
4506 __ orr(v2, __ T16B, v0, v0);
4507 __ orr(v3, __ T16B, v1, v1);
4508
4509 FloatRegister d0 = v8;
4510 FloatRegister d1 = v9;
4511 FloatRegister d2 = v10;
4512 FloatRegister d3 = v11;
4513
4514
4515 for (int round = 0; round < 16; round++) {
4516 FloatRegister tmp1 = (round & 1) ? v6 : v7;
4517 FloatRegister tmp2 = (round & 1) ? v7 : v6;
4518 FloatRegister tmp3 = (round & 1) ? v2 : v4;
4519 FloatRegister tmp4 = (round & 1) ? v4 : v2;
4520
4521 if (round < 12) __ sha256su0(d0, __ T4S, d1);
4522 __ orr(v4, __ T16B, v2, v2);
4523 if (round < 15)
4524 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
4525 __ sha256h(v2, __ T4S, v3, tmp2);
4526 __ sha256h2(v3, __ T4S, v4, tmp2);
4527 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
4528
4529 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4530 }
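// Each iteration of the loop above performs four of SHA-256's 64 rounds
// via the sha256h/sha256h2 pair, with sha256su0/sha256su1 extending the
// message schedule for the later rounds, so 16 iterations consume one
// 64-byte block.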
4531
4532 __ addv(v0, __ T4S, v0, v2);
4533 __ addv(v1, __ T4S, v1, v3);
4534
4535 if (multi_block) {
4536 __ add(ofs, ofs, 64);
4537 __ cmp(ofs, limit);
4538 __ br(Assembler::LE, sha1_loop);
4539 __ mov(c_rarg0, ofs); // return ofs
4540 }
4541
4542 __ ldpd(v10, v11, Address(sp, 16));
4543 __ ldpd(v8, v9, __ post(sp, 32));
4544
4545 __ stpq(v0, v1, state);
4546
4547 __ ret(lr);
4548
4549 // record the stub entry and end
4550 store_archive_data(stub_id, start, __ pc());
4551
4552 return start;
4553 }
4554
4555 // Double rounds for sha512.
4556 void sha512_dround(int dr,
4557 FloatRegister vi0, FloatRegister vi1,
4558 FloatRegister vi2, FloatRegister vi3,
4559 FloatRegister vi4, FloatRegister vrc0,
4560 FloatRegister vrc1, FloatRegister vin0,
4561 FloatRegister vin1, FloatRegister vin2,
4562 FloatRegister vin3, FloatRegister vin4) {
4563 if (dr < 36) {
4564 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
4565 }
4566 __ addv(v5, __ T2D, vrc0, vin0);
4567 __ ext(v6, __ T16B, vi2, vi3, 8);
4568 __ ext(v5, __ T16B, v5, v5, 8);
4569 __ ext(v7, __ T16B, vi1, vi2, 8);
4570 __ addv(vi3, __ T2D, vi3, v5);
4571 if (dr < 32) {
4572 __ ext(v5, __ T16B, vin3, vin4, 8);
4573 __ sha512su0(vin0, __ T2D, vin1);
4574 }
4575 __ sha512h(vi3, __ T2D, v6, v7);
4576 if (dr < 32) {
4577 __ sha512su1(vin0, __ T2D, vin2, v5);
4578 }
4579 __ addv(vi4, __ T2D, vi1, vi3);
4580 __ sha512h2(vi3, __ T2D, vi1, vi0);
4581 }
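// Each call to sha512_dround retires two of SHA-512's 80 rounds: v5 holds
// the round-constant-plus-message-word sum consumed by sha512h/sha512h2,
// sha512su0/sha512su1 extend the message schedule while dr < 32, and the
// next constant pair is prefetched into vrc1 while dr < 36. The 40 calls
// in generate_sha512_implCompress below rotate the state and schedule
// vectors through the argument positions.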
4582
4583 // Arguments:
4584 //
4585 // Inputs:
4586 // c_rarg0 - byte[] source+offset
4587 // c_rarg1 - long[] SHA.state
4588 // c_rarg2 - int offset
4589 // c_rarg3 - int limit
4590 //
4591 address generate_sha512_implCompress(StubId stub_id) {
4592 bool multi_block;
4593 switch (stub_id) {
4594 case StubId::stubgen_sha512_implCompress_id:
4595 multi_block = false;
4596 break;
4597 case StubId::stubgen_sha512_implCompressMB_id:
4598 multi_block = true;
4599 break;
4600 default:
4601 ShouldNotReachHere();
4602 }
4603 int entry_count = StubInfo::entry_count(stub_id);
4604 assert(entry_count == 1, "sanity check");
4605 address start = load_archive_data(stub_id);
4606 if (start != nullptr) {
4607 return start;
4608 }
4609 __ align(CodeEntryAlignment);
4610 StubCodeMark mark(this, stub_id);
4611 start = __ pc();
4612
4613 Register buf = c_rarg0;
4614 Register state = c_rarg1;
4615 Register ofs = c_rarg2;
4616 Register limit = c_rarg3;
4617
4618 __ stpd(v8, v9, __ pre(sp, -64));
4619 __ stpd(v10, v11, Address(sp, 16));
4620 __ stpd(v12, v13, Address(sp, 32));
4621 __ stpd(v14, v15, Address(sp, 48));
4622
4623 Label sha512_loop;
4624
4625 // load state
4626 __ ld1(v8, v9, v10, v11, __ T2D, state);
4627
4628 // load first 4 round constants
4629 __ lea(rscratch1, ExternalAddress((address)_sha512_round_consts));
4630 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
4631
4632 __ BIND(sha512_loop);
4633 // load 128B of data into v12..v19
4634 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
4635 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
4636 __ rev64(v12, __ T16B, v12);
4637 __ rev64(v13, __ T16B, v13);
4638 __ rev64(v14, __ T16B, v14);
4639 __ rev64(v15, __ T16B, v15);
4640 __ rev64(v16, __ T16B, v16);
4641 __ rev64(v17, __ T16B, v17);
4642 __ rev64(v18, __ T16B, v18);
4643 __ rev64(v19, __ T16B, v19);
4644
4645 __ mov(rscratch2, rscratch1);
4646
4647 __ mov(v0, __ T16B, v8);
4648 __ mov(v1, __ T16B, v9);
4649 __ mov(v2, __ T16B, v10);
4650 __ mov(v3, __ T16B, v11);
4651
4652 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4653 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4654 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4655 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4656 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4657 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4658 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4659 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4660 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4661 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4662 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4663 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4664 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4665 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4666 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4667 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4668 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4669 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4670 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4671 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4672 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4673 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4674 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4675 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4676 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4677 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4678 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4679 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4680 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4681 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4682 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4683 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4684 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
4685 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4686 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4687 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4688 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4689 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4690 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4691 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4692
4693 __ addv(v8, __ T2D, v8, v0);
4694 __ addv(v9, __ T2D, v9, v1);
4695 __ addv(v10, __ T2D, v10, v2);
4696 __ addv(v11, __ T2D, v11, v3);
4697
4698 if (multi_block) {
4699 __ add(ofs, ofs, 128);
4700 __ cmp(ofs, limit);
4701 __ br(Assembler::LE, sha512_loop);
4702 __ mov(c_rarg0, ofs); // return ofs
4703 }
4704
4705 __ st1(v8, v9, v10, v11, __ T2D, state);
4706
4707 __ ldpd(v14, v15, Address(sp, 48));
4708 __ ldpd(v12, v13, Address(sp, 32));
4709 __ ldpd(v10, v11, Address(sp, 16));
4710 __ ldpd(v8, v9, __ post(sp, 64));
4711
4712 __ ret(lr);
4713
4714 // record the stub entry and end
4715 store_archive_data(stub_id, start, __ pc());
4716
4717 return start;
4718 }
4719
4720 // Execute one round of keccak of two computations in parallel.
4721 // One of the states should be loaded into the lower halves of
4722 // the vector registers v0-v24, the other should be loaded into
4723 // the upper halves of those registers. The ld1r instruction loads
4724 // the round constant into both halves of register v31.
4725   // Intermediate results c0...c4 and d0...d4 are computed
4726   // in registers v25...v30.
4727 // All vector instructions that are used operate on both register
4728 // halves in parallel.
4729   // If only a single computation is needed, one can load only the lower halves.
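  //
  // For reference, one round of Keccak-f[1600] on the 5x5 array of
  // 64-bit lanes a[x][y] (flattened here to a0..a24, index x + 5*y)
  // computes, in scalar terms (illustrative sketch only):
  //
  //   theta:  c[x] = a[x][0] ^ a[x][1] ^ a[x][2] ^ a[x][3] ^ a[x][4]
  //           d[x] = c[(x+4)%5] ^ rol64(c[(x+1)%5], 1)
  //           a[x][y] ^= d[x]
  //   rho/pi: b[y][(2*x+3*y)%5] = rol64(a[x][y], r[x][y])
  //   chi:    a[x][y] = b[x][y] ^ (~b[(x+1)%5][y] & b[(x+2)%5][y])
  //   iota:   a[0][0] ^= round_constant[i]
  //
  // The eor3/rax1/xar/bcax groups below fuse these steps, processing
  // the corresponding lane of both packed states at once.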
4730 void keccak_round(Register rscratch1) {
4731 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4732     __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a6 ^ a11
4733     __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^ a13
4734 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4735 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4736 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4737 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4738 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4739 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4740 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4741
4742 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4743 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4744 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4745 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4746 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4747
4748 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4749 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4750     __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol((a6^d1), 44)
4751 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4752 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4753 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4754 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4755 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4756 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4757 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4758 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4759 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4760 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4761 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4762 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4763 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4764 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4765 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4766 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4767 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4768 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4769 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4770 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4771 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4772 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4773
4774     __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21' & a22)
4775 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4776 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4777 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4778 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4779
4780 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4781
4782 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4783     __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15)
4784 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4785 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4786 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
4787
4788 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
4789 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
4790 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
4791 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
4792 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
4793
4794 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
4795 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
4796 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
4797 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7)
4798 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8')
4799
4800 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
4801 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
4802 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
4803 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3)
4804 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4')
4805
4806 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
4807 }
4808
4809 // Arguments:
4810 //
4811 // Inputs:
4812 // c_rarg0 - byte[] source+offset
4813 // c_rarg1 - byte[] SHA.state
4814 // c_rarg2 - int block_size
4815 // c_rarg3 - int offset
4816 // c_rarg4 - int limit
4817 //
4818 address generate_sha3_implCompress(StubId stub_id) {
4819 bool multi_block;
4820 switch (stub_id) {
4821 case StubId::stubgen_sha3_implCompress_id:
4822 multi_block = false;
4823 break;
4824 case StubId::stubgen_sha3_implCompressMB_id:
4825 multi_block = true;
4826 break;
4827 default:
4828 ShouldNotReachHere();
4829 }
4830 int entry_count = StubInfo::entry_count(stub_id);
4831 assert(entry_count == 1, "sanity check");
4832 address start = load_archive_data(stub_id);
4833 if (start != nullptr) {
4834 return start;
4835 }
4836 __ align(CodeEntryAlignment);
4837 StubCodeMark mark(this, stub_id);
4838 start = __ pc();
4839
4840 Register buf = c_rarg0;
4841 Register state = c_rarg1;
4842 Register block_size = c_rarg2;
4843 Register ofs = c_rarg3;
4844 Register limit = c_rarg4;
4845
4846 Label sha3_loop, rounds24_loop;
4847 Label sha3_512_or_sha3_384, shake128;
4848
4849 __ stpd(v8, v9, __ pre(sp, -64));
4850 __ stpd(v10, v11, Address(sp, 16));
4851 __ stpd(v12, v13, Address(sp, 32));
4852 __ stpd(v14, v15, Address(sp, 48));
4853
4854 // load state
4855 __ add(rscratch1, state, 32);
4856 __ ld1(v0, v1, v2, v3, __ T1D, state);
4857 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4858 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4859 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4860 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4861 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4862 __ ld1(v24, __ T1D, rscratch1);
4863
4864 __ BIND(sha3_loop);
4865
4866 // 24 keccak rounds
4867 __ movw(rscratch2, 24);
4868
4869 // load round_constants base
4870 __ lea(rscratch1, ExternalAddress((address) _sha3_round_consts));
4871
4872 // load input
4873 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4874 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4875 __ eor(v0, __ T8B, v0, v25);
4876 __ eor(v1, __ T8B, v1, v26);
4877 __ eor(v2, __ T8B, v2, v27);
4878 __ eor(v3, __ T8B, v3, v28);
4879 __ eor(v4, __ T8B, v4, v29);
4880 __ eor(v5, __ T8B, v5, v30);
4881 __ eor(v6, __ T8B, v6, v31);
4882
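    // For reference, the block sizes (sponge rates in bytes) that can
    // reach this stub and the bit tests used to dispatch on them are:
    //   72  = 0b01001000  SHA3-512           (bit 7 == 0, bit 5 == 0)
    //   104 = 0b01101000  SHA3-384           (bit 7 == 0, bit 5 == 1)
    //   136 = 0b10001000  SHA3-256/SHAKE256  (bit 7 == 1, bits 4..5 == 0)
    //   144 = 0b10010000  SHA3-224           (bit 7 == 1, bit 4 == 1)
    //   168 = 0b10101000  SHAKE128           (bit 7 == 1, bit 5 == 1)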
4883 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4884 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4885
4886 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4887 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4888 __ eor(v7, __ T8B, v7, v25);
4889 __ eor(v8, __ T8B, v8, v26);
4890 __ eor(v9, __ T8B, v9, v27);
4891 __ eor(v10, __ T8B, v10, v28);
4892 __ eor(v11, __ T8B, v11, v29);
4893 __ eor(v12, __ T8B, v12, v30);
4894 __ eor(v13, __ T8B, v13, v31);
4895
4896 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4897 __ eor(v14, __ T8B, v14, v25);
4898 __ eor(v15, __ T8B, v15, v26);
4899 __ eor(v16, __ T8B, v16, v27);
4900
4901 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4902 __ andw(c_rarg5, block_size, 48);
4903 __ cbzw(c_rarg5, rounds24_loop);
4904
4905 __ tbnz(block_size, 5, shake128);
4906 // block_size == 144, bit5 == 0, SHA3-224
4907 __ ldrd(v28, __ post(buf, 8));
4908 __ eor(v17, __ T8B, v17, v28);
4909 __ b(rounds24_loop);
4910
4911 __ BIND(shake128);
4912 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4913 __ eor(v17, __ T8B, v17, v28);
4914 __ eor(v18, __ T8B, v18, v29);
4915 __ eor(v19, __ T8B, v19, v30);
4916 __ eor(v20, __ T8B, v20, v31);
4917 __ b(rounds24_loop); // block_size == 168, SHAKE128
4918
4919 __ BIND(sha3_512_or_sha3_384);
4920 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4921 __ eor(v7, __ T8B, v7, v25);
4922 __ eor(v8, __ T8B, v8, v26);
4923 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4924
4925 // SHA3-384
4926 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4927 __ eor(v9, __ T8B, v9, v27);
4928 __ eor(v10, __ T8B, v10, v28);
4929 __ eor(v11, __ T8B, v11, v29);
4930 __ eor(v12, __ T8B, v12, v30);
4931
4932 __ BIND(rounds24_loop);
4933 __ subw(rscratch2, rscratch2, 1);
4934
4935 keccak_round(rscratch1);
4936
4937 __ cbnzw(rscratch2, rounds24_loop);
4938
4939 if (multi_block) {
4940 __ add(ofs, ofs, block_size);
4941 __ cmp(ofs, limit);
4942 __ br(Assembler::LE, sha3_loop);
4943 __ mov(c_rarg0, ofs); // return ofs
4944 }
4945
4946 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
4947 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4948 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4949 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4950 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4951 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4952 __ st1(v24, __ T1D, state);
4953
4954 // restore callee-saved registers
4955 __ ldpd(v14, v15, Address(sp, 48));
4956 __ ldpd(v12, v13, Address(sp, 32));
4957 __ ldpd(v10, v11, Address(sp, 16));
4958 __ ldpd(v8, v9, __ post(sp, 64));
4959
4960 __ ret(lr);
4961
4962 // record the stub entry and end
4963 store_archive_data(stub_id, start, __ pc());
4964
4965 return start;
4966 }
4967
4968 // Inputs:
4969 // c_rarg0 - long[] state0
4970 // c_rarg1 - long[] state1
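  //
  // The 25 lanes of state0 are loaded into element 0 (the lower halves)
  // of v0-v24 and the lanes of state1 into element 1 (the upper halves),
  // which is the packed layout keccak_round() operates on.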
4971 address generate_double_keccak() {
4972 StubId stub_id = StubId::stubgen_double_keccak_id;
4973 int entry_count = StubInfo::entry_count(stub_id);
4974 assert(entry_count == 1, "sanity check");
4975 address start = load_archive_data(stub_id);
4976 if (start != nullptr) {
4977 return start;
4978 }
4979 // Implements the double_keccak() method of the
4980     // sun.security.provider.SHA3Parallel class
4981 __ align(CodeEntryAlignment);
4982 StubCodeMark mark(this, stub_id);
4983 start = __ pc();
4984 __ enter();
4985
4986 Register state0 = c_rarg0;
4987 Register state1 = c_rarg1;
4988
4989 Label rounds24_loop;
4990
4991 // save callee-saved registers
4992 __ stpd(v8, v9, __ pre(sp, -64));
4993 __ stpd(v10, v11, Address(sp, 16));
4994 __ stpd(v12, v13, Address(sp, 32));
4995 __ stpd(v14, v15, Address(sp, 48));
4996
4997 // load states
4998 __ add(rscratch1, state0, 32);
4999 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
5000 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
5001 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
5002 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
5003 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
5004 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
5005 __ ld1(v24, __ D, 0, rscratch1);
5006 __ add(rscratch1, state1, 32);
5007 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
5008 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
5009 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
5010 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
5011 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
5012 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
5013 __ ld1(v24, __ D, 1, rscratch1);
5014
5015 // 24 keccak rounds
5016 __ movw(rscratch2, 24);
5017
5018 // load round_constants base
5019 __ lea(rscratch1, ExternalAddress((address) _double_keccak_round_consts));
5020
5021 __ BIND(rounds24_loop);
5022 __ subw(rscratch2, rscratch2, 1);
5023 keccak_round(rscratch1);
5024 __ cbnzw(rscratch2, rounds24_loop);
5025
5026 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
5027 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
5028 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
5029 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
5030 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
5031 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
5032 __ st1(v24, __ D, 0, state0);
5033 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
5034 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
5035 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
5036 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
5037 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
5038 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
5039 __ st1(v24, __ D, 1, state1);
5040
5041 // restore callee-saved vector registers
5042 __ ldpd(v14, v15, Address(sp, 48));
5043 __ ldpd(v12, v13, Address(sp, 32));
5044 __ ldpd(v10, v11, Address(sp, 16));
5045 __ ldpd(v8, v9, __ post(sp, 64));
5046
5047 __ leave(); // required for proper stackwalking of RuntimeStub frame
5048 __ mov(r0, zr); // return 0
5049 __ ret(lr);
5050
5051 // record the stub entry and end
5052 store_archive_data(stub_id, start, __ pc());
5053
5054 return start;
5055 }
5056
5057 // ChaCha20 block function. This version parallelizes the 32-bit
5058 // state elements on each of 16 vectors, producing 4 blocks of
5059 // keystream at a time.
5060 //
5061 // state (int[16]) = c_rarg0
5062 // keystream (byte[256]) = c_rarg1
5063 // return - number of bytes of produced keystream (always 256)
5064 //
5065 // This implementation takes each 32-bit integer from the state
5066 // array and broadcasts it across all 4 32-bit lanes of a vector register
5067 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
5068 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
5069 // the quarter round schedule is implemented as outlined in RFC 7539 section
5070 // 2.3. However, instead of sequentially processing the 3 quarter round
5071 // operations represented by one QUARTERROUND function, we instead stack all
5072 // the adds, xors and left-rotations from the first 4 quarter rounds together
5073 // and then do the same for the second set of 4 quarter rounds. This removes
5074 // some latency that would otherwise be incurred by waiting for an add to
5075 // complete before performing an xor (which depends on the result of the
5076 // add), etc. An adjustment happens between the first and second groups of 4
5077 // quarter rounds, but this is done only in the inputs to the macro functions
5078 // that generate the assembly instructions - these adjustments themselves are
5079 // not part of the resulting assembly.
5080 // The 4 registers v0-v3 are used during the quarter round operations as
5081 // scratch registers. Once the 20 rounds are complete, these 4 scratch
5082 // registers become the vectors involved in adding the start state back onto
5083 // the post-QR working state. After the adds are complete, each of the 16
5084 // vectors write their first lane back to the keystream buffer, followed
5085 // by the second lane from all vectors and so on.
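  //
  // For reference, a single ChaCha20 quarter round on state words
  // (a, b, c, d) is (RFC 7539 section 2.1, scalar sketch):
  //
  //   a += b;  d ^= a;  d = rol32(d, 16);
  //   c += d;  b ^= c;  b = rol32(b, 12);
  //   a += b;  d ^= a;  d = rol32(d, 8);
  //   c += d;  b ^= c;  b = rol32(b, 7);
  //
  // Each cc20_qr_* bundle below performs one of these steps for four
  // quarter rounds at once, one quarter round per vector register.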
5086 address generate_chacha20Block_blockpar() {
5087 StubId stub_id = StubId::stubgen_chacha20Block_id;
5088 int entry_count = StubInfo::entry_count(stub_id);
5089 assert(entry_count == 1, "sanity check");
5090 address start = load_archive_data(stub_id);
5091 if (start != nullptr) {
5092 return start;
5093 }
5094 Label L_twoRounds, L_cc20_const;
5095 __ align(CodeEntryAlignment);
5096 StubCodeMark mark(this, stub_id);
5097 start = __ pc();
5098 __ enter();
5099
5100 int i, j;
5101 const Register state = c_rarg0;
5102 const Register keystream = c_rarg1;
5103 const Register loopCtr = r10;
5104 const Register tmpAddr = r11;
5105 const FloatRegister ctrAddOverlay = v28;
5106 const FloatRegister lrot8Tbl = v29;
5107
5108 // Organize SIMD registers in an array that facilitates
5109 // putting repetitive opcodes into loop structures. It is
5110 // important that each grouping of 4 registers is monotonically
5111 // increasing to support the requirements of multi-register
5112 // instructions (e.g. ld4r, st4, etc.)
5113 const FloatRegister workSt[16] = {
5114 v4, v5, v6, v7, v16, v17, v18, v19,
5115 v20, v21, v22, v23, v24, v25, v26, v27
5116 };
5117
5118 // Pull in constant data. The first 16 bytes are the add overlay
5119 // which is applied to the vector holding the counter (state[12]).
5120     // The second 16 bytes are the index table used by the tbl
5121     // instruction to perform 8-bit left rotations.
5122 __ adr(tmpAddr, L_cc20_const);
5123 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
5124
5125 // Load from memory and interlace across 16 SIMD registers,
5126     // with each word from memory being broadcast to all lanes of
5127 // each successive SIMD register.
5128 // Addr(0) -> All lanes in workSt[i]
5129 // Addr(4) -> All lanes workSt[i + 1], etc.
5130 __ mov(tmpAddr, state);
5131 for (i = 0; i < 16; i += 4) {
5132 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
5133 __ post(tmpAddr, 16));
5134 }
5135 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
5136
5137 // Before entering the loop, create 5 4-register arrays. These
5138 // will hold the 4 registers that represent the a/b/c/d fields
5139 // in the quarter round operation. For instance the "b" field
5140 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
5141 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
5142 // since it is part of a diagonal organization. The aSet and scratch
5143 // register sets are defined at declaration time because they do not change
5144 // organization at any point during the 20-round processing.
5145 FloatRegister aSet[4] = { v4, v5, v6, v7 };
5146 FloatRegister bSet[4];
5147 FloatRegister cSet[4];
5148 FloatRegister dSet[4];
5149 FloatRegister scratch[4] = { v0, v1, v2, v3 };
5150
5151 // Set up the 10 iteration loop and perform all 8 quarter round ops
5152 __ mov(loopCtr, 10);
5153 __ BIND(L_twoRounds);
5154
5155 // Set to columnar organization and do the following 4 quarter-rounds:
5156 // QUARTERROUND(0, 4, 8, 12)
5157 // QUARTERROUND(1, 5, 9, 13)
5158 // QUARTERROUND(2, 6, 10, 14)
5159 // QUARTERROUND(3, 7, 11, 15)
5160 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
5161 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
5162 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
5163
5164 __ cc20_qr_add4(aSet, bSet); // a += b
5165 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5166 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
5167
5168 __ cc20_qr_add4(cSet, dSet); // c += d
5169 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5170 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
5171
5172 __ cc20_qr_add4(aSet, bSet); // a += b
5173 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5174 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
5175
5176 __ cc20_qr_add4(cSet, dSet); // c += d
5177 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5178     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
5179
5180 // Set to diagonal organization and do the next 4 quarter-rounds:
5181 // QUARTERROUND(0, 5, 10, 15)
5182 // QUARTERROUND(1, 6, 11, 12)
5183 // QUARTERROUND(2, 7, 8, 13)
5184 // QUARTERROUND(3, 4, 9, 14)
5185 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
5186 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
5187 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
5188
5189 __ cc20_qr_add4(aSet, bSet); // a += b
5190 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5191 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
5192
5193 __ cc20_qr_add4(cSet, dSet); // c += d
5194 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5195 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
5196
5197 __ cc20_qr_add4(aSet, bSet); // a += b
5198 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5199 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
5200
5201 __ cc20_qr_add4(cSet, dSet); // c += d
5202 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5203     __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 7
5204
5205 // Decrement and iterate
5206 __ sub(loopCtr, loopCtr, 1);
5207 __ cbnz(loopCtr, L_twoRounds);
5208
5209 __ mov(tmpAddr, state);
5210
5211 // Add the starting state back to the post-loop keystream
5212 // state. We read/interlace the state array from memory into
5213 // 4 registers similar to what we did in the beginning. Then
5214 // add the counter overlay onto workSt[12] at the end.
5215 for (i = 0; i < 16; i += 4) {
5216 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
5217 __ addv(workSt[i], __ T4S, workSt[i], v0);
5218 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
5219 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
5220 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
5221 }
5222 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
5223
5224 // Write working state into the keystream buffer. This is accomplished
5225 // by taking the lane "i" from each of the four vectors and writing
5226 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
5227 // repeating with the next 4 vectors until all 16 vectors have been used.
5228 // Then move to the next lane and repeat the process until all lanes have
5229 // been written.
5230 for (i = 0; i < 4; i++) {
5231 for (j = 0; j < 16; j += 4) {
5232 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
5233 __ post(keystream, 16));
5234 }
5235 }
5236
5237 __ mov(r0, 256); // Return length of output keystream
5238 __ leave();
5239 __ ret(lr);
5240
5241 // bind label and generate local constant data used by this stub
5242 // The constant data is broken into two 128-bit segments to be loaded
5243 // onto FloatRegisters. The first 128 bits are a counter add overlay
5244 // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
5245     // The second 128 bits are a table constant used for 8-bit left rotations.
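    // (Within each 32-bit lane the rotation table selects bytes
    // {3, 0, 1, 2}, i.e. it rotates the lane left by 8 bits.)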
5246 __ BIND(L_cc20_const);
5247 __ emit_int64(0x0000000100000000UL);
5248 __ emit_int64(0x0000000300000002UL);
5249 __ emit_int64(0x0605040702010003UL);
5250 __ emit_int64(0x0E0D0C0F0A09080BUL);
5251
5252 // record the stub entry and end
5253 store_archive_data(stub_id, start, __ pc());
5254
5255 return start;
5256 }
5257
5258 // Helpers to schedule parallel operation bundles across vector
5259 // register sequences of size 2, 4 or 8.
5260
5261 // Implement various primitive computations across vector sequences
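  //
  // For example, with the default register delta of 1, a call such as
  // vs_addv(VSeq<4>(0), __ T8H, VSeq<4>(16), VSeq<4>(20)) expands to the
  // bundle addv v0, v16, v20; ...; addv v3, v19, v23, i.e. four
  // independent vector adds issued back to back.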
5262
5263 template<int N>
5264 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5265 const VSeq<N>& v1, const VSeq<N>& v2) {
5266 // output must not be constant
5267 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5268 // output cannot overwrite pending inputs
5269 assert(!vs_write_before_read(v, v1), "output overwrites input");
5270 assert(!vs_write_before_read(v, v2), "output overwrites input");
5271 for (int i = 0; i < N; i++) {
5272 __ addv(v[i], T, v1[i], v2[i]);
5273 }
5274 }
5275
5276 template<int N>
5277 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5278 const VSeq<N>& v1, const VSeq<N>& v2) {
5279 // output must not be constant
5280 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5281 // output cannot overwrite pending inputs
5282 assert(!vs_write_before_read(v, v1), "output overwrites input");
5283 assert(!vs_write_before_read(v, v2), "output overwrites input");
5284 for (int i = 0; i < N; i++) {
5285 __ subv(v[i], T, v1[i], v2[i]);
5286 }
5287 }
5288
5289 template<int N>
5290 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5291 const VSeq<N>& v1, const VSeq<N>& v2) {
5292 // output must not be constant
5293 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5294 // output cannot overwrite pending inputs
5295 assert(!vs_write_before_read(v, v1), "output overwrites input");
5296 assert(!vs_write_before_read(v, v2), "output overwrites input");
5297 for (int i = 0; i < N; i++) {
5298 __ mulv(v[i], T, v1[i], v2[i]);
5299 }
5300 }
5301
5302 template<int N>
5303 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
5304 // output must not be constant
5305 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5306 // output cannot overwrite pending inputs
5307 assert(!vs_write_before_read(v, v1), "output overwrites input");
5308 for (int i = 0; i < N; i++) {
5309 __ negr(v[i], T, v1[i]);
5310 }
5311 }
5312
5313 template<int N>
5314 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5315 const VSeq<N>& v1, int shift) {
5316 // output must not be constant
5317 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5318 // output cannot overwrite pending inputs
5319 assert(!vs_write_before_read(v, v1), "output overwrites input");
5320 for (int i = 0; i < N; i++) {
5321 __ sshr(v[i], T, v1[i], shift);
5322 }
5323 }
5324
5325 template<int N>
5326 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5327 // output must not be constant
5328 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5329 // output cannot overwrite pending inputs
5330 assert(!vs_write_before_read(v, v1), "output overwrites input");
5331 assert(!vs_write_before_read(v, v2), "output overwrites input");
5332 for (int i = 0; i < N; i++) {
5333 __ andr(v[i], __ T16B, v1[i], v2[i]);
5334 }
5335 }
5336
5337 template<int N>
5338 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5339 // output must not be constant
5340 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5341 // output cannot overwrite pending inputs
5342 assert(!vs_write_before_read(v, v1), "output overwrites input");
5343 assert(!vs_write_before_read(v, v2), "output overwrites input");
5344 for (int i = 0; i < N; i++) {
5345 __ orr(v[i], __ T16B, v1[i], v2[i]);
5346 }
5347 }
5348
5349 template<int N>
5350 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
5351 // output must not be constant
5352 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5353 // output cannot overwrite pending inputs
5354 assert(!vs_write_before_read(v, v1), "output overwrites input");
5355 for (int i = 0; i < N; i++) {
5356 __ notr(v[i], __ T16B, v1[i]);
5357 }
5358 }
5359
5360 template<int N>
5361 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
5362 // output must not be constant
5363 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5364 // output cannot overwrite pending inputs
5365 assert(!vs_write_before_read(v, v1), "output overwrites input");
5366 assert(!vs_write_before_read(v, v2), "output overwrites input");
5367 for (int i = 0; i < N; i++) {
5368 __ sqdmulh(v[i], T, v1[i], v2[i]);
5369 }
5370 }
5371
5372 template<int N>
5373   void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
5374 // output must not be constant
5375 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5376 // output cannot overwrite pending inputs
5377 assert(!vs_write_before_read(v, v1), "output overwrites input");
5378 assert(!vs_write_before_read(v, v2), "output overwrites input");
5379 for (int i = 0; i < N; i++) {
5380 __ mlsv(v[i], T, v1[i], v2[i]);
5381 }
5382 }
5383
5384 // load N/2 successive pairs of quadword values from memory in order
5385 // into N successive vector registers of the sequence via the
5386 // address supplied in base.
5387 template<int N>
5388 void vs_ldpq(const VSeq<N>& v, Register base) {
5389 for (int i = 0; i < N; i += 2) {
5390 __ ldpq(v[i], v[i+1], Address(base, 32 * i));
5391 }
5392 }
5393
5394 // load N/2 successive pairs of quadword values from memory in order
5395 // into N vector registers of the sequence via the address supplied
5396 // in base using post-increment addressing
5397 template<int N>
5398 void vs_ldpq_post(const VSeq<N>& v, Register base) {
5399     static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
5400 for (int i = 0; i < N; i += 2) {
5401 __ ldpq(v[i], v[i+1], __ post(base, 32));
5402 }
5403 }
5404
5405 // store N successive vector registers of the sequence into N/2
5406 // successive pairs of quadword memory locations via the address
5407 // supplied in base using post-increment addressing
5408 template<int N>
5409 void vs_stpq_post(const VSeq<N>& v, Register base) {
5410     static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
5411 for (int i = 0; i < N; i += 2) {
5412 __ stpq(v[i], v[i+1], __ post(base, 32));
5413 }
5414 }
5415
5416 // load N/2 pairs of quadword values from memory de-interleaved into
5417 // N vector registers 2 at a time via the address supplied in base
5418 // using post-increment addressing.
5419 template<int N>
5420 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5421     static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
5422 for (int i = 0; i < N; i += 2) {
5423 __ ld2(v[i], v[i+1], T, __ post(base, 32));
5424 }
5425 }
5426
5427 // store N vector registers interleaved into N/2 pairs of quadword
5428 // memory locations via the address supplied in base using
5429 // post-increment addressing.
5430 template<int N>
5431 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5432     static_assert((N & (N - 1)) == 0, "sequence length must be a power of two");
5433 for (int i = 0; i < N; i += 2) {
5434 __ st2(v[i], v[i+1], T, __ post(base, 32));
5435 }
5436 }
5437
5438 // load N quadword values from memory de-interleaved into N vector
5439 // registers 3 elements at a time via the address supplied in base.
5440 template<int N>
5441 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5442 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
5443 for (int i = 0; i < N; i += 3) {
5444 __ ld3(v[i], v[i+1], v[i+2], T, base);
5445 }
5446 }
5447
5448 // load N quadword values from memory de-interleaved into N vector
5449 // registers 3 elements at a time via the address supplied in base
5450 // using post-increment addressing.
5451 template<int N>
5452 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5453 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
5454 for (int i = 0; i < N; i += 3) {
5455 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
5456 }
5457 }
5458
5459 // load N/2 pairs of quadword values from memory into N vector
5460 // registers via the address supplied in base with each pair indexed
5461   // using the start offset plus the corresponding entry in the
5462 // offsets array
5463 template<int N>
5464 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
5465 for (int i = 0; i < N/2; i++) {
5466 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5467 }
5468 }
5469
5470 // store N vector registers into N/2 pairs of quadword memory
5471 // locations via the address supplied in base with each pair indexed
5472   // using the start offset plus the corresponding entry in the
5473 // offsets array
5474 template<int N>
5475   void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
5476 for (int i = 0; i < N/2; i++) {
5477 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5478 }
5479 }
5480
5481 // load N single quadword values from memory into N vector registers
5482 // via the address supplied in base with each value indexed using
5483   // the start offset plus the corresponding entry in the offsets
5484 // array
5485 template<int N>
5486 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5487 int start, int (&offsets)[N]) {
5488 for (int i = 0; i < N; i++) {
5489 __ ldr(v[i], T, Address(base, start + offsets[i]));
5490 }
5491 }
5492
5493 // store N vector registers into N single quadword memory locations
5494 // via the address supplied in base with each value indexed using
5495   // the start offset plus the corresponding entry in the offsets
5496 // array
5497 template<int N>
5498 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5499 int start, int (&offsets)[N]) {
5500 for (int i = 0; i < N; i++) {
5501 __ str(v[i], T, Address(base, start + offsets[i]));
5502 }
5503 }
5504
5505 // load N/2 pairs of quadword values from memory de-interleaved into
5506 // N vector registers 2 at a time via the address supplied in base
5507   // with each pair indexed using the start offset plus the
5508 // corresponding entry in the offsets array
5509 template<int N>
5510 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5511 Register tmp, int start, int (&offsets)[N/2]) {
5512 for (int i = 0; i < N/2; i++) {
5513 __ add(tmp, base, start + offsets[i]);
5514 __ ld2(v[2*i], v[2*i+1], T, tmp);
5515 }
5516 }
5517
5518 // store N vector registers 2 at a time interleaved into N/2 pairs
5519 // of quadword memory locations via the address supplied in base
5520   // with each pair indexed using the start offset plus the
5521 // corresponding entry in the offsets array
5522 template<int N>
5523 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5524 Register tmp, int start, int (&offsets)[N/2]) {
5525 for (int i = 0; i < N/2; i++) {
5526 __ add(tmp, base, start + offsets[i]);
5527 __ st2(v[2*i], v[2*i+1], T, tmp);
5528 }
5529 }
5530
5531 // Helper routines for various flavours of Montgomery multiply
5532
5533 // Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
5534 // multiplications in parallel
5535 //
5536
5537 // See the montMul() method of the sun.security.provider.ML_DSA
5538 // class.
5539 //
5540   // Computes 4x4S results or 4x8H results
5541 // a = b * c * 2^MONT_R_BITS mod MONT_Q
5542 // Inputs: vb, vc - 4x4S or 4x8H vector register sequences
5543 // vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5544 // Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
5545 // Outputs: va - 4x4S or 4x8H vector register sequences
5546 // vb, vc, vtmp and vq must all be disjoint
5547 // va must be disjoint from all other inputs/temps or must equal vc
5548 // va must have a non-zero delta i.e. it must not be a constant vseq.
5549 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
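  //
  // A scalar sketch of the per-lane computation for the 8H case
  // (R = 2^16; q and qinv are the two constants held in vq; illustrative
  // only, ignoring sqdmulh saturation):
  //
  //   int16_t montmul(int16_t b, int16_t c) {
  //     int16_t aHigh = (int16_t)(((int32_t)2 * b * c) >> 16);  // sqdmulh
  //     int16_t aLow  = (int16_t)(b * c);                       // mulv
  //     int16_t m     = (int16_t)(aLow * qinv);                 // mulv
  //     int16_t n     = (int16_t)(((int32_t)2 * m * q) >> 16);  // sqdmulh
  //     return (int16_t)((aHigh - n) >> 1);                     // shsubv
  //   }
  //
  // i.e. a signed Montgomery reduction of the product b * c.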
5550 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5551 Assembler::SIMD_Arrangement T,
5552 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5553 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5554 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5555 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5556 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5557
5558 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5559 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5560
5561 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5562
5563 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5564 assert(vs_disjoint(va, vb), "va and vb overlap");
5565 assert(vs_disjoint(va, vq), "va and vq overlap");
5566 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5567 assert(!va.is_constant(), "output vector must identify 4 different registers");
5568
5569 // schedule 4 streams of instructions across the vector sequences
5570 for (int i = 0; i < 4; i++) {
5571 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5572 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5573 }
5574
5575 for (int i = 0; i < 4; i++) {
5576 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5577 }
5578
5579 for (int i = 0; i < 4; i++) {
5580 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5581 }
5582
5583 for (int i = 0; i < 4; i++) {
5584 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5585 }
5586 }
5587
5588   // Perform 8 32-bit (2x4S) or 16 16-bit (2x8H) Montgomery
5589 // multiplications in parallel
5590 //
5591
5592 // See the montMul() method of the sun.security.provider.ML_DSA
5593 // class.
5594 //
5595   // Computes 2x4S results or 2x8H results
5596   //    a = b * c * 2^MONT_R_BITS mod MONT_Q
5597   // Inputs:  vb, vc - 2x4S or 2x8H vector register sequences
5598   //          vq - 2x4S or 2x8H constants <MONT_Q, MONT_Q_INV_MOD_R>
5599   // Temps:   vtmp - 2x4S or 2x8H vector sequence trashed after call
5600   // Outputs: va - 2x4S or 2x8H vector register sequences
5601 // vb, vc, vtmp and vq must all be disjoint
5602 // va must be disjoint from all other inputs/temps or must equal vc
5603 // va must have a non-zero delta i.e. it must not be a constant vseq.
5604 // n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5605 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5606 Assembler::SIMD_Arrangement T,
5607 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5608 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5609 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5610 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5611 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5612
5613 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5614 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5615
5616 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5617
5618 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5619 assert(vs_disjoint(va, vb), "va and vb overlap");
5620 assert(vs_disjoint(va, vq), "va and vq overlap");
5621 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5622 assert(!va.is_constant(), "output vector must identify 2 different registers");
5623
5624 // schedule 2 streams of instructions across the vector sequences
5625 for (int i = 0; i < 2; i++) {
5626 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5627 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5628 }
5629
5630 for (int i = 0; i < 2; i++) {
5631 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5632 }
5633
5634 for (int i = 0; i < 2; i++) {
5635 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5636 }
5637
5638 for (int i = 0; i < 2; i++) {
5639 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5640 }
5641 }
5642
5643 // Perform 16 16-bit Montgomery multiplications in parallel.
5644 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5645 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5646 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5647 // It will assert that the register use is valid
5648 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5649 }
5650
5651 // Perform 32 16-bit Montgomery multiplications in parallel.
5652 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5653 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5654 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5655 // It will assert that the register use is valid
5656 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5657 }
5658
5659 // Perform 64 16-bit Montgomery multiplications in parallel.
5660 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
5661 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5662 // Schedule two successive 4x8H multiplies via the montmul helper
5663 // on the front and back halves of va, vb and vc. The helper will
5664 // assert that the register use has no overlap conflicts on each
5665 // individual call but we also need to ensure that the necessary
5666 // disjoint/equality constraints are met across both calls.
5667
5668 // vb, vc, vtmp and vq must be disjoint. va must either be
5669 // disjoint from all other registers or equal vc
5670
5671 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5672 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5673 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5674
5675 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5676 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5677
5678 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5679
5680 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5681 assert(vs_disjoint(va, vb), "va and vb overlap");
5682 assert(vs_disjoint(va, vq), "va and vq overlap");
5683 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5684
5685 // we multiply the front and back halves of each sequence 4 at a
5686 // time because
5687 //
5688 // 1) we are currently only able to get 4-way instruction
5689 // parallelism at best
5690 //
5691 // 2) we need registers for the constants in vq and temporary
5692 // scratch registers to hold intermediate results so vtmp can only
5693 // be a VSeq<4> which means we only have 4 scratch slots
5694
5695 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5696 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5697 }
5698
5699 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5700 const VSeq<4>& vc,
5701 const VSeq<4>& vtmp,
5702 const VSeq<2>& vq) {
5703 // compute a = montmul(a1, c)
5704 kyber_montmul32(vc, va1, vc, vtmp, vq);
5705     // output a1 = a0 - a
5706 vs_subv(va1, __ T8H, va0, vc);
5707 // and a0 = a0 + a
5708 vs_addv(va0, __ T8H, va0, vc);
5709 }
5710
5711 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5712 const VSeq<4>& vb,
5713 const VSeq<4>& vtmp1,
5714 const VSeq<4>& vtmp2,
5715 const VSeq<2>& vq) {
5716 // compute c = a0 - a1
5717 vs_subv(vtmp1, __ T8H, va0, va1);
5718 // output a0 = a0 + a1
5719 vs_addv(va0, __ T8H, va0, va1);
5720 // output a1 = b montmul c
5721 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5722 }
5723
5724 void load64shorts(const VSeq<8>& v, Register shorts) {
5725 vs_ldpq_post(v, shorts);
5726 }
5727
5728 void load32shorts(const VSeq<4>& v, Register shorts) {
5729 vs_ldpq_post(v, shorts);
5730 }
5731
5732 void store64shorts(VSeq<8> v, Register tmpAddr) {
5733 vs_stpq_post(v, tmpAddr);
5734 }
5735
5736 // Kyber NTT function.
5737 // Implements
5738 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5739 //
5740 // coeffs (short[256]) = c_rarg0
5741 // ntt_zetas (short[256]) = c_rarg1
5742 address generate_kyberNtt() {
5743 StubId stub_id = StubId::stubgen_kyberNtt_id;
5744 int entry_count = StubInfo::entry_count(stub_id);
5745 assert(entry_count == 1, "sanity check");
5746 address start = load_archive_data(stub_id);
5747 if (start != nullptr) {
5748 return start;
5749 }
5750 __ align(CodeEntryAlignment);
5751 StubCodeMark mark(this, stub_id);
5752 start = __ pc();
5753 __ enter();
5754
5755 const Register coeffs = c_rarg0;
5756 const Register zetas = c_rarg1;
5757
5758 const Register kyberConsts = r10;
5759 const Register tmpAddr = r11;
5760
5761 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5762 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5763 VSeq<2> vq(30); // n.b. constants overlap vs3
5764
5765 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5766 // load the montmul constants
5767 vs_ldpq(vq, kyberConsts);
5768
5769 // Each level corresponds to an iteration of the outermost loop of the
5770 // Java method seilerNTT(int[] coeffs). There are some differences
5771 // from what is done in the seilerNTT() method, though:
5772     // 1. The computation uses 16-bit signed values; we do not convert them
5773 // to ints here.
5774 // 2. The zetas are delivered in a bigger array, 128 zetas are stored in
5775 // this array for each level, it is easier that way to fill up the vector
5776 // registers.
5777 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5778 // multiplications (this is because that way there should not be any
5779     // overflow during the inverse NTT computation); here we use R = 2^16 so
5780 // that we can use the 16-bit arithmetic in the vector unit.
5781 //
5782 // On each level, we fill up the vector registers in such a way that the
5783 // array elements that need to be multiplied by the zetas go into one
5784 // set of vector registers while the corresponding ones that don't need to
5785 // be multiplied, go into another set.
5786 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5787 // registers interleaving the steps of 4 identical computations,
5788 // each done on 8 16-bit values per register.
5789
5790 // At levels 0-3 the coefficients multiplied by or added/subtracted
5791 // to the zetas occur in discrete blocks whose size is some multiple
5792 // of 32.
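    // In scalar terms each level applies a Cooley-Tukey style butterfly
    // to coefficient pairs (a, b) a fixed distance apart (illustrative
    // sketch; montmul is the Montgomery multiply described above):
    //
    //   int16_t t = montmul(zeta, b);
    //   b = (int16_t)(a - t);
    //   a = (int16_t)(a + t);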
5793
5794 // level 0
5795 __ add(tmpAddr, coeffs, 256);
5796 load64shorts(vs1, tmpAddr);
5797 load64shorts(vs2, zetas);
5798 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5799 __ add(tmpAddr, coeffs, 0);
5800 load64shorts(vs1, tmpAddr);
5801 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5802 vs_addv(vs1, __ T8H, vs1, vs2);
5803 __ add(tmpAddr, coeffs, 0);
5804 vs_stpq_post(vs1, tmpAddr);
5805 __ add(tmpAddr, coeffs, 256);
5806 vs_stpq_post(vs3, tmpAddr);
5807 // restore montmul constants
5808 vs_ldpq(vq, kyberConsts);
5809 load64shorts(vs1, tmpAddr);
5810 load64shorts(vs2, zetas);
5811 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5812 __ add(tmpAddr, coeffs, 128);
5813 load64shorts(vs1, tmpAddr);
5814 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5815 vs_addv(vs1, __ T8H, vs1, vs2);
5816 __ add(tmpAddr, coeffs, 128);
5817 store64shorts(vs1, tmpAddr);
5818 __ add(tmpAddr, coeffs, 384);
5819 store64shorts(vs3, tmpAddr);
5820
5821 // level 1
5822 // restore montmul constants
5823 vs_ldpq(vq, kyberConsts);
5824 __ add(tmpAddr, coeffs, 128);
5825 load64shorts(vs1, tmpAddr);
5826 load64shorts(vs2, zetas);
5827 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5828 __ add(tmpAddr, coeffs, 0);
5829 load64shorts(vs1, tmpAddr);
5830 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5831 vs_addv(vs1, __ T8H, vs1, vs2);
5832 __ add(tmpAddr, coeffs, 0);
5833 store64shorts(vs1, tmpAddr);
5834 store64shorts(vs3, tmpAddr);
5835 vs_ldpq(vq, kyberConsts);
5836 __ add(tmpAddr, coeffs, 384);
5837 load64shorts(vs1, tmpAddr);
5838 load64shorts(vs2, zetas);
5839 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5840 __ add(tmpAddr, coeffs, 256);
5841 load64shorts(vs1, tmpAddr);
5842 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5843 vs_addv(vs1, __ T8H, vs1, vs2);
5844 __ add(tmpAddr, coeffs, 256);
5845 store64shorts(vs1, tmpAddr);
5846 store64shorts(vs3, tmpAddr);
5847
5848 // level 2
5849 vs_ldpq(vq, kyberConsts);
5850 int offsets1[4] = { 0, 32, 128, 160 };
5851 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5852 load64shorts(vs2, zetas);
5853 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5854 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5855 // kyber_subv_addv64();
5856 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5857 vs_addv(vs1, __ T8H, vs1, vs2);
5858 __ add(tmpAddr, coeffs, 0);
5859 vs_stpq_post(vs_front(vs1), tmpAddr);
5860 vs_stpq_post(vs_front(vs3), tmpAddr);
5861 vs_stpq_post(vs_back(vs1), tmpAddr);
5862 vs_stpq_post(vs_back(vs3), tmpAddr);
5863 vs_ldpq(vq, kyberConsts);
5864 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5865 load64shorts(vs2, zetas);
5866 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5867 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5868 // kyber_subv_addv64();
5869 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5870 vs_addv(vs1, __ T8H, vs1, vs2);
5871 __ add(tmpAddr, coeffs, 256);
5872 vs_stpq_post(vs_front(vs1), tmpAddr);
5873 vs_stpq_post(vs_front(vs3), tmpAddr);
5874 vs_stpq_post(vs_back(vs1), tmpAddr);
5875 vs_stpq_post(vs_back(vs3), tmpAddr);
5876
5877 // level 3
5878 vs_ldpq(vq, kyberConsts);
5879 int offsets2[4] = { 0, 64, 128, 192 };
5880 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5881 load64shorts(vs2, zetas);
5882 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5883 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5884 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5885 vs_addv(vs1, __ T8H, vs1, vs2);
5886 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5887 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5888
5889 vs_ldpq(vq, kyberConsts);
5890 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5891 load64shorts(vs2, zetas);
5892 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5893 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5894 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5895 vs_addv(vs1, __ T8H, vs1, vs2);
5896 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5897 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5898
5899 // level 4
5900 // At level 4 coefficients occur in 8 discrete blocks of size 16
5901     // so they are loaded using an ldr at 8 distinct offsets.
5902
5903 vs_ldpq(vq, kyberConsts);
5904 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5905 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5906 load64shorts(vs2, zetas);
5907 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5908 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5909 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5910 vs_addv(vs1, __ T8H, vs1, vs2);
5911 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5912 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5913
5914 vs_ldpq(vq, kyberConsts);
5915 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5916 load64shorts(vs2, zetas);
5917 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5918 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5919 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5920 vs_addv(vs1, __ T8H, vs1, vs2);
5921 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5922 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5923
5924 // level 5
5925     // At level 5 related coefficients occur in discrete blocks of size 8, so
5926     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
5927
5928 vs_ldpq(vq, kyberConsts);
5929 int offsets4[4] = { 0, 32, 64, 96 };
5930 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5931 load32shorts(vs_front(vs2), zetas);
5932 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5933 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5934 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5935 load32shorts(vs_front(vs2), zetas);
5936 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5937 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5938 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5939 load32shorts(vs_front(vs2), zetas);
5940 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5941 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5942
5943 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5944 load32shorts(vs_front(vs2), zetas);
5945 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5946 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5947
5948 // level 6
5949     // At level 6 related coefficients occur in discrete blocks of size 4, so
5950     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
5951
5952 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5953 load32shorts(vs_front(vs2), zetas);
5954 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5955 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5956 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5958 load32shorts(vs_front(vs2), zetas);
5959 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5960 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5961
5962 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5963 load32shorts(vs_front(vs2), zetas);
5964 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5965 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5966
5967 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5968 load32shorts(vs_front(vs2), zetas);
5969 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5970 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5971
5972 __ leave(); // required for proper stackwalking of RuntimeStub frame
5973 __ mov(r0, zr); // return 0
5974 __ ret(lr);
5975
5976 // record the stub entry and end
5977 store_archive_data(stub_id, start, __ pc());
5978
5979 return start;
5980 }
5981
5982 // Kyber Inverse NTT function
5983 // Implements
5984 // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
5985 //
5986 // coeffs (short[256]) = c_rarg0
5987 // ntt_zetas (short[256]) = c_rarg1
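  //
  // In scalar terms each level applies a Gentleman-Sande style butterfly
  // to coefficient pairs (a, b) (illustrative sketch; montmul is the
  // Montgomery multiply described above):
  //
  //   int16_t t = (int16_t)(a - b);
  //   a = (int16_t)(a + b);
  //   b = montmul(zeta, t);
  //
  // with Barrett reductions inserted where intermediate values could
  // otherwise overflow 16 bits.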
5988 address generate_kyberInverseNtt() {
5989 StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
5990 int entry_count = StubInfo::entry_count(stub_id);
5991 assert(entry_count == 1, "sanity check");
5992 address start = load_archive_data(stub_id);
5993 if (start != nullptr) {
5994 return start;
5995 }
5996 __ align(CodeEntryAlignment);
5997 StubCodeMark mark(this, stub_id);
5998 start = __ pc();
5999 __ enter();
6000
6001 const Register coeffs = c_rarg0;
6002 const Register zetas = c_rarg1;
6003
6004 const Register kyberConsts = r10;
6005 const Register tmpAddr = r11;
6006 const Register tmpAddr2 = c_rarg2;
6007
6008 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
6009 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6010 VSeq<2> vq(30); // n.b. constants overlap vs3
6011
6012 __ lea(kyberConsts,
6013 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6014
6015 // level 0
6016     // At level 0 related coefficients occur in discrete blocks of size 4, so
6017     // they need to be loaded interleaved using an ld2 operation with arrangement 4S.
6018
6019 vs_ldpq(vq, kyberConsts);
6020 int offsets4[4] = { 0, 32, 64, 96 };
6021 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6022 load32shorts(vs_front(vs2), zetas);
6023 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6024 vs_front(vs2), vs_back(vs2), vtmp, vq);
6025 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6026 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6027 load32shorts(vs_front(vs2), zetas);
6028 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6029 vs_front(vs2), vs_back(vs2), vtmp, vq);
6030 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6031 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6032 load32shorts(vs_front(vs2), zetas);
6033 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6034 vs_front(vs2), vs_back(vs2), vtmp, vq);
6035 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6036 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6037 load32shorts(vs_front(vs2), zetas);
6038 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6039 vs_front(vs2), vs_back(vs2), vtmp, vq);
6040 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6041
6042 // level 1
6043     // At level 1 related coefficients occur in discrete blocks of size 8, so
6044     // they need to be loaded interleaved using an ld2 operation with arrangement 2D.
6045
6046 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6047 load32shorts(vs_front(vs2), zetas);
6048 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6049 vs_front(vs2), vs_back(vs2), vtmp, vq);
6050 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6051 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6052 load32shorts(vs_front(vs2), zetas);
6053 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6054 vs_front(vs2), vs_back(vs2), vtmp, vq);
6055 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6056
6057 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6058 load32shorts(vs_front(vs2), zetas);
6059 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6060 vs_front(vs2), vs_back(vs2), vtmp, vq);
6061 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6062 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6063 load32shorts(vs_front(vs2), zetas);
6064 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6065 vs_front(vs2), vs_back(vs2), vtmp, vq);
6066 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6067
6068 // level 2
    // At level 2 coefficients occur in 8 discrete blocks of size 16,
    // so they are loaded using an ldr at 8 distinct offsets.
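    //
    // From this level on the same butterfly is expressed with explicit adds,
    // subs and montmuls on whole vectors: the sums are stored back first,
    // then the differences are montmul'd by the next zetas and stored
    // (a sketch of the pattern that repeats up to level 6).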
6071
6072 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6073 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
6074 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
6075 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6076 vs_subv(vs1, __ T8H, vs1, vs2);
6077 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
6078 load64shorts(vs2, zetas);
6079 vs_ldpq(vq, kyberConsts);
6080 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6081 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
6082
6083 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
6084 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
6085 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6086 vs_subv(vs1, __ T8H, vs1, vs2);
6087 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
6088 load64shorts(vs2, zetas);
6089 vs_ldpq(vq, kyberConsts);
6090 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6091 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
6092
6093 // Barrett reduction at indexes where overflow may happen
6094
6095 // load q and the multiplier for the Barrett reduction
6096 __ add(tmpAddr, kyberConsts, 16);
6097 vs_ldpq(vq, tmpAddr);
6098
6099 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
6100 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
6101 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
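    // As a scalar sketch, each sqdmulh/sshr/mlsv triple below reduces a
    // coefficient c as
    //   t = (c * barrettMultiplier) >> 26;  // sqdmulh yields (2*c*m) >> 16, sshr adds >> 11
    //   c = c - t * q;
    // assuming vq1 and vq2 hold q and the Barrett multiplier loaded above.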
6102 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
6103 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6104 vs_sshr(vs2, __ T8H, vs2, 11);
6105 vs_mlsv(vs1, __ T8H, vs2, vq1);
6106 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
6107 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
6108 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6109 vs_sshr(vs2, __ T8H, vs2, 11);
6110 vs_mlsv(vs1, __ T8H, vs2, vq1);
6111 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
6112
6113 // level 3
    // From level 3 upwards coefficients occur in discrete blocks whose size is
    // some multiple of 32, so they can be loaded using ldpq and suitable indexes.
6116
6117 int offsets2[4] = { 0, 64, 128, 192 };
6118 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
6119 vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
6120 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6121 vs_subv(vs1, __ T8H, vs1, vs2);
6122 vs_stpq_indexed(vs3, coeffs, 0, offsets2);
6123 load64shorts(vs2, zetas);
6124 vs_ldpq(vq, kyberConsts);
6125 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6126 vs_stpq_indexed(vs2, coeffs, 32, offsets2);
6127
6128 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
6129 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
6130 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6131 vs_subv(vs1, __ T8H, vs1, vs2);
6132 vs_stpq_indexed(vs3, coeffs, 256, offsets2);
6133 load64shorts(vs2, zetas);
6134 vs_ldpq(vq, kyberConsts);
6135 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6136 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
6137
6138 // level 4
6139
6140 int offsets1[4] = { 0, 32, 128, 160 };
6141 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
6142 vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
6143 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6144 vs_subv(vs1, __ T8H, vs1, vs2);
6145 vs_stpq_indexed(vs3, coeffs, 0, offsets1);
6146 load64shorts(vs2, zetas);
6147 vs_ldpq(vq, kyberConsts);
6148 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6149 vs_stpq_indexed(vs2, coeffs, 64, offsets1);
6150
6151 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
6152 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
6153 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6154 vs_subv(vs1, __ T8H, vs1, vs2);
6155 vs_stpq_indexed(vs3, coeffs, 256, offsets1);
6156 load64shorts(vs2, zetas);
6157 vs_ldpq(vq, kyberConsts);
6158 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6159 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
6160
6161 // level 5
6162
6163 __ add(tmpAddr, coeffs, 0);
6164 load64shorts(vs1, tmpAddr);
6165 __ add(tmpAddr, coeffs, 128);
6166 load64shorts(vs2, tmpAddr);
6167 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6168 vs_subv(vs1, __ T8H, vs1, vs2);
6169 __ add(tmpAddr, coeffs, 0);
6170 store64shorts(vs3, tmpAddr);
6171 load64shorts(vs2, zetas);
6172 vs_ldpq(vq, kyberConsts);
6173 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6174 __ add(tmpAddr, coeffs, 128);
6175 store64shorts(vs2, tmpAddr);
6176
6177 load64shorts(vs1, tmpAddr);
6178 __ add(tmpAddr, coeffs, 384);
6179 load64shorts(vs2, tmpAddr);
6180 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6181 vs_subv(vs1, __ T8H, vs1, vs2);
6182 __ add(tmpAddr, coeffs, 256);
6183 store64shorts(vs3, tmpAddr);
6184 load64shorts(vs2, zetas);
6185 vs_ldpq(vq, kyberConsts);
6186 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6187 __ add(tmpAddr, coeffs, 384);
6188 store64shorts(vs2, tmpAddr);
6189
6190 // Barrett reduction at indexes where overflow may happen
6191
6192 // load q and the multiplier for the Barrett reduction
6193 __ add(tmpAddr, kyberConsts, 16);
6194 vs_ldpq(vq, tmpAddr);
6195
6196 int offsets0[2] = { 0, 256 };
6197 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
6198 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6199 vs_sshr(vs2, __ T8H, vs2, 11);
6200 vs_mlsv(vs1, __ T8H, vs2, vq1);
6201 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
6202
6203 // level 6
6204
6205 __ add(tmpAddr, coeffs, 0);
6206 load64shorts(vs1, tmpAddr);
6207 __ add(tmpAddr, coeffs, 256);
6208 load64shorts(vs2, tmpAddr);
6209 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6210 vs_subv(vs1, __ T8H, vs1, vs2);
6211 __ add(tmpAddr, coeffs, 0);
6212 store64shorts(vs3, tmpAddr);
6213 load64shorts(vs2, zetas);
6214 vs_ldpq(vq, kyberConsts);
6215 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6216 __ add(tmpAddr, coeffs, 256);
6217 store64shorts(vs2, tmpAddr);
6218
6219 __ add(tmpAddr, coeffs, 128);
6220 load64shorts(vs1, tmpAddr);
6221 __ add(tmpAddr, coeffs, 384);
6222 load64shorts(vs2, tmpAddr);
6223 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6224 vs_subv(vs1, __ T8H, vs1, vs2);
6225 __ add(tmpAddr, coeffs, 128);
6226 store64shorts(vs3, tmpAddr);
6227 load64shorts(vs2, zetas);
6228 vs_ldpq(vq, kyberConsts);
6229 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6230 __ add(tmpAddr, coeffs, 384);
6231 store64shorts(vs2, tmpAddr);
6232
6233 // multiply by 2^-n
6234
6235 // load toMont(2^-n mod q)
6236 __ add(tmpAddr, kyberConsts, 48);
6237 __ ldr(v29, __ Q, tmpAddr);
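
    // Each montmul below computes c = montmul(c, toMont(2^-n mod q)), i.e.
    // c * 2^-n mod q, the final scaling step of the inverse NTT (a sketch of
    // the intent; the constant was loaded into v29 above).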
6238
6239 vs_ldpq(vq, kyberConsts);
6240 __ add(tmpAddr, coeffs, 0);
6241 load64shorts(vs1, tmpAddr);
6242 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6243 __ add(tmpAddr, coeffs, 0);
6244 store64shorts(vs2, tmpAddr);
6245
    // tmpAddr now contains coeffs + 128 because store64shorts advances it as it stores
6247 load64shorts(vs1, tmpAddr);
6248 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6249 __ add(tmpAddr, coeffs, 128);
6250 store64shorts(vs2, tmpAddr);
6251
6252 // now tmpAddr contains coeffs + 256
6253 load64shorts(vs1, tmpAddr);
6254 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6255 __ add(tmpAddr, coeffs, 256);
6256 store64shorts(vs2, tmpAddr);
6257
6258 // now tmpAddr contains coeffs + 384
6259 load64shorts(vs1, tmpAddr);
6260 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6261 __ add(tmpAddr, coeffs, 384);
6262 store64shorts(vs2, tmpAddr);
6263
6264 __ leave(); // required for proper stackwalking of RuntimeStub frame
6265 __ mov(r0, zr); // return 0
6266 __ ret(lr);
6267
6268 // record the stub entry and end
6269 store_archive_data(stub_id, start, __ pc());
6270
6271 return start;
6272 }
6273
6274 // Kyber multiply polynomials in the NTT domain.
6275 // Implements
6276 // static int implKyberNttMult(
6277 // short[] result, short[] ntta, short[] nttb, short[] zetas) {}
6278 //
6279 // result (short[256]) = c_rarg0
6280 // ntta (short[256]) = c_rarg1
6281 // nttb (short[256]) = c_rarg2
6282 // zetas (short[128]) = c_rarg3
6283 address generate_kyberNttMult() {
6284 StubId stub_id = StubId::stubgen_kyberNttMult_id;
6285 int entry_count = StubInfo::entry_count(stub_id);
6286 assert(entry_count == 1, "sanity check");
6287 address start = load_archive_data(stub_id);
6288 if (start != nullptr) {
6289 return start;
6290 }
6291 __ align(CodeEntryAlignment);
6292 StubCodeMark mark(this, stub_id);
6293 start = __ pc();
6294 __ enter();
6295
6296 const Register result = c_rarg0;
6297 const Register ntta = c_rarg1;
6298 const Register nttb = c_rarg2;
6299 const Register zetas = c_rarg3;
6300
6301 const Register kyberConsts = r10;
6302 const Register limit = r11;
6303
6304 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
6305 VSeq<4> vs3(16), vs4(20);
6306 VSeq<2> vq(30); // pair of constants for montmul: q, qinv
6307 VSeq<2> vz(28); // pair of zetas
6308 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
6309
6310 __ lea(kyberConsts,
6311 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6312
6313 Label kyberNttMult_loop;
6314
6315 __ add(limit, result, 512);
6316
6317 // load q and qinv
6318 vs_ldpq(vq, kyberConsts);
6319
6320 // load R^2 mod q (to convert back from Montgomery representation)
6321 __ add(kyberConsts, kyberConsts, 64);
6322 __ ldr(v27, __ Q, kyberConsts);
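
    // Each loop iteration multiplies 16 pairs of degree-1 polynomials modulo
    // (X^2 - zeta). As a scalar sketch, for inputs (a0 + a1*X) and (b0 + b1*X)
    // and the relevant zeta z the result pair is
    //   r0 = montmul(a0, b0) + montmul(montmul(a1, b1), z)
    //   r1 = montmul(a0, b1) + montmul(a1, b0)
    // followed by a montmul with R^2 mod q to remove the extra Montgomery factor.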
6323
6324 __ BIND(kyberNttMult_loop);
6325
6326 // load 16 zetas
6327 vs_ldpq_post(vz, zetas);
6328
6329 // load 2 sets of 32 coefficients from the two input arrays
6330 // interleaved as shorts. i.e. pairs of shorts adjacent in memory
6331 // are striped across pairs of vector registers
6332 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
6333 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
6334 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
6335 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
6336
6337 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
6338 // i.e. montmul the first and second halves of vs1 in order and
6339 // then with one sequence reversed storing the two results in vs3
6340 //
6341 // vs3[0] <- montmul(a0, b0)
6342 // vs3[1] <- montmul(a1, b1)
6343 // vs3[2] <- montmul(a0, b1)
6344 // vs3[3] <- montmul(a1, b0)
6345 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
6346 kyber_montmul16(vs_back(vs3),
6347 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
6348
6349 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
6350 // i.e. montmul the first and second halves of vs4 in order and
6351 // then with one sequence reversed storing the two results in vs1
6352 //
6353 // vs1[0] <- montmul(a2, b2)
6354 // vs1[1] <- montmul(a3, b3)
6355 // vs1[2] <- montmul(a2, b3)
6356 // vs1[3] <- montmul(a3, b2)
6357 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
6358 kyber_montmul16(vs_back(vs1),
6359 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
6360
6361 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
6362 // We can schedule two montmuls at a time if we use a suitable vector
6363 // sequence <vs3[1], vs1[1]>.
6364 int delta = vs1[1]->encoding() - vs3[1]->encoding();
6365 VSeq<2> vs5(vs3[1], delta);
6366
6367 // vs3[1] <- montmul(montmul(a1, b1), z0)
6368 // vs1[1] <- montmul(montmul(a3, b3), z1)
6369 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
6370
6371 // add results in pairs storing in vs3
6372 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
6373 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
6374 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
6375
6376 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
6377 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
6378 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
6379
6380 // vs1 <- montmul(vs3, montRSquareModQ)
6381 kyber_montmul32(vs1, vs3, vc, vs2, vq);
6382
6383 // store back the two pairs of result vectors de-interleaved as 8H elements
    // i.e. storing each pair of shorts striped across a register pair adjacent
6385 // in memory
6386 vs_st2_post(vs1, __ T8H, result);
6387
6388 __ cmp(result, limit);
6389 __ br(Assembler::NE, kyberNttMult_loop);
6390
6391 __ leave(); // required for proper stackwalking of RuntimeStub frame
6392 __ mov(r0, zr); // return 0
6393 __ ret(lr);
6394
6395 // record the stub entry and end
6396 store_archive_data(stub_id, start, __ pc());
6397
6398 return start;
6399 }
6400
6401 // Kyber add 2 polynomials.
6402 // Implements
6403 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
6404 //
6405 // result (short[256]) = c_rarg0
6406 // a (short[256]) = c_rarg1
6407 // b (short[256]) = c_rarg2
6408 address generate_kyberAddPoly_2() {
6409 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
6410 int entry_count = StubInfo::entry_count(stub_id);
6411 assert(entry_count == 1, "sanity check");
6412 address start = load_archive_data(stub_id);
6413 if (start != nullptr) {
6414 return start;
6415 }
6416 __ align(CodeEntryAlignment);
6417 StubCodeMark mark(this, stub_id);
6418 start = __ pc();
6419 __ enter();
6420
6421 const Register result = c_rarg0;
6422 const Register a = c_rarg1;
6423 const Register b = c_rarg2;
6424
6425 const Register kyberConsts = r11;
6426
6427 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
6428 // So, we can load, add and store the data in 3 groups of 11,
6429 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
6430 // registers. A further constraint is that the mapping needs
6431 // to skip callee saves. So, we allocate the register
6432 // sequences using two 8 sequences, two 2 sequences and two
6433 // single registers.
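    //
    // As a scalar sketch the loop below computes, per element,
    //   result[i] = (short)(a[i] + b[i] + kyber_q)
    // where kyber_q is the constant loaded into v31 below.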
6434 VSeq<8> vs1_1(0);
6435 VSeq<2> vs1_2(16);
6436 FloatRegister vs1_3 = v28;
6437 VSeq<8> vs2_1(18);
6438 VSeq<2> vs2_2(26);
6439 FloatRegister vs2_3 = v29;
6440
6441 // two constant vector sequences
6442 VSeq<8> vc_1(31, 0);
6443 VSeq<2> vc_2(31, 0);
6444
6445 FloatRegister vc_3 = v31;
6446 __ lea(kyberConsts,
6447 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6448
6449 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6450 for (int i = 0; i < 3; i++) {
6451 // load 80 or 88 values from a into vs1_1/2/3
6452 vs_ldpq_post(vs1_1, a);
6453 vs_ldpq_post(vs1_2, a);
6454 if (i < 2) {
6455 __ ldr(vs1_3, __ Q, __ post(a, 16));
6456 }
6457 // load 80 or 88 values from b into vs2_1/2/3
6458 vs_ldpq_post(vs2_1, b);
6459 vs_ldpq_post(vs2_2, b);
6460 if (i < 2) {
6461 __ ldr(vs2_3, __ Q, __ post(b, 16));
6462 }
6463 // sum 80 or 88 values across vs1 and vs2 into vs1
6464 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6465 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6466 if (i < 2) {
6467 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6468 }
6469 // add constant to all 80 or 88 results
6470 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6471 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6472 if (i < 2) {
6473 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6474 }
6475 // store 80 or 88 values
6476 vs_stpq_post(vs1_1, result);
6477 vs_stpq_post(vs1_2, result);
6478 if (i < 2) {
6479 __ str(vs1_3, __ Q, __ post(result, 16));
6480 }
6481 }
6482
6483 __ leave(); // required for proper stackwalking of RuntimeStub frame
6484 __ mov(r0, zr); // return 0
6485 __ ret(lr);
6486
6487 // record the stub entry and end
6488 store_archive_data(stub_id, start, __ pc());
6489
6490 return start;
6491 }
6492
6493 // Kyber add 3 polynomials.
6494 // Implements
6495 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
6496 //
6497 // result (short[256]) = c_rarg0
6498 // a (short[256]) = c_rarg1
6499 // b (short[256]) = c_rarg2
6500 // c (short[256]) = c_rarg3
6501 address generate_kyberAddPoly_3() {
6502 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
6503 int entry_count = StubInfo::entry_count(stub_id);
6504 assert(entry_count == 1, "sanity check");
6505 address start = load_archive_data(stub_id);
6506 if (start != nullptr) {
6507 return start;
6508 }
6509 __ align(CodeEntryAlignment);
6510 StubCodeMark mark(this, stub_id);
6511 start = __ pc();
6512 __ enter();
6513
6514 const Register result = c_rarg0;
6515 const Register a = c_rarg1;
6516 const Register b = c_rarg2;
6517 const Register c = c_rarg3;
6518
6519 const Register kyberConsts = r11;
6520
6521 // As above we sum 256 sets of values in total i.e. 32 x 8H
6522 // quadwords. So, we can load, add and store the data in 3
6523 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6524 // of 10 or 11 registers. A further constraint is that the
6525 // mapping needs to skip callee saves. So, we allocate the
6526 // register sequences using two 8 sequences, two 2 sequences
6527 // and two single registers.
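    //
    // As in the two-polynomial case this is, per element (scalar sketch),
    //   result[i] = (short)(a[i] + b[i] + c[i] + kyber_q)
    // with kyber_q the constant loaded into v31 below.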
6528 VSeq<8> vs1_1(0);
6529 VSeq<2> vs1_2(16);
6530 FloatRegister vs1_3 = v28;
6531 VSeq<8> vs2_1(18);
6532 VSeq<2> vs2_2(26);
6533 FloatRegister vs2_3 = v29;
6534
6535 // two constant vector sequences
6536 VSeq<8> vc_1(31, 0);
6537 VSeq<2> vc_2(31, 0);
6538
6539 FloatRegister vc_3 = v31;
6540
6541 __ lea(kyberConsts,
6542 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6543
6544 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6545 for (int i = 0; i < 3; i++) {
6546 // load 80 or 88 values from a into vs1_1/2/3
6547 vs_ldpq_post(vs1_1, a);
6548 vs_ldpq_post(vs1_2, a);
6549 if (i < 2) {
6550 __ ldr(vs1_3, __ Q, __ post(a, 16));
6551 }
6552 // load 80 or 88 values from b into vs2_1/2/3
6553 vs_ldpq_post(vs2_1, b);
6554 vs_ldpq_post(vs2_2, b);
6555 if (i < 2) {
6556 __ ldr(vs2_3, __ Q, __ post(b, 16));
6557 }
6558 // sum 80 or 88 values across vs1 and vs2 into vs1
6559 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6560 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6561 if (i < 2) {
6562 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6563 }
6564 // load 80 or 88 values from c into vs2_1/2/3
6565 vs_ldpq_post(vs2_1, c);
6566 vs_ldpq_post(vs2_2, c);
6567 if (i < 2) {
6568 __ ldr(vs2_3, __ Q, __ post(c, 16));
6569 }
6570 // sum 80 or 88 values across vs1 and vs2 into vs1
6571 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6572 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6573 if (i < 2) {
6574 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6575 }
6576 // add constant to all 80 or 88 results
6577 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6578 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6579 if (i < 2) {
6580 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6581 }
6582 // store 80 or 88 values
6583 vs_stpq_post(vs1_1, result);
6584 vs_stpq_post(vs1_2, result);
6585 if (i < 2) {
6586 __ str(vs1_3, __ Q, __ post(result, 16));
6587 }
6588 }
6589
6590 __ leave(); // required for proper stackwalking of RuntimeStub frame
6591 __ mov(r0, zr); // return 0
6592 __ ret(lr);
6593
6594 // record the stub entry and end
6595 store_archive_data(stub_id, start, __ pc());
6596
6597 return start;
6598 }
6599
6600 // Kyber parse XOF output to polynomial coefficient candidates
6601 // or decodePoly(12, ...).
6602 // Implements
6603 // static int implKyber12To16(
6604 // byte[] condensed, int index, short[] parsed, int parsedLength) {}
6605 //
6606 // we assume that parsed and condensed are allocated such that for
6607 // n = (parsedLength + 63) / 64
6608 // n blocks of 96 bytes of input can be processed, i.e.
6609 // index + n * 96 <= condensed.length and
6610 // n * 64 <= parsed.length
6611 //
6612 // condensed (byte[]) = c_rarg0
6613 // condensedIndex = c_rarg1
6614 // parsed (short[]) = c_rarg2
6615 // parsedLength = c_rarg3
6616 address generate_kyber12To16() {
6617 StubId stub_id = StubId::stubgen_kyber12To16_id;
6618 int entry_count = StubInfo::entry_count(stub_id);
6619 assert(entry_count == 1, "sanity check");
6620 address start = load_archive_data(stub_id);
6621 if (start != nullptr) {
6622 return start;
6623 }
6624 Label L_F00, L_loop;
6625
6626 __ align(CodeEntryAlignment);
6627 StubCodeMark mark(this, stub_id);
6628 start = __ pc();
6629 __ enter();
6630
6631 const Register condensed = c_rarg0;
6632 const Register condensedOffs = c_rarg1;
6633 const Register parsed = c_rarg2;
6634 const Register parsedLength = c_rarg3;
6635
6636 const Register tmpAddr = r11;
6637
6638 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
6639 // quadwords so we need a 6 vector sequence for the inputs.
6640 // Parsing produces 64 shorts, employing two 8 vector
6641 // sequences to store and combine the intermediate data.
6642 VSeq<6> vin(24);
6643 VSeq<8> va(0), vb(16);
6644
6645 __ adr(tmpAddr, L_F00);
6646 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6647 __ add(condensed, condensed, condensedOffs);
6648
6649 __ BIND(L_loop);
6650 // load 96 (6 x 16B) byte values
6651 vs_ld3_post(vin, __ T16B, condensed);
6652
6653 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6654 // holds 48 (16x3) contiguous bytes from memory striped
6655 // horizontally across each of the 16 byte lanes. Equivalently,
6656 // that is 16 pairs of 12-bit integers. Likewise the back half
6657 // holds the next 48 bytes in the same arrangement.
6658
6659 // Each vector in the front half can also be viewed as a vertical
6660 // strip across the 16 pairs of 12 bit integers. Each byte in
6661 // vin[0] stores the low 8 bits of the first int in a pair. Each
6662 // byte in vin[1] stores the high 4 bits of the first int and the
6663 // low 4 bits of the second int. Each byte in vin[2] stores the
6664 // high 8 bits of the second int. Likewise the vectors in second
6665 // half.
6666
6667 // Converting the data to 16-bit shorts requires first of all
6668 // expanding each of the 6 x 16B vectors into 6 corresponding
6669 // pairs of 8H vectors. Mask, shift and add operations on the
6670 // resulting vector pairs can be used to combine 4 and 8 bit
6671 // parts of related 8H vector elements.
6672 //
6673 // The middle vectors (vin[2] and vin[5]) are actually expanded
6674 // twice, one copy manipulated to provide the lower 4 bits
6675 // belonging to the first short in a pair and another copy
6676 // manipulated to provide the higher 4 bits belonging to the
    // second short in a pair. This is why the vector sequences va
6678 // and vb used to hold the expanded 8H elements are of length 8.
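    //
    // As a scalar sketch, for each 3-byte group (b0, b1, b2) holding a packed
    // pair of 12-bit values the loop produces the two shorts
    //   s0 = b0 | ((b1 & 0xf) << 8);
    //   s1 = (b1 >> 4) | (b2 << 4);
    // the vector code below does the same for 32 such pairs per iteration.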
6679
6680 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6681 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6682 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6683 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6684 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6685 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6686 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6687 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6688
6689 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6690 // and vb[4:5]
6691 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6692 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6693 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6694 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6695 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6696 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6697
6698 // shift lo byte of copy 1 of the middle stripe into the high byte
6699 __ shl(va[2], __ T8H, va[2], 8);
6700 __ shl(va[3], __ T8H, va[3], 8);
6701 __ shl(vb[2], __ T8H, vb[2], 8);
6702 __ shl(vb[3], __ T8H, vb[3], 8);
6703
6704 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6705 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6706 // are in bit positions [4..11].
6707 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6708 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6709 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6710 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
6711
6712 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
6713 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
6714 // copy2
6715 __ andr(va[2], __ T16B, va[2], v31);
6716 __ andr(va[3], __ T16B, va[3], v31);
6717 __ ushr(va[4], __ T8H, va[4], 4);
6718 __ ushr(va[5], __ T8H, va[5], 4);
6719 __ andr(vb[2], __ T16B, vb[2], v31);
6720 __ andr(vb[3], __ T16B, vb[3], v31);
6721 __ ushr(vb[4], __ T8H, vb[4], 4);
6722 __ ushr(vb[5], __ T8H, vb[5], 4);
6723
6724 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
6725 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
6726 // n.b. the ordering ensures: i) inputs are consumed before they
6727 // are overwritten ii) the order of 16-bit results across successive
6728 // pairs of vectors in va and then vb reflects the order of the
6729 // corresponding 12-bit inputs
6730 __ addv(va[0], __ T8H, va[0], va[2]);
6731 __ addv(va[2], __ T8H, va[1], va[3]);
6732 __ addv(va[1], __ T8H, va[4], va[6]);
6733 __ addv(va[3], __ T8H, va[5], va[7]);
6734 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6735 __ addv(vb[2], __ T8H, vb[1], vb[3]);
6736 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6737 __ addv(vb[3], __ T8H, vb[5], vb[7]);
6738
6739 // store 64 results interleaved as shorts
6740 vs_st2_post(vs_front(va), __ T8H, parsed);
6741 vs_st2_post(vs_front(vb), __ T8H, parsed);
6742
6743 __ sub(parsedLength, parsedLength, 64);
6744 __ cmp(parsedLength, (u1)0);
6745 __ br(Assembler::GT, L_loop);
6746
6747 __ leave(); // required for proper stackwalking of RuntimeStub frame
6748 __ mov(r0, zr); // return 0
6749 __ ret(lr);
6750
6751 // bind label and generate constant data used by this stub
6752 __ BIND(L_F00);
6753 __ emit_int64(0x0f000f000f000f00);
6754 __ emit_int64(0x0f000f000f000f00);
6755
6756 // record the stub entry and end
6757 store_archive_data(stub_id, start, __ pc());
6758
6759 return start;
6760 }
6761
6762 // Kyber Barrett reduce function.
6763 // Implements
6764 // static int implKyberBarrettReduce(short[] coeffs) {}
6765 //
6766 // coeffs (short[256]) = c_rarg0
6767 address generate_kyberBarrettReduce() {
6768 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6769 int entry_count = StubInfo::entry_count(stub_id);
6770 assert(entry_count == 1, "sanity check");
6771 address start = load_archive_data(stub_id);
6772 if (start != nullptr) {
6773 return start;
6774 }
6775 __ align(CodeEntryAlignment);
6776 StubCodeMark mark(this, stub_id);
6777 start = __ pc();
6778 __ enter();
6779
6780 const Register coeffs = c_rarg0;
6781
6782 const Register kyberConsts = r10;
6783 const Register result = r11;
6784
6785 // As above we process 256 sets of values in total i.e. 32 x
6786 // 8H quadwords. So, we can load, add and store the data in 3
6787 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6788 // of 10 or 11 registers. A further constraint is that the
6789 // mapping needs to skip callee saves. So, we allocate the
6790 // register sequences using two 8 sequences, two 2 sequences
6791 // and two single registers.
6792 VSeq<8> vs1_1(0);
6793 VSeq<2> vs1_2(16);
6794 FloatRegister vs1_3 = v28;
6795 VSeq<8> vs2_1(18);
6796 VSeq<2> vs2_2(26);
6797 FloatRegister vs2_3 = v29;
6798
6799 // we also need a pair of corresponding constant sequences
6800
6801 VSeq<8> vc1_1(30, 0);
6802 VSeq<2> vc1_2(30, 0);
6803 FloatRegister vc1_3 = v30; // for kyber_q
6804
6805 VSeq<8> vc2_1(31, 0);
6806 VSeq<2> vc2_2(31, 0);
6807 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
6808
6809 __ add(result, coeffs, 0);
6810 __ lea(kyberConsts,
6811 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6812
6813 // load q and the multiplier for the Barrett reduction
6814 __ add(kyberConsts, kyberConsts, 16);
6815 __ ldpq(vc1_3, vc2_3, kyberConsts);
6816
6817 for (int i = 0; i < 3; i++) {
6818 // load 80 or 88 coefficients
6819 vs_ldpq_post(vs1_1, coeffs);
6820 vs_ldpq_post(vs1_2, coeffs);
6821 if (i < 2) {
6822 __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
6823 }
6824
6825 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
6826 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
6827 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
6828 if (i < 2) {
6829 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
6830 }
6831
6832 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
6833 vs_sshr(vs2_1, __ T8H, vs2_1, 11);
6834 vs_sshr(vs2_2, __ T8H, vs2_2, 11);
6835 if (i < 2) {
6836 __ sshr(vs2_3, __ T8H, vs2_3, 11);
6837 }
6838
6839 // vs1 <- vs1 - vs2 * kyber_q
6840 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
6841 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
6842 if (i < 2) {
6843 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
6844 }
6845
6846 vs_stpq_post(vs1_1, result);
6847 vs_stpq_post(vs1_2, result);
6848 if (i < 2) {
6849 __ str(vs1_3, __ Q, __ post(result, 16));
6850 }
6851 }
6852
6853 __ leave(); // required for proper stackwalking of RuntimeStub frame
6854 __ mov(r0, zr); // return 0
6855 __ ret(lr);
6856
6857 // record the stub entry and end
6858 store_archive_data(stub_id, start, __ pc());
6859
6860 return start;
6861 }
6862
6863
6864 // Dilithium-specific montmul helper routines that generate parallel
6865 // code for, respectively, a single 4x4s vector sequence montmul or
6866 // two such multiplies in a row.
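  //
  // In both helpers a Montgomery multiplication computes (as a sketch)
  //   montmul(a, b) = a * b * R^-1 mod q
  // where R is the Montgomery radix used by the Java ML_DSA code (2^32 for
  // these 32-bit coefficients); the instruction-level scheduling is handled
  // by vs_montmul4.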
6867
6868 // Perform 16 32-bit Montgomery multiplications in parallel
6869 void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
6870 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6871 // Use the helper routine to schedule a 4x4S Montgomery multiply.
6872 // It will assert that the register use is valid
6873 vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
6874 }
6875
6876 // Perform 2x16 32-bit Montgomery multiplications in parallel
6877 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
6878 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6879 // Schedule two successive 4x4S multiplies via the montmul helper
6880 // on the front and back halves of va, vb and vc. The helper will
6881 // assert that the register use has no overlap conflicts on each
6882 // individual call but we also need to ensure that the necessary
6883 // disjoint/equality constraints are met across both calls.
6884
6885 // vb, vc, vtmp and vq must be disjoint. va must either be
6886 // disjoint from all other registers or equal vc
6887
6888 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6889 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6890 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6891
6892 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6893 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6894
6895 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6896
6897 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6898 assert(vs_disjoint(va, vb), "va and vb overlap");
6899 assert(vs_disjoint(va, vq), "va and vq overlap");
6900 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6901
6902 // We multiply the front and back halves of each sequence 4 at a
6903 // time because
6904 //
6905 // 1) we are currently only able to get 4-way instruction
6906 // parallelism at best
6907 //
6908 // 2) we need registers for the constants in vq and temporary
6909 // scratch registers to hold intermediate results so vtmp can only
6910 // be a VSeq<4> which means we only have 4 scratch slots.
6911
6912 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6913 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6914 }
6915
6916 // Perform combined montmul then add/sub on 4x4S vectors.
6917 void dilithium_montmul16_sub_add(
6918 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
6919 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6920 // compute a = montmul(a1, c)
6921 dilithium_montmul16(vc, va1, vc, vtmp, vq);
    // output a1 = a0 - a
6923 vs_subv(va1, __ T4S, va0, vc);
6924 // and a0 = a0 + a
6925 vs_addv(va0, __ T4S, va0, vc);
6926 }
6927
  // Perform combined add/sub then montmul on 4x4S vectors.
6929 void dilithium_sub_add_montmul16(
6930 const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
6931 const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
6932 // compute c = a0 - a1
6933 vs_subv(vtmp1, __ T4S, va0, va1);
6934 // output a0 = a0 + a1
6935 vs_addv(va0, __ T4S, va0, va1);
6936 // output a1 = b montmul c
6937 dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
6938 }
6939
6940 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6941 // in the Java implementation come in sequences of at least 8, so we
6942 // can use ldpq to collect the corresponding data into pairs of vector
6943 // registers.
6944 // We collect the coefficients corresponding to the 'j+l' indexes into
6945 // the vector registers v0-v7, the zetas into the vector registers v16-v23
6946 // then we do the (Montgomery) multiplications by the zetas in parallel
6947 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6948 // v0-v7, then do the additions into v24-v31 and the subtractions into
6949 // v0-v7 and finally save the results back to the coeffs array.
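  //
  // As a scalar sketch, each butterfly performed at these levels is
  //   t = montmul(zetas[k], coeffs[j + l]);
  //   coeffs[j + l] = coeffs[j] - t;
  //   coeffs[j]     = coeffs[j] + t;
  // with l halving from level to level; the loop below schedules 32 such
  // butterflies per iteration.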
6950 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6951 const Register coeffs, const Register zetas) {
6952 int c1 = 0;
6953 int c2 = 512;
6954 int startIncr;
6955 // don't use callee save registers v8 - v15
6956 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6957 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6958 VSeq<2> vq(30); // n.b. constants overlap vs3
6959 int offsets[4] = { 0, 32, 64, 96 };
6960
6961 for (int level = 0; level < 5; level++) {
6962 int c1Start = c1;
6963 int c2Start = c2;
6964 if (level == 3) {
6965 offsets[1] = 32;
6966 offsets[2] = 128;
6967 offsets[3] = 160;
6968 } else if (level == 4) {
6969 offsets[1] = 64;
6970 offsets[2] = 128;
6971 offsets[3] = 192;
6972 }
6973
6974 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
6975 // time at 4 different offsets and multiply them in order by the
6976 // next set of input values. So we employ indexed load and store
6977 // pair instructions with arrangement 4S.
6978 for (int i = 0; i < 4; i++) {
6979 // reload q and qinv
6980 vs_ldpq(vq, dilithiumConsts); // qInv, q
6981 // load 8x4S coefficients via second start pos == c2
6982 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
6983 // load next 8x4S inputs == b
6984 vs_ldpq_post(vs2, zetas);
6985 // compute a == c2 * b mod MONT_Q
6986 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
6987 // load 8x4s coefficients via first start pos == c1
6988 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
6989 // compute a1 = c1 + a
6990 vs_addv(vs3, __ T4S, vs1, vs2);
6991 // compute a2 = c1 - a
6992 vs_subv(vs1, __ T4S, vs1, vs2);
6993 // output a1 and a2
6994 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
6995 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
6996
6997 int k = 4 * level + i;
6998
6999 if (k > 7) {
7000 startIncr = 256;
7001 } else if (k == 5) {
7002 startIncr = 384;
7003 } else {
7004 startIncr = 128;
7005 }
7006
7007 c1Start += startIncr;
7008 c2Start += startIncr;
7009 }
7010
7011 c2 /= 2;
7012 }
7013 }
7014
7015 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
7016 // Implements the method
  // static int implDilithiumAlmostNtt(int[] coeffs, int[] zetas) {}
  // of the sun.security.provider.ML_DSA class.
7019 //
7020 // coeffs (int[256]) = c_rarg0
7021 // zetas (int[256]) = c_rarg1
7022 address generate_dilithiumAlmostNtt() {
7023 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
7024 int entry_count = StubInfo::entry_count(stub_id);
7025 assert(entry_count == 1, "sanity check");
7026 address start = load_archive_data(stub_id);
7027 if (start != nullptr) {
7028 return start;
7029 }
7030 __ align(CodeEntryAlignment);
7031 StubCodeMark mark(this, stub_id);
7032 start = __ pc();
7033 __ enter();
7034
7035 const Register coeffs = c_rarg0;
7036 const Register zetas = c_rarg1;
7037
7038 const Register tmpAddr = r9;
7039 const Register dilithiumConsts = r10;
7040 const Register result = r11;
7041 // don't use callee save registers v8 - v15
7042 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7043 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7044 VSeq<2> vq(30); // n.b. constants overlap vs3
7045 int offsets[4] = { 0, 32, 64, 96};
7046 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
7047 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
7048 __ add(result, coeffs, 0);
7049 __ lea(dilithiumConsts,
7050 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7051
7052 // Each level represents one iteration of the outer for loop of the Java version.
7053
7054 // level 0-4
7055 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
7056
7057 // level 5
7058
7059 // At level 5 the coefficients we need to combine with the zetas
7060 // are grouped in memory in blocks of size 4. So, for both sets of
7061 // coefficients we load 4 adjacent values at 8 different offsets
7062 // using an indexed ldr with register variant Q and multiply them
7063 // in sequence order by the next set of inputs. Likewise we store
    // the results using an indexed str with register variant Q.
7065 for (int i = 0; i < 1024; i += 256) {
7066 // reload constants q, qinv each iteration as they get clobbered later
7067 vs_ldpq(vq, dilithiumConsts); // qInv, q
7068 // load 32 (8x4S) coefficients via first offsets = c1
7069 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
7070 // load next 32 (8x4S) inputs = b
7071 vs_ldpq_post(vs2, zetas);
      // a = b montmul c1
7073 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7074 // load 32 (8x4S) coefficients via second offsets = c2
7075 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
7076 // add/sub with result of multiply
      vs_addv(vs3, __ T4S, vs1, vs2); // a1 = c2 + a
      vs_subv(vs1, __ T4S, vs1, vs2); // a2 = c2 - a
7079 // write back new coefficients using same offsets
7080 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
7081 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
7082 }
7083
7084 // level 6
7085 // At level 6 the coefficients we need to combine with the zetas
7086 // are grouped in memory in pairs, the first two being montmul
7087 // inputs and the second add/sub inputs. We can still implement
7088 // the montmul+sub+add using 4-way parallelism but only if we
7089 // combine the coefficients with the zetas 16 at a time. We load 8
7090 // adjacent values at 4 different offsets using an ld2 load with
7091 // arrangement 2D. That interleaves the lower and upper halves of
7092 // each pair of quadwords into successive vector registers. We
    // then need to montmul the 4 odd elements of the coefficients
    // register sequence by the zetas in order and then add/sub them
    // against the 4 even elements of the coefficients register sequence.
    // We use an
7096 // equivalent st2 operation to store the results back into memory
7097 // de-interleaved.
7098 for (int i = 0; i < 1024; i += 128) {
7099 // reload constants q, qinv each iteration as they get clobbered later
7100 vs_ldpq(vq, dilithiumConsts); // qInv, q
7101 // load interleaved 16 (4x2D) coefficients via offsets
7102 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7103 // load next 16 (4x4S) inputs
7104 vs_ldpq_post(vs_front(vs2), zetas);
7105 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
7106 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
7107 vs_front(vs2), vtmp, vq);
7108 // store interleaved 16 (4x2D) coefficients via offsets
7109 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7110 }
7111
7112 // level 7
7113 // At level 7 the coefficients we need to combine with the zetas
    // occur singly with montmul inputs alternating with add/sub
7115 // inputs. Once again we can use 4-way parallelism to combine 16
7116 // zetas at a time. However, we have to load 8 adjacent values at
7117 // 4 different offsets using an ld2 load with arrangement 4S. That
    // interleaves the odd words of each pair into one
7119 // coefficients vector register and the even words of the pair
    // into the next register. We then need to montmul the 4 odd
    // elements of the coefficients register sequence by the zetas in
    // order and then add/sub them against the 4 even elements of the
    // coefficients register sequence. We use an equivalent st2 operation to store
7124 // the results back into memory de-interleaved.
7125
7126 for (int i = 0; i < 1024; i += 128) {
7127 // reload constants q, qinv each iteration as they get clobbered later
7128 vs_ldpq(vq, dilithiumConsts); // qInv, q
7129 // load interleaved 16 (4x4S) coefficients via offsets
7130 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7131 // load next 16 (4x4S) inputs
7132 vs_ldpq_post(vs_front(vs2), zetas);
7133 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
7134 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
7135 vs_front(vs2), vtmp, vq);
7136 // store interleaved 16 (4x4S) coefficients via offsets
7137 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7138 }
7139 __ leave(); // required for proper stackwalking of RuntimeStub frame
7140 __ mov(r0, zr); // return 0
7141 __ ret(lr);
7142
7143 // record the stub entry and end
7144 store_archive_data(stub_id, start, __ pc());
7145
7146 return start;
7147 }
7148
7149 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
7150 // in the Java implementation come in sequences of at least 8, so we
7151 // can use ldpq to collect the corresponding data into pairs of vector
7152 // registers
7153 // We collect the coefficients that correspond to the 'j's into vs1
  // the coefficients that correspond to the 'j+l's into vs2 then
7155 // do the additions into vs3 and the subtractions into vs1 then
7156 // save the result of the additions, load the zetas into vs2
7157 // do the (Montgomery) multiplications by zeta in parallel into vs2
7158 // finally save the results back to the coeffs array
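  //
  // As a scalar sketch, each inverse butterfly at these levels is
  //   t = coeffs[j];
  //   coeffs[j]     = t + coeffs[j + l];
  //   coeffs[j + l] = montmul(t - coeffs[j + l], zetas[k]);
  // with l doubling from level to level.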
7159 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
7160 const Register coeffs, const Register zetas) {
7161 int c1 = 0;
7162 int c2 = 32;
7163 int startIncr;
7164 int offsets[4];
7165 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7166 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7167 VSeq<2> vq(30); // n.b. constants overlap vs3
7168
7169 offsets[0] = 0;
7170
7171 for (int level = 3; level < 8; level++) {
7172 int c1Start = c1;
7173 int c2Start = c2;
7174 if (level == 3) {
7175 offsets[1] = 64;
7176 offsets[2] = 128;
7177 offsets[3] = 192;
7178 } else if (level == 4) {
7179 offsets[1] = 32;
7180 offsets[2] = 128;
7181 offsets[3] = 160;
7182 } else {
7183 offsets[1] = 32;
7184 offsets[2] = 64;
7185 offsets[3] = 96;
7186 }
7187
7188 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
7189 // time at 4 different offsets and multiply them in order by the
7190 // next set of input values. So we employ indexed load and store
7191 // pair instructions with arrangement 4S.
7192 for (int i = 0; i < 4; i++) {
7193 // load v1 32 (8x4S) coefficients relative to first start index
7194 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
7195 // load v2 32 (8x4S) coefficients relative to second start index
7196 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
        // a0 = v1 + v2 -- n.b. clobbers vq
7198 vs_addv(vs3, __ T4S, vs1, vs2);
7199 // a1 = v1 - v2
7200 vs_subv(vs1, __ T4S, vs1, vs2);
        // save a0 relative to first start index
7202 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
7203 // load constants q, qinv each iteration as they get clobbered above
7204 vs_ldpq(vq, dilithiumConsts); // qInv, q
7205 // load b next 32 (8x4S) inputs
7206 vs_ldpq_post(vs2, zetas);
7207 // a = a1 montmul b
7208 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7209 // save a relative to second start index
7210 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
7211
7212 int k = 4 * level + i;
7213
7214 if (k < 24) {
7215 startIncr = 256;
7216 } else if (k == 25) {
7217 startIncr = 384;
7218 } else {
7219 startIncr = 128;
7220 }
7221
7222 c1Start += startIncr;
7223 c2Start += startIncr;
7224 }
7225
7226 c2 *= 2;
7227 }
7228 }
7229
7230 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
7231 // Implements the method
7232 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
7233 // the sun.security.provider.ML_DSA class.
7234 //
7235 // coeffs (int[256]) = c_rarg0
7236 // zetas (int[256]) = c_rarg1
7237 address generate_dilithiumAlmostInverseNtt() {
7238 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
7239 int entry_count = StubInfo::entry_count(stub_id);
7240 assert(entry_count == 1, "sanity check");
7241 address start = load_archive_data(stub_id);
7242 if (start != nullptr) {
7243 return start;
7244 }
7245 __ align(CodeEntryAlignment);
7246 StubCodeMark mark(this, stub_id);
7247 start = __ pc();
7248 __ enter();
7249
7250 const Register coeffs = c_rarg0;
7251 const Register zetas = c_rarg1;
7252
7253 const Register tmpAddr = r9;
7254 const Register dilithiumConsts = r10;
7255 const Register result = r11;
7256 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7257 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7258 VSeq<2> vq(30); // n.b. constants overlap vs3
7259 int offsets[4] = { 0, 32, 64, 96 };
7260 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
7261 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
7262
7263 __ add(result, coeffs, 0);
7264 __ lea(dilithiumConsts,
7265 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7266
7267 // Each level represents one iteration of the outer for loop of the Java version
7268
7269 // level 0
7270 // At level 0 we need to interleave adjacent quartets of
7271 // coefficients before we multiply and add/sub by the next 16
7272 // zetas just as we did for level 7 in the multiply code. So we
7273 // load and store the values using an ld2/st2 with arrangement 4S.
7274 for (int i = 0; i < 1024; i += 128) {
7275 // load constants q, qinv
7276 // n.b. this can be moved out of the loop as they do not get
7277 // clobbered by first two loops
7278 vs_ldpq(vq, dilithiumConsts); // qInv, q
7279 // a0/a1 load interleaved 32 (8x4S) coefficients
7280 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
      // b load next 16 (4x4S) inputs
7282 vs_ldpq_post(vs_front(vs2), zetas);
7283 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
7284 // n.b. second half of vs2 provides temporary register storage
7285 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
7286 vs_front(vs2), vs_back(vs2), vtmp, vq);
7287 // a0/a1 store interleaved 32 (8x4S) coefficients
7288 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7289 }
7290
7291 // level 1
7292 // At level 1 we need to interleave pairs of adjacent pairs of
7293 // coefficients before we multiply by the next 16 zetas just as we
7294 // did for level 6 in the multiply code. So we load and store the
7295 // values an ld2/st2 with arrangement 2D.
7296 for (int i = 0; i < 1024; i += 128) {
7297 // a0/a1 load interleaved 32 (8x2D) coefficients
7298 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7299 // b load next 16 (4x4S) inputs
7300 vs_ldpq_post(vs_front(vs2), zetas);
7301 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
7302 // n.b. second half of vs2 provides temporary register storage
7303 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
7304 vs_front(vs2), vs_back(vs2), vtmp, vq);
7305 // a0/a1 store interleaved 32 (8x2D) coefficients
7306 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7307 }
7308
7309 // level 2
7310 // At level 2 coefficients come in blocks of 4. So, we load 4
7311 // adjacent coefficients at 8 distinct offsets for both the first
7312 // and second coefficient sequences, using an ldr with register
7313 // variant Q then combine them with next set of 32 zetas. Likewise
7314 // we store the results using an str with register variant Q.
7315 for (int i = 0; i < 1024; i += 256) {
7316 // c0 load 32 (8x4S) coefficients via first offsets
7317 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
7318 // c1 load 32 (8x4S) coefficients via second offsets
      vs_ldr_indexed(vs2, __ Q, coeffs, i, offsets2);
7320 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
7321 vs_addv(vs3, __ T4S, vs1, vs2);
7322 // c = c0 - c1
7323 vs_subv(vs1, __ T4S, vs1, vs2);
7324 // store a0 32 (8x4S) coefficients via first offsets
7325 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
7326 // b load 32 (8x4S) next inputs
7327 vs_ldpq_post(vs2, zetas);
7328 // reload constants q, qinv -- they were clobbered earlier
7329 vs_ldpq(vq, dilithiumConsts); // qInv, q
7330 // compute a1 = b montmul c
7331 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7332 // store a1 32 (8x4S) coefficients via second offsets
7333 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
7334 }
7335
7336 // level 3-7
7337 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
7338
7339 __ leave(); // required for proper stackwalking of RuntimeStub frame
7340 __ mov(r0, zr); // return 0
7341 __ ret(lr);
7342
7343 // record the stub entry and end
7344 store_archive_data(stub_id, start, __ pc());
7345
7346 return start;
7347 }
7348
7349 // Dilithium multiply polynomials in the NTT domain.
7350 // Straightforward implementation of the method
7351 // static int implDilithiumNttMult(
  //     int[] result, int[] ntta, int[] nttb) {} of
7353 // the sun.security.provider.ML_DSA class.
7354 //
7355 // result (int[256]) = c_rarg0
7356 // poly1 (int[256]) = c_rarg1
7357 // poly2 (int[256]) = c_rarg2
7358 address generate_dilithiumNttMult() {
7359 StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
7360 int entry_count = StubInfo::entry_count(stub_id);
7361 assert(entry_count == 1, "sanity check");
7362 address start = load_archive_data(stub_id);
7363 if (start != nullptr) {
7364 return start;
7365 }
7366 __ align(CodeEntryAlignment);
7367 StubCodeMark mark(this, stub_id);
7368 start = __ pc();
7369 __ enter();
7370
7371 Label L_loop;
7372
7373 const Register result = c_rarg0;
7374 const Register poly1 = c_rarg1;
7375 const Register poly2 = c_rarg2;
7376
7377 const Register dilithiumConsts = r10;
7378 const Register len = r11;
7379
7380 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7381 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7382 VSeq<2> vq(30); // n.b. constants overlap vs3
7383 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
7384
7385 __ lea(dilithiumConsts,
7386 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7387
7388 // load constants q, qinv
7389 vs_ldpq(vq, dilithiumConsts); // qInv, q
7390 // load constant rSquare into v29
7391 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
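
    // As a scalar sketch, each output coefficient below is
    //   result[i] = montmul(rSquare, montmul(poly1[i], poly2[i]))
    // where the multiply by rSquare (R^2 mod q) cancels the Montgomery
    // factor introduced by the first montmul.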
7392
7393 __ mov(len, zr);
7394 __ add(len, len, 1024);
7395
7396 __ BIND(L_loop);
7397
7398 // b load 32 (8x4S) next inputs from poly1
7399 vs_ldpq_post(vs1, poly1);
7400 // c load 32 (8x4S) next inputs from poly2
7401 vs_ldpq_post(vs2, poly2);
7402 // compute a = b montmul c
7403 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7404 // compute a = rsquare montmul a
7405 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
7406 // save a 32 (8x4S) results
7407 vs_stpq_post(vs2, result);
7408
7409 __ sub(len, len, 128);
7410 __ cmp(len, (u1)128);
7411 __ br(Assembler::GE, L_loop);
7412
7413 __ leave(); // required for proper stackwalking of RuntimeStub frame
7414 __ mov(r0, zr); // return 0
7415 __ ret(lr);
7416
7417 // record the stub entry and end
7418 store_archive_data(stub_id, start, __ pc());
7419
7420 return start;
7421 }
7422
  // Dilithium Montgomery multiply an array by a constant.
7424 // A straightforward implementation of the method
7425 // static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
  // of the sun.security.provider.ML_DSA class
7427 //
7428 // coeffs (int[256]) = c_rarg0
7429 // constant (int) = c_rarg1
7430 address generate_dilithiumMontMulByConstant() {
7431 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
7432 int entry_count = StubInfo::entry_count(stub_id);
7433 assert(entry_count == 1, "sanity check");
7434 address start = load_archive_data(stub_id);
7435 if (start != nullptr) {
7436 return start;
7437 }
7438 __ align(CodeEntryAlignment);
7439 StubCodeMark mark(this, stub_id);
7440 start = __ pc();
7441 __ enter();
7442
7443 Label L_loop;
7444
7445 const Register coeffs = c_rarg0;
7446 const Register constant = c_rarg1;
7447
7448 const Register dilithiumConsts = r10;
7449 const Register result = r11;
7450 const Register len = r12;
7451
7452 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7453 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7454 VSeq<2> vq(30); // n.b. constants overlap vs3
7455 VSeq<8> vconst(29, 0); // for montmul by constant
7456
7457 // results track inputs
7458 __ add(result, coeffs, 0);
7459 __ lea(dilithiumConsts,
7460 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7461
7462 // load constants q, qinv -- they do not get clobbered by first two loops
7463 vs_ldpq(vq, dilithiumConsts); // qInv, q
7464 // copy caller supplied constant across vconst
7465 __ dup(vconst[0], __ T4S, constant);
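    // As a scalar sketch the loop below computes, for every coefficient,
    //   coeffs[i] = montmul(constant, coeffs[i])
    // i.e. coeffs[i] * constant * R^-1 mod q.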
7466 __ mov(len, zr);
7467 __ add(len, len, 1024);
7468
7469 __ BIND(L_loop);
7470
7471 // load next 32 inputs
7472 vs_ldpq_post(vs2, coeffs);
7473 // mont mul by constant
7474 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
7475 // write next 32 results
7476 vs_stpq_post(vs2, result);
7477
7478 __ sub(len, len, 128);
7479 __ cmp(len, (u1)128);
7480 __ br(Assembler::GE, L_loop);
7481
7482 __ leave(); // required for proper stackwalking of RuntimeStub frame
7483 __ mov(r0, zr); // return 0
7484 __ ret(lr);
7485
7486 // record the stub entry and end
7487 store_archive_data(stub_id, start, __ pc());
7488
7489 return start;
7490 }
7491
7492 // Dilithium decompose poly.
7493 // Implements the method
  // static int implDilithiumDecomposePoly(int[] input, int[] lowPart, int[] highPart,
  //                                       int twoGamma2, int multiplier) {}
7495 // of the sun.security.provider.ML_DSA class
7496 //
7497 // input (int[256]) = c_rarg0
7498 // lowPart (int[256]) = c_rarg1
7499 // highPart (int[256]) = c_rarg2
7500 // twoGamma2 (int) = c_rarg3
7501 // multiplier (int) = c_rarg4
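  //
  // The stub decomposes each (reduced) coefficient r into a high and a low
  // part, roughly r = r1 * twoGamma2 + r0 with -gamma2 < r0 <= gamma2, and
  // handles the r - r0 == q - 1 corner case by zeroing r1 and decrementing
  // r0. This is a sketch of the intent; the per-step comments in the loop
  // below mirror the Java code.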
7502 address generate_dilithiumDecomposePoly() {
7503 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
7504 int entry_count = StubInfo::entry_count(stub_id);
7505 assert(entry_count == 1, "sanity check");
7506 address start = load_archive_data(stub_id);
7507 if (start != nullptr) {
7508 return start;
7509 }
7510 __ align(CodeEntryAlignment);
7511 StubCodeMark mark(this, stub_id);
7512 start = __ pc();
7513 Label L_loop;
7514
7515 const Register input = c_rarg0;
7516 const Register lowPart = c_rarg1;
7517 const Register highPart = c_rarg2;
7518 const Register twoGamma2 = c_rarg3;
7519 const Register multiplier = c_rarg4;
7520
7521 const Register len = r9;
7522 const Register dilithiumConsts = r10;
7523 const Register tmp = r11;
7524
7525 // 6 independent sets of 4x4s values
7526 VSeq<4> vs1(0), vs2(4), vs3(8);
7527 VSeq<4> vs4(12), vs5(16), vtmp(20);
7528
7529 // 7 constants for cross-multiplying
7530 VSeq<4> one(25, 0);
7531 VSeq<4> qminus1(26, 0);
7532 VSeq<4> g2(27, 0);
7533 VSeq<4> twog2(28, 0);
7534 VSeq<4> mult(29, 0);
7535 VSeq<4> q(30, 0);
7536 VSeq<4> qadd(31, 0);
7537
7538 __ enter();
7539
7540 __ lea(dilithiumConsts,
7541 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7542
7543 // save callee-saved registers
7544 __ stpd(v8, v9, __ pre(sp, -64));
7545 __ stpd(v10, v11, Address(sp, 16));
7546 __ stpd(v12, v13, Address(sp, 32));
7547 __ stpd(v14, v15, Address(sp, 48));
7548
7549 // populate constant registers
7550 __ mov(tmp, zr);
7551 __ add(tmp, tmp, 1);
7552 __ dup(one[0], __ T4S, tmp); // 1
7553 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
7554 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
7555 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
    __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma2 reduce
7557 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
7558 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
7559
7560 __ mov(len, zr);
7561 __ add(len, len, 1024);
7562
7563 __ BIND(L_loop);
7564
7565 // load next 4x4S inputs interleaved: rplus --> vs1
7566 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
7567
7568 // rplus = rplus - ((rplus + qadd) >> 23) * q
7569 vs_addv(vtmp, __ T4S, vs1, qadd);
7570 vs_sshr(vtmp, __ T4S, vtmp, 23);
7571 vs_mulv(vtmp, __ T4S, vtmp, q);
7572 vs_subv(vs1, __ T4S, vs1, vtmp);
7573
7574 // rplus = rplus + ((rplus >> 31) & dilithium_q);
7575 vs_sshr(vtmp, __ T4S, vs1, 31);
7576 vs_andr(vtmp, vtmp, q);
7577 vs_addv(vs1, __ T4S, vs1, vtmp);
7578
7579 // quotient --> vs2
7580 // int quotient = (rplus * multiplier) >> 22;
7581 vs_mulv(vtmp, __ T4S, vs1, mult);
7582 vs_sshr(vs2, __ T4S, vtmp, 22);
7583
7584 // r0 --> vs3
7585 // int r0 = rplus - quotient * twoGamma2;
7586 vs_mulv(vtmp, __ T4S, vs2, twog2);
7587 vs_subv(vs3, __ T4S, vs1, vtmp);
7588
7589 // mask --> vs4
7590 // int mask = (twoGamma2 - r0) >> 22;
7591 vs_subv(vtmp, __ T4S, twog2, vs3);
7592 vs_sshr(vs4, __ T4S, vtmp, 22);
7593
7594 // r0 -= (mask & twoGamma2);
7595 vs_andr(vtmp, vs4, twog2);
7596 vs_subv(vs3, __ T4S, vs3, vtmp);
7597
7598 // quotient += (mask & 1);
7599 vs_andr(vtmp, vs4, one);
7600 vs_addv(vs2, __ T4S, vs2, vtmp);
7601
7602 // mask = (twoGamma2 / 2 - r0) >> 31;
7603 vs_subv(vtmp, __ T4S, g2, vs3);
7604 vs_sshr(vs4, __ T4S, vtmp, 31);
7605
7606 // r0 -= (mask & twoGamma2);
7607 vs_andr(vtmp, vs4, twog2);
7608 vs_subv(vs3, __ T4S, vs3, vtmp);
7609
7610 // quotient += (mask & 1);
7611 vs_andr(vtmp, vs4, one);
7612 vs_addv(vs2, __ T4S, vs2, vtmp);
7613
7614 // r1 --> vs5
7615 // int r1 = rplus - r0 - (dilithium_q - 1);
7616 vs_subv(vtmp, __ T4S, vs1, vs3);
7617 vs_subv(vs5, __ T4S, vtmp, qminus1);
7618
7619 // r1 --> vs1 (overwriting rplus)
7620 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
7621 vs_negr(vtmp, __ T4S, vs5);
7622 vs_orr(vtmp, vs5, vtmp);
7623 vs_sshr(vs1, __ T4S, vtmp, 31);
7624
7625 // r0 += ~r1;
7626 vs_notr(vtmp, vs1);
7627 vs_addv(vs3, __ T4S, vs3, vtmp);
7628
7629 // r1 = r1 & quotient;
7630 vs_andr(vs1, vs2, vs1);
7631
7632     // store results interleaved
7633 // lowPart[m] = r0;
7634 // highPart[m] = r1;
7635 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
7636 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
7637
7638 __ sub(len, len, 64);
7639 __ cmp(len, (u1)64);
7640 __ br(Assembler::GE, L_loop);
7641
7642 // restore callee-saved vector registers
7643 __ ldpd(v14, v15, Address(sp, 48));
7644 __ ldpd(v12, v13, Address(sp, 32));
7645 __ ldpd(v10, v11, Address(sp, 16));
7646 __ ldpd(v8, v9, __ post(sp, 64));
7647
7648 __ leave(); // required for proper stackwalking of RuntimeStub frame
7649 __ mov(r0, zr); // return 0
7650 __ ret(lr);
7651
7652 // record the stub entry and end
7653 store_archive_data(stub_id, start, __ pc());
7654
7655 return start;
7656 }
7657
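  // Keccak chi step for one 5-lane row. Each lane is updated as
  //   a[i] ^= ~a[i+1] & a[i+2]        (indices mod 5)
  // using the pre-update values of all five lanes. A scalar sketch
  // (illustrative only):
  //
  //   uint64_t t0 = a0, t1 = a1, t2 = a2, t3 = a3, t4 = a4;
  //   a0 = t0 ^ (~t1 & t2);  a1 = t1 ^ (~t2 & t3);  a2 = t2 ^ (~t3 & t4);
  //   a3 = t3 ^ (~t4 & t0);  a4 = t4 ^ (~t0 & t1);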
7658 void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
7659 Register tmp0, Register tmp1, Register tmp2) {
7660 __ bic(tmp0, a2, a1); // for a0
7661 __ bic(tmp1, a3, a2); // for a1
7662 __ bic(tmp2, a4, a3); // for a2
7663 __ eor(a2, a2, tmp2);
7664 __ bic(tmp2, a0, a4); // for a3
7665 __ eor(a3, a3, tmp2);
7666 __ bic(tmp2, a1, a0); // for a4
7667 __ eor(a0, a0, tmp0);
7668 __ eor(a1, a1, tmp1);
7669 __ eor(a4, a4, tmp2);
7670 }
7671
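  // One Keccak-f[1600] round applied to the 5x5 lane state kept in a0..a24
  // (lane (x, y) lives in a[x + 5*y], so columns are {a_i, a_i+5, ...} and rows
  // are a0..a4, a5..a9, ...). The steps below are the usual theta (column
  // parities c0..c4 and the derived d values), rho/pi (the rotation sequence),
  // chi (via bcax5 on each row) and iota (xor of the next round constant
  // loaded from rc).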
7672 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
7673 Register a0, Register a1, Register a2, Register a3, Register a4,
7674 Register a5, Register a6, Register a7, Register a8, Register a9,
7675 Register a10, Register a11, Register a12, Register a13, Register a14,
7676 Register a15, Register a16, Register a17, Register a18, Register a19,
7677 Register a20, Register a21, Register a22, Register a23, Register a24,
7678 Register tmp0, Register tmp1, Register tmp2) {
7679 __ eor3(tmp1, a4, a9, a14);
7680 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
7681 __ eor3(tmp2, a1, a6, a11);
7682 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
7683 __ rax1(tmp2, tmp0, tmp1); // d0
7684 {
7685
7686 Register tmp3, tmp4;
7687 if (can_use_fp && can_use_r18) {
7688 tmp3 = rfp;
7689 tmp4 = r18_tls;
7690 } else {
7691 tmp3 = a4;
7692 tmp4 = a9;
7693 __ stp(tmp3, tmp4, __ pre(sp, -16));
7694 }
7695
7696 __ eor3(tmp3, a0, a5, a10);
7697 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
7698 __ eor(a0, a0, tmp2);
7699 __ eor(a5, a5, tmp2);
7700 __ eor(a10, a10, tmp2);
7701 __ eor(a15, a15, tmp2);
7702 __ eor(a20, a20, tmp2); // d0(tmp2)
7703 __ eor3(tmp3, a2, a7, a12);
7704 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
7705 __ rax1(tmp3, tmp4, tmp2); // d1
7706 __ eor(a1, a1, tmp3);
7707 __ eor(a6, a6, tmp3);
7708 __ eor(a11, a11, tmp3);
7709 __ eor(a16, a16, tmp3);
7710 __ eor(a21, a21, tmp3); // d1(tmp3)
7711 __ rax1(tmp3, tmp2, tmp0); // d3
7712 __ eor3(tmp2, a3, a8, a13);
7713 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
7714 __ eor(a3, a3, tmp3);
7715 __ eor(a8, a8, tmp3);
7716 __ eor(a13, a13, tmp3);
7717 __ eor(a18, a18, tmp3);
7718 __ eor(a23, a23, tmp3);
7719 __ rax1(tmp2, tmp1, tmp0); // d2
7720 __ eor(a2, a2, tmp2);
7721 __ eor(a7, a7, tmp2);
7722 __ eor(a12, a12, tmp2);
7723 __ rax1(tmp0, tmp0, tmp4); // d4
7724 if (!can_use_fp || !can_use_r18) {
7725 __ ldp(tmp3, tmp4, __ post(sp, 16));
7726 }
7727 __ eor(a17, a17, tmp2);
7728 __ eor(a22, a22, tmp2);
7729 __ eor(a4, a4, tmp0);
7730 __ eor(a9, a9, tmp0);
7731 __ eor(a14, a14, tmp0);
7732 __ eor(a19, a19, tmp0);
7733 __ eor(a24, a24, tmp0);
7734 }
7735
7736 __ rol(tmp0, a10, 3);
7737 __ rol(a10, a1, 1);
7738 __ rol(a1, a6, 44);
7739 __ rol(a6, a9, 20);
7740 __ rol(a9, a22, 61);
7741 __ rol(a22, a14, 39);
7742 __ rol(a14, a20, 18);
7743 __ rol(a20, a2, 62);
7744 __ rol(a2, a12, 43);
7745 __ rol(a12, a13, 25);
7746 __ rol(a13, a19, 8) ;
7747 __ rol(a19, a23, 56);
7748 __ rol(a23, a15, 41);
7749 __ rol(a15, a4, 27);
7750 __ rol(a4, a24, 14);
7751 __ rol(a24, a21, 2);
7752 __ rol(a21, a8, 55);
7753 __ rol(a8, a16, 45);
7754 __ rol(a16, a5, 36);
7755 __ rol(a5, a3, 28);
7756 __ rol(a3, a18, 21);
7757 __ rol(a18, a17, 15);
7758 __ rol(a17, a11, 10);
7759 __ rol(a11, a7, 6);
7760 __ mov(a7, tmp0);
7761
7762 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
7763 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
7764 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
7765 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
7766 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
7767
7768 __ ldr(tmp1, __ post(rc, 8));
7769 __ eor(a0, a0, tmp1);
7770
7771 }
7772
7773 // Arguments:
7774 //
7775 // Inputs:
7776 // c_rarg0 - byte[] source+offset
7777 // c_rarg1 - byte[] SHA.state
7778 // c_rarg2 - int block_size
7779 // c_rarg3 - int offset
7780 // c_rarg4 - int limit
7781 //
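  // The block_size argument is the sponge rate in bytes and identifies the
  // digest variant; the bit tests below rely on this mapping:
  //   SHA3-512: 72, SHA3-384: 104, SHA3-256/SHAKE256: 136, SHA3-224: 144, SHAKE128: 168
  //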
7782 address generate_sha3_implCompress_gpr(StubId stub_id) {
7783 bool multi_block;
7784 switch (stub_id) {
7785 case StubId::stubgen_sha3_implCompress_id:
7786 multi_block = false;
7787 break;
7788 case StubId::stubgen_sha3_implCompressMB_id:
7789 multi_block = true;
7790 break;
7791 default:
7792 ShouldNotReachHere();
7793 }
7794 int entry_count = StubInfo::entry_count(stub_id);
7795 assert(entry_count == 1, "sanity check");
7796 address start = load_archive_data(stub_id);
7797 if (start != nullptr) {
7798 return start;
7799 }
7800 __ align(CodeEntryAlignment);
7801 StubCodeMark mark(this, stub_id);
7802 start = __ pc();
7803
7804 Register buf = c_rarg0;
7805 Register state = c_rarg1;
7806 Register block_size = c_rarg2;
7807 Register ofs = c_rarg3;
7808 Register limit = c_rarg4;
7809
7810     // use r3..r17, r19..r28 to keep a0..a24.
7811 // a0..a24 are respective locals from SHA3.java
7812 Register a0 = r25,
7813 a1 = r26,
7814 a2 = r27,
7815 a3 = r3,
7816 a4 = r4,
7817 a5 = r5,
7818 a6 = r6,
7819 a7 = r7,
7820 a8 = rscratch1, // r8
7821 a9 = rscratch2, // r9
7822 a10 = r10,
7823 a11 = r11,
7824 a12 = r12,
7825 a13 = r13,
7826 a14 = r14,
7827 a15 = r15,
7828 a16 = r16,
7829 a17 = r17,
7830 a18 = r28,
7831 a19 = r19,
7832 a20 = r20,
7833 a21 = r21,
7834 a22 = r22,
7835 a23 = r23,
7836 a24 = r24;
7837
7838 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
7839
7840 Label sha3_loop, rounds24_preloop, loop_body;
7841 Label sha3_512_or_sha3_384, shake128;
7842
7843 bool can_use_r18 = false;
7844 #ifndef R18_RESERVED
7845 can_use_r18 = true;
7846 #endif
7847 bool can_use_fp = !PreserveFramePointer;
7848
7849 __ enter();
7850
7851 // save almost all yet unsaved gpr registers on stack
7852 __ str(block_size, __ pre(sp, -128));
7853 if (multi_block) {
7854 __ stpw(ofs, limit, Address(sp, 8));
7855 }
7856 // 8 bytes at sp+16 will be used to keep buf
7857 __ stp(r19, r20, Address(sp, 32));
7858 __ stp(r21, r22, Address(sp, 48));
7859 __ stp(r23, r24, Address(sp, 64));
7860 __ stp(r25, r26, Address(sp, 80));
7861 __ stp(r27, r28, Address(sp, 96));
7862 if (can_use_r18 && can_use_fp) {
7863 __ stp(r18_tls, state, Address(sp, 112));
7864 } else {
7865 __ str(state, Address(sp, 112));
7866 }
7867
7868     // begin sha3 calculations: loading a0..a24 from state array
7869 __ ldp(a0, a1, state);
7870 __ ldp(a2, a3, Address(state, 16));
7871 __ ldp(a4, a5, Address(state, 32));
7872 __ ldp(a6, a7, Address(state, 48));
7873 __ ldp(a8, a9, Address(state, 64));
7874 __ ldp(a10, a11, Address(state, 80));
7875 __ ldp(a12, a13, Address(state, 96));
7876 __ ldp(a14, a15, Address(state, 112));
7877 __ ldp(a16, a17, Address(state, 128));
7878 __ ldp(a18, a19, Address(state, 144));
7879 __ ldp(a20, a21, Address(state, 160));
7880 __ ldp(a22, a23, Address(state, 176));
7881 __ ldr(a24, Address(state, 192));
7882
7883 __ BIND(sha3_loop);
7884
7885 // load input
7886 __ ldp(tmp3, tmp2, __ post(buf, 16));
7887 __ eor(a0, a0, tmp3);
7888 __ eor(a1, a1, tmp2);
7889 __ ldp(tmp3, tmp2, __ post(buf, 16));
7890 __ eor(a2, a2, tmp3);
7891 __ eor(a3, a3, tmp2);
7892 __ ldp(tmp3, tmp2, __ post(buf, 16));
7893 __ eor(a4, a4, tmp3);
7894 __ eor(a5, a5, tmp2);
7895 __ ldr(tmp3, __ post(buf, 8));
7896 __ eor(a6, a6, tmp3);
7897
7898 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
7899 __ tbz(block_size, 7, sha3_512_or_sha3_384);
7900
7901 __ ldp(tmp3, tmp2, __ post(buf, 16));
7902 __ eor(a7, a7, tmp3);
7903 __ eor(a8, a8, tmp2);
7904 __ ldp(tmp3, tmp2, __ post(buf, 16));
7905 __ eor(a9, a9, tmp3);
7906 __ eor(a10, a10, tmp2);
7907 __ ldp(tmp3, tmp2, __ post(buf, 16));
7908 __ eor(a11, a11, tmp3);
7909 __ eor(a12, a12, tmp2);
7910 __ ldp(tmp3, tmp2, __ post(buf, 16));
7911 __ eor(a13, a13, tmp3);
7912 __ eor(a14, a14, tmp2);
7913 __ ldp(tmp3, tmp2, __ post(buf, 16));
7914 __ eor(a15, a15, tmp3);
7915 __ eor(a16, a16, tmp2);
7916
7917 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
7918 __ andw(tmp2, block_size, 48);
7919 __ cbzw(tmp2, rounds24_preloop);
7920 __ tbnz(block_size, 5, shake128);
7921     // block_size == 144, bit5 == 0, SHA3-224
7922 __ ldr(tmp3, __ post(buf, 8));
7923 __ eor(a17, a17, tmp3);
7924 __ b(rounds24_preloop);
7925
7926 __ BIND(shake128);
7927 __ ldp(tmp3, tmp2, __ post(buf, 16));
7928 __ eor(a17, a17, tmp3);
7929 __ eor(a18, a18, tmp2);
7930 __ ldp(tmp3, tmp2, __ post(buf, 16));
7931 __ eor(a19, a19, tmp3);
7932 __ eor(a20, a20, tmp2);
7933 __ b(rounds24_preloop); // block_size == 168, SHAKE128
7934
7935 __ BIND(sha3_512_or_sha3_384);
7936 __ ldp(tmp3, tmp2, __ post(buf, 16));
7937 __ eor(a7, a7, tmp3);
7938 __ eor(a8, a8, tmp2);
7939 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
7940
7941 // SHA3-384
7942 __ ldp(tmp3, tmp2, __ post(buf, 16));
7943 __ eor(a9, a9, tmp3);
7944 __ eor(a10, a10, tmp2);
7945 __ ldp(tmp3, tmp2, __ post(buf, 16));
7946 __ eor(a11, a11, tmp3);
7947 __ eor(a12, a12, tmp2);
7948
7949 __ BIND(rounds24_preloop);
7950 __ fmovs(v0, 24.0); // float loop counter,
7951 __ fmovs(v1, 1.0); // exact representation
7952
7953 __ str(buf, Address(sp, 16));
7954 __ lea(tmp3, ExternalAddress((address) _sha3_round_consts));
7955
7956 __ BIND(loop_body);
7957 keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
7958 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
7959 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
7960 tmp0, tmp1, tmp2);
7961 __ fsubs(v0, v0, v1);
7962 __ fcmps(v0, 0.0);
7963 __ br(__ NE, loop_body);
7964
7965 if (multi_block) {
7966 __ ldrw(block_size, sp); // block_size
7967 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
7968 __ addw(tmp2, tmp2, block_size);
7969 __ cmpw(tmp2, tmp1);
7970 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
7971 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
7972 __ br(Assembler::LE, sha3_loop);
7973 __ movw(c_rarg0, tmp2); // return offset
7974 }
7975 if (can_use_fp && can_use_r18) {
7976 __ ldp(r18_tls, state, Address(sp, 112));
7977 } else {
7978 __ ldr(state, Address(sp, 112));
7979 }
7980 // save calculated sha3 state
7981 __ stp(a0, a1, Address(state));
7982 __ stp(a2, a3, Address(state, 16));
7983 __ stp(a4, a5, Address(state, 32));
7984 __ stp(a6, a7, Address(state, 48));
7985 __ stp(a8, a9, Address(state, 64));
7986 __ stp(a10, a11, Address(state, 80));
7987 __ stp(a12, a13, Address(state, 96));
7988 __ stp(a14, a15, Address(state, 112));
7989 __ stp(a16, a17, Address(state, 128));
7990 __ stp(a18, a19, Address(state, 144));
7991 __ stp(a20, a21, Address(state, 160));
7992 __ stp(a22, a23, Address(state, 176));
7993 __ str(a24, Address(state, 192));
7994
7995 // restore required registers from stack
7996 __ ldp(r19, r20, Address(sp, 32));
7997 __ ldp(r21, r22, Address(sp, 48));
7998 __ ldp(r23, r24, Address(sp, 64));
7999 __ ldp(r25, r26, Address(sp, 80));
8000 __ ldp(r27, r28, Address(sp, 96));
8001 if (can_use_fp && can_use_r18) {
8002 __ add(rfp, sp, 128); // leave() will copy rfp to sp below
8003 } // else no need to recalculate rfp, since it wasn't changed
8004
8005 __ leave();
8006
8007 __ ret(lr);
8008
8009 // record the stub entry and end
8010 store_archive_data(stub_id, start, __ pc());
8011
8012 return start;
8013 }
8014
8015 /**
8016 * Arguments:
8017 *
8018 * Inputs:
8019 * c_rarg0 - int crc
8020 * c_rarg1 - byte* buf
8021 * c_rarg2 - int length
8022 *
8023 * Output:
8024    *   r0   - int crc result
8025 */
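  // A bit-at-a-time reference for the update the stub performs (illustrative
  // sketch only; kernel_crc32 produces the same zlib CRC-32, which uses the
  // reflected polynomial 0xEDB88320, much faster, and the usual pre/post
  // inversion of the running crc is handled inside the kernel):
  //
  //   uint32_t crc32_update(uint32_t crc, const uint8_t* buf, int len) {
  //     crc = ~crc;
  //     for (int i = 0; i < len; i++) {
  //       crc ^= buf[i];
  //       for (int k = 0; k < 8; k++) {
  //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
  //       }
  //     }
  //     return ~crc;
  //   }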
8026 address generate_updateBytesCRC32() {
8027 assert(UseCRC32Intrinsics, "what are we doing here?");
8028 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
8029 int entry_count = StubInfo::entry_count(stub_id);
8030 assert(entry_count == 1, "sanity check");
8031 address start = load_archive_data(stub_id);
8032 if (start != nullptr) {
8033 return start;
8034 }
8035 __ align(CodeEntryAlignment);
8036 StubCodeMark mark(this, stub_id);
8037
8038 start = __ pc();
8039
8040 const Register crc = c_rarg0; // crc
8041 const Register buf = c_rarg1; // source java byte array address
8042 const Register len = c_rarg2; // length
8043 const Register table0 = c_rarg3; // crc_table address
8044 const Register table1 = c_rarg4;
8045 const Register table2 = c_rarg5;
8046 const Register table3 = c_rarg6;
8047 const Register tmp3 = c_rarg7;
8048
8049 BLOCK_COMMENT("Entry:");
8050 __ enter(); // required for proper stackwalking of RuntimeStub frame
8051
8052 __ kernel_crc32(crc, buf, len,
8053 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
8054
8055 __ leave(); // required for proper stackwalking of RuntimeStub frame
8056 __ ret(lr);
8057
8058 // record the stub entry and end
8059 store_archive_data(stub_id, start, __ pc());
8060
8061 return start;
8062 }
8063
8064 /**
8065 * Arguments:
8066 *
8067 * Inputs:
8068 * c_rarg0 - int crc
8069 * c_rarg1 - byte* buf
8070 * c_rarg2 - int length
8071 * c_rarg3 - int* table
8072 *
8073 * Output:
8074 * r0 - int crc result
8075 */
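  // Same shape as generate_updateBytesCRC32 above, but kernel_crc32c computes
  // CRC-32C, i.e. it uses the Castagnoli polynomial 0x1EDC6F41 (reflected form
  // 0x82F63B78) instead of the zlib polynomial.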
8076 address generate_updateBytesCRC32C() {
8077 assert(UseCRC32CIntrinsics, "what are we doing here?");
8078 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
8079 int entry_count = StubInfo::entry_count(stub_id);
8080 assert(entry_count == 1, "sanity check");
8081 address start = load_archive_data(stub_id);
8082 if (start != nullptr) {
8083 return start;
8084 }
8085 __ align(CodeEntryAlignment);
8086 StubCodeMark mark(this, stub_id);
8087
8088 start = __ pc();
8089
8090 const Register crc = c_rarg0; // crc
8091 const Register buf = c_rarg1; // source java byte array address
8092 const Register len = c_rarg2; // length
8093 const Register table0 = c_rarg3; // crc_table address
8094 const Register table1 = c_rarg4;
8095 const Register table2 = c_rarg5;
8096 const Register table3 = c_rarg6;
8097 const Register tmp3 = c_rarg7;
8098
8099 BLOCK_COMMENT("Entry:");
8100 __ enter(); // required for proper stackwalking of RuntimeStub frame
8101
8102 __ kernel_crc32c(crc, buf, len,
8103 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
8104
8105 __ leave(); // required for proper stackwalking of RuntimeStub frame
8106 __ ret(lr);
8107
8108 // record the stub entry and end
8109 store_archive_data(stub_id, start, __ pc());
8110
8111 return start;
8112 }
8113
8114   /**
8115 * Arguments:
8116 *
8117 * Inputs:
8118 * c_rarg0 - int adler
8119 * c_rarg1 - byte* buff
8120 * c_rarg2 - int len
8121 *
8122 * Output:
8123 * c_rarg0 - int adler result
8124 */
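  // A minimal scalar reference for the update this stub performs (illustrative
  // only; BASE = 65521 is the largest prime below 2^16):
  //
  //   uint32_t adler32_update(uint32_t adler, const uint8_t* buff, int len) {
  //     uint32_t s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
  //     for (int i = 0; i < len; i++) {
  //       s1 = (s1 + buff[i]) % 65521;
  //       s2 = (s2 + s1) % 65521;
  //     }
  //     return (s2 << 16) | s1;
  //   }
  //
  // The stub vectorizes this 16 bytes at a time and defers the expensive
  // modulo reduction for up to NMAX bytes.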
8125 address generate_updateBytesAdler32() {
8126 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
8127 int entry_count = StubInfo::entry_count(stub_id);
8128 assert(entry_count == 1, "sanity check");
8129 address start = load_archive_data(stub_id);
8130 if (start != nullptr) {
8131 return start;
8132 }
8133 __ align(CodeEntryAlignment);
8134 StubCodeMark mark(this, stub_id);
8135 start = __ pc();
8136
8137 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
8138
8139 // Aliases
8140 Register adler = c_rarg0;
8141 Register s1 = c_rarg0;
8142 Register s2 = c_rarg3;
8143 Register buff = c_rarg1;
8144 Register len = c_rarg2;
8145 Register nmax = r4;
8146 Register base = r5;
8147 Register count = r6;
8148 Register temp0 = rscratch1;
8149 Register temp1 = rscratch2;
8150 FloatRegister vbytes = v0;
8151 FloatRegister vs1acc = v1;
8152 FloatRegister vs2acc = v2;
8153 FloatRegister vtable = v3;
8154
8155 // Max number of bytes we can process before having to take the mod
8156 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
8157 uint64_t BASE = 0xfff1;
8158 uint64_t NMAX = 0x15B0;
8159
8160 __ mov(base, BASE);
8161 __ mov(nmax, NMAX);
8162
8163 // Load accumulation coefficients for the upper 16 bits
8164 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
8165 __ ld1(vtable, __ T16B, Address(temp0));
8166
8167 // s1 is initialized to the lower 16 bits of adler
8168 // s2 is initialized to the upper 16 bits of adler
8169 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
8170 __ uxth(s1, adler); // s1 = (adler & 0xffff)
8171
8172     // The pipelined loop needs at least 16 elements for one iteration.
8173     // It checks this itself, but it is more efficient to skip straight to the cleanup loop.
8174 __ cmp(len, (u1)16);
8175 __ br(Assembler::HS, L_nmax);
8176 __ cbz(len, L_combine);
8177
8178 __ bind(L_simple_by1_loop);
8179 __ ldrb(temp0, Address(__ post(buff, 1)));
8180 __ add(s1, s1, temp0);
8181 __ add(s2, s2, s1);
8182 __ subs(len, len, 1);
8183 __ br(Assembler::HI, L_simple_by1_loop);
8184
8185 // s1 = s1 % BASE
8186 __ subs(temp0, s1, base);
8187 __ csel(s1, temp0, s1, Assembler::HS);
8188
8189 // s2 = s2 % BASE
8190 __ lsr(temp0, s2, 16);
8191 __ lsl(temp1, temp0, 4);
8192 __ sub(temp1, temp1, temp0);
8193 __ add(s2, temp1, s2, ext::uxth);
8194
8195 __ subs(temp0, s2, base);
8196 __ csel(s2, temp0, s2, Assembler::HS);
8197
8198 __ b(L_combine);
8199
8200 __ bind(L_nmax);
8201 __ subs(len, len, nmax);
8202 __ sub(count, nmax, 16);
8203 __ br(Assembler::LO, L_by16);
8204
8205 __ bind(L_nmax_loop);
8206
8207 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
8208 vbytes, vs1acc, vs2acc, vtable);
8209
8210 __ subs(count, count, 16);
8211 __ br(Assembler::HS, L_nmax_loop);
8212
8213 // s1 = s1 % BASE
8214 __ lsr(temp0, s1, 16);
8215 __ lsl(temp1, temp0, 4);
8216 __ sub(temp1, temp1, temp0);
8217 __ add(temp1, temp1, s1, ext::uxth);
8218
8219 __ lsr(temp0, temp1, 16);
8220 __ lsl(s1, temp0, 4);
8221 __ sub(s1, s1, temp0);
8222 __ add(s1, s1, temp1, ext:: uxth);
8223
8224 __ subs(temp0, s1, base);
8225 __ csel(s1, temp0, s1, Assembler::HS);
8226
8227 // s2 = s2 % BASE
8228 __ lsr(temp0, s2, 16);
8229 __ lsl(temp1, temp0, 4);
8230 __ sub(temp1, temp1, temp0);
8231 __ add(temp1, temp1, s2, ext::uxth);
8232
8233 __ lsr(temp0, temp1, 16);
8234 __ lsl(s2, temp0, 4);
8235 __ sub(s2, s2, temp0);
8236 __ add(s2, s2, temp1, ext:: uxth);
8237
8238 __ subs(temp0, s2, base);
8239 __ csel(s2, temp0, s2, Assembler::HS);
8240
8241 __ subs(len, len, nmax);
8242 __ sub(count, nmax, 16);
8243 __ br(Assembler::HS, L_nmax_loop);
8244
8245 __ bind(L_by16);
8246 __ adds(len, len, count);
8247 __ br(Assembler::LO, L_by1);
8248
8249 __ bind(L_by16_loop);
8250
8251 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
8252 vbytes, vs1acc, vs2acc, vtable);
8253
8254 __ subs(len, len, 16);
8255 __ br(Assembler::HS, L_by16_loop);
8256
8257 __ bind(L_by1);
8258 __ adds(len, len, 15);
8259 __ br(Assembler::LO, L_do_mod);
8260
8261 __ bind(L_by1_loop);
8262 __ ldrb(temp0, Address(__ post(buff, 1)));
8263 __ add(s1, temp0, s1);
8264 __ add(s2, s2, s1);
8265 __ subs(len, len, 1);
8266 __ br(Assembler::HS, L_by1_loop);
8267
8268 __ bind(L_do_mod);
8269 // s1 = s1 % BASE
8270 __ lsr(temp0, s1, 16);
8271 __ lsl(temp1, temp0, 4);
8272 __ sub(temp1, temp1, temp0);
8273 __ add(temp1, temp1, s1, ext::uxth);
8274
8275 __ lsr(temp0, temp1, 16);
8276 __ lsl(s1, temp0, 4);
8277 __ sub(s1, s1, temp0);
8278 __ add(s1, s1, temp1, ext:: uxth);
8279
8280 __ subs(temp0, s1, base);
8281 __ csel(s1, temp0, s1, Assembler::HS);
8282
8283 // s2 = s2 % BASE
8284 __ lsr(temp0, s2, 16);
8285 __ lsl(temp1, temp0, 4);
8286 __ sub(temp1, temp1, temp0);
8287 __ add(temp1, temp1, s2, ext::uxth);
8288
8289 __ lsr(temp0, temp1, 16);
8290 __ lsl(s2, temp0, 4);
8291 __ sub(s2, s2, temp0);
8292 __ add(s2, s2, temp1, ext:: uxth);
8293
8294 __ subs(temp0, s2, base);
8295 __ csel(s2, temp0, s2, Assembler::HS);
8296
8297 // Combine lower bits and higher bits
8298 __ bind(L_combine);
8299 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
8300
8301 __ ret(lr);
8302
8303 // record the stub entry and end
8304 store_archive_data(stub_id, start, __ pc());
8305
8306 return start;
8307 }
8308
8309 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
8310 Register temp0, Register temp1, FloatRegister vbytes,
8311 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
8312 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
8313 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
8314 // In non-vectorized code, we update s1 and s2 as:
8315 // s1 <- s1 + b1
8316 // s2 <- s2 + s1
8317 // s1 <- s1 + b2
8318     //     s2 <- s2 + s1
8319 // ...
8320 // s1 <- s1 + b16
8321 // s2 <- s2 + s1
8322 // Putting above assignments together, we have:
8323 // s1_new = s1 + b1 + b2 + ... + b16
8324 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
8325 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
8326 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
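    //
    // Equivalently, each 16-byte block can be folded in as follows (scalar
    // sketch of what the vector code below computes):
    //   s2 += 16 * s1;
    //   for (i = 1; i <= 16; i++) { s1 += b_i; s2 += (17 - i) * b_i; }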
8327 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
8328
8329 // s2 = s2 + s1 * 16
8330 __ add(s2, s2, s1, Assembler::LSL, 4);
8331
8332 // vs1acc = b1 + b2 + b3 + ... + b16
8333 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
8334 __ umullv(vs2acc, __ T8B, vtable, vbytes);
8335 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
8336 __ uaddlv(vs1acc, __ T16B, vbytes);
8337 __ uaddlv(vs2acc, __ T8H, vs2acc);
8338
8339 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
8340 __ fmovd(temp0, vs1acc);
8341 __ fmovd(temp1, vs2acc);
8342 __ add(s1, s1, temp0);
8343 __ add(s2, s2, temp1);
8344 }
8345
8346 /**
8347 * Arguments:
8348 *
8349 * Input:
8350 * c_rarg0 - x address
8351 * c_rarg1 - x length
8352 * c_rarg2 - y address
8353 * c_rarg3 - y length
8354 * c_rarg4 - z address
8355 */
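  // A scalar sketch of the schoolbook multiplication the stub performs
  // (illustrative only; x, y and z are big-endian int magnitudes as in
  // BigInteger.multiplyToLen, and z provides xlen + ylen words):
  //
  //   for (int i = 0; i < xlen + ylen; i++) z[i] = 0;
  //   for (int j = ylen - 1; j >= 0; j--) {
  //     uint64_t carry = 0;
  //     for (int i = xlen - 1; i >= 0; i--) {
  //       uint64_t p = (uint64_t)x[i] * y[j] + z[i + j + 1] + carry;
  //       z[i + j + 1] = (uint32_t)p;
  //       carry = p >> 32;
  //     }
  //     z[j] = (uint32_t)carry;
  //   }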
8356 address generate_multiplyToLen() {
8357 StubId stub_id = StubId::stubgen_multiplyToLen_id;
8358 int entry_count = StubInfo::entry_count(stub_id);
8359 assert(entry_count == 1, "sanity check");
8360 address start = load_archive_data(stub_id);
8361 if (start != nullptr) {
8362 return start;
8363 }
8364 __ align(CodeEntryAlignment);
8365 StubCodeMark mark(this, stub_id);
8366
8367 start = __ pc();
8368 const Register x = r0;
8369 const Register xlen = r1;
8370 const Register y = r2;
8371 const Register ylen = r3;
8372 const Register z = r4;
8373
8374 const Register tmp0 = r5;
8375 const Register tmp1 = r10;
8376 const Register tmp2 = r11;
8377 const Register tmp3 = r12;
8378 const Register tmp4 = r13;
8379 const Register tmp5 = r14;
8380 const Register tmp6 = r15;
8381 const Register tmp7 = r16;
8382
8383 BLOCK_COMMENT("Entry:");
8384 __ enter(); // required for proper stackwalking of RuntimeStub frame
8385 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
8386 __ leave(); // required for proper stackwalking of RuntimeStub frame
8387 __ ret(lr);
8388
8389 // record the stub entry and end
8390 store_archive_data(stub_id, start, __ pc());
8391
8392 return start;
8393 }
8394
8395 address generate_squareToLen() {
8396     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
8397     // faster than multiply_to_len on some CPUs and slower on others, but
8398     // multiply_to_len shows slightly better results overall.
8399 StubId stub_id = StubId::stubgen_squareToLen_id;
8400 int entry_count = StubInfo::entry_count(stub_id);
8401 assert(entry_count == 1, "sanity check");
8402 address start = load_archive_data(stub_id);
8403 if (start != nullptr) {
8404 return start;
8405 }
8406 __ align(CodeEntryAlignment);
8407 StubCodeMark mark(this, stub_id);
8408 start = __ pc();
8409
8410 const Register x = r0;
8411 const Register xlen = r1;
8412 const Register z = r2;
8413 const Register y = r4; // == x
8414 const Register ylen = r5; // == xlen
8415
8416 const Register tmp0 = r3;
8417 const Register tmp1 = r10;
8418 const Register tmp2 = r11;
8419 const Register tmp3 = r12;
8420 const Register tmp4 = r13;
8421 const Register tmp5 = r14;
8422 const Register tmp6 = r15;
8423 const Register tmp7 = r16;
8424
8425 RegSet spilled_regs = RegSet::of(y, ylen);
8426 BLOCK_COMMENT("Entry:");
8427 __ enter();
8428 __ push(spilled_regs, sp);
8429 __ mov(y, x);
8430 __ mov(ylen, xlen);
8431 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
8432 __ pop(spilled_regs, sp);
8433 __ leave();
8434 __ ret(lr);
8435
8436 // record the stub entry and end
8437 store_archive_data(stub_id, start, __ pc());
8438
8439 return start;
8440 }
8441
8442 address generate_mulAdd() {
8443 StubId stub_id = StubId::stubgen_mulAdd_id;
8444 int entry_count = StubInfo::entry_count(stub_id);
8445 assert(entry_count == 1, "sanity check");
8446 address start = load_archive_data(stub_id);
8447 if (start != nullptr) {
8448 return start;
8449 }
8450 __ align(CodeEntryAlignment);
8451 StubCodeMark mark(this, stub_id);
8452
8453 start = __ pc();
8454
8455 const Register out = r0;
8456 const Register in = r1;
8457 const Register offset = r2;
8458 const Register len = r3;
8459 const Register k = r4;
8460
8461 BLOCK_COMMENT("Entry:");
8462 __ enter();
8463 __ mul_add(out, in, offset, len, k);
8464 __ leave();
8465 __ ret(lr);
8466
8467 // record the stub entry and end
8468 store_archive_data(stub_id, start, __ pc());
8469
8470 return start;
8471 }
8472
8473 // Arguments:
8474 //
8475 // Input:
8476 // c_rarg0 - newArr address
8477 // c_rarg1 - oldArr address
8478 // c_rarg2 - newIdx
8479 // c_rarg3 - shiftCount
8480 // c_rarg4 - numIter
8481 //
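  // A scalar sketch of the operation (mirrors BigInteger.shiftRightImplWorker;
  // the arrays are viewed as unsigned 32-bit ints and 0 < shiftCount < 32 is
  // assumed):
  //
  //   for (int i = 0; i < numIter; i++) {
  //     newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)
  //                        | (oldArr[i]     << (32 - shiftCount));
  //   }
  //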
8482 address generate_bigIntegerRightShift() {
8483 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
8484 int entry_count = StubInfo::entry_count(stub_id);
8485 assert(entry_count == 1, "sanity check");
8486 address start = load_archive_data(stub_id);
8487 if (start != nullptr) {
8488 return start;
8489 }
8490 __ align(CodeEntryAlignment);
8491 StubCodeMark mark(this, stub_id);
8492 start = __ pc();
8493
8494 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
8495
8496 Register newArr = c_rarg0;
8497 Register oldArr = c_rarg1;
8498 Register newIdx = c_rarg2;
8499 Register shiftCount = c_rarg3;
8500 Register numIter = c_rarg4;
8501 Register idx = numIter;
8502
8503 Register newArrCur = rscratch1;
8504 Register shiftRevCount = rscratch2;
8505 Register oldArrCur = r13;
8506 Register oldArrNext = r14;
8507
8508 FloatRegister oldElem0 = v0;
8509 FloatRegister oldElem1 = v1;
8510 FloatRegister newElem = v2;
8511 FloatRegister shiftVCount = v3;
8512 FloatRegister shiftVRevCount = v4;
8513
8514 __ cbz(idx, Exit);
8515
8516 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8517
8518 // left shift count
8519 __ movw(shiftRevCount, 32);
8520 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8521
8522     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar code
8523 __ cmp(numIter, (u1)4);
8524 __ br(Assembler::LT, ShiftThree);
8525
8526 __ dup(shiftVCount, __ T4S, shiftCount);
8527 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8528 __ negr(shiftVCount, __ T4S, shiftVCount);
8529
8530 __ BIND(ShiftSIMDLoop);
8531
8532 // Calculate the load addresses
8533 __ sub(idx, idx, 4);
8534 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
8535 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
8536 __ add(oldArrCur, oldArrNext, 4);
8537
8538 // Load 4 words and process
8539 __ ld1(oldElem0, __ T4S, Address(oldArrCur));
8540 __ ld1(oldElem1, __ T4S, Address(oldArrNext));
8541 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8542 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8543 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8544 __ st1(newElem, __ T4S, Address(newArrCur));
8545
8546 __ cmp(idx, (u1)4);
8547 __ br(Assembler::LT, ShiftTwoLoop);
8548 __ b(ShiftSIMDLoop);
8549
8550 __ BIND(ShiftTwoLoop);
8551 __ cbz(idx, Exit);
8552 __ cmp(idx, (u1)1);
8553 __ br(Assembler::EQ, ShiftOne);
8554
8555 // Calculate the load addresses
8556 __ sub(idx, idx, 2);
8557 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
8558 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
8559 __ add(oldArrCur, oldArrNext, 4);
8560
8561 // Load 2 words and process
8562 __ ld1(oldElem0, __ T2S, Address(oldArrCur));
8563 __ ld1(oldElem1, __ T2S, Address(oldArrNext));
8564 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8565 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8566 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8567 __ st1(newElem, __ T2S, Address(newArrCur));
8568 __ b(ShiftTwoLoop);
8569
8570 __ BIND(ShiftThree);
8571 __ tbz(idx, 1, ShiftOne);
8572 __ tbz(idx, 0, ShiftTwo);
8573 __ ldrw(r10, Address(oldArr, 12));
8574 __ ldrw(r11, Address(oldArr, 8));
8575 __ lsrvw(r10, r10, shiftCount);
8576 __ lslvw(r11, r11, shiftRevCount);
8577 __ orrw(r12, r10, r11);
8578 __ strw(r12, Address(newArr, 8));
8579
8580 __ BIND(ShiftTwo);
8581 __ ldrw(r10, Address(oldArr, 8));
8582 __ ldrw(r11, Address(oldArr, 4));
8583 __ lsrvw(r10, r10, shiftCount);
8584 __ lslvw(r11, r11, shiftRevCount);
8585 __ orrw(r12, r10, r11);
8586 __ strw(r12, Address(newArr, 4));
8587
8588 __ BIND(ShiftOne);
8589 __ ldrw(r10, Address(oldArr, 4));
8590 __ ldrw(r11, Address(oldArr));
8591 __ lsrvw(r10, r10, shiftCount);
8592 __ lslvw(r11, r11, shiftRevCount);
8593 __ orrw(r12, r10, r11);
8594 __ strw(r12, Address(newArr));
8595
8596 __ BIND(Exit);
8597 __ ret(lr);
8598
8599 // record the stub entry and end
8600 store_archive_data(stub_id, start, __ pc());
8601
8602 return start;
8603 }
8604
8605 // Arguments:
8606 //
8607 // Input:
8608 // c_rarg0 - newArr address
8609 // c_rarg1 - oldArr address
8610 // c_rarg2 - newIdx
8611 // c_rarg3 - shiftCount
8612 // c_rarg4 - numIter
8613 //
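  // A scalar sketch of the operation, the mirror image of the right shift above
  // (mirrors BigInteger.shiftLeftImplWorker; the arrays are viewed as unsigned
  // 32-bit ints and 0 < shiftCount < 32 is assumed):
  //
  //   for (int i = 0; i < numIter; i++) {
  //     newArr[newIdx + i] = (oldArr[i]     << shiftCount)
  //                        | (oldArr[i + 1] >> (32 - shiftCount));
  //   }
  //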
8614 address generate_bigIntegerLeftShift() {
8615 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
8616 int entry_count = StubInfo::entry_count(stub_id);
8617 assert(entry_count == 1, "sanity check");
8618 address start = load_archive_data(stub_id);
8619 if (start != nullptr) {
8620 return start;
8621 }
8622 __ align(CodeEntryAlignment);
8623 StubCodeMark mark(this, stub_id);
8624 start = __ pc();
8625
8626 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
8627
8628 Register newArr = c_rarg0;
8629 Register oldArr = c_rarg1;
8630 Register newIdx = c_rarg2;
8631 Register shiftCount = c_rarg3;
8632 Register numIter = c_rarg4;
8633
8634 Register shiftRevCount = rscratch1;
8635 Register oldArrNext = rscratch2;
8636
8637 FloatRegister oldElem0 = v0;
8638 FloatRegister oldElem1 = v1;
8639 FloatRegister newElem = v2;
8640 FloatRegister shiftVCount = v3;
8641 FloatRegister shiftVRevCount = v4;
8642
8643 __ cbz(numIter, Exit);
8644
8645 __ add(oldArrNext, oldArr, 4);
8646 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8647
8648 // right shift count
8649 __ movw(shiftRevCount, 32);
8650 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8651
8652     // numIter too small to allow a 4-word SIMD loop; fall back to the scalar code
8653 __ cmp(numIter, (u1)4);
8654 __ br(Assembler::LT, ShiftThree);
8655
8656 __ dup(shiftVCount, __ T4S, shiftCount);
8657 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8658 __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
8659
8660 __ BIND(ShiftSIMDLoop);
8661
8662 // load 4 words and process
8663 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
8664 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
8665 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8666 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8667 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8668 __ st1(newElem, __ T4S, __ post(newArr, 16));
8669 __ sub(numIter, numIter, 4);
8670
8671 __ cmp(numIter, (u1)4);
8672 __ br(Assembler::LT, ShiftTwoLoop);
8673 __ b(ShiftSIMDLoop);
8674
8675 __ BIND(ShiftTwoLoop);
8676 __ cbz(numIter, Exit);
8677 __ cmp(numIter, (u1)1);
8678 __ br(Assembler::EQ, ShiftOne);
8679
8680 // load 2 words and process
8681 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
8682 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
8683 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8684 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8685 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8686 __ st1(newElem, __ T2S, __ post(newArr, 8));
8687 __ sub(numIter, numIter, 2);
8688 __ b(ShiftTwoLoop);
8689
8690 __ BIND(ShiftThree);
8691 __ ldrw(r10, __ post(oldArr, 4));
8692 __ ldrw(r11, __ post(oldArrNext, 4));
8693 __ lslvw(r10, r10, shiftCount);
8694 __ lsrvw(r11, r11, shiftRevCount);
8695 __ orrw(r12, r10, r11);
8696 __ strw(r12, __ post(newArr, 4));
8697 __ tbz(numIter, 1, Exit);
8698 __ tbz(numIter, 0, ShiftOne);
8699
8700 __ BIND(ShiftTwo);
8701 __ ldrw(r10, __ post(oldArr, 4));
8702 __ ldrw(r11, __ post(oldArrNext, 4));
8703 __ lslvw(r10, r10, shiftCount);
8704 __ lsrvw(r11, r11, shiftRevCount);
8705 __ orrw(r12, r10, r11);
8706 __ strw(r12, __ post(newArr, 4));
8707
8708 __ BIND(ShiftOne);
8709 __ ldrw(r10, Address(oldArr));
8710 __ ldrw(r11, Address(oldArrNext));
8711 __ lslvw(r10, r10, shiftCount);
8712 __ lsrvw(r11, r11, shiftRevCount);
8713 __ orrw(r12, r10, r11);
8714 __ strw(r12, Address(newArr));
8715
8716 __ BIND(Exit);
8717 __ ret(lr);
8718
8719 // record the stub entry and end
8720 store_archive_data(stub_id, start, __ pc());
8721
8722 return start;
8723 }
8724
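  // Arguments (assumed from the register usage below):
  //   r1 - byte[] array address (ary1)
  //   r2 - int length (len)
  //   r0 - result; on entry it already holds a copy of len
  //
  // Returns in r0 the number of leading bytes known to be non-negative
  // (i.e. < 0x80). When a block containing a negative byte is encountered the
  // returned count may be conservative: it is at most the index of the first
  // negative byte, and every byte before the returned count is non-negative.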
8725 address generate_count_positives(address &count_positives_long) {
8726 StubId stub_id = StubId::stubgen_count_positives_id;
8727 GrowableArray<address> entries;
8728 int entry_count = StubInfo::entry_count(stub_id);
8729 // We have an extra entry for count_positives_long.
8730 assert(entry_count == 2, "sanity check");
8731 address start = load_archive_data(stub_id, &entries);
8732 if (start != nullptr) {
8733 assert(entries.length() == 1,
8734 "unexpected extra entry count %d", entries.length());
8735 count_positives_long = entries.at(0);
8736 return start;
8737 }
8738 const u1 large_loop_size = 64;
8739 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
8740 int dcache_line = VM_Version::dcache_line_size();
8741
8742 Register ary1 = r1, len = r2, result = r0;
8743
8744 __ align(CodeEntryAlignment);
8745 StubCodeMark mark(this, stub_id);
8746
8747 address entry = __ pc();
8748
8749 __ enter();
8750 // precondition: a copy of len is already in result
8751 // __ mov(result, len);
8752
8753 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
8754 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
8755
8756 __ cmp(len, (u1)15);
8757 __ br(Assembler::GT, LEN_OVER_15);
8758     // Execution only falls into this code when the pointer is near the end of a
8759     // memory page and we must avoid reading past it into the next page.
8760 __ add(ary1, ary1, len);
8761 __ subs(len, len, 8);
8762 __ br(Assembler::GT, LEN_OVER_8);
8763 __ ldr(rscratch2, Address(ary1, -8));
8764 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
8765 __ lsrv(rscratch2, rscratch2, rscratch1);
8766 __ tst(rscratch2, UPPER_BIT_MASK);
8767 __ csel(result, zr, result, Assembler::NE);
8768 __ leave();
8769 __ ret(lr);
8770 __ bind(LEN_OVER_8);
8771 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
8772     __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
8773 __ tst(rscratch2, UPPER_BIT_MASK);
8774 __ br(Assembler::NE, RET_NO_POP);
8775 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
8776 __ lsrv(rscratch1, rscratch1, rscratch2);
8777 __ tst(rscratch1, UPPER_BIT_MASK);
8778 __ bind(RET_NO_POP);
8779 __ csel(result, zr, result, Assembler::NE);
8780 __ leave();
8781 __ ret(lr);
8782
8783 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
8784 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
8785
8786 count_positives_long = __ pc(); // 2nd entry point
8787 entries.append(count_positives_long);
8788
8789 __ enter();
8790
8791 __ bind(LEN_OVER_15);
8792 __ push(spilled_regs, sp);
8793 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
8794 __ cbz(rscratch2, ALIGNED);
8795 __ ldp(tmp6, tmp1, Address(ary1));
8796 __ mov(tmp5, 16);
8797 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
8798 __ add(ary1, ary1, rscratch1);
8799 __ orr(tmp6, tmp6, tmp1);
8800 __ tst(tmp6, UPPER_BIT_MASK);
8801 __ br(Assembler::NE, RET_ADJUST);
8802 __ sub(len, len, rscratch1);
8803
8804 __ bind(ALIGNED);
8805 __ cmp(len, large_loop_size);
8806 __ br(Assembler::LT, CHECK_16);
8807     // Perform a 16-byte load in the pre-loop so we can return early when an
8808     // initially aligned large array has negative values in its first bytes;
8809     // otherwise LARGE_LOOP would do 4 reads instead of 1 in the worst case,
8810     // which is slower. Cases with negative bytes further ahead are barely
8811     // affected; in fact they get faster thanks to the early loads and the
8812     // fewer instructions and branches in LARGE_LOOP.
8813 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
8814 __ sub(len, len, 16);
8815 __ orr(tmp6, tmp6, tmp1);
8816 __ tst(tmp6, UPPER_BIT_MASK);
8817 __ br(Assembler::NE, RET_ADJUST_16);
8818 __ cmp(len, large_loop_size);
8819 __ br(Assembler::LT, CHECK_16);
8820
8821 if (SoftwarePrefetchHintDistance >= 0
8822 && SoftwarePrefetchHintDistance >= dcache_line) {
8823 // initial prefetch
8824 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
8825 }
8826 __ bind(LARGE_LOOP);
8827 if (SoftwarePrefetchHintDistance >= 0) {
8828 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
8829 }
8830     // Issue the load instructions first, since that can save a few CPU/memory
8831     // cycles. Also, instead of four "orr(...); andr(...); cbnz(...)" triples
8832     // (one per ldp), generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which
8833     // saves 3 instructions and has fewer branches. The downside is that this
8834     // disables the early return, so all 64 bytes are loaded and checked every time.
8835 __ ldp(tmp2, tmp3, Address(ary1));
8836 __ ldp(tmp4, tmp5, Address(ary1, 16));
8837 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
8838 __ ldp(tmp6, tmp1, Address(ary1, 48));
8839 __ add(ary1, ary1, large_loop_size);
8840 __ sub(len, len, large_loop_size);
8841 __ orr(tmp2, tmp2, tmp3);
8842 __ orr(tmp4, tmp4, tmp5);
8843 __ orr(rscratch1, rscratch1, rscratch2);
8844 __ orr(tmp6, tmp6, tmp1);
8845 __ orr(tmp2, tmp2, tmp4);
8846 __ orr(rscratch1, rscratch1, tmp6);
8847 __ orr(tmp2, tmp2, rscratch1);
8848 __ tst(tmp2, UPPER_BIT_MASK);
8849 __ br(Assembler::NE, RET_ADJUST_LONG);
8850 __ cmp(len, large_loop_size);
8851 __ br(Assembler::GE, LARGE_LOOP);
8852
8853 __ bind(CHECK_16); // small 16-byte load pre-loop
8854 __ cmp(len, (u1)16);
8855 __ br(Assembler::LT, POST_LOOP16);
8856
8857 __ bind(LOOP16); // small 16-byte load loop
8858 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
8859 __ sub(len, len, 16);
8860 __ orr(tmp2, tmp2, tmp3);
8861 __ tst(tmp2, UPPER_BIT_MASK);
8862 __ br(Assembler::NE, RET_ADJUST_16);
8863 __ cmp(len, (u1)16);
8864 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
8865
8866 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
8867 __ cmp(len, (u1)8);
8868 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
8869 __ ldr(tmp3, Address(__ post(ary1, 8)));
8870 __ tst(tmp3, UPPER_BIT_MASK);
8871 __ br(Assembler::NE, RET_ADJUST);
8872 __ sub(len, len, 8);
8873
8874 __ bind(POST_LOOP16_LOAD_TAIL);
8875 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
8876 __ ldr(tmp1, Address(ary1));
8877 __ mov(tmp2, 64);
8878 __ sub(tmp4, tmp2, len, __ LSL, 3);
8879 __ lslv(tmp1, tmp1, tmp4);
8880 __ tst(tmp1, UPPER_BIT_MASK);
8881 __ br(Assembler::NE, RET_ADJUST);
8882 // Fallthrough
8883
8884 __ bind(RET_LEN);
8885 __ pop(spilled_regs, sp);
8886 __ leave();
8887 __ ret(lr);
8888
8889     // the difference result - len is the count of bytes guaranteed
8890     // to be positive
8891
8892 __ bind(RET_ADJUST_LONG);
8893 __ add(len, len, (u1)(large_loop_size - 16));
8894 __ bind(RET_ADJUST_16);
8895 __ add(len, len, 16);
8896 __ bind(RET_ADJUST);
8897 __ pop(spilled_regs, sp);
8898 __ leave();
8899 __ sub(result, result, len);
8900 __ ret(lr);
8901
8902 // record the stub entry and end plus the extra entry
8903 store_archive_data(stub_id, entry, __ pc(), &entries);
8904
8905 return entry;
8906 }
8907
8908 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
8909 bool usePrefetch, Label &NOT_EQUAL) {
8910 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8911 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8912 tmp7 = r12, tmp8 = r13;
8913 Label LOOP;
8914
8915 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8916 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8917 __ bind(LOOP);
8918 if (usePrefetch) {
8919 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8920 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8921 }
8922 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8923 __ eor(tmp1, tmp1, tmp2);
8924 __ eor(tmp3, tmp3, tmp4);
8925 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8926 __ orr(tmp1, tmp1, tmp3);
8927 __ cbnz(tmp1, NOT_EQUAL);
8928 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8929 __ eor(tmp5, tmp5, tmp6);
8930 __ eor(tmp7, tmp7, tmp8);
8931 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8932 __ orr(tmp5, tmp5, tmp7);
8933 __ cbnz(tmp5, NOT_EQUAL);
8934 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8935 __ eor(tmp1, tmp1, tmp2);
8936 __ eor(tmp3, tmp3, tmp4);
8937 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8938 __ orr(tmp1, tmp1, tmp3);
8939 __ cbnz(tmp1, NOT_EQUAL);
8940 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8941 __ eor(tmp5, tmp5, tmp6);
8942 __ sub(cnt1, cnt1, 8 * wordSize);
8943 __ eor(tmp7, tmp7, tmp8);
8944 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8945     // tmp6 is not used. MacroAssembler::subs is used here (rather than
8946     // cmp) because subs allows an unrestricted range of immediate operands.
8947 __ subs(tmp6, cnt1, loopThreshold);
8948 __ orr(tmp5, tmp5, tmp7);
8949 __ cbnz(tmp5, NOT_EQUAL);
8950 __ br(__ GE, LOOP);
8951 // post-loop
8952 __ eor(tmp1, tmp1, tmp2);
8953 __ eor(tmp3, tmp3, tmp4);
8954 __ orr(tmp1, tmp1, tmp3);
8955 __ sub(cnt1, cnt1, 2 * wordSize);
8956 __ cbnz(tmp1, NOT_EQUAL);
8957 }
8958
8959 void generate_large_array_equals_loop_simd(int loopThreshold,
8960 bool usePrefetch, Label &NOT_EQUAL) {
8961 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8962 tmp2 = rscratch2;
8963 Label LOOP;
8964
8965 __ bind(LOOP);
8966 if (usePrefetch) {
8967 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8968 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8969 }
8970 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
8971 __ sub(cnt1, cnt1, 8 * wordSize);
8972 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
8973 __ subs(tmp1, cnt1, loopThreshold);
8974 __ eor(v0, __ T16B, v0, v4);
8975 __ eor(v1, __ T16B, v1, v5);
8976 __ eor(v2, __ T16B, v2, v6);
8977 __ eor(v3, __ T16B, v3, v7);
8978 __ orr(v0, __ T16B, v0, v1);
8979 __ orr(v1, __ T16B, v2, v3);
8980 __ orr(v0, __ T16B, v0, v1);
8981 __ umov(tmp1, v0, __ D, 0);
8982 __ umov(tmp2, v0, __ D, 1);
8983 __ orr(tmp1, tmp1, tmp2);
8984 __ cbnz(tmp1, NOT_EQUAL);
8985 __ br(__ GE, LOOP);
8986 }
8987
8988 // a1 = r1 - array1 address
8989 // a2 = r2 - array2 address
8990 // result = r0 - return value. Already contains "false"
8991 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
8992 // r3-r5 are reserved temporary registers
8993 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
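  //
  // Conceptually the stub is a heavily unrolled byte-region comparison, i.e. a
  // sketch of the intended result (assumed semantics; the first word has
  // already been compared by the inlined caller code):
  //
  //   result = (memcmp(a1, a2, length_in_bytes) == 0);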
8994 address generate_large_array_equals() {
8995 StubId stub_id = StubId::stubgen_large_array_equals_id;
8996 int entry_count = StubInfo::entry_count(stub_id);
8997 assert(entry_count == 1, "sanity check");
8998 address start = load_archive_data(stub_id);
8999 if (start != nullptr) {
9000 return start;
9001 }
9002 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
9003 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
9004 tmp7 = r12, tmp8 = r13;
9005 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
9006 SMALL_LOOP, POST_LOOP;
9007 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
9008 // calculate if at least 32 prefetched bytes are used
9009 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
9010 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
9011 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
9012 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
9013 tmp5, tmp6, tmp7, tmp8);
9014
9015 __ align(CodeEntryAlignment);
9016
9017 StubCodeMark mark(this, stub_id);
9018
9019 address entry = __ pc();
9020 __ enter();
9021 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
9022 // also advance pointers to use post-increment instead of pre-increment
9023 __ add(a1, a1, wordSize);
9024 __ add(a2, a2, wordSize);
9025 if (AvoidUnalignedAccesses) {
9026       // Both implementations (SIMD and non-SIMD) use relatively large load
9027       // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
9028       // time) on some CPUs when the address is not at least 16-byte aligned.
9029       // Arrays are currently 8-byte aligned, so if needed we can do one extra
9030       // 8-byte load to make at least the first address 16-byte aligned.
9031 Label ALIGNED16;
9032 __ tbz(a1, 3, ALIGNED16);
9033 __ ldr(tmp1, Address(__ post(a1, wordSize)));
9034 __ ldr(tmp2, Address(__ post(a2, wordSize)));
9035 __ sub(cnt1, cnt1, wordSize);
9036 __ eor(tmp1, tmp1, tmp2);
9037 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
9038 __ bind(ALIGNED16);
9039 }
9040 if (UseSIMDForArrayEquals) {
9041 if (SoftwarePrefetchHintDistance >= 0) {
9042 __ subs(tmp1, cnt1, prefetchLoopThreshold);
9043 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
9044 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
9045 /* prfm = */ true, NOT_EQUAL);
9046 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
9047 __ br(__ LT, TAIL);
9048 }
9049 __ bind(NO_PREFETCH_LARGE_LOOP);
9050 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
9051 /* prfm = */ false, NOT_EQUAL);
9052 } else {
9053 __ push(spilled_regs, sp);
9054 if (SoftwarePrefetchHintDistance >= 0) {
9055 __ subs(tmp1, cnt1, prefetchLoopThreshold);
9056 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
9057 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
9058 /* prfm = */ true, NOT_EQUAL);
9059 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
9060 __ br(__ LT, TAIL);
9061 }
9062 __ bind(NO_PREFETCH_LARGE_LOOP);
9063 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
9064 /* prfm = */ false, NOT_EQUAL);
9065 }
9066 __ bind(TAIL);
9067 __ cbz(cnt1, EQUAL);
9068 __ subs(cnt1, cnt1, wordSize);
9069 __ br(__ LE, POST_LOOP);
9070 __ bind(SMALL_LOOP);
9071 __ ldr(tmp1, Address(__ post(a1, wordSize)));
9072 __ ldr(tmp2, Address(__ post(a2, wordSize)));
9073 __ subs(cnt1, cnt1, wordSize);
9074 __ eor(tmp1, tmp1, tmp2);
9075 __ cbnz(tmp1, NOT_EQUAL);
9076 __ br(__ GT, SMALL_LOOP);
9077 __ bind(POST_LOOP);
9078 __ ldr(tmp1, Address(a1, cnt1));
9079 __ ldr(tmp2, Address(a2, cnt1));
9080 __ eor(tmp1, tmp1, tmp2);
9081 __ cbnz(tmp1, NOT_EQUAL);
9082 __ bind(EQUAL);
9083 __ mov(result, true);
9084 __ bind(NOT_EQUAL);
9085 if (!UseSIMDForArrayEquals) {
9086 __ pop(spilled_regs, sp);
9087 }
9088 __ bind(NOT_EQUAL_NO_POP);
9089 __ leave();
9090 __ ret(lr);
9091
9092 // record the stub entry and end
9093 store_archive_data(stub_id, entry, __ pc());
9094
9095 return entry;
9096 }
9097
9098 // result = r0 - return value. Contains initial hashcode value on entry.
9099 // ary = r1 - array address
9100 // cnt = r2 - elements count
9101 // Clobbers: v0-v13, rscratch1, rscratch2
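  //
  // The stub computes the standard Java polynomial hash, seeded with the
  // incoming result value. A scalar sketch of what the vectorized code below
  // produces:
  //
  //   int h = result;
  //   for (int i = 0; i < cnt; i++) {
  //     h = 31 * h + ary[i];   // elements widened to int according to eltype
  //   }
  //   result = h;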
9102 address generate_large_arrays_hashcode(BasicType eltype) {
9103 StubId stub_id;
9104 switch (eltype) {
9105 case T_BOOLEAN:
9106 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
9107 break;
9108 case T_BYTE:
9109 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
9110 break;
9111 case T_CHAR:
9112 stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
9113 break;
9114 case T_SHORT:
9115 stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
9116 break;
9117 case T_INT:
9118 stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
9119 break;
9120 default:
9121 stub_id = StubId::NO_STUBID;
9122 ShouldNotReachHere();
9123 };
9124 int entry_count = StubInfo::entry_count(stub_id);
9125 assert(entry_count == 1, "sanity check");
9126 address start = load_archive_data(stub_id);
9127 if (start != nullptr) {
9128 return start;
9129 }
9130 const Register result = r0, ary = r1, cnt = r2;
9131 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
9132 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
9133 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
9134 const FloatRegister vpowm = v13;
9135
9136 ARRAYS_HASHCODE_REGISTERS;
9137
9138 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
9139
9140 unsigned int vf; // vectorization factor
9141 bool multiply_by_halves;
9142 Assembler::SIMD_Arrangement load_arrangement;
9143 switch (eltype) {
9144 case T_BOOLEAN:
9145 case T_BYTE:
9146 load_arrangement = Assembler::T8B;
9147 multiply_by_halves = true;
9148 vf = 8;
9149 break;
9150 case T_CHAR:
9151 case T_SHORT:
9152 load_arrangement = Assembler::T8H;
9153 multiply_by_halves = true;
9154 vf = 8;
9155 break;
9156 case T_INT:
9157 load_arrangement = Assembler::T4S;
9158 multiply_by_halves = false;
9159 vf = 4;
9160 break;
9161 default:
9162 ShouldNotReachHere();
9163 }
9164
9165 // Unroll factor
9166 const unsigned uf = 4;
9167
9168 // Effective vectorization factor
9169 const unsigned evf = vf * uf;
9170
9171 __ align(CodeEntryAlignment);
9172
9173 StubCodeMark mark(this, stub_id);
9174
9175 address entry = __ pc();
9176 __ enter();
9177
9178     // Put the 0th to 3rd powers of 31 together into a single SIMD register. The register is
9179     // used in the epilogues of both the SMALL and LARGE loops. The initialization is hoisted
9180     // here, and the register's value must not change throughout either loop.
9181 __ movw(rscratch1, intpow(31U, 3));
9182 __ mov(vpow, Assembler::S, 0, rscratch1);
9183 __ movw(rscratch1, intpow(31U, 2));
9184 __ mov(vpow, Assembler::S, 1, rscratch1);
9185 __ movw(rscratch1, intpow(31U, 1));
9186 __ mov(vpow, Assembler::S, 2, rscratch1);
9187 __ movw(rscratch1, intpow(31U, 0));
9188 __ mov(vpow, Assembler::S, 3, rscratch1);
9189
9190 __ mov(vmul0, Assembler::T16B, 0);
9191 __ mov(vmul0, Assembler::S, 3, result);
9192
9193 __ andr(rscratch2, cnt, (uf - 1) * vf);
9194 __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
9195
9196 __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
9197 __ mov(vpowm, Assembler::S, 0, rscratch1);
9198
9199 // SMALL LOOP
9200 __ bind(SMALL_LOOP);
9201
9202 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
9203 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
9204 __ subsw(rscratch2, rscratch2, vf);
9205
9206 if (load_arrangement == Assembler::T8B) {
9207 // Extend 8B to 8H to be able to use vector multiply
9208 // instructions
9209 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
9210 if (is_signed_subword_type(eltype)) {
9211 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9212 } else {
9213 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9214 }
9215 }
9216
9217 switch (load_arrangement) {
9218 case Assembler::T4S:
9219 __ addv(vmul0, load_arrangement, vmul0, vdata0);
9220 break;
9221 case Assembler::T8B:
9222 case Assembler::T8H:
9223 assert(is_subword_type(eltype), "subword type expected");
9224 if (is_signed_subword_type(eltype)) {
9225 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9226 } else {
9227 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9228 }
9229 break;
9230 default:
9231 __ should_not_reach_here();
9232 }
9233
9234 // Process the upper half of a vector
9235 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
9236 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
9237 if (is_signed_subword_type(eltype)) {
9238 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9239 } else {
9240 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9241 }
9242 }
9243
9244 __ br(Assembler::HI, SMALL_LOOP);
9245
9246     // SMALL LOOP'S EPILOGUE
9247 __ lsr(rscratch2, cnt, exact_log2(evf));
9248 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
9249
9250 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
9251 __ addv(vmul0, Assembler::T4S, vmul0);
9252 __ umov(result, vmul0, Assembler::S, 0);
9253
9254 // TAIL
9255 __ bind(TAIL);
9256
9257     // The andr computes cnt % vf. The shifted subtract below sets the branch target so that
9258     // the first vf - 1 - (cnt % vf) load + madd pairs are skipped, i.e. only cnt % vf pairs execute.
9259 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
9260 __ andr(rscratch2, cnt, vf - 1);
9261 __ bind(TAIL_SHORTCUT);
9262 __ adr(rscratch1, BR_BASE);
9263 // For Cortex-A53 offset is 4 because 2 nops are generated.
9264 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
9265 __ movw(rscratch2, 0x1f);
9266 __ br(rscratch1);
9267
9268 for (size_t i = 0; i < vf - 1; ++i) {
9269 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
9270 eltype);
9271 __ maddw(result, result, rscratch2, rscratch1);
9272 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
9273 // Generate 2nd nop to have 4 instructions per iteration.
9274 if (VM_Version::supports_a53mac()) {
9275 __ nop();
9276 }
9277 }
9278 __ bind(BR_BASE);
9279
9280 __ leave();
9281 __ ret(lr);
9282
9283 // LARGE LOOP
9284 __ bind(LARGE_LOOP_PREHEADER);
9285
9286 __ lsr(rscratch2, cnt, exact_log2(evf));
9287
9288 if (multiply_by_halves) {
9289 // 31^4 - multiplier between lower and upper parts of a register
9290 __ movw(rscratch1, intpow(31U, vf / 2));
9291 __ mov(vpowm, Assembler::S, 1, rscratch1);
9292       // 31^28 - remainder of the iteration multiplier, 28 = 32 - 4
9293 __ movw(rscratch1, intpow(31U, evf - vf / 2));
9294 __ mov(vpowm, Assembler::S, 0, rscratch1);
9295 } else {
9296 // 31^16
9297 __ movw(rscratch1, intpow(31U, evf));
9298 __ mov(vpowm, Assembler::S, 0, rscratch1);
9299 }
9300
9301 __ mov(vmul3, Assembler::T16B, 0);
9302 __ mov(vmul2, Assembler::T16B, 0);
9303 __ mov(vmul1, Assembler::T16B, 0);
9304
9305 __ bind(LARGE_LOOP);
9306
9307 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
9308 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
9309 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
9310 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
9311
9312 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
9313 Address(__ post(ary, evf * type2aelembytes(eltype))));
9314
9315 if (load_arrangement == Assembler::T8B) {
9316 // Extend 8B to 8H to be able to use vector multiply
9317 // instructions
9318 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
9319 if (is_signed_subword_type(eltype)) {
9320 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
9321 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
9322 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
9323 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9324 } else {
9325 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
9326 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
9327 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
9328 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9329 }
9330 }
9331
9332 switch (load_arrangement) {
9333 case Assembler::T4S:
9334 __ addv(vmul3, load_arrangement, vmul3, vdata3);
9335 __ addv(vmul2, load_arrangement, vmul2, vdata2);
9336 __ addv(vmul1, load_arrangement, vmul1, vdata1);
9337 __ addv(vmul0, load_arrangement, vmul0, vdata0);
9338 break;
9339 case Assembler::T8B:
9340 case Assembler::T8H:
9341 assert(is_subword_type(eltype), "subword type expected");
9342 if (is_signed_subword_type(eltype)) {
9343 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
9344 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
9345 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
9346 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9347 } else {
9348 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
9349 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
9350 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
9351 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9352 }
9353 break;
9354 default:
9355 __ should_not_reach_here();
9356 }
9357
9358 // Process the upper half of a vector
9359 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
9360 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
9361 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
9362 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
9363 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
9364 if (is_signed_subword_type(eltype)) {
9365 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
9366 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
9367 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
9368 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9369 } else {
9370 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
9371 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
9372 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
9373 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9374 }
9375 }
9376
9377 __ subsw(rscratch2, rscratch2, 1);
9378 __ br(Assembler::HI, LARGE_LOOP);
9379
9380 __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
9381 __ addv(vmul3, Assembler::T4S, vmul3);
9382 __ umov(result, vmul3, Assembler::S, 0);
9383
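// Combine the four per-vector sums with Horner's rule in radix 31^vf:
//   result = ((s3 * 31^vf + s2) * 31^vf + s1) * 31^vf + s0
// where s3..s0 are the scalar sums produced from vmul3..vmul0 by the mulv/addv/umov sequences below.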
9384 __ mov(rscratch2, intpow(31U, vf));
9385
9386 __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
9387 __ addv(vmul2, Assembler::T4S, vmul2);
9388 __ umov(rscratch1, vmul2, Assembler::S, 0);
9389 __ maddw(result, result, rscratch2, rscratch1);
9390
9391 __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
9392 __ addv(vmul1, Assembler::T4S, vmul1);
9393 __ umov(rscratch1, vmul1, Assembler::S, 0);
9394 __ maddw(result, result, rscratch2, rscratch1);
9395
9396 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
9397 __ addv(vmul0, Assembler::T4S, vmul0);
9398 __ umov(rscratch1, vmul0, Assembler::S, 0);
9399 __ maddw(result, result, rscratch2, rscratch1);
9400
9401 __ andr(rscratch2, cnt, vf - 1);
9402 __ cbnz(rscratch2, TAIL_SHORTCUT);
9403
9404 __ leave();
9405 __ ret(lr);
9406
9407 // record the stub entry and end
9408 store_archive_data(stub_id, entry, __ pc());
9409
9410 return entry;
9411 }
9412
9413 address generate_dsin_dcos(bool isCos) {
9414 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
9415 int entry_count = StubInfo::entry_count(stub_id);
9416 assert(entry_count == 1, "sanity check");
9417 address start = load_archive_data(stub_id);
9418 if (start != nullptr) {
9419 return start;
9420 }
9421 __ align(CodeEntryAlignment);
9422 StubCodeMark mark(this, stub_id);
9423 start = __ pc();
9424 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
9425 (address)StubRoutines::aarch64::_two_over_pi,
9426 (address)StubRoutines::aarch64::_pio2,
9427 (address)StubRoutines::aarch64::_dsin_coef,
9428 (address)StubRoutines::aarch64::_dcos_coef);
9429
9430 // record the stub entry and end
9431 store_archive_data(stub_id, start, __ pc());
9432
9433 return start;
9434 }
9435
9436 // Code for comparing 16 characters of strings with Latin1 and UTF-16 encodings
9437 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
9438 Label &DIFF2) {
9439 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
9440 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
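// Expected state (inferred from the callers below): tmp2 points at the next 16 Latin1 bytes,
// cnt1 at the next UTF-16 bytes, tmp3 holds 8 previously loaded UTF-16 bytes, and vtmpZ is zero.
// On a mismatch control transfers to DIFF1 or DIFF2 with the xor of the differing words in rscratch2.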
9441
9442 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
9443 __ ldr(tmpU, Address(__ post(cnt1, 8)));
9444 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
9445 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
9446
9447 __ fmovd(tmpL, vtmp3);
9448 __ eor(rscratch2, tmp3, tmpL);
9449 __ cbnz(rscratch2, DIFF2);
9450
9451 __ ldr(tmp3, Address(__ post(cnt1, 8)));
9452 __ umov(tmpL, vtmp3, __ D, 1);
9453 __ eor(rscratch2, tmpU, tmpL);
9454 __ cbnz(rscratch2, DIFF1);
9455
9456 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
9457 __ ldr(tmpU, Address(__ post(cnt1, 8)));
9458 __ fmovd(tmpL, vtmp);
9459 __ eor(rscratch2, tmp3, tmpL);
9460 __ cbnz(rscratch2, DIFF2);
9461
9462 __ ldr(tmp3, Address(__ post(cnt1, 8)));
9463 __ umov(tmpL, vtmp, __ D, 1);
9464 __ eor(rscratch2, tmpU, tmpL);
9465 __ cbnz(rscratch2, DIFF1);
9466 }
9467
9468 // r0 = result
9469 // r1 = str1
9470 // r2 = cnt1
9471 // r3 = str2
9472 // r4 = cnt2
9473 // r10 = tmp1
9474 // r11 = tmp2
9475 address generate_compare_long_string_different_encoding(bool isLU) {
9476 StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
9477 int entry_count = StubInfo::entry_count(stub_id);
9478 assert(entry_count == 1, "sanity check");
9479 address start = load_archive_data(stub_id);
9480 if (start != nullptr) {
9481 return start;
9482 }
9483 __ align(CodeEntryAlignment);
9484 StubCodeMark mark(this, stub_id);
9485 address entry = __ pc();
9486 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
9487 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
9488 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
9489 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9490 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
9491 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
9492 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
9493
9494 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
9495
9496 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
9497 // cnt2 == number of characters left to compare
9498 // Check the first 4 symbols, which are already loaded (vtmp and tmp2 (LU) / tmp1 (UL))
9499 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
9500 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
9501 __ add(str2, str2, isLU ? wordSize : wordSize/2);
9502 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
9503 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. The last 4 are a special case.
9504 __ eor(rscratch2, tmp1, tmp2);
9505 __ mov(rscratch1, tmp2);
9506 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
9507 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
9508 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
9509 __ push(spilled_regs, sp);
9510 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
9511 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
9512
9513 __ ldr(tmp3, Address(__ post(cnt1, 8)));
9514
9515 if (SoftwarePrefetchHintDistance >= 0) {
9516 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
9517 __ br(__ LT, NO_PREFETCH);
9518 __ bind(LARGE_LOOP_PREFETCH);
9519 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
9520 __ mov(tmp4, 2);
9521 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
9522 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
9523 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
9524 __ subs(tmp4, tmp4, 1);
9525 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
9526 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
9527 __ mov(tmp4, 2);
9528 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
9529 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
9530 __ subs(tmp4, tmp4, 1);
9531 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
9532 __ sub(cnt2, cnt2, 64);
9533 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
9534 __ br(__ GE, LARGE_LOOP_PREFETCH);
9535 }
9536 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
9537 __ bind(NO_PREFETCH);
9538 __ subs(cnt2, cnt2, 16);
9539 __ br(__ LT, TAIL);
9540 __ align(OptoLoopAlignment);
9541 __ bind(SMALL_LOOP); // smaller loop
9542 __ subs(cnt2, cnt2, 16);
9543 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
9544 __ br(__ GE, SMALL_LOOP);
9545 __ cmn(cnt2, (u1)16);
9546 __ br(__ EQ, LOAD_LAST);
9547 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
9548 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
9549 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
9550 __ ldr(tmp3, Address(cnt1, -8));
9551 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
9552 __ b(LOAD_LAST);
9553 __ bind(DIFF2);
9554 __ mov(tmpU, tmp3);
9555 __ bind(DIFF1);
9556 __ pop(spilled_regs, sp);
9557 __ b(CALCULATE_DIFFERENCE);
9558 __ bind(LOAD_LAST);
9559 // The last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
9560 // No need to load them again
9561 __ mov(tmpU, tmp3);
9562 __ pop(spilled_regs, sp);
9563
9564 // tmp2 now points to the last 4 Latin1 characters
9565 __ ldrs(vtmp, Address(tmp2));
9566 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
9567 __ fmovd(tmpL, vtmp);
9568
9569 __ eor(rscratch2, tmpU, tmpL);
9570 __ cbz(rscratch2, DONE);
9571
9572 // Find the first different characters in the longwords and
9573 // compute their difference.
9574 __ bind(CALCULATE_DIFFERENCE);
9575 __ rev(rscratch2, rscratch2);
9576 __ clz(rscratch2, rscratch2);
9577 __ andr(rscratch2, rscratch2, -16);
9578 __ lsrv(tmp1, tmp1, rscratch2);
9579 __ uxthw(tmp1, tmp1);
9580 __ lsrv(rscratch1, rscratch1, rscratch2);
9581 __ uxthw(rscratch1, rscratch1);
9582 __ subw(result, tmp1, rscratch1);
9583 __ bind(DONE);
9584 __ ret(lr);
9585
9586 // record the stub entry and end
9587 store_archive_data(stub_id, entry, __ pc());
9588
9589 return entry;
9590 }
9591
9592 // r0 = input (float16)
9593 // v0 = result (float)
9594 // v1 = temporary float register
9595 address generate_float16ToFloat() {
9596 StubId stub_id = StubId::stubgen_hf2f_id;
9597 int entry_count = StubInfo::entry_count(stub_id);
9598 assert(entry_count == 1, "sanity check");
9599 address start = load_archive_data(stub_id);
9600 if (start != nullptr) {
9601 return start;
9602 }
9603 __ align(CodeEntryAlignment);
9604 StubCodeMark mark(this, stub_id);
9605 address entry = __ pc();
9606 BLOCK_COMMENT("Entry:");
9607 __ flt16_to_flt(v0, r0, v1);
9608 __ ret(lr);
9609
9610 // record the stub entry and end
9611 store_archive_data(stub_id, entry, __ pc());
9612
9613 return entry;
9614 }
9615
9616 // v0 = input (float)
9617 // r0 = result (float16)
9618 // v1 = temporary float register
9619 address generate_floatToFloat16() {
9620 StubId stub_id = StubId::stubgen_f2hf_id;
9621 int entry_count = StubInfo::entry_count(stub_id);
9622 assert(entry_count == 1, "sanity check");
9623 address start = load_archive_data(stub_id);
9624 if (start != nullptr) {
9625 return start;
9626 }
9627 __ align(CodeEntryAlignment);
9628 StubCodeMark mark(this, stub_id);
9629 address entry = __ pc();
9630 BLOCK_COMMENT("Entry:");
9631 __ flt_to_flt16(r0, v0, v1);
9632 __ ret(lr);
9633
9634 // record the stub entry and end
9635 store_archive_data(stub_id, entry, __ pc());
9636
9637 return entry;
9638 }
9639
9640 address generate_method_entry_barrier() {
9641 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
9642 int entry_count = StubInfo::entry_count(stub_id);
9643 assert(entry_count == 1, "sanity check");
9644 address start = load_archive_data(stub_id);
9645 if (start != nullptr) {
9646 return start;
9647 }
9648 __ align(CodeEntryAlignment);
9649 StubCodeMark mark(this, stub_id);
9650
9651 Label deoptimize_label;
9652
9653 start = __ pc();
9654
9655 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
9656
9657 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
9658 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
9659 // We can get here despite the nmethod being good, if we have not
9660 // yet applied our cross modification fence (or data fence).
9661 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
9662 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
9663 __ ldrw(rscratch2, rscratch2);
9664 __ strw(rscratch2, thread_epoch_addr);
9665 __ isb();
9666 __ membar(__ LoadLoad);
9667 }
9668
9669 __ set_last_Java_frame(sp, rfp, lr, rscratch1);
9670
9671 __ enter();
9672 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
9673
9674 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
9675
9676 __ push_call_clobbered_registers();
9677
9678 __ mov(c_rarg0, rscratch2);
9679 __ call_VM_leaf
9680 (CAST_FROM_FN_PTR
9681 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
9682
9683 __ reset_last_Java_frame(true);
9684
9685 __ mov(rscratch1, r0);
9686
9687 __ pop_call_clobbered_registers();
9688
9689 __ cbnz(rscratch1, deoptimize_label);
9690
9691 __ leave();
9692 __ ret(lr);
9693
9694 __ BIND(deoptimize_label);
9695
9696 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
9697 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
9698
9699 __ mov(sp, rscratch1);
9700 __ br(rscratch2);
9701
9702 // record the stub entry and end
9703 store_archive_data(stub_id, start, __ pc());
9704
9705 return start;
9706 }
9707
9708 // r0 = result
9709 // r1 = str1
9710 // r2 = cnt1
9711 // r3 = str2
9712 // r4 = cnt2
9713 // r10 = tmp1
9714 // r11 = tmp2
9715 address generate_compare_long_string_same_encoding(bool isLL) {
9716 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
9717 int entry_count = StubInfo::entry_count(stub_id);
9718 assert(entry_count == 1, "sanity check");
9719 address start = load_archive_data(stub_id);
9720 if (start != nullptr) {
9721 return start;
9722 }
9723 __ align(CodeEntryAlignment);
9724 StubCodeMark mark(this, stub_id);
9725 address entry = __ pc();
9726 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9727 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
9728
9729 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
9730
9731 // exit from the large loop when fewer than 64 bytes are left to read or we are about
9732 // to prefetch memory beyond the array boundary
9733 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
9734
9735 // 8 bytes of each string (tmp1/tmp2) were pre-loaded before jumping to the stub, so compare them directly
9736 __ eor(rscratch2, tmp1, tmp2);
9737 __ cbnz(rscratch2, CAL_DIFFERENCE);
9738
9739 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
9740 // update pointers, because of previous read
9741 __ add(str1, str1, wordSize);
9742 __ add(str2, str2, wordSize);
9743 if (SoftwarePrefetchHintDistance >= 0) {
9744 __ align(OptoLoopAlignment);
9745 __ bind(LARGE_LOOP_PREFETCH);
9746 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
9747 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
9748
9749 for (int i = 0; i < 4; i++) {
9750 __ ldp(tmp1, tmp1h, Address(str1, i * 16));
9751 __ ldp(tmp2, tmp2h, Address(str2, i * 16));
9752 __ cmp(tmp1, tmp2);
9753 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9754 __ br(Assembler::NE, DIFF);
9755 }
9756 __ sub(cnt2, cnt2, isLL ? 64 : 32);
9757 __ add(str1, str1, 64);
9758 __ add(str2, str2, 64);
9759 __ subs(rscratch2, cnt2, largeLoopExitCondition);
9760 __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
9761 __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
9762 }
9763
9764 __ subs(rscratch1, cnt2, isLL ? 16 : 8);
9765 __ br(Assembler::LE, LESS16);
9766 __ align(OptoLoopAlignment);
9767 __ bind(LOOP_COMPARE16);
9768 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9769 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9770 __ cmp(tmp1, tmp2);
9771 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9772 __ br(Assembler::NE, DIFF);
9773 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9774 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9775 __ br(Assembler::LT, LESS16);
9776
9777 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9778 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9779 __ cmp(tmp1, tmp2);
9780 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9781 __ br(Assembler::NE, DIFF);
9782 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9783 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9784 __ br(Assembler::GE, LOOP_COMPARE16);
9785 __ cbz(cnt2, LENGTH_DIFF);
9786
9787 __ bind(LESS16);
9788 // compare 8 bytes (8 Latin1 or 4 UTF-16 characters) at a time
9789 __ subs(cnt2, cnt2, isLL ? 8 : 4);
9790 __ br(Assembler::LE, LESS8);
9791 __ ldr(tmp1, Address(__ post(str1, 8)));
9792 __ ldr(tmp2, Address(__ post(str2, 8)));
9793 __ eor(rscratch2, tmp1, tmp2);
9794 __ cbnz(rscratch2, CAL_DIFFERENCE);
9795 __ sub(cnt2, cnt2, isLL ? 8 : 4);
9796
9797 __ bind(LESS8); // directly load last 8 bytes
9798 if (!isLL) {
9799 __ add(cnt2, cnt2, cnt2);
9800 }
9801 __ ldr(tmp1, Address(str1, cnt2));
9802 __ ldr(tmp2, Address(str2, cnt2));
9803 __ eor(rscratch2, tmp1, tmp2);
9804 __ cbz(rscratch2, LENGTH_DIFF);
9805 __ b(CAL_DIFFERENCE);
9806
9807 __ bind(DIFF);
9808 __ cmp(tmp1, tmp2);
9809 __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
9810 __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
9811 // reuse rscratch2 register for the result of eor instruction
9812 __ eor(rscratch2, tmp1, tmp2);
9813
9814 __ bind(CAL_DIFFERENCE);
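// rev + clz locate the first differing byte counted from the low end; andr rounds the bit
// index down to a character boundary, and the shifts/extracts below pull out the two
// differing characters whose difference is the result.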
9815 __ rev(rscratch2, rscratch2);
9816 __ clz(rscratch2, rscratch2);
9817 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
9818 __ lsrv(tmp1, tmp1, rscratch2);
9819 __ lsrv(tmp2, tmp2, rscratch2);
9820 if (isLL) {
9821 __ uxtbw(tmp1, tmp1);
9822 __ uxtbw(tmp2, tmp2);
9823 } else {
9824 __ uxthw(tmp1, tmp1);
9825 __ uxthw(tmp2, tmp2);
9826 }
9827 __ subw(result, tmp1, tmp2);
9828
9829 __ bind(LENGTH_DIFF);
9830 __ ret(lr);
9831
9832 // record the stub entry and end
9833 store_archive_data(stub_id, entry, __ pc());
9834
9835 return entry;
9836 }
9837
9838 enum string_compare_mode {
9839 LL,
9840 LU,
9841 UL,
9842 UU,
9843 };
9844
9845 // The following registers are declared in aarch64.ad
9846 // r0 = result
9847 // r1 = str1
9848 // r2 = cnt1
9849 // r3 = str2
9850 // r4 = cnt2
9851 // r10 = tmp1
9852 // r11 = tmp2
9853 // z0 = ztmp1
9854 // z1 = ztmp2
9855 // p0 = pgtmp1
9856 // p1 = pgtmp2
9857 address generate_compare_long_string_sve(string_compare_mode mode) {
9858 StubId stub_id;
9859 switch (mode) {
9860 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
9861 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
9862 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
9863 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
9864 default: ShouldNotReachHere();
9865 }
9866 int entry_count = StubInfo::entry_count(stub_id);
9867 assert(entry_count == 1, "sanity check");
9868 address start = load_archive_data(stub_id);
9869 if (start != nullptr) {
9870 return start;
9871 }
9872 __ align(CodeEntryAlignment);
9873 StubCodeMark mark(this, stub_id);
9874 address entry = __ pc();
9875 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9876 tmp1 = r10, tmp2 = r11;
9877
9878 Label LOOP, DONE, MISMATCH;
9879 Register vec_len = tmp1;
9880 Register idx = tmp2;
9881 // The minimum of the string lengths has been stored in cnt2.
9882 Register cnt = cnt2;
9883 FloatRegister ztmp1 = z0, ztmp2 = z1;
9884 PRegister pgtmp1 = p0, pgtmp2 = p1;
9885
9886 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
9887 switch (mode) { \
9888 case LL: \
9889 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
9890 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
9891 break; \
9892 case LU: \
9893 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
9894 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9895 break; \
9896 case UL: \
9897 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9898 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
9899 break; \
9900 case UU: \
9901 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9902 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9903 break; \
9904 default: \
9905 ShouldNotReachHere(); \
9906 }
9907
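// Loop structure (descriptive note): whilelt builds the governing predicate for the current
// position (all-true while at least a full vector of characters remains); the main loop
// consumes vec_len characters per iteration, and the post loop re-computes the predicate for
// the final, possibly partial, vector.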
9908 __ mov(idx, 0);
9909 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9910
9911 if (mode == LL) {
9912 __ sve_cntb(vec_len);
9913 } else {
9914 __ sve_cnth(vec_len);
9915 }
9916
9917 __ sub(rscratch1, cnt, vec_len);
9918
9919 __ bind(LOOP);
9920
9921 // main loop
9922 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9923 __ add(idx, idx, vec_len);
9924 // Compare strings.
9925 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9926 __ br(__ NE, MISMATCH);
9927 __ cmp(idx, rscratch1);
9928 __ br(__ LT, LOOP);
9929
9930 // post loop, last iteration
9931 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9932
9933 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9934 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9935 __ br(__ EQ, DONE);
9936
9937 __ bind(MISMATCH);
9938
9939 // Crop the vector to find its location.
9940 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
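// (pgtmp2 now covers exactly the lanes before the first mismatch, so the "after last"
// element picked by sve_lasta is the first differing one in each string.)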
9941 // Extract the first different characters of each string.
9942 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
9943 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
9944
9945 // Compute the difference of the first different characters.
9946 __ sub(result, rscratch1, rscratch2);
9947
9948 __ bind(DONE);
9949 __ ret(lr);
9950 #undef LOAD_PAIR
9951
9952 // record the stub entry and end
9953 store_archive_data(stub_id, entry, __ pc());
9954
9955 return entry;
9956 }
9957
9958 void generate_compare_long_strings() {
9959 if (UseSVE == 0) {
9960 StubRoutines::aarch64::_compare_long_string_LL
9961 = generate_compare_long_string_same_encoding(true);
9962 StubRoutines::aarch64::_compare_long_string_UU
9963 = generate_compare_long_string_same_encoding(false);
9964 StubRoutines::aarch64::_compare_long_string_LU
9965 = generate_compare_long_string_different_encoding(true);
9966 StubRoutines::aarch64::_compare_long_string_UL
9967 = generate_compare_long_string_different_encoding(false);
9968 } else {
9969 StubRoutines::aarch64::_compare_long_string_LL
9970 = generate_compare_long_string_sve(LL);
9971 StubRoutines::aarch64::_compare_long_string_UU
9972 = generate_compare_long_string_sve(UU);
9973 StubRoutines::aarch64::_compare_long_string_LU
9974 = generate_compare_long_string_sve(LU);
9975 StubRoutines::aarch64::_compare_long_string_UL
9976 = generate_compare_long_string_sve(UL);
9977 }
9978 }
9979
9980 // R0 = result
9981 // R1 = str2
9982 // R2 = cnt1
9983 // R3 = str1
9984 // R4 = cnt2
9985 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
9986 //
9987 // This generic linear code uses a few additional ideas which make it faster:
9988 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
9989 // in order to skip the initial load (helps on systems with a single load pipeline)
9990 // 2) we can use a "fast" algorithm to find the first occurrence of the leading pattern
9991 // character with fewer branches (1 branch per loaded register instead of a branch per
9992 // symbol); this is where constants like
9993 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
9994 // 3) after loading and analyzing the 1st register of the source string, it can be
9995 // reused to search for every occurrence of the 1st character, saving a few loads
9996 // compared with a "simpler-but-slower" implementation
9997 // 4) in order to avoid lots of push/pop operations, the code below heavily re-uses,
9998 // re-initializes and compresses register values, which makes the code larger and a
9999 // bit less readable; however, most of the extra operations are issued during loads
10000 // or branches, so the penalty is minimal
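// A scalar sketch of the zero-detection trick used below (illustrative only, not generated
// code): after xor-ing a loaded word w with the replicated first pattern character, a lane
// is zero exactly where that character occurs, and
//   has_zero(w) = (w - 0x0101010101010101) & ~w & 0x8080808080808080
// is non-zero iff some byte of w is zero (the 16-bit variant uses 0x0001... and 0x8000...).
// The eor/sub/orr/bics sequences below compute the same predicate, folding
// ~w & 0x8080... into ~(w | 0x7f7f...).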
10001 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
10002 StubId stub_id;
10003 if (str1_isL) {
10004 if (str2_isL) {
10005 stub_id = StubId::stubgen_string_indexof_linear_ll_id;
10006 } else {
10007 stub_id = StubId::stubgen_string_indexof_linear_ul_id;
10008 }
10009 } else {
10010 if (str2_isL) {
10011 ShouldNotReachHere();
10012 } else {
10013 stub_id = StubId::stubgen_string_indexof_linear_uu_id;
10014 }
10015 }
10016 int entry_count = StubInfo::entry_count(stub_id);
10017 assert(entry_count == 1, "sanity check");
10018 address start = load_archive_data(stub_id);
10019 if (start != nullptr) {
10020 return start;
10021 }
10022 __ align(CodeEntryAlignment);
10023 StubCodeMark mark(this, stub_id);
10024 address entry = __ pc();
10025
10026 int str1_chr_size = str1_isL ? 1 : 2;
10027 int str2_chr_size = str2_isL ? 1 : 2;
10028 int str1_chr_shift = str1_isL ? 0 : 1;
10029 int str2_chr_shift = str2_isL ? 0 : 1;
10030 bool isL = str1_isL && str2_isL;
10031 // parameters
10032 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
10033 // temporary registers
10034 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
10035 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
10036 // redefinitions
10037 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
10038
10039 __ push(spilled_regs, sp);
10040 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
10041 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
10042 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
10043 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
10044 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
10045 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
10046 // Read whole register from str1. It is safe, because length >=8 here
10047 __ ldr(ch1, Address(str1));
10048 // Read whole register from str2. It is safe, because length >=8 here
10049 __ ldr(ch2, Address(str2));
10050 __ sub(cnt2, cnt2, cnt1);
10051 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
10052 if (str1_isL != str2_isL) {
10053 __ eor(v0, __ T16B, v0, v0);
10054 }
10055 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
10056 __ mul(first, first, tmp1);
10057 // check if we have less than 1 register to check
10058 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
10059 if (str1_isL != str2_isL) {
10060 __ fmovd(v1, ch1);
10061 }
10062 __ br(__ LE, L_SMALL);
10063 __ eor(ch2, first, ch2);
10064 if (str1_isL != str2_isL) {
10065 __ zip1(v1, __ T16B, v1, v0);
10066 }
10067 __ sub(tmp2, ch2, tmp1);
10068 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10069 __ bics(tmp2, tmp2, ch2);
10070 if (str1_isL != str2_isL) {
10071 __ fmovd(ch1, v1);
10072 }
10073 __ br(__ NE, L_HAS_ZERO);
10074 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10075 __ add(result, result, wordSize/str2_chr_size);
10076 __ add(str2, str2, wordSize);
10077 __ br(__ LT, L_POST_LOOP);
10078 __ BIND(L_LOOP);
10079 __ ldr(ch2, Address(str2));
10080 __ eor(ch2, first, ch2);
10081 __ sub(tmp2, ch2, tmp1);
10082 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10083 __ bics(tmp2, tmp2, ch2);
10084 __ br(__ NE, L_HAS_ZERO);
10085 __ BIND(L_LOOP_PROCEED);
10086 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10087 __ add(str2, str2, wordSize);
10088 __ add(result, result, wordSize/str2_chr_size);
10089 __ br(__ GE, L_LOOP);
10090 __ BIND(L_POST_LOOP);
10091 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
10092 __ br(__ LE, NOMATCH);
10093 __ ldr(ch2, Address(str2));
10094 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10095 __ eor(ch2, first, ch2);
10096 __ sub(tmp2, ch2, tmp1);
10097 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10098 __ mov(tmp4, -1); // all bits set
10099 __ b(L_SMALL_PROCEED);
10100 __ align(OptoLoopAlignment);
10101 __ BIND(L_SMALL);
10102 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10103 __ eor(ch2, first, ch2);
10104 if (str1_isL != str2_isL) {
10105 __ zip1(v1, __ T16B, v1, v0);
10106 }
10107 __ sub(tmp2, ch2, tmp1);
10108 __ mov(tmp4, -1); // all bits set
10109 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10110 if (str1_isL != str2_isL) {
10111 __ fmovd(ch1, v1); // move converted 4 symbols
10112 }
10113 __ BIND(L_SMALL_PROCEED);
10114 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
10115 __ bic(tmp2, tmp2, ch2);
10116 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
10117 __ rbit(tmp2, tmp2);
10118 __ br(__ EQ, NOMATCH);
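// After rbit, the clz below yields the bit index of the lowest set bit of the original
// match mask, i.e. the position of the first candidate match within the loaded word.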
10119 __ BIND(L_SMALL_HAS_ZERO_LOOP);
10120 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
10121 __ cmp(cnt1, u1(wordSize/str2_chr_size));
10122 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
10123 if (str2_isL) { // LL
10124 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10125 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10126 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10127 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10128 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10129 } else {
10130 __ mov(ch2, 0xE); // all bits in byte set except last one
10131 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10132 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10133 __ lslv(tmp2, tmp2, tmp4);
10134 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10135 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10136 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10137 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10138 }
10139 __ cmp(ch1, ch2);
10140 __ mov(tmp4, wordSize/str2_chr_size);
10141 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10142 __ BIND(L_SMALL_CMP_LOOP);
10143 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10144 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10145 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10146 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10147 __ add(tmp4, tmp4, 1);
10148 __ cmp(tmp4, cnt1);
10149 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
10150 __ cmp(first, ch2);
10151 __ br(__ EQ, L_SMALL_CMP_LOOP);
10152 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
10153 __ cbz(tmp2, NOMATCH); // no more matches. exit
10154 __ clz(tmp4, tmp2);
10155 __ add(result, result, 1); // advance index
10156 __ add(str2, str2, str2_chr_size); // advance pointer
10157 __ b(L_SMALL_HAS_ZERO_LOOP);
10158 __ align(OptoLoopAlignment);
10159 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
10160 __ cmp(first, ch2);
10161 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10162 __ b(DONE);
10163 __ align(OptoLoopAlignment);
10164 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
10165 if (str2_isL) { // LL
10166 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10167 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10168 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10169 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10170 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10171 } else {
10172 __ mov(ch2, 0xE); // all bits in byte set except last one
10173 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10174 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10175 __ lslv(tmp2, tmp2, tmp4);
10176 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10177 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10178 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10179 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10180 }
10181 __ cmp(ch1, ch2);
10182 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10183 __ b(DONE);
10184 __ align(OptoLoopAlignment);
10185 __ BIND(L_HAS_ZERO);
10186 __ rbit(tmp2, tmp2);
10187 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
10188 // Now, compress the counters (cnt2 and cnt1) into one register.
10189 // This is fine because both counters are 32-bit and are not changed in this
10190 // loop. Just restore them on exit, so cnt1 can be re-used in this loop.
10191 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
10192 __ sub(result, result, 1);
10193 __ BIND(L_HAS_ZERO_LOOP);
10194 __ mov(cnt1, wordSize/str2_chr_size);
10195 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10196 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
10197 if (str2_isL) {
10198 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10199 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10200 __ lslv(tmp2, tmp2, tmp4);
10201 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10202 __ add(tmp4, tmp4, 1);
10203 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10204 __ lsl(tmp2, tmp2, 1);
10205 __ mov(tmp4, wordSize/str2_chr_size);
10206 } else {
10207 __ mov(ch2, 0xE);
10208 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10209 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10210 __ lslv(tmp2, tmp2, tmp4);
10211 __ add(tmp4, tmp4, 1);
10212 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10213 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10214 __ lsl(tmp2, tmp2, 1);
10215 __ mov(tmp4, wordSize/str2_chr_size);
10216 __ sub(str2, str2, str2_chr_size);
10217 }
10218 __ cmp(ch1, ch2);
10219 __ mov(tmp4, wordSize/str2_chr_size);
10220 __ br(__ NE, L_CMP_LOOP_NOMATCH);
10221 __ BIND(L_CMP_LOOP);
10222 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10223 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10224 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10225 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10226 __ add(tmp4, tmp4, 1);
10227 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10228 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
10229 __ cmp(cnt1, ch2);
10230 __ br(__ EQ, L_CMP_LOOP);
10231 __ BIND(L_CMP_LOOP_NOMATCH);
10232 // we did not match here
10233 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
10234 __ clz(tmp4, tmp2);
10235 __ add(str2, str2, str2_chr_size); // advance pointer
10236 __ b(L_HAS_ZERO_LOOP);
10237 __ align(OptoLoopAlignment);
10238 __ BIND(L_CMP_LOOP_LAST_CMP);
10239 __ cmp(cnt1, ch2);
10240 __ br(__ NE, L_CMP_LOOP_NOMATCH);
10241 __ b(DONE);
10242 __ align(OptoLoopAlignment);
10243 __ BIND(L_CMP_LOOP_LAST_CMP2);
10244 if (str2_isL) {
10245 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10246 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10247 __ lslv(tmp2, tmp2, tmp4);
10248 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10249 __ add(tmp4, tmp4, 1);
10250 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10251 __ lsl(tmp2, tmp2, 1);
10252 } else {
10253 __ mov(ch2, 0xE);
10254 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10255 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10256 __ lslv(tmp2, tmp2, tmp4);
10257 __ add(tmp4, tmp4, 1);
10258 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10259 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10260 __ lsl(tmp2, tmp2, 1);
10261 __ sub(str2, str2, str2_chr_size);
10262 }
10263 __ cmp(ch1, ch2);
10264 __ br(__ NE, L_CMP_LOOP_NOMATCH);
10265 __ b(DONE);
10266 __ align(OptoLoopAlignment);
10267 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
10268 // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N until the
10269 // L_HAS_ZERO block. The byte octet was analyzed in L_HAS_ZERO_LOOP, so result was
10270 // increased by at most wordSize/str2_chr_size - 1 and the respective high bits were
10271 // not changed. L_LOOP_PROCEED will increase result by the number of analyzed
10272 // characters, so we can just reset the lower bits of result here. Clear the 2 lower
10273 // bits for UU/UL and 3 bits for LL.
10274 // 2) restore cnt1 and cnt2 values from the "compressed" cnt2
10275 // 3) advance str2 to the next str2 octet. result & 7 (or & 3) is the index of the
10276 // last analyzed substring inside the current octet, so str2 is at the respective
10277 // start address. We need to advance it to the next octet.
10278 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
10279 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
10280 __ bfm(result, zr, 0, 2 - str2_chr_shift);
10281 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
10282 __ movw(cnt2, cnt2);
10283 __ b(L_LOOP_PROCEED);
10284 __ align(OptoLoopAlignment);
10285 __ BIND(NOMATCH);
10286 __ mov(result, -1);
10287 __ BIND(DONE);
10288 __ pop(spilled_regs, sp);
10289 __ ret(lr);
10290
10291 // record the stub entry and end
10292 store_archive_data(stub_id, entry, __ pc());
10293
10294 return entry;
10295 }
10296
10297 void generate_string_indexof_stubs() {
10298 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
10299 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
10300 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
10301 }
10302
10303 void inflate_and_store_2_fp_registers(bool generatePrfm,
10304 FloatRegister src1, FloatRegister src2) {
10305 Register dst = r1;
10306 __ zip1(v1, __ T16B, src1, v0);
10307 __ zip2(v2, __ T16B, src1, v0);
10308 if (generatePrfm) {
10309 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
10310 }
10311 __ zip1(v3, __ T16B, src2, v0);
10312 __ zip2(v4, __ T16B, src2, v0);
10313 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
10314 }
10315
10316 // R0 = src
10317 // R1 = dst
10318 // R2 = len
10319 // R3 = len >> 3
10320 // V0 = 0
10321 // v1 = loaded 8 bytes
10322 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
10323 address generate_large_byte_array_inflate() {
10324 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
10325 int entry_count = StubInfo::entry_count(stub_id);
10326 assert(entry_count == 1, "sanity check");
10327 address start = load_archive_data(stub_id);
10328 if (start != nullptr) {
10329 return start;
10330 }
10331 __ align(CodeEntryAlignment);
10332 StubCodeMark mark(this, stub_id);
10333 address entry = __ pc();
10334 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
10335 Register src = r0, dst = r1, len = r2, octetCounter = r3;
10336 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
10337
10338 // do one more 8-byte read so that the address is 16-byte aligned in most cases;
10339 // this also lets us use a single store instruction
10340 __ ldrd(v2, __ post(src, 8));
10341 __ sub(octetCounter, octetCounter, 2);
10342 __ zip1(v1, __ T16B, v1, v0);
10343 __ zip1(v2, __ T16B, v2, v0);
10344 __ st1(v1, v2, __ T16B, __ post(dst, 32));
10345 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10346 __ subs(rscratch1, octetCounter, large_loop_threshold);
10347 __ br(__ LE, LOOP_START);
10348 __ b(LOOP_PRFM_START);
10349 __ bind(LOOP_PRFM);
10350 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10351 __ bind(LOOP_PRFM_START);
10352 __ prfm(Address(src, SoftwarePrefetchHintDistance));
10353 __ sub(octetCounter, octetCounter, 8);
10354 __ subs(rscratch1, octetCounter, large_loop_threshold);
10355 inflate_and_store_2_fp_registers(true, v3, v4);
10356 inflate_and_store_2_fp_registers(true, v5, v6);
10357 __ br(__ GT, LOOP_PRFM);
10358 __ cmp(octetCounter, (u1)8);
10359 __ br(__ LT, DONE);
10360 __ bind(LOOP);
10361 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10362 __ bind(LOOP_START);
10363 __ sub(octetCounter, octetCounter, 8);
10364 __ cmp(octetCounter, (u1)8);
10365 inflate_and_store_2_fp_registers(false, v3, v4);
10366 inflate_and_store_2_fp_registers(false, v5, v6);
10367 __ br(__ GE, LOOP);
10368 __ bind(DONE);
10369 __ ret(lr);
10370
10371 // record the stub entry and end
10372 store_archive_data(stub_id, entry, __ pc());
10373
10374 return entry;
10375 }
10376
10377 /**
10378 * Arguments:
10379 *
10380 * Input:
10381 * c_rarg0 - current state address
10382 * c_rarg1 - H key address
10383 * c_rarg2 - data address
10384 * c_rarg3 - number of blocks
10385 *
10386 * Output:
10387 * Updated state at c_rarg0
10388 */
10389 address generate_ghash_processBlocks_small() {
10390 // Bafflingly, GCM uses little-endian for the byte order, but
10391 // big-endian for the bit order. For example, the polynomial 1 is
10392 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
10393 //
10394 // So, we must either reverse the bytes in each word and do
10395 // everything big-endian or reverse the bits in each byte and do
10396 // it little-endian. On AArch64 it's more idiomatic to reverse
10397 // the bits in each byte (we have an instruction, RBIT, to do
10398 // that) and keep the data in little-endian bit order through the
10399 // calculation, bit-reversing the inputs and outputs.
10400
10401 StubId stub_id = StubId::stubgen_ghash_processBlocks_small_id;
10402 int entry_count = StubInfo::entry_count(stub_id);
10403 assert(entry_count == 1, "sanity check");
10404 address start = load_archive_data(stub_id);
10405 if (start != nullptr) {
10406 return start;
10407 }
10408 __ align(CodeEntryAlignment);
10409 StubCodeMark mark(this, stub_id);
10410 Label polynomial; // local data generated at end of stub
10411 start = __ pc();
10412
10413 Register state = c_rarg0;
10414 Register subkeyH = c_rarg1;
10415 Register data = c_rarg2;
10416 Register blocks = c_rarg3;
10417
10418 FloatRegister vzr = v30;
10419 __ eor(vzr, __ T16B, vzr, vzr); // zero register
10420
10421 __ adr(rscratch1, polynomial);
10422 __ ldrq(v24, rscratch1); // The field polynomial
10423
10424 __ ldrq(v0, Address(state));
10425 __ ldrq(v1, Address(subkeyH));
10426
10427 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
10428 __ rbit(v0, __ T16B, v0);
10429 __ rev64(v1, __ T16B, v1);
10430 __ rbit(v1, __ T16B, v1);
10431
10432 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
10433 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
10434
10435 {
10436 Label L_ghash_loop;
10437 __ bind(L_ghash_loop);
10438
10439 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
10440 // reversing each byte
10441 __ rbit(v2, __ T16B, v2);
10442 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
10443
10444 // Multiply state in v2 by subkey in v1
10445 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
10446 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
10447 /*temps*/v6, v3, /*reuse/clobber b*/v2);
10448 // Reduce v7:v5 by the field polynomial
10449 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
10450
10451 __ sub(blocks, blocks, 1);
10452 __ cbnz(blocks, L_ghash_loop);
10453 }
10454
10455 // The bit-reversed result is at this point in v0
10456 __ rev64(v0, __ T16B, v0);
10457 __ rbit(v0, __ T16B, v0);
10458
10459 __ st1(v0, __ T16B, state);
10460 __ ret(lr);
10461
10462 // bind label and generate local polynomial data
10463 __ align(wordSize * 2);
10464 __ bind(polynomial);
10465 __ emit_int64(0x87); // The low-order bits of the field
10466 // polynomial (i.e. p = z^7+z^2+z+1)
10467 // repeated in the low and high parts of a
10468 // 128-bit vector
10469 __ emit_int64(0x87);
10470
10471 // record the stub entry and end
10472 store_archive_data(stub_id, start, __ pc());
10473
10474 return start;
10475 }
10476
10477 address generate_ghash_processBlocks(address small) {
10478 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
10479 int entry_count = StubInfo::entry_count(stub_id);
10480 assert(entry_count == 1, "sanity check");
10481 address start = load_archive_data(stub_id);
10482 if (start != nullptr) {
10483 return start;
10484 }
10485 Label polynomial; // local data generated after stub
10486 __ align(CodeEntryAlignment);
10487 StubCodeMark mark(this, stub_id);
10488 start = __ pc();
10489
10490 Register state = c_rarg0;
10491 Register subkeyH = c_rarg1;
10492 Register data = c_rarg2;
10493 Register blocks = c_rarg3;
10494
10495 const int unroll = 4;
10496
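// Block counts below unroll * 2 are handled entirely by the small stub; after the wide
// loop below, any remaining tail blocks are also finished by the small stub.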
10497 __ cmp(blocks, (unsigned char)(unroll * 2));
10498 __ br(__ LT, small);
10499
10500 if (unroll > 1) {
10501 // Save state before entering routine
10502 __ sub(sp, sp, 4 * 16);
10503 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
10504 __ sub(sp, sp, 4 * 16);
10505 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
10506 }
10507
10508 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
10509
10510 if (unroll > 1) {
10511 // And restore state
10512 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
10513 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
10514 }
10515
10516 __ cmp(blocks, (unsigned char)0);
10517 __ br(__ GT, small);
10518
10519 __ ret(lr);
10520
10521 // bind label and generate polynomial data
10522 __ align(wordSize * 2);
10523 __ bind(polynomial);
10524 __ emit_int64(0x87); // The low-order bits of the field
10525 // polynomial (i.e. p = z^7+z^2+z+1)
10526 // repeated in the low and high parts of a
10527 // 128-bit vector
10528 __ emit_int64(0x87);
10529
10530 // record the stub entry and end
10531 store_archive_data(stub_id, start, __ pc());
10532
10533 return start;
10534 }
10535
10536 void generate_base64_encode_simdround(Register src, Register dst,
10537 FloatRegister codec, u8 size) {
10538
10539 FloatRegister in0 = v4, in1 = v5, in2 = v6;
10540 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
10541 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
10542
10543 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
10544
10545 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
10546
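// Split each group of 3 input bytes (a, b, c) into four 6-bit indices, computed per lane
// by the shifts and ors below:
//   ind0 = a >> 2
//   ind1 = ((a & 0x3) << 4) | (b >> 4)
//   ind2 = ((b & 0xf) << 2) | (c >> 6)
//   ind3 = c & 0x3f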
10547 __ ushr(ind0, arrangement, in0, 2);
10548
10549 __ ushr(ind1, arrangement, in1, 2);
10550 __ shl(in0, arrangement, in0, 6);
10551 __ orr(ind1, arrangement, ind1, in0);
10552 __ ushr(ind1, arrangement, ind1, 2);
10553
10554 __ ushr(ind2, arrangement, in2, 4);
10555 __ shl(in1, arrangement, in1, 4);
10556 __ orr(ind2, arrangement, in1, ind2);
10557 __ ushr(ind2, arrangement, ind2, 2);
10558
10559 __ shl(ind3, arrangement, in2, 2);
10560 __ ushr(ind3, arrangement, ind3, 2);
10561
10562 __ tbl(out0, arrangement, codec, 4, ind0);
10563 __ tbl(out1, arrangement, codec, 4, ind1);
10564 __ tbl(out2, arrangement, codec, 4, ind2);
10565 __ tbl(out3, arrangement, codec, 4, ind3);
10566
10567 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
10568 }
10569
10570 /**
10571 * Arguments:
10572 *
10573 * Input:
10574 * c_rarg0 - src_start
10575 * c_rarg1 - src_offset
10576 * c_rarg2 - src_length
10577 * c_rarg3 - dest_start
10578 * c_rarg4 - dest_offset
10579 * c_rarg5 - isURL
10580 *
10581 */
10582 address generate_base64_encodeBlock() {
10583
10584 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
10585 int entry_count = StubInfo::entry_count(stub_id);
10586 assert(entry_count == 1, "sanity check");
10587 address start = load_archive_data(stub_id);
10588 if (start != nullptr) {
10589 return start;
10590 }
10591 __ align(CodeEntryAlignment);
10592 StubCodeMark mark(this, stub_id);
10593 start = __ pc();
10594
10595 Register src = c_rarg0; // source array
10596 Register soff = c_rarg1; // source start offset
10597 Register send = c_rarg2; // source end offset
10598 Register dst = c_rarg3; // dest array
10599 Register doff = c_rarg4; // position for writing to dest array
10600 Register isURL = c_rarg5; // Base64 or URL character set
10601
10602 // c_rarg6 and c_rarg7 are free to use as temps
10603 Register codec = c_rarg6;
10604 Register length = c_rarg7;
10605
10606 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
10607
10608 __ add(src, src, soff);
10609 __ add(dst, dst, doff);
10610 __ sub(length, send, soff);
10611
10612 // load the codec base address
10613 __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64));
10614 __ cbz(isURL, ProcessData);
10615 __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64URL));
10616
10617 __ BIND(ProcessData);
10618
10619 // too short to form a SIMD loop; fall back to the 3-byte scalar loop
10620 __ cmp(length, (u1)24);
10621 __ br(Assembler::LT, Process3B);
10622
10623 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
10624
10625 __ BIND(Process48B);
10626 __ cmp(length, (u1)48);
10627 __ br(Assembler::LT, Process24B);
10628 generate_base64_encode_simdround(src, dst, v0, 16);
10629 __ sub(length, length, 48);
10630 __ b(Process48B);
10631
10632 __ BIND(Process24B);
10633 __ cmp(length, (u1)24);
10634 __ br(Assembler::LT, SIMDExit);
10635 generate_base64_encode_simdround(src, dst, v0, 8);
10636 __ sub(length, length, 24);
10637
10638 __ BIND(SIMDExit);
10639 __ cbz(length, Exit);
10640
10641 __ BIND(Process3B);
10642 // 3 src bytes, 24 bits
10643 __ ldrb(r10, __ post(src, 1));
10644 __ ldrb(r11, __ post(src, 1));
10645 __ ldrb(r12, __ post(src, 1));
10646 __ orrw(r11, r11, r10, Assembler::LSL, 8);
10647 __ orrw(r12, r12, r11, Assembler::LSL, 8);
10648 // codec index
10649 __ ubfmw(r15, r12, 18, 23);
10650 __ ubfmw(r14, r12, 12, 17);
10651 __ ubfmw(r13, r12, 6, 11);
10652 __ andw(r12, r12, 63);
10653 // get the code based on the codec
10654 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
10655 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
10656 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
10657 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
10658 __ strb(r15, __ post(dst, 1));
10659 __ strb(r14, __ post(dst, 1));
10660 __ strb(r13, __ post(dst, 1));
10661 __ strb(r12, __ post(dst, 1));
10662 __ sub(length, length, 3);
10663 __ cbnz(length, Process3B);
10664
10665 __ BIND(Exit);
10666 __ ret(lr);
10667
10668 // record the stub entry and end
10669 store_archive_data(stub_id, start, __ pc());
10670
10671 return start;
10672 }
10673
10674 void generate_base64_decode_simdround(Register src, Register dst,
10675 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
10676
10677 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
10678 FloatRegister out0 = v20, out1 = v21, out2 = v22;
10679
10680 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
10681 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
10682
10683 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
10684
10685 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
10686
10687 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
10688
10689 // we need an unsigned saturating subtract to make sure all input values
10690 // in the range [0, 63] yield 0 from the higher-half lookup
10691 __ uqsubv(decH0, __ T16B, in0, v27);
10692 __ uqsubv(decH1, __ T16B, in1, v27);
10693 __ uqsubv(decH2, __ T16B, in2, v27);
10694 __ uqsubv(decH3, __ T16B, in3, v27);
10695
10696 // lower half lookup
10697 __ tbl(decL0, arrangement, codecL, 4, in0);
10698 __ tbl(decL1, arrangement, codecL, 4, in1);
10699 __ tbl(decL2, arrangement, codecL, 4, in2);
10700 __ tbl(decL3, arrangement, codecL, 4, in3);
10701
10702 // higher half lookup
10703 __ tbx(decH0, arrangement, codecH, 4, decH0);
10704 __ tbx(decH1, arrangement, codecH, 4, decH1);
10705 __ tbx(decH2, arrangement, codecH, 4, decH2);
10706 __ tbx(decH3, arrangement, codecH, 4, decH3);
10707
10708 // combine lower and higher
10709 __ orr(decL0, arrangement, decL0, decH0);
10710 __ orr(decL1, arrangement, decL1, decH1);
10711 __ orr(decL2, arrangement, decL2, decH2);
10712 __ orr(decL3, arrangement, decL3, decH3);
10713
10714 // check for illegal inputs: any value larger than 63 (the maximum for 6 bits)
10715 __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10716 __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10717 __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10718 __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10719 __ orr(in0, arrangement, decH0, decH1);
10720 __ orr(in1, arrangement, decH2, decH3);
10721 __ orr(in2, arrangement, in0, in1);
10722 __ umaxv(in3, arrangement, in2);
10723 __ umov(rscratch2, in3, __ B, 0);
10724
10725 // get the data to output
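// Recombine the four 6-bit values (d0..d3 in decL0..decL3) into 3 output bytes per lane:
//   out0 = (d0 << 2) | (d1 >> 4)
//   out1 = (d1 << 4) | (d2 >> 2)
//   out2 = (d2 << 6) | d3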
10726 __ shl(out0, arrangement, decL0, 2);
10727 __ ushr(out1, arrangement, decL1, 4);
10728 __ orr(out0, arrangement, out0, out1);
10729 __ shl(out1, arrangement, decL1, 4);
10730 __ ushr(out2, arrangement, decL2, 2);
10731 __ orr(out1, arrangement, out1, out2);
10732 __ shl(out2, arrangement, decL2, 6);
10733 __ orr(out2, arrangement, out2, decL3);
10734
10735 __ cbz(rscratch2, NoIllegalData);
10736
10737 // handle illegal input
10738 __ umov(r10, in2, __ D, 0);
10739 if (size == 16) {
10740 __ cbnz(r10, ErrorInLowerHalf);
10741
10742 // illegal input is in higher half, store the lower half now.
10743 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10744
10745 __ umov(r10, in2, __ D, 1);
10746 __ umov(r11, out0, __ D, 1);
10747 __ umov(r12, out1, __ D, 1);
10748 __ umov(r13, out2, __ D, 1);
10749 __ b(StoreLegalData);
10750
10751 __ BIND(ErrorInLowerHalf);
10752 }
10753 __ umov(r11, out0, __ D, 0);
10754 __ umov(r12, out1, __ D, 0);
10755 __ umov(r13, out2, __ D, 0);
10756
10757 __ BIND(StoreLegalData);
10758 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10759 __ strb(r11, __ post(dst, 1));
10760 __ strb(r12, __ post(dst, 1));
10761 __ strb(r13, __ post(dst, 1));
10762 __ lsr(r10, r10, 8);
10763 __ lsr(r11, r11, 8);
10764 __ lsr(r12, r12, 8);
10765 __ lsr(r13, r13, 8);
10766 __ b(StoreLegalData);
10767
10768 __ BIND(NoIllegalData);
10769 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10770 }
10771
10772
10773 /**
10774 * Arguments:
10775 *
10776 * Input:
10777 * c_rarg0 - src_start
10778 * c_rarg1 - src_offset
10779 * c_rarg2 - src_length
10780 * c_rarg3 - dest_start
10781 * c_rarg4 - dest_offset
10782 * c_rarg5 - isURL
10783 * c_rarg6 - isMIME
10784 *
10785 */
10786 address generate_base64_decodeBlock() {
10787
10788 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10789 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10790 // titled "Base64 decoding".
10791
10792 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10793 int entry_count = StubInfo::entry_count(stub_id);
10794 assert(entry_count == 1, "sanity check");
10795 address start = load_archive_data(stub_id);
10796 if (start != nullptr) {
10797 return start;
10798 }
10799 __ align(CodeEntryAlignment);
10800 StubCodeMark mark(this, stub_id);
10801 start = __ pc();
10802
10803 Register src = c_rarg0; // source array
10804 Register soff = c_rarg1; // source start offset
10805 Register send = c_rarg2; // source end offset
10806 Register dst = c_rarg3; // dest array
10807 Register doff = c_rarg4; // position for writing to dest array
10808 Register isURL = c_rarg5; // Base64 or URL character set
10809 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation
10810
10811 Register length = send; // reuse send as length of source data to process
10812
10813 Register simd_codec = c_rarg6;
10814 Register nosimd_codec = c_rarg7;
10815
10816 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10817
10818 __ enter();
10819
10820 __ add(src, src, soff);
10821 __ add(dst, dst, doff);
10822
10823 __ mov(doff, dst);
10824
10825 __ sub(length, send, soff);
10826 __ bfm(length, zr, 0, 1);
10827
10828 __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForNoSIMD));
10829 __ cbz(isURL, ProcessData);
10830 __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForNoSIMD));
10831
10832 __ BIND(ProcessData);
10833 __ mov(rscratch1, length);
10834 __ cmp(length, (u1)144); // 144 = 80 + 64
10835 __ br(Assembler::LT, Process4B);
10836
10837 // In the MIME case, the line length cannot be more than 76
10838 // bytes (see RFC 2045). This is too short a block for SIMD
10839 // to be worthwhile, so we use non-SIMD here.
10840 __ movw(rscratch1, 79);
10841
10842 __ BIND(Process4B);
10843 __ ldrw(r14, __ post(src, 4));
10844 __ ubfxw(r10, r14, 0, 8);
10845 __ ubfxw(r11, r14, 8, 8);
10846 __ ubfxw(r12, r14, 16, 8);
10847 __ ubfxw(r13, r14, 24, 8);
10848 // look up the decoded values
10849 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10850 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10851 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10852 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10853 // error detection, 255u indicates an illegal input
10854 __ orrw(r14, r10, r11);
10855 __ orrw(r15, r12, r13);
10856 __ orrw(r14, r14, r15);
10857 __ tbnz(r14, 7, Exit);
10858 // recover the data
10859 __ lslw(r14, r10, 10);
10860 __ bfiw(r14, r11, 4, 6);
10861 __ bfmw(r14, r12, 2, 5);
10862 __ rev16w(r14, r14);
10863 __ bfiw(r13, r12, 6, 2);
10864 __ strh(r14, __ post(dst, 2));
10865 __ strb(r13, __ post(dst, 1));
10866 // non-simd loop
10867 __ subsw(rscratch1, rscratch1, 4);
10868 __ br(Assembler::GT, Process4B);
10869
10870 // if we exited after pre-processing the first 80 bytes (rscratch1 started at 79), rscratch1 == -1;
10871 // otherwise the whole input was consumed 4 bytes at a time and rscratch1 == 0.
10872 __ cbzw(rscratch1, Exit);
10873 __ sub(length, length, 80);
10874
10875 __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForSIMD));
10876 __ cbz(isURL, SIMDEnter);
10877 __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForSIMD));
10878
10879 __ BIND(SIMDEnter);
10880 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10881 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10882 __ mov(rscratch1, 63);
10883 __ dup(v27, __ T16B, rscratch1);
10884
10885 __ BIND(Process64B);
10886 __ cmp(length, (u1)64);
10887 __ br(Assembler::LT, Process32B);
10888 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10889 __ sub(length, length, 64);
10890 __ b(Process64B);
10891
10892 __ BIND(Process32B);
10893 __ cmp(length, (u1)32);
10894 __ br(Assembler::LT, SIMDExit);
10895 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10896 __ sub(length, length, 32);
10897 __ b(Process32B);
10898
10899 __ BIND(SIMDExit);
10900 __ cbz(length, Exit);
10901 __ movw(rscratch1, length);
10902 __ b(Process4B);
10903
10904 __ BIND(Exit);
10905 __ sub(c_rarg0, dst, doff); // return the number of bytes written to the destination
10906
10907 __ leave();
10908 __ ret(lr);
10909
10910 // record the stub entry and end
10911 store_archive_data(stub_id, start, __ pc());
10912
10913 return start;
10914 }
10915
10916 // Support for spin waits.
10917 address generate_spin_wait() {
10918 StubId stub_id = StubId::stubgen_spin_wait_id;
10919 int entry_count = StubInfo::entry_count(stub_id);
10920 assert(entry_count == 1, "sanity check");
10921 address start = load_archive_data(stub_id);
10922 if (start != nullptr) {
10923 return start;
10924 }
10925 __ align(CodeEntryAlignment);
10926 StubCodeMark mark(this, stub_id);
10927 start = __ pc();
10928
10929 __ spin_wait();
10930 __ ret(lr);
10931
10932 // record the stub entry and end
10933 store_archive_data(stub_id, start, __ pc());
10934
10935 return start;
10936 }
10937
10938 void generate_lookup_secondary_supers_table_stub() {
10939 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10940 GrowableArray<address> entries;
10941 int entry_count = StubInfo::entry_count(stub_id);
10942 assert(entry_count == Klass::SECONDARY_SUPERS_TABLE_SIZE, "sanity check");
10943 address start = load_archive_data(stub_id, &entries);
10944 if (start != nullptr) {
10945 assert(entries.length() == Klass::SECONDARY_SUPERS_TABLE_SIZE - 1,
10946 "unexpected extra entry count %d", entries.length());
10947 StubRoutines::_lookup_secondary_supers_table_stubs[0] = start;
10948 for (int slot = 1; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10949 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = entries.at(slot - 1);
10950 }
10951 return;
10952 }
10953
10954 StubCodeMark mark(this, stub_id);
10955
10956 const Register
10957 r_super_klass = r0,
10958 r_array_base = r1,
10959 r_array_length = r2,
10960 r_array_index = r3,
10961 r_sub_klass = r4,
10962 r_bitmap = rscratch2,
10963 result = r5;
10964 const FloatRegister
10965 vtemp = v0;
10966
10967 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10968 address next_entry = __ pc();
10969 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = next_entry;
10970 if (slot == 0) {
10971 start = next_entry;
10972 } else {
10973 entries.append(next_entry);
10974 }
10975 Label L_success;
10976 __ enter();
10977 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
10978 r_array_base, r_array_length, r_array_index,
10979 vtemp, result, slot,
10980 /*stub_is_near*/true);
10981 __ leave();
10982 __ ret(lr);
10983 }
10984 // record the stub entry and end plus all the auxiliary entries
10985 store_archive_data(stub_id, start, __ pc(), &entries);
10986 }
10987
10988 // Slow path implementation for UseSecondarySupersTable.
10989 address generate_lookup_secondary_supers_table_slow_path_stub() {
10990 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
10991 int entry_count = StubInfo::entry_count(stub_id);
10992 assert(entry_count == 1, "sanity check");
10993 address start = load_archive_data(stub_id);
10994 if (start != nullptr) {
10995 return start;
10996 }
10997 StubCodeMark mark(this, stub_id);
10998 start = __ pc();
10999 const Register
11000 r_super_klass = r0, // argument
11001 r_array_base = r1, // argument
11002 temp1 = r2, // temp
11003 r_array_index = r3, // argument
11004 r_bitmap = rscratch2, // argument
11005 result = r5; // argument
11006
11007 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
11008 __ ret(lr);
11009
11010 // record the stub entry and end
11011 store_archive_data(stub_id, start, __ pc());
11012
11013 return start;
11014 }
11015
11016 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11017
11018 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
11019 //
11020 // If LSE is in use, generate LSE versions of all the stubs. The
11021 // non-LSE versions are in atomic_aarch64.S.
11022
11023 // class AtomicStubMark records the entry point of a stub and the
11024 // stub pointer which will point to it. The stub pointer is set to
11025 // the entry point when ~AtomicStubMark() is called, which must be
11026 // after ICache::invalidate_range. This ensures safe publication of
11027 // the generated code.
11028 class AtomicStubMark {
11029 address _entry_point;
11030 aarch64_atomic_stub_t *_stub;
11031 MacroAssembler *_masm;
11032 public:
11033 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
11034 _masm = masm;
11035 __ align(32);
11036 _entry_point = __ pc();
11037 _stub = stub;
11038 }
11039 ~AtomicStubMark() {
11040 *_stub = (aarch64_atomic_stub_t)_entry_point;
11041 }
11042 };
11043
11044 // NB: For memory_order_conservative we need a trailing membar after
11045 // LSE atomic operations but not a leading membar.
11046 //
11047 // We don't need a leading membar because a clause in the Arm ARM
11048 // says:
11049 //
11050 // Barrier-ordered-before
11051 //
11052 // Barrier instructions order prior Memory effects before subsequent
11053 // Memory effects generated by the same Observer. A read or a write
11054 // RW1 is Barrier-ordered-before a read or a write RW2 from the same
11055 // Observer if and only if RW1 appears in program order before RW2
11056 // and [ ... ] at least one of RW1 and RW2 is generated by an atomic
11057 // instruction with both Acquire and Release semantics.
11058 //
11059 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
11060 // and Release semantics, therefore we don't need a leading
11061 // barrier. However, there is no corresponding Barrier-ordered-after
11062 // relationship, therefore we need a trailing membar to prevent a
11063 // later store or load from being reordered with the store in an
11064 // atomic instruction.
11065 //
11066 // This was checked by using the herd7 consistency model simulator
11067 // (http://diy.inria.fr/) with this test case:
11068 //
11069 // AArch64 LseCas
11070 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
11071 // P0 | P1;
11072 // LDR W4, [X2] | MOV W3, #0;
11073 // DMB LD | MOV W4, #1;
11074 // LDR W3, [X1] | CASAL W3, W4, [X1];
11075 // | DMB ISH;
11076 // | STR W4, [X2];
11077 // exists
11078 // (0:X3=0 /\ 0:X4=1)
11079 //
11080 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
11081 // with the store to x in P1. Without the DMB in P1 this may happen.
11082 //
11083 // At the time of writing we don't know of any AArch64 hardware that
11084 // reorders stores in this way, but the Reference Manual permits it.
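//
// In C11 terms the flavours generated below correspond, approximately
// (sketch only, not an exact mapping of the barriers actually emitted), to:
//
//   // memory_order_conservative CAS (gen_cas_entry plus its trailing membar):
//   T cas(T *ptr, T compare_val, T exchange_val) {
//     T prev = compare_val;
//     atomic_compare_exchange_strong_explicit((_Atomic T *)ptr, &prev, exchange_val,
//                                             memory_order_acq_rel, memory_order_acquire);
//     atomic_thread_fence(memory_order_seq_cst);   // the trailing DMB
//     return prev;
//   }
//
//   // memory_order_relaxed fetch-and-add (gen_ldadd_entry):
//   T fetch_add(T *ptr, T incr) {
//     return atomic_fetch_add_explicit((_Atomic T *)ptr, incr, memory_order_relaxed);
//   }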
11085
11086 void gen_cas_entry(Assembler::operand_size size,
11087 atomic_memory_order order) {
11088 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
11089 exchange_val = c_rarg2;
11090 bool acquire, release;
11091 switch (order) {
11092 case memory_order_relaxed:
11093 acquire = false;
11094 release = false;
11095 break;
11096 case memory_order_release:
11097 acquire = false;
11098 release = true;
11099 break;
11100 default:
11101 acquire = true;
11102 release = true;
11103 break;
11104 }
11105 __ mov(prev, compare_val);
11106 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
11107 if (order == memory_order_conservative) {
11108 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11109 }
11110 if (size == Assembler::xword) {
11111 __ mov(r0, prev);
11112 } else {
11113 __ movw(r0, prev);
11114 }
11115 __ ret(lr);
11116 }
11117
11118 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
11119 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11120 // If not relaxed, then default to conservative. Relaxed is the only
11121 // case we use enough to be worth specializing.
11122 if (order == memory_order_relaxed) {
11123 __ ldadd(size, incr, prev, addr);
11124 } else {
11125 __ ldaddal(size, incr, prev, addr);
11126 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11127 }
11128 if (size == Assembler::xword) {
11129 __ mov(r0, prev);
11130 } else {
11131 __ movw(r0, prev);
11132 }
11133 __ ret(lr);
11134 }
11135
11136 void gen_swpal_entry(Assembler::operand_size size) {
11137 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11138 __ swpal(size, incr, prev, addr);
11139 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11140 if (size == Assembler::xword) {
11141 __ mov(r0, prev);
11142 } else {
11143 __ movw(r0, prev);
11144 }
11145 __ ret(lr);
11146 }
11147
11148 void generate_atomic_entry_points() {
11149 if (! UseLSE) {
11150 return;
11151 }
11152 StubId stub_id = StubId::stubgen_atomic_entry_points_id;
11153 GrowableArray<address> entries;
11154 int entry_count = StubInfo::entry_count(stub_id);
11155 address start = load_archive_data(stub_id, &entries);
11156 if (start != nullptr) {
11157 assert(entries.length() == entry_count - 1,
11158 "unexpected extra entry count %d", entries.length());
11159 aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)start;
11160 int idx = 0;
11161 aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11162 aarch64_atomic_fetch_add_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11163 aarch64_atomic_fetch_add_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11164 aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11165 aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11166 aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11167 aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11168 aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11169 aarch64_atomic_cmpxchg_1_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11170 aarch64_atomic_cmpxchg_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11171 aarch64_atomic_cmpxchg_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11172 aarch64_atomic_cmpxchg_4_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11173 aarch64_atomic_cmpxchg_8_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11174 aarch64_atomic_cmpxchg_4_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11175 aarch64_atomic_cmpxchg_8_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11176 assert(idx == entries.length(), "sanity!");
11177 return;
11178 }
11179
11180 __ align(CodeEntryAlignment);
11181 StubCodeMark mark(this, stub_id);
11182 start = __ pc();
11183 address end;
11184 {
11185 // ADD, memory_order_conservative
11186 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
11187 gen_ldadd_entry(Assembler::word, memory_order_conservative);
11188
11189 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
11190 gen_ldadd_entry(Assembler::xword, memory_order_conservative);
11191
11192 // ADD, memory_order_relaxed
11193 AtomicStubMark mark_fetch_add_4_relaxed
11194 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
11195 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
11196
11197 AtomicStubMark mark_fetch_add_8_relaxed
11198 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
11199 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
11200
11201 // XCHG, memory_order_conservative
11202 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
11203 gen_swpal_entry(Assembler::word);
11204
11205 AtomicStubMark mark_xchg_8(_masm, &aarch64_atomic_xchg_8_impl);
11206 gen_swpal_entry(Assembler::xword);
11207
11208 // CAS, memory_order_conservative
11209 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
11210 gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
11211
11212 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
11213 gen_cas_entry(MacroAssembler::word, memory_order_conservative);
11214
11215 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
11216 gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
11217
11218 // CAS, memory_order_relaxed
11219 AtomicStubMark mark_cmpxchg_1_relaxed
11220 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
11221 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
11222
11223 AtomicStubMark mark_cmpxchg_4_relaxed
11224 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
11225 gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
11226
11227 AtomicStubMark mark_cmpxchg_8_relaxed
11228 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
11229 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
11230
11231 AtomicStubMark mark_cmpxchg_4_release
11232 (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
11233 gen_cas_entry(MacroAssembler::word, memory_order_release);
11234
11235 AtomicStubMark mark_cmpxchg_8_release
11236 (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
11237 gen_cas_entry(MacroAssembler::xword, memory_order_release);
11238
11239 AtomicStubMark mark_cmpxchg_4_seq_cst
11240 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
11241 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
11242
11243 AtomicStubMark mark_cmpxchg_8_seq_cst
11244 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
11245 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
11246
11247 end = __ pc();
11248
11249 ICache::invalidate_range(start, end - start);
11250 // exit block to force update of AtomicStubMark targets
11251 }
11252
11253 assert(start == (address)aarch64_atomic_fetch_add_4_impl,
11254 "atomic stub should be at start of buffer");
11255 // record the stub start and end plus all the entries saved by the
11256 // AtomicStubMark destructor
11257 entries.append((address)aarch64_atomic_fetch_add_8_impl);
11258 entries.append((address)aarch64_atomic_fetch_add_4_relaxed_impl);
11259 entries.append((address)aarch64_atomic_fetch_add_8_relaxed_impl);
11260 entries.append((address)aarch64_atomic_xchg_4_impl);
11261 entries.append((address)aarch64_atomic_xchg_8_impl);
11262 entries.append((address)aarch64_atomic_cmpxchg_1_impl);
11263 entries.append((address)aarch64_atomic_cmpxchg_4_impl);
11264 entries.append((address)aarch64_atomic_cmpxchg_8_impl);
11265 entries.append((address)aarch64_atomic_cmpxchg_1_relaxed_impl);
11266 entries.append((address)aarch64_atomic_cmpxchg_4_relaxed_impl);
11267 entries.append((address)aarch64_atomic_cmpxchg_8_relaxed_impl);
11268 entries.append((address)aarch64_atomic_cmpxchg_4_release_impl);
11269 entries.append((address)aarch64_atomic_cmpxchg_8_release_impl);
11270 entries.append((address)aarch64_atomic_cmpxchg_4_seq_cst_impl);
11271 entries.append((address)aarch64_atomic_cmpxchg_8_seq_cst_impl);
11272
11273 assert(entries.length() == entry_count - 1,
11274 "unexpected extra entry count %d", entries.length());
11275
11276 store_archive_data(stub_id, start, end, &entries);
11277 }
11278 #endif // LINUX
11279
11280 address generate_cont_thaw(Continuation::thaw_kind kind) {
11281 bool return_barrier = Continuation::is_thaw_return_barrier(kind);
11282 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
11283
11284 address start = __ pc();
11285
11286 if (return_barrier) {
11287 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
11288 __ mov(sp, rscratch1);
11289 }
11290 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11291
11292 if (return_barrier) {
11293 // preserve possible return value from a method returning to the return barrier
11294 __ fmovd(rscratch1, v0);
11295 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
11296 }
11297
11298 __ movw(c_rarg1, (return_barrier ? 1 : 0));
11299 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
11300 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
11301
11302 if (return_barrier) {
11303 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11304 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
11305 __ fmovd(v0, rscratch1);
11306 }
11307 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11308
11309
11310 Label thaw_success;
11311 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
11312 __ cbnz(rscratch2, thaw_success);
11313 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
11314 __ br(rscratch1);
11315 __ bind(thaw_success);
11316
11317 // make room for the thawed frames
11318 __ sub(rscratch1, sp, rscratch2);
11319 __ andr(rscratch1, rscratch1, -16); // align
11320 __ mov(sp, rscratch1);
11321
11322 if (return_barrier) {
11323 // save original return value -- again
11324 __ fmovd(rscratch1, v0);
11325 __ stp(rscratch1, r0, Address(__ pre(sp, -2 * wordSize)));
11326 }
11327
11328 // If we want, we can templatize thaw by kind, and have three different entries
11329 __ movw(c_rarg1, (uint32_t)kind);
11330
11331 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
11332 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
11333
11334 if (return_barrier) {
11335 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11336 __ ldp(rscratch1, r0, Address(__ post(sp, 2 * wordSize)));
11337 __ fmovd(v0, rscratch1);
11338 } else {
11339 __ mov(r0, zr); // return 0 (success) from doYield
11340 }
11341
11342 // we're now on the yield frame (which is at an address above us because sp has been moved down)
11343 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
11344 __ mov(rfp, sp);
11345
11346 if (return_barrier_exception) {
11347 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
11348 __ authenticate_return_address(c_rarg1);
11349 __ verify_oop(r0);
11350 // save return value containing the exception oop in callee-saved R19
11351 __ mov(r19, r0);
11352
11353 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
11354
11355 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
11356 // __ reinitialize_ptrue();
11357
11358 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
11359
11360 __ mov(r1, r0); // the exception handler
11361 __ mov(r0, r19); // restore return value containing the exception oop
11362 __ verify_oop(r0);
11363
11364 __ leave();
11365 __ mov(r3, lr);
11366 __ br(r1); // the exception handler
11367 } else {
11368 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
11369 __ leave();
11370 __ ret(lr);
11371 }
11372
11373 return start;
11374 }
11375
11376 address generate_cont_thaw() {
11377 if (!Continuations::enabled()) return nullptr;
11378
11379 StubId stub_id = StubId::stubgen_cont_thaw_id;
11380 int entry_count = StubInfo::entry_count(stub_id);
11381 assert(entry_count == 1, "sanity check");
11382 address start = load_archive_data(stub_id);
11383 if (start != nullptr) {
11384 return start;
11385 }
11386 StubCodeMark mark(this, stub_id);
11387 start = __ pc();
11388 generate_cont_thaw(Continuation::thaw_top);
11389
11390 // record the stub start and end
11391 store_archive_data(stub_id, start, __ pc());
11392
11393 return start;
11394 }
11395
11396 address generate_cont_returnBarrier() {
11397 if (!Continuations::enabled()) return nullptr;
11398
11399 // TODO: will probably need multiple return barriers depending on return type
11400 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
11401 int entry_count = StubInfo::entry_count(stub_id);
11402 assert(entry_count == 1, "sanity check");
11403 address start = load_archive_data(stub_id);
11404 if (start != nullptr) {
11405 return start;
11406 }
11407 StubCodeMark mark(this, stub_id);
11408 start = __ pc();
11409
11410 generate_cont_thaw(Continuation::thaw_return_barrier);
11411
11412 // record the stub start and end
11413 store_archive_data(stub_id, start, __ pc());
11414
11415 return start;
11416 }
11417
11418 address generate_cont_returnBarrier_exception() {
11419 if (!Continuations::enabled()) return nullptr;
11420
11421 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
11422 int entry_count = StubInfo::entry_count(stub_id);
11423 assert(entry_count == 1, "sanity check");
11424 address start = load_archive_data(stub_id);
11425 if (start != nullptr) {
11426 return start;
11427 }
11428 StubCodeMark mark(this, stub_id);
11429 start = __ pc();
11430
11431 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
11432
11433 // record the stub start and end
11434 store_archive_data(stub_id, start, __ pc());
11435
11436 return start;
11437 }
11438
11439 address generate_cont_preempt_stub() {
11440 if (!Continuations::enabled()) return nullptr;
11441 StubId stub_id = StubId::stubgen_cont_preempt_id;
11442 int entry_count = StubInfo::entry_count(stub_id);
11443 assert(entry_count == 1, "sanity check");
11444 address start = load_archive_data(stub_id);
11445 if (start != nullptr) {
11446 return start;
11447 }
11448 StubCodeMark mark(this, stub_id);
11449 start = __ pc();
11450
11451 __ reset_last_Java_frame(true);
11452
11453 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
11454 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
11455 __ mov(sp, rscratch2);
11456
11457 Label preemption_cancelled;
11458 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
11459 __ cbnz(rscratch1, preemption_cancelled);
11460
11461 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
11462 SharedRuntime::continuation_enter_cleanup(_masm);
11463 __ leave();
11464 __ ret(lr);
11465
11466 // We acquired the monitor after freezing the frames so call thaw to continue execution.
11467 __ bind(preemption_cancelled);
11468 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
11469 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
11470 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
11471 __ ldr(rscratch1, Address(rscratch1));
11472 __ br(rscratch1);
11473
11474 // record the stub start and end
11475 store_archive_data(stub_id, start, __ pc());
11476
11477 return start;
11478 }
11479
11480 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
11481 // are represented as long[5], with BITS_PER_LIMB = 26.
11482 // Pack five 26-bit limbs into three 64-bit registers.
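// In C, approximately (comment-only sketch; 'limbs' is the long[5] at src):
//
//   dest0 = limbs[0] + (limbs[1] << 26) + (limbs[2] << 52);         // bits   0..63
//   dest1 = (limbs[2] >> 12) + (limbs[3] << 14) + (limbs[4] << 40); // bits  64..127
//   dest2 = limbs[4] >> 24;                                         // bits 128..129 (if requested)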
11483 void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
11484 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
11485 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
11486 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
11487 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
11488
11489 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
11490 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
11491 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
11492 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
11493
11494 if (dest2->is_valid()) {
11495 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
11496 } else {
11497 #ifdef ASSERT
11498 Label OK;
11499 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
11500 __ br(__ EQ, OK);
11501 __ stop("high bits of Poly1305 integer should be zero");
11502 __ should_not_reach_here();
11503 __ bind(OK);
11504 #endif
11505 }
11506 }
11507
11508 // As above, but return only a 128-bit integer, packed into two
11509 // 64-bit registers.
11510 void pack_26(Register dest0, Register dest1, Register src) {
11511 pack_26(dest0, dest1, noreg, src);
11512 }
11513
11514 // Multiply and multiply-accumulate unsigned 64-bit registers.
11515 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
11516 __ mul(prod_lo, n, m);
11517 __ umulh(prod_hi, n, m);
11518 }
11519 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
11520 wide_mul(rscratch1, rscratch2, n, m);
11521 __ adds(sum_lo, sum_lo, rscratch1);
11522 __ adc(sum_hi, sum_hi, rscratch2);
11523 }
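// i.e., as 128-bit arithmetic (sketch): prod_hi:prod_lo = n * m, and
// sum_hi:sum_lo += n * m, treating all operands as unsigned 64-bit values.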
11524
11525 // Poly1305, RFC 7539
11526
11527 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
11528 // description of the tricks used to simplify and accelerate this
11529 // computation.
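//
// In C, approximately (comment-only sketch of one 16-byte block, with
// p = 2^130 - 5; R and U name the packed key and accumulator set up in
// the generator below, not actual variables in this file):
//
//   S = U + block + (1 << 128);      // 130-bit sum, the 1 is the pad bit
//   U = (S * R) mod p;               // reduction uses RR_n = (R_n >> 2) * 5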
11530
11531 address generate_poly1305_processBlocks() {
11532 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
11533 int entry_count = StubInfo::entry_count(stub_id);
11534 assert(entry_count == 1, "sanity check");
11535 address start = load_archive_data(stub_id);
11536 if (start != nullptr) {
11537 return start;
11538 }
11539 __ align(CodeEntryAlignment);
11540 StubCodeMark mark(this, stub_id);
11541 start = __ pc();
11542 Label here;
11543 __ enter();
11544 RegSet callee_saved = RegSet::range(r19, r28);
11545 __ push(callee_saved, sp);
11546
11547 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
11548
11549 // Arguments
11550 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
11551
11552 // R_n is the 128-bit randomly-generated key, packed into two
11553 // registers. The caller passes this key to us as long[5], with
11554 // BITS_PER_LIMB = 26.
11555 const Register R_0 = *++regs, R_1 = *++regs;
11556 pack_26(R_0, R_1, r_start);
11557
11558 // RR_n is (R_n >> 2) * 5
11559 const Register RR_0 = *++regs, RR_1 = *++regs;
11560 __ lsr(RR_0, R_0, 2);
11561 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
11562 __ lsr(RR_1, R_1, 2);
11563 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
11564
11565 // U_n is the current checksum
11566 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
11567 pack_26(U_0, U_1, U_2, acc_start);
11568
11569 static constexpr int BLOCK_LENGTH = 16;
11570 Label DONE, LOOP;
11571
11572 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
11573 __ br(Assembler::LT, DONE); {
11574 __ bind(LOOP);
11575
11576 // S_n is to be the sum of U_n and the next block of data
11577 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
11578 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
11579 __ adds(S_0, U_0, S_0);
11580 __ adcs(S_1, U_1, S_1);
11581 __ adc(S_2, U_2, zr);
11582 __ add(S_2, S_2, 1);
11583
11584 const Register U_0HI = *++regs, U_1HI = *++regs;
11585
11586 // NB: this logic depends on some of the special properties of
11587 // Poly1305 keys. In particular, because we know that the top
11588 // four bits of R_0 and R_1 are zero, we can add together
11589 // partial products without any risk of needing to propagate a
11590 // carry out.
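// With S = S_2:S_1:S_0 and R = R_1:R_0, the lines below compute the
// partial products of S * R, folding anything at or above 2^130 back
// in via RR_n = (R_n >> 2) * 5:
//   U_0HI:U_0 = S_0*R_0 + S_1*RR_1 + S_2*RR_0
//   U_1HI:U_1 = S_0*R_1 + S_1*R_0  + S_2*RR_1
//   U_2       = S_2 * (R_0 & 3)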
11591 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
11592 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
11593 __ andr(U_2, R_0, 3);
11594 __ mul(U_2, S_2, U_2);
11595
11596 // Recycle registers S_0, S_1, S_2
11597 regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
11598
11599 // Partial reduction mod 2**130 - 5
11600 __ adds(U_1, U_0HI, U_1);
11601 __ adc(U_2, U_1HI, U_2);
11602 // Sum now in U_2:U_1:U_0.
11603 // Dead: U_0HI, U_1HI.
11604 regs = (regs.remaining() + U_0HI + U_1HI).begin();
11605
11606 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
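// (2^130 is congruent to 5 mod 2^130 - 5, so the part of the sum at
// and above bit 130 -- i.e. U_2 >> 2, since U_2 holds bits 128 and up --
// can be folded back in multiplied by 5.)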
11607
11608 // First, U_2:U_1:U_0 += (U_2 >> 2)
11609 __ lsr(rscratch1, U_2, 2);
11610 __ andr(U_2, U_2, (u8)3);
11611 __ adds(U_0, U_0, rscratch1);
11612 __ adcs(U_1, U_1, zr);
11613 __ adc(U_2, U_2, zr);
11614 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
11615 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
11616 __ adcs(U_1, U_1, zr);
11617 __ adc(U_2, U_2, zr);
11618
11619 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
11620 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
11621 __ br(~ Assembler::LT, LOOP);
11622 }
11623
11624 // Further reduce modulo 2^130 - 5
11625 __ lsr(rscratch1, U_2, 2);
11626 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = (U_2 >> 2) * 5
11627 __ adds(U_0, U_0, rscratch1); // U_0 += (U_2 >> 2) * 5
11628 __ adcs(U_1, U_1, zr);
11629 __ andr(U_2, U_2, (u1)3);
11630 __ adc(U_2, U_2, zr);
11631
11632 // Unpack the sum into five 26-bit limbs and write to memory.
11633 __ ubfiz(rscratch1, U_0, 0, 26);
11634 __ ubfx(rscratch2, U_0, 26, 26);
11635 __ stp(rscratch1, rscratch2, Address(acc_start));
11636 __ ubfx(rscratch1, U_0, 52, 12);
11637 __ bfi(rscratch1, U_1, 12, 14);
11638 __ ubfx(rscratch2, U_1, 14, 26);
11639 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
11640 __ ubfx(rscratch1, U_1, 40, 24);
11641 __ bfi(rscratch1, U_2, 24, 3);
11642 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
11643
11644 __ bind(DONE);
11645 __ pop(callee_saved, sp);
11646 __ leave();
11647 __ ret(lr);
11648
11649 // record the stub start and end
11650 store_archive_data(stub_id, start, __ pc());
11651
11652 return start;
11653 }
11654
11655 // exception handler for upcall stubs
11656 address generate_upcall_stub_exception_handler() {
11657 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
11658 int entry_count = StubInfo::entry_count(stub_id);
11659 assert(entry_count == 1, "sanity check");
11660 address start = load_archive_data(stub_id);
11661 if (start != nullptr) {
11662 return start;
11663 }
11664 StubCodeMark mark(this, stub_id);
11665 start = __ pc();
11666
11667 // Native caller has no idea how to handle exceptions,
11668 // so we just crash here. Up to callee to catch exceptions.
11669 __ verify_oop(r0);
11670 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
11671 __ blr(rscratch1);
11672 __ should_not_reach_here();
11673
11674 // record the stub start and end
11675 store_archive_data(stub_id, start, __ pc());
11676
11677 return start;
11678 }
11679
11680 // load Method* target of MethodHandle
11681 // j_rarg0 = jobject receiver
11682 // rmethod = result
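// The loads below follow, approximately, the Java-level field chain
// receiver.form.vmentry.method.vmtarget to reach the target Method*.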
11683 address generate_upcall_stub_load_target() {
11684 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
11685 int entry_count = StubInfo::entry_count(stub_id);
11686 assert(entry_count == 1, "sanity check");
11687 address start = load_archive_data(stub_id);
11688 if (start != nullptr) {
11689 return start;
11690 }
11691 StubCodeMark mark(this, stub_id);
11692 start = __ pc();
11693
11694 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
11695 // Load target method from receiver
11696 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
11697 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
11698 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
11699 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
11700 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
11701 noreg, noreg);
11702 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
11703
11704 __ ret(lr);
11705
11706 // record the stub start and end
11707 store_archive_data(stub_id, start, __ pc());
11708
11709 return start;
11710 }
11711
11712 #undef __
11713 #define __ masm->
11714
11715 class MontgomeryMultiplyGenerator : public MacroAssembler {
11716
11717 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
11718 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
11719
11720 RegSet _toSave;
11721 bool _squaring;
11722
11723 public:
11724 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
11725 : MacroAssembler(as->code()), _squaring(squaring) {
11726
11727 // Register allocation
11728
11729 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
11730 Pa_base = *regs; // Argument registers
11731 if (squaring)
11732 Pb_base = Pa_base;
11733 else
11734 Pb_base = *++regs;
11735 Pn_base = *++regs;
11736 Rlen = *++regs;
11737 inv = *++regs;
11738 Pm_base = *++regs;
11739
11740 // Working registers:
11741 Ra = *++regs; // The current digit of a, b, n, and m.
11742 Rb = *++regs;
11743 Rm = *++regs;
11744 Rn = *++regs;
11745
11746 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
11747 Pb = *++regs;
11748 Pm = *++regs;
11749 Pn = *++regs;
11750
11751 t0 = *++regs; // Three registers which form a
11752 t1 = *++regs; // triple-precision accumulator.
11753 t2 = *++regs;
11754
11755 Ri = *++regs; // Inner and outer loop indexes.
11756 Rj = *++regs;
11757
11758 Rhi_ab = *++regs; // Product registers: low and high parts
11759 Rlo_ab = *++regs; // of a*b and m*n.
11760 Rhi_mn = *++regs;
11761 Rlo_mn = *++regs;
11762
11763 // r19 and up are callee-saved.
11764 _toSave = RegSet::range(r19, *regs) + Pm_base;
11765 }
11766
11767 private:
11768 void save_regs() {
11769 push(_toSave, sp);
11770 }
11771
11772 void restore_regs() {
11773 pop(_toSave, sp);
11774 }
11775
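// unroll_2 emits 'block' so that it executes exactly 'count' times,
// two copies per loop iteration; an odd count enters the loop at its
// second copy (the 'odd' label), and a zero count skips it entirely.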
11776 template <typename T>
11777 void unroll_2(Register count, T block) {
11778 Label loop, end, odd;
11779 tbnz(count, 0, odd);
11780 cbz(count, end);
11781 align(16);
11782 bind(loop);
11783 (this->*block)();
11784 bind(odd);
11785 (this->*block)();
11786 subs(count, count, 2);
11787 br(Assembler::GT, loop);
11788 bind(end);
11789 }
11790
11791 template <typename T>
11792 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
11793 Label loop, end, odd;
11794 tbnz(count, 0, odd);
11795 cbz(count, end);
11796 align(16);
11797 bind(loop);
11798 (this->*block)(d, s, tmp);
11799 bind(odd);
11800 (this->*block)(d, s, tmp);
11801 subs(count, count, 2);
11802 br(Assembler::GT, loop);
11803 bind(end);
11804 }
11805
11806 void pre1(RegisterOrConstant i) {
11807 block_comment("pre1");
11808 // Pa = Pa_base;
11809 // Pb = Pb_base + i;
11810 // Pm = Pm_base;
11811 // Pn = Pn_base + i;
11812 // Ra = *Pa;
11813 // Rb = *Pb;
11814 // Rm = *Pm;
11815 // Rn = *Pn;
11816 ldr(Ra, Address(Pa_base));
11817 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11818 ldr(Rm, Address(Pm_base));
11819 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11820 lea(Pa, Address(Pa_base));
11821 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11822 lea(Pm, Address(Pm_base));
11823 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11824
11825 // Zero the m*n result.
11826 mov(Rhi_mn, zr);
11827 mov(Rlo_mn, zr);
11828 }
11829
11830 // The core multiply-accumulate step of a Montgomery
11831 // multiplication. The idea is to schedule operations as a
11832 // pipeline so that instructions with long latencies (loads and
11833 // multiplies) have time to complete before their results are
11834 // used. This most benefits in-order implementations of the
11835 // architecture but out-of-order ones also benefit.
11836 void step() {
11837 block_comment("step");
11838 // MACC(Ra, Rb, t0, t1, t2);
11839 // Ra = *++Pa;
11840 // Rb = *--Pb;
11841 umulh(Rhi_ab, Ra, Rb);
11842 mul(Rlo_ab, Ra, Rb);
11843 ldr(Ra, pre(Pa, wordSize));
11844 ldr(Rb, pre(Pb, -wordSize));
11845 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11846 // previous iteration.
11847 // MACC(Rm, Rn, t0, t1, t2);
11848 // Rm = *++Pm;
11849 // Rn = *--Pn;
11850 umulh(Rhi_mn, Rm, Rn);
11851 mul(Rlo_mn, Rm, Rn);
11852 ldr(Rm, pre(Pm, wordSize));
11853 ldr(Rn, pre(Pn, -wordSize));
11854 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11855 }
11856
11857 void post1() {
11858 block_comment("post1");
11859
11860 // MACC(Ra, Rb, t0, t1, t2);
11861 // Ra = *++Pa;
11862 // Rb = *--Pb;
11863 umulh(Rhi_ab, Ra, Rb);
11864 mul(Rlo_ab, Ra, Rb);
11865 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11866 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11867
11868 // *Pm = Rm = t0 * inv;
11869 mul(Rm, t0, inv);
11870 str(Rm, Address(Pm));
11871
11872 // MACC(Rm, Rn, t0, t1, t2);
11873 // t0 = t1; t1 = t2; t2 = 0;
11874 umulh(Rhi_mn, Rm, Rn);
11875
11876 #ifndef PRODUCT
11877 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11878 {
11879 mul(Rlo_mn, Rm, Rn);
11880 add(Rlo_mn, t0, Rlo_mn);
11881 Label ok;
11882 cbz(Rlo_mn, ok); {
11883 stop("broken Montgomery multiply");
11884 } bind(ok);
11885 }
11886 #endif
11887 // We have very carefully set things up so that
11888 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11889 // the lower half of Rm * Rn because we know the result already:
11890 // it must be -t0. t0 + (-t0) must generate a carry iff
11891 // t0 != 0. So, rather than do a mul and an adds we just set
11892 // the carry flag iff t0 is nonzero.
11893 //
11894 // mul(Rlo_mn, Rm, Rn);
11895 // adds(zr, t0, Rlo_mn);
11896 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11897 adcs(t0, t1, Rhi_mn);
11898 adc(t1, t2, zr);
11899 mov(t2, zr);
11900 }
11901
11902 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11903 block_comment("pre2");
11904 // Pa = Pa_base + i-len;
11905 // Pb = Pb_base + len;
11906 // Pm = Pm_base + i-len;
11907 // Pn = Pn_base + len;
11908
11909 if (i.is_register()) {
11910 sub(Rj, i.as_register(), len);
11911 } else {
11912 mov(Rj, i.as_constant());
11913 sub(Rj, Rj, len);
11914 }
11915 // Rj == i-len
11916
11917 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11918 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11919 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11920 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11921
11922 // Ra = *++Pa;
11923 // Rb = *--Pb;
11924 // Rm = *++Pm;
11925 // Rn = *--Pn;
11926 ldr(Ra, pre(Pa, wordSize));
11927 ldr(Rb, pre(Pb, -wordSize));
11928 ldr(Rm, pre(Pm, wordSize));
11929 ldr(Rn, pre(Pn, -wordSize));
11930
11931 mov(Rhi_mn, zr);
11932 mov(Rlo_mn, zr);
11933 }
11934
11935 void post2(RegisterOrConstant i, RegisterOrConstant len) {
11936 block_comment("post2");
11937 if (i.is_constant()) {
11938 mov(Rj, i.as_constant()-len.as_constant());
11939 } else {
11940 sub(Rj, i.as_register(), len);
11941 }
11942
11943 adds(t0, t0, Rlo_mn); // The pending m*n, low part
11944
11945 // As soon as we know the least significant digit of our result,
11946 // store it.
11947 // Pm_base[i-len] = t0;
11948 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11949
11950 // t0 = t1; t1 = t2; t2 = 0;
11951 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
11952 adc(t1, t2, zr);
11953 mov(t2, zr);
11954 }
11955
11956 // A carry in t0 after Montgomery multiplication means that we
11957 // should subtract multiples of n from our result in m. We'll
11958 // keep doing that until there is no carry.
11959 void normalize(RegisterOrConstant len) {
11960 block_comment("normalize");
11961 // while (t0)
11962 // t0 = sub(Pm_base, Pn_base, t0, len);
11963 Label loop, post, again;
11964 Register cnt = t1, i = t2; // Re-use registers; we're done with them now
11965 cbz(t0, post); {
11966 bind(again); {
11967 mov(i, zr);
11968 mov(cnt, len);
11969 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11970 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11971 subs(zr, zr, zr); // set carry flag, i.e. no borrow
11972 align(16);
11973 bind(loop); {
11974 sbcs(Rm, Rm, Rn);
11975 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11976 add(i, i, 1);
11977 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
11978 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11979 sub(cnt, cnt, 1);
11980 } cbnz(cnt, loop);
11981 sbc(t0, t0, zr);
11982 } cbnz(t0, again);
11983 } bind(post);
11984 }
11985
11986 // Move memory at s to d, reversing words.
11987 // Increments d to end of copied memory
11988 // Destroys tmp1, tmp2
11989 // Preserves len
11990 // Leaves s pointing to the address which was in d at start
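// In C, approximately (comment-only sketch; len is in 64-bit words and
// the arrays are treated as 2*len 32-bit words):
//
//   juint *src = (juint *)s, *dst = (juint *)d;
//   for (int i = 0; i < 2 * len; i++)
//     dst[i] = src[2 * len - 1 - i];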
11991 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
11992 assert(tmp1->encoding() < r19->encoding(), "register corruption");
11993 assert(tmp2->encoding() < r19->encoding(), "register corruption");
11994
11995 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
11996 mov(tmp1, len);
11997 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
11998 sub(s, d, len, ext::uxtw, LogBytesPerWord);
11999 }
12000 // where reverse1 copies one 64-bit word from s to d, swapping its two 32-bit halves
12001 void reverse1(Register d, Register s, Register tmp) {
12002 ldr(tmp, pre(s, -wordSize));
12003 ror(tmp, tmp, 32);
12004 str(tmp, post(d, wordSize));
12005 }
12006
12007 void step_squaring() {
12008 // An extra ACC
12009 step();
12010 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12011 }
12012
12013 void last_squaring(RegisterOrConstant i) {
12014 Label dont;
12015 // if ((i & 1) == 0) {
12016 tbnz(i.as_register(), 0, dont); {
12017 // MACC(Ra, Rb, t0, t1, t2);
12018 // Ra = *++Pa;
12019 // Rb = *--Pb;
12020 umulh(Rhi_ab, Ra, Rb);
12021 mul(Rlo_ab, Ra, Rb);
12022 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12023 } bind(dont);
12024 }
12025
12026 void extra_step_squaring() {
12027 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
12028
12029 // MACC(Rm, Rn, t0, t1, t2);
12030 // Rm = *++Pm;
12031 // Rn = *--Pn;
12032 umulh(Rhi_mn, Rm, Rn);
12033 mul(Rlo_mn, Rm, Rn);
12034 ldr(Rm, pre(Pm, wordSize));
12035 ldr(Rn, pre(Pn, -wordSize));
12036 }
12037
12038 void post1_squaring() {
12039 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
12040
12041 // *Pm = Rm = t0 * inv;
12042 mul(Rm, t0, inv);
12043 str(Rm, Address(Pm));
12044
12045 // MACC(Rm, Rn, t0, t1, t2);
12046 // t0 = t1; t1 = t2; t2 = 0;
12047 umulh(Rhi_mn, Rm, Rn);
12048
12049 #ifndef PRODUCT
12050 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
12051 {
12052 mul(Rlo_mn, Rm, Rn);
12053 add(Rlo_mn, t0, Rlo_mn);
12054 Label ok;
12055 cbz(Rlo_mn, ok); {
12056 stop("broken Montgomery multiply");
12057 } bind(ok);
12058 }
12059 #endif
12060 // We have very carefully set things up so that
12061 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
12062 // the lower half of Rm * Rn because we know the result already:
12063 // it must be -t0. t0 + (-t0) must generate a carry iff
12064 // t0 != 0. So, rather than do a mul and an adds we just set
12065 // the carry flag iff t0 is nonzero.
12066 //
12067 // mul(Rlo_mn, Rm, Rn);
12068 // adds(zr, t0, Rlo_mn);
12069 subs(zr, t0, 1); // Set carry iff t0 is nonzero
12070 adcs(t0, t1, Rhi_mn);
12071 adc(t1, t2, zr);
12072 mov(t2, zr);
12073 }
12074
12075 void acc(Register Rhi, Register Rlo,
12076 Register t0, Register t1, Register t2) {
12077 adds(t0, t0, Rlo);
12078 adcs(t1, t1, Rhi);
12079 adc(t2, t2, zr);
12080 }
12081
12082 public:
12083 /**
12084 * Fast Montgomery multiplication. The derivation of the
12085 * algorithm is in A Cryptographic Library for the Motorola
12086 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
12087 *
12088 * Arguments:
12089 *
12090 * Inputs for multiplication:
12091 * c_rarg0 - int array elements a
12092 * c_rarg1 - int array elements b
12093 * c_rarg2 - int array elements n (the modulus)
12094 * c_rarg3 - int length
12095 * c_rarg4 - int inv
12096 * c_rarg5 - int array elements m (the result)
12097 *
12098 * Inputs for squaring:
12099 * c_rarg0 - int array elements a
12100 * c_rarg1 - int array elements n (the modulus)
12101 * c_rarg2 - int length
12102 * c_rarg3 - int inv
12103 * c_rarg4 - int array elements m (the result)
12104 *
12105 */
12106 address generate_multiply() {
12107 Label argh, nothing;
12108
12109 align(CodeEntryAlignment);
12110 address entry = pc();
12111
12112 cbzw(Rlen, nothing);
12113
12114 enter();
12115
12116 // Make room.
12117 cmpw(Rlen, 512);
12118 br(Assembler::HI, argh);
12119 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
12120 andr(sp, Ra, -2 * wordSize);
12121
12122 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
12123
12124 {
12125 // Copy input args, reversing as we go. We use Ra as a
12126 // temporary variable.
12127 reverse(Ra, Pa_base, Rlen, t0, t1);
12128 if (!_squaring)
12129 reverse(Ra, Pb_base, Rlen, t0, t1);
12130 reverse(Ra, Pn_base, Rlen, t0, t1);
12131 }
12132
12133 // Push all call-saved registers and also Pm_base which we'll need
12134 // at the end.
12135 save_regs();
12136
12137 #ifndef PRODUCT
12138 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
12139 {
12140 ldr(Rn, Address(Pn_base, 0));
12141 mul(Rlo_mn, Rn, inv);
12142 subs(zr, Rlo_mn, -1);
12143 Label ok;
12144 br(EQ, ok); {
12145 stop("broken inverse in Montgomery multiply");
12146 } bind(ok);
12147 }
12148 #endif
12149
12150 mov(Pm_base, Ra);
12151
12152 mov(t0, zr);
12153 mov(t1, zr);
12154 mov(t2, zr);
12155
12156 block_comment("for (int i = 0; i < len; i++) {");
12157 mov(Ri, zr); {
12158 Label loop, end;
12159 cmpw(Ri, Rlen);
12160 br(Assembler::GE, end);
12161
12162 bind(loop);
12163 pre1(Ri);
12164
12165 block_comment(" for (j = i; j; j--) {"); {
12166 movw(Rj, Ri);
12167 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12168 } block_comment(" } // j");
12169
12170 post1();
12171 addw(Ri, Ri, 1);
12172 cmpw(Ri, Rlen);
12173 br(Assembler::LT, loop);
12174 bind(end);
12175 block_comment("} // i");
12176 }
12177
12178 block_comment("for (int i = len; i < 2*len; i++) {");
12179 mov(Ri, Rlen); {
12180 Label loop, end;
12181 cmpw(Ri, Rlen, Assembler::LSL, 1);
12182 br(Assembler::GE, end);
12183
12184 bind(loop);
12185 pre2(Ri, Rlen);
12186
12187 block_comment(" for (j = len*2-i-1; j; j--) {"); {
12188 lslw(Rj, Rlen, 1);
12189 subw(Rj, Rj, Ri);
12190 subw(Rj, Rj, 1);
12191 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12192 } block_comment(" } // j");
12193
12194 post2(Ri, Rlen);
12195 addw(Ri, Ri, 1);
12196 cmpw(Ri, Rlen, Assembler::LSL, 1);
12197 br(Assembler::LT, loop);
12198 bind(end);
12199 }
12200 block_comment("} // i");
12201
12202 normalize(Rlen);
12203
12204 mov(Ra, Pm_base); // Save Pm_base in Ra
12205 restore_regs(); // Restore caller's Pm_base
12206
12207 // Copy our result into caller's Pm_base
12208 reverse(Pm_base, Ra, Rlen, t0, t1);
12209
12210 leave();
12211 bind(nothing);
12212 ret(lr);
12213
12214 // handler for error case
12215 bind(argh);
12216 stop("MontgomeryMultiply total_allocation must be <= 8192");
12217
12218 return entry;
12219 }
12220 // In C, approximately:
12221
12222 // void
12223 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
12224 // julong Pn_base[], julong Pm_base[],
12225 // julong inv, int len) {
12226 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12227 // julong *Pa, *Pb, *Pn, *Pm;
12228 // julong Ra, Rb, Rn, Rm;
12229
12230 // int i;
12231
12232 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12233
12234 // for (i = 0; i < len; i++) {
12235 // int j;
12236
12237 // Pa = Pa_base;
12238 // Pb = Pb_base + i;
12239 // Pm = Pm_base;
12240 // Pn = Pn_base + i;
12241
12242 // Ra = *Pa;
12243 // Rb = *Pb;
12244 // Rm = *Pm;
12245 // Rn = *Pn;
12246
12247 // int iters = i;
12248 // for (j = 0; iters--; j++) {
12249 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12250 // MACC(Ra, Rb, t0, t1, t2);
12251 // Ra = *++Pa;
12252 // Rb = *--Pb;
12253 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12254 // MACC(Rm, Rn, t0, t1, t2);
12255 // Rm = *++Pm;
12256 // Rn = *--Pn;
12257 // }
12258
12259 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
12260 // MACC(Ra, Rb, t0, t1, t2);
12261 // *Pm = Rm = t0 * inv;
12262 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12263 // MACC(Rm, Rn, t0, t1, t2);
12264
12265 // assert(t0 == 0, "broken Montgomery multiply");
12266
12267 // t0 = t1; t1 = t2; t2 = 0;
12268 // }
12269
12270 // for (i = len; i < 2*len; i++) {
12271 // int j;
12272
12273 // Pa = Pa_base + i-len;
12274 // Pb = Pb_base + len;
12275 // Pm = Pm_base + i-len;
12276 // Pn = Pn_base + len;
12277
12278 // Ra = *++Pa;
12279 // Rb = *--Pb;
12280 // Rm = *++Pm;
12281 // Rn = *--Pn;
12282
12283 // int iters = len*2-i-1;
12284 // for (j = i-len+1; iters--; j++) {
12285 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12286 // MACC(Ra, Rb, t0, t1, t2);
12287 // Ra = *++Pa;
12288 // Rb = *--Pb;
12289 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12290 // MACC(Rm, Rn, t0, t1, t2);
12291 // Rm = *++Pm;
12292 // Rn = *--Pn;
12293 // }
12294
12295 // Pm_base[i-len] = t0;
12296 // t0 = t1; t1 = t2; t2 = 0;
12297 // }
12298
12299 // while (t0)
12300 // t0 = sub(Pm_base, Pn_base, t0, len);
12301 // }
12302
12303 /**
12304 * Fast Montgomery squaring. This uses asymptotically 25% fewer
12305 * multiplies than Montgomery multiplication so it should be up to
12306 * 25% faster. However, its loop control is more complex and it
12307 * may actually run slower on some machines.
12308 *
12309 * Arguments:
12310 *
12311 * Inputs:
12312 * c_rarg0 - int array elements a
12313 * c_rarg1 - int array elements n (the modulus)
12314 * c_rarg2 - int length
12315 * c_rarg3 - int inv
12316 * c_rarg4 - int array elements m (the result)
12317 *
12318 */
12319 address generate_square() {
12320 Label argh;
12321
12322 align(CodeEntryAlignment);
12323 address entry = pc();
12324
12325 enter();
12326
12327 // Make room.
12328 cmpw(Rlen, 512);
12329 br(Assembler::HI, argh);
12330 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
12331 andr(sp, Ra, -2 * wordSize);
12332
12333 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
12334
12335 {
12336 // Copy input args, reversing as we go. We use Ra as a
12337 // temporary variable.
12338 reverse(Ra, Pa_base, Rlen, t0, t1);
12339 reverse(Ra, Pn_base, Rlen, t0, t1);
12340 }
12341
12342 // Push all call-saved registers and also Pm_base which we'll need
12343 // at the end.
12344 save_regs();
12345
12346 mov(Pm_base, Ra);
12347
12348 mov(t0, zr);
12349 mov(t1, zr);
12350 mov(t2, zr);
12351
12352 block_comment("for (int i = 0; i < len; i++) {");
12353 mov(Ri, zr); {
12354 Label loop, end;
12355 bind(loop);
12356 cmp(Ri, Rlen);
12357 br(Assembler::GE, end);
12358
12359 pre1(Ri);
12360
12361 block_comment("for (j = (i+1)/2; j; j--) {"); {
12362 add(Rj, Ri, 1);
12363 lsr(Rj, Rj, 1);
12364 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12365 } block_comment(" } // j");
12366
12367 last_squaring(Ri);
12368
12369 block_comment(" for (j = i/2; j; j--) {"); {
12370 lsr(Rj, Ri, 1);
12371 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12372 } block_comment(" } // j");
12373
12374 post1_squaring();
12375 add(Ri, Ri, 1);
12376 cmp(Ri, Rlen);
12377 br(Assembler::LT, loop);
12378
12379 bind(end);
12380 block_comment("} // i");
12381 }
12382
12383 block_comment("for (int i = len; i < 2*len; i++) {");
12384 mov(Ri, Rlen); {
12385 Label loop, end;
12386 bind(loop);
12387 cmp(Ri, Rlen, Assembler::LSL, 1);
12388 br(Assembler::GE, end);
12389
12390 pre2(Ri, Rlen);
12391
12392 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
12393 lsl(Rj, Rlen, 1);
12394 sub(Rj, Rj, Ri);
12395 sub(Rj, Rj, 1);
12396 lsr(Rj, Rj, 1);
12397 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
12398 } block_comment(" } // j");
12399
12400 last_squaring(Ri);
12401
12402 block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
12403 lsl(Rj, Rlen, 1);
12404 sub(Rj, Rj, Ri);
12405 lsr(Rj, Rj, 1);
12406 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
12407 } block_comment(" } // j");
12408
12409 post2(Ri, Rlen);
12410 add(Ri, Ri, 1);
12411 cmp(Ri, Rlen, Assembler::LSL, 1);
12412
12413 br(Assembler::LT, loop);
12414 bind(end);
12415 block_comment("} // i");
12416 }
12417
12418 normalize(Rlen);
12419
12420 mov(Ra, Pm_base); // Save Pm_base in Ra
12421 restore_regs(); // Restore caller's Pm_base
12422
12423 // Copy our result into caller's Pm_base
12424 reverse(Pm_base, Ra, Rlen, t0, t1);
12425
12426 leave();
12427 ret(lr);
12428
12429 // handler for error case
12430 bind(argh);
12431 stop("MontgomeryMultiply total_allocation must be <= 8192");
12432
12433 return entry;
12434 }
12435 // In C, approximately:
12436
12437 // void
12438 // montgomery_square(julong Pa_base[], julong Pn_base[],
12439 // julong Pm_base[], julong inv, int len) {
12440 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12441 // julong *Pa, *Pb, *Pn, *Pm;
12442 // julong Ra, Rb, Rn, Rm;
12443
12444 // int i;
12445
12446 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12447
12448 // for (i = 0; i < len; i++) {
12449 // int j;
12450
12451 // Pa = Pa_base;
12452 // Pb = Pa_base + i;
12453 // Pm = Pm_base;
12454 // Pn = Pn_base + i;
12455
12456 // Ra = *Pa;
12457 // Rb = *Pb;
12458 // Rm = *Pm;
12459 // Rn = *Pn;
12460
12461 // int iters = (i+1)/2;
12462 // for (j = 0; iters--; j++) {
12463 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12464 // MACC2(Ra, Rb, t0, t1, t2);
12465 // Ra = *++Pa;
12466 // Rb = *--Pb;
12467 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12468 // MACC(Rm, Rn, t0, t1, t2);
12469 // Rm = *++Pm;
12470 // Rn = *--Pn;
12471 // }
12472 // if ((i & 1) == 0) {
12473 // assert(Ra == Pa_base[j], "must be");
12474 // MACC(Ra, Ra, t0, t1, t2);
12475 // }
12476 // iters = i/2;
12477 // assert(iters == i-j, "must be");
12478 // for (; iters--; j++) {
12479 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12480 // MACC(Rm, Rn, t0, t1, t2);
12481 // Rm = *++Pm;
12482 // Rn = *--Pn;
12483 // }
12484
12485 // *Pm = Rm = t0 * inv;
12486 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12487 // MACC(Rm, Rn, t0, t1, t2);
12488
12489 // assert(t0 == 0, "broken Montgomery multiply");
12490
12491 // t0 = t1; t1 = t2; t2 = 0;
12492 // }
12493
12494 // for (i = len; i < 2*len; i++) {
12495 // int start = i-len+1;
12496 // int end = start + (len - start)/2;
12497 // int j;
12498
12499 // Pa = Pa_base + i-len;
12500 // Pb = Pa_base + len;
12501 // Pm = Pm_base + i-len;
12502 // Pn = Pn_base + len;
12503
12504 // Ra = *++Pa;
12505 // Rb = *--Pb;
12506 // Rm = *++Pm;
12507 // Rn = *--Pn;
12508
12509 // int iters = (2*len-i-1)/2;
12510 // assert(iters == end-start, "must be");
12511 // for (j = start; iters--; j++) {
12512 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12513 // MACC2(Ra, Rb, t0, t1, t2);
12514 // Ra = *++Pa;
12515 // Rb = *--Pb;
12516 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12517 // MACC(Rm, Rn, t0, t1, t2);
12518 // Rm = *++Pm;
12519 // Rn = *--Pn;
12520 // }
12521 // if ((i & 1) == 0) {
12522 // assert(Ra == Pa_base[j], "must be");
12523 // MACC(Ra, Ra, t0, t1, t2);
12524 // }
12525 // iters = (2*len-i)/2;
12526 // assert(iters == len-j, "must be");
12527 // for (; iters--; j++) {
12528 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12529 // MACC(Rm, Rn, t0, t1, t2);
12530 // Rm = *++Pm;
12531 // Rn = *--Pn;
12532 // }
12533 // Pm_base[i-len] = t0;
12534 // t0 = t1; t1 = t2; t2 = 0;
12535 // }
12536
12537 // while (t0)
12538 // t0 = sub(Pm_base, Pn_base, t0, len);
12539 // }
12540 };
12541
12542 // Initialization
12543 void generate_preuniverse_stubs() {
12544 // preuniverse stubs are not needed for aarch64
12545 }
12546
12547 void generate_initial_stubs() {
12548 // Generate initial stubs and initialize the entry points
12549
    // Entry points that exist on all platforms. Note: This is code
    // that could be shared among different platforms - however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
12555
12556 StubRoutines::_forward_exception_entry = generate_forward_exception();
12557
12558 StubRoutines::_call_stub_entry =
12559 generate_call_stub(StubRoutines::_call_stub_return_address);
12560
    // This entry point is referenced by megamorphic calls.
12562 StubRoutines::_catch_exception_entry = generate_catch_exception();
12563
12564 // Initialize table for copy memory (arraycopy) check.
12565 if (UnsafeMemoryAccess::_table == nullptr) {
12566 UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
12567 }
12568
12569 if (UseCRC32Intrinsics) {
12570 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
12571 }
12572
12573 if (UseCRC32CIntrinsics) {
12574 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
12575 }
12576
12577 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
12578 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
12579 }
12580
12581 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
12582 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
12583 }
12584
12585 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
12586 vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
12587 StubRoutines::_hf2f = generate_float16ToFloat();
12588 StubRoutines::_f2hf = generate_floatToFloat16();
12589 }
12590 }
12591
12592 void generate_continuation_stubs() {
12593 // Continuation stubs:
12594 StubRoutines::_cont_thaw = generate_cont_thaw();
12595 StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
12596 StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
12597 StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
12598 }
12599
12600 void generate_final_stubs() {
12601 // support for verify_oop (must happen after universe_init)
12602 if (VerifyOops) {
12603 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
12604 }
12605
12606 // arraycopy stubs used by compilers
12607 generate_arraycopy_stubs();
12608
12609 StubRoutines::_method_entry_barrier = generate_method_entry_barrier();
12610
12611 StubRoutines::aarch64::_spin_wait = generate_spin_wait();
12612
12613 StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
12614 StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();
12615
12616 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
12617
12618 generate_atomic_entry_points();
12619
#endif // LINUX && !__ARM_FEATURE_ATOMICS
12621
12622 #ifdef COMPILER2
12623 if (UseSecondarySupersTable) {
12624 StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
12625 if (! InlineSecondarySupersTest) {
12626 generate_lookup_secondary_supers_table_stub();
12627 }
12628 }
#endif // COMPILER2
12630
12631 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
12632 StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
12633 }
12634
    StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
12636 }
12637
12638 void generate_compiler_stubs() {
12639 #if COMPILER2_OR_JVMCI
12640
12641 if (UseSVE == 0) {
12642 generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
12643 }
12644
12645 // array equals stub for large arrays.
12646 if (!UseSimpleArrayEquals) {
12647 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
12648 }
12649
    // arrays_hashcode stubs for large arrays.
12651 StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
12652 StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
12653 StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
12654 StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
12655 StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);
12656
12657 // byte_array_inflate stub for large arrays.
12658 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
12659
12660 // countPositives stub for large arrays.
12661 StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);
12662
12663 generate_compare_long_strings();
12664
12665 generate_string_indexof_stubs();
12666
12667 #ifdef COMPILER2
12668 if (UseMultiplyToLenIntrinsic) {
12669 StubRoutines::_multiplyToLen = generate_multiplyToLen();
12670 }
12671
12672 if (UseSquareToLenIntrinsic) {
12673 StubRoutines::_squareToLen = generate_squareToLen();
12674 }
12675
12676 if (UseMulAddIntrinsic) {
12677 StubRoutines::_mulAdd = generate_mulAdd();
12678 }
12679
12680 if (UseSIMDForBigIntegerShiftIntrinsics) {
12681 StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
12682 StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift();
12683 }
12684
12685 if (UseMontgomeryMultiplyIntrinsic) {
12686 StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
12687 address start = load_archive_data(stub_id);
12688 if (start == nullptr) {
12689 // we have to generate it
12690 StubCodeMark mark(this, stub_id);
12691 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
12692 start = g.generate_multiply();
12693 // record the stub start and end
12694 store_archive_data(stub_id, start, _masm->pc());
12695 }
12696 StubRoutines::_montgomeryMultiply = start;
12697 }
12698
12699 if (UseMontgomerySquareIntrinsic) {
12700 StubId stub_id = StubId::stubgen_montgomerySquare_id;
12701 address start = load_archive_data(stub_id);
12702 if (start == nullptr) {
12703 // we have to generate it
12704 StubCodeMark mark(this, stub_id);
12705 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
12706 // We use generate_multiply() rather than generate_square()
12707 // because it's faster for the sizes of modulus we care about.
12708 start = g.generate_multiply();
12709 // record the stub start and end
12710 store_archive_data(stub_id, start, _masm->pc());
12711 }
12712 StubRoutines::_montgomerySquare = start;
12713 }
12714
12715 #endif // COMPILER2
12716
12717 if (UseChaCha20Intrinsics) {
12718 StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
12719 }
12720
12721 if (UseKyberIntrinsics) {
12722 StubRoutines::_kyberNtt = generate_kyberNtt();
12723 StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
12724 StubRoutines::_kyberNttMult = generate_kyberNttMult();
12725 StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
12726 StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
12727 StubRoutines::_kyber12To16 = generate_kyber12To16();
12728 StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
12729 }
12730
12731 if (UseDilithiumIntrinsics) {
12732 StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
12733 StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
12734 StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
12735 StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
12736 StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
12737 }
12738
12739 if (UseBASE64Intrinsics) {
12740 StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
12741 StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
12742 }
12743
12744 // data cache line writeback
12745 StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
12746 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
12747
12748 if (UseAESIntrinsics) {
12749 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
12750 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
12751 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
12752 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
12753 StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
12754 }
12755 if (UseGHASHIntrinsics) {
12757 StubRoutines::aarch64::_ghash_processBlocks_small = generate_ghash_processBlocks_small();
12758 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(StubRoutines::aarch64::_ghash_processBlocks_small);
12759 }
12760 if (UseAESIntrinsics && UseGHASHIntrinsics) {
12761 StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
12762 }
12763
12764 if (UseMD5Intrinsics) {
12765 StubRoutines::_md5_implCompress = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
12766 StubRoutines::_md5_implCompressMB = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
12767 }
12768 if (UseSHA1Intrinsics) {
12769 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
12770 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
12771 }
12772 if (UseSHA256Intrinsics) {
12773 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
12774 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
12775 }
12776 if (UseSHA512Intrinsics) {
12777 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
12778 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
12779 }
12780 if (UseSHA3Intrinsics && UseSIMDForSHA3Intrinsic) {
12781 StubRoutines::_double_keccak = generate_double_keccak();
12782 StubRoutines::_sha3_implCompress = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
12783 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
12784 } else if (UseSHA3Intrinsics) {
12785 StubRoutines::_sha3_implCompress = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
12786 StubRoutines::_sha3_implCompressMB = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
12787 }
12788
12789 if (UsePoly1305Intrinsics) {
12790 StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
12791 }
12792
12793 // generate Adler32 intrinsics code
12794 if (UseAdler32Intrinsics) {
12795 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
12796 }
12797
12798 #endif // COMPILER2_OR_JVMCI
12799 }
12800
12801 public:
12802 StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
12803 switch(blob_id) {
12804 case BlobId::stubgen_preuniverse_id:
12805 generate_preuniverse_stubs();
12806 break;
12807 case BlobId::stubgen_initial_id:
12808 generate_initial_stubs();
12809 break;
12810 case BlobId::stubgen_continuation_id:
12811 generate_continuation_stubs();
12812 break;
12813 case BlobId::stubgen_compiler_id:
12814 generate_compiler_stubs();
12815 break;
12816 case BlobId::stubgen_final_id:
12817 generate_final_stubs();
12818 break;
12819 default:
12820 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
12821 break;
12822 };
12823 }
12824
12825 #if INCLUDE_CDS
12826 static void init_AOTAddressTable(GrowableArray<address>& external_addresses) {
12827 // external data defined in this file
12828 #define ADD(addr) external_addresses.append((address)(addr));
12829 ADD(_sha256_round_consts);
12830 ADD(_sha512_round_consts);
12831 ADD(_sha3_round_consts);
12832 ADD(_double_keccak_round_consts);
12833 ADD(_encodeBlock_toBase64);
12834 ADD(_encodeBlock_toBase64URL);
12835 ADD(_decodeBlock_fromBase64ForNoSIMD);
12836 ADD(_decodeBlock_fromBase64URLForNoSIMD);
12837 ADD(_decodeBlock_fromBase64ForSIMD);
12838 ADD(_decodeBlock_fromBase64URLForSIMD);
12839 #undef ADD
12840 }
12841 #endif // INCLUDE_CDS
12842 }; // end class declaration
12843
12844 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
12845 StubGenerator g(code, blob_id, stub_data);
12846 }
12847
12848 #if INCLUDE_CDS
12849 void StubGenerator_init_AOTAddressTable(GrowableArray<address>& addresses) {
12850 StubGenerator::init_AOTAddressTable(addresses);
12851 }
12852 #endif // INCLUDE_CDS
12853
12854 #if defined (LINUX)
12855
12856 // Define pointers to atomic stubs and initialize them to point to the
12857 // code in atomic_aarch64.S.
12858
12859 #define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED) \
12860 extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
12861 (volatile void *ptr, uint64_t arg1, uint64_t arg2); \
12862 aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
12863 = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;
12864
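// For illustration, DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed) below
// pastes together to:
//
//   extern "C" uint64_t aarch64_atomic_fetch_add_4_relaxed_default_impl
//     (volatile void *ptr, uint64_t arg1, uint64_t arg2);
//   aarch64_atomic_stub_t aarch64_atomic_fetch_add_4_relaxed_impl
//     = aarch64_atomic_fetch_add_4_relaxed_default_impl;
//
// That is, each invocation declares the default implementation
// provided by atomic_aarch64.S and defines a function pointer that
// initially refers to it; generate_atomic_entry_points() may later
// repoint the *_impl pointer at a freshly generated stub.
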
12865 DEFAULT_ATOMIC_OP(fetch_add, 4, )
12866 DEFAULT_ATOMIC_OP(fetch_add, 8, )
12867 DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
12868 DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
12869 DEFAULT_ATOMIC_OP(xchg, 4, )
12870 DEFAULT_ATOMIC_OP(xchg, 8, )
12871 DEFAULT_ATOMIC_OP(cmpxchg, 1, )
12872 DEFAULT_ATOMIC_OP(cmpxchg, 4, )
12873 DEFAULT_ATOMIC_OP(cmpxchg, 8, )
12874 DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
12875 DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
12876 DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
12877 DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
12878 DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
12879 DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
12880 DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)
12881
12882 #undef DEFAULT_ATOMIC_OP
12883
12884 #endif // LINUX