1 /*
2 * Copyright (c) 2003, 2026, Oracle and/or its affiliates. All rights reserved.
3 * Copyright (c) 2014, 2025, Red Hat Inc. All rights reserved.
4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5 *
6 * This code is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 only, as
8 * published by the Free Software Foundation.
9 *
10 * This code is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13 * version 2 for more details (a copy is included in the LICENSE file that
14 * accompanied this code).
15 *
16 * You should have received a copy of the GNU General Public License version
17 * 2 along with this work; if not, write to the Free Software Foundation,
18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21 * or visit www.oracle.com if you need additional information or have any
22 * questions.
23 *
24 */
25
26 #include "asm/macroAssembler.hpp"
27 #include "asm/macroAssembler.inline.hpp"
28 #include "asm/register.hpp"
29 #include "atomic_aarch64.hpp"
30 #include "compiler/oopMap.hpp"
31 #include "gc/shared/barrierSet.hpp"
32 #include "gc/shared/barrierSetAssembler.hpp"
33 #include "gc/shared/gc_globals.hpp"
34 #include "gc/shared/tlab_globals.hpp"
35 #include "interpreter/interpreter.hpp"
36 #include "memory/universe.hpp"
37 #include "nativeInst_aarch64.hpp"
38 #include "oops/instanceOop.hpp"
39 #include "oops/method.hpp"
40 #include "oops/objArrayKlass.hpp"
41 #include "oops/oop.inline.hpp"
42 #include "prims/methodHandles.hpp"
43 #include "prims/upcallLinker.hpp"
44 #include "runtime/arguments.hpp"
45 #include "runtime/atomicAccess.hpp"
46 #include "runtime/continuation.hpp"
47 #include "runtime/continuationEntry.inline.hpp"
48 #include "runtime/frame.inline.hpp"
49 #include "runtime/handles.inline.hpp"
50 #include "runtime/javaThread.hpp"
51 #include "runtime/sharedRuntime.hpp"
52 #include "runtime/stubCodeGenerator.hpp"
53 #include "runtime/stubRoutines.hpp"
54 #include "utilities/align.hpp"
55 #include "utilities/checkedCast.hpp"
56 #include "utilities/debug.hpp"
57 #include "utilities/globalDefinitions.hpp"
58 #include "utilities/intpow.hpp"
59 #include "utilities/powerOfTwo.hpp"
60 #ifdef COMPILER2
61 #include "opto/runtime.hpp"
62 #endif
63 #if INCLUDE_ZGC
64 #include "gc/z/zThreadLocalData.hpp"
65 #endif
66
67 // Declaration and definition of StubGenerator (no .hpp file).
68 // For a more detailed description of the stub routine structure
69 // see the comment in stubRoutines.hpp
70
// Shorthand used by the stub generators below: `__ insn(...)` calls
// insn(...) on the current assembler object, _masm.
71 #undef __
72 #define __ _masm->
73
// BLOCK_COMMENT(str) forwards str to the assembler's block_comment(); in
// PRODUCT builds it expands to nothing.
74 #ifdef PRODUCT
75 #define BLOCK_COMMENT(str) /* nothing */
76 #else
77 #define BLOCK_COMMENT(str) __ block_comment(str)
78 #endif
79
// BIND(label) is meant to follow `__` (as in `__ BIND(L);`): it binds the
// label and then passes "<label>:" to BLOCK_COMMENT. Note that it expands to
// two statements, so it must not be the sole body of an unbraced if/else.
80 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
81
82 // Constant data definitions
83
// SHA-256 round constants K[0..63], one 32-bit word per round, listed in
// round order (four per row).
84 static const uint32_t _sha256_round_consts[64] = {
85 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
86 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
87 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
88 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
89 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
90 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
91 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
92 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
93 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
94 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
95 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
96 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
97 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
98 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
99 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
100 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
101 };
102
// SHA-512 round constants K[0..79], one 64-bit word per round, listed in
// round order (three per row).
103 static const uint64_t _sha512_round_consts[80] = {
104 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL,
105 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L,
106 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L,
107 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L,
108 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L,
109 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L,
110 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L,
111 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L,
112 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL,
113 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L,
114 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL,
115 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL,
116 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L,
117 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L,
118 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L,
119 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L,
120 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L,
121 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL,
122 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL,
123 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL,
124 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L,
125 0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L,
126 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL,
127 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL,
128 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL,
129 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL,
130 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L
131 };
132
// Keccak round constants for the 24 rounds of the SHA-3 permutation, one
// 64-bit word per round, listed in round order.
133 static const uint64_t _sha3_round_consts[24] = {
134 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
135 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
136 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
137 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
138 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
139 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
140 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
141 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
142 };
143
// Round constants for the double-Keccak stub. The 24 values are identical,
// entry for entry, to _sha3_round_consts; this is a separate copy.
144 static const uint64_t _double_keccak_round_consts[24] = {
145 0x0000000000000001L, 0x0000000000008082L, 0x800000000000808AL,
146 0x8000000080008000L, 0x000000000000808BL, 0x0000000080000001L,
147 0x8000000080008081L, 0x8000000000008009L, 0x000000000000008AL,
148 0x0000000000000088L, 0x0000000080008009L, 0x000000008000000AL,
149 0x000000008000808BL, 0x800000000000008BL, 0x8000000000008089L,
150 0x8000000000008003L, 0x8000000000008002L, 0x8000000000000080L,
151 0x000000000000800AL, 0x800000008000000AL, 0x8000000080008081L,
152 0x8000000000008080L, 0x0000000080000001L, 0x8000000080008008L
153 };
154
// Base64 encode alphabet: index is the 6-bit value (0..63), entry is the
// output character. Values 62 and 63 map to '+' and '/'.
155 static const char _encodeBlock_toBase64[64] = {
156 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
157 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
158 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
159 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
160 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+', '/'
161 };
162
// URL-safe Base64 encode alphabet: identical to _encodeBlock_toBase64 except
// that values 62 and 63 map to '-' and '_'.
163 static const char _encodeBlock_toBase64URL[64] = {
164 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
165 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
166 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
167 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
168 '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '-', '_'
169 };
170
171 // Non-SIMD lookup tables are mostly dumped from the fromBase64 array used in java.util.Base64,
172 // except that the trailing character '=' is also treated as an illegal value in this intrinsic. That
173 // is, java.util.Base64.fromBase64['='] = -2, while fromBase(URL)64ForNoSIMD['='] = 255 here.
// Standard-alphabet decode table, indexed by the input byte (0..255), 16
// entries per row. An entry is the 6-bit value of that character, or 255 when
// the byte is not in the alphabet ('=' included). '+' -> 62, '/' -> 63.
174 static const uint8_t _decodeBlock_fromBase64ForNoSIMD[256] = {
175 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
176 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
177 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
178 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
179 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
180 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 255u,
181 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
182 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
183 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
184 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
185 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
186 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
187 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
188 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
189 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
190 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
191 };
192
// URL-safe decode table, indexed by the input byte (0..255), 16 entries per
// row. An entry is the 6-bit value of that character, or 255 when the byte is
// not in the alphabet ('=' included). '-' -> 62, '_' -> 63.
193 static const uint8_t _decodeBlock_fromBase64URLForNoSIMD[256] = {
194 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
195 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
196 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
197 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
198 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u, 14u,
199 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u, 63u,
200 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u, 40u,
201 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u, 255u,
202 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
203 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
204 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
205 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
206 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
207 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
208 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
209 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
210 };
211
212 // A legal value of base64 code is in range [0, 127]. We need two lookups
213 // with tbl/tbx and combine them to get the decode data. The 1st table vector
214 // lookup use tbl, out of range indices are set to 0 in destination. The 2nd
215 // table vector lookup use tbx, out of range indices are unchanged in
216 // destination. Input [64..126] is mapped to index [65, 127] in second lookup.
217 // The value of index 64 is set to 0, so that we know that we already get the
218 // decoded data with the 1st lookup.
// Standard-alphabet decode table for the two-lookup SIMD scheme, 16 entries
// per row. Entries [0..63] are indexed directly by input bytes 0..63; entry
// 64 is 0; entries [65..127] hold the values for input bytes 64..126 (shifted
// up by one). 255 marks a byte that is not in the alphabet.
219 static const uint8_t _decodeBlock_fromBase64ForSIMD[128] = {
220 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
221 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
222 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u, 255u, 63u,
223 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
224 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
225 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
226 255u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
227 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
228 };
229
// URL-safe decode table for the two-lookup SIMD scheme, laid out like
// _decodeBlock_fromBase64ForSIMD: entries [0..63] are indexed directly,
// entry 64 is 0, entries [65..127] hold the values for input bytes 64..126.
// Here '-' (entry 45) -> 62 and '_' (input 95, stored at entry 96) -> 63.
230 static const uint8_t _decodeBlock_fromBase64URLForSIMD[128] = {
231 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
232 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u,
233 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 255u, 62u, 255u, 255u,
234 52u, 53u, 54u, 55u, 56u, 57u, 58u, 59u, 60u, 61u, 255u, 255u, 255u, 255u, 255u, 255u,
235 0u, 255u, 0u, 1u, 2u, 3u, 4u, 5u, 6u, 7u, 8u, 9u, 10u, 11u, 12u, 13u,
236 14u, 15u, 16u, 17u, 18u, 19u, 20u, 21u, 22u, 23u, 24u, 25u, 255u, 255u, 255u, 255u,
237 63u, 255u, 26u, 27u, 28u, 29u, 30u, 31u, 32u, 33u, 34u, 35u, 36u, 37u, 38u, 39u,
238 40u, 41u, 42u, 43u, 44u, 45u, 46u, 47u, 48u, 49u, 50u, 51u, 255u, 255u, 255u, 255u,
239 };
240
241
242 // Stub Code definitions
243
244 class StubGenerator: public StubCodeGenerator {
245 private:
246
247 #ifdef PRODUCT
248 #define inc_counter_np(counter) ((void)0)
249 #else
250 void inc_counter_np_(uint& counter) {
251 __ incrementw(ExternalAddress((address)&counter));
252 }
253 #define inc_counter_np(counter) \
254 BLOCK_COMMENT("inc_counter " #counter); \
255 inc_counter_np_(counter);
256 #endif
257
258 // Call stubs are used to call Java from C
259 //
260 // Arguments:
261 // c_rarg0: call wrapper address address
262 // c_rarg1: result address
263 // c_rarg2: result type BasicType
264 // c_rarg3: method Method*
265 // c_rarg4: (interpreter) entry point address
266 // c_rarg5: parameters intptr_t*
267 // c_rarg6: parameter size (in words) int
268 // c_rarg7: thread Thread*
269 //
270 // There is no return from the stub itself as any Java result
271 // is written to result
272 //
273 // we save r30 (lr) as the return PC at the base of the frame and
274 // link r29 (fp) below it as the frame pointer installing sp (r31)
275 // into fp.
276 //
277 // we save r0-r7, which accounts for all the c arguments.
278 //
279 // TODO: strictly do we need to save them all? they are treated as
280 // volatile by C so could we omit saving the ones we are going to
281 // place in global registers (thread? method?) or those we only use
282 // during setup of the Java call?
283 //
284 // we don't need to save r8 which C uses as an indirect result location
285 // return register.
286 //
287 // we don't need to save r9-r15 which both C and Java treat as
288 // volatile
289 //
290 // we don't need to save r16-18 because Java does not use them
291 //
292 // we save r19-r28 which Java uses as scratch registers and C
293 // expects to be callee-save
294 //
295 // we save the bottom 64 bits of each value stored in v8-v15; it is
296 // the responsibility of the caller to preserve larger values.
297 //
298 // so the stub frame looks like this when we enter Java code
299 //
300 // [ return_from_Java ] <--- sp
301 // [ argument word n ]
302 // ...
303 // -29 [ argument word 1 ]
304 // -28 [ saved Floating-point Control Register ] <--- sp_after_call
305 // -26 [ saved v15 ]
306 // -25 [ saved v14 ]
307 // -24 [ saved v13 ]
308 // -23 [ saved v12 ]
309 // -22 [ saved v11 ]
310 // -21 [ saved v10 ]
311 // -20 [ saved v9 ]
312 // -19 [ saved v8 ]
313 // -18 [ saved r28 ]
314 // -17 [ saved r27 ]
315 // -16 [ saved r26 ]
316 // -15 [ saved r25 ]
317 // -14 [ saved r24 ]
318 // -13 [ saved r23 ]
319 // -12 [ saved r22 ]
320 // -11 [ saved r21 ]
321 // -10 [ saved r20 ]
322 // -9 [ saved r19 ]
323 // -8 [ call wrapper (r0) ]
324 // -7 [ result (r1) ]
325 // -6 [ result type (r2) ]
326 // -5 [ method (r3) ]
327 // -4 [ entry point (r4) ]
328 // -3 [ parameters (r5) ]
329 // -2 [ parameter size (r6) ]
330 // -1 [ thread (r7) ]
331 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
332 // 1 [ saved lr (r30) ]
333
334 // Call stub stack layout word offsets from fp
// Slot offsets, in words relative to rfp, of the call stub frame built by
// generate_call_stub(). Several names denote the lower-addressed slot of a
// two-word pair that the stub accesses with a single stp/ldp; the partner
// register occupies the next higher word and has no name of its own.
335 enum call_stub_layout {
336 sp_after_call_off = -28, // sp is moved here right after enter()
337
338 fpcr_off = sp_after_call_off, // saved FPCR shares the lowest slot
339 d15_off = -26, // v15 (v14 at -25)
340 d13_off = -24, // v13 (v12 at -23)
341 d11_off = -22, // v11 (v10 at -21)
342 d9_off = -20, // v9 (v8 at -19)
343
344 r28_off = -18, // r28 (r27 at -17)
345 r26_off = -16, // r26 (r25 at -15)
346 r24_off = -14, // r24 (r23 at -13)
347 r22_off = -12, // r22 (r21 at -11)
348 r20_off = -10, // r20 (r19 at -9)
349 call_wrapper_off = -8, // c_rarg0
350 result_off = -7, // c_rarg1
351 result_type_off = -6, // c_rarg2
352 method_off = -5, // c_rarg3
353 entry_point_off = -4, // c_rarg4 (c_rarg5, the parameters pointer, at -3)
354 parameter_size_off = -2, // c_rarg6
355 thread_off = -1, // c_rarg7
356 fp_f = 0, // saved fp
357 retaddr_off = 1, // saved lr
358 };
359
// Generates the call stub: the code C++ uses to call into Java, with the
// register arguments and frame layout documented above.
//
// Parameters:
//   return_address - out parameter; set to the pc immediately following the
//                    blr to the Java entry point (or to the archived value
//                    when the stub is taken from archive data).
// Returns: the start address of the stub.
360 address generate_call_stub(address& return_address) {
361 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
362 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
363 "adjust this code");
364
365 StubId stub_id = StubId::stubgen_call_stub_id;
366 GrowableArray<address> entries;
367 int entry_count = StubInfo::entry_count(stub_id);
368 assert(entry_count == 2, "sanity check");
// If load_archive_data() supplies the stub, use it as is; its single extra
// entry is the return address.
369 address start = load_archive_data(stub_id, &entries);
370 if (start != nullptr) {
371 assert(entries.length() == 1, "expected 1 extra entry");
372 return_address = entries.at(0);
373 return start;
374 }
375 StubCodeMark mark(this, stub_id);
376 start = __ pc();
377
// rfp-relative addresses of the frame slots (see call_stub_layout)
378 const Address sp_after_call (rfp, sp_after_call_off * wordSize);
379
380 const Address fpcr_save (rfp, fpcr_off * wordSize);
381 const Address call_wrapper (rfp, call_wrapper_off * wordSize);
382 const Address result (rfp, result_off * wordSize);
383 const Address result_type (rfp, result_type_off * wordSize);
384 const Address method (rfp, method_off * wordSize);
385 const Address entry_point (rfp, entry_point_off * wordSize);
386 const Address parameter_size(rfp, parameter_size_off * wordSize);
387
388 const Address thread (rfp, thread_off * wordSize);
389
390 const Address d15_save (rfp, d15_off * wordSize);
391 const Address d13_save (rfp, d13_off * wordSize);
392 const Address d11_save (rfp, d11_off * wordSize);
393 const Address d9_save (rfp, d9_off * wordSize);
394
395 const Address r28_save (rfp, r28_off * wordSize);
396 const Address r26_save (rfp, r26_off * wordSize);
397 const Address r24_save (rfp, r24_off * wordSize);
398 const Address r22_save (rfp, r22_off * wordSize);
399 const Address r20_save (rfp, r20_off * wordSize);
400
401 // stub code
402
// NOTE(review): aarch64_entry is not referenced again in this function.
403 address aarch64_entry = __ pc();
404
405 // set up frame and move sp to end of save area
406 __ enter();
407 __ sub(sp, rfp, -sp_after_call_off * wordSize);
408
409 // save register parameters and Java scratch/global registers
410 // n.b. we save thread even though it gets installed in
411 // rthread because we want to sanity check rthread later
412 __ str(c_rarg7, thread);
413 __ strw(c_rarg6, parameter_size);
414 __ stp(c_rarg4, c_rarg5, entry_point);
415 __ stp(c_rarg2, c_rarg3, result_type);
416 __ stp(c_rarg0, c_rarg1, call_wrapper);
417
// each stp stores the first register at the named slot and the second one
// in the word above it
418 __ stp(r20, r19, r20_save);
419 __ stp(r22, r21, r22_save);
420 __ stp(r24, r23, r24_save);
421 __ stp(r26, r25, r26_save);
422 __ stp(r28, r27, r28_save);
423
424 __ stpd(v9, v8, d9_save);
425 __ stpd(v11, v10, d11_save);
426 __ stpd(v13, v12, d13_save);
427 __ stpd(v15, v14, d15_save);
428
// save the caller's FPCR before modifying it; it is restored on exit
429 __ get_fpcr(rscratch1);
430 __ str(rscratch1, fpcr_save);
431 // Set FPCR to the state we need. We do want Round to Nearest. We
432 // don't want non-IEEE rounding modes or floating-point traps.
433 __ bfi(rscratch1, zr, 22, 4); // Clear DN, FZ, and Rmode
434 __ bfi(rscratch1, zr, 8, 5); // Clear exception-control bits (8-12)
435 __ set_fpcr(rscratch1);
436
437 // install Java thread in global register now we have saved
438 // whatever value it held
439 __ mov(rthread, c_rarg7);
440 // And method
441 __ mov(rmethod, c_rarg3);
442
443 // set up the heapbase register
444 __ reinit_heapbase();
445
446 #ifdef ASSERT
447 // make sure we have no pending exceptions
448 {
449 Label L;
450 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
451 __ cmp(rscratch1, (u1)NULL_WORD);
452 __ br(Assembler::EQ, L);
453 __ stop("StubRoutines::call_stub: entered with pending exception");
454 __ BIND(L);
455 }
456 #endif
457 // pass parameters if any
458 __ mov(esp, sp);
459 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
460 __ andr(sp, rscratch1, -2 * wordSize); // round sp down to a multiple of 2 * wordSize
461
462 BLOCK_COMMENT("pass parameters if any");
463 Label parameters_done;
464 // parameter count is still in c_rarg6
465 // and parameter pointer identifying param 1 is in c_rarg5
466 __ cbzw(c_rarg6, parameters_done);
467
// copy loop: load one parameter word (post-incrementing c_rarg5), decrement
// the count, push the word, and repeat while the count is still > 0
468 address loop = __ pc();
469 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
470 __ subsw(c_rarg6, c_rarg6, 1);
471 __ push(rscratch1);
472 __ br(Assembler::GT, loop);
473
474 __ BIND(parameters_done);
475
476 // call Java entry -- passing Method*, and current sp
477 // rmethod: Method*
478 // r19_sender_sp: sender sp
479 BLOCK_COMMENT("call Java function");
480 __ mov(r19_sender_sp, sp);
481 __ blr(c_rarg4);
482
483 // we do this here because the notify will already have been done
484 // if we get to the next instruction via an exception
485 //
486 // n.b. adding this instruction here affects the calculation of
487 // whether or not a routine returns to the call stub (used when
488 // doing stack walks) since the normal test is to check the return
489 // pc against the address saved below. so we may need to allow for
490 // this extra instruction in the check.
491
492 // save current address for use by exception handling code
493
494 return_address = __ pc();
495 entries.append(return_address);
496
497 // store result depending on type (everything that is not
498 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
499 // n.b. this assumes Java returns an integral result in r0
500 // and a floating result in j_farg0
501 // All of j_rargN may be used to return inline type fields so be careful
502 // not to clobber those.
503 // SharedRuntime::generate_buffered_inline_type_adapter() knows the register
504 // assignment of Rresult below.
505 Register Rresult = r14, Rresult_type = r15;
506 __ ldr(Rresult, result);
507 Label is_long, is_float, is_double, check_prim, exit;
508 __ ldr(Rresult_type, result_type);
509 __ cmp(Rresult_type, (u1)T_OBJECT);
510 __ br(Assembler::EQ, check_prim);
511 __ cmp(Rresult_type, (u1)T_LONG);
512 __ br(Assembler::EQ, is_long);
513 __ cmp(Rresult_type, (u1)T_FLOAT);
514 __ br(Assembler::EQ, is_float);
515 __ cmp(Rresult_type, (u1)T_DOUBLE);
516 __ br(Assembler::EQ, is_double);
517
518 // handle T_INT case
519 __ strw(r0, Address(Rresult));
520
// common exit path; the non-T_INT cases emitted further down branch back here
521 __ BIND(exit);
522
523 // pop parameters
524 __ sub(esp, rfp, -sp_after_call_off * wordSize);
525
526 #ifdef ASSERT
527 // verify that threads correspond
528 {
529 Label L, S;
530 __ ldr(rscratch1, thread);
531 __ cmp(rthread, rscratch1);
532 __ br(Assembler::NE, S);
533 __ get_thread(rscratch1);
534 __ cmp(rthread, rscratch1);
535 __ br(Assembler::EQ, L);
536 __ BIND(S);
537 __ stop("StubRoutines::call_stub: threads must correspond");
538 __ BIND(L);
539 }
540 #endif
541
542 __ pop_cont_fastpath(rthread);
543
544 // restore callee-save registers
545 __ ldpd(v15, v14, d15_save);
546 __ ldpd(v13, v12, d13_save);
547 __ ldpd(v11, v10, d11_save);
548 __ ldpd(v9, v8, d9_save);
549
550 __ ldp(r28, r27, r28_save);
551 __ ldp(r26, r25, r26_save);
552 __ ldp(r24, r23, r24_save);
553 __ ldp(r22, r21, r22_save);
554 __ ldp(r20, r19, r20_save);
555
556 // restore fpcr
557 __ ldr(rscratch1, fpcr_save);
558 __ set_fpcr(rscratch1);
559
// reload the C arguments that were saved on entry
560 __ ldp(c_rarg0, c_rarg1, call_wrapper);
561 __ ldrw(c_rarg2, result_type);
562 __ ldr(c_rarg3, method);
563 __ ldp(c_rarg4, c_rarg5, entry_point);
564 __ ldp(c_rarg6, c_rarg7, parameter_size);
565
566 // leave frame and return to caller
567 __ leave();
568 __ ret(lr);
569
570 // handle return types different from T_INT
571 __ BIND(check_prim);
572 if (InlineTypeReturnedAsFields) {
573 // Check for scalarized return value
// bit 0 of r0 clear: store r0 through the is_long path
574 __ tbz(r0, 0, is_long);
575 // Load pack handler address
576 __ andr(rscratch1, r0, -2);
577 __ ldr(rscratch1, Address(rscratch1, InlineKlass::adr_members_offset()));
578 __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_jobject_offset()));
579 __ blr(rscratch1);
580 __ b(exit);
581 }
582
// also reached by fall-through from check_prim: T_OBJECT results are stored
// as a full 64-bit word, like T_LONG
583 __ BIND(is_long);
584 __ str(r0, Address(Rresult, 0));
585 __ br(Assembler::AL, exit);
586
587 __ BIND(is_float);
588 __ strs(j_farg0, Address(Rresult, 0));
589 __ br(Assembler::AL, exit);
590
591 __ BIND(is_double);
592 __ strd(j_farg0, Address(Rresult, 0));
593 __ br(Assembler::AL, exit);
594
595 // record the stub entry and end plus the auxiliary entry
596 store_archive_data(stub_id, start, __ pc(), &entries);
597
598 return start;
599 }
600
601 // Return point for a Java call if there's an exception thrown in
602 // Java code. The exception is caught and transformed into a
603 // pending exception stored in JavaThread that can be tested from
604 // within the VM.
605 //
606 // Note: Usually the parameters are removed by the callee. In case
607 // of an exception crossing an activation frame boundary, that is
608 // not the case if the callee is compiled code => need to setup the
609 // rsp.
610 //
611 // r0: exception oop
612
// Generates the catch_exception stub: stores the exception oop in r0 as the
// thread's pending exception, records this source file and line as the
// exception's file/line, and branches to the call stub's return address.
// generate_call_stub() must already have run so that
// StubRoutines::_call_stub_return_address is set.
//
// Returns: the start address of the stub.
613 address generate_catch_exception() {
614 StubId stub_id = StubId::stubgen_catch_exception_id;
615 int entry_count = StubInfo::entry_count(stub_id);
616 assert(entry_count == 1, "sanity check");
// use the copy supplied by load_archive_data() when there is one
617 address start = load_archive_data(stub_id);
618 if (start != nullptr) {
619 return start;
620 }
621 StubCodeMark mark(this, stub_id);
622 start = __ pc();
623
624 // same as in generate_call_stub():
// NOTE(review): sp_after_call is not referenced again in this function.
625 const Address sp_after_call(rfp, sp_after_call_off * wordSize);
626 const Address thread (rfp, thread_off * wordSize);
627
628 #ifdef ASSERT
629 // verify that threads correspond
630 {
631 Label L, S;
632 __ ldr(rscratch1, thread);
633 __ cmp(rthread, rscratch1);
634 __ br(Assembler::NE, S);
635 __ get_thread(rscratch1);
636 __ cmp(rthread, rscratch1);
637 __ br(Assembler::EQ, L);
638 __ bind(S);
639 __ stop("StubRoutines::catch_exception: threads must correspond");
640 __ bind(L);
641 }
642 #endif
643
644 // set pending exception
645 __ verify_oop(r0);
646
647 __ str(r0, Address(rthread, Thread::pending_exception_offset()));
648 // special case -- add file name string to AOT address table
649 address file = (address)AOTCodeCache::add_C_string(__FILE__);
650 __ lea(rscratch1, ExternalAddress(file));
651 __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
652 __ movw(rscratch1, (int)__LINE__);
653 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
654
655 // complete return to VM
656 assert(StubRoutines::_call_stub_return_address != nullptr,
657 "_call_stub_return_address must have been generated before");
658 __ b(RuntimeAddress(StubRoutines::_call_stub_return_address));
659
660 // record the stub entry and end
661 store_archive_data(stub_id, start, __ pc());
662
663 return start;
664 }
665
666 // Continuation point for runtime calls returning with a pending
667 // exception. The pending exception check happened in the runtime
668 // or native call stub. The pending exception in Thread is
669 // converted into a Java-level exception.
670 //
671 // Contract with Java-level exception handlers:
672 // r0: exception
673 // r3: throwing pc
674 //
675 // NOTE: At entry of this stub, exception-pc must be in LR !!
676
677 // NOTE: this is always used as a jump target within generated code
678 // so it just needs to be generated code with no x86 prolog
679
// Generates the forward_exception stub. On entry lr holds the throwing pc.
// The stub asks SharedRuntime::exception_handler_for_return_address for the
// handler belonging to that pc, moves the thread's pending exception into r0
// (clearing the pending field), and jumps to the handler with:
//   r0: exception, r3: throwing pc, lr: throwing pc, r19: handler address
//
// Returns: the start address of the stub.
680 address generate_forward_exception() {
681 StubId stub_id = StubId::stubgen_forward_exception_id;
682 int entry_count = StubInfo::entry_count(stub_id);
683 assert(entry_count == 1, "sanity check");
// use the copy supplied by load_archive_data() when there is one
684 address start = load_archive_data(stub_id);
685 if (start != nullptr) {
686 return start;
687 }
688 StubCodeMark mark(this, stub_id);
689 start = __ pc();
690
691 // Upon entry, LR points to the return address returning into
692 // Java (interpreted or compiled) code; i.e., the return address
693 // becomes the throwing pc.
694 //
695 // Arguments pushed before the runtime call are still on the stack
696 // but the exception handler will reset the stack pointer ->
697 // ignore them. A potential result in registers can be ignored as
698 // well.
699
700 #ifdef ASSERT
701 // make sure this code is only executed if there is a pending exception
702 {
703 Label L;
704 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
705 __ cbnz(rscratch1, L);
706 __ stop("StubRoutines::forward exception: no pending exception (1)");
707 __ bind(L);
708 }
709 #endif
710
711 // compute exception handler into r19
712
713 // call the VM to find the handler address associated with the
714 // caller address. pass thread in r0 and caller pc (ret address)
715 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
716 // the stack.
717 __ mov(c_rarg1, lr);
718 // lr will be trashed by the VM call so we move it to R19
719 // (callee-saved) because we also need to pass it to the handler
720 // returned by this call.
721 __ mov(r19, lr);
722 BLOCK_COMMENT("call exception_handler_for_return_address");
723 __ call_VM_leaf(CAST_FROM_FN_PTR(address,
724 SharedRuntime::exception_handler_for_return_address),
725 rthread, c_rarg1);
726 // Reinitialize the ptrue predicate register, in case the external runtime
727 // call clobbers ptrue reg, as we may return to SVE compiled code.
728 __ reinitialize_ptrue();
729
730 // we should not really care that lr is no longer the callee
731 // address. we saved the value the handler needs in r19 so we can
732 // just copy it to r3. however, the C2 handler will push its own
733 // frame and then calls into the VM and the VM code asserts that
734 // the PC for the frame above the handler belongs to a compiled
735 // Java method. So, we restore lr here to satisfy that assert.
736 __ mov(lr, r19);
737 // setup r0 & r3 & clear pending exception
738 __ mov(r3, r19);
// r0 holds the handler address returned by the VM call; move it to r19 so
// that r0 can be reloaded with the exception oop
739 __ mov(r19, r0);
740 __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
741 __ str(zr, Address(rthread, Thread::pending_exception_offset()));
742
743 #ifdef ASSERT
744 // make sure exception is set
745 {
746 Label L;
747 __ cbnz(r0, L);
748 __ stop("StubRoutines::forward exception: no pending exception (2)");
749 __ bind(L);
750 }
751 #endif
752
753 // continue at exception handler
754 // r0: exception
755 // r3: throwing pc
756 // r19: exception handler
757 __ verify_oop(r0);
758 __ br(r19);
759
760 // record the stub entry and end
761 store_archive_data(stub_id, start, __ pc());
762
763 return start;
764 }
765
766 // Non-destructive plausibility checks for oops
767 //
768 // Arguments:
769 // r0: oop to verify
770 // rscratch1: error message
771 //
772 // Stack after saving c_rarg3:
773 // [tos + 0]: saved c_rarg3
774 // [tos + 1]: saved c_rarg2
775 // [tos + 2]: saved lr
776 // [tos + 3]: saved rscratch2
777 // [tos + 4]: saved r0
778 // [tos + 5]: saved rscratch1
// Generates the verify_oop stub. The stub bumps the verify_oop counter and
// returns immediately for a null oop in r0. Otherwise it hands r0 to the
// barrier set assembler's check_oop(). If that check branches to the error
// label, the stub pushes r0..r29 and calls MacroAssembler::debug64 with the
// message from rscratch1, the return address and the saved-register area.
//
// Returns: the start address of the stub.
779 address generate_verify_oop() {
780 StubId stub_id = StubId::stubgen_verify_oop_id;
781 int entry_count = StubInfo::entry_count(stub_id);
782 assert(entry_count == 1, "sanity check");
// use the copy supplied by load_archive_data() when there is one
783 address start = load_archive_data(stub_id);
784 if (start != nullptr) {
785 return start;
786 }
787 StubCodeMark mark(this, stub_id);
788 start = __ pc();
789
790 Label exit, error;
791
792 // save c_rarg2 and c_rarg3
793 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
794
795 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
// plain load / add 1 / store on the counter, using c_rarg2 and c_rarg3 as
// temporaries (both were saved above)
796 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
797 __ ldr(c_rarg3, Address(c_rarg2));
798 __ add(c_rarg3, c_rarg3, 1);
799 __ str(c_rarg3, Address(c_rarg2));
800
801 // object is in r0
802 // make sure object is 'reasonable'
803 __ cbz(r0, exit); // if obj is null it is OK
804
805 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
806 bs_asm->check_oop(_masm, r0, c_rarg2, c_rarg3, error);
807
808 // return if everything seems ok
809 __ bind(exit);
810
811 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
812 __ ret(lr);
813
814 // handle errors
815 __ bind(error);
// restore c_rarg2/c_rarg3 first so the register dump below holds the
// caller's values
816 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
817
818 __ push(RegSet::range(r0, r29), sp);
819 // debug(char* msg, int64_t pc, int64_t regs[])
820 __ mov(c_rarg0, rscratch1); // pass address of error message
821 __ mov(c_rarg1, lr); // pass return address
822 __ mov(c_rarg2, sp); // pass address of regs on stack
823 #ifndef PRODUCT
824 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
825 #endif
826 BLOCK_COMMENT("call MacroAssembler::debug");
827 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
828 __ blr(rscratch1);
// halt in case the debug64 call returns
829 __ hlt(0);
830
831 // record the stub entry and end
832 store_archive_data(stub_id, start, __ pc());
833
834 return start;
835 }
836
837 // Generate indices for iota vector.
838 void generate_iota_indices(StubId stub_id) {
839 GrowableArray<address> entries;
840 int entry_count = StubInfo::entry_count(stub_id);
841 assert(entry_count == VECTOR_IOTA_COUNT, "sanity check");
842 address start = load_archive_data(stub_id, &entries);
843 if (start != nullptr) {
844 assert(entries.length() == entry_count - 1,
845 "unexpected entries count %d", entries.length());
846 StubRoutines::aarch64::_vector_iota_indices[0] = start;
847 for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
848 StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
849 }
850 return;
851 }
852 __ align(CodeEntryAlignment);
853 StubCodeMark mark(this, stub_id);
854 start = __ pc();
855 // B
856 __ emit_data64(0x0706050403020100, relocInfo::none);
857 __ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
858 entries.append(__ pc());
859 // H
860 __ emit_data64(0x0003000200010000, relocInfo::none);
861 __ emit_data64(0x0007000600050004, relocInfo::none);
862 entries.append(__ pc());
863 // S
864 __ emit_data64(0x0000000100000000, relocInfo::none);
865 __ emit_data64(0x0000000300000002, relocInfo::none);
866 entries.append(__ pc());
867 // D
868 __ emit_data64(0x0000000000000000, relocInfo::none);
869 __ emit_data64(0x0000000000000001, relocInfo::none);
870 entries.append(__ pc());
871 // S - FP
872 __ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
873 __ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
874 entries.append(__ pc());
875 // D - FP
876 __ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
877 __ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
878
879 // record the stub entry and end
880 store_archive_data(stub_id, start, __ pc(), &entries);
881
882 // install the entry addresses in the entry array
883 assert(entries.length() == entry_count - 1,
884 "unexpected entries count %d", entries.length());
885 StubRoutines::aarch64::_vector_iota_indices[0] = start;
886 for (int i = 1; i < VECTOR_IOTA_COUNT; i++) {
887 StubRoutines::aarch64::_vector_iota_indices[i] = entries.at(i - 1);
888 }
889 }
890
891 // The inner part of zero_words(). This is the bulk operation,
892 // zeroing words in blocks, possibly using DC ZVA to do it. The
893 // caller is responsible for zeroing the last few words.
894 //
895 // Inputs:
896 // r10: the HeapWord-aligned base address of an array to zero.
897 // r11: the count in HeapWords, r11 > 0.
898 //
899 // Returns r10 and r11, adjusted for the caller to clear.
900 // r10: the base address of the tail of words left to clear.
901 // r11: the number of words in the tail.
902 // r11 < MacroAssembler::zero_words_block_size.
903
904 address generate_zero_blocks() {
905 StubId stub_id = StubId::stubgen_zero_blocks_id;
906 int entry_count = StubInfo::entry_count(stub_id);
907 assert(entry_count == 1, "sanity check");
908 address start = load_archive_data(stub_id);
909 if (start != nullptr) {
910 return start;
911 }
912 __ align(CodeEntryAlignment);
913 StubCodeMark mark(this, stub_id);
914 Label done;
915 Label base_aligned;
916
917 Register base = r10, cnt = r11;
918
919 start = __ pc();
920
921 if (UseBlockZeroing) {
922 int zva_length = VM_Version::zva_length();
923
924 // Ensure ZVA length can be divided by 16. This is required by
925 // the subsequent operations.
926 assert (zva_length % 16 == 0, "Unexpected ZVA Length");
927
928 __ tbz(base, 3, base_aligned);
929 __ str(zr, Address(__ post(base, 8)));
930 __ sub(cnt, cnt, 1);
931 __ bind(base_aligned);
932
933 // Ensure count >= zva_length * 2 so that it still deserves a zva after
934 // alignment.
935 Label small;
936 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
937 __ subs(rscratch1, cnt, low_limit >> 3);
938 __ br(Assembler::LT, small);
939 __ zero_dcache_blocks(base, cnt);
940 __ bind(small);
941 }
942
943 {
944 // Number of stp instructions we'll unroll
945 const int unroll =
946 MacroAssembler::zero_words_block_size / 2;
947 // Clear the remaining blocks.
948 Label loop;
949 __ subs(cnt, cnt, unroll * 2);
950 __ br(Assembler::LT, done);
951 __ bind(loop);
952 for (int i = 0; i < unroll; i++)
953 __ stp(zr, zr, __ post(base, 16));
954 __ subs(cnt, cnt, unroll * 2);
955 __ br(Assembler::GE, loop);
956 __ bind(done);
957 __ add(cnt, cnt, unroll * 2);
958 }
959
960 __ ret(lr);
961
962 // record the stub entry and end
963 store_archive_data(stub_id, start, __ pc());
964
965 return start;
966 }
967
968
// Direction of a bulk copy. The numeric values double as the sign that is
// multiplied into word-sized offsets.
typedef enum copy_direction {
  copy_backwards = -1,
  copy_forwards  = 1
} copy_direction;
973
974 // Helper object to reduce noise when telling the GC barriers how to perform loads and stores
975 // for arraycopy stubs.
976 class ArrayCopyBarrierSetHelper : StackObj {
977 BarrierSetAssembler* _bs_asm;
978 MacroAssembler* _masm;
979 DecoratorSet _decorators;
980 BasicType _type;
981 Register _gct1;
982 Register _gct2;
983 Register _gct3;
984 FloatRegister _gcvt1;
985 FloatRegister _gcvt2;
986 FloatRegister _gcvt3;
987
988 public:
989 ArrayCopyBarrierSetHelper(MacroAssembler* masm,
990 DecoratorSet decorators,
991 BasicType type,
992 Register gct1,
993 Register gct2,
994 Register gct3,
995 FloatRegister gcvt1,
996 FloatRegister gcvt2,
997 FloatRegister gcvt3)
998 : _bs_asm(BarrierSet::barrier_set()->barrier_set_assembler()),
999 _masm(masm),
1000 _decorators(decorators),
1001 _type(type),
1002 _gct1(gct1),
1003 _gct2(gct2),
1004 _gct3(gct3),
1005 _gcvt1(gcvt1),
1006 _gcvt2(gcvt2),
1007 _gcvt3(gcvt3) {
1008 }
1009
1010 void copy_load_at_32(FloatRegister dst1, FloatRegister dst2, Address src) {
1011 _bs_asm->copy_load_at(_masm, _decorators, _type, 32,
1012 dst1, dst2, src,
1013 _gct1, _gct2, _gcvt1);
1014 }
1015
1016 void copy_store_at_32(Address dst, FloatRegister src1, FloatRegister src2) {
1017 _bs_asm->copy_store_at(_masm, _decorators, _type, 32,
1018 dst, src1, src2,
1019 _gct1, _gct2, _gct3, _gcvt1, _gcvt2, _gcvt3);
1020 }
1021
1022 void copy_load_at_16(Register dst1, Register dst2, Address src) {
1023 _bs_asm->copy_load_at(_masm, _decorators, _type, 16,
1024 dst1, dst2, src,
1025 _gct1);
1026 }
1027
1028 void copy_store_at_16(Address dst, Register src1, Register src2) {
1029 _bs_asm->copy_store_at(_masm, _decorators, _type, 16,
1030 dst, src1, src2,
1031 _gct1, _gct2, _gct3);
1032 }
1033
1034 void copy_load_at_8(Register dst, Address src) {
1035 _bs_asm->copy_load_at(_masm, _decorators, _type, 8,
1036 dst, noreg, src,
1037 _gct1);
1038 }
1039
1040 void copy_store_at_8(Address dst, Register src) {
1041 _bs_asm->copy_store_at(_masm, _decorators, _type, 8,
1042 dst, src, noreg,
1043 _gct1, _gct2, _gct3);
1044 }
1045 };
1046
1047 // Bulk copy of blocks of 8 words.
1048 //
1049 // count is a count of words.
1050 //
1051 // Precondition: count >= 8
1052 //
1053 // Postconditions:
1054 //
1055 // The least significant bit of count contains the remaining count
1056 // of words to copy. The rest of count is trash.
1057 //
1058 // s and d are adjusted to point to the remaining words to copy
1059 //
1060 address generate_copy_longs(StubId stub_id, DecoratorSet decorators, Register s, Register d, Register count) {
// A copy_longs stub has a single entry point. If an archived copy of
// the stub is available it is returned and nothing is generated.
1061 int entry_count = StubInfo::entry_count(stub_id);
1062 assert(entry_count == 1, "sanity check");
1063 address start = load_archive_data(stub_id);
1064 if (start != nullptr) {
1065 return start;
1066 }
1067 BasicType type;
1068 copy_direction direction;
1069
// The stub id selects both the copy direction and the element type
// passed on to the GC barrier helper. The oop and oop_uninit variants
// map to the same direction/type pair here; they differ only in the
// decorators supplied by the caller.
1070 switch (stub_id) {
1071 case StubId::stubgen_copy_byte_f_id:
1072 direction = copy_forwards;
1073 type = T_BYTE;
1074 break;
1075 case StubId::stubgen_copy_byte_b_id:
1076 direction = copy_backwards;
1077 type = T_BYTE;
1078 break;
1079 case StubId::stubgen_copy_oop_f_id:
1080 direction = copy_forwards;
1081 type = T_OBJECT;
1082 break;
1083 case StubId::stubgen_copy_oop_b_id:
1084 direction = copy_backwards;
1085 type = T_OBJECT;
1086 break;
1087 case StubId::stubgen_copy_oop_uninit_f_id:
1088 direction = copy_forwards;
1089 type = T_OBJECT;
1090 break;
1091 case StubId::stubgen_copy_oop_uninit_b_id:
1092 direction = copy_backwards;
1093 type = T_OBJECT;
1094 break;
1095 default:
1096 ShouldNotReachHere();
1097 }
1098
// unit is one word step with the sign of the copy direction. bias is
// the size in bytes of the first access (4 words with SIMD, 2 words
// otherwise): forward copies subtract it from s and d below so that
// the first access, at offset +bias, hits the original address.
1099 int unit = wordSize * direction;
1100 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
1101
1102 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
1103 t4 = r7, t5 = r11, t6 = r12, t7 = r13;
1104 const Register stride = r14;
1105 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1106 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1107 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1108
1109 assert_different_registers(rscratch1, rscratch2, t0, t1, t2, t3, t4, t5, t6, t7);
1110 assert_different_registers(s, d, count, rscratch1, rscratch2);
1111
1112 Label again, drain;
1113
1114 __ align(CodeEntryAlignment);
1115
1116 StubCodeMark mark(this, stub_id);
1117
1118 start = __ pc();
1119
// With AvoidUnalignedAccesses a destination that has bit 3 set is
// handled by the separate code path generated after the first ret.
1120 Label unaligned_copy_long;
1121 if (AvoidUnalignedAccesses) {
1122 __ tbnz(d, 3, unaligned_copy_long);
1123 }
1124
1125 if (direction == copy_forwards) {
1126 __ sub(s, s, bias);
1127 __ sub(d, d, bias);
1128 }
1129
1130 #ifdef ASSERT
1131 // Make sure we are never given < 8 words
1132 {
1133 Label L;
1134 __ cmp(count, (u1)8);
1135 __ br(Assembler::GE, L);
1136 __ stop("genrate_copy_longs called with < 8 words");
1137 __ bind(L);
1138 }
1139 #endif
1140
1141 // Fill 8 registers
1142 if (UseSIMDForMemoryOps) {
1143 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
1144 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
1145 } else {
1146 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1147 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1148 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1149 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1150 }
1151
// 8 words are now held in registers. The loop stores those and loads
// the next 8, so it is only entered when count >= 16; otherwise the
// loaded words are simply drained.
1152 __ subs(count, count, 16);
1153 __ br(Assembler::LO, drain);
1154
// For a backwards copy the prefetch distance is negated, and a
// distance above 256 bytes is placed in the stride register rather
// than used as an immediate offset.
// NOTE(review): presumably because of the immediate offset range
// accepted by prfm -- confirm.
1155 int prefetch = PrefetchCopyIntervalInBytes;
1156 bool use_stride = false;
1157 if (direction == copy_backwards) {
1158 use_stride = prefetch > 256;
1159 prefetch = -prefetch;
1160 if (use_stride) __ mov(stride, prefetch);
1161 }
1162
1163 __ bind(again);
1164
1165 if (PrefetchCopyIntervalInBytes > 0)
1166 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1167
// Main loop body: each store of previously loaded registers is
// followed by the load that refills the same registers from the next
// 8 word block.
1168 if (UseSIMDForMemoryOps) {
1169 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
1170 bs.copy_load_at_32(v0, v1, Address(s, 4 * unit));
1171 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
1172 bs.copy_load_at_32(v2, v3, Address(__ pre(s, 8 * unit)));
1173 } else {
1174 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1175 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1176 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
1177 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1178 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
1179 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1180 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
1181 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1182 }
1183
1184 __ subs(count, count, 8);
1185 __ br(Assembler::HS, again);
1186
1187 // Drain
1188 __ bind(drain);
1189 if (UseSIMDForMemoryOps) {
1190 bs.copy_store_at_32(Address(d, 4 * unit), v0, v1);
1191 bs.copy_store_at_32(Address(__ pre(d, 8 * unit)), v2, v3);
1192 } else {
1193 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1194 bs.copy_store_at_16(Address(d, 4 * unit), t2, t3);
1195 bs.copy_store_at_16(Address(d, 6 * unit), t4, t5);
1196 bs.copy_store_at_16(Address(__ pre(d, 8 * unit)), t6, t7);
1197 }
1198
// Only multiples of 8 have been subtracted from count, so its low
// three bits are unchanged: bit 2 says whether a 4 word sub-block is
// left and bit 1 whether a 2 word sub-block is left.
1199 {
1200 Label L1, L2;
1201 __ tbz(count, exact_log2(4), L1);
1202 if (UseSIMDForMemoryOps) {
1203 bs.copy_load_at_32(v0, v1, Address(__ pre(s, 4 * unit)));
1204 bs.copy_store_at_32(Address(__ pre(d, 4 * unit)), v0, v1);
1205 } else {
1206 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1207 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1208 bs.copy_store_at_16(Address(d, 2 * unit), t0, t1);
1209 bs.copy_store_at_16(Address(__ pre(d, 4 * unit)), t2, t3);
1210 }
1211 __ bind(L1);
1212
// Undo the forward bias before the last pair, which is copied with
// addresses produced by adjust() on s and d directly.
1213 if (direction == copy_forwards) {
1214 __ add(s, s, bias);
1215 __ add(d, d, bias);
1216 }
1217
1218 __ tbz(count, 1, L2);
1219 bs.copy_load_at_16(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
1220 bs.copy_store_at_16(Address(__ adjust(d, 2 * unit, direction == copy_backwards)), t0, t1);
1221 __ bind(L2);
1222 }
1223
1224 __ ret(lr);
1225
// Second code path, reached from the tbnz at the stub entry. It always
// uses general purpose register pairs, never the SIMD variants.
1226 if (AvoidUnalignedAccesses) {
1227 Label drain, again;
1228 // Register order for storing. Order is different for backward copy.
1229
1230 __ bind(unaligned_copy_long);
1231
1232 // source address is even aligned, target odd aligned
1233 //
1234 // when forward copying word pairs we read long pairs at offsets
1235 // {0, 2, 4, 6} (in long words). when backwards copying we read
1236 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
1237 // address by -2 in the forwards case so we can compute the
1238 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
1239 // or -1.
1240 //
1241 // when forward copying we need to store 1 word, 3 pairs and
1242 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
1243 // zero offset we adjust the destination by -1 which means we
1244 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
1245 //
1246 // When backwards copying we need to store 1 word, 3 pairs and
1247 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
1248 // offsets {1, 3, 5, 7, 8} * unit.
1249
1250 if (direction == copy_forwards) {
1251 __ sub(s, s, 16);
1252 __ sub(d, d, 8);
1253 }
1254
1255 // Fill 8 registers
1256 //
1257 // for forwards copy s was offset by -16 from the original input
1258 // value of s so the register contents are at these offsets
1259 // relative to the 64 byte block addressed by that original input
1260 // and so on for each successive 64 byte block when s is updated
1261 //
1262 // t0 at offset 0, t1 at offset 8
1263 // t2 at offset 16, t3 at offset 24
1264 // t4 at offset 32, t5 at offset 40
1265 // t6 at offset 48, t7 at offset 56
1266
1267 // for backwards copy s was not offset so the register contents
1268 // are at these offsets into the preceding 64 byte block
1269 // relative to that original input and so on for each successive
1270 // preceding 64 byte block when s is updated. this explains the
1271 // slightly counter-intuitive looking pattern of register usage
1272 // in the stp instructions for backwards copy.
1273 //
1274 // t0 at offset -16, t1 at offset -8
1275 // t2 at offset -32, t3 at offset -24
1276 // t4 at offset -48, t5 at offset -40
1277 // t6 at offset -64, t7 at offset -56
1278
1279 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1280 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1281 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1282 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1283
// Same loop entry test as in the aligned path: loop only when at
// least 8 further words remain to be loaded.
1284 __ subs(count, count, 16);
1285 __ br(Assembler::LO, drain);
1286
1287 int prefetch = PrefetchCopyIntervalInBytes;
1288 bool use_stride = false;
1289 if (direction == copy_backwards) {
1290 use_stride = prefetch > 256;
1291 prefetch = -prefetch;
1292 if (use_stride) __ mov(stride, prefetch);
1293 }
1294
1295 __ bind(again);
1296
1297 if (PrefetchCopyIntervalInBytes > 0)
1298 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1299
1300 if (direction == copy_forwards) {
1301 // allowing for the offset of -8 the store instructions place
1302 // registers into the target 64 byte block at the following
1303 // offsets
1304 //
1305 // t0 at offset 0
1306 // t1 at offset 8, t2 at offset 16
1307 // t3 at offset 24, t4 at offset 32
1308 // t5 at offset 40, t6 at offset 48
1309 // t7 at offset 56
1310
1311 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1312 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1313 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1314 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1315 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1316 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1317 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1318 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1319 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1320 } else {
1321 // d was not offset when we started so the registers are
1322 // written into the 64 byte block preceding d with the following
1323 // offsets
1324 //
1325 // t1 at offset -8
1326 // t3 at offset -24, t0 at offset -16
1327 // t5 at offset -40, t2 at offset -32
1328 // t7 at offset -56, t4 at offset -48
1329 // t6 at offset -64
1330 //
1331 // note that this matches the offsets previously noted for the
1332 // loads
1333
1334 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1335 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1336 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1337 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1338 bs.copy_load_at_16(t2, t3, Address(s, 4 * unit));
1339 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1340 bs.copy_load_at_16(t4, t5, Address(s, 6 * unit));
1341 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1342 bs.copy_load_at_16(t6, t7, Address(__ pre(s, 8 * unit)));
1343 }
1344
1345 __ subs(count, count, 8);
1346 __ br(Assembler::HS, again);
1347
1348 // Drain
1349 //
1350 // this uses the same pattern of offsets and register arguments
1351 // as above
1352 __ bind(drain);
1353 if (direction == copy_forwards) {
1354 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1355 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1356 bs.copy_store_at_16(Address(d, 4 * unit), t3, t4);
1357 bs.copy_store_at_16(Address(d, 6 * unit), t5, t6);
1358 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t7);
1359 } else {
1360 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1361 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1362 bs.copy_store_at_16(Address(d, 5 * unit), t5, t2);
1363 bs.copy_store_at_16(Address(d, 7 * unit), t7, t4);
1364 bs.copy_store_at_8(Address(__ pre(d, 8 * unit)), t6);
1365 }
1366 // now we need to copy any remaining part block which may
1367 // include a 4 word block subblock and/or a 2 word subblock.
1368 // bits 2 and 1 in the count are the tell-tale for whether we
1369 // have each such subblock
1370 {
1371 Label L1, L2;
1372 __ tbz(count, exact_log2(4), L1);
1373 // this is the same as above but copying only 4 longs hence
1374 // with only one intervening stp between the str instructions
1375 // but note that the offsets and registers still follow the
1376 // same pattern
1377 bs.copy_load_at_16(t0, t1, Address(s, 2 * unit));
1378 bs.copy_load_at_16(t2, t3, Address(__ pre(s, 4 * unit)));
1379 if (direction == copy_forwards) {
1380 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1381 bs.copy_store_at_16(Address(d, 2 * unit), t1, t2);
1382 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t3);
1383 } else {
1384 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1385 bs.copy_store_at_16(Address(d, 3 * unit), t3, t0);
1386 bs.copy_store_at_8(Address(__ pre(d, 4 * unit)), t2);
1387 }
1388 __ bind(L1);
1389
1390 __ tbz(count, 1, L2);
1391 // this is the same as above but copying only 2 longs hence
1392 // there is no intervening stp between the str instructions
1393 // but note that the offset and register patterns are still
1394 // the same
1395 bs.copy_load_at_16(t0, t1, Address(__ pre(s, 2 * unit)));
1396 if (direction == copy_forwards) {
1397 bs.copy_store_at_8(Address(d, 1 * unit), t0);
1398 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t1);
1399 } else {
1400 bs.copy_store_at_8(Address(d, 1 * unit), t1);
1401 bs.copy_store_at_8(Address(__ pre(d, 2 * unit)), t0);
1402 }
1403 __ bind(L2);
1404
1405 // for forwards copy we need to re-adjust the offsets we
1406 // applied so that s and d follow the last words written
1407
1408 if (direction == copy_forwards) {
1409 __ add(s, s, 16);
1410 __ add(d, d, 8);
1411 }
1412
1413 }
1414
1415 __ ret(lr);
1416 }
1417
1418 // record the stub entry and end
1419 store_archive_data(stub_id, start, __ pc());
1420
1421 return start;
1422 }
1423
1424 // Small copy: less than 16 bytes.
1425 //
1426 // NB: Ignores all of the bits of count which represent more than 15
1427 // bytes, so a caller doesn't have to mask them.
1428
1429 void copy_memory_small(DecoratorSet decorators, BasicType type, Register s, Register d, Register count, int step) {
1430 bool is_backwards = step < 0;
1431 size_t granularity = g_uabs(step);
1432 int direction = is_backwards ? -1 : 1;
1433
1434 Label Lword, Lint, Lshort, Lbyte;
1435
1436 assert(granularity
1437 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1438
1439 const Register t0 = r3;
1440 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1441 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, fnoreg, fnoreg, fnoreg);
1442
1443 // ??? I don't know if this bit-test-and-branch is the right thing
1444 // to do. It does a lot of jumping, resulting in several
1445 // mispredicted branches. It might make more sense to do this
1446 // with something like Duff's device with a single computed branch.
1447
1448 __ tbz(count, 3 - exact_log2(granularity), Lword);
1449 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1450 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1451 __ bind(Lword);
1452
1453 if (granularity <= sizeof (jint)) {
1454 __ tbz(count, 2 - exact_log2(granularity), Lint);
1455 __ ldrw(t0, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1456 __ strw(t0, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1457 __ bind(Lint);
1458 }
1459
1460 if (granularity <= sizeof (jshort)) {
1461 __ tbz(count, 1 - exact_log2(granularity), Lshort);
1462 __ ldrh(t0, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1463 __ strh(t0, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1464 __ bind(Lshort);
1465 }
1466
1467 if (granularity <= sizeof (jbyte)) {
1468 __ tbz(count, 0, Lbyte);
1469 __ ldrb(t0, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1470 __ strb(t0, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1471 __ bind(Lbyte);
1472 }
1473 }
1474
1475 // All-singing all-dancing memory copy.
1476 //
1477 // Copy count units of memory from s to d. The size of a unit is
1478 // step, which can be positive or negative depending on the direction
1479 // of copy. If is_aligned is false, we align the source address.
1480 //
1481
1482 void copy_memory(DecoratorSet decorators, BasicType type, bool is_aligned,
1483 Register s, Register d, Register count, int step) {
// The sign of step gives the copy direction; its magnitude is the
// element size in bytes (granularity). count is in elements.
1484 copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1485 bool is_backwards = step < 0;
1486 unsigned int granularity = g_uabs(step);
1487 const Register t0 = r3, t1 = r4;
1488
1489 // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
1490 // load all the data before writing anything
1491 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
1492 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r11;
1493 const Register t6 = r12, t7 = r13, t8 = r14, t9 = r15;
1494 const Register send = r17, dend = r16;
1495 const Register gct1 = rscratch1, gct2 = rscratch2, gct3 = r10;
1496 const FloatRegister gcvt1 = v6, gcvt2 = v7, gcvt3 = v16; // Note that v8-v15 are callee saved
1497 ArrayCopyBarrierSetHelper bs(_masm, decorators, type, gct1, gct2, gct3, gcvt1, gcvt2, gcvt3);
1498
1499 if (PrefetchCopyIntervalInBytes > 0)
1500 __ prfm(Address(s, 0), PLDL1KEEP);
// All size thresholds below are byte counts divided by granularity,
// because count is in elements.
1501 __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
1502 __ br(Assembler::HI, copy_big);
1503
// send/dend: s and d advanced by count elements, i.e. one past the
// last source/destination element. The inline cases copy an
// overlapping head (from s/d) and tail (from send/dend).
1504 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1505 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1506
1507 __ cmp(count, u1(16/granularity));
1508 __ br(Assembler::LS, copy16);
1509
1510 __ cmp(count, u1(64/granularity));
1511 __ br(Assembler::HI, copy80);
1512
1513 __ cmp(count, u1(32/granularity));
1514 __ br(Assembler::LS, copy32);
1515
1516 // 33..64 bytes
1517 if (UseSIMDForMemoryOps) {
1518 bs.copy_load_at_32(v0, v1, Address(s, 0));
1519 bs.copy_load_at_32(v2, v3, Address(send, -32));
1520 bs.copy_store_at_32(Address(d, 0), v0, v1);
1521 bs.copy_store_at_32(Address(dend, -32), v2, v3);
1522 } else {
1523 bs.copy_load_at_16(t0, t1, Address(s, 0));
1524 bs.copy_load_at_16(t2, t3, Address(s, 16));
1525 bs.copy_load_at_16(t4, t5, Address(send, -32));
1526 bs.copy_load_at_16(t6, t7, Address(send, -16));
1527
1528 bs.copy_store_at_16(Address(d, 0), t0, t1);
1529 bs.copy_store_at_16(Address(d, 16), t2, t3);
1530 bs.copy_store_at_16(Address(dend, -32), t4, t5);
1531 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1532 }
1533 __ b(finish);
1534
1535 // 17..32 bytes
1536 __ bind(copy32);
1537 bs.copy_load_at_16(t0, t1, Address(s, 0));
1538 bs.copy_load_at_16(t6, t7, Address(send, -16));
1539
1540 bs.copy_store_at_16(Address(d, 0), t0, t1);
1541 bs.copy_store_at_16(Address(dend, -16), t6, t7);
1542 __ b(finish);
1543
1544 // 65..80/96 bytes
1545 // (96 bytes if SIMD because we do 32 bytes per instruction)
1546 __ bind(copy80);
1547 if (UseSIMDForMemoryOps) {
1548 bs.copy_load_at_32(v0, v1, Address(s, 0));
1549 bs.copy_load_at_32(v2, v3, Address(s, 32));
1550 // Unaligned pointers can be an issue for copying.
1551 // The issue has more chances to happen when granularity of data is
1552 // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
1553 // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
1554 // The most performance drop has been seen for the range 65-80 bytes.
1555 // For such cases using the pair of ldp/stp instead of the third pair of
1556 // ldpq/stpq fixes the performance issue.
1557 if (granularity < sizeof (jint)) {
1558 Label copy96;
1559 __ cmp(count, u1(80/granularity));
1560 __ br(Assembler::HI, copy96);
1561 bs.copy_load_at_16(t0, t1, Address(send, -16));
1562
1563 bs.copy_store_at_32(Address(d, 0), v0, v1);
1564 bs.copy_store_at_32(Address(d, 32), v2, v3);
1565
1566 bs.copy_store_at_16(Address(dend, -16), t0, t1);
1567 __ b(finish);
1568
1569 __ bind(copy96);
1570 }
1571 bs.copy_load_at_32(v4, v5, Address(send, -32));
1572
1573 bs.copy_store_at_32(Address(d, 0), v0, v1);
1574 bs.copy_store_at_32(Address(d, 32), v2, v3);
1575
1576 bs.copy_store_at_32(Address(dend, -32), v4, v5);
1577 } else {
1578 bs.copy_load_at_16(t0, t1, Address(s, 0));
1579 bs.copy_load_at_16(t2, t3, Address(s, 16));
1580 bs.copy_load_at_16(t4, t5, Address(s, 32));
1581 bs.copy_load_at_16(t6, t7, Address(s, 48));
1582 bs.copy_load_at_16(t8, t9, Address(send, -16));
1583
1584 bs.copy_store_at_16(Address(d, 0), t0, t1);
1585 bs.copy_store_at_16(Address(d, 16), t2, t3);
1586 bs.copy_store_at_16(Address(d, 32), t4, t5);
1587 bs.copy_store_at_16(Address(d, 48), t6, t7);
1588 bs.copy_store_at_16(Address(dend, -16), t8, t9);
1589 }
1590 __ b(finish);
1591
1592 // 0..16 bytes
1593 __ bind(copy16);
1594 __ cmp(count, u1(8/granularity));
1595 __ br(Assembler::LO, copy8);
1596
1597 // 8..16 bytes
1598 bs.copy_load_at_8(t0, Address(s, 0));
1599 bs.copy_load_at_8(t1, Address(send, -8));
1600 bs.copy_store_at_8(Address(d, 0), t0);
1601 bs.copy_store_at_8(Address(dend, -8), t1);
1602 __ b(finish);
1603
// The sub-word cases are only generated for element sizes that can
// produce them. For wider elements copy8/copy4 are bound at the very
// end of this function instead.
1604 if (granularity < 8) {
1605 // 4..7 bytes
1606 __ bind(copy8);
1607 __ tbz(count, 2 - exact_log2(granularity), copy4);
1608 __ ldrw(t0, Address(s, 0));
1609 __ ldrw(t1, Address(send, -4));
1610 __ strw(t0, Address(d, 0));
1611 __ strw(t1, Address(dend, -4));
1612 __ b(finish);
1613 if (granularity < 4) {
1614 // 0..3 bytes
1615 __ bind(copy4);
1616 __ cbz(count, finish); // get rid of 0 case
1617 if (granularity == 2) {
1618 __ ldrh(t0, Address(s, 0));
1619 __ strh(t0, Address(d, 0));
1620 } else { // granularity == 1
1621 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1622 // the first and last byte.
1623 // Handle the 3 byte case by loading and storing base + count/2
1624 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1625 // This does mean in the 1 byte case we load/store the same
1626 // byte 3 times.
1627 __ lsr(count, count, 1);
1628 __ ldrb(t0, Address(s, 0));
1629 __ ldrb(t1, Address(send, -1));
1630 __ ldrb(t2, Address(s, count));
1631 __ strb(t0, Address(d, 0));
1632 __ strb(t1, Address(dend, -1));
1633 __ strb(t2, Address(d, count));
1634 }
1635 __ b(finish);
1636 }
1637 }
1638
// Large copy. A backwards copy starts from the end, so first move s
// and d past the last element.
1639 __ bind(copy_big);
1640 if (is_backwards) {
1641 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1642 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1643 }
1644
1645 // Now we've got the small case out of the way we can align the
1646 // source address on a 2-word boundary.
1647
1648 // Here we will materialize a count in r15, which is used by copy_memory_small
1649 // and the various generate_copy_longs stubs that we use for 2 word aligned bytes.
1650 // Up until here, we have used t9, which aliases r15, but from here on, that register
1651 // can not be used as a temp register, as it contains the count.
1652
1653 Label aligned;
1654
1655 if (is_aligned) {
1656 // We may have to adjust by 1 word to get s 2-word-aligned.
1657 __ tbz(s, exact_log2(wordSize), aligned);
1658 bs.copy_load_at_8(t0, Address(__ adjust(s, direction * wordSize, is_backwards)));
1659 bs.copy_store_at_8(Address(__ adjust(d, direction * wordSize, is_backwards)), t0);
1660 __ sub(count, count, wordSize/granularity);
1661 } else {
// Bytes to copy before s is 2-word aligned: s mod 16 when copying
// backwards, (-s) mod 16 when copying forwards.
1662 if (is_backwards) {
1663 __ andr(r15, s, 2 * wordSize - 1);
1664 } else {
1665 __ neg(r15, s);
1666 __ andr(r15, r15, 2 * wordSize - 1);
1667 }
1668 // r15 is the byte adjustment needed to align s.
1669 __ cbz(r15, aligned);
// Convert the byte adjustment into elements before taking it off
// count and handing it to copy_memory_small.
1670 int shift = exact_log2(granularity);
1671 if (shift > 0) {
1672 __ lsr(r15, r15, shift);
1673 }
1674 __ sub(count, count, r15);
1675
1676 #if 0
1677 // ?? This code is only correct for a disjoint copy. It may or
1678 // may not make sense to use it in that case.
1679
1680 // Copy the first pair; s and d may not be aligned.
1681 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1682 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1683
1684 // Align s and d, adjust count
1685 if (is_backwards) {
1686 __ sub(s, s, r15);
1687 __ sub(d, d, r15);
1688 } else {
1689 __ add(s, s, r15);
1690 __ add(d, d, r15);
1691 }
1692 #else
1693 copy_memory_small(decorators, type, s, d, r15, step);
1694 #endif
1695 }
1696
1697 __ bind(aligned);
1698
1699 // s is now 2-word-aligned.
1700
1701 // We have a count of units and some trailing bytes. Adjust the
1702 // count and do a bulk copy of words. If the shift is zero
1703 // perform a move instead to benefit from zero latency moves.
1704 int shift = exact_log2(wordSize/granularity);
1705 if (shift > 0) {
1706 __ lsr(r15, count, shift);
1707 } else {
1708 __ mov(r15, count);
1709 }
// Call the bulk copy stub matching the direction, the element type
// and, for oops, whether the destination is uninitialized. The word
// count is passed in r15.
1710 if (direction == copy_forwards) {
1711 if (type != T_OBJECT) {
1712 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_f()));
1713 __ blr(rscratch1);
1714 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1715 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_f()));
1716 __ blr(rscratch1);
1717 } else {
1718 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_f()));
1719 __ blr(rscratch1);
1720 }
1721 } else {
1722 if (type != T_OBJECT) {
1723 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_byte_b()));
1724 __ blr(rscratch1);
1725 } else if ((decorators & IS_DEST_UNINITIALIZED) != 0) {
1726 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_uninit_b()));
1727 __ blr(rscratch1);
1728 } else {
1729 __ lea(rscratch1, RuntimeAddress(StubRoutines::aarch64::copy_oop_b()));
1730 __ blr(rscratch1);
1731 }
1732 }
1733
1734 // And the tail.
1735 copy_memory_small(decorators, type, s, d, count, step);
1736
// For element sizes of 8 (resp. 4 or more) bytes the copy8 (resp.
// copy4) label was not bound in the small-copy code above. In those
// configurations the branches to them are only taken with a zero
// count, so they are bound here, where nothing is left to do.
1737 if (granularity >= 8) __ bind(copy8);
1738 if (granularity >= 4) __ bind(copy4);
1739 __ bind(finish);
1740 }
1741
1742
// Debug-only helper. When ASSERT is defined, emits code that fills every
// call-clobbered general purpose register except rscratch1 with the
// pattern 0xdeadbeefdeadbeef; rscratch1 itself ends up holding the same
// pattern because it is used to build it. Emits nothing otherwise.
void clobber_registers() {
#ifdef ASSERT
  // Build the 64-bit poison value: load the low word, then OR in a copy
  // of it shifted into the high word.
  __ mov(rscratch1, (uint64_t)0xdeadbeef);
  __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);

  RegSet victims = MacroAssembler::call_clobbered_gp_registers() - rscratch1;
  RegSetIterator<Register> cursor = victims.begin();
  while (*cursor != noreg) {
    __ mov(*cursor, rscratch1);
    ++cursor;
  }
#endif
}
1755
1756 // Scan over array at a for count oops, verifying each one.
1757 // Preserves a and count, clobbers rscratch1 and rscratch2.
1758 void verify_oop_array (int size, Register a, Register count, Register temp) {
1759 Label loop, end;
1760 __ mov(rscratch1, a);
1761 __ mov(rscratch2, zr);
1762 __ bind(loop);
1763 __ cmp(rscratch2, count);
1764 __ br(Assembler::HS, end);
1765 if (size == wordSize) {
1766 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1767 __ verify_oop(temp);
1768 } else {
1769 __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1770 __ decode_heap_oop(temp); // calls verify_oop
1771 }
1772 __ add(rscratch2, rscratch2, 1);
1773 __ b(loop);
1774 __ bind(end);
1775 }
1776
1777 // Arguments:
1778 // stub_id - is used to name the stub and identify all details of
1779 // how to perform the copy.
1780 //
1781 // nopush_entry - is assigned to the stub's post push entry point
1782 // unless it is null
1783 //
1784 // Inputs:
1785 // c_rarg0 - source array address
1786 // c_rarg1 - destination array address
1787 // c_rarg2 - element count, treated as ssize_t, can be zero
1788 //
1789 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1790 // the hardware handle it. The two dwords within qwords that span
1791 // cache line boundaries will still be loaded and stored atomically.
1792 //
1793 // Side Effects: nopush_entry is set to the (post push) entry point
1794 // so it can be used by the corresponding conjoint
1795 // copy method
1796 //
1797 address generate_disjoint_copy(StubId stub_id, address *nopush_entry) {
1798 int size;
1799 bool aligned;
1800 bool is_oop;
1801 bool dest_uninitialized;
1802 switch (stub_id) {
1803 case StubId::stubgen_jbyte_disjoint_arraycopy_id:
1804 size = sizeof(jbyte);
1805 aligned = false;
1806 is_oop = false;
1807 dest_uninitialized = false;
1808 break;
1809 case StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id:
1810 size = sizeof(jbyte);
1811 aligned = true;
1812 is_oop = false;
1813 dest_uninitialized = false;
1814 break;
1815 case StubId::stubgen_jshort_disjoint_arraycopy_id:
1816 size = sizeof(jshort);
1817 aligned = false;
1818 is_oop = false;
1819 dest_uninitialized = false;
1820 break;
1821 case StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id:
1822 size = sizeof(jshort);
1823 aligned = true;
1824 is_oop = false;
1825 dest_uninitialized = false;
1826 break;
1827 case StubId::stubgen_jint_disjoint_arraycopy_id:
1828 size = sizeof(jint);
1829 aligned = false;
1830 is_oop = false;
1831 dest_uninitialized = false;
1832 break;
1833 case StubId::stubgen_arrayof_jint_disjoint_arraycopy_id:
1834 size = sizeof(jint);
1835 aligned = true;
1836 is_oop = false;
1837 dest_uninitialized = false;
1838 break;
1839 case StubId::stubgen_jlong_disjoint_arraycopy_id:
1840 // since this is always aligned we can (should!) use the same
1841 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
1842 ShouldNotReachHere();
1843 break;
1844 case StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id:
1845 size = sizeof(jlong);
1846 aligned = true;
1847 is_oop = false;
1848 dest_uninitialized = false;
1849 break;
1850 case StubId::stubgen_oop_disjoint_arraycopy_id:
1851 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1852 aligned = !UseCompressedOops;
1853 is_oop = true;
1854 dest_uninitialized = false;
1855 break;
1856 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_id:
1857 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1858 aligned = !UseCompressedOops;
1859 is_oop = true;
1860 dest_uninitialized = false;
1861 break;
1862 case StubId::stubgen_oop_disjoint_arraycopy_uninit_id:
1863 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1864 aligned = !UseCompressedOops;
1865 is_oop = true;
1866 dest_uninitialized = true;
1867 break;
1868 case StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id:
1869 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1870 aligned = !UseCompressedOops;
1871 is_oop = true;
1872 dest_uninitialized = true;
1873 break;
1874 default:
1875 ShouldNotReachHere();
1876 break;
1877 }
1878 // all stubs provide a 2nd entry which omits the frame push for
1879 // use when bailing out from a conjoint copy. However we may also
1880 // need some extra addressses for memory access protection.
1881 int entry_count = StubInfo::entry_count(stub_id);
1882 assert(entry_count == 2, "sanity check");
1883 assert(nopush_entry != nullptr, "all disjoint copy stubs export a nopush entry");
1884
1885 bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
1886 int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
1887 GrowableArray<address> entries;
1888 GrowableArray<address> extras;
1889 GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
1890 address start = load_archive_data(stub_id, &entries, extras_ptr);
1891 if (start != nullptr) {
1892 assert(entries.length() == entry_count - 1,
1893 "unexpected entries count %d", entries.length());
1894 *nopush_entry = entries.at(0);
1895 assert(extras.length() == extra_count,
1896 "unexpected extra count %d", extras.length());
1897 if (add_extras) {
1898 // register one handler at offset 0
1899 register_unsafe_access_handlers(extras, 0, 1);
1900 }
1901 return start;
1902 }
1903
1904 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1905 RegSet saved_reg = RegSet::of(s, d, count);
1906
1907 __ align(CodeEntryAlignment);
1908 StubCodeMark mark(this, stub_id);
1909 start = __ pc();
1910 __ enter();
1911
1912 *nopush_entry = __ pc();
1913 entries.append(*nopush_entry);
1914
1915 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1916 BLOCK_COMMENT("Post-Push Entry:");
1917
1918 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1919 if (dest_uninitialized) {
1920 decorators |= IS_DEST_UNINITIALIZED;
1921 }
1922 if (aligned) {
1923 decorators |= ARRAYCOPY_ALIGNED;
1924 }
1925
1926 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1927 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);
1928
1929 if (is_oop) {
1930 // save regs before copy_memory
1931 __ push(RegSet::of(d, count), sp);
1932 }
1933 {
1934 // UnsafeMemoryAccess page error: continue after unsafe access
1935 UnsafeMemoryAccessMark umam(this, add_extras, true);
1936 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, size);
1937 }
1938
1939 if (is_oop) {
1940 __ pop(RegSet::of(d, count), sp);
1941 if (VerifyOops)
1942 verify_oop_array(size, d, count, r16);
1943 }
1944
1945 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
1946
1947 __ leave();
1948 __ mov(r0, zr); // return 0
1949 __ ret(lr);
1950
1951 address end = __ pc();
1952
1953 if (add_extras) {
1954 // retrieve the registered handler addresses
1955 retrieve_unsafe_access_handlers(start, end, extras);
1956 assert(extras.length() == extra_count
1957 , "incorrect handlers count %d", extras.length());
1958 }
1959
1960 // record the stub entry and end plus the no_push entry and any
1961 // extra handler addresses
1962 store_archive_data(stub_id, start, end, &entries, extras_ptr);
1963
1964 return start;
1965 }
1966
1967 // Arguments:
1968 // stub_id - is used to name the stub and identify all details of
1969 // how to perform the copy.
1970 //
1971 // nooverlap_target - identifes the (post push) entry for the
1972 // corresponding disjoint copy routine which can be
1973 // jumped to if the ranges do not actually overlap
1974 //
1975 // nopush_entry - is assigned to the stub's post push entry point
1976 // unless it is null
1977 //
1978 //
1979 // Inputs:
1980 // c_rarg0 - source array address
1981 // c_rarg1 - destination array address
1982 // c_rarg2 - element count, treated as ssize_t, can be zero
1983 //
1984 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1985 // the hardware handle it. The two dwords within qwords that span
1986 // cache line boundaries will still be loaded and stored atomically.
1987 //
1988 // Side Effects:
1989 // nopush_entry is set to the no-overlap entry point so it can be
1990 // used by some other conjoint copy method
1991 //
1992 address generate_conjoint_copy(StubId stub_id, address nooverlap_target, address *nopush_entry) {
1993 int size;
1994 bool aligned;
1995 bool is_oop;
1996 bool dest_uninitialized;
1997 switch (stub_id) {
1998 case StubId::stubgen_jbyte_arraycopy_id:
1999 size = sizeof(jbyte);
2000 aligned = false;
2001 is_oop = false;
2002 dest_uninitialized = false;
2003 break;
2004 case StubId::stubgen_arrayof_jbyte_arraycopy_id:
2005 size = sizeof(jbyte);
2006 aligned = true;
2007 is_oop = false;
2008 dest_uninitialized = false;
2009 break;
2010 case StubId::stubgen_jshort_arraycopy_id:
2011 size = sizeof(jshort);
2012 aligned = false;
2013 is_oop = false;
2014 dest_uninitialized = false;
2015 break;
2016 case StubId::stubgen_arrayof_jshort_arraycopy_id:
2017 size = sizeof(jshort);
2018 aligned = true;
2019 is_oop = false;
2020 dest_uninitialized = false;
2021 break;
2022 case StubId::stubgen_jint_arraycopy_id:
2023 size = sizeof(jint);
2024 aligned = false;
2025 is_oop = false;
2026 dest_uninitialized = false;
2027 break;
2028 case StubId::stubgen_arrayof_jint_arraycopy_id:
2029 size = sizeof(jint);
2030 aligned = true;
2031 is_oop = false;
2032 dest_uninitialized = false;
2033 break;
2034 case StubId::stubgen_jlong_arraycopy_id:
2035 // since this is always aligned we can (should!) use the same
2036 // stub as for case StubId::stubgen_arrayof_jlong_disjoint_arraycopy
2037 ShouldNotReachHere();
2038 break;
2039 case StubId::stubgen_arrayof_jlong_arraycopy_id:
2040 size = sizeof(jlong);
2041 aligned = true;
2042 is_oop = false;
2043 dest_uninitialized = false;
2044 break;
2045 case StubId::stubgen_oop_arraycopy_id:
2046 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2047 aligned = !UseCompressedOops;
2048 is_oop = true;
2049 dest_uninitialized = false;
2050 break;
2051 case StubId::stubgen_arrayof_oop_arraycopy_id:
2052 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2053 aligned = !UseCompressedOops;
2054 is_oop = true;
2055 dest_uninitialized = false;
2056 break;
2057 case StubId::stubgen_oop_arraycopy_uninit_id:
2058 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2059 aligned = !UseCompressedOops;
2060 is_oop = true;
2061 dest_uninitialized = true;
2062 break;
2063 case StubId::stubgen_arrayof_oop_arraycopy_uninit_id:
2064 size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
2065 aligned = !UseCompressedOops;
2066 is_oop = true;
2067 dest_uninitialized = true;
2068 break;
2069 default:
2070 ShouldNotReachHere();
2071 }
2072 // only some conjoint stubs generate a 2nd entry
2073 int entry_count = StubInfo::entry_count(stub_id);
2074 int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
2075 assert(entry_count == expected_entry_count,
2076 "expected entry count %d does not match declared entry count %d for stub %s",
2077 expected_entry_count, entry_count, StubInfo::name(stub_id));
2078
2079 // We need to protect memory accesses in certain cases
2080 bool add_extras = !is_oop && (!aligned || sizeof(jlong) == size);
2081 int extra_count = ((add_extras ? 1 : 0) * UnsafeMemoryAccess::COLUMN_COUNT);
2082 GrowableArray<address> entries;
2083 GrowableArray<address> extras;
2084 GrowableArray<address> *entries_ptr = (nopush_entry != nullptr ? &entries : nullptr);
2085 GrowableArray<address> *extras_ptr = (extra_count > 0 ? &extras : nullptr);
2086 address start = load_archive_data(stub_id, entries_ptr, extras_ptr);
2087 if (start != nullptr) {
2088 assert(entries.length() == expected_entry_count - 1,
2089 "unexpected entries count %d", entries.length());
2090 assert(extras.length() == extra_count,
2091 "unexpected extra count %d", extras.length());
2092 if (nopush_entry != nullptr) {
2093 *nopush_entry = entries.at(0);
2094 }
2095 if (add_extras) {
2096 // register one handler at offset 0
2097 register_unsafe_access_handlers(extras, 0, 1);
2098 }
2099 return start;
2100 }
2101
2102 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2103 RegSet saved_regs = RegSet::of(s, d, count);
2104 StubCodeMark mark(this, stub_id);
2105 start = __ pc();
2106 __ enter();
2107
2108 if (nopush_entry != nullptr) {
2109 *nopush_entry = __ pc();
2110 entries.append(*nopush_entry);
2111 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2112 BLOCK_COMMENT("Post-Push Entry:");
2113 }
2114
2115 // use fwd copy when (d-s) above_equal (count*size)
2116 Label L_overlapping;
2117 __ sub(rscratch1, d, s);
2118 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
2119 __ br(Assembler::LO, L_overlapping);
2120 __ b(RuntimeAddress(nooverlap_target));
2121 __ bind(L_overlapping);
2122
2123 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2124 if (dest_uninitialized) {
2125 decorators |= IS_DEST_UNINITIALIZED;
2126 }
2127 if (aligned) {
2128 decorators |= ARRAYCOPY_ALIGNED;
2129 }
2130
2131 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2132 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);
2133
2134 if (is_oop) {
2135 // save regs before copy_memory
2136 __ push(RegSet::of(d, count), sp);
2137 }
2138 {
2139 // UnsafeMemoryAccess page error: continue after unsafe access
2140 UnsafeMemoryAccessMark umam(this, add_extras, true);
2141 copy_memory(decorators, is_oop ? T_OBJECT : T_BYTE, aligned, s, d, count, -size);
2142 }
2143 if (is_oop) {
2144 __ pop(RegSet::of(d, count), sp);
2145 if (VerifyOops)
2146 verify_oop_array(size, d, count, r16);
2147 }
2148 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1);
2149 __ leave();
2150 __ mov(r0, zr); // return 0
2151 __ ret(lr);
2152
2153 assert(entries.length() == expected_entry_count - 1,
2154 "unexpected entries count %d", entries.length());
2155
2156 address end = __ pc();
2157
2158 if (add_extras) {
2159 // retrieve the registered handler addresses
2160 retrieve_unsafe_access_handlers(start, end, extras);
2161 assert(extras.length() == extra_count,
2162 "incorrect handlers count %d", extras.length());
2163 }
2164
2165 // record the stub entry and end plus any no_push entry and/or
2166 // extra handler addresses
2167 store_archive_data(stub_id, start, end, entries_ptr, extras_ptr);
2168
2169 return start;
2170 }
2171
2172 // Helper for generating a dynamic type check.
2173 // Smashes rscratch1, rscratch2.
2174 void generate_type_check(Register sub_klass,
2175 Register super_check_offset,
2176 Register super_klass,
2177 Register temp1,
2178 Register temp2,
2179 Register result,
2180 Label& L_success) {
2181 assert_different_registers(sub_klass, super_check_offset, super_klass);
2182
2183 BLOCK_COMMENT("type_check:");
2184
2185 Label L_miss;
2186
2187 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, nullptr,
2188 super_check_offset);
2189 __ check_klass_subtype_slow_path(sub_klass, super_klass, temp1, temp2, &L_success, nullptr);
2190
2191 // Fall through on failure!
2192 __ BIND(L_miss);
2193 }
2194
//
//  Generate checkcasting array copy stub
//
//  Copies oops one at a time from the source to the destination,
//  type checking every non-null element against ckval before it is
//  stored. Copying stops at the first element that fails the check.
//
//  Arguments:
//    stub_id      - selects the normal or the uninitialized-destination
//                   variant of the stub
//    nopush_entry - when non-null, receives the entry point that follows
//                   the frame push
//
//  Input:
//    c_rarg0   - source array address
//    c_rarg1   - destination array address
//    c_rarg2   - element count, treated as ssize_t, can be zero
//    c_rarg3   - size_t ckoff (super_check_offset)
//    c_rarg4   - oop ckval (super_klass)
//
//  Output:
//    r0 ==  0  -  success
//    r0 == -1^K - failure, where K is partial transfer count
//
//  Returns the start address of the stub, either reloaded from the
//  archive or freshly generated.
//
address generate_checkcast_copy(StubId stub_id, address *nopush_entry) {
  bool dest_uninitialized;
  switch (stub_id) {
  case StubId::stubgen_checkcast_arraycopy_id:
    dest_uninitialized = false;
    break;
  case StubId::stubgen_checkcast_arraycopy_uninit_id:
    dest_uninitialized = true;
    break;
  default:
    ShouldNotReachHere();
  }

  // The normal stub provides a 2nd entry which omits the frame push
  // for use when bailing out from a disjoint copy.
  // Only some stubs generate a 2nd entry; whether this one does is
  // signalled by the caller passing a non-null nopush_entry.
  int entry_count = StubInfo::entry_count(stub_id);
  int expected_entry_count = (nopush_entry == nullptr ? 1 : 2);
  GrowableArray<address> entries;
  GrowableArray<address> *entries_ptr = (expected_entry_count == 1 ? nullptr : &entries);
  assert(entry_count == expected_entry_count,
         "expected entry count %d does not match declared entry count %d for stub %s",
         expected_entry_count, entry_count, StubInfo::name(stub_id));
  // Reuse an archived copy of the stub when one is available.
  address start = load_archive_data(stub_id, entries_ptr);
  if (start != nullptr) {
    assert(entries.length() + 1 == expected_entry_count,
           "expected entry count %d does not match return entry count %d for stub %s",
           expected_entry_count, entries.length() + 1, StubInfo::name(stub_id));
    if (nopush_entry != nullptr) {
      *nopush_entry = entries.at(0);
    }
    return start;
  }

  Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

  // Input registers (after setup_arg_regs)
  const Register from        = c_rarg0;   // source array address
  const Register to          = c_rarg1;   // destination array address
  const Register count       = c_rarg2;   // element count
  const Register ckoff       = c_rarg3;   // super_check_offset
  const Register ckval       = c_rarg4;   // super_klass

  RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);

  // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
  const Register copied_oop  = r22;       // actual oop copied
  const Register count_save  = r21;       // original element count
  const Register start_to    = r20;       // destination array start address
  const Register r19_klass   = r19;       // oop._klass

  // Registers used as gc temps (r5, r6, r7 are save-on-call)
  const Register gct1 = r5, gct2 = r6, gct3 = r7;

  //---------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the two arrays are subtypes of Object[] but the
  // destination array type is not equal to or a supertype
  // of the source type.  Each element must be separately
  // checked.

  assert_different_registers(from, to, count, ckoff, ckval, start_to,
                             copied_oop, r19_klass, count_save);

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, stub_id);
  start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
  // caller guarantees that the arrays really are different
  // otherwise, we would have to make conjoint checks
  // The unconditional branch skips the stop(), so this block is
  // currently only a placeholder for a future conjoint check.
  { Label L;
    __ b(L);                       // conjoint check not yet implemented
    __ stop("checkcast_copy within a single array");
    __ bind(L);
  }
#endif //ASSERT

  // Caller of this entry point must set up the argument registers.
  if (nopush_entry != nullptr) {
    *nopush_entry = __ pc();
    entries.append(*nopush_entry);
    BLOCK_COMMENT("Entry:");
  }

  // Empty array:  Nothing to do.
  // count is zero on this path, so L_done returns 0 (success) in r0.
  __ cbz(count, L_done);
  // r19-r22 are used as temps below and restored at L_done_pop.
  __ push(RegSet::of(r19, r20, r21, r22), sp);

#ifdef ASSERT
  BLOCK_COMMENT("assert consistent ckoff/ckval");
  // The ckoff and ckval must be mutually consistent,
  // even though caller generates both.
  // start_to is borrowed as a temp here; it is only given its real
  // value further down.
  { Label L;
    int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ ldrw(start_to, Address(ckval, sco_offset));
    __ cmpw(ckoff, start_to);
    __ br(Assembler::EQ, L);
    __ stop("super_check_offset inconsistent");
    __ bind(L);
  }
#endif //ASSERT

  DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT;
  bool is_oop = true;
  int element_size = UseCompressedOops ? 4 : 8;
  if (dest_uninitialized) {
    decorators |= IS_DEST_UNINITIALIZED;
  }

  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs);

  // save the original count
  __ mov(count_save, count);

  // Copy from low to high addresses
  __ mov(start_to, to);              // Save destination array start address
  __ b(L_load_element);

  // ======== begin loop ========
  // (Loop is rotated; its entry is L_load_element.)
  // Loop control:
  //   for (; count != 0; count--) {
  //     copied_oop = load_heap_oop(from++);
  //     ... generate_type_check ...;
  //     store_heap_oop(to++, copied_oop);
  //   }
  __ align(OptoLoopAlignment);

  __ BIND(L_store_element);
  bs->copy_store_at(_masm, decorators, T_OBJECT, element_size,
                    __ post(to, element_size), copied_oop, noreg,
                    gct1, gct2, gct3);
  __ sub(count, count, 1);
  // All elements copied: count is zero here, so r0 will be 0 (success).
  __ cbz(count, L_do_card_marks);

  // ======== loop entry is here ========
  __ BIND(L_load_element);
  bs->copy_load_at(_masm, decorators, T_OBJECT, element_size,
                   copied_oop, noreg, __ post(from, element_size),
                   gct1);
  // A null element needs no type check; store it straight away.
  __ cbz(copied_oop, L_store_element);

  __ load_klass(r19_klass, copied_oop);// query the object klass

  BLOCK_COMMENT("type_check:");
  generate_type_check(/*sub_klass*/r19_klass,
                      /*super_check_offset*/ckoff,
                      /*super_klass*/ckval,
                      /*temp1*/gct1,
                      /*temp2*/gct2,
                      /*result*/r10, L_store_element);

  // Fall through on failure!

  // ======== end loop ========

  // It was a real error; we must depend on the caller to finish the job.
  // Register count = remaining oops, count_save = total oops.
  // Emit GC store barriers for the oops we have copied and report
  // their number to the caller.

  __ subs(count, count_save, count);     // K = partially copied oop count
  __ eon(count, count, zr);              // report (-1^K) to caller
  // The flags tested here were set by the subs above: if K == 0 nothing
  // was copied, so the card marking below is skipped.
  __ br(Assembler::EQ, L_done_pop);

  __ BIND(L_do_card_marks);
  bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1);

  __ bind(L_done_pop);
  __ pop(RegSet::of(r19, r20, r21, r22), sp);
  inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

  __ bind(L_done);
  // count holds 0 on success and -1^K after a failed type check.
  __ mov(r0, count);
  __ leave();
  __ ret(lr);

  // record the stub entry and end plus any no_push entry
  store_archive_data(stub_id, start, __ pc() , entries_ptr);
  return start;
}
2394
2395 // Perform range checks on the proposed arraycopy.
2396 // Kills temp, but nothing else.
2397 // Also, clean the sign bits of src_pos and dst_pos.
2398 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
2399 Register src_pos, // source position (c_rarg1)
2400 Register dst, // destination array oo (c_rarg2)
2401 Register dst_pos, // destination position (c_rarg3)
2402 Register length,
2403 Register temp,
2404 Label& L_failed) {
2405 BLOCK_COMMENT("arraycopy_range_checks:");
2406
2407 assert_different_registers(rscratch1, temp);
2408
2409 // if (src_pos + length > arrayOop(src)->length()) FAIL;
2410 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2411 __ addw(temp, length, src_pos);
2412 __ cmpw(temp, rscratch1);
2413 __ br(Assembler::HI, L_failed);
2414
2415 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
2416 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2417 __ addw(temp, length, dst_pos);
2418 __ cmpw(temp, rscratch1);
2419 __ br(Assembler::HI, L_failed);
2420
2421 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2422 __ movw(src_pos, src_pos);
2423 __ movw(dst_pos, dst_pos);
2424
2425 BLOCK_COMMENT("arraycopy_range_checks done");
2426 }
2427
// Placeholder arraycopy routine: copies nothing and only asserts that
// it was asked to copy zero elements. src and dst are ignored.
//
// Original author's note: these stubs get called from some dumb test
// routine. I'll write them properly when they're called from
// something that's actually doing something.
static void fake_arraycopy_stub(address src, address dst, int count) {
  assert(count == 0, "huh?");
}
2434
2435
2436 //
2437 // Generate 'unsafe' array copy stub
2438 // Though just as safe as the other stubs, it takes an unscaled
2439 // size_t argument instead of an element count.
2440 //
2441 // Input:
2442 // c_rarg0 - source array address
2443 // c_rarg1 - destination array address
2444 // c_rarg2 - byte count, treated as ssize_t, can be zero
2445 //
2446 // Examines the alignment of the operands and dispatches
2447 // to a long, int, short, or byte copy loop.
2448 //
2449 address generate_unsafe_copy(address byte_copy_entry,
2450 address short_copy_entry,
2451 address int_copy_entry,
2452 address long_copy_entry) {
2453 StubId stub_id = StubId::stubgen_unsafe_arraycopy_id;
2454 int entry_count = StubInfo::entry_count(stub_id);
2455 assert(entry_count == 1, "sanity check");
2456 address start = load_archive_data(stub_id);
2457 if (start != nullptr) {
2458 return start;
2459 }
2460 Label L_long_aligned, L_int_aligned, L_short_aligned;
2461 Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2462
2463 __ align(CodeEntryAlignment);
2464 StubCodeMark mark(this, stub_id);
2465 start = __ pc();
2466 __ enter(); // required for proper stackwalking of RuntimeStub frame
2467
2468 // bump this on entry, not on exit:
2469 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2470
2471 __ orr(rscratch1, s, d);
2472 __ orr(rscratch1, rscratch1, count);
2473
2474 __ andr(rscratch1, rscratch1, BytesPerLong-1);
2475 __ cbz(rscratch1, L_long_aligned);
2476 __ andr(rscratch1, rscratch1, BytesPerInt-1);
2477 __ cbz(rscratch1, L_int_aligned);
2478 __ tbz(rscratch1, 0, L_short_aligned);
2479 __ b(RuntimeAddress(byte_copy_entry));
2480
2481 __ BIND(L_short_aligned);
2482 __ lsr(count, count, LogBytesPerShort); // size => short_count
2483 __ b(RuntimeAddress(short_copy_entry));
2484 __ BIND(L_int_aligned);
2485 __ lsr(count, count, LogBytesPerInt); // size => int_count
2486 __ b(RuntimeAddress(int_copy_entry));
2487 __ BIND(L_long_aligned);
2488 __ lsr(count, count, LogBytesPerLong); // size => long_count
2489 __ b(RuntimeAddress(long_copy_entry));
2490
2491 // record the stub entry and end
2492 store_archive_data(stub_id, start, __ pc());
2493
2494 return start;
2495 }
2496
2497 //
2498 // Generate generic array copy stubs
2499 //
2500 // Input:
2501 // c_rarg0 - src oop
2502 // c_rarg1 - src_pos (32-bits)
2503 // c_rarg2 - dst oop
2504 // c_rarg3 - dst_pos (32-bits)
2505 // c_rarg4 - element count (32-bits)
2506 //
2507 // Output:
2508 // r0 == 0 - success
2509 // r0 == -1^K - failure, where K is partial transfer count
2510 //
2511 address generate_generic_copy(address byte_copy_entry, address short_copy_entry,
2512 address int_copy_entry, address oop_copy_entry,
2513 address long_copy_entry, address checkcast_copy_entry) {
2514 StubId stub_id = StubId::stubgen_generic_arraycopy_id;
2515 int entry_count = StubInfo::entry_count(stub_id);
2516 assert(entry_count == 1, "sanity check");
2517 address start = load_archive_data(stub_id);
2518 if (start != nullptr) {
2519 return start;
2520 }
2521 Label L_failed, L_objArray;
2522 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2523
2524 // Input registers
2525 const Register src = c_rarg0; // source array oop
2526 const Register src_pos = c_rarg1; // source position
2527 const Register dst = c_rarg2; // destination array oop
2528 const Register dst_pos = c_rarg3; // destination position
2529 const Register length = c_rarg4;
2530
2531
2532 // Registers used as temps
2533 const Register dst_klass = c_rarg5;
2534
2535 __ align(CodeEntryAlignment);
2536
2537 StubCodeMark mark(this, stub_id);
2538
2539 start = __ pc();
2540
2541 __ enter(); // required for proper stackwalking of RuntimeStub frame
2542
2543 // bump this on entry, not on exit:
2544 inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2545
2546 //-----------------------------------------------------------------------
2547 // Assembler stub will be used for this call to arraycopy
2548 // if the following conditions are met:
2549 //
2550 // (1) src and dst must not be null.
2551 // (2) src_pos must not be negative.
2552 // (3) dst_pos must not be negative.
2553 // (4) length must not be negative.
2554 // (5) src klass and dst klass should be the same and not null.
2555 // (6) src and dst should be arrays.
2556 // (7) src_pos + length must not exceed length of src.
2557 // (8) dst_pos + length must not exceed length of dst.
2558 //
2559
2560 // if (src == nullptr) return -1;
2561 __ cbz(src, L_failed);
2562
2563 // if (src_pos < 0) return -1;
2564 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set
2565
2566 // if (dst == nullptr) return -1;
2567 __ cbz(dst, L_failed);
2568
2569 // if (dst_pos < 0) return -1;
2570 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set
2571
2572 // registers used as temp
2573 const Register scratch_length = r16; // elements count to copy
2574 const Register scratch_src_klass = r17; // array klass
2575 const Register lh = r15; // layout helper
2576
2577 // if (length < 0) return -1;
2578 __ movw(scratch_length, length); // length (elements count, 32-bits value)
2579 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set
2580
2581 __ load_klass(scratch_src_klass, src);
2582 #ifdef ASSERT
2583 // assert(src->klass() != nullptr);
2584 {
2585 BLOCK_COMMENT("assert klasses not null {");
2586 Label L1, L2;
2587 __ cbnz(scratch_src_klass, L2); // it is broken if klass is null
2588 __ bind(L1);
2589 __ stop("broken null klass");
2590 __ bind(L2);
2591 __ load_klass(rscratch1, dst);
2592 __ cbz(rscratch1, L1); // this would be broken also
2593 BLOCK_COMMENT("} assert klasses not null done");
2594 }
2595 #endif
2596
2597 // Load layout helper (32-bits)
2598 //
2599 // |array_tag| | header_size | element_type | |log2_element_size|
2600 // 32 30 24 16 8 2 0
2601 //
2602 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2603 //
2604
2605 const int lh_offset = in_bytes(Klass::layout_helper_offset());
2606
2607 // Handle objArrays completely differently...
2608 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2609 __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2610 __ movw(rscratch1, objArray_lh);
2611 __ eorw(rscratch2, lh, rscratch1);
2612 __ cbzw(rscratch2, L_objArray);
2613
2614 // if (src->klass() != dst->klass()) return -1;
2615 __ load_klass(rscratch2, dst);
2616 __ eor(rscratch2, rscratch2, scratch_src_klass);
2617 __ cbnz(rscratch2, L_failed);
2618
2619 // Check for flat inline type array -> return -1
2620 __ test_flat_array_oop(src, rscratch2, L_failed);
2621
2622 // Check for null-free (non-flat) inline type array -> handle as object array
2623 __ test_null_free_array_oop(src, rscratch2, L_objArray);
2624
2625 // if (!src->is_Array()) return -1;
2626 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0)
2627
2628 // At this point, it is known to be a typeArray (array_tag 0x3).
2629 #ifdef ASSERT
2630 {
2631 BLOCK_COMMENT("assert primitive array {");
2632 Label L;
2633 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2634 __ cmpw(lh, rscratch2);
2635 __ br(Assembler::GE, L);
2636 __ stop("must be a primitive array");
2637 __ bind(L);
2638 BLOCK_COMMENT("} assert primitive array done");
2639 }
2640 #endif
2641
2642 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2643 rscratch2, L_failed);
2644
2645 // TypeArrayKlass
2646 //
2647 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2648 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2649 //
2650
2651 const Register rscratch1_offset = rscratch1; // array offset
2652 const Register r15_elsize = lh; // element size
2653
2654 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2655 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
2656 __ add(src, src, rscratch1_offset); // src array offset
2657 __ add(dst, dst, rscratch1_offset); // dst array offset
2658 BLOCK_COMMENT("choose copy loop based on element size");
2659
2660 // next registers should be set before the jump to corresponding stub
2661 const Register from = c_rarg0; // source array address
2662 const Register to = c_rarg1; // destination array address
2663 const Register count = c_rarg2; // elements count
2664
2665 // 'from', 'to', 'count' registers should be set in such order
2666 // since they are the same as 'src', 'src_pos', 'dst'.
2667
2668 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2669
2670 // The possible values of elsize are 0-3, i.e. exact_log2(element
2671 // size in bytes). We do a simple bitwise binary search.
2672 __ BIND(L_copy_bytes);
2673 __ tbnz(r15_elsize, 1, L_copy_ints);
2674 __ tbnz(r15_elsize, 0, L_copy_shorts);
2675 __ lea(from, Address(src, src_pos));// src_addr
2676 __ lea(to, Address(dst, dst_pos));// dst_addr
2677 __ movw(count, scratch_length); // length
2678 __ b(RuntimeAddress(byte_copy_entry));
2679
2680 __ BIND(L_copy_shorts);
2681 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2682 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2683 __ movw(count, scratch_length); // length
2684 __ b(RuntimeAddress(short_copy_entry));
2685
2686 __ BIND(L_copy_ints);
2687 __ tbnz(r15_elsize, 0, L_copy_longs);
2688 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2689 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2690 __ movw(count, scratch_length); // length
2691 __ b(RuntimeAddress(int_copy_entry));
2692
2693 __ BIND(L_copy_longs);
2694 #ifdef ASSERT
2695 {
2696 BLOCK_COMMENT("assert long copy {");
2697 Label L;
2698 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize
2699 __ cmpw(r15_elsize, LogBytesPerLong);
2700 __ br(Assembler::EQ, L);
2701 __ stop("must be long copy, but elsize is wrong");
2702 __ bind(L);
2703 BLOCK_COMMENT("} assert long copy done");
2704 }
2705 #endif
2706 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2707 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2708 __ movw(count, scratch_length); // length
2709 __ b(RuntimeAddress(long_copy_entry));
2710
2711 // ObjArrayKlass
2712 __ BIND(L_objArray);
2713 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2714
2715 Label L_plain_copy, L_checkcast_copy;
2716 // test array classes for subtyping
2717 __ load_klass(r15, dst);
2718 __ cmp(scratch_src_klass, r15); // usual case is exact equality
2719 __ br(Assembler::NE, L_checkcast_copy);
2720
2721 // Identically typed arrays can be copied without element-wise checks.
2722 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2723 rscratch2, L_failed);
2724
2725 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2726 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2727 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2728 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2729 __ movw(count, scratch_length); // length
2730 __ BIND(L_plain_copy);
2731 __ b(RuntimeAddress(oop_copy_entry));
2732
2733 __ BIND(L_checkcast_copy);
2734 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass)
2735 {
2736 // Before looking at dst.length, make sure dst is also an objArray.
2737 __ ldrw(rscratch1, Address(r15, lh_offset));
2738 __ movw(rscratch2, objArray_lh);
2739 __ eorw(rscratch1, rscratch1, rscratch2);
2740 __ cbnzw(rscratch1, L_failed);
2741
2742 // It is safe to examine both src.length and dst.length.
2743 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2744 r15, L_failed);
2745
2746 __ load_klass(dst_klass, dst); // reload
2747
2748 // Marshal the base address arguments now, freeing registers.
2749 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2750 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2751 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2752 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2753 __ movw(count, length); // length (reloaded)
2754 Register sco_temp = c_rarg3; // this register is free now
2755 assert_different_registers(from, to, count, sco_temp,
2756 dst_klass, scratch_src_klass);
2757 // assert_clean_int(count, sco_temp);
2758
2759 // Generate the type check.
2760 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2761 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2762
2763 // Smashes rscratch1, rscratch2
2764 generate_type_check(scratch_src_klass, sco_temp, dst_klass, /*temps*/ noreg, noreg, noreg,
2765 L_plain_copy);
2766
2767 // Fetch destination element klass from the ObjArrayKlass header.
2768 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2769 __ ldr(dst_klass, Address(dst_klass, ek_offset));
2770 __ ldrw(sco_temp, Address(dst_klass, sco_offset));
2771
2772 // the checkcast_copy loop needs two extra arguments:
2773 assert(c_rarg3 == sco_temp, "#3 already in place");
2774 // Set up arguments for checkcast_copy_entry.
2775 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass
2776 __ b(RuntimeAddress(checkcast_copy_entry));
2777 }
2778
2779 __ BIND(L_failed);
2780 __ mov(r0, -1);
2781 __ leave(); // required for proper stackwalking of RuntimeStub frame
2782 __ ret(lr);
2783
2784 // record the stub entry and end
2785 store_archive_data(stub_id, start, __ pc());
2786
2787 return start;
2788 }
2789
2790 //
2791 // Generate stub for array fill. If "aligned" is true, the
2792 // "to" address is assumed to be heapword aligned.
2793 //
2794 // Arguments for generated stub:
2795 // to: c_rarg0
2796 // value: c_rarg1
2797 // count: c_rarg2 treated as signed
2798 //
2799 address generate_fill(StubId stub_id) {
2800 BasicType t;
2801 bool aligned;
2802
2803 switch (stub_id) {
2804 case StubId::stubgen_jbyte_fill_id:
2805 t = T_BYTE;
2806 aligned = false;
2807 break;
2808 case StubId::stubgen_jshort_fill_id:
2809 t = T_SHORT;
2810 aligned = false;
2811 break;
2812 case StubId::stubgen_jint_fill_id:
2813 t = T_INT;
2814 aligned = false;
2815 break;
2816 case StubId::stubgen_arrayof_jbyte_fill_id:
2817 t = T_BYTE;
2818 aligned = true;
2819 break;
2820 case StubId::stubgen_arrayof_jshort_fill_id:
2821 t = T_SHORT;
2822 aligned = true;
2823 break;
2824 case StubId::stubgen_arrayof_jint_fill_id:
2825 t = T_INT;
2826 aligned = true;
2827 break;
2828 default:
2829 ShouldNotReachHere();
2830 };
2831 int entry_count = StubInfo::entry_count(stub_id);
2832 assert(entry_count == 1, "sanity check");
2833 address start = load_archive_data(stub_id);
2834 if (start != nullptr) {
2835 return start;
2836 }
2837 __ align(CodeEntryAlignment);
2838 StubCodeMark mark(this, stub_id);
2839 start = __ pc();
2840
2841 BLOCK_COMMENT("Entry:");
2842
2843 const Register to = c_rarg0; // source array address
2844 const Register value = c_rarg1; // value
2845 const Register count = c_rarg2; // elements count
2846
2847 const Register bz_base = r10; // base for block_zero routine
2848 const Register cnt_words = r11; // temp register
2849
2850 __ enter();
2851
2852 Label L_fill_elements, L_exit1;
2853
2854 int shift = -1;
2855 switch (t) {
2856 case T_BYTE:
2857 shift = 0;
2858 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2859 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2860 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2861 __ br(Assembler::LO, L_fill_elements);
2862 break;
2863 case T_SHORT:
2864 shift = 1;
2865 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2866 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2867 __ br(Assembler::LO, L_fill_elements);
2868 break;
2869 case T_INT:
2870 shift = 2;
2871 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2872 __ br(Assembler::LO, L_fill_elements);
2873 break;
2874 default: ShouldNotReachHere();
2875 }
2876
2877 // Align source address at 8 bytes address boundary.
2878 Label L_skip_align1, L_skip_align2, L_skip_align4;
2879 if (!aligned) {
2880 switch (t) {
2881 case T_BYTE:
2882 // One byte misalignment happens only for byte arrays.
2883 __ tbz(to, 0, L_skip_align1);
2884 __ strb(value, Address(__ post(to, 1)));
2885 __ subw(count, count, 1);
2886 __ bind(L_skip_align1);
2887 // Fallthrough
2888 case T_SHORT:
2889 // Two bytes misalignment happens only for byte and short (char) arrays.
2890 __ tbz(to, 1, L_skip_align2);
2891 __ strh(value, Address(__ post(to, 2)));
2892 __ subw(count, count, 2 >> shift);
2893 __ bind(L_skip_align2);
2894 // Fallthrough
2895 case T_INT:
2896 // Align to 8 bytes, we know we are 4 byte aligned to start.
2897 __ tbz(to, 2, L_skip_align4);
2898 __ strw(value, Address(__ post(to, 4)));
2899 __ subw(count, count, 4 >> shift);
2900 __ bind(L_skip_align4);
2901 break;
2902 default: ShouldNotReachHere();
2903 }
2904 }
2905
2906 //
2907 // Fill large chunks
2908 //
2909 __ lsrw(cnt_words, count, 3 - shift); // number of words
2910 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2911 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2912 if (UseBlockZeroing) {
2913 Label non_block_zeroing, rest;
2914 // If the fill value is zero we can use the fast zero_words().
2915 __ cbnz(value, non_block_zeroing);
2916 __ mov(bz_base, to);
2917 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2918 address tpc = __ zero_words(bz_base, cnt_words);
2919 if (tpc == nullptr) {
2920 fatal("CodeCache is full at generate_fill");
2921 }
2922 __ b(rest);
2923 __ bind(non_block_zeroing);
2924 __ fill_words(to, cnt_words, value);
2925 __ bind(rest);
2926 } else {
2927 __ fill_words(to, cnt_words, value);
2928 }
2929
2930 // Remaining count is less than 8 bytes. Fill it by a single store.
2931 // Note that the total length is no less than 8 bytes.
2932 if (t == T_BYTE || t == T_SHORT) {
2933 Label L_exit1;
2934 __ cbzw(count, L_exit1);
2935 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2936 __ str(value, Address(to, -8)); // overwrite some elements
2937 __ bind(L_exit1);
2938 __ leave();
2939 __ ret(lr);
2940 }
2941
2942 // Handle copies less than 8 bytes.
2943 Label L_fill_2, L_fill_4, L_exit2;
2944 __ bind(L_fill_elements);
2945 switch (t) {
2946 case T_BYTE:
2947 __ tbz(count, 0, L_fill_2);
2948 __ strb(value, Address(__ post(to, 1)));
2949 __ bind(L_fill_2);
2950 __ tbz(count, 1, L_fill_4);
2951 __ strh(value, Address(__ post(to, 2)));
2952 __ bind(L_fill_4);
2953 __ tbz(count, 2, L_exit2);
2954 __ strw(value, Address(to));
2955 break;
2956 case T_SHORT:
2957 __ tbz(count, 0, L_fill_4);
2958 __ strh(value, Address(__ post(to, 2)));
2959 __ bind(L_fill_4);
2960 __ tbz(count, 1, L_exit2);
2961 __ strw(value, Address(to));
2962 break;
2963 case T_INT:
2964 __ cbzw(count, L_exit2);
2965 __ strw(value, Address(to));
2966 break;
2967 default: ShouldNotReachHere();
2968 }
2969 __ bind(L_exit2);
2970 __ leave();
2971 __ ret(lr);
2972
2973 // record the stub entry and end
2974 store_archive_data(stub_id, start, __ pc());
2975
2976 return start;
2977 }
2978
2979 address generate_unsafecopy_common_error_exit() {
2980 StubId stub_id = StubId::stubgen_unsafecopy_common_id;
2981 int entry_count = StubInfo::entry_count(stub_id);
2982 assert(entry_count == 1, "sanity check");
2983 address start = load_archive_data(stub_id);
2984 if (start != nullptr) {
2985 return start;
2986 }
2987 __ align(CodeEntryAlignment);
2988 StubCodeMark mark(this, stub_id);
2989 start = __ pc();
2990 __ leave();
2991 __ mov(r0, 0);
2992 __ ret(lr);
2993
2994 // record the stub entry and end
2995 store_archive_data(stub_id, start, __ pc());
2996
2997 return start;
2998 }
2999
3000 //
3001 // Generate 'unsafe' set memory stub
3002 // Though just as safe as the other stubs, it takes an unscaled
3003 // size_t (# bytes) argument instead of an element count.
3004 //
3005 // This fill operation is atomicity preserving: as long as the
3006 // address supplied is sufficiently aligned, all writes of up to 64
3007 // bits in size are single-copy atomic.
3008 //
3009 // Input:
3010 // c_rarg0 - destination array address
3011 // c_rarg1 - byte count (size_t)
3012 // c_rarg2 - byte value
3013 //
3014 address generate_unsafe_setmemory() {
3015 StubId stub_id = StubId::stubgen_unsafe_setmemory_id;
3016 int entry_count = StubInfo::entry_count(stub_id);
3017 assert(entry_count == 1, "sanity check");
3018 // we expect one set of extra unsafememory access handler entries
3019 GrowableArray<address> extras;
3020 int extra_count = 1 * UnsafeMemoryAccess::COLUMN_COUNT;
3021 address start = load_archive_data(stub_id, nullptr, &extras);
3022 if (start != nullptr) {
3023 assert(extras.length() == extra_count,
3024 "unexpected extra entry count %d", extras.length());
3025 register_unsafe_access_handlers(extras, 0, 1);
3026 return start;
3027 }
3028
3029 __ align(CodeEntryAlignment);
3030 StubCodeMark mark(this, stub_id);
3031 start = __ pc();
3032
3033 Register dest = c_rarg0, count = c_rarg1, value = c_rarg2;
3034 Label tail;
3035
3036 {
3037 UnsafeMemoryAccessMark umam(this, true, false);
3038
3039 __ enter(); // required for proper stackwalking of RuntimeStub frame
3040
3041 __ dup(v0, __ T16B, value);
3042
3043 if (AvoidUnalignedAccesses) {
3044 __ cmp(count, (u1)16);
3045 __ br(__ LO, tail);
3046
3047 __ mov(rscratch1, 16);
3048 __ andr(rscratch2, dest, 15);
3049 __ sub(rscratch1, rscratch1, rscratch2); // Bytes needed to 16-align dest
3050 __ strq(v0, Address(dest));
3051 __ sub(count, count, rscratch1);
3052 __ add(dest, dest, rscratch1);
3053 }
3054
3055 __ subs(count, count, (u1)64);
3056 __ br(__ LO, tail);
3057 {
3058 Label again;
3059 __ bind(again);
3060 __ stpq(v0, v0, Address(dest));
3061 __ stpq(v0, v0, Address(dest, 32));
3062
3063 __ subs(count, count, 64);
3064 __ add(dest, dest, 64);
3065 __ br(__ HS, again);
3066 }
3067
3068 __ bind(tail);
3069 // The count of bytes is off by 64, but we don't need to correct
3070 // it because we're only going to use the least-significant few
3071 // count bits from here on.
3072 // __ add(count, count, 64);
3073
3074 {
3075 Label dont;
3076 __ tbz(count, exact_log2(32), dont);
3077 __ stpq(v0, v0, __ post(dest, 32));
3078 __ bind(dont);
3079 }
3080 {
3081 Label dont;
3082 __ tbz(count, exact_log2(16), dont);
3083 __ strq(v0, __ post(dest, 16));
3084 __ bind(dont);
3085 }
3086 {
3087 Label dont;
3088 __ tbz(count, exact_log2(8), dont);
3089 __ strd(v0, __ post(dest, 8));
3090 __ bind(dont);
3091 }
3092
3093 Label finished;
3094 __ tst(count, 7);
3095 __ br(__ EQ, finished);
3096
3097 {
3098 Label dont;
3099 __ tbz(count, exact_log2(4), dont);
3100 __ strs(v0, __ post(dest, 4));
3101 __ bind(dont);
3102 }
3103 {
3104 Label dont;
3105 __ tbz(count, exact_log2(2), dont);
3106 __ bfi(value, value, 8, 8);
3107 __ strh(value, __ post(dest, 2));
3108 __ bind(dont);
3109 }
3110 {
3111 Label dont;
3112 __ tbz(count, exact_log2(1), dont);
3113 __ strb(value, Address(dest));
3114 __ bind(dont);
3115 }
3116
3117 __ bind(finished);
3118 __ leave();
3119 __ ret(lr);
3120 // have to exit the block and destroy the UnsafeMemoryAccessMark
3121 // in order to retrieve the handler end address
3122 }
3123
3124 // install saved handler addresses in extras
3125 address end = __ pc();
3126 retrieve_unsafe_access_handlers(start, end, extras);
3127 assert(extras.length() == extra_count,
3128 "incorrect handlers count %d", extras.length());
3129 // record the stub entry and end plus the extras
3130 store_archive_data(stub_id, start, end, nullptr, &extras);
3131
3132 return start;
3133 }
3134
3135 address generate_data_cache_writeback() {
3136 const Register line = c_rarg0; // address of line to write back
3137
3138 StubId stub_id = StubId::stubgen_data_cache_writeback_id;
3139 int entry_count = StubInfo::entry_count(stub_id);
3140 assert(entry_count == 1, "sanity check");
3141 address start = load_archive_data(stub_id);
3142 if (start != nullptr) {
3143 return start;
3144 }
3145 __ align(CodeEntryAlignment);
3146 StubCodeMark mark(this, stub_id);
3147
3148 start = __ pc();
3149 __ enter();
3150 __ cache_wb(Address(line, 0));
3151 __ leave();
3152 __ ret(lr);
3153
3154 // record the stub entry and end
3155 store_archive_data(stub_id, start, __ pc());
3156
3157 return start;
3158 }
3159
3160 address generate_data_cache_writeback_sync() {
3161 StubId stub_id = StubId::stubgen_data_cache_writeback_sync_id;
3162 int entry_count = StubInfo::entry_count(stub_id);
3163 assert(entry_count == 1, "sanity check");
3164 address start = load_archive_data(stub_id);
3165 if (start != nullptr) {
3166 return start;
3167 }
3168 const Register is_pre = c_rarg0; // pre or post sync
3169 __ align(CodeEntryAlignment);
3170 StubCodeMark mark(this, stub_id);
3171
3172 // pre wbsync is a no-op
3173 // post wbsync translates to an sfence
3174
3175 Label skip;
3176 start = __ pc();
3177 __ enter();
3178 __ cbnz(is_pre, skip);
3179 __ cache_wbsync(false);
3180 __ bind(skip);
3181 __ leave();
3182 __ ret(lr);
3183
3184 // record the stub entry and end
3185 store_archive_data(stub_id, start, __ pc());
3186
3187 return start;
3188 }
3189
3190 void generate_arraycopy_stubs() {
3191 // Some copy stubs publish a normal entry and then a 2nd 'fallback'
3192 // entry immediately following their stack push. This can be used
3193 // as a post-push branch target for compatible stubs when they
3194 // identify a special case that can be handled by the fallback
3195 // stub e.g a disjoint copy stub may be use as a special case
3196 // fallback for its compatible conjoint copy stub.
3197 //
3198 // A no push entry is always returned in the following local and
3199 // then published by assigning to the appropriate entry field in
3200 // class StubRoutines. The entry value is then passed to the
3201 // generator for the compatible stub. That means the entry must be
3202 // listed when saving to/restoring from the AOT cache, ensuring
3203 // that the inter-stub jumps are noted at AOT-cache save and
3204 // relocated at AOT cache load.
3205 address nopush_entry;
3206
3207 // generate the common exit first so later stubs can rely on it if
3208 // they want an UnsafeMemoryAccess exit non-local to the stub
3209 StubRoutines::_unsafecopy_common_exit = generate_unsafecopy_common_error_exit();
3210 // register the stub as the default exit with class UnsafeMemoryAccess
3211 UnsafeMemoryAccess::set_common_exit_stub_pc(StubRoutines::_unsafecopy_common_exit);
3212
3213 // generate and publish arch64-specific bulk copy routines first
3214 // so we can call them from other copy stubs
3215 StubRoutines::aarch64::_copy_byte_f = generate_copy_longs(StubId::stubgen_copy_byte_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3216 StubRoutines::aarch64::_copy_byte_b = generate_copy_longs(StubId::stubgen_copy_byte_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3217
3218 StubRoutines::aarch64::_copy_oop_f = generate_copy_longs(StubId::stubgen_copy_oop_f_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3219 StubRoutines::aarch64::_copy_oop_b = generate_copy_longs(StubId::stubgen_copy_oop_b_id, IN_HEAP | IS_ARRAY, r0, r1, r15);
3220
3221 StubRoutines::aarch64::_copy_oop_uninit_f = generate_copy_longs(StubId::stubgen_copy_oop_uninit_f_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
3222 StubRoutines::aarch64::_copy_oop_uninit_b = generate_copy_longs(StubId::stubgen_copy_oop_uninit_b_id, IN_HEAP | IS_ARRAY | IS_DEST_UNINITIALIZED, r0, r1, r15);
3223
3224 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
3225
3226 //*** jbyte
3227 // Always need aligned and unaligned versions
3228 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jbyte_disjoint_arraycopy_id, &nopush_entry);
3229 // disjoint nopush entry is needed by conjoint copy
3230 StubRoutines::_jbyte_disjoint_arraycopy_nopush = nopush_entry;
3231 StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_jbyte_arraycopy_id, StubRoutines::_jbyte_disjoint_arraycopy_nopush, &nopush_entry);
3232 // conjoint nopush entry is needed by generic/unsafe copy
3233 StubRoutines::_jbyte_arraycopy_nopush = nopush_entry;
3234 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jbyte_disjoint_arraycopy_id, &nopush_entry);
3235 // disjoint arrayof nopush entry is needed by conjoint copy
3236 StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush = nopush_entry;
3237 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jbyte_arraycopy_id, StubRoutines::_arrayof_jbyte_disjoint_arraycopy_nopush, nullptr);
3238
3239 //*** jshort
3240 // Always need aligned and unaligned versions
3241 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jshort_disjoint_arraycopy_id, &nopush_entry);
3242 // disjoint nopush entry is needed by conjoint copy
3243 StubRoutines::_jshort_disjoint_arraycopy_nopush = nopush_entry;
3244 StubRoutines::_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_jshort_arraycopy_id, StubRoutines::_jshort_disjoint_arraycopy_nopush, &nopush_entry);
3245 // conjoint nopush entry is used by generic/unsafe copy
3246 StubRoutines::_jshort_arraycopy_nopush = nopush_entry;
3247 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jshort_disjoint_arraycopy_id, &nopush_entry);
3248 // disjoint arrayof nopush entry is needed by conjoint copy
3249 StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush = nopush_entry;
3250 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jshort_arraycopy_id, StubRoutines::_arrayof_jshort_disjoint_arraycopy_nopush, nullptr);
3251
3252 //*** jint
3253 // Aligned versions
3254 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jint_disjoint_arraycopy_id, &nopush_entry);
3255 // disjoint arrayof nopush entry is needed by conjoint copy
3256 StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush = nopush_entry;
3257 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jint_arraycopy_id, StubRoutines::_arrayof_jint_disjoint_arraycopy_nopush, nullptr);
3258 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
3259 // jint_arraycopy_nopush always points to the unaligned version
3260 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_jint_disjoint_arraycopy_id, &nopush_entry);
3261 // disjoint nopush entry is needed by conjoint copy
3262 StubRoutines::_jint_disjoint_arraycopy_nopush = nopush_entry;
3263 StubRoutines::_jint_arraycopy = generate_conjoint_copy(StubId::stubgen_jint_arraycopy_id, StubRoutines::_jint_disjoint_arraycopy_nopush, &nopush_entry);
3264 // conjoint nopush entry is needed by generic/unsafe copy
3265 StubRoutines::_jint_arraycopy_nopush = nopush_entry;
3266
3267 //*** jlong
3268 // It is always aligned
3269 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(StubId::stubgen_arrayof_jlong_disjoint_arraycopy_id, &nopush_entry);
3270 // disjoint arrayof nopush entry is needed by conjoint copy
3271 StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush = nopush_entry;
3272 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(StubId::stubgen_arrayof_jlong_arraycopy_id, StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush, &nopush_entry);
3273 // conjoint nopush entry is needed by generic/unsafe copy
3274 StubRoutines::_jlong_arraycopy_nopush = nopush_entry;
3275 // disjoint normal/nopush and conjoint normal entries are not
3276 // generated since the arrayof versions are the same
3277 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3278 StubRoutines::_jlong_disjoint_arraycopy_nopush = StubRoutines::_arrayof_jlong_disjoint_arraycopy_nopush;
3279 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
3280
3281 //*** oops
3282 {
3283 StubRoutines::_arrayof_oop_disjoint_arraycopy
3284 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_id, &nopush_entry);
3285 // disjoint arrayof nopush entry is needed by conjoint copy
3286 StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush = nopush_entry;
3287 StubRoutines::_arrayof_oop_arraycopy
3288 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush, &nopush_entry);
3289 // conjoint arrayof nopush entry is needed by generic/unsafe copy
3290 StubRoutines::_oop_arraycopy_nopush = nopush_entry;
3291 // Aligned versions without pre-barriers
3292 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
3293 = generate_disjoint_copy(StubId::stubgen_arrayof_oop_disjoint_arraycopy_uninit_id, &nopush_entry);
3294 // disjoint arrayof+uninit nopush entry is needed by conjoint copy
3295 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush = nopush_entry;
3296 // note that we don't need a returned nopush entry because the
3297 // generic/unsafe copy does not cater for uninit arrays.
3298 StubRoutines::_arrayof_oop_arraycopy_uninit
3299 = generate_conjoint_copy(StubId::stubgen_arrayof_oop_arraycopy_uninit_id, StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush, nullptr);
3300 }
3301
3302 // for oop copies reuse arrayof entries for non-arrayof cases
3303 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3304 StubRoutines::_oop_disjoint_arraycopy_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_nopush;
3305 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
3306 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3307 StubRoutines::_oop_disjoint_arraycopy_uninit_nopush = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit_nopush;
3308 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
3309
3310 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_id, &nopush_entry);
3311 // checkcast nopush entry is needed by generic copy
3312 StubRoutines::_checkcast_arraycopy_nopush = nopush_entry;
3313 // note that we don't need a returned nopush entry because the
3314 // generic copy does not cater for uninit arrays.
3315 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy(StubId::stubgen_checkcast_arraycopy_uninit_id, nullptr);
3316
3317 // unsafe arraycopy may fallback on conjoint stubs
3318 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy(StubRoutines::_jbyte_arraycopy_nopush,
3319 StubRoutines::_jshort_arraycopy_nopush,
3320 StubRoutines::_jint_arraycopy_nopush,
3321 StubRoutines::_jlong_arraycopy_nopush);
3322
3323 // generic arraycopy may fallback on conjoint stubs
3324 StubRoutines::_generic_arraycopy = generate_generic_copy(StubRoutines::_jbyte_arraycopy_nopush,
3325 StubRoutines::_jshort_arraycopy_nopush,
3326 StubRoutines::_jint_arraycopy_nopush,
3327 StubRoutines::_oop_arraycopy_nopush,
3328 StubRoutines::_jlong_arraycopy_nopush,
3329 StubRoutines::_checkcast_arraycopy_nopush);
3330
3331 StubRoutines::_jbyte_fill = generate_fill(StubId::stubgen_jbyte_fill_id);
3332 StubRoutines::_jshort_fill = generate_fill(StubId::stubgen_jshort_fill_id);
3333 StubRoutines::_jint_fill = generate_fill(StubId::stubgen_jint_fill_id);
3334 StubRoutines::_arrayof_jbyte_fill = generate_fill(StubId::stubgen_arrayof_jbyte_fill_id);
3335 StubRoutines::_arrayof_jshort_fill = generate_fill(StubId::stubgen_arrayof_jshort_fill_id);
3336 StubRoutines::_arrayof_jint_fill = generate_fill(StubId::stubgen_arrayof_jint_fill_id);
3337 }
3338
3339 void generate_math_stubs() { Unimplemented(); }
3340
3341 // Arguments:
3342 //
3343 // Inputs:
3344 // c_rarg0 - source byte array address
3345 // c_rarg1 - destination byte array address
3346 // c_rarg2 - sessionKe (key) in little endian int array
3347 //
3348 address generate_aescrypt_encryptBlock() {
3349 assert(UseAES, "need AES cryptographic extension support");
3350 StubId stub_id = StubId::stubgen_aescrypt_encryptBlock_id;
3351 int entry_count = StubInfo::entry_count(stub_id);
3352 assert(entry_count == 1, "sanity check");
3353 address start = load_archive_data(stub_id);
3354 if (start != nullptr) {
3355 return start;
3356 }
3357 __ align(CodeEntryAlignment);
3358 StubCodeMark mark(this, stub_id);
3359
3360 const Register from = c_rarg0; // source array address
3361 const Register to = c_rarg1; // destination array address
3362 const Register key = c_rarg2; // key array address
3363 const Register keylen = rscratch1;
3364
3365 start = __ pc();
3366 __ enter();
3367
3368 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3369
3370 __ aesenc_loadkeys(key, keylen);
3371 __ aesecb_encrypt(from, to, keylen);
3372
3373 __ mov(r0, 0);
3374
3375 __ leave();
3376 __ ret(lr);
3377
3378 // record the stub entry and end
3379 store_archive_data(stub_id, start, __ pc());
3380
3381 return start;
3382 }
3383
3384 // Arguments:
3385 //
3386 // Inputs:
3387 // c_rarg0 - source byte array address
3388 // c_rarg1 - destination byte array address
3389 // c_rarg2 - sessionKd (key) in little endian int array
3390 //
3391 address generate_aescrypt_decryptBlock() {
3392 assert(UseAES, "need AES cryptographic extension support");
3393 StubId stub_id = StubId::stubgen_aescrypt_decryptBlock_id;
3394 int entry_count = StubInfo::entry_count(stub_id);
3395 assert(entry_count == 1, "sanity check");
3396 address start = load_archive_data(stub_id);
3397 if (start != nullptr) {
3398 return start;
3399 }
3400 __ align(CodeEntryAlignment);
3401 StubCodeMark mark(this, stub_id);
3402 Label L_doLast;
3403
3404 const Register from = c_rarg0; // source array address
3405 const Register to = c_rarg1; // destination array address
3406 const Register key = c_rarg2; // key array address
3407 const Register keylen = rscratch1;
3408
3409 start = __ pc();
3410 __ enter(); // required for proper stackwalking of RuntimeStub frame
3411
3412 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3413
3414 __ aesecb_decrypt(from, to, key, keylen);
3415
3416 __ mov(r0, 0);
3417
3418 __ leave();
3419 __ ret(lr);
3420
3421 // record the stub entry and end
3422 store_archive_data(stub_id, start, __ pc());
3423
3424 return start;
3425 }
3426
3427 // Arguments:
3428 //
3429 // Inputs:
3430 // c_rarg0 - source byte array address
3431 // c_rarg1 - destination byte array address
3432 // c_rarg2 - sessionKe (key) in little endian int array
3433 // c_rarg3 - r vector byte array address
3434 // c_rarg4 - input length
3435 //
3436 // Output:
3437 // r0 - input length
3438 //
3439 address generate_cipherBlockChaining_encryptAESCrypt() {
3440 assert(UseAES, "need AES cryptographic extension support");
3441 StubId stub_id = StubId::stubgen_cipherBlockChaining_encryptAESCrypt_id;
3442 int entry_count = StubInfo::entry_count(stub_id);
3443 assert(entry_count == 1, "sanity check");
3444 address start = load_archive_data(stub_id);
3445 if (start != nullptr) {
3446 return start;
3447 }
3448 __ align(CodeEntryAlignment);
3449 StubCodeMark mark(this, stub_id);
3450
3451 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3452
3453 const Register from = c_rarg0; // source array address
3454 const Register to = c_rarg1; // destination array address
3455 const Register key = c_rarg2; // key array address
3456 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3457 // and left with the results of the last encryption block
3458 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3459 const Register keylen = rscratch1;
3460
3461 start = __ pc();
3462
3463 __ enter();
3464
3465 __ movw(rscratch2, len_reg);
3466
3467 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3468
3469 __ ld1(v0, __ T16B, rvec);
3470
3471 __ cmpw(keylen, 52);
3472 __ br(Assembler::CC, L_loadkeys_44);
3473 __ br(Assembler::EQ, L_loadkeys_52);
3474
3475 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3476 __ rev32(v17, __ T16B, v17);
3477 __ rev32(v18, __ T16B, v18);
3478 __ BIND(L_loadkeys_52);
3479 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3480 __ rev32(v19, __ T16B, v19);
3481 __ rev32(v20, __ T16B, v20);
3482 __ BIND(L_loadkeys_44);
3483 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3484 __ rev32(v21, __ T16B, v21);
3485 __ rev32(v22, __ T16B, v22);
3486 __ rev32(v23, __ T16B, v23);
3487 __ rev32(v24, __ T16B, v24);
3488 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3489 __ rev32(v25, __ T16B, v25);
3490 __ rev32(v26, __ T16B, v26);
3491 __ rev32(v27, __ T16B, v27);
3492 __ rev32(v28, __ T16B, v28);
3493 __ ld1(v29, v30, v31, __ T16B, key);
3494 __ rev32(v29, __ T16B, v29);
3495 __ rev32(v30, __ T16B, v30);
3496 __ rev32(v31, __ T16B, v31);
3497
3498 __ BIND(L_aes_loop);
3499 __ ld1(v1, __ T16B, __ post(from, 16));
3500 __ eor(v0, __ T16B, v0, v1);
3501
3502 __ br(Assembler::CC, L_rounds_44);
3503 __ br(Assembler::EQ, L_rounds_52);
3504
3505 __ aese(v0, v17); __ aesmc(v0, v0);
3506 __ aese(v0, v18); __ aesmc(v0, v0);
3507 __ BIND(L_rounds_52);
3508 __ aese(v0, v19); __ aesmc(v0, v0);
3509 __ aese(v0, v20); __ aesmc(v0, v0);
3510 __ BIND(L_rounds_44);
3511 __ aese(v0, v21); __ aesmc(v0, v0);
3512 __ aese(v0, v22); __ aesmc(v0, v0);
3513 __ aese(v0, v23); __ aesmc(v0, v0);
3514 __ aese(v0, v24); __ aesmc(v0, v0);
3515 __ aese(v0, v25); __ aesmc(v0, v0);
3516 __ aese(v0, v26); __ aesmc(v0, v0);
3517 __ aese(v0, v27); __ aesmc(v0, v0);
3518 __ aese(v0, v28); __ aesmc(v0, v0);
3519 __ aese(v0, v29); __ aesmc(v0, v0);
3520 __ aese(v0, v30);
3521 __ eor(v0, __ T16B, v0, v31);
3522
3523 __ st1(v0, __ T16B, __ post(to, 16));
3524
3525 __ subw(len_reg, len_reg, 16);
3526 __ cbnzw(len_reg, L_aes_loop);
3527
3528 __ st1(v0, __ T16B, rvec);
3529
3530 __ mov(r0, rscratch2);
3531
3532 __ leave();
3533 __ ret(lr);
3534
3535 // record the stub entry and end
3536 store_archive_data(stub_id, start, __ pc());
3537
3538 return start;
3539 }
3540
3541 // Arguments:
3542 //
3543 // Inputs:
3544 // c_rarg0 - source byte array address
3545 // c_rarg1 - destination byte array address
3546 // c_rarg2 - sessionKd (key) in little endian int array
3547 // c_rarg3 - r vector byte array address
3548 // c_rarg4 - input length
3549 //
3550 // Output:
3551 // r0 - input length
3552 //
3553 address generate_cipherBlockChaining_decryptAESCrypt() {
3554 assert(UseAES, "need AES cryptographic extension support");
3555 StubId stub_id = StubId::stubgen_cipherBlockChaining_decryptAESCrypt_id;
3556 int entry_count = StubInfo::entry_count(stub_id);
3557 assert(entry_count == 1, "sanity check");
3558 address start = load_archive_data(stub_id);
3559 if (start != nullptr) {
3560 return start;
3561 }
3562 __ align(CodeEntryAlignment);
3563 StubCodeMark mark(this, stub_id);
3564
3565 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3566
3567 const Register from = c_rarg0; // source array address
3568 const Register to = c_rarg1; // destination array address
3569 const Register key = c_rarg2; // key array address
3570 const Register rvec = c_rarg3; // r byte array initialized from initvector array address
3571 // and left with the results of the last encryption block
3572 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
3573 const Register keylen = rscratch1;
3574
3575 start = __ pc();
3576
3577 __ enter();
3578
3579 __ movw(rscratch2, len_reg);
3580
3581 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3582
3583 __ ld1(v2, __ T16B, rvec);
3584
3585 __ ld1(v31, __ T16B, __ post(key, 16));
3586 __ rev32(v31, __ T16B, v31);
3587
3588 __ cmpw(keylen, 52);
3589 __ br(Assembler::CC, L_loadkeys_44);
3590 __ br(Assembler::EQ, L_loadkeys_52);
3591
3592 __ ld1(v17, v18, __ T16B, __ post(key, 32));
3593 __ rev32(v17, __ T16B, v17);
3594 __ rev32(v18, __ T16B, v18);
3595 __ BIND(L_loadkeys_52);
3596 __ ld1(v19, v20, __ T16B, __ post(key, 32));
3597 __ rev32(v19, __ T16B, v19);
3598 __ rev32(v20, __ T16B, v20);
3599 __ BIND(L_loadkeys_44);
3600 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3601 __ rev32(v21, __ T16B, v21);
3602 __ rev32(v22, __ T16B, v22);
3603 __ rev32(v23, __ T16B, v23);
3604 __ rev32(v24, __ T16B, v24);
3605 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3606 __ rev32(v25, __ T16B, v25);
3607 __ rev32(v26, __ T16B, v26);
3608 __ rev32(v27, __ T16B, v27);
3609 __ rev32(v28, __ T16B, v28);
3610 __ ld1(v29, v30, __ T16B, key);
3611 __ rev32(v29, __ T16B, v29);
3612 __ rev32(v30, __ T16B, v30);
3613
3614 __ BIND(L_aes_loop);
3615 __ ld1(v0, __ T16B, __ post(from, 16));
3616 __ orr(v1, __ T16B, v0, v0);
3617
3618 __ br(Assembler::CC, L_rounds_44);
3619 __ br(Assembler::EQ, L_rounds_52);
3620
3621 __ aesd(v0, v17); __ aesimc(v0, v0);
3622 __ aesd(v0, v18); __ aesimc(v0, v0);
3623 __ BIND(L_rounds_52);
3624 __ aesd(v0, v19); __ aesimc(v0, v0);
3625 __ aesd(v0, v20); __ aesimc(v0, v0);
3626 __ BIND(L_rounds_44);
3627 __ aesd(v0, v21); __ aesimc(v0, v0);
3628 __ aesd(v0, v22); __ aesimc(v0, v0);
3629 __ aesd(v0, v23); __ aesimc(v0, v0);
3630 __ aesd(v0, v24); __ aesimc(v0, v0);
3631 __ aesd(v0, v25); __ aesimc(v0, v0);
3632 __ aesd(v0, v26); __ aesimc(v0, v0);
3633 __ aesd(v0, v27); __ aesimc(v0, v0);
3634 __ aesd(v0, v28); __ aesimc(v0, v0);
3635 __ aesd(v0, v29); __ aesimc(v0, v0);
3636 __ aesd(v0, v30);
3637 __ eor(v0, __ T16B, v0, v31);
3638 __ eor(v0, __ T16B, v0, v2);
3639
3640 __ st1(v0, __ T16B, __ post(to, 16));
3641 __ orr(v2, __ T16B, v1, v1);
3642
3643 __ subw(len_reg, len_reg, 16);
3644 __ cbnzw(len_reg, L_aes_loop);
3645
3646 __ st1(v2, __ T16B, rvec);
3647
3648 __ mov(r0, rscratch2);
3649
3650 __ leave();
3651 __ ret(lr);
3652
3653 // record the stub entry and end
3654 store_archive_data(stub_id, start, __ pc());
3655
3656 return start;
3657 }
3658
3659 // Big-endian 128-bit + 64-bit -> 128-bit addition.
3660 // Inputs: 128-bits. in is preserved unless it is the same register as result.
3661 // The least-significant 64-bit word is in the upper dword of each vector.
3662 // inc (the 64-bit increment) is preserved. Its lower dword must be zero.
3663 // Output: result
3664 void be_add_128_64(FloatRegister result, FloatRegister in,
3665 FloatRegister inc, FloatRegister tmp) {
3666 assert_different_registers(result, tmp, inc);
3667
3668 __ addv(result, __ T2D, in, inc); // Add inc to the least-significant dword of
3669 // input
3670 __ cm(__ HI, tmp, __ T2D, inc, result);// Check for result overflowing
3671 __ ext(tmp, __ T16B, tmp, tmp, 0x08); // Swap LSD of comparison result to MSD and
3672 // MSD == 0 (must be!) to LSD
3673 __ subv(result, __ T2D, result, tmp); // Subtract -1 from MSD if there was an overflow
3674 }
3675
3676 // CTR AES crypt.
3677 // Arguments:
3678 //
3679 // Inputs:
3680 // c_rarg0 - source byte array address
3681 // c_rarg1 - destination byte array address
3682 // c_rarg2 - sessionKe (key) in little endian int array
3683 // c_rarg3 - counter vector byte array address
3684 // c_rarg4 - input length
3685 // c_rarg5 - saved encryptedCounter start
3686 // c_rarg6 - saved used length
3687 //
3688 // Output:
3689 // r0 - input length
3690 //
3691 address generate_counterMode_AESCrypt() {
3692 StubId stub_id = StubId::stubgen_counterMode_AESCrypt_id;
3693 int entry_count = StubInfo::entry_count(stub_id);
3694 assert(entry_count == 1, "sanity check");
3695 address start = load_archive_data(stub_id);
3696 if (start != nullptr) {
3697 return start;
3698 }
3699 const Register in = c_rarg0;
3700 const Register out = c_rarg1;
3701 const Register key = c_rarg2;
3702 const Register counter = c_rarg3;
3703 const Register saved_len = c_rarg4, len = r10;
3704 const Register saved_encrypted_ctr = c_rarg5;
3705 const Register used_ptr = c_rarg6, used = r12;
3706
3707 const Register offset = r7;
3708 const Register keylen = r11;
3709
3710 const unsigned char block_size = 16;
3711 const int bulk_width = 4;
3712 // NB: bulk_width can be 4 or 8. 8 gives slightly faster
3713 // performance with larger data sizes, but it also means that the
3714 // fast path isn't used until you have at least 8 blocks, and up
3715 // to 127 bytes of data will be executed on the slow path. For
3716 // that reason, and also so as not to blow away too much icache, 4
3717 // blocks seems like a sensible compromise.
3718
3719 // Algorithm:
3720 //
3721 // if (len == 0) {
3722 // goto DONE;
3723 // }
3724 // int result = len;
3725 // do {
3726 // if (used >= blockSize) {
3727 // if (len >= bulk_width * blockSize) {
3728 // CTR_large_block();
3729 // if (len == 0)
3730 // goto DONE;
3731 // }
3732 // for (;;) {
3733 // 16ByteVector v0 = counter;
3734 // embeddedCipher.encryptBlock(v0, 0, encryptedCounter, 0);
3735 // used = 0;
3736 // if (len < blockSize)
3737 // break; /* goto NEXT */
3738 // 16ByteVector v1 = load16Bytes(in, offset);
3739 // v1 = v1 ^ encryptedCounter;
3740 // store16Bytes(out, offset);
3741 // used = blockSize;
3742 // offset += blockSize;
3743 // len -= blockSize;
3744 // if (len == 0)
3745 // goto DONE;
3746 // }
3747 // }
3748 // NEXT:
3749 // out[outOff++] = (byte)(in[inOff++] ^ encryptedCounter[used++]);
3750 // len--;
3751 // } while (len != 0);
3752 // DONE:
3753 // return result;
3754 //
3755 // CTR_large_block()
3756 // Wide bulk encryption of whole blocks.
3757
3758 __ align(CodeEntryAlignment);
3759 StubCodeMark mark(this, stub_id);
3760 start = __ pc();
3761 __ enter();
3762
3763 Label DONE, CTR_large_block, large_block_return;
3764 __ ldrw(used, Address(used_ptr));
3765 __ cbzw(saved_len, DONE);
3766
3767 __ mov(len, saved_len);
3768 __ mov(offset, 0);
3769
3770 // Compute #rounds for AES based on the length of the key array
3771 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3772
3773 __ aesenc_loadkeys(key, keylen);
3774
3775 {
3776 Label L_CTR_loop, NEXT;
3777
3778 __ bind(L_CTR_loop);
3779
3780 __ cmp(used, block_size);
3781 __ br(__ LO, NEXT);
3782
3783 // Maybe we have a lot of data
3784 __ subsw(rscratch1, len, bulk_width * block_size);
3785 __ br(__ HS, CTR_large_block);
3786 __ BIND(large_block_return);
3787 __ cbzw(len, DONE);
3788
3789 // Setup the counter
3790 __ movi(v4, __ T4S, 0);
3791 __ movi(v5, __ T4S, 1);
3792 __ ins(v4, __ S, v5, 2, 2); // v4 contains { 0, 1 }
3793
3794 // 128-bit big-endian increment
3795 __ ld1(v0, __ T16B, counter);
3796 __ rev64(v16, __ T16B, v0);
3797 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3798 __ rev64(v16, __ T16B, v16);
3799 __ st1(v16, __ T16B, counter);
3800 // Previous counter value is in v0
3801 // v4 contains { 0, 1 }
3802
3803 {
3804 // We have fewer than bulk_width blocks of data left. Encrypt
3805 // them one by one until there is less than a full block
3806 // remaining, being careful to save both the encrypted counter
3807 // and the counter.
3808
3809 Label inner_loop;
3810 __ bind(inner_loop);
3811 // Counter to encrypt is in v0
3812 __ aesecb_encrypt(noreg, noreg, keylen);
3813 __ st1(v0, __ T16B, saved_encrypted_ctr);
3814
3815 // Do we have a remaining full block?
3816
3817 __ mov(used, 0);
3818 __ cmp(len, block_size);
3819 __ br(__ LO, NEXT);
3820
3821 // Yes, we have a full block
3822 __ ldrq(v1, Address(in, offset));
3823 __ eor(v1, __ T16B, v1, v0);
3824 __ strq(v1, Address(out, offset));
3825 __ mov(used, block_size);
3826 __ add(offset, offset, block_size);
3827
3828 __ subw(len, len, block_size);
3829 __ cbzw(len, DONE);
3830
3831 // Increment the counter, store it back
3832 __ orr(v0, __ T16B, v16, v16);
3833 __ rev64(v16, __ T16B, v16);
3834 be_add_128_64(v16, v16, v4, /*tmp*/v5);
3835 __ rev64(v16, __ T16B, v16);
3836 __ st1(v16, __ T16B, counter); // Save the incremented counter back
3837
3838 __ b(inner_loop);
3839 }
3840
3841 __ BIND(NEXT);
3842
3843 // Encrypt a single byte, and loop.
3844 // We expect this to be a rare event.
3845 __ ldrb(rscratch1, Address(in, offset));
3846 __ ldrb(rscratch2, Address(saved_encrypted_ctr, used));
3847 __ eor(rscratch1, rscratch1, rscratch2);
3848 __ strb(rscratch1, Address(out, offset));
3849 __ add(offset, offset, 1);
3850 __ add(used, used, 1);
3851 __ subw(len, len,1);
3852 __ cbnzw(len, L_CTR_loop);
3853 }
3854
3855 __ bind(DONE);
3856 __ strw(used, Address(used_ptr));
3857 __ mov(r0, saved_len);
3858
3859 __ leave(); // required for proper stackwalking of RuntimeStub frame
3860 __ ret(lr);
3861
3862 // Bulk encryption
3863
3864 __ BIND (CTR_large_block);
3865 assert(bulk_width == 4 || bulk_width == 8, "must be");
3866
3867 if (bulk_width == 8) {
3868 __ sub(sp, sp, 4 * 16);
3869 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3870 }
3871 __ sub(sp, sp, 4 * 16);
3872 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3873 RegSet saved_regs = (RegSet::of(in, out, offset)
3874 + RegSet::of(saved_encrypted_ctr, used_ptr, len));
3875 __ push(saved_regs, sp);
3876 __ andr(len, len, -16 * bulk_width); // 8/4 encryptions, 16 bytes per encryption
3877 __ add(in, in, offset);
3878 __ add(out, out, offset);
3879
3880 // Keys should already be loaded into the correct registers
3881
3882 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
3883 __ rev64(v16, __ T16B, v0); // v16 contains byte-reversed counter
3884
3885 // AES/CTR loop
3886 {
3887 Label L_CTR_loop;
3888 __ BIND(L_CTR_loop);
3889
3890 // Setup the counters
3891 __ movi(v8, __ T4S, 0);
3892 __ movi(v9, __ T4S, 1);
3893 __ ins(v8, __ S, v9, 2, 2); // v8 contains { 0, 1 }
3894
3895 for (int i = 0; i < bulk_width; i++) {
3896 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3897 __ rev64(v0_ofs, __ T16B, v16);
3898 be_add_128_64(v16, v16, v8, /*tmp*/v9);
3899 }
3900
3901 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
3902
3903 // Encrypt the counters
3904 __ aesecb_encrypt(noreg, noreg, keylen, v0, bulk_width);
3905
3906 if (bulk_width == 8) {
3907 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
3908 }
3909
3910 // XOR the encrypted counters with the inputs
3911 for (int i = 0; i < bulk_width; i++) {
3912 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
3913 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
3914 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
3915 }
3916
3917 // Write the encrypted data
3918 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
3919 if (bulk_width == 8) {
3920 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
3921 }
3922
3923 __ subw(len, len, 16 * bulk_width);
3924 __ cbnzw(len, L_CTR_loop);
3925 }
3926
3927 // Save the counter back where it goes
3928 __ rev64(v16, __ T16B, v16);
3929 __ st1(v16, __ T16B, counter);
3930
3931 __ pop(saved_regs, sp);
3932
3933 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
3934 if (bulk_width == 8) {
3935 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
3936 }
3937
3938 __ andr(rscratch1, len, -16 * bulk_width);
3939 __ sub(len, len, rscratch1);
3940 __ add(offset, offset, rscratch1);
3941 __ mov(used, 16);
3942 __ strw(used, Address(used_ptr));
3943 __ b(large_block_return);
3944
3945 // record the stub entry and end
3946 store_archive_data(stub_id, start, __ pc());
3947
3948 return start;
3949 }
3950
3951 // Vector AES Galois Counter Mode implementation. Parameters:
3952 //
3953 // in = c_rarg0
3954 // len = c_rarg1
3955 // ct = c_rarg2 - ciphertext that ghash will read (in for encrypt, out for decrypt)
3956 // out = c_rarg3
3957 // key = c_rarg4
3958 // state = c_rarg5 - GHASH.state
3959 // subkeyHtbl = c_rarg6 - powers of H
3960 // counter = c_rarg7 - 16 bytes of CTR
3961 // return - number of processed bytes
3962 address generate_galoisCounterMode_AESCrypt() {
3963 Label ghash_polynomial; // local data generated after code
3964 StubId stub_id = StubId::stubgen_galoisCounterMode_AESCrypt_id;
3965 int entry_count = StubInfo::entry_count(stub_id);
3966 assert(entry_count == 1, "sanity check");
3967 address start = load_archive_data(stub_id);
3968 if (start != nullptr) {
3969 return start;
3970 }
3971 __ align(CodeEntryAlignment);
3972 StubCodeMark mark(this, stub_id);
3973 start = __ pc();
3974 __ enter();
3975
3976 const Register in = c_rarg0;
3977 const Register len = c_rarg1;
3978 const Register ct = c_rarg2;
3979 const Register out = c_rarg3;
3980 // and updated with the incremented counter in the end
3981
3982 const Register key = c_rarg4;
3983 const Register state = c_rarg5;
3984
3985 const Register subkeyHtbl = c_rarg6;
3986
3987 const Register counter = c_rarg7;
3988
3989 const Register keylen = r10;
3990 // Save state before entering routine
3991 __ sub(sp, sp, 4 * 16);
3992 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
3993 __ sub(sp, sp, 4 * 16);
3994 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
3995
3996 // __ andr(len, len, -512);
3997 __ andr(len, len, -16 * 8); // 8 encryptions, 16 bytes per encryption
3998 __ str(len, __ pre(sp, -2 * wordSize));
3999
4000 Label DONE;
4001 __ cbz(len, DONE);
4002
4003 // Compute #rounds for AES based on the length of the key array
4004 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4005
4006 __ aesenc_loadkeys(key, keylen);
4007 __ ld1(v0, __ T16B, counter); // v0 contains the first counter
4008 __ rev32(v16, __ T16B, v0); // v16 contains byte-reversed counter
4009
4010 // AES/CTR loop
4011 {
4012 Label L_CTR_loop;
4013 __ BIND(L_CTR_loop);
4014
4015 // Setup the counters
4016 __ movi(v8, __ T4S, 0);
4017 __ movi(v9, __ T4S, 1);
4018 __ ins(v8, __ S, v9, 3, 3); // v8 contains { 0, 0, 0, 1 }
4019
4020 assert(v0->encoding() < v8->encoding(), "");
4021 for (int i = v0->encoding(); i < v8->encoding(); i++) {
4022 FloatRegister f = as_FloatRegister(i);
4023 __ rev32(f, __ T16B, v16);
4024 __ addv(v16, __ T4S, v16, v8);
4025 }
4026
4027 __ ld1(v8, v9, v10, v11, __ T16B, __ post(in, 4 * 16));
4028
4029 // Encrypt the counters
4030 __ aesecb_encrypt(noreg, noreg, keylen, v0, /*unrolls*/8);
4031
4032 __ ld1(v12, v13, v14, v15, __ T16B, __ post(in, 4 * 16));
4033
4034 // XOR the encrypted counters with the inputs
4035 for (int i = 0; i < 8; i++) {
4036 FloatRegister v0_ofs = as_FloatRegister(v0->encoding() + i);
4037 FloatRegister v8_ofs = as_FloatRegister(v8->encoding() + i);
4038 __ eor(v0_ofs, __ T16B, v0_ofs, v8_ofs);
4039 }
4040 __ st1(v0, v1, v2, v3, __ T16B, __ post(out, 4 * 16));
4041 __ st1(v4, v5, v6, v7, __ T16B, __ post(out, 4 * 16));
4042
4043 __ subw(len, len, 16 * 8);
4044 __ cbnzw(len, L_CTR_loop);
4045 }
4046
4047 __ rev32(v16, __ T16B, v16);
4048 __ st1(v16, __ T16B, counter);
4049
4050 __ ldr(len, Address(sp));
4051 __ lsr(len, len, exact_log2(16)); // We want the count of blocks
4052
4053 // GHASH/CTR loop
4054 __ ghash_processBlocks_wide(ghash_polynomial, state, subkeyHtbl, ct,
4055 len, /*unrolls*/4);
4056
4057 #ifdef ASSERT
4058 { Label L;
4059 __ cmp(len, (unsigned char)0);
4060 __ br(Assembler::EQ, L);
4061 __ stop("stubGenerator: abort");
4062 __ bind(L);
4063 }
4064 #endif
4065
4066 __ bind(DONE);
4067 // Return the number of bytes processed
4068 __ ldr(r0, __ post(sp, 2 * wordSize));
4069
4070 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
4071 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
4072
4073 __ leave(); // required for proper stackwalking of RuntimeStub frame
4074 __ ret(lr);
4075
4076 // bind label and generate polynomial data
4077 __ align(wordSize * 2);
4078 __ bind(ghash_polynomial);
4079 __ emit_int64(0x87); // The low-order bits of the field
4080 // polynomial (i.e. p = z^7+z^2+z+1)
4081 // repeated in the low and high parts of a
4082 // 128-bit vector
4083 __ emit_int64(0x87);
4084
4085 // record the stub entry and end
4086 store_archive_data(stub_id, start, __ pc());
4087
4088 return start;
4089 }
4090
4091 class Cached64Bytes {
4092 private:
4093 MacroAssembler *_masm;
4094 Register _regs[8];
4095
4096 public:
4097 Cached64Bytes(MacroAssembler *masm, RegSet rs): _masm(masm) {
4098 assert(rs.size() == 8, "%u registers are used to cache 16 4-byte data", rs.size());
4099 auto it = rs.begin();
4100 for (auto &r: _regs) {
4101 r = *it;
4102 ++it;
4103 }
4104 }
4105
4106 void gen_loads(Register base) {
4107 for (int i = 0; i < 8; i += 2) {
4108 __ ldp(_regs[i], _regs[i + 1], Address(base, 8 * i));
4109 }
4110 }
4111
4112 // Generate code extracting i-th unsigned word (4 bytes) from cached 64 bytes.
4113 void extract_u32(Register dest, int i) {
4114 __ ubfx(dest, _regs[i / 2], 32 * (i % 2), 32);
4115 }
4116 };
4117
4118 // Utility routines for md5.
4119 // Clobbers r10, r11, rscratch1 and rscratch2.
4120 void md5_FF(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4121 int k, int s, int t) {
4122 Register rscratch3 = r10;
4123 Register rscratch4 = r11;
4124
4125 __ eorw(rscratch3, r3, r4);
4126 __ movw(rscratch2, t);
4127 __ andw(rscratch3, rscratch3, r2);
4128 __ addw(rscratch4, r1, rscratch2);
4129 reg_cache.extract_u32(rscratch1, k);
4130 __ eorw(rscratch3, rscratch3, r4);
4131 __ addw(rscratch4, rscratch4, rscratch1);
4132 __ addw(rscratch3, rscratch3, rscratch4);
4133 __ rorw(rscratch2, rscratch3, 32 - s);
4134 __ addw(r1, rscratch2, r2);
4135 }
4136
4137 void md5_GG(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4138 int k, int s, int t) {
4139 Register rscratch3 = r10;
4140 Register rscratch4 = r11;
4141
4142 reg_cache.extract_u32(rscratch1, k);
4143 __ movw(rscratch2, t);
4144 __ addw(rscratch4, r1, rscratch2);
4145 __ addw(rscratch4, rscratch4, rscratch1);
4146 __ bicw(rscratch2, r3, r4);
4147 __ andw(rscratch3, r2, r4);
4148 __ addw(rscratch2, rscratch2, rscratch4);
4149 __ addw(rscratch2, rscratch2, rscratch3);
4150 __ rorw(rscratch2, rscratch2, 32 - s);
4151 __ addw(r1, rscratch2, r2);
4152 }
4153
4154 void md5_HH(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4155 int k, int s, int t) {
4156 Register rscratch3 = r10;
4157 Register rscratch4 = r11;
4158
4159 __ eorw(rscratch3, r3, r4);
4160 __ movw(rscratch2, t);
4161 __ addw(rscratch4, r1, rscratch2);
4162 reg_cache.extract_u32(rscratch1, k);
4163 __ eorw(rscratch3, rscratch3, r2);
4164 __ addw(rscratch4, rscratch4, rscratch1);
4165 __ addw(rscratch3, rscratch3, rscratch4);
4166 __ rorw(rscratch2, rscratch3, 32 - s);
4167 __ addw(r1, rscratch2, r2);
4168 }
4169
4170 void md5_II(Cached64Bytes& reg_cache, Register r1, Register r2, Register r3, Register r4,
4171 int k, int s, int t) {
4172 Register rscratch3 = r10;
4173 Register rscratch4 = r11;
4174
4175 __ movw(rscratch3, t);
4176 __ ornw(rscratch2, r2, r4);
4177 __ addw(rscratch4, r1, rscratch3);
4178 reg_cache.extract_u32(rscratch1, k);
4179 __ eorw(rscratch3, rscratch2, r3);
4180 __ addw(rscratch4, rscratch4, rscratch1);
4181 __ addw(rscratch3, rscratch3, rscratch4);
4182 __ rorw(rscratch2, rscratch3, 32 - s);
4183 __ addw(r1, rscratch2, r2);
4184 }
4185
4186 // Arguments:
4187 //
4188 // Inputs:
4189 // c_rarg0 - byte[] source+offset
4190 // c_rarg1 - int[] MD5.state
4191 // c_rarg2 - int offset
4192 // c_rarg3 - int limit
4193 //
4194 address generate_md5_implCompress(StubId stub_id) {
4195 bool multi_block;
4196 switch (stub_id) {
4197 case StubId::stubgen_md5_implCompress_id:
4198 multi_block = false;
4199 break;
4200 case StubId::stubgen_md5_implCompressMB_id:
4201 multi_block = true;
4202 break;
4203 default:
4204 ShouldNotReachHere();
4205 }
4206 int entry_count = StubInfo::entry_count(stub_id);
4207 assert(entry_count == 1, "sanity check");
4208 address start = load_archive_data(stub_id);
4209 if (start != nullptr) {
4210 return start;
4211 }
4212 __ align(CodeEntryAlignment);
4213
4214 StubCodeMark mark(this, stub_id);
4215 start = __ pc();
4216
4217 Register buf = c_rarg0;
4218 Register state = c_rarg1;
4219 Register ofs = c_rarg2;
4220 Register limit = c_rarg3;
4221 Register a = r4;
4222 Register b = r5;
4223 Register c = r6;
4224 Register d = r7;
4225 Register rscratch3 = r10;
4226 Register rscratch4 = r11;
4227
4228 Register state_regs[2] = { r12, r13 };
4229 RegSet saved_regs = RegSet::range(r16, r22) - r18_tls;
4230 Cached64Bytes reg_cache(_masm, RegSet::of(r14, r15) + saved_regs); // using 8 registers
4231
4232 __ push(saved_regs, sp);
4233
4234 __ ldp(state_regs[0], state_regs[1], Address(state));
4235 __ ubfx(a, state_regs[0], 0, 32);
4236 __ ubfx(b, state_regs[0], 32, 32);
4237 __ ubfx(c, state_regs[1], 0, 32);
4238 __ ubfx(d, state_regs[1], 32, 32);
4239
4240 Label md5_loop;
4241 __ BIND(md5_loop);
4242
4243 reg_cache.gen_loads(buf);
4244
4245 // Round 1
4246 md5_FF(reg_cache, a, b, c, d, 0, 7, 0xd76aa478);
4247 md5_FF(reg_cache, d, a, b, c, 1, 12, 0xe8c7b756);
4248 md5_FF(reg_cache, c, d, a, b, 2, 17, 0x242070db);
4249 md5_FF(reg_cache, b, c, d, a, 3, 22, 0xc1bdceee);
4250 md5_FF(reg_cache, a, b, c, d, 4, 7, 0xf57c0faf);
4251 md5_FF(reg_cache, d, a, b, c, 5, 12, 0x4787c62a);
4252 md5_FF(reg_cache, c, d, a, b, 6, 17, 0xa8304613);
4253 md5_FF(reg_cache, b, c, d, a, 7, 22, 0xfd469501);
4254 md5_FF(reg_cache, a, b, c, d, 8, 7, 0x698098d8);
4255 md5_FF(reg_cache, d, a, b, c, 9, 12, 0x8b44f7af);
4256 md5_FF(reg_cache, c, d, a, b, 10, 17, 0xffff5bb1);
4257 md5_FF(reg_cache, b, c, d, a, 11, 22, 0x895cd7be);
4258 md5_FF(reg_cache, a, b, c, d, 12, 7, 0x6b901122);
4259 md5_FF(reg_cache, d, a, b, c, 13, 12, 0xfd987193);
4260 md5_FF(reg_cache, c, d, a, b, 14, 17, 0xa679438e);
4261 md5_FF(reg_cache, b, c, d, a, 15, 22, 0x49b40821);
4262
4263 // Round 2
4264 md5_GG(reg_cache, a, b, c, d, 1, 5, 0xf61e2562);
4265 md5_GG(reg_cache, d, a, b, c, 6, 9, 0xc040b340);
4266 md5_GG(reg_cache, c, d, a, b, 11, 14, 0x265e5a51);
4267 md5_GG(reg_cache, b, c, d, a, 0, 20, 0xe9b6c7aa);
4268 md5_GG(reg_cache, a, b, c, d, 5, 5, 0xd62f105d);
4269 md5_GG(reg_cache, d, a, b, c, 10, 9, 0x02441453);
4270 md5_GG(reg_cache, c, d, a, b, 15, 14, 0xd8a1e681);
4271 md5_GG(reg_cache, b, c, d, a, 4, 20, 0xe7d3fbc8);
4272 md5_GG(reg_cache, a, b, c, d, 9, 5, 0x21e1cde6);
4273 md5_GG(reg_cache, d, a, b, c, 14, 9, 0xc33707d6);
4274 md5_GG(reg_cache, c, d, a, b, 3, 14, 0xf4d50d87);
4275 md5_GG(reg_cache, b, c, d, a, 8, 20, 0x455a14ed);
4276 md5_GG(reg_cache, a, b, c, d, 13, 5, 0xa9e3e905);
4277 md5_GG(reg_cache, d, a, b, c, 2, 9, 0xfcefa3f8);
4278 md5_GG(reg_cache, c, d, a, b, 7, 14, 0x676f02d9);
4279 md5_GG(reg_cache, b, c, d, a, 12, 20, 0x8d2a4c8a);
4280
4281 // Round 3
4282 md5_HH(reg_cache, a, b, c, d, 5, 4, 0xfffa3942);
4283 md5_HH(reg_cache, d, a, b, c, 8, 11, 0x8771f681);
4284 md5_HH(reg_cache, c, d, a, b, 11, 16, 0x6d9d6122);
4285 md5_HH(reg_cache, b, c, d, a, 14, 23, 0xfde5380c);
4286 md5_HH(reg_cache, a, b, c, d, 1, 4, 0xa4beea44);
4287 md5_HH(reg_cache, d, a, b, c, 4, 11, 0x4bdecfa9);
4288 md5_HH(reg_cache, c, d, a, b, 7, 16, 0xf6bb4b60);
4289 md5_HH(reg_cache, b, c, d, a, 10, 23, 0xbebfbc70);
4290 md5_HH(reg_cache, a, b, c, d, 13, 4, 0x289b7ec6);
4291 md5_HH(reg_cache, d, a, b, c, 0, 11, 0xeaa127fa);
4292 md5_HH(reg_cache, c, d, a, b, 3, 16, 0xd4ef3085);
4293 md5_HH(reg_cache, b, c, d, a, 6, 23, 0x04881d05);
4294 md5_HH(reg_cache, a, b, c, d, 9, 4, 0xd9d4d039);
4295 md5_HH(reg_cache, d, a, b, c, 12, 11, 0xe6db99e5);
4296 md5_HH(reg_cache, c, d, a, b, 15, 16, 0x1fa27cf8);
4297 md5_HH(reg_cache, b, c, d, a, 2, 23, 0xc4ac5665);
4298
4299 // Round 4
4300 md5_II(reg_cache, a, b, c, d, 0, 6, 0xf4292244);
4301 md5_II(reg_cache, d, a, b, c, 7, 10, 0x432aff97);
4302 md5_II(reg_cache, c, d, a, b, 14, 15, 0xab9423a7);
4303 md5_II(reg_cache, b, c, d, a, 5, 21, 0xfc93a039);
4304 md5_II(reg_cache, a, b, c, d, 12, 6, 0x655b59c3);
4305 md5_II(reg_cache, d, a, b, c, 3, 10, 0x8f0ccc92);
4306 md5_II(reg_cache, c, d, a, b, 10, 15, 0xffeff47d);
4307 md5_II(reg_cache, b, c, d, a, 1, 21, 0x85845dd1);
4308 md5_II(reg_cache, a, b, c, d, 8, 6, 0x6fa87e4f);
4309 md5_II(reg_cache, d, a, b, c, 15, 10, 0xfe2ce6e0);
4310 md5_II(reg_cache, c, d, a, b, 6, 15, 0xa3014314);
4311 md5_II(reg_cache, b, c, d, a, 13, 21, 0x4e0811a1);
4312 md5_II(reg_cache, a, b, c, d, 4, 6, 0xf7537e82);
4313 md5_II(reg_cache, d, a, b, c, 11, 10, 0xbd3af235);
4314 md5_II(reg_cache, c, d, a, b, 2, 15, 0x2ad7d2bb);
4315 md5_II(reg_cache, b, c, d, a, 9, 21, 0xeb86d391);
4316
4317 __ addw(a, state_regs[0], a);
4318 __ ubfx(rscratch2, state_regs[0], 32, 32);
4319 __ addw(b, rscratch2, b);
4320 __ addw(c, state_regs[1], c);
4321 __ ubfx(rscratch4, state_regs[1], 32, 32);
4322 __ addw(d, rscratch4, d);
4323
4324 __ orr(state_regs[0], a, b, Assembler::LSL, 32);
4325 __ orr(state_regs[1], c, d, Assembler::LSL, 32);
4326
4327 if (multi_block) {
4328 __ add(buf, buf, 64);
4329 __ add(ofs, ofs, 64);
4330 __ cmp(ofs, limit);
4331 __ br(Assembler::LE, md5_loop);
4332 __ mov(c_rarg0, ofs); // return ofs
4333 }
4334
4335 // write hash values back in the correct order
4336 __ stp(state_regs[0], state_regs[1], Address(state));
4337
4338 __ pop(saved_regs, sp);
4339
4340 __ ret(lr);
4341
4342 // record the stub entry and end
4343 store_archive_data(stub_id, start, __ pc());
4344
4345 return start;
4346 }
4347
4348 // Arguments:
4349 //
4350 // Inputs:
4351 // c_rarg0 - byte[] source+offset
4352 // c_rarg1 - int[] SHA.state
4353 // c_rarg2 - int offset
4354 // c_rarg3 - int limit
4355 //
4356 address generate_sha1_implCompress(StubId stub_id) {
4357 bool multi_block;
4358 switch (stub_id) {
4359 case StubId::stubgen_sha1_implCompress_id:
4360 multi_block = false;
4361 break;
4362 case StubId::stubgen_sha1_implCompressMB_id:
4363 multi_block = true;
4364 break;
4365 default:
4366 ShouldNotReachHere();
4367 }
4368 int entry_count = StubInfo::entry_count(stub_id);
4369 assert(entry_count == 1, "sanity check");
4370 address start = load_archive_data(stub_id);
4371 if (start != nullptr) {
4372 return start;
4373 }
4374 __ align(CodeEntryAlignment);
4375
4376 StubCodeMark mark(this, stub_id);
4377 start = __ pc();
4378
4379 Register buf = c_rarg0;
4380 Register state = c_rarg1;
4381 Register ofs = c_rarg2;
4382 Register limit = c_rarg3;
4383
4384 Label keys;
4385 Label sha1_loop;
4386
4387 // load the keys into v0..v3
4388 __ adr(rscratch1, keys);
4389 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
4390 // load 5 words state into v6, v7
4391 __ ldrq(v6, Address(state, 0));
4392 __ ldrs(v7, Address(state, 16));
4393
4394
4395 __ BIND(sha1_loop);
4396 // load 64 bytes of data into v16..v19
4397 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
4398 __ rev32(v16, __ T16B, v16);
4399 __ rev32(v17, __ T16B, v17);
4400 __ rev32(v18, __ T16B, v18);
4401 __ rev32(v19, __ T16B, v19);
4402
4403 // do the sha1
4404 __ addv(v4, __ T4S, v16, v0);
4405 __ orr(v20, __ T16B, v6, v6);
4406
4407 FloatRegister d0 = v16;
4408 FloatRegister d1 = v17;
4409 FloatRegister d2 = v18;
4410 FloatRegister d3 = v19;
4411
4412 for (int round = 0; round < 20; round++) {
4413 FloatRegister tmp1 = (round & 1) ? v4 : v5;
4414 FloatRegister tmp2 = (round & 1) ? v21 : v22;
4415 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
4416 FloatRegister tmp4 = (round & 1) ? v5 : v4;
4417 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
4418
4419 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
4420 if (round < 19) __ addv(tmp1, __ T4S, d1, key);
4421 __ sha1h(tmp2, __ T4S, v20);
4422 if (round < 5)
4423 __ sha1c(v20, __ T4S, tmp3, tmp4);
4424 else if (round < 10 || round >= 15)
4425 __ sha1p(v20, __ T4S, tmp3, tmp4);
4426 else
4427 __ sha1m(v20, __ T4S, tmp3, tmp4);
4428 if (round < 16) __ sha1su1(d0, __ T4S, d3);
4429
4430 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4431 }
4432
4433 __ addv(v7, __ T2S, v7, v21);
4434 __ addv(v6, __ T4S, v6, v20);
4435
4436 if (multi_block) {
4437 __ add(ofs, ofs, 64);
4438 __ cmp(ofs, limit);
4439 __ br(Assembler::LE, sha1_loop);
4440 __ mov(c_rarg0, ofs); // return ofs
4441 }
4442
4443 __ strq(v6, Address(state, 0));
4444 __ strs(v7, Address(state, 16));
4445
4446 __ ret(lr);
4447
4448 __ bind(keys);
4449 __ emit_int32(0x5a827999);
4450 __ emit_int32(0x6ed9eba1);
4451 __ emit_int32(0x8f1bbcdc);
4452 __ emit_int32(0xca62c1d6);
4453
4454 // record the stub entry and end
4455 store_archive_data(stub_id, start, __ pc());
4456
4457 return start;
4458 }
4459
4460
4461 // Arguments:
4462 //
4463 // Inputs:
4464 // c_rarg0 - byte[] source+offset
4465 // c_rarg1 - int[] SHA.state
4466 // c_rarg2 - int offset
4467 // c_rarg3 - int limit
4468 //
4469 address generate_sha256_implCompress(StubId stub_id) {
4470 bool multi_block;
4471 switch (stub_id) {
4472 case StubId::stubgen_sha256_implCompress_id:
4473 multi_block = false;
4474 break;
4475 case StubId::stubgen_sha256_implCompressMB_id:
4476 multi_block = true;
4477 break;
4478 default:
4479 ShouldNotReachHere();
4480 }
4481 int entry_count = StubInfo::entry_count(stub_id);
4482 assert(entry_count == 1, "sanity check");
4483 address start = load_archive_data(stub_id);
4484 if (start != nullptr) {
4485 return start;
4486 }
4487 __ align(CodeEntryAlignment);
4488 StubCodeMark mark(this, stub_id);
4489 start = __ pc();
4490
4491 Register buf = c_rarg0;
4492 Register state = c_rarg1;
4493 Register ofs = c_rarg2;
4494 Register limit = c_rarg3;
4495
4496 Label sha1_loop;
4497
4498 __ stpd(v8, v9, __ pre(sp, -32));
4499 __ stpd(v10, v11, Address(sp, 16));
4500
4501 // dga == v0
4502 // dgb == v1
4503 // dg0 == v2
4504 // dg1 == v3
4505 // dg2 == v4
4506 // t0 == v6
4507 // t1 == v7
4508
4509 // load 16 keys to v16..v31
4510 __ lea(rscratch1, ExternalAddress((address)_sha256_round_consts));
4511 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
4512 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
4513 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
4514 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
4515
4516 // load 8 words (256 bits) state
4517 __ ldpq(v0, v1, state);
4518
4519 __ BIND(sha1_loop);
4520 // load 64 bytes of data into v8..v11
4521 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
4522 __ rev32(v8, __ T16B, v8);
4523 __ rev32(v9, __ T16B, v9);
4524 __ rev32(v10, __ T16B, v10);
4525 __ rev32(v11, __ T16B, v11);
4526
4527 __ addv(v6, __ T4S, v8, v16);
4528 __ orr(v2, __ T16B, v0, v0);
4529 __ orr(v3, __ T16B, v1, v1);
4530
4531 FloatRegister d0 = v8;
4532 FloatRegister d1 = v9;
4533 FloatRegister d2 = v10;
4534 FloatRegister d3 = v11;
4535
4536
4537 for (int round = 0; round < 16; round++) {
4538 FloatRegister tmp1 = (round & 1) ? v6 : v7;
4539 FloatRegister tmp2 = (round & 1) ? v7 : v6;
4540 FloatRegister tmp3 = (round & 1) ? v2 : v4;
4541 FloatRegister tmp4 = (round & 1) ? v4 : v2;
4542
4543 if (round < 12) __ sha256su0(d0, __ T4S, d1);
4544 __ orr(v4, __ T16B, v2, v2);
4545 if (round < 15)
4546 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
4547 __ sha256h(v2, __ T4S, v3, tmp2);
4548 __ sha256h2(v3, __ T4S, v4, tmp2);
4549 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
4550
4551 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
4552 }
4553
4554 __ addv(v0, __ T4S, v0, v2);
4555 __ addv(v1, __ T4S, v1, v3);
4556
4557 if (multi_block) {
4558 __ add(ofs, ofs, 64);
4559 __ cmp(ofs, limit);
4560 __ br(Assembler::LE, sha1_loop);
4561 __ mov(c_rarg0, ofs); // return ofs
4562 }
4563
4564 __ ldpd(v10, v11, Address(sp, 16));
4565 __ ldpd(v8, v9, __ post(sp, 32));
4566
4567 __ stpq(v0, v1, state);
4568
4569 __ ret(lr);
4570
4571 // record the stub entry and end
4572 store_archive_data(stub_id, start, __ pc());
4573
4574 return start;
4575 }
4576
4577 // Double rounds for sha512.
4578 void sha512_dround(int dr,
4579 FloatRegister vi0, FloatRegister vi1,
4580 FloatRegister vi2, FloatRegister vi3,
4581 FloatRegister vi4, FloatRegister vrc0,
4582 FloatRegister vrc1, FloatRegister vin0,
4583 FloatRegister vin1, FloatRegister vin2,
4584 FloatRegister vin3, FloatRegister vin4) {
4585 if (dr < 36) {
4586 __ ld1(vrc1, __ T2D, __ post(rscratch2, 16));
4587 }
4588 __ addv(v5, __ T2D, vrc0, vin0);
4589 __ ext(v6, __ T16B, vi2, vi3, 8);
4590 __ ext(v5, __ T16B, v5, v5, 8);
4591 __ ext(v7, __ T16B, vi1, vi2, 8);
4592 __ addv(vi3, __ T2D, vi3, v5);
4593 if (dr < 32) {
4594 __ ext(v5, __ T16B, vin3, vin4, 8);
4595 __ sha512su0(vin0, __ T2D, vin1);
4596 }
4597 __ sha512h(vi3, __ T2D, v6, v7);
4598 if (dr < 32) {
4599 __ sha512su1(vin0, __ T2D, vin2, v5);
4600 }
4601 __ addv(vi4, __ T2D, vi1, vi3);
4602 __ sha512h2(vi3, __ T2D, vi1, vi0);
4603 }
4604
4605 // Arguments:
4606 //
4607 // Inputs:
4608 // c_rarg0 - byte[] source+offset
4609 // c_rarg1 - int[] SHA.state
4610 // c_rarg2 - int offset
4611 // c_rarg3 - int limit
4612 //
4613 address generate_sha512_implCompress(StubId stub_id) {
4614 bool multi_block;
4615 switch (stub_id) {
4616 case StubId::stubgen_sha512_implCompress_id:
4617 multi_block = false;
4618 break;
4619 case StubId::stubgen_sha512_implCompressMB_id:
4620 multi_block = true;
4621 break;
4622 default:
4623 ShouldNotReachHere();
4624 }
4625 int entry_count = StubInfo::entry_count(stub_id);
4626 assert(entry_count == 1, "sanity check");
4627 address start = load_archive_data(stub_id);
4628 if (start != nullptr) {
4629 return start;
4630 }
4631 __ align(CodeEntryAlignment);
4632 StubCodeMark mark(this, stub_id);
4633 start = __ pc();
4634
4635 Register buf = c_rarg0;
4636 Register state = c_rarg1;
4637 Register ofs = c_rarg2;
4638 Register limit = c_rarg3;
4639
4640 __ stpd(v8, v9, __ pre(sp, -64));
4641 __ stpd(v10, v11, Address(sp, 16));
4642 __ stpd(v12, v13, Address(sp, 32));
4643 __ stpd(v14, v15, Address(sp, 48));
4644
4645 Label sha512_loop;
4646
4647 // load state
4648 __ ld1(v8, v9, v10, v11, __ T2D, state);
4649
4650 // load first 4 round constants
4651 __ lea(rscratch1, ExternalAddress((address)_sha512_round_consts));
4652 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64));
4653
4654 __ BIND(sha512_loop);
4655 // load 128B of data into v12..v19
4656 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64));
4657 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64));
4658 __ rev64(v12, __ T16B, v12);
4659 __ rev64(v13, __ T16B, v13);
4660 __ rev64(v14, __ T16B, v14);
4661 __ rev64(v15, __ T16B, v15);
4662 __ rev64(v16, __ T16B, v16);
4663 __ rev64(v17, __ T16B, v17);
4664 __ rev64(v18, __ T16B, v18);
4665 __ rev64(v19, __ T16B, v19);
4666
4667 __ mov(rscratch2, rscratch1);
4668
4669 __ mov(v0, __ T16B, v8);
4670 __ mov(v1, __ T16B, v9);
4671 __ mov(v2, __ T16B, v10);
4672 __ mov(v3, __ T16B, v11);
4673
4674 sha512_dround( 0, v0, v1, v2, v3, v4, v20, v24, v12, v13, v19, v16, v17);
4675 sha512_dround( 1, v3, v0, v4, v2, v1, v21, v25, v13, v14, v12, v17, v18);
4676 sha512_dround( 2, v2, v3, v1, v4, v0, v22, v26, v14, v15, v13, v18, v19);
4677 sha512_dround( 3, v4, v2, v0, v1, v3, v23, v27, v15, v16, v14, v19, v12);
4678 sha512_dround( 4, v1, v4, v3, v0, v2, v24, v28, v16, v17, v15, v12, v13);
4679 sha512_dround( 5, v0, v1, v2, v3, v4, v25, v29, v17, v18, v16, v13, v14);
4680 sha512_dround( 6, v3, v0, v4, v2, v1, v26, v30, v18, v19, v17, v14, v15);
4681 sha512_dround( 7, v2, v3, v1, v4, v0, v27, v31, v19, v12, v18, v15, v16);
4682 sha512_dround( 8, v4, v2, v0, v1, v3, v28, v24, v12, v13, v19, v16, v17);
4683 sha512_dround( 9, v1, v4, v3, v0, v2, v29, v25, v13, v14, v12, v17, v18);
4684 sha512_dround(10, v0, v1, v2, v3, v4, v30, v26, v14, v15, v13, v18, v19);
4685 sha512_dround(11, v3, v0, v4, v2, v1, v31, v27, v15, v16, v14, v19, v12);
4686 sha512_dround(12, v2, v3, v1, v4, v0, v24, v28, v16, v17, v15, v12, v13);
4687 sha512_dround(13, v4, v2, v0, v1, v3, v25, v29, v17, v18, v16, v13, v14);
4688 sha512_dround(14, v1, v4, v3, v0, v2, v26, v30, v18, v19, v17, v14, v15);
4689 sha512_dround(15, v0, v1, v2, v3, v4, v27, v31, v19, v12, v18, v15, v16);
4690 sha512_dround(16, v3, v0, v4, v2, v1, v28, v24, v12, v13, v19, v16, v17);
4691 sha512_dround(17, v2, v3, v1, v4, v0, v29, v25, v13, v14, v12, v17, v18);
4692 sha512_dround(18, v4, v2, v0, v1, v3, v30, v26, v14, v15, v13, v18, v19);
4693 sha512_dround(19, v1, v4, v3, v0, v2, v31, v27, v15, v16, v14, v19, v12);
4694 sha512_dround(20, v0, v1, v2, v3, v4, v24, v28, v16, v17, v15, v12, v13);
4695 sha512_dround(21, v3, v0, v4, v2, v1, v25, v29, v17, v18, v16, v13, v14);
4696 sha512_dround(22, v2, v3, v1, v4, v0, v26, v30, v18, v19, v17, v14, v15);
4697 sha512_dround(23, v4, v2, v0, v1, v3, v27, v31, v19, v12, v18, v15, v16);
4698 sha512_dround(24, v1, v4, v3, v0, v2, v28, v24, v12, v13, v19, v16, v17);
4699 sha512_dround(25, v0, v1, v2, v3, v4, v29, v25, v13, v14, v12, v17, v18);
4700 sha512_dround(26, v3, v0, v4, v2, v1, v30, v26, v14, v15, v13, v18, v19);
4701 sha512_dround(27, v2, v3, v1, v4, v0, v31, v27, v15, v16, v14, v19, v12);
4702 sha512_dround(28, v4, v2, v0, v1, v3, v24, v28, v16, v17, v15, v12, v13);
4703 sha512_dround(29, v1, v4, v3, v0, v2, v25, v29, v17, v18, v16, v13, v14);
4704 sha512_dround(30, v0, v1, v2, v3, v4, v26, v30, v18, v19, v17, v14, v15);
4705 sha512_dround(31, v3, v0, v4, v2, v1, v27, v31, v19, v12, v18, v15, v16);
4706 sha512_dround(32, v2, v3, v1, v4, v0, v28, v24, v12, v0, v0, v0, v0);
4707 sha512_dround(33, v4, v2, v0, v1, v3, v29, v25, v13, v0, v0, v0, v0);
4708 sha512_dround(34, v1, v4, v3, v0, v2, v30, v26, v14, v0, v0, v0, v0);
4709 sha512_dround(35, v0, v1, v2, v3, v4, v31, v27, v15, v0, v0, v0, v0);
4710 sha512_dround(36, v3, v0, v4, v2, v1, v24, v0, v16, v0, v0, v0, v0);
4711 sha512_dround(37, v2, v3, v1, v4, v0, v25, v0, v17, v0, v0, v0, v0);
4712 sha512_dround(38, v4, v2, v0, v1, v3, v26, v0, v18, v0, v0, v0, v0);
4713 sha512_dround(39, v1, v4, v3, v0, v2, v27, v0, v19, v0, v0, v0, v0);
4714
4715 __ addv(v8, __ T2D, v8, v0);
4716 __ addv(v9, __ T2D, v9, v1);
4717 __ addv(v10, __ T2D, v10, v2);
4718 __ addv(v11, __ T2D, v11, v3);
4719
4720 if (multi_block) {
4721 __ add(ofs, ofs, 128);
4722 __ cmp(ofs, limit);
4723 __ br(Assembler::LE, sha512_loop);
4724 __ mov(c_rarg0, ofs); // return ofs
4725 }
4726
4727 __ st1(v8, v9, v10, v11, __ T2D, state);
4728
4729 __ ldpd(v14, v15, Address(sp, 48));
4730 __ ldpd(v12, v13, Address(sp, 32));
4731 __ ldpd(v10, v11, Address(sp, 16));
4732 __ ldpd(v8, v9, __ post(sp, 64));
4733
4734 __ ret(lr);
4735
4736 // record the stub entry and end
4737 store_archive_data(stub_id, start, __ pc());
4738
4739 return start;
4740 }
4741
4742 // Execute one round of keccak of two computations in parallel.
4743 // One of the states should be loaded into the lower halves of
4744 // the vector registers v0-v24, the other should be loaded into
4745 // the upper halves of those registers. The ld1r instruction loads
4746 // the round constant into both halves of register v31.
4747 // Intermediate results c0...c5 and d0...d5 are computed
4748 // in registers v25...v30.
4749 // All vector instructions that are used operate on both register
4750 // halves in parallel.
4751 // If only a single computation is needed, one can only load the lower halves.
4752 void keccak_round(Register rscratch1) {
4753 __ eor3(v29, __ T16B, v4, v9, v14); // c4 = a4 ^ a9 ^ a14
4754 __ eor3(v26, __ T16B, v1, v6, v11); // c1 = a1 ^ a16 ^ a11
4755 __ eor3(v28, __ T16B, v3, v8, v13); // c3 = a3 ^ a8 ^a13
4756 __ eor3(v25, __ T16B, v0, v5, v10); // c0 = a0 ^ a5 ^ a10
4757 __ eor3(v27, __ T16B, v2, v7, v12); // c2 = a2 ^ a7 ^ a12
4758 __ eor3(v29, __ T16B, v29, v19, v24); // c4 ^= a19 ^ a24
4759 __ eor3(v26, __ T16B, v26, v16, v21); // c1 ^= a16 ^ a21
4760 __ eor3(v28, __ T16B, v28, v18, v23); // c3 ^= a18 ^ a23
4761 __ eor3(v25, __ T16B, v25, v15, v20); // c0 ^= a15 ^ a20
4762 __ eor3(v27, __ T16B, v27, v17, v22); // c2 ^= a17 ^ a22
4763
4764 __ rax1(v30, __ T2D, v29, v26); // d0 = c4 ^ rol(c1, 1)
4765 __ rax1(v26, __ T2D, v26, v28); // d2 = c1 ^ rol(c3, 1)
4766 __ rax1(v28, __ T2D, v28, v25); // d4 = c3 ^ rol(c0, 1)
4767 __ rax1(v25, __ T2D, v25, v27); // d1 = c0 ^ rol(c2, 1)
4768 __ rax1(v27, __ T2D, v27, v29); // d3 = c2 ^ rol(c4, 1)
4769
4770 __ eor(v0, __ T16B, v0, v30); // a0 = a0 ^ d0
4771 __ xar(v29, __ T2D, v1, v25, (64 - 1)); // a10' = rol((a1^d1), 1)
4772 __ xar(v1, __ T2D, v6, v25, (64 - 44)); // a1 = rol(a6^d1), 44)
4773 __ xar(v6, __ T2D, v9, v28, (64 - 20)); // a6 = rol((a9^d4), 20)
4774 __ xar(v9, __ T2D, v22, v26, (64 - 61)); // a9 = rol((a22^d2), 61)
4775 __ xar(v22, __ T2D, v14, v28, (64 - 39)); // a22 = rol((a14^d4), 39)
4776 __ xar(v14, __ T2D, v20, v30, (64 - 18)); // a14 = rol((a20^d0), 18)
4777 __ xar(v31, __ T2D, v2, v26, (64 - 62)); // a20' = rol((a2^d2), 62)
4778 __ xar(v2, __ T2D, v12, v26, (64 - 43)); // a2 = rol((a12^d2), 43)
4779 __ xar(v12, __ T2D, v13, v27, (64 - 25)); // a12 = rol((a13^d3), 25)
4780 __ xar(v13, __ T2D, v19, v28, (64 - 8)); // a13 = rol((a19^d4), 8)
4781 __ xar(v19, __ T2D, v23, v27, (64 - 56)); // a19 = rol((a23^d3), 56)
4782 __ xar(v23, __ T2D, v15, v30, (64 - 41)); // a23 = rol((a15^d0), 41)
4783 __ xar(v15, __ T2D, v4, v28, (64 - 27)); // a15 = rol((a4^d4), 27)
4784 __ xar(v28, __ T2D, v24, v28, (64 - 14)); // a4' = rol((a24^d4), 14)
4785 __ xar(v24, __ T2D, v21, v25, (64 - 2)); // a24 = rol((a21^d1), 2)
4786 __ xar(v8, __ T2D, v8, v27, (64 - 55)); // a21' = rol((a8^d3), 55)
4787 __ xar(v4, __ T2D, v16, v25, (64 - 45)); // a8' = rol((a16^d1), 45)
4788 __ xar(v16, __ T2D, v5, v30, (64 - 36)); // a16 = rol((a5^d0), 36)
4789 __ xar(v5, __ T2D, v3, v27, (64 - 28)); // a5 = rol((a3^d3), 28)
4790 __ xar(v27, __ T2D, v18, v27, (64 - 21)); // a3' = rol((a18^d3), 21)
4791 __ xar(v3, __ T2D, v17, v26, (64 - 15)); // a18' = rol((a17^d2), 15)
4792 __ xar(v25, __ T2D, v11, v25, (64 - 10)); // a17' = rol((a11^d1), 10)
4793 __ xar(v26, __ T2D, v7, v26, (64 - 6)); // a11' = rol((a7^d2), 6)
4794 __ xar(v30, __ T2D, v10, v30, (64 - 3)); // a7' = rol((a10^d0), 3)
4795
4796 __ bcax(v20, __ T16B, v31, v22, v8); // a20 = a20' ^ (~a21 & a22')
4797 __ bcax(v21, __ T16B, v8, v23, v22); // a21 = a21' ^ (~a22 & a23)
4798 __ bcax(v22, __ T16B, v22, v24, v23); // a22 = a22 ^ (~a23 & a24)
4799 __ bcax(v23, __ T16B, v23, v31, v24); // a23 = a23 ^ (~a24 & a20')
4800 __ bcax(v24, __ T16B, v24, v8, v31); // a24 = a24 ^ (~a20' & a21')
4801
4802 __ ld1r(v31, __ T2D, __ post(rscratch1, 8)); // rc = round_constants[i]
4803
4804 __ bcax(v17, __ T16B, v25, v19, v3); // a17 = a17' ^ (~a18' & a19)
4805 __ bcax(v18, __ T16B, v3, v15, v19); // a18 = a18' ^ (~a19 & a15')
4806 __ bcax(v19, __ T16B, v19, v16, v15); // a19 = a19 ^ (~a15 & a16)
4807 __ bcax(v15, __ T16B, v15, v25, v16); // a15 = a15 ^ (~a16 & a17')
4808 __ bcax(v16, __ T16B, v16, v3, v25); // a16 = a16 ^ (~a17' & a18')
4809
4810 __ bcax(v10, __ T16B, v29, v12, v26); // a10 = a10' ^ (~a11' & a12)
4811 __ bcax(v11, __ T16B, v26, v13, v12); // a11 = a11' ^ (~a12 & a13)
4812 __ bcax(v12, __ T16B, v12, v14, v13); // a12 = a12 ^ (~a13 & a14)
4813 __ bcax(v13, __ T16B, v13, v29, v14); // a13 = a13 ^ (~a14 & a10')
4814 __ bcax(v14, __ T16B, v14, v26, v29); // a14 = a14 ^ (~a10' & a11')
4815
4816 __ bcax(v7, __ T16B, v30, v9, v4); // a7 = a7' ^ (~a8' & a9)
4817 __ bcax(v8, __ T16B, v4, v5, v9); // a8 = a8' ^ (~a9 & a5)
4818 __ bcax(v9, __ T16B, v9, v6, v5); // a9 = a9 ^ (~a5 & a6)
4819 __ bcax(v5, __ T16B, v5, v30, v6); // a5 = a5 ^ (~a6 & a7)
4820 __ bcax(v6, __ T16B, v6, v4, v30); // a6 = a6 ^ (~a7 & a8')
4821
4822 __ bcax(v3, __ T16B, v27, v0, v28); // a3 = a3' ^ (~a4' & a0)
4823 __ bcax(v4, __ T16B, v28, v1, v0); // a4 = a4' ^ (~a0 & a1)
4824 __ bcax(v0, __ T16B, v0, v2, v1); // a0 = a0 ^ (~a1 & a2)
4825 __ bcax(v1, __ T16B, v1, v27, v2); // a1 = a1 ^ (~a2 & a3)
4826 __ bcax(v2, __ T16B, v2, v28, v27); // a2 = a2 ^ (~a3 & a4')
4827
4828 __ eor(v0, __ T16B, v0, v31); // a0 = a0 ^ rc
4829 }
4830
4831 // Arguments:
4832 //
4833 // Inputs:
4834 // c_rarg0 - byte[] source+offset
4835 // c_rarg1 - byte[] SHA.state
4836 // c_rarg2 - int block_size
4837 // c_rarg3 - int offset
4838 // c_rarg4 - int limit
4839 //
4840 address generate_sha3_implCompress(StubId stub_id) {
4841 bool multi_block;
4842 switch (stub_id) {
4843 case StubId::stubgen_sha3_implCompress_id:
4844 multi_block = false;
4845 break;
4846 case StubId::stubgen_sha3_implCompressMB_id:
4847 multi_block = true;
4848 break;
4849 default:
4850 ShouldNotReachHere();
4851 }
4852 int entry_count = StubInfo::entry_count(stub_id);
4853 assert(entry_count == 1, "sanity check");
4854 address start = load_archive_data(stub_id);
4855 if (start != nullptr) {
4856 return start;
4857 }
4858 __ align(CodeEntryAlignment);
4859 StubCodeMark mark(this, stub_id);
4860 start = __ pc();
4861
4862 Register buf = c_rarg0;
4863 Register state = c_rarg1;
4864 Register block_size = c_rarg2;
4865 Register ofs = c_rarg3;
4866 Register limit = c_rarg4;
4867
4868 Label sha3_loop, rounds24_loop;
4869 Label sha3_512_or_sha3_384, shake128;
4870
4871 __ stpd(v8, v9, __ pre(sp, -64));
4872 __ stpd(v10, v11, Address(sp, 16));
4873 __ stpd(v12, v13, Address(sp, 32));
4874 __ stpd(v14, v15, Address(sp, 48));
4875
4876 // load state
4877 __ add(rscratch1, state, 32);
4878 __ ld1(v0, v1, v2, v3, __ T1D, state);
4879 __ ld1(v4, v5, v6, v7, __ T1D, __ post(rscratch1, 32));
4880 __ ld1(v8, v9, v10, v11, __ T1D, __ post(rscratch1, 32));
4881 __ ld1(v12, v13, v14, v15, __ T1D, __ post(rscratch1, 32));
4882 __ ld1(v16, v17, v18, v19, __ T1D, __ post(rscratch1, 32));
4883 __ ld1(v20, v21, v22, v23, __ T1D, __ post(rscratch1, 32));
4884 __ ld1(v24, __ T1D, rscratch1);
4885
4886 __ BIND(sha3_loop);
4887
4888 // 24 keccak rounds
4889 __ movw(rscratch2, 24);
4890
4891 // load round_constants base
4892 __ lea(rscratch1, ExternalAddress((address) _sha3_round_consts));
4893
4894 // load input
4895 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4896 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4897 __ eor(v0, __ T8B, v0, v25);
4898 __ eor(v1, __ T8B, v1, v26);
4899 __ eor(v2, __ T8B, v2, v27);
4900 __ eor(v3, __ T8B, v3, v28);
4901 __ eor(v4, __ T8B, v4, v29);
4902 __ eor(v5, __ T8B, v5, v30);
4903 __ eor(v6, __ T8B, v6, v31);
4904
4905 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
4906 __ tbz(block_size, 7, sha3_512_or_sha3_384);
4907
4908 __ ld1(v25, v26, v27, v28, __ T8B, __ post(buf, 32));
4909 __ ld1(v29, v30, v31, __ T8B, __ post(buf, 24));
4910 __ eor(v7, __ T8B, v7, v25);
4911 __ eor(v8, __ T8B, v8, v26);
4912 __ eor(v9, __ T8B, v9, v27);
4913 __ eor(v10, __ T8B, v10, v28);
4914 __ eor(v11, __ T8B, v11, v29);
4915 __ eor(v12, __ T8B, v12, v30);
4916 __ eor(v13, __ T8B, v13, v31);
4917
4918 __ ld1(v25, v26, v27, __ T8B, __ post(buf, 24));
4919 __ eor(v14, __ T8B, v14, v25);
4920 __ eor(v15, __ T8B, v15, v26);
4921 __ eor(v16, __ T8B, v16, v27);
4922
4923 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
4924 __ andw(c_rarg5, block_size, 48);
4925 __ cbzw(c_rarg5, rounds24_loop);
4926
4927 __ tbnz(block_size, 5, shake128);
4928 // block_size == 144, bit5 == 0, SHA3-224
4929 __ ldrd(v28, __ post(buf, 8));
4930 __ eor(v17, __ T8B, v17, v28);
4931 __ b(rounds24_loop);
4932
4933 __ BIND(shake128);
4934 __ ld1(v28, v29, v30, v31, __ T8B, __ post(buf, 32));
4935 __ eor(v17, __ T8B, v17, v28);
4936 __ eor(v18, __ T8B, v18, v29);
4937 __ eor(v19, __ T8B, v19, v30);
4938 __ eor(v20, __ T8B, v20, v31);
4939 __ b(rounds24_loop); // block_size == 168, SHAKE128
4940
4941 __ BIND(sha3_512_or_sha3_384);
4942 __ ld1(v25, v26, __ T8B, __ post(buf, 16));
4943 __ eor(v7, __ T8B, v7, v25);
4944 __ eor(v8, __ T8B, v8, v26);
4945 __ tbz(block_size, 5, rounds24_loop); // SHA3-512
4946
4947 // SHA3-384
4948 __ ld1(v27, v28, v29, v30, __ T8B, __ post(buf, 32));
4949 __ eor(v9, __ T8B, v9, v27);
4950 __ eor(v10, __ T8B, v10, v28);
4951 __ eor(v11, __ T8B, v11, v29);
4952 __ eor(v12, __ T8B, v12, v30);
4953
4954 __ BIND(rounds24_loop);
4955 __ subw(rscratch2, rscratch2, 1);
4956
4957 keccak_round(rscratch1);
4958
4959 __ cbnzw(rscratch2, rounds24_loop);
4960
4961 if (multi_block) {
4962 __ add(ofs, ofs, block_size);
4963 __ cmp(ofs, limit);
4964 __ br(Assembler::LE, sha3_loop);
4965 __ mov(c_rarg0, ofs); // return ofs
4966 }
4967
4968 __ st1(v0, v1, v2, v3, __ T1D, __ post(state, 32));
4969 __ st1(v4, v5, v6, v7, __ T1D, __ post(state, 32));
4970 __ st1(v8, v9, v10, v11, __ T1D, __ post(state, 32));
4971 __ st1(v12, v13, v14, v15, __ T1D, __ post(state, 32));
4972 __ st1(v16, v17, v18, v19, __ T1D, __ post(state, 32));
4973 __ st1(v20, v21, v22, v23, __ T1D, __ post(state, 32));
4974 __ st1(v24, __ T1D, state);
4975
4976 // restore callee-saved registers
4977 __ ldpd(v14, v15, Address(sp, 48));
4978 __ ldpd(v12, v13, Address(sp, 32));
4979 __ ldpd(v10, v11, Address(sp, 16));
4980 __ ldpd(v8, v9, __ post(sp, 64));
4981
4982 __ ret(lr);
4983
4984 // record the stub entry and end
4985 store_archive_data(stub_id, start, __ pc());
4986
4987 return start;
4988 }
4989
4990 // Inputs:
4991 // c_rarg0 - long[] state0
4992 // c_rarg1 - long[] state1
4993 address generate_double_keccak() {
4994 StubId stub_id = StubId::stubgen_double_keccak_id;
4995 int entry_count = StubInfo::entry_count(stub_id);
4996 assert(entry_count == 1, "sanity check");
4997 address start = load_archive_data(stub_id);
4998 if (start != nullptr) {
4999 return start;
5000 }
5001 // Implements the double_keccak() method of the
5002 // sun.secyrity.provider.SHA3Parallel class
5003 __ align(CodeEntryAlignment);
5004 StubCodeMark mark(this, stub_id);
5005 start = __ pc();
5006 __ enter();
5007
5008 Register state0 = c_rarg0;
5009 Register state1 = c_rarg1;
5010
5011 Label rounds24_loop;
5012
5013 // save callee-saved registers
5014 __ stpd(v8, v9, __ pre(sp, -64));
5015 __ stpd(v10, v11, Address(sp, 16));
5016 __ stpd(v12, v13, Address(sp, 32));
5017 __ stpd(v14, v15, Address(sp, 48));
5018
5019 // load states
5020 __ add(rscratch1, state0, 32);
5021 __ ld4(v0, v1, v2, v3, __ D, 0, state0);
5022 __ ld4(v4, v5, v6, v7, __ D, 0, __ post(rscratch1, 32));
5023 __ ld4(v8, v9, v10, v11, __ D, 0, __ post(rscratch1, 32));
5024 __ ld4(v12, v13, v14, v15, __ D, 0, __ post(rscratch1, 32));
5025 __ ld4(v16, v17, v18, v19, __ D, 0, __ post(rscratch1, 32));
5026 __ ld4(v20, v21, v22, v23, __ D, 0, __ post(rscratch1, 32));
5027 __ ld1(v24, __ D, 0, rscratch1);
5028 __ add(rscratch1, state1, 32);
5029 __ ld4(v0, v1, v2, v3, __ D, 1, state1);
5030 __ ld4(v4, v5, v6, v7, __ D, 1, __ post(rscratch1, 32));
5031 __ ld4(v8, v9, v10, v11, __ D, 1, __ post(rscratch1, 32));
5032 __ ld4(v12, v13, v14, v15, __ D, 1, __ post(rscratch1, 32));
5033 __ ld4(v16, v17, v18, v19, __ D, 1, __ post(rscratch1, 32));
5034 __ ld4(v20, v21, v22, v23, __ D, 1, __ post(rscratch1, 32));
5035 __ ld1(v24, __ D, 1, rscratch1);
5036
5037 // 24 keccak rounds
5038 __ movw(rscratch2, 24);
5039
5040 // load round_constants base
5041 __ lea(rscratch1, ExternalAddress((address) _double_keccak_round_consts));
5042
5043 __ BIND(rounds24_loop);
5044 __ subw(rscratch2, rscratch2, 1);
5045 keccak_round(rscratch1);
5046 __ cbnzw(rscratch2, rounds24_loop);
5047
5048 __ st4(v0, v1, v2, v3, __ D, 0, __ post(state0, 32));
5049 __ st4(v4, v5, v6, v7, __ D, 0, __ post(state0, 32));
5050 __ st4(v8, v9, v10, v11, __ D, 0, __ post(state0, 32));
5051 __ st4(v12, v13, v14, v15, __ D, 0, __ post(state0, 32));
5052 __ st4(v16, v17, v18, v19, __ D, 0, __ post(state0, 32));
5053 __ st4(v20, v21, v22, v23, __ D, 0, __ post(state0, 32));
5054 __ st1(v24, __ D, 0, state0);
5055 __ st4(v0, v1, v2, v3, __ D, 1, __ post(state1, 32));
5056 __ st4(v4, v5, v6, v7, __ D, 1, __ post(state1, 32));
5057 __ st4(v8, v9, v10, v11, __ D, 1, __ post(state1, 32));
5058 __ st4(v12, v13, v14, v15, __ D, 1, __ post(state1, 32));
5059 __ st4(v16, v17, v18, v19, __ D, 1, __ post(state1, 32));
5060 __ st4(v20, v21, v22, v23, __ D, 1, __ post(state1, 32));
5061 __ st1(v24, __ D, 1, state1);
5062
5063 // restore callee-saved vector registers
5064 __ ldpd(v14, v15, Address(sp, 48));
5065 __ ldpd(v12, v13, Address(sp, 32));
5066 __ ldpd(v10, v11, Address(sp, 16));
5067 __ ldpd(v8, v9, __ post(sp, 64));
5068
5069 __ leave(); // required for proper stackwalking of RuntimeStub frame
5070 __ mov(r0, zr); // return 0
5071 __ ret(lr);
5072
5073 // record the stub entry and end
5074 store_archive_data(stub_id, start, __ pc());
5075
5076 return start;
5077 }
5078
5079 // ChaCha20 block function. This version parallelizes the 32-bit
5080 // state elements on each of 16 vectors, producing 4 blocks of
5081 // keystream at a time.
5082 //
5083 // state (int[16]) = c_rarg0
5084 // keystream (byte[256]) = c_rarg1
5085 // return - number of bytes of produced keystream (always 256)
5086 //
5087 // This implementation takes each 32-bit integer from the state
5088 // array and broadcasts it across all 4 32-bit lanes of a vector register
5089 // (e.g. state[0] is replicated on all 4 lanes of v4, state[1] to all 4 lanes
5090 // of v5, etc.). Once all 16 elements have been broadcast onto 16 vectors,
5091 // the quarter round schedule is implemented as outlined in RFC 7539 section
5092 // 2.3. However, instead of sequentially processing the 3 quarter round
5093 // operations represented by one QUARTERROUND function, we instead stack all
5094 // the adds, xors and left-rotations from the first 4 quarter rounds together
5095 // and then do the same for the second set of 4 quarter rounds. This removes
5096 // some latency that would otherwise be incurred by waiting for an add to
5097 // complete before performing an xor (which depends on the result of the
5098 // add), etc. An adjustment happens between the first and second groups of 4
5099 // quarter rounds, but this is done only in the inputs to the macro functions
5100 // that generate the assembly instructions - these adjustments themselves are
5101 // not part of the resulting assembly.
5102 // The 4 registers v0-v3 are used during the quarter round operations as
5103 // scratch registers. Once the 20 rounds are complete, these 4 scratch
5104 // registers become the vectors involved in adding the start state back onto
5105 // the post-QR working state. After the adds are complete, each of the 16
5106 // vectors write their first lane back to the keystream buffer, followed
5107 // by the second lane from all vectors and so on.
5108 address generate_chacha20Block_blockpar() {
5109 StubId stub_id = StubId::stubgen_chacha20Block_id;
5110 int entry_count = StubInfo::entry_count(stub_id);
5111 assert(entry_count == 1, "sanity check");
5112 address start = load_archive_data(stub_id);
5113 if (start != nullptr) {
5114 return start;
5115 }
5116 Label L_twoRounds, L_cc20_const;
5117 __ align(CodeEntryAlignment);
5118 StubCodeMark mark(this, stub_id);
5119 start = __ pc();
5120 __ enter();
5121
5122 int i, j;
5123 const Register state = c_rarg0;
5124 const Register keystream = c_rarg1;
5125 const Register loopCtr = r10;
5126 const Register tmpAddr = r11;
5127 const FloatRegister ctrAddOverlay = v28;
5128 const FloatRegister lrot8Tbl = v29;
5129
5130 // Organize SIMD registers in an array that facilitates
5131 // putting repetitive opcodes into loop structures. It is
5132 // important that each grouping of 4 registers is monotonically
5133 // increasing to support the requirements of multi-register
5134 // instructions (e.g. ld4r, st4, etc.)
5135 const FloatRegister workSt[16] = {
5136 v4, v5, v6, v7, v16, v17, v18, v19,
5137 v20, v21, v22, v23, v24, v25, v26, v27
5138 };
5139
5140 // Pull in constant data. The first 16 bytes are the add overlay
5141 // which is applied to the vector holding the counter (state[12]).
5142 // The second 16 bytes is the index register for the 8-bit left
5143 // rotation tbl instruction.
5144 __ adr(tmpAddr, L_cc20_const);
5145 __ ldpq(ctrAddOverlay, lrot8Tbl, Address(tmpAddr));
5146
5147 // Load from memory and interlace across 16 SIMD registers,
5148 // With each word from memory being broadcast to all lanes of
5149 // each successive SIMD register.
5150 // Addr(0) -> All lanes in workSt[i]
5151 // Addr(4) -> All lanes workSt[i + 1], etc.
5152 __ mov(tmpAddr, state);
5153 for (i = 0; i < 16; i += 4) {
5154 __ ld4r(workSt[i], workSt[i + 1], workSt[i + 2], workSt[i + 3], __ T4S,
5155 __ post(tmpAddr, 16));
5156 }
5157 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
5158
5159 // Before entering the loop, create 5 4-register arrays. These
5160 // will hold the 4 registers that represent the a/b/c/d fields
5161 // in the quarter round operation. For instance the "b" field
5162 // for the first 4 quarter round operations is the set of v16/v17/v18/v19,
5163 // but in the second 4 quarter rounds it gets adjusted to v17/v18/v19/v16
5164 // since it is part of a diagonal organization. The aSet and scratch
5165 // register sets are defined at declaration time because they do not change
5166 // organization at any point during the 20-round processing.
5167 FloatRegister aSet[4] = { v4, v5, v6, v7 };
5168 FloatRegister bSet[4];
5169 FloatRegister cSet[4];
5170 FloatRegister dSet[4];
5171 FloatRegister scratch[4] = { v0, v1, v2, v3 };
5172
5173 // Set up the 10 iteration loop and perform all 8 quarter round ops
5174 __ mov(loopCtr, 10);
5175 __ BIND(L_twoRounds);
5176
5177 // Set to columnar organization and do the following 4 quarter-rounds:
5178 // QUARTERROUND(0, 4, 8, 12)
5179 // QUARTERROUND(1, 5, 9, 13)
5180 // QUARTERROUND(2, 6, 10, 14)
5181 // QUARTERROUND(3, 7, 11, 15)
5182 __ cc20_set_qr_registers(bSet, workSt, 4, 5, 6, 7);
5183 __ cc20_set_qr_registers(cSet, workSt, 8, 9, 10, 11);
5184 __ cc20_set_qr_registers(dSet, workSt, 12, 13, 14, 15);
5185
5186 __ cc20_qr_add4(aSet, bSet); // a += b
5187 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5188 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
5189
5190 __ cc20_qr_add4(cSet, dSet); // c += d
5191 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5192 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
5193
5194 __ cc20_qr_add4(aSet, bSet); // a += b
5195 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5196 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
5197
5198 __ cc20_qr_add4(cSet, dSet); // c += d
5199 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5200 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12
5201
5202 // Set to diagonal organization and do the next 4 quarter-rounds:
5203 // QUARTERROUND(0, 5, 10, 15)
5204 // QUARTERROUND(1, 6, 11, 12)
5205 // QUARTERROUND(2, 7, 8, 13)
5206 // QUARTERROUND(3, 4, 9, 14)
5207 __ cc20_set_qr_registers(bSet, workSt, 5, 6, 7, 4);
5208 __ cc20_set_qr_registers(cSet, workSt, 10, 11, 8, 9);
5209 __ cc20_set_qr_registers(dSet, workSt, 15, 12, 13, 14);
5210
5211 __ cc20_qr_add4(aSet, bSet); // a += b
5212 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5213 __ cc20_qr_lrot4(dSet, dSet, 16, lrot8Tbl); // d <<<= 16
5214
5215 __ cc20_qr_add4(cSet, dSet); // c += d
5216 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5217 __ cc20_qr_lrot4(scratch, bSet, 12, lrot8Tbl); // b <<<= 12
5218
5219 __ cc20_qr_add4(aSet, bSet); // a += b
5220 __ cc20_qr_xor4(dSet, aSet, dSet); // d ^= a
5221 __ cc20_qr_lrot4(dSet, dSet, 8, lrot8Tbl); // d <<<= 8
5222
5223 __ cc20_qr_add4(cSet, dSet); // c += d
5224 __ cc20_qr_xor4(bSet, cSet, scratch); // b ^= c (scratch)
5225 __ cc20_qr_lrot4(scratch, bSet, 7, lrot8Tbl); // b <<<= 12
5226
5227 // Decrement and iterate
5228 __ sub(loopCtr, loopCtr, 1);
5229 __ cbnz(loopCtr, L_twoRounds);
5230
5231 __ mov(tmpAddr, state);
5232
5233 // Add the starting state back to the post-loop keystream
5234 // state. We read/interlace the state array from memory into
5235 // 4 registers similar to what we did in the beginning. Then
5236 // add the counter overlay onto workSt[12] at the end.
5237 for (i = 0; i < 16; i += 4) {
5238 __ ld4r(v0, v1, v2, v3, __ T4S, __ post(tmpAddr, 16));
5239 __ addv(workSt[i], __ T4S, workSt[i], v0);
5240 __ addv(workSt[i + 1], __ T4S, workSt[i + 1], v1);
5241 __ addv(workSt[i + 2], __ T4S, workSt[i + 2], v2);
5242 __ addv(workSt[i + 3], __ T4S, workSt[i + 3], v3);
5243 }
5244 __ addv(workSt[12], __ T4S, workSt[12], ctrAddOverlay); // Add ctr overlay
5245
5246 // Write working state into the keystream buffer. This is accomplished
5247 // by taking the lane "i" from each of the four vectors and writing
5248 // it to consecutive 4-byte offsets, then post-incrementing by 16 and
5249 // repeating with the next 4 vectors until all 16 vectors have been used.
5250 // Then move to the next lane and repeat the process until all lanes have
5251 // been written.
5252 for (i = 0; i < 4; i++) {
5253 for (j = 0; j < 16; j += 4) {
5254 __ st4(workSt[j], workSt[j + 1], workSt[j + 2], workSt[j + 3], __ S, i,
5255 __ post(keystream, 16));
5256 }
5257 }
5258
5259 __ mov(r0, 256); // Return length of output keystream
5260 __ leave();
5261 __ ret(lr);
5262
5263 // bind label and generate local constant data used by this stub
5264 // The constant data is broken into two 128-bit segments to be loaded
5265 // onto FloatRegisters. The first 128 bits are a counter add overlay
5266 // that adds +0/+1/+2/+3 to the vector holding replicated state[12].
5267 // The second 128-bits is a table constant used for 8-bit left rotations.
5268 __ BIND(L_cc20_const);
5269 __ emit_int64(0x0000000100000000UL);
5270 __ emit_int64(0x0000000300000002UL);
5271 __ emit_int64(0x0605040702010003UL);
5272 __ emit_int64(0x0E0D0C0F0A09080BUL);
5273
5274 // record the stub entry and end
5275 store_archive_data(stub_id, start, __ pc());
5276
5277 return start;
5278 }
5279
5280 // Helpers to schedule parallel operation bundles across vector
5281 // register sequences of size 2, 4 or 8.
5282
5283 // Implement various primitive computations across vector sequences
5284
5285 template<int N>
5286 void vs_addv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5287 const VSeq<N>& v1, const VSeq<N>& v2) {
5288 // output must not be constant
5289 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5290 // output cannot overwrite pending inputs
5291 assert(!vs_write_before_read(v, v1), "output overwrites input");
5292 assert(!vs_write_before_read(v, v2), "output overwrites input");
5293 for (int i = 0; i < N; i++) {
5294 __ addv(v[i], T, v1[i], v2[i]);
5295 }
5296 }
5297
5298 template<int N>
5299 void vs_subv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5300 const VSeq<N>& v1, const VSeq<N>& v2) {
5301 // output must not be constant
5302 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5303 // output cannot overwrite pending inputs
5304 assert(!vs_write_before_read(v, v1), "output overwrites input");
5305 assert(!vs_write_before_read(v, v2), "output overwrites input");
5306 for (int i = 0; i < N; i++) {
5307 __ subv(v[i], T, v1[i], v2[i]);
5308 }
5309 }
5310
5311 template<int N>
5312 void vs_mulv(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5313 const VSeq<N>& v1, const VSeq<N>& v2) {
5314 // output must not be constant
5315 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5316 // output cannot overwrite pending inputs
5317 assert(!vs_write_before_read(v, v1), "output overwrites input");
5318 assert(!vs_write_before_read(v, v2), "output overwrites input");
5319 for (int i = 0; i < N; i++) {
5320 __ mulv(v[i], T, v1[i], v2[i]);
5321 }
5322 }
5323
5324 template<int N>
5325 void vs_negr(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1) {
5326 // output must not be constant
5327 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5328 // output cannot overwrite pending inputs
5329 assert(!vs_write_before_read(v, v1), "output overwrites input");
5330 for (int i = 0; i < N; i++) {
5331 __ negr(v[i], T, v1[i]);
5332 }
5333 }
5334
5335 template<int N>
5336 void vs_sshr(const VSeq<N>& v, Assembler::SIMD_Arrangement T,
5337 const VSeq<N>& v1, int shift) {
5338 // output must not be constant
5339 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5340 // output cannot overwrite pending inputs
5341 assert(!vs_write_before_read(v, v1), "output overwrites input");
5342 for (int i = 0; i < N; i++) {
5343 __ sshr(v[i], T, v1[i], shift);
5344 }
5345 }
5346
5347 template<int N>
5348 void vs_andr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5349 // output must not be constant
5350 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5351 // output cannot overwrite pending inputs
5352 assert(!vs_write_before_read(v, v1), "output overwrites input");
5353 assert(!vs_write_before_read(v, v2), "output overwrites input");
5354 for (int i = 0; i < N; i++) {
5355 __ andr(v[i], __ T16B, v1[i], v2[i]);
5356 }
5357 }
5358
5359 template<int N>
5360 void vs_orr(const VSeq<N>& v, const VSeq<N>& v1, const VSeq<N>& v2) {
5361 // output must not be constant
5362 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5363 // output cannot overwrite pending inputs
5364 assert(!vs_write_before_read(v, v1), "output overwrites input");
5365 assert(!vs_write_before_read(v, v2), "output overwrites input");
5366 for (int i = 0; i < N; i++) {
5367 __ orr(v[i], __ T16B, v1[i], v2[i]);
5368 }
5369 }
5370
5371 template<int N>
5372 void vs_notr(const VSeq<N>& v, const VSeq<N>& v1) {
5373 // output must not be constant
5374 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5375 // output cannot overwrite pending inputs
5376 assert(!vs_write_before_read(v, v1), "output overwrites input");
5377 for (int i = 0; i < N; i++) {
5378 __ notr(v[i], __ T16B, v1[i]);
5379 }
5380 }
5381
5382 template<int N>
5383 void vs_sqdmulh(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, const VSeq<N>& v2) {
5384 // output must not be constant
5385 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5386 // output cannot overwrite pending inputs
5387 assert(!vs_write_before_read(v, v1), "output overwrites input");
5388 assert(!vs_write_before_read(v, v2), "output overwrites input");
5389 for (int i = 0; i < N; i++) {
5390 __ sqdmulh(v[i], T, v1[i], v2[i]);
5391 }
5392 }
5393
5394 template<int N>
5395 void vs_mlsv(const VSeq<N>& v, Assembler::SIMD_Arrangement T, const VSeq<N>& v1, VSeq<N>& v2) {
5396 // output must not be constant
5397 assert(N == 1 || !v.is_constant(), "cannot output multiple values to a constant vector");
5398 // output cannot overwrite pending inputs
5399 assert(!vs_write_before_read(v, v1), "output overwrites input");
5400 assert(!vs_write_before_read(v, v2), "output overwrites input");
5401 for (int i = 0; i < N; i++) {
5402 __ mlsv(v[i], T, v1[i], v2[i]);
5403 }
5404 }
5405
5406 // load N/2 successive pairs of quadword values from memory in order
5407 // into N successive vector registers of the sequence via the
5408 // address supplied in base.
5409 template<int N>
5410 void vs_ldpq(const VSeq<N>& v, Register base) {
5411 for (int i = 0; i < N; i += 2) {
5412 __ ldpq(v[i], v[i+1], Address(base, 32 * i));
5413 }
5414 }
5415
5416 // load N/2 successive pairs of quadword values from memory in order
5417 // into N vector registers of the sequence via the address supplied
5418 // in base using post-increment addressing
5419 template<int N>
5420 void vs_ldpq_post(const VSeq<N>& v, Register base) {
5421 static_assert((N & (N - 1)) == 0, "sequence length must be even");
5422 for (int i = 0; i < N; i += 2) {
5423 __ ldpq(v[i], v[i+1], __ post(base, 32));
5424 }
5425 }
5426
5427 // store N successive vector registers of the sequence into N/2
5428 // successive pairs of quadword memory locations via the address
5429 // supplied in base using post-increment addressing
5430 template<int N>
5431 void vs_stpq_post(const VSeq<N>& v, Register base) {
5432 static_assert((N & (N - 1)) == 0, "sequence length must be even");
5433 for (int i = 0; i < N; i += 2) {
5434 __ stpq(v[i], v[i+1], __ post(base, 32));
5435 }
5436 }
5437
5438 // load N/2 pairs of quadword values from memory de-interleaved into
5439 // N vector registers 2 at a time via the address supplied in base
5440 // using post-increment addressing.
5441 template<int N>
5442 void vs_ld2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5443 static_assert((N & (N - 1)) == 0, "sequence length must be even");
5444 for (int i = 0; i < N; i += 2) {
5445 __ ld2(v[i], v[i+1], T, __ post(base, 32));
5446 }
5447 }
5448
5449 // store N vector registers interleaved into N/2 pairs of quadword
5450 // memory locations via the address supplied in base using
5451 // post-increment addressing.
5452 template<int N>
5453 void vs_st2_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5454 static_assert((N & (N - 1)) == 0, "sequence length must be even");
5455 for (int i = 0; i < N; i += 2) {
5456 __ st2(v[i], v[i+1], T, __ post(base, 32));
5457 }
5458 }
5459
5460 // load N quadword values from memory de-interleaved into N vector
5461 // registers 3 elements at a time via the address supplied in base.
5462 template<int N>
5463 void vs_ld3(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5464 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
5465 for (int i = 0; i < N; i += 3) {
5466 __ ld3(v[i], v[i+1], v[i+2], T, base);
5467 }
5468 }
5469
5470 // load N quadword values from memory de-interleaved into N vector
5471 // registers 3 elements at a time via the address supplied in base
5472 // using post-increment addressing.
5473 template<int N>
5474 void vs_ld3_post(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base) {
5475 static_assert(N == ((N / 3) * 3), "sequence length must be multiple of 3");
5476 for (int i = 0; i < N; i += 3) {
5477 __ ld3(v[i], v[i+1], v[i+2], T, __ post(base, 48));
5478 }
5479 }
5480
// load N/2 pairs of quadword values from memory into N vector
// registers via the address supplied in base with each pair indexed
// using the start offset plus the corresponding entry in the
// offsets array
5485 template<int N>
5486 void vs_ldpq_indexed(const VSeq<N>& v, Register base, int start, int (&offsets)[N/2]) {
5487 for (int i = 0; i < N/2; i++) {
5488 __ ldpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5489 }
5490 }
5491
// store N vector registers into N/2 pairs of quadword memory
// locations via the address supplied in base with each pair indexed
// using the start offset plus the corresponding entry in the
// offsets array
5496 template<int N>
5497 void vs_stpq_indexed(const VSeq<N>& v, Register base, int start, int offsets[N/2]) {
5498 for (int i = 0; i < N/2; i++) {
5499 __ stpq(v[2*i], v[2*i+1], Address(base, start + offsets[i]));
5500 }
5501 }
5502
// load N single quadword values from memory into N vector registers
// via the address supplied in base with each value indexed using
// the start offset plus the corresponding entry in the offsets
// array
5507 template<int N>
5508 void vs_ldr_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5509 int start, int (&offsets)[N]) {
5510 for (int i = 0; i < N; i++) {
5511 __ ldr(v[i], T, Address(base, start + offsets[i]));
5512 }
5513 }
5514
// store N vector registers into N single quadword memory locations
// via the address supplied in base with each value indexed using
// the start offset plus the corresponding entry in the offsets
// array
5519 template<int N>
5520 void vs_str_indexed(const VSeq<N>& v, Assembler::SIMD_RegVariant T, Register base,
5521 int start, int (&offsets)[N]) {
5522 for (int i = 0; i < N; i++) {
5523 __ str(v[i], T, Address(base, start + offsets[i]));
5524 }
5525 }
5526
// load N/2 pairs of quadword values from memory de-interleaved into
// N vector registers 2 at a time via the address supplied in base
// with each pair indexed using the start offset plus the
// corresponding entry in the offsets array
5531 template<int N>
5532 void vs_ld2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5533 Register tmp, int start, int (&offsets)[N/2]) {
5534 for (int i = 0; i < N/2; i++) {
5535 __ add(tmp, base, start + offsets[i]);
5536 __ ld2(v[2*i], v[2*i+1], T, tmp);
5537 }
5538 }
5539
// store N vector registers 2 at a time interleaved into N/2 pairs
// of quadword memory locations via the address supplied in base
// with each pair indexed using the start offset plus the
// corresponding entry in the offsets array
5544 template<int N>
5545 void vs_st2_indexed(const VSeq<N>& v, Assembler::SIMD_Arrangement T, Register base,
5546 Register tmp, int start, int (&offsets)[N/2]) {
5547 for (int i = 0; i < N/2; i++) {
5548 __ add(tmp, base, start + offsets[i]);
5549 __ st2(v[2*i], v[2*i+1], T, tmp);
5550 }
5551 }
5552
5553 // Helper routines for various flavours of Montgomery multiply
5554
// Perform 16 32-bit (4x4S) or 32 16-bit (4 x 8H) Montgomery
// multiplications in parallel
//

// See the montMul() method of the sun.security.provider.ML_DSA
// class.
//
// Computes 4x4S results or 4x8H results
// a = b * c * 2^-MONT_R_BITS mod MONT_Q
// Inputs: vb, vc - 4x4S or 4x8H vector register sequences
// vq - 2x4S or 2x8H constants <MONT_Q_INV_MOD_R, MONT_Q>
// Temps: vtmp - 4x4S or 4x8H vector sequence trashed after call
// Outputs: va - 4x4S or 4x8H vector register sequences
// vb, vc, vtmp and vq must all be disjoint
// va must be disjoint from all other inputs/temps or must equal vc
// va must have a non-zero delta i.e. it must not be a constant vseq.
// n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5572 void vs_montmul4(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5573 Assembler::SIMD_Arrangement T,
5574 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5575 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5576 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5577 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5578 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5579
5580 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5581 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5582
5583 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5584
5585 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5586 assert(vs_disjoint(va, vb), "va and vb overlap");
5587 assert(vs_disjoint(va, vq), "va and vq overlap");
5588 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5589 assert(!va.is_constant(), "output vector must identify 4 different registers");
5590
5591 // schedule 4 streams of instructions across the vector sequences
5592 for (int i = 0; i < 4; i++) {
5593 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5594 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5595 }
5596
5597 for (int i = 0; i < 4; i++) {
5598 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5599 }
5600
5601 for (int i = 0; i < 4; i++) {
5602 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5603 }
5604
5605 for (int i = 0; i < 4; i++) {
5606 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5607 }
5608 }
5609
// Perform 8 32-bit (2x4S) or 16 16-bit (2 x 8H) Montgomery
// multiplications in parallel
//

// See the montMul() method of the sun.security.provider.ML_DSA
// class.
//
// Computes 2x4S results or 2x8H results
// a = b * c * 2^-MONT_R_BITS mod MONT_Q
// Inputs: vb, vc - 2x4S or 2x8H vector register sequences
// vq - 2x4S or 2x8H constants <MONT_Q_INV_MOD_R, MONT_Q>
// Temps: vtmp - 2x4S or 2x8H vector sequence trashed after call
// Outputs: va - 2x4S or 2x8H vector register sequences
// vb, vc, vtmp and vq must all be disjoint
// va must be disjoint from all other inputs/temps or must equal vc
// va must have a non-zero delta i.e. it must not be a constant vseq.
// n.b. MONT_R_BITS is 16 or 32, so the right shift by it is implicit.
5627 void vs_montmul2(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5628 Assembler::SIMD_Arrangement T,
5629 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5630 assert (T == __ T4S || T == __ T8H, "invalid arrangement for montmul");
5631 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5632 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5633 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5634
5635 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5636 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5637
5638 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5639
5640 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5641 assert(vs_disjoint(va, vb), "va and vb overlap");
5642 assert(vs_disjoint(va, vq), "va and vq overlap");
5643 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5644 assert(!va.is_constant(), "output vector must identify 2 different registers");
5645
5646 // schedule 2 streams of instructions across the vector sequences
5647 for (int i = 0; i < 2; i++) {
5648 __ sqdmulh(vtmp[i], T, vb[i], vc[i]); // aHigh = hi32(2 * b * c)
5649 __ mulv(va[i], T, vb[i], vc[i]); // aLow = lo32(b * c)
5650 }
5651
5652 for (int i = 0; i < 2; i++) {
5653 __ mulv(va[i], T, va[i], vq[0]); // m = aLow * qinv
5654 }
5655
5656 for (int i = 0; i < 2; i++) {
5657 __ sqdmulh(va[i], T, va[i], vq[1]); // n = hi32(2 * m * q)
5658 }
5659
5660 for (int i = 0; i < 2; i++) {
5661 __ shsubv(va[i], T, vtmp[i], va[i]); // a = (aHigh - n) / 2
5662 }
5663 }
5664
5665 // Perform 16 16-bit Montgomery multiplications in parallel.
5666 void kyber_montmul16(const VSeq<2>& va, const VSeq<2>& vb, const VSeq<2>& vc,
5667 const VSeq<2>& vtmp, const VSeq<2>& vq) {
5668 // Use the helper routine to schedule a 2x8H Montgomery multiply.
5669 // It will assert that the register use is valid
5670 vs_montmul2(va, vb, vc, __ T8H, vtmp, vq);
5671 }
5672
5673 // Perform 32 16-bit Montgomery multiplications in parallel.
5674 void kyber_montmul32(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
5675 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5676 // Use the helper routine to schedule a 4x8H Montgomery multiply.
5677 // It will assert that the register use is valid
5678 vs_montmul4(va, vb, vc, __ T8H, vtmp, vq);
5679 }
5680
5681 // Perform 64 16-bit Montgomery multiplications in parallel.
5682 void kyber_montmul64(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
5683 const VSeq<4>& vtmp, const VSeq<2>& vq) {
5684 // Schedule two successive 4x8H multiplies via the montmul helper
5685 // on the front and back halves of va, vb and vc. The helper will
5686 // assert that the register use has no overlap conflicts on each
5687 // individual call but we also need to ensure that the necessary
5688 // disjoint/equality constraints are met across both calls.
5689
5690 // vb, vc, vtmp and vq must be disjoint. va must either be
5691 // disjoint from all other registers or equal vc
5692
5693 assert(vs_disjoint(vb, vc), "vb and vc overlap");
5694 assert(vs_disjoint(vb, vq), "vb and vq overlap");
5695 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
5696
5697 assert(vs_disjoint(vc, vq), "vc and vq overlap");
5698 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
5699
5700 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
5701
5702 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
5703 assert(vs_disjoint(va, vb), "va and vb overlap");
5704 assert(vs_disjoint(va, vq), "va and vq overlap");
5705 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
5706
5707 // we multiply the front and back halves of each sequence 4 at a
5708 // time because
5709 //
5710 // 1) we are currently only able to get 4-way instruction
5711 // parallelism at best
5712 //
5713 // 2) we need registers for the constants in vq and temporary
5714 // scratch registers to hold intermediate results so vtmp can only
5715 // be a VSeq<4> which means we only have 4 scratch slots
5716
5717 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T8H, vtmp, vq);
5718 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T8H, vtmp, vq);
5719 }
5720
5721 void kyber_montmul32_sub_add(const VSeq<4>& va0, const VSeq<4>& va1,
5722 const VSeq<4>& vc,
5723 const VSeq<4>& vtmp,
5724 const VSeq<2>& vq) {
5725 // compute a = montmul(a1, c)
5726 kyber_montmul32(vc, va1, vc, vtmp, vq);
5727 // ouptut a1 = a0 - a
5728 vs_subv(va1, __ T8H, va0, vc);
5729 // and a0 = a0 + a
5730 vs_addv(va0, __ T8H, va0, vc);
5731 }
5732
5733 void kyber_sub_add_montmul32(const VSeq<4>& va0, const VSeq<4>& va1,
5734 const VSeq<4>& vb,
5735 const VSeq<4>& vtmp1,
5736 const VSeq<4>& vtmp2,
5737 const VSeq<2>& vq) {
5738 // compute c = a0 - a1
5739 vs_subv(vtmp1, __ T8H, va0, va1);
5740 // output a0 = a0 + a1
5741 vs_addv(va0, __ T8H, va0, va1);
5742 // output a1 = b montmul c
5743 kyber_montmul32(va1, vtmp1, vb, vtmp2, vq);
5744 }
5745
5746 void load64shorts(const VSeq<8>& v, Register shorts) {
5747 vs_ldpq_post(v, shorts);
5748 }
5749
5750 void load32shorts(const VSeq<4>& v, Register shorts) {
5751 vs_ldpq_post(v, shorts);
5752 }
5753
5754 void store64shorts(VSeq<8> v, Register tmpAddr) {
5755 vs_stpq_post(v, tmpAddr);
5756 }
5757
5758 // Kyber NTT function.
5759 // Implements
5760 // static int implKyberNtt(short[] poly, short[] ntt_zetas) {}
5761 //
5762 // coeffs (short[256]) = c_rarg0
5763 // ntt_zetas (short[256]) = c_rarg1
5764 address generate_kyberNtt() {
5765 StubId stub_id = StubId::stubgen_kyberNtt_id;
5766 int entry_count = StubInfo::entry_count(stub_id);
5767 assert(entry_count == 1, "sanity check");
5768 address start = load_archive_data(stub_id);
5769 if (start != nullptr) {
5770 return start;
5771 }
5772 __ align(CodeEntryAlignment);
5773 StubCodeMark mark(this, stub_id);
5774 start = __ pc();
5775 __ enter();
5776
5777 const Register coeffs = c_rarg0;
5778 const Register zetas = c_rarg1;
5779
5780 const Register kyberConsts = r10;
5781 const Register tmpAddr = r11;
5782
5783 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
5784 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
5785 VSeq<2> vq(30); // n.b. constants overlap vs3
5786
5787 __ lea(kyberConsts, ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
5788 // load the montmul constants
5789 vs_ldpq(vq, kyberConsts);
5790
5791 // Each level corresponds to an iteration of the outermost loop of the
5792 // Java method seilerNTT(int[] coeffs). There are some differences
5793 // from what is done in the seilerNTT() method, though:
5794 // 1. The computation is using 16-bit signed values, we do not convert them
5795 // to ints here.
5796 // 2. The zetas are delivered in a bigger array, 128 zetas are stored in
5797 // this array for each level, it is easier that way to fill up the vector
5798 // registers.
5799 // 3. In the seilerNTT() method we use R = 2^20 for the Montgomery
5800 // multiplications (this is because that way there should not be any
5801 // overflow during the inverse NTT computation), here we usr R = 2^16 so
5802 // that we can use the 16-bit arithmetic in the vector unit.
5803 //
5804 // On each level, we fill up the vector registers in such a way that the
5805 // array elements that need to be multiplied by the zetas go into one
5806 // set of vector registers while the corresponding ones that don't need to
5807 // be multiplied, go into another set.
5808 // We can do 32 Montgomery multiplications in parallel, using 12 vector
5809 // registers interleaving the steps of 4 identical computations,
5810 // each done on 8 16-bit values per register.
5811
5812 // At levels 0-3 the coefficients multiplied by or added/subtracted
5813 // to the zetas occur in discrete blocks whose size is some multiple
5814 // of 32.
5815
5816 // level 0
5817 __ add(tmpAddr, coeffs, 256);
5818 load64shorts(vs1, tmpAddr);
5819 load64shorts(vs2, zetas);
5820 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5821 __ add(tmpAddr, coeffs, 0);
5822 load64shorts(vs1, tmpAddr);
5823 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5824 vs_addv(vs1, __ T8H, vs1, vs2);
5825 __ add(tmpAddr, coeffs, 0);
5826 vs_stpq_post(vs1, tmpAddr);
5827 __ add(tmpAddr, coeffs, 256);
5828 vs_stpq_post(vs3, tmpAddr);
5829 // restore montmul constants
5830 vs_ldpq(vq, kyberConsts);
5831 load64shorts(vs1, tmpAddr);
5832 load64shorts(vs2, zetas);
5833 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5834 __ add(tmpAddr, coeffs, 128);
5835 load64shorts(vs1, tmpAddr);
5836 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5837 vs_addv(vs1, __ T8H, vs1, vs2);
5838 __ add(tmpAddr, coeffs, 128);
5839 store64shorts(vs1, tmpAddr);
5840 __ add(tmpAddr, coeffs, 384);
5841 store64shorts(vs3, tmpAddr);
5842
5843 // level 1
5844 // restore montmul constants
5845 vs_ldpq(vq, kyberConsts);
5846 __ add(tmpAddr, coeffs, 128);
5847 load64shorts(vs1, tmpAddr);
5848 load64shorts(vs2, zetas);
5849 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5850 __ add(tmpAddr, coeffs, 0);
5851 load64shorts(vs1, tmpAddr);
5852 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5853 vs_addv(vs1, __ T8H, vs1, vs2);
5854 __ add(tmpAddr, coeffs, 0);
5855 store64shorts(vs1, tmpAddr);
5856 store64shorts(vs3, tmpAddr);
5857 vs_ldpq(vq, kyberConsts);
5858 __ add(tmpAddr, coeffs, 384);
5859 load64shorts(vs1, tmpAddr);
5860 load64shorts(vs2, zetas);
5861 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5862 __ add(tmpAddr, coeffs, 256);
5863 load64shorts(vs1, tmpAddr);
5864 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5865 vs_addv(vs1, __ T8H, vs1, vs2);
5866 __ add(tmpAddr, coeffs, 256);
5867 store64shorts(vs1, tmpAddr);
5868 store64shorts(vs3, tmpAddr);
5869
5870 // level 2
5871 vs_ldpq(vq, kyberConsts);
5872 int offsets1[4] = { 0, 32, 128, 160 };
5873 vs_ldpq_indexed(vs1, coeffs, 64, offsets1);
5874 load64shorts(vs2, zetas);
5875 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5876 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
5877 // kyber_subv_addv64();
5878 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5879 vs_addv(vs1, __ T8H, vs1, vs2);
5880 __ add(tmpAddr, coeffs, 0);
5881 vs_stpq_post(vs_front(vs1), tmpAddr);
5882 vs_stpq_post(vs_front(vs3), tmpAddr);
5883 vs_stpq_post(vs_back(vs1), tmpAddr);
5884 vs_stpq_post(vs_back(vs3), tmpAddr);
5885 vs_ldpq(vq, kyberConsts);
5886 vs_ldpq_indexed(vs1, tmpAddr, 64, offsets1);
5887 load64shorts(vs2, zetas);
5888 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5889 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
5890 // kyber_subv_addv64();
5891 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5892 vs_addv(vs1, __ T8H, vs1, vs2);
5893 __ add(tmpAddr, coeffs, 256);
5894 vs_stpq_post(vs_front(vs1), tmpAddr);
5895 vs_stpq_post(vs_front(vs3), tmpAddr);
5896 vs_stpq_post(vs_back(vs1), tmpAddr);
5897 vs_stpq_post(vs_back(vs3), tmpAddr);
5898
5899 // level 3
5900 vs_ldpq(vq, kyberConsts);
5901 int offsets2[4] = { 0, 64, 128, 192 };
5902 vs_ldpq_indexed(vs1, coeffs, 32, offsets2);
5903 load64shorts(vs2, zetas);
5904 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5905 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
5906 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5907 vs_addv(vs1, __ T8H, vs1, vs2);
5908 vs_stpq_indexed(vs1, coeffs, 0, offsets2);
5909 vs_stpq_indexed(vs3, coeffs, 32, offsets2);
5910
5911 vs_ldpq(vq, kyberConsts);
5912 vs_ldpq_indexed(vs1, coeffs, 256 + 32, offsets2);
5913 load64shorts(vs2, zetas);
5914 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5915 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
5916 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5917 vs_addv(vs1, __ T8H, vs1, vs2);
5918 vs_stpq_indexed(vs1, coeffs, 256, offsets2);
5919 vs_stpq_indexed(vs3, coeffs, 256 + 32, offsets2);
5920
5921 // level 4
5922 // At level 4 coefficients occur in 8 discrete blocks of size 16
5923 // so they are loaded using employing an ldr at 8 distinct offsets.
5924
5925 vs_ldpq(vq, kyberConsts);
5926 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
5927 vs_ldr_indexed(vs1, __ Q, coeffs, 16, offsets3);
5928 load64shorts(vs2, zetas);
5929 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5930 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
5931 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5932 vs_addv(vs1, __ T8H, vs1, vs2);
5933 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
5934 vs_str_indexed(vs3, __ Q, coeffs, 16, offsets3);
5935
5936 vs_ldpq(vq, kyberConsts);
5937 vs_ldr_indexed(vs1, __ Q, coeffs, 256 + 16, offsets3);
5938 load64shorts(vs2, zetas);
5939 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
5940 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
5941 vs_subv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
5942 vs_addv(vs1, __ T8H, vs1, vs2);
5943 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
5944 vs_str_indexed(vs3, __ Q, coeffs, 256 + 16, offsets3);
5945
5946 // level 5
5947 // At level 5 related coefficients occur in discrete blocks of size 8 so
5948 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
5949
5950 vs_ldpq(vq, kyberConsts);
5951 int offsets4[4] = { 0, 32, 64, 96 };
5952 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5953 load32shorts(vs_front(vs2), zetas);
5954 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5955 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
5956 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5957 load32shorts(vs_front(vs2), zetas);
5958 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5959 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
5960 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5961 load32shorts(vs_front(vs2), zetas);
5962 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5963 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
5964
5965 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5966 load32shorts(vs_front(vs2), zetas);
5967 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5968 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
5969
5970 // level 6
5971 // At level 6 related coefficients occur in discrete blocks of size 4 so
5972 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
5973
5974 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5975 load32shorts(vs_front(vs2), zetas);
5976 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5977 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
5978 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5979 // __ ldpq(v18, v19, __ post(zetas, 32));
5980 load32shorts(vs_front(vs2), zetas);
5981 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5982 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
5983
5984 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5985 load32shorts(vs_front(vs2), zetas);
5986 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5987 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
5988
5989 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5990 load32shorts(vs_front(vs2), zetas);
5991 kyber_montmul32_sub_add(vs_even(vs1), vs_odd(vs1), vs_front(vs2), vtmp, vq);
5992 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
5993
5994 __ leave(); // required for proper stackwalking of RuntimeStub frame
5995 __ mov(r0, zr); // return 0
5996 __ ret(lr);
5997
5998 // record the stub entry and end
5999 store_archive_data(stub_id, start, __ pc());
6000
6001 return start;
6002 }
6003
6004 // Kyber Inverse NTT function
6005 // Implements
6006 // static int implKyberInverseNtt(short[] poly, short[] zetas) {}
6007 //
6008 // coeffs (short[256]) = c_rarg0
6009 // ntt_zetas (short[256]) = c_rarg1
6010 address generate_kyberInverseNtt() {
6011 StubId stub_id = StubId::stubgen_kyberInverseNtt_id;
6012 int entry_count = StubInfo::entry_count(stub_id);
6013 assert(entry_count == 1, "sanity check");
6014 address start = load_archive_data(stub_id);
6015 if (start != nullptr) {
6016 return start;
6017 }
6018 __ align(CodeEntryAlignment);
6019 StubCodeMark mark(this, stub_id);
6020 start = __ pc();
6021 __ enter();
6022
6023 const Register coeffs = c_rarg0;
6024 const Register zetas = c_rarg1;
6025
6026 const Register kyberConsts = r10;
6027 const Register tmpAddr = r11;
6028 const Register tmpAddr2 = c_rarg2;
6029
6030 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x8H inputs/outputs
6031 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6032 VSeq<2> vq(30); // n.b. constants overlap vs3
6033
6034 __ lea(kyberConsts,
6035 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6036
6037 // level 0
6038 // At level 0 related coefficients occur in discrete blocks of size 4 so
6039 // need to be loaded interleaved using an ld2 operation with arrangement 4S.
6040
6041 vs_ldpq(vq, kyberConsts);
6042 int offsets4[4] = { 0, 32, 64, 96 };
6043 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6044 load32shorts(vs_front(vs2), zetas);
6045 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6046 vs_front(vs2), vs_back(vs2), vtmp, vq);
6047 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 0, offsets4);
6048 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6049 load32shorts(vs_front(vs2), zetas);
6050 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6051 vs_front(vs2), vs_back(vs2), vtmp, vq);
6052 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 128, offsets4);
6053 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6054 load32shorts(vs_front(vs2), zetas);
6055 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6056 vs_front(vs2), vs_back(vs2), vtmp, vq);
6057 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 256, offsets4);
6058 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6059 load32shorts(vs_front(vs2), zetas);
6060 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6061 vs_front(vs2), vs_back(vs2), vtmp, vq);
6062 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, 384, offsets4);
6063
6064 // level 1
6065 // At level 1 related coefficients occur in discrete blocks of size 8 so
6066 // need to be loaded interleaved using an ld2 operation with arrangement 2D.
6067
6068 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6069 load32shorts(vs_front(vs2), zetas);
6070 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6071 vs_front(vs2), vs_back(vs2), vtmp, vq);
6072 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 0, offsets4);
6073 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6074 load32shorts(vs_front(vs2), zetas);
6075 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6076 vs_front(vs2), vs_back(vs2), vtmp, vq);
6077 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 128, offsets4);
6078
6079 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6080 load32shorts(vs_front(vs2), zetas);
6081 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6082 vs_front(vs2), vs_back(vs2), vtmp, vq);
6083 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 256, offsets4);
6084 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6085 load32shorts(vs_front(vs2), zetas);
6086 kyber_sub_add_montmul32(vs_even(vs1), vs_odd(vs1),
6087 vs_front(vs2), vs_back(vs2), vtmp, vq);
6088 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, 384, offsets4);
6089
6090 // level 2
6091 // At level 2 coefficients occur in 8 discrete blocks of size 16
6092 // so they are loaded using employing an ldr at 8 distinct offsets.
6093
6094 int offsets3[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
6095 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
6096 vs_ldr_indexed(vs2, __ Q, coeffs, 16, offsets3);
6097 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6098 vs_subv(vs1, __ T8H, vs1, vs2);
6099 vs_str_indexed(vs3, __ Q, coeffs, 0, offsets3);
6100 load64shorts(vs2, zetas);
6101 vs_ldpq(vq, kyberConsts);
6102 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6103 vs_str_indexed(vs2, __ Q, coeffs, 16, offsets3);
6104
6105 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
6106 vs_ldr_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
6107 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6108 vs_subv(vs1, __ T8H, vs1, vs2);
6109 vs_str_indexed(vs3, __ Q, coeffs, 256, offsets3);
6110 load64shorts(vs2, zetas);
6111 vs_ldpq(vq, kyberConsts);
6112 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6113 vs_str_indexed(vs2, __ Q, coeffs, 256 + 16, offsets3);
6114
6115 // Barrett reduction at indexes where overflow may happen
6116
6117 // load q and the multiplier for the Barrett reduction
6118 __ add(tmpAddr, kyberConsts, 16);
6119 vs_ldpq(vq, tmpAddr);
6120
6121 VSeq<8> vq1 = VSeq<8>(vq[0], 0); // 2 constant 8 sequences
6122 VSeq<8> vq2 = VSeq<8>(vq[1], 0); // for above two kyber constants
6123 VSeq<8> vq3 = VSeq<8>(v29, 0); // 3rd sequence for const montmul
6124 vs_ldr_indexed(vs1, __ Q, coeffs, 0, offsets3);
6125 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6126 vs_sshr(vs2, __ T8H, vs2, 11);
6127 vs_mlsv(vs1, __ T8H, vs2, vq1);
6128 vs_str_indexed(vs1, __ Q, coeffs, 0, offsets3);
6129 vs_ldr_indexed(vs1, __ Q, coeffs, 256, offsets3);
6130 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6131 vs_sshr(vs2, __ T8H, vs2, 11);
6132 vs_mlsv(vs1, __ T8H, vs2, vq1);
6133 vs_str_indexed(vs1, __ Q, coeffs, 256, offsets3);
6134
6135 // level 3
6136 // From level 3 upwards coefficients occur in discrete blocks whose size is
6137 // some multiple of 32 so can be loaded using ldpq and suitable indexes.
6138
6139 int offsets2[4] = { 0, 64, 128, 192 };
6140 vs_ldpq_indexed(vs1, coeffs, 0, offsets2);
6141 vs_ldpq_indexed(vs2, coeffs, 32, offsets2);
6142 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6143 vs_subv(vs1, __ T8H, vs1, vs2);
6144 vs_stpq_indexed(vs3, coeffs, 0, offsets2);
6145 load64shorts(vs2, zetas);
6146 vs_ldpq(vq, kyberConsts);
6147 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6148 vs_stpq_indexed(vs2, coeffs, 32, offsets2);
6149
6150 vs_ldpq_indexed(vs1, coeffs, 256, offsets2);
6151 vs_ldpq_indexed(vs2, coeffs, 256 + 32, offsets2);
6152 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6153 vs_subv(vs1, __ T8H, vs1, vs2);
6154 vs_stpq_indexed(vs3, coeffs, 256, offsets2);
6155 load64shorts(vs2, zetas);
6156 vs_ldpq(vq, kyberConsts);
6157 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6158 vs_stpq_indexed(vs2, coeffs, 256 + 32, offsets2);
6159
6160 // level 4
6161
6162 int offsets1[4] = { 0, 32, 128, 160 };
6163 vs_ldpq_indexed(vs1, coeffs, 0, offsets1);
6164 vs_ldpq_indexed(vs2, coeffs, 64, offsets1);
6165 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6166 vs_subv(vs1, __ T8H, vs1, vs2);
6167 vs_stpq_indexed(vs3, coeffs, 0, offsets1);
6168 load64shorts(vs2, zetas);
6169 vs_ldpq(vq, kyberConsts);
6170 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6171 vs_stpq_indexed(vs2, coeffs, 64, offsets1);
6172
6173 vs_ldpq_indexed(vs1, coeffs, 256, offsets1);
6174 vs_ldpq_indexed(vs2, coeffs, 256 + 64, offsets1);
6175 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6176 vs_subv(vs1, __ T8H, vs1, vs2);
6177 vs_stpq_indexed(vs3, coeffs, 256, offsets1);
6178 load64shorts(vs2, zetas);
6179 vs_ldpq(vq, kyberConsts);
6180 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6181 vs_stpq_indexed(vs2, coeffs, 256 + 64, offsets1);
6182
6183 // level 5
6184
6185 __ add(tmpAddr, coeffs, 0);
6186 load64shorts(vs1, tmpAddr);
6187 __ add(tmpAddr, coeffs, 128);
6188 load64shorts(vs2, tmpAddr);
6189 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6190 vs_subv(vs1, __ T8H, vs1, vs2);
6191 __ add(tmpAddr, coeffs, 0);
6192 store64shorts(vs3, tmpAddr);
6193 load64shorts(vs2, zetas);
6194 vs_ldpq(vq, kyberConsts);
6195 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6196 __ add(tmpAddr, coeffs, 128);
6197 store64shorts(vs2, tmpAddr);
6198
6199 load64shorts(vs1, tmpAddr);
6200 __ add(tmpAddr, coeffs, 384);
6201 load64shorts(vs2, tmpAddr);
6202 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6203 vs_subv(vs1, __ T8H, vs1, vs2);
6204 __ add(tmpAddr, coeffs, 256);
6205 store64shorts(vs3, tmpAddr);
6206 load64shorts(vs2, zetas);
6207 vs_ldpq(vq, kyberConsts);
6208 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6209 __ add(tmpAddr, coeffs, 384);
6210 store64shorts(vs2, tmpAddr);
6211
6212 // Barrett reduction at indexes where overflow may happen
6213
6214 // load q and the multiplier for the Barrett reduction
6215 __ add(tmpAddr, kyberConsts, 16);
6216 vs_ldpq(vq, tmpAddr);
6217
6218 int offsets0[2] = { 0, 256 };
6219 vs_ldpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
6220 vs_sqdmulh(vs2, __ T8H, vs1, vq2);
6221 vs_sshr(vs2, __ T8H, vs2, 11);
6222 vs_mlsv(vs1, __ T8H, vs2, vq1);
6223 vs_stpq_indexed(vs_front(vs1), coeffs, 0, offsets0);
6224
6225 // level 6
6226
6227 __ add(tmpAddr, coeffs, 0);
6228 load64shorts(vs1, tmpAddr);
6229 __ add(tmpAddr, coeffs, 256);
6230 load64shorts(vs2, tmpAddr);
6231 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6232 vs_subv(vs1, __ T8H, vs1, vs2);
6233 __ add(tmpAddr, coeffs, 0);
6234 store64shorts(vs3, tmpAddr);
6235 load64shorts(vs2, zetas);
6236 vs_ldpq(vq, kyberConsts);
6237 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6238 __ add(tmpAddr, coeffs, 256);
6239 store64shorts(vs2, tmpAddr);
6240
6241 __ add(tmpAddr, coeffs, 128);
6242 load64shorts(vs1, tmpAddr);
6243 __ add(tmpAddr, coeffs, 384);
6244 load64shorts(vs2, tmpAddr);
6245 vs_addv(vs3, __ T8H, vs1, vs2); // n.b. trashes vq
6246 vs_subv(vs1, __ T8H, vs1, vs2);
6247 __ add(tmpAddr, coeffs, 128);
6248 store64shorts(vs3, tmpAddr);
6249 load64shorts(vs2, zetas);
6250 vs_ldpq(vq, kyberConsts);
6251 kyber_montmul64(vs2, vs1, vs2, vtmp, vq);
6252 __ add(tmpAddr, coeffs, 384);
6253 store64shorts(vs2, tmpAddr);
6254
6255 // multiply by 2^-n
6256
6257 // load toMont(2^-n mod q)
6258 __ add(tmpAddr, kyberConsts, 48);
6259 __ ldr(v29, __ Q, tmpAddr);
6260
6261 vs_ldpq(vq, kyberConsts);
6262 __ add(tmpAddr, coeffs, 0);
6263 load64shorts(vs1, tmpAddr);
6264 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6265 __ add(tmpAddr, coeffs, 0);
6266 store64shorts(vs2, tmpAddr);
6267
6268 // now tmpAddr contains coeffs + 128 because store64shorts adjusted it so
6269 load64shorts(vs1, tmpAddr);
6270 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6271 __ add(tmpAddr, coeffs, 128);
6272 store64shorts(vs2, tmpAddr);
6273
6274 // now tmpAddr contains coeffs + 256
6275 load64shorts(vs1, tmpAddr);
6276 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6277 __ add(tmpAddr, coeffs, 256);
6278 store64shorts(vs2, tmpAddr);
6279
6280 // now tmpAddr contains coeffs + 384
6281 load64shorts(vs1, tmpAddr);
6282 kyber_montmul64(vs2, vs1, vq3, vtmp, vq);
6283 __ add(tmpAddr, coeffs, 384);
6284 store64shorts(vs2, tmpAddr);
6285
6286 __ leave(); // required for proper stackwalking of RuntimeStub frame
6287 __ mov(r0, zr); // return 0
6288 __ ret(lr);
6289
6290 // record the stub entry and end
6291 store_archive_data(stub_id, start, __ pc());
6292
6293 return start;
6294 }
6295
6296 // Kyber multiply polynomials in the NTT domain.
6297 // Implements
6298 // static int implKyberNttMult(
6299 // short[] result, short[] ntta, short[] nttb, short[] zetas) {}
6300 //
6301 // The actual algorithm that is used here differs from the one in the Java
6302 // implementation, it uses Montgomery multiplications instead of Barrett
6303 // reduction, but the end result modulo MLKEM_Q is the same. This is the
6304 // Java equivalent of this intrinsic implementation:
6305 // static void implKyberNttMultJava(short[] result, short[] ntta, short[] nttb) {
6306 // for (int m = 0; m < ML_KEM_N / 2; m++) {
6307 // int a0 = ntta[2 * m];
6308 // int a1 = ntta[2 * m + 1];
6309 // int b0 = nttb[2 * m];
6310 // int b1 = nttb[2 * m + 1];
6311 // int r = montMul(a0, b0) +
6312 // montMul(montMul(a1, b1), MONT_ZETAS_FOR_NTT_MULT[m]);
6313 // result[2 * m] = (short) montMul(r, MONT_R_SQUARE_MOD_Q);
6314 // result[2 * m + 1] = (short) montMul(
6315 // (montMul(a0, b1) + montMul(a1, b0)), MONT_R_SQUARE_MOD_Q);
6316 // }
6317 // }
6318 //
6319 // result (short[256]) = c_rarg0
6320 // ntta (short[256]) = c_rarg1
6321 // nttb (short[256]) = c_rarg2
6322 // zetas (short[128]) = c_rarg3
6323 address generate_kyberNttMult() {
6324 StubId stub_id = StubId::stubgen_kyberNttMult_id;
6325 int entry_count = StubInfo::entry_count(stub_id);
6326 assert(entry_count == 1, "sanity check");
6327 address start = load_archive_data(stub_id);
6328 if (start != nullptr) {
6329 return start;
6330 }
6331 __ align(CodeEntryAlignment);
6332 StubCodeMark mark(this, stub_id);
6333 start = __ pc();
6334 __ enter();
6335
6336 const Register result = c_rarg0;
6337 const Register ntta = c_rarg1;
6338 const Register nttb = c_rarg2;
6339 const Register zetas = c_rarg3;
6340
6341 const Register kyberConsts = r10;
6342 const Register limit = r11;
6343
6344 VSeq<4> vs1(0), vs2(4); // 4 sets of 8x8H inputs/outputs/tmps
6345 VSeq<4> vs3(16), vs4(20);
6346 VSeq<2> vq(30); // pair of constants for montmul: q, qinv
6347 VSeq<2> vz(28); // pair of zetas
6348 VSeq<4> vc(27, 0); // constant sequence for montmul: montRSquareModQ
6349
6350 __ lea(kyberConsts,
6351 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6352
6353 Label kyberNttMult_loop;
6354
6355 __ add(limit, result, 512);
6356
6357 // load q and qinv
6358 vs_ldpq(vq, kyberConsts);
6359
6360 // load R^2 mod q (to convert back from Montgomery representation)
6361 __ add(kyberConsts, kyberConsts, 64);
6362 __ ldr(v27, __ Q, kyberConsts);
6363
6364 __ BIND(kyberNttMult_loop);
6365
6366 // load 16 zetas
6367 vs_ldpq_post(vz, zetas);
6368
6369 // load 2 sets of 32 coefficients from the two input arrays
6370 // interleaved as shorts. i.e. pairs of shorts adjacent in memory
6371 // are striped across pairs of vector registers
6372 vs_ld2_post(vs_front(vs1), __ T8H, ntta); // <a0, a1> x 8H
6373 vs_ld2_post(vs_back(vs1), __ T8H, nttb); // <b0, b1> x 8H
6374 vs_ld2_post(vs_front(vs4), __ T8H, ntta); // <a2, a3> x 8H
6375 vs_ld2_post(vs_back(vs4), __ T8H, nttb); // <b2, b3> x 8H
6376
6377 // compute 4 montmul cross-products for pairs (a0,a1) and (b0,b1)
6378 // i.e. montmul the first and second halves of vs1 in order and
6379 // then with one sequence reversed storing the two results in vs3
6380 //
6381 // vs3[0] <- montmul(a0, b0)
6382 // vs3[1] <- montmul(a1, b1)
6383 // vs3[2] <- montmul(a0, b1)
6384 // vs3[3] <- montmul(a1, b0)
6385 kyber_montmul16(vs_front(vs3), vs_front(vs1), vs_back(vs1), vs_front(vs2), vq);
6386 kyber_montmul16(vs_back(vs3),
6387 vs_front(vs1), vs_reverse(vs_back(vs1)), vs_back(vs2), vq);
6388
6389 // compute 4 montmul cross-products for pairs (a2,a3) and (b2,b3)
6390 // i.e. montmul the first and second halves of vs4 in order and
6391 // then with one sequence reversed storing the two results in vs1
6392 //
6393 // vs1[0] <- montmul(a2, b2)
6394 // vs1[1] <- montmul(a3, b3)
6395 // vs1[2] <- montmul(a2, b3)
6396 // vs1[3] <- montmul(a3, b2)
6397 kyber_montmul16(vs_front(vs1), vs_front(vs4), vs_back(vs4), vs_front(vs2), vq);
6398 kyber_montmul16(vs_back(vs1),
6399 vs_front(vs4), vs_reverse(vs_back(vs4)), vs_back(vs2), vq);
6400
6401 // montmul result 2 of each cross-product i.e. (a1*b1, a3*b3) by a zeta.
6402 // We can schedule two montmuls at a time if we use a suitable vector
6403 // sequence <vs3[1], vs1[1]>.
6404 int delta = vs1[1]->encoding() - vs3[1]->encoding();
6405 VSeq<2> vs5(vs3[1], delta);
6406
6407 // vs3[1] <- montmul(montmul(a1, b1), z0)
6408 // vs1[1] <- montmul(montmul(a3, b3), z1)
6409 kyber_montmul16(vs5, vz, vs5, vs_front(vs2), vq);
6410
6411 // add results in pairs storing in vs3
6412 // vs3[0] <- montmul(a0, b0) + montmul(montmul(a1, b1), z0);
6413 // vs3[1] <- montmul(a0, b1) + montmul(a1, b0);
6414 vs_addv(vs_front(vs3), __ T8H, vs_even(vs3), vs_odd(vs3));
6415
6416 // vs3[2] <- montmul(a2, b2) + montmul(montmul(a3, b3), z1);
6417 // vs3[3] <- montmul(a2, b3) + montmul(a3, b2);
6418 vs_addv(vs_back(vs3), __ T8H, vs_even(vs1), vs_odd(vs1));
6419
6420 // vs1 <- montmul(vs3, montRSquareModQ)
6421 kyber_montmul32(vs1, vs3, vc, vs2, vq);
6422
6423 // store back the two pairs of result vectors de-interleaved as 8H elements
6424 // i.e. storing each pairs of shorts striped across a register pair adjacent
6425 // in memory
6426 vs_st2_post(vs1, __ T8H, result);
6427
6428 __ cmp(result, limit);
6429 __ br(Assembler::NE, kyberNttMult_loop);
6430
6431 __ leave(); // required for proper stackwalking of RuntimeStub frame
6432 __ mov(r0, zr); // return 0
6433 __ ret(lr);
6434
6435 // record the stub entry and end
6436 store_archive_data(stub_id, start, __ pc());
6437
6438 return start;
6439 }
6440
6441 // Kyber add 2 polynomials.
6442 // Implements
6443 // static int implKyberAddPoly(short[] result, short[] a, short[] b) {}
6444 //
6445 // result (short[256]) = c_rarg0
6446 // a (short[256]) = c_rarg1
6447 // b (short[256]) = c_rarg2
6448 address generate_kyberAddPoly_2() {
6449 StubId stub_id = StubId::stubgen_kyberAddPoly_2_id;
6450 int entry_count = StubInfo::entry_count(stub_id);
6451 assert(entry_count == 1, "sanity check");
6452 address start = load_archive_data(stub_id);
6453 if (start != nullptr) {
6454 return start;
6455 }
6456 __ align(CodeEntryAlignment);
6457 StubCodeMark mark(this, stub_id);
6458 start = __ pc();
6459 __ enter();
6460
6461 const Register result = c_rarg0;
6462 const Register a = c_rarg1;
6463 const Register b = c_rarg2;
6464
6465 const Register kyberConsts = r11;
6466
6467 // We sum 256 sets of values in total i.e. 32 x 8H quadwords.
6468 // So, we can load, add and store the data in 3 groups of 11,
6469 // 11 and 10 at a time i.e. we need to map sets of 10 or 11
6470 // registers. A further constraint is that the mapping needs
6471 // to skip callee saves. So, we allocate the register
6472 // sequences using two 8 sequences, two 2 sequences and two
6473 // single registers.
6474 VSeq<8> vs1_1(0);
6475 VSeq<2> vs1_2(16);
6476 FloatRegister vs1_3 = v28;
6477 VSeq<8> vs2_1(18);
6478 VSeq<2> vs2_2(26);
6479 FloatRegister vs2_3 = v29;
6480
6481 // two constant vector sequences
6482 VSeq<8> vc_1(31, 0);
6483 VSeq<2> vc_2(31, 0);
6484
6485 FloatRegister vc_3 = v31;
6486 __ lea(kyberConsts,
6487 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6488
6489 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6490 for (int i = 0; i < 3; i++) {
6491 // load 80 or 88 values from a into vs1_1/2/3
6492 vs_ldpq_post(vs1_1, a);
6493 vs_ldpq_post(vs1_2, a);
6494 if (i < 2) {
6495 __ ldr(vs1_3, __ Q, __ post(a, 16));
6496 }
6497 // load 80 or 88 values from b into vs2_1/2/3
6498 vs_ldpq_post(vs2_1, b);
6499 vs_ldpq_post(vs2_2, b);
6500 if (i < 2) {
6501 __ ldr(vs2_3, __ Q, __ post(b, 16));
6502 }
6503 // sum 80 or 88 values across vs1 and vs2 into vs1
6504 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6505 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6506 if (i < 2) {
6507 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6508 }
6509 // add constant to all 80 or 88 results
6510 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6511 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6512 if (i < 2) {
6513 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6514 }
6515 // store 80 or 88 values
6516 vs_stpq_post(vs1_1, result);
6517 vs_stpq_post(vs1_2, result);
6518 if (i < 2) {
6519 __ str(vs1_3, __ Q, __ post(result, 16));
6520 }
6521 }
6522
6523 __ leave(); // required for proper stackwalking of RuntimeStub frame
6524 __ mov(r0, zr); // return 0
6525 __ ret(lr);
6526
6527 // record the stub entry and end
6528 store_archive_data(stub_id, start, __ pc());
6529
6530 return start;
6531 }
6532
6533 // Kyber add 3 polynomials.
6534 // Implements
6535 // static int implKyberAddPoly(short[] result, short[] a, short[] b, short[] c) {}
6536 //
6537 // result (short[256]) = c_rarg0
6538 // a (short[256]) = c_rarg1
6539 // b (short[256]) = c_rarg2
6540 // c (short[256]) = c_rarg3
6541 address generate_kyberAddPoly_3() {
6542 StubId stub_id = StubId::stubgen_kyberAddPoly_3_id;
6543 int entry_count = StubInfo::entry_count(stub_id);
6544 assert(entry_count == 1, "sanity check");
6545 address start = load_archive_data(stub_id);
6546 if (start != nullptr) {
6547 return start;
6548 }
6549 __ align(CodeEntryAlignment);
6550 StubCodeMark mark(this, stub_id);
6551 start = __ pc();
6552 __ enter();
6553
6554 const Register result = c_rarg0;
6555 const Register a = c_rarg1;
6556 const Register b = c_rarg2;
6557 const Register c = c_rarg3;
6558
6559 const Register kyberConsts = r11;
6560
6561 // As above we sum 256 sets of values in total i.e. 32 x 8H
6562 // quadwords. So, we can load, add and store the data in 3
6563 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6564 // of 10 or 11 registers. A further constraint is that the
6565 // mapping needs to skip callee saves. So, we allocate the
6566 // register sequences using two 8 sequences, two 2 sequences
6567 // and two single registers.
6568 VSeq<8> vs1_1(0);
6569 VSeq<2> vs1_2(16);
6570 FloatRegister vs1_3 = v28;
6571 VSeq<8> vs2_1(18);
6572 VSeq<2> vs2_2(26);
6573 FloatRegister vs2_3 = v29;
6574
6575 // two constant vector sequences
6576 VSeq<8> vc_1(31, 0);
6577 VSeq<2> vc_2(31, 0);
6578
6579 FloatRegister vc_3 = v31;
6580
6581 __ lea(kyberConsts,
6582 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6583
6584 __ ldr(vc_3, __ Q, Address(kyberConsts, 16)); // q
6585 for (int i = 0; i < 3; i++) {
6586 // load 80 or 88 values from a into vs1_1/2/3
6587 vs_ldpq_post(vs1_1, a);
6588 vs_ldpq_post(vs1_2, a);
6589 if (i < 2) {
6590 __ ldr(vs1_3, __ Q, __ post(a, 16));
6591 }
6592 // load 80 or 88 values from b into vs2_1/2/3
6593 vs_ldpq_post(vs2_1, b);
6594 vs_ldpq_post(vs2_2, b);
6595 if (i < 2) {
6596 __ ldr(vs2_3, __ Q, __ post(b, 16));
6597 }
6598 // sum 80 or 88 values across vs1 and vs2 into vs1
6599 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6600 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6601 if (i < 2) {
6602 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6603 }
6604 // load 80 or 88 values from c into vs2_1/2/3
6605 vs_ldpq_post(vs2_1, c);
6606 vs_ldpq_post(vs2_2, c);
6607 if (i < 2) {
6608 __ ldr(vs2_3, __ Q, __ post(c, 16));
6609 }
6610 // sum 80 or 88 values across vs1 and vs2 into vs1
6611 vs_addv(vs1_1, __ T8H, vs1_1, vs2_1);
6612 vs_addv(vs1_2, __ T8H, vs1_2, vs2_2);
6613 if (i < 2) {
6614 __ addv(vs1_3, __ T8H, vs1_3, vs2_3);
6615 }
6616 // add constant to all 80 or 88 results
6617 vs_addv(vs1_1, __ T8H, vs1_1, vc_1);
6618 vs_addv(vs1_2, __ T8H, vs1_2, vc_2);
6619 if (i < 2) {
6620 __ addv(vs1_3, __ T8H, vs1_3, vc_3);
6621 }
6622 // store 80 or 88 values
6623 vs_stpq_post(vs1_1, result);
6624 vs_stpq_post(vs1_2, result);
6625 if (i < 2) {
6626 __ str(vs1_3, __ Q, __ post(result, 16));
6627 }
6628 }
6629
6630 __ leave(); // required for proper stackwalking of RuntimeStub frame
6631 __ mov(r0, zr); // return 0
6632 __ ret(lr);
6633
6634 // record the stub entry and end
6635 store_archive_data(stub_id, start, __ pc());
6636
6637 return start;
6638 }
6639
6640 // Kyber parse XOF output to polynomial coefficient candidates
6641 // or decodePoly(12, ...).
6642 // Implements
6643 // static int implKyber12To16(
6644 // byte[] condensed, int index, short[] parsed, int parsedLength) {}
6645 //
6646 // we assume that parsed and condensed are allocated such that for
6647 // n = (parsedLength + 63) / 64
6648 // n blocks of 96 bytes of input can be processed, i.e.
6649 // index + n * 96 <= condensed.length and
6650 // n * 64 <= parsed.length
6651 //
6652 // condensed (byte[]) = c_rarg0
6653 // condensedIndex = c_rarg1
6654 // parsed (short[]) = c_rarg2
6655 // parsedLength = c_rarg3
6656 address generate_kyber12To16() {
6657 StubId stub_id = StubId::stubgen_kyber12To16_id;
6658 int entry_count = StubInfo::entry_count(stub_id);
6659 assert(entry_count == 1, "sanity check");
6660 address start = load_archive_data(stub_id);
6661 if (start != nullptr) {
6662 return start;
6663 }
6664 Label L_F00, L_loop;
6665
6666 __ align(CodeEntryAlignment);
6667 StubCodeMark mark(this, stub_id);
6668 start = __ pc();
6669 __ enter();
6670
6671 const Register condensed = c_rarg0;
6672 const Register condensedOffs = c_rarg1;
6673 const Register parsed = c_rarg2;
6674 const Register parsedLength = c_rarg3;
6675
6676 const Register tmpAddr = r11;
6677
6678 // Data is input 96 bytes at a time i.e. in groups of 6 x 16B
6679 // quadwords so we need a 6 vector sequence for the inputs.
6680 // Parsing produces 64 shorts, employing two 8 vector
6681 // sequences to store and combine the intermediate data.
6682 VSeq<6> vin(24);
6683 VSeq<8> va(0), vb(16);
6684
6685 __ adr(tmpAddr, L_F00);
6686 __ ldr(v31, __ Q, tmpAddr); // 8H times 0x0f00
6687 __ add(condensed, condensed, condensedOffs);
6688
6689 __ BIND(L_loop);
6690 // load 96 (6 x 16B) byte values
6691 vs_ld3_post(vin, __ T16B, condensed);
6692
6693 // The front half of sequence vin (vin[0], vin[1] and vin[2])
6694 // holds 48 (16x3) contiguous bytes from memory striped
6695 // horizontally across each of the 16 byte lanes. Equivalently,
6696 // that is 16 pairs of 12-bit integers. Likewise the back half
6697 // holds the next 48 bytes in the same arrangement.
6698
6699 // Each vector in the front half can also be viewed as a vertical
6700 // strip across the 16 pairs of 12 bit integers. Each byte in
6701 // vin[0] stores the low 8 bits of the first int in a pair. Each
6702 // byte in vin[1] stores the high 4 bits of the first int and the
6703 // low 4 bits of the second int. Each byte in vin[2] stores the
6704 // high 8 bits of the second int. Likewise the vectors in second
6705 // half.
6706
6707 // Converting the data to 16-bit shorts requires first of all
6708 // expanding each of the 6 x 16B vectors into 6 corresponding
6709 // pairs of 8H vectors. Mask, shift and add operations on the
6710 // resulting vector pairs can be used to combine 4 and 8 bit
6711 // parts of related 8H vector elements.
6712 //
6713 // The middle vectors (vin[2] and vin[5]) are actually expanded
6714 // twice, one copy manipulated to provide the lower 4 bits
6715 // belonging to the first short in a pair and another copy
6716 // manipulated to provide the higher 4 bits belonging to the
6717 // second short in a pair. This is why the the vector sequences va
6718 // and vb used to hold the expanded 8H elements are of length 8.
6719
6720 // Expand vin[0] into va[0:1], and vin[1] into va[2:3] and va[4:5]
6721 // n.b. target elements 2 and 3 duplicate elements 4 and 5
6722 __ ushll(va[0], __ T8H, vin[0], __ T8B, 0);
6723 __ ushll2(va[1], __ T8H, vin[0], __ T16B, 0);
6724 __ ushll(va[2], __ T8H, vin[1], __ T8B, 0);
6725 __ ushll2(va[3], __ T8H, vin[1], __ T16B, 0);
6726 __ ushll(va[4], __ T8H, vin[1], __ T8B, 0);
6727 __ ushll2(va[5], __ T8H, vin[1], __ T16B, 0);
6728
6729 // likewise expand vin[3] into vb[0:1], and vin[4] into vb[2:3]
6730 // and vb[4:5]
6731 __ ushll(vb[0], __ T8H, vin[3], __ T8B, 0);
6732 __ ushll2(vb[1], __ T8H, vin[3], __ T16B, 0);
6733 __ ushll(vb[2], __ T8H, vin[4], __ T8B, 0);
6734 __ ushll2(vb[3], __ T8H, vin[4], __ T16B, 0);
6735 __ ushll(vb[4], __ T8H, vin[4], __ T8B, 0);
6736 __ ushll2(vb[5], __ T8H, vin[4], __ T16B, 0);
6737
6738 // shift lo byte of copy 1 of the middle stripe into the high byte
6739 __ shl(va[2], __ T8H, va[2], 8);
6740 __ shl(va[3], __ T8H, va[3], 8);
6741 __ shl(vb[2], __ T8H, vb[2], 8);
6742 __ shl(vb[3], __ T8H, vb[3], 8);
6743
6744 // expand vin[2] into va[6:7] and vin[5] into vb[6:7] but this
6745 // time pre-shifted by 4 to ensure top bits of input 12-bit int
6746 // are in bit positions [4..11].
6747 __ ushll(va[6], __ T8H, vin[2], __ T8B, 4);
6748 __ ushll2(va[7], __ T8H, vin[2], __ T16B, 4);
6749 __ ushll(vb[6], __ T8H, vin[5], __ T8B, 4);
6750 __ ushll2(vb[7], __ T8H, vin[5], __ T16B, 4);
6751
6752 // mask hi 4 bits of the 1st 12-bit int in a pair from copy1 and
6753 // shift lo 4 bits of the 2nd 12-bit int in a pair to the bottom of
6754 // copy2
6755 __ andr(va[2], __ T16B, va[2], v31);
6756 __ andr(va[3], __ T16B, va[3], v31);
6757 __ ushr(va[4], __ T8H, va[4], 4);
6758 __ ushr(va[5], __ T8H, va[5], 4);
6759 __ andr(vb[2], __ T16B, vb[2], v31);
6760 __ andr(vb[3], __ T16B, vb[3], v31);
6761 __ ushr(vb[4], __ T8H, vb[4], 4);
6762 __ ushr(vb[5], __ T8H, vb[5], 4);
6763
6764 // sum hi 4 bits and lo 8 bits of the 1st 12-bit int in each pair and
6765 // hi 8 bits plus lo 4 bits of the 2nd 12-bit int in each pair
6766 // n.b. the ordering ensures: i) inputs are consumed before they
6767 // are overwritten ii) the order of 16-bit results across successive
6768 // pairs of vectors in va and then vb reflects the order of the
6769 // corresponding 12-bit inputs
6770 __ addv(va[0], __ T8H, va[0], va[2]);
6771 __ addv(va[2], __ T8H, va[1], va[3]);
6772 __ addv(va[1], __ T8H, va[4], va[6]);
6773 __ addv(va[3], __ T8H, va[5], va[7]);
6774 __ addv(vb[0], __ T8H, vb[0], vb[2]);
6775 __ addv(vb[2], __ T8H, vb[1], vb[3]);
6776 __ addv(vb[1], __ T8H, vb[4], vb[6]);
6777 __ addv(vb[3], __ T8H, vb[5], vb[7]);
6778
6779 // store 64 results interleaved as shorts
6780 vs_st2_post(vs_front(va), __ T8H, parsed);
6781 vs_st2_post(vs_front(vb), __ T8H, parsed);
6782
6783 __ sub(parsedLength, parsedLength, 64);
6784 __ cmp(parsedLength, (u1)0);
6785 __ br(Assembler::GT, L_loop);
6786
6787 __ leave(); // required for proper stackwalking of RuntimeStub frame
6788 __ mov(r0, zr); // return 0
6789 __ ret(lr);
6790
6791 // bind label and generate constant data used by this stub
6792 __ BIND(L_F00);
6793 __ emit_int64(0x0f000f000f000f00);
6794 __ emit_int64(0x0f000f000f000f00);
6795
6796 // record the stub entry and end
6797 store_archive_data(stub_id, start, __ pc());
6798
6799 return start;
6800 }
6801
6802 // Kyber Barrett reduce function.
6803 // Implements
6804 // static int implKyberBarrettReduce(short[] coeffs) {}
6805 //
6806 // coeffs (short[256]) = c_rarg0
6807 address generate_kyberBarrettReduce() {
6808 StubId stub_id = StubId::stubgen_kyberBarrettReduce_id;
6809 int entry_count = StubInfo::entry_count(stub_id);
6810 assert(entry_count == 1, "sanity check");
6811 address start = load_archive_data(stub_id);
6812 if (start != nullptr) {
6813 return start;
6814 }
6815 __ align(CodeEntryAlignment);
6816 StubCodeMark mark(this, stub_id);
6817 start = __ pc();
6818 __ enter();
6819
6820 const Register coeffs = c_rarg0;
6821
6822 const Register kyberConsts = r10;
6823 const Register result = r11;
6824
6825 // As above we process 256 sets of values in total i.e. 32 x
6826 // 8H quadwords. So, we can load, add and store the data in 3
6827 // groups of 11, 11 and 10 at a time i.e. we need to map sets
6828 // of 10 or 11 registers. A further constraint is that the
6829 // mapping needs to skip callee saves. So, we allocate the
6830 // register sequences using two 8 sequences, two 2 sequences
6831 // and two single registers.
6832 VSeq<8> vs1_1(0);
6833 VSeq<2> vs1_2(16);
6834 FloatRegister vs1_3 = v28;
6835 VSeq<8> vs2_1(18);
6836 VSeq<2> vs2_2(26);
6837 FloatRegister vs2_3 = v29;
6838
6839 // we also need a pair of corresponding constant sequences
6840
6841 VSeq<8> vc1_1(30, 0);
6842 VSeq<2> vc1_2(30, 0);
6843 FloatRegister vc1_3 = v30; // for kyber_q
6844
6845 VSeq<8> vc2_1(31, 0);
6846 VSeq<2> vc2_2(31, 0);
6847 FloatRegister vc2_3 = v31; // for kyberBarrettMultiplier
6848
6849 __ add(result, coeffs, 0);
6850 __ lea(kyberConsts,
6851 ExternalAddress((address) StubRoutines::aarch64::_kyberConsts));
6852
6853 // load q and the multiplier for the Barrett reduction
6854 __ add(kyberConsts, kyberConsts, 16);
6855 __ ldpq(vc1_3, vc2_3, kyberConsts);
6856
6857 for (int i = 0; i < 3; i++) {
6858 // load 80 or 88 coefficients
6859 vs_ldpq_post(vs1_1, coeffs);
6860 vs_ldpq_post(vs1_2, coeffs);
6861 if (i < 2) {
6862 __ ldr(vs1_3, __ Q, __ post(coeffs, 16));
6863 }
6864
6865 // vs2 <- (2 * vs1 * kyberBarrettMultiplier) >> 16
6866 vs_sqdmulh(vs2_1, __ T8H, vs1_1, vc2_1);
6867 vs_sqdmulh(vs2_2, __ T8H, vs1_2, vc2_2);
6868 if (i < 2) {
6869 __ sqdmulh(vs2_3, __ T8H, vs1_3, vc2_3);
6870 }
6871
6872 // vs2 <- (vs1 * kyberBarrettMultiplier) >> 26
6873 vs_sshr(vs2_1, __ T8H, vs2_1, 11);
6874 vs_sshr(vs2_2, __ T8H, vs2_2, 11);
6875 if (i < 2) {
6876 __ sshr(vs2_3, __ T8H, vs2_3, 11);
6877 }
6878
6879 // vs1 <- vs1 - vs2 * kyber_q
6880 vs_mlsv(vs1_1, __ T8H, vs2_1, vc1_1);
6881 vs_mlsv(vs1_2, __ T8H, vs2_2, vc1_2);
6882 if (i < 2) {
6883 __ mlsv(vs1_3, __ T8H, vs2_3, vc1_3);
6884 }
6885
6886 vs_stpq_post(vs1_1, result);
6887 vs_stpq_post(vs1_2, result);
6888 if (i < 2) {
6889 __ str(vs1_3, __ Q, __ post(result, 16));
6890 }
6891 }
6892
6893 __ leave(); // required for proper stackwalking of RuntimeStub frame
6894 __ mov(r0, zr); // return 0
6895 __ ret(lr);
6896
6897 // record the stub entry and end
6898 store_archive_data(stub_id, start, __ pc());
6899
6900 return start;
6901 }
6902
6903
6904 // Dilithium-specific montmul helper routines that generate parallel
6905 // code for, respectively, a single 4x4s vector sequence montmul or
6906 // two such multiplies in a row.
6907
6908 // Perform 16 32-bit Montgomery multiplications in parallel
void dilithium_montmul16(const VSeq<4>& va, const VSeq<4>& vb, const VSeq<4>& vc,
                         const VSeq<4>& vtmp, const VSeq<2>& vq) {
  // Dilithium-specific wrapper: forwards all five register sequences
  // unchanged to vs_montmul4 with the lane arrangement fixed at 4S.
  // Use the helper routine to schedule a 4x4S Montgomery multiply.
  // It will assert that the register use is valid
  vs_montmul4(va, vb, vc, __ T4S, vtmp, vq);
}
6915
6916 // Perform 2x16 32-bit Montgomery multiplications in parallel
6917 void dilithium_montmul32(const VSeq<8>& va, const VSeq<8>& vb, const VSeq<8>& vc,
6918 const VSeq<4>& vtmp, const VSeq<2>& vq) {
6919 // Schedule two successive 4x4S multiplies via the montmul helper
6920 // on the front and back halves of va, vb and vc. The helper will
6921 // assert that the register use has no overlap conflicts on each
6922 // individual call but we also need to ensure that the necessary
6923 // disjoint/equality constraints are met across both calls.
6924
6925 // vb, vc, vtmp and vq must be disjoint. va must either be
6926 // disjoint from all other registers or equal vc
6927
6928 assert(vs_disjoint(vb, vc), "vb and vc overlap");
6929 assert(vs_disjoint(vb, vq), "vb and vq overlap");
6930 assert(vs_disjoint(vb, vtmp), "vb and vtmp overlap");
6931
6932 assert(vs_disjoint(vc, vq), "vc and vq overlap");
6933 assert(vs_disjoint(vc, vtmp), "vc and vtmp overlap");
6934
6935 assert(vs_disjoint(vq, vtmp), "vq and vtmp overlap");
6936
6937 assert(vs_disjoint(va, vc) || vs_same(va, vc), "va and vc neither disjoint nor equal");
6938 assert(vs_disjoint(va, vb), "va and vb overlap");
6939 assert(vs_disjoint(va, vq), "va and vq overlap");
6940 assert(vs_disjoint(va, vtmp), "va and vtmp overlap");
6941
6942 // We multiply the front and back halves of each sequence 4 at a
6943 // time because
6944 //
6945 // 1) we are currently only able to get 4-way instruction
6946 // parallelism at best
6947 //
6948 // 2) we need registers for the constants in vq and temporary
6949 // scratch registers to hold intermediate results so vtmp can only
6950 // be a VSeq<4> which means we only have 4 scratch slots.
6951
6952 vs_montmul4(vs_front(va), vs_front(vb), vs_front(vc), __ T4S, vtmp, vq);
6953 vs_montmul4(vs_back(va), vs_back(vb), vs_back(vc), __ T4S, vtmp, vq);
6954 }
6955
6956 // Perform combined montmul then add/sub on 4x4S vectors.
void dilithium_montmul16_sub_add(
    const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vc,
    const VSeq<4>& vtmp, const VSeq<2>& vq) {
  // vc supplies the multiplier on entry and is overwritten with the
  // product, so its incoming contents are not preserved. The order of
  // the subtract and add matters: va0 is read by the subtract before
  // the add overwrites it.
  // compute a = montmul(a1, c)
  dilithium_montmul16(vc, va1, vc, vtmp, vq);
  // output a1 = a0 - a
  vs_subv(va1, __ T4S, va0, vc);
  //    and a0 = a0 + a
  vs_addv(va0, __ T4S, va0, vc);
}
6967
// Perform combined add/sub then montmul on 4x4S vectors.
void dilithium_sub_add_montmul16(
    const VSeq<4>& va0, const VSeq<4>& va1, const VSeq<4>& vb,
    const VSeq<4>& vtmp1, const VSeq<4>& vtmp2, const VSeq<2>& vq) {
  // vtmp1 receives the difference a0 - a1 and is then consumed as a
  // multiplicand; vtmp2 is handed to the montmul as its scratch
  // sequence. The difference is taken first because the following add
  // overwrites va0.
  // compute c = a0 - a1
  vs_subv(vtmp1, __ T4S, va0, va1);
  // output a0 = a0 + a1
  vs_addv(va0, __ T4S, va0, va1);
  // output a1 = b montmul c
  dilithium_montmul16(va1, vtmp1, vb, vtmp2, vq);
}
6979
6980 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
6981 // in the Java implementation come in sequences of at least 8, so we
6982 // can use ldpq to collect the corresponding data into pairs of vector
6983 // registers.
6984 // We collect the coefficients corresponding to the 'j+l' indexes into
6985 // the vector registers v0-v7, the zetas into the vector registers v16-v23
6986 // then we do the (Montgomery) multiplications by the zetas in parallel
6987 // into v16-v23, load the coeffs corresponding to the 'j' indexes into
6988 // v0-v7, then do the additions into v24-v31 and the subtractions into
6989 // v0-v7 and finally save the results back to the coeffs array.
6990 void dilithiumNttLevel0_4(const Register dilithiumConsts,
6991 const Register coeffs, const Register zetas) {
6992 int c1 = 0;
6993 int c2 = 512;
6994 int startIncr;
6995 // don't use callee save registers v8 - v15
6996 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
6997 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
6998 VSeq<2> vq(30); // n.b. constants overlap vs3
6999 int offsets[4] = { 0, 32, 64, 96 };
7000
7001 for (int level = 0; level < 5; level++) {
7002 int c1Start = c1;
7003 int c2Start = c2;
7004 if (level == 3) {
7005 offsets[1] = 32;
7006 offsets[2] = 128;
7007 offsets[3] = 160;
7008 } else if (level == 4) {
7009 offsets[1] = 64;
7010 offsets[2] = 128;
7011 offsets[3] = 192;
7012 }
7013
7014 // For levels 1 - 4 we simply load 2 x 4 adjacent values at a
7015 // time at 4 different offsets and multiply them in order by the
7016 // next set of input values. So we employ indexed load and store
7017 // pair instructions with arrangement 4S.
7018 for (int i = 0; i < 4; i++) {
7019 // reload q and qinv
7020 vs_ldpq(vq, dilithiumConsts); // qInv, q
7021 // load 8x4S coefficients via second start pos == c2
7022 vs_ldpq_indexed(vs1, coeffs, c2Start, offsets);
7023 // load next 8x4S inputs == b
7024 vs_ldpq_post(vs2, zetas);
7025 // compute a == c2 * b mod MONT_Q
7026 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7027 // load 8x4s coefficients via first start pos == c1
7028 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
7029 // compute a1 = c1 + a
7030 vs_addv(vs3, __ T4S, vs1, vs2);
7031 // compute a2 = c1 - a
7032 vs_subv(vs1, __ T4S, vs1, vs2);
7033 // output a1 and a2
7034 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
7035 vs_stpq_indexed(vs1, coeffs, c2Start, offsets);
7036
7037 int k = 4 * level + i;
7038
7039 if (k > 7) {
7040 startIncr = 256;
7041 } else if (k == 5) {
7042 startIncr = 384;
7043 } else {
7044 startIncr = 128;
7045 }
7046
7047 c1Start += startIncr;
7048 c2Start += startIncr;
7049 }
7050
7051 c2 /= 2;
7052 }
7053 }
7054
7055 // Dilithium NTT function except for the final "normalization" to |coeff| < Q.
7056 // Implements the method
7057 // static int implDilithiumAlmostNtt(int[] coeffs, int zetas[]) {}
// of the sun.security.provider.ML_DSA class.
7059 //
7060 // coeffs (int[256]) = c_rarg0
7061 // zetas (int[256]) = c_rarg1
7062 address generate_dilithiumAlmostNtt() {
7063 StubId stub_id = StubId::stubgen_dilithiumAlmostNtt_id;
7064 int entry_count = StubInfo::entry_count(stub_id);
7065 assert(entry_count == 1, "sanity check");
7066 address start = load_archive_data(stub_id);
7067 if (start != nullptr) {
7068 return start;
7069 }
7070 __ align(CodeEntryAlignment);
7071 StubCodeMark mark(this, stub_id);
7072 start = __ pc();
7073 __ enter();
7074
7075 const Register coeffs = c_rarg0;
7076 const Register zetas = c_rarg1;
7077
7078 const Register tmpAddr = r9;
7079 const Register dilithiumConsts = r10;
7080 const Register result = r11;
7081 // don't use callee save registers v8 - v15
7082 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7083 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7084 VSeq<2> vq(30); // n.b. constants overlap vs3
7085 int offsets[4] = { 0, 32, 64, 96};
7086 int offsets1[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
7087 int offsets2[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
7088 __ add(result, coeffs, 0);
7089 __ lea(dilithiumConsts,
7090 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7091
7092 // Each level represents one iteration of the outer for loop of the Java version.
7093
7094 // level 0-4
7095 dilithiumNttLevel0_4(dilithiumConsts, coeffs, zetas);
7096
7097 // level 5
7098
7099 // At level 5 the coefficients we need to combine with the zetas
7100 // are grouped in memory in blocks of size 4. So, for both sets of
7101 // coefficients we load 4 adjacent values at 8 different offsets
7102 // using an indexed ldr with register variant Q and multiply them
7103 // in sequence order by the next set of inputs. Likewise we store
7104 // the resuls using an indexed str with register variant Q.
7105 for (int i = 0; i < 1024; i += 256) {
7106 // reload constants q, qinv each iteration as they get clobbered later
7107 vs_ldpq(vq, dilithiumConsts); // qInv, q
7108 // load 32 (8x4S) coefficients via first offsets = c1
7109 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
7110 // load next 32 (8x4S) inputs = b
7111 vs_ldpq_post(vs2, zetas);
7112 // a = b montul c1
7113 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7114 // load 32 (8x4S) coefficients via second offsets = c2
7115 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets2);
7116 // add/sub with result of multiply
7117 vs_addv(vs3, __ T4S, vs1, vs2); // a1 = a - c2
7118 vs_subv(vs1, __ T4S, vs1, vs2); // a0 = a + c1
7119 // write back new coefficients using same offsets
7120 vs_str_indexed(vs3, __ Q, coeffs, i, offsets2);
7121 vs_str_indexed(vs1, __ Q, coeffs, i, offsets1);
7122 }
7123
7124 // level 6
7125 // At level 6 the coefficients we need to combine with the zetas
7126 // are grouped in memory in pairs, the first two being montmul
7127 // inputs and the second add/sub inputs. We can still implement
7128 // the montmul+sub+add using 4-way parallelism but only if we
7129 // combine the coefficients with the zetas 16 at a time. We load 8
7130 // adjacent values at 4 different offsets using an ld2 load with
7131 // arrangement 2D. That interleaves the lower and upper halves of
7132 // each pair of quadwords into successive vector registers. We
7133 // then need to montmul the 4 even elements of the coefficients
7134 // register sequence by the zetas in order and then add/sub the 4
7135 // odd elements of the coefficients register sequence. We use an
7136 // equivalent st2 operation to store the results back into memory
7137 // de-interleaved.
7138 for (int i = 0; i < 1024; i += 128) {
7139 // reload constants q, qinv each iteration as they get clobbered later
7140 vs_ldpq(vq, dilithiumConsts); // qInv, q
7141 // load interleaved 16 (4x2D) coefficients via offsets
7142 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7143 // load next 16 (4x4S) inputs
7144 vs_ldpq_post(vs_front(vs2), zetas);
7145 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
7146 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
7147 vs_front(vs2), vtmp, vq);
7148 // store interleaved 16 (4x2D) coefficients via offsets
7149 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7150 }
7151
7152 // level 7
7153 // At level 7 the coefficients we need to combine with the zetas
7154 // occur singly with montmul inputs alterating with add/sub
7155 // inputs. Once again we can use 4-way parallelism to combine 16
7156 // zetas at a time. However, we have to load 8 adjacent values at
7157 // 4 different offsets using an ld2 load with arrangement 4S. That
7158 // interleaves the the odd words of each pair into one
7159 // coefficients vector register and the even words of the pair
7160 // into the next register. We then need to montmul the 4 even
7161 // elements of the coefficients register sequence by the zetas in
7162 // order and then add/sub the 4 odd elements of the coefficients
7163 // register sequence. We use an equivalent st2 operation to store
7164 // the results back into memory de-interleaved.
7165
7166 for (int i = 0; i < 1024; i += 128) {
7167 // reload constants q, qinv each iteration as they get clobbered later
7168 vs_ldpq(vq, dilithiumConsts); // qInv, q
7169 // load interleaved 16 (4x4S) coefficients via offsets
7170 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7171 // load next 16 (4x4S) inputs
7172 vs_ldpq_post(vs_front(vs2), zetas);
7173 // mont multiply odd elements of vs1 by vs2 and add/sub into odds/evens
7174 dilithium_montmul16_sub_add(vs_even(vs1), vs_odd(vs1),
7175 vs_front(vs2), vtmp, vq);
7176 // store interleaved 16 (4x4S) coefficients via offsets
7177 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7178 }
7179 __ leave(); // required for proper stackwalking of RuntimeStub frame
7180 __ mov(r0, zr); // return 0
7181 __ ret(lr);
7182
7183 // record the stub entry and end
7184 store_archive_data(stub_id, start, __ pc());
7185
7186 return start;
7187 }
7188
7189 // At these levels, the indices that correspond to the 'j's (and 'j+l's)
7190 // in the Java implementation come in sequences of at least 8, so we
7191 // can use ldpq to collect the corresponding data into pairs of vector
7192 // registers
7193 // We collect the coefficients that correspond to the 'j's into vs1
// the coefficients that correspond to the 'j+l's into vs2 then
7195 // do the additions into vs3 and the subtractions into vs1 then
7196 // save the result of the additions, load the zetas into vs2
7197 // do the (Montgomery) multiplications by zeta in parallel into vs2
7198 // finally save the results back to the coeffs array
7199 void dilithiumInverseNttLevel3_7(const Register dilithiumConsts,
7200 const Register coeffs, const Register zetas) {
7201 int c1 = 0;
7202 int c2 = 32;
7203 int startIncr;
7204 int offsets[4];
7205 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7206 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7207 VSeq<2> vq(30); // n.b. constants overlap vs3
7208
7209 offsets[0] = 0;
7210
7211 for (int level = 3; level < 8; level++) {
7212 int c1Start = c1;
7213 int c2Start = c2;
7214 if (level == 3) {
7215 offsets[1] = 64;
7216 offsets[2] = 128;
7217 offsets[3] = 192;
7218 } else if (level == 4) {
7219 offsets[1] = 32;
7220 offsets[2] = 128;
7221 offsets[3] = 160;
7222 } else {
7223 offsets[1] = 32;
7224 offsets[2] = 64;
7225 offsets[3] = 96;
7226 }
7227
7228 // For levels 3 - 7 we simply load 2 x 4 adjacent values at a
7229 // time at 4 different offsets and multiply them in order by the
7230 // next set of input values. So we employ indexed load and store
7231 // pair instructions with arrangement 4S.
7232 for (int i = 0; i < 4; i++) {
7233 // load v1 32 (8x4S) coefficients relative to first start index
7234 vs_ldpq_indexed(vs1, coeffs, c1Start, offsets);
7235 // load v2 32 (8x4S) coefficients relative to second start index
7236 vs_ldpq_indexed(vs2, coeffs, c2Start, offsets);
7237 // a0 = v1 + v2 -- n.b. clobbers vqs
7238 vs_addv(vs3, __ T4S, vs1, vs2);
7239 // a1 = v1 - v2
7240 vs_subv(vs1, __ T4S, vs1, vs2);
7241 // save a1 relative to first start index
7242 vs_stpq_indexed(vs3, coeffs, c1Start, offsets);
7243 // load constants q, qinv each iteration as they get clobbered above
7244 vs_ldpq(vq, dilithiumConsts); // qInv, q
7245 // load b next 32 (8x4S) inputs
7246 vs_ldpq_post(vs2, zetas);
7247 // a = a1 montmul b
7248 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7249 // save a relative to second start index
7250 vs_stpq_indexed(vs2, coeffs, c2Start, offsets);
7251
7252 int k = 4 * level + i;
7253
7254 if (k < 24) {
7255 startIncr = 256;
7256 } else if (k == 25) {
7257 startIncr = 384;
7258 } else {
7259 startIncr = 128;
7260 }
7261
7262 c1Start += startIncr;
7263 c2Start += startIncr;
7264 }
7265
7266 c2 *= 2;
7267 }
7268 }
7269
7270 // Dilithium Inverse NTT function except the final mod Q division by 2^256.
7271 // Implements the method
7272 // static int implDilithiumAlmostInverseNtt(int[] coeffs, int[] zetas) {} of
7273 // the sun.security.provider.ML_DSA class.
7274 //
7275 // coeffs (int[256]) = c_rarg0
7276 // zetas (int[256]) = c_rarg1
7277 address generate_dilithiumAlmostInverseNtt() {
7278 StubId stub_id = StubId::stubgen_dilithiumAlmostInverseNtt_id;
7279 int entry_count = StubInfo::entry_count(stub_id);
7280 assert(entry_count == 1, "sanity check");
7281 address start = load_archive_data(stub_id);
7282 if (start != nullptr) {
7283 return start;
7284 }
7285 __ align(CodeEntryAlignment);
7286 StubCodeMark mark(this, stub_id);
7287 start = __ pc();
7288 __ enter();
7289
7290 const Register coeffs = c_rarg0;
7291 const Register zetas = c_rarg1;
7292
7293 const Register tmpAddr = r9;
7294 const Register dilithiumConsts = r10;
7295 const Register result = r11;
7296 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7297 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7298 VSeq<2> vq(30); // n.b. constants overlap vs3
7299 int offsets[4] = { 0, 32, 64, 96 };
7300 int offsets1[8] = { 0, 32, 64, 96, 128, 160, 192, 224 };
7301 int offsets2[8] = { 16, 48, 80, 112, 144, 176, 208, 240 };
7302
7303 __ add(result, coeffs, 0);
7304 __ lea(dilithiumConsts,
7305 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7306
7307 // Each level represents one iteration of the outer for loop of the Java version
7308
7309 // level 0
7310 // At level 0 we need to interleave adjacent quartets of
7311 // coefficients before we multiply and add/sub by the next 16
7312 // zetas just as we did for level 7 in the multiply code. So we
7313 // load and store the values using an ld2/st2 with arrangement 4S.
7314 for (int i = 0; i < 1024; i += 128) {
7315 // load constants q, qinv
7316 // n.b. this can be moved out of the loop as they do not get
7317 // clobbered by first two loops
7318 vs_ldpq(vq, dilithiumConsts); // qInv, q
7319 // a0/a1 load interleaved 32 (8x4S) coefficients
7320 vs_ld2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7321 // b load next 32 (8x4S) inputs
7322 vs_ldpq_post(vs_front(vs2), zetas);
7323 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
7324 // n.b. second half of vs2 provides temporary register storage
7325 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
7326 vs_front(vs2), vs_back(vs2), vtmp, vq);
7327 // a0/a1 store interleaved 32 (8x4S) coefficients
7328 vs_st2_indexed(vs1, __ T4S, coeffs, tmpAddr, i, offsets);
7329 }
7330
7331 // level 1
7332 // At level 1 we need to interleave pairs of adjacent pairs of
7333 // coefficients before we multiply by the next 16 zetas just as we
7334 // did for level 6 in the multiply code. So we load and store the
7335 // values an ld2/st2 with arrangement 2D.
7336 for (int i = 0; i < 1024; i += 128) {
7337 // a0/a1 load interleaved 32 (8x2D) coefficients
7338 vs_ld2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7339 // b load next 16 (4x4S) inputs
7340 vs_ldpq_post(vs_front(vs2), zetas);
7341 // compute in parallel (a0, a1) = (a0 + a1, (a0 - a1) montmul b)
7342 // n.b. second half of vs2 provides temporary register storage
7343 dilithium_sub_add_montmul16(vs_even(vs1), vs_odd(vs1),
7344 vs_front(vs2), vs_back(vs2), vtmp, vq);
7345 // a0/a1 store interleaved 32 (8x2D) coefficients
7346 vs_st2_indexed(vs1, __ T2D, coeffs, tmpAddr, i, offsets);
7347 }
7348
7349 // level 2
7350 // At level 2 coefficients come in blocks of 4. So, we load 4
7351 // adjacent coefficients at 8 distinct offsets for both the first
7352 // and second coefficient sequences, using an ldr with register
7353 // variant Q then combine them with next set of 32 zetas. Likewise
7354 // we store the results using an str with register variant Q.
7355 for (int i = 0; i < 1024; i += 256) {
7356 // c0 load 32 (8x4S) coefficients via first offsets
7357 vs_ldr_indexed(vs1, __ Q, coeffs, i, offsets1);
7358 // c1 load 32 (8x4S) coefficients via second offsets
7359 vs_ldr_indexed(vs2, __ Q,coeffs, i, offsets2);
7360 // a0 = c0 + c1 n.b. clobbers vq which overlaps vs3
7361 vs_addv(vs3, __ T4S, vs1, vs2);
7362 // c = c0 - c1
7363 vs_subv(vs1, __ T4S, vs1, vs2);
7364 // store a0 32 (8x4S) coefficients via first offsets
7365 vs_str_indexed(vs3, __ Q, coeffs, i, offsets1);
7366 // b load 32 (8x4S) next inputs
7367 vs_ldpq_post(vs2, zetas);
7368 // reload constants q, qinv -- they were clobbered earlier
7369 vs_ldpq(vq, dilithiumConsts); // qInv, q
7370 // compute a1 = b montmul c
7371 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7372 // store a1 32 (8x4S) coefficients via second offsets
7373 vs_str_indexed(vs2, __ Q, coeffs, i, offsets2);
7374 }
7375
7376 // level 3-7
7377 dilithiumInverseNttLevel3_7(dilithiumConsts, coeffs, zetas);
7378
7379 __ leave(); // required for proper stackwalking of RuntimeStub frame
7380 __ mov(r0, zr); // return 0
7381 __ ret(lr);
7382
7383 // record the stub entry and end
7384 store_archive_data(stub_id, start, __ pc());
7385
7386 return start;
7387 }
7388
7389 // Dilithium multiply polynomials in the NTT domain.
7390 // Straightforward implementation of the method
7391 // static int implDilithiumNttMult(
//              int[] result, int[] ntta, int[] nttb) {} of
7393 // the sun.security.provider.ML_DSA class.
7394 //
7395 // result (int[256]) = c_rarg0
7396 // poly1 (int[256]) = c_rarg1
7397 // poly2 (int[256]) = c_rarg2
7398 address generate_dilithiumNttMult() {
7399 StubId stub_id = StubId::stubgen_dilithiumNttMult_id;
7400 int entry_count = StubInfo::entry_count(stub_id);
7401 assert(entry_count == 1, "sanity check");
7402 address start = load_archive_data(stub_id);
7403 if (start != nullptr) {
7404 return start;
7405 }
7406 __ align(CodeEntryAlignment);
7407 StubCodeMark mark(this, stub_id);
7408 start = __ pc();
7409 __ enter();
7410
7411 Label L_loop;
7412
7413 const Register result = c_rarg0;
7414 const Register poly1 = c_rarg1;
7415 const Register poly2 = c_rarg2;
7416
7417 const Register dilithiumConsts = r10;
7418 const Register len = r11;
7419
7420 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7421 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7422 VSeq<2> vq(30); // n.b. constants overlap vs3
7423 VSeq<8> vrsquare(29, 0); // for montmul by constant RSQUARE
7424
7425 __ lea(dilithiumConsts,
7426 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7427
7428 // load constants q, qinv
7429 vs_ldpq(vq, dilithiumConsts); // qInv, q
7430 // load constant rSquare into v29
7431 __ ldr(v29, __ Q, Address(dilithiumConsts, 48)); // rSquare
7432
7433 __ mov(len, zr);
7434 __ add(len, len, 1024);
7435
7436 __ BIND(L_loop);
7437
7438 // b load 32 (8x4S) next inputs from poly1
7439 vs_ldpq_post(vs1, poly1);
7440 // c load 32 (8x4S) next inputs from poly2
7441 vs_ldpq_post(vs2, poly2);
7442 // compute a = b montmul c
7443 dilithium_montmul32(vs2, vs1, vs2, vtmp, vq);
7444 // compute a = rsquare montmul a
7445 dilithium_montmul32(vs2, vrsquare, vs2, vtmp, vq);
7446 // save a 32 (8x4S) results
7447 vs_stpq_post(vs2, result);
7448
7449 __ sub(len, len, 128);
7450 __ cmp(len, (u1)128);
7451 __ br(Assembler::GE, L_loop);
7452
7453 __ leave(); // required for proper stackwalking of RuntimeStub frame
7454 __ mov(r0, zr); // return 0
7455 __ ret(lr);
7456
7457 // record the stub entry and end
7458 store_archive_data(stub_id, start, __ pc());
7459
7460 return start;
7461 }
7462
// Dilithium Montgomery multiply an array by a constant.
// A straightforward implementation of the method
// static int implDilithiumMontMulByConstant(int[] coeffs, int constant) {}
// of the sun.security.provider.ML_DSA class
7467 //
7468 // coeffs (int[256]) = c_rarg0
7469 // constant (int) = c_rarg1
7470 address generate_dilithiumMontMulByConstant() {
7471 StubId stub_id = StubId::stubgen_dilithiumMontMulByConstant_id;
7472 int entry_count = StubInfo::entry_count(stub_id);
7473 assert(entry_count == 1, "sanity check");
7474 address start = load_archive_data(stub_id);
7475 if (start != nullptr) {
7476 return start;
7477 }
7478 __ align(CodeEntryAlignment);
7479 StubCodeMark mark(this, stub_id);
7480 start = __ pc();
7481 __ enter();
7482
7483 Label L_loop;
7484
7485 const Register coeffs = c_rarg0;
7486 const Register constant = c_rarg1;
7487
7488 const Register dilithiumConsts = r10;
7489 const Register result = r11;
7490 const Register len = r12;
7491
7492 VSeq<8> vs1(0), vs2(16), vs3(24); // 3 sets of 8x4s inputs/outputs
7493 VSeq<4> vtmp = vs_front(vs3); // n.b. tmp registers overlap vs3
7494 VSeq<2> vq(30); // n.b. constants overlap vs3
7495 VSeq<8> vconst(29, 0); // for montmul by constant
7496
7497 // results track inputs
7498 __ add(result, coeffs, 0);
7499 __ lea(dilithiumConsts,
7500 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7501
7502 // load constants q, qinv -- they do not get clobbered by first two loops
7503 vs_ldpq(vq, dilithiumConsts); // qInv, q
7504 // copy caller supplied constant across vconst
7505 __ dup(vconst[0], __ T4S, constant);
7506 __ mov(len, zr);
7507 __ add(len, len, 1024);
7508
7509 __ BIND(L_loop);
7510
7511 // load next 32 inputs
7512 vs_ldpq_post(vs2, coeffs);
7513 // mont mul by constant
7514 dilithium_montmul32(vs2, vconst, vs2, vtmp, vq);
7515 // write next 32 results
7516 vs_stpq_post(vs2, result);
7517
7518 __ sub(len, len, 128);
7519 __ cmp(len, (u1)128);
7520 __ br(Assembler::GE, L_loop);
7521
7522 __ leave(); // required for proper stackwalking of RuntimeStub frame
7523 __ mov(r0, zr); // return 0
7524 __ ret(lr);
7525
7526 // record the stub entry and end
7527 store_archive_data(stub_id, start, __ pc());
7528
7529 return start;
7530 }
7531
7532 // Dilithium decompose poly.
7533 // Implements the method
// static int implDilithiumDecomposePoly(int[] input, int[] lowPart,
//              int[] highPart, int twoGamma2, int multiplier) {}
7535 // of the sun.security.provider.ML_DSA class
7536 //
7537 // input (int[256]) = c_rarg0
7538 // lowPart (int[256]) = c_rarg1
7539 // highPart (int[256]) = c_rarg2
7540 // twoGamma2 (int) = c_rarg3
7541 // multiplier (int) = c_rarg4
7542 address generate_dilithiumDecomposePoly() {
7543 StubId stub_id = StubId::stubgen_dilithiumDecomposePoly_id;
7544 int entry_count = StubInfo::entry_count(stub_id);
7545 assert(entry_count == 1, "sanity check");
7546 address start = load_archive_data(stub_id);
7547 if (start != nullptr) {
7548 return start;
7549 }
7550 __ align(CodeEntryAlignment);
7551 StubCodeMark mark(this, stub_id);
7552 start = __ pc();
7553 Label L_loop;
7554
7555 const Register input = c_rarg0;
7556 const Register lowPart = c_rarg1;
7557 const Register highPart = c_rarg2;
7558 const Register twoGamma2 = c_rarg3;
7559 const Register multiplier = c_rarg4;
7560
7561 const Register len = r9;
7562 const Register dilithiumConsts = r10;
7563 const Register tmp = r11;
7564
7565 // 6 independent sets of 4x4s values
7566 VSeq<4> vs1(0), vs2(4), vs3(8);
7567 VSeq<4> vs4(12), vs5(16), vtmp(20);
7568
7569 // 7 constants for cross-multiplying
7570 VSeq<4> one(25, 0);
7571 VSeq<4> qminus1(26, 0);
7572 VSeq<4> g2(27, 0);
7573 VSeq<4> twog2(28, 0);
7574 VSeq<4> mult(29, 0);
7575 VSeq<4> q(30, 0);
7576 VSeq<4> qadd(31, 0);
7577
7578 __ enter();
7579
7580 __ lea(dilithiumConsts,
7581 ExternalAddress((address) StubRoutines::aarch64::_dilithiumConsts));
7582
7583 // save callee-saved registers
7584 __ stpd(v8, v9, __ pre(sp, -64));
7585 __ stpd(v10, v11, Address(sp, 16));
7586 __ stpd(v12, v13, Address(sp, 32));
7587 __ stpd(v14, v15, Address(sp, 48));
7588
7589 // populate constant registers
7590 __ mov(tmp, zr);
7591 __ add(tmp, tmp, 1);
7592 __ dup(one[0], __ T4S, tmp); // 1
7593 __ ldr(q[0], __ Q, Address(dilithiumConsts, 16)); // q
7594 __ ldr(qadd[0], __ Q, Address(dilithiumConsts, 64)); // addend for mod q reduce
7595 __ dup(twog2[0], __ T4S, twoGamma2); // 2 * gamma2
7596 __ dup(mult[0], __ T4S, multiplier); // multiplier for mod 2 * gamma reduce
7597 __ subv(qminus1[0], __ T4S, v30, v25); // q - 1
7598 __ sshr(g2[0], __ T4S, v28, 1); // gamma2
7599
7600 __ mov(len, zr);
7601 __ add(len, len, 1024);
7602
7603 __ BIND(L_loop);
7604
7605 // load next 4x4S inputs interleaved: rplus --> vs1
7606 __ ld4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(input, 64));
7607
7608 // rplus = rplus - ((rplus + qadd) >> 23) * q
7609 vs_addv(vtmp, __ T4S, vs1, qadd);
7610 vs_sshr(vtmp, __ T4S, vtmp, 23);
7611 vs_mulv(vtmp, __ T4S, vtmp, q);
7612 vs_subv(vs1, __ T4S, vs1, vtmp);
7613
7614 // rplus = rplus + ((rplus >> 31) & dilithium_q);
7615 vs_sshr(vtmp, __ T4S, vs1, 31);
7616 vs_andr(vtmp, vtmp, q);
7617 vs_addv(vs1, __ T4S, vs1, vtmp);
7618
7619 // quotient --> vs2
7620 // int quotient = (rplus * multiplier) >> 22;
7621 vs_mulv(vtmp, __ T4S, vs1, mult);
7622 vs_sshr(vs2, __ T4S, vtmp, 22);
7623
7624 // r0 --> vs3
7625 // int r0 = rplus - quotient * twoGamma2;
7626 vs_mulv(vtmp, __ T4S, vs2, twog2);
7627 vs_subv(vs3, __ T4S, vs1, vtmp);
7628
7629 // mask --> vs4
7630 // int mask = (twoGamma2 - r0) >> 22;
7631 vs_subv(vtmp, __ T4S, twog2, vs3);
7632 vs_sshr(vs4, __ T4S, vtmp, 22);
7633
7634 // r0 -= (mask & twoGamma2);
7635 vs_andr(vtmp, vs4, twog2);
7636 vs_subv(vs3, __ T4S, vs3, vtmp);
7637
7638 // quotient += (mask & 1);
7639 vs_andr(vtmp, vs4, one);
7640 vs_addv(vs2, __ T4S, vs2, vtmp);
7641
7642 // mask = (twoGamma2 / 2 - r0) >> 31;
7643 vs_subv(vtmp, __ T4S, g2, vs3);
7644 vs_sshr(vs4, __ T4S, vtmp, 31);
7645
7646 // r0 -= (mask & twoGamma2);
7647 vs_andr(vtmp, vs4, twog2);
7648 vs_subv(vs3, __ T4S, vs3, vtmp);
7649
7650 // quotient += (mask & 1);
7651 vs_andr(vtmp, vs4, one);
7652 vs_addv(vs2, __ T4S, vs2, vtmp);
7653
7654 // r1 --> vs5
7655 // int r1 = rplus - r0 - (dilithium_q - 1);
7656 vs_subv(vtmp, __ T4S, vs1, vs3);
7657 vs_subv(vs5, __ T4S, vtmp, qminus1);
7658
7659 // r1 --> vs1 (overwriting rplus)
7660 // r1 = (r1 | (-r1)) >> 31; // 0 if rplus - r0 == (dilithium_q - 1), -1 otherwise
7661 vs_negr(vtmp, __ T4S, vs5);
7662 vs_orr(vtmp, vs5, vtmp);
7663 vs_sshr(vs1, __ T4S, vtmp, 31);
7664
7665 // r0 += ~r1;
7666 vs_notr(vtmp, vs1);
7667 vs_addv(vs3, __ T4S, vs3, vtmp);
7668
7669 // r1 = r1 & quotient;
7670 vs_andr(vs1, vs2, vs1);
7671
7672 // store results inteleaved
7673 // lowPart[m] = r0;
7674 // highPart[m] = r1;
7675 __ st4(vs3[0], vs3[1], vs3[2], vs3[3], __ T4S, __ post(lowPart, 64));
7676 __ st4(vs1[0], vs1[1], vs1[2], vs1[3], __ T4S, __ post(highPart, 64));
7677
7678 __ sub(len, len, 64);
7679 __ cmp(len, (u1)64);
7680 __ br(Assembler::GE, L_loop);
7681
7682 // restore callee-saved vector registers
7683 __ ldpd(v14, v15, Address(sp, 48));
7684 __ ldpd(v12, v13, Address(sp, 32));
7685 __ ldpd(v10, v11, Address(sp, 16));
7686 __ ldpd(v8, v9, __ post(sp, 64));
7687
7688 __ leave(); // required for proper stackwalking of RuntimeStub frame
7689 __ mov(r0, zr); // return 0
7690 __ ret(lr);
7691
7692 // record the stub entry and end
7693 store_archive_data(stub_id, start, __ pc());
7694
7695 return start;
7696 }
7697
// Emits ten instructions that update five general purpose registers so
// that, in terms of the values held on entry,
//   a0 ^= a2 & ~a1,  a1 ^= a3 & ~a2,  a2 ^= a4 & ~a3,
//   a3 ^= a0 & ~a4,  a4 ^= a1 & ~a0
// (bic computes its first source AND NOT its second). tmp0, tmp1 and
// tmp2 are clobbered.
// NOTE(review): this looks like the Keccak chi step applied to one row
// of the state -- confirm against the caller.
void bcax5(Register a0, Register a1, Register a2, Register a3, Register a4,
           Register tmp0, Register tmp1, Register tmp2) {
  // The terms destined for a0 and a1 are computed first and parked in
  // tmp0/tmp1 because their inputs (a1, a2) are overwritten, or are
  // still needed in original form, further down.
  __ bic(tmp0, a2, a1); // for a0
  __ bic(tmp1, a3, a2); // for a1
  __ bic(tmp2, a4, a3); // for a2
  __ eor(a2, a2, tmp2);
  __ bic(tmp2, a0, a4); // for a3
  __ eor(a3, a3, tmp2);
  // a0 and a1 must still hold their entry values here
  __ bic(tmp2, a1, a0); // for a4
  __ eor(a0, a0, tmp0);
  __ eor(a1, a1, tmp1);
  __ eor(a4, a4, tmp2);
}
7711
// Emits one round over the 25 state lanes a0..a24 held in general-purpose
// registers:
//   1. column parities c0..c4 and the derived d0..d4 are computed and each
//      d value is xor-ed into the five lanes of its column,
//   2. the lanes are rotated and moved to their new positions,
//   3. bcax5 is applied to each group of five lanes,
//   4. a 64-bit round constant is loaded from [rc] (rc is post-incremented
//      by 8) and xor-ed into a0.
// Two extra temporaries are needed in step 1. When both can_use_fp and
// can_use_r18 are true, rfp and r18_tls are used (and clobbered); otherwise
// a4 and a9 are spilled to the stack and used instead.
// tmp0, tmp1 and tmp2 are clobbered.
// NOTE(review): this looks like the Keccak-f[1600] theta/rho/pi/chi/iota
// sequence used by SHA3 - confirm against the Java reference.
7712 void keccak_round_gpr(bool can_use_fp, bool can_use_r18, Register rc,
7713 Register a0, Register a1, Register a2, Register a3, Register a4,
7714 Register a5, Register a6, Register a7, Register a8, Register a9,
7715 Register a10, Register a11, Register a12, Register a13, Register a14,
7716 Register a15, Register a16, Register a17, Register a18, Register a19,
7717 Register a20, Register a21, Register a22, Register a23, Register a24,
7718 Register tmp0, Register tmp1, Register tmp2) {
// NOTE(review): eor3(d, x, y, z) presumably yields x ^ y ^ z and
// rax1(d, x, y) presumably yields x ^ rol(y, 1) - confirm in MacroAssembler.
7719 __ eor3(tmp1, a4, a9, a14);
7720 __ eor3(tmp0, tmp1, a19, a24); // tmp0 = a4^a9^a14^a19^a24 = c4
7721 __ eor3(tmp2, a1, a6, a11);
7722 __ eor3(tmp1, tmp2, a16, a21); // tmp1 = a1^a6^a11^a16^a21 = c1
7723 __ rax1(tmp2, tmp0, tmp1); // d0
7724 {
7725
// pick the two additional temporaries; c4 (which needed a4 and a9) is
// already in tmp0, so a4/a9 can be spilled and reused if necessary
7726 Register tmp3, tmp4;
7727 if (can_use_fp && can_use_r18) {
7728 tmp3 = rfp;
7729 tmp4 = r18_tls;
7730 } else {
7731 tmp3 = a4;
7732 tmp4 = a9;
// spill a4/a9; they are reloaded below before a4 is updated with d4
7733 __ stp(tmp3, tmp4, __ pre(sp, -16));
7734 }
7735
7736 __ eor3(tmp3, a0, a5, a10);
7737 __ eor3(tmp4, tmp3, a15, a20); // tmp4 = a0^a5^a10^a15^a20 = c0
7738 __ eor(a0, a0, tmp2);
7739 __ eor(a5, a5, tmp2);
7740 __ eor(a10, a10, tmp2);
7741 __ eor(a15, a15, tmp2);
7742 __ eor(a20, a20, tmp2); // d0(tmp2)
7743 __ eor3(tmp3, a2, a7, a12);
7744 __ eor3(tmp2, tmp3, a17, a22); // tmp2 = a2^a7^a12^a17^a22 = c2
7745 __ rax1(tmp3, tmp4, tmp2); // d1
7746 __ eor(a1, a1, tmp3);
7747 __ eor(a6, a6, tmp3);
7748 __ eor(a11, a11, tmp3);
7749 __ eor(a16, a16, tmp3);
7750 __ eor(a21, a21, tmp3); // d1(tmp3)
7751 __ rax1(tmp3, tmp2, tmp0); // d3
7752 __ eor3(tmp2, a3, a8, a13);
7753 __ eor3(tmp0, tmp2, a18, a23); // tmp0 = a3^a8^a13^a18^a23 = c3
7754 __ eor(a3, a3, tmp3);
7755 __ eor(a8, a8, tmp3);
7756 __ eor(a13, a13, tmp3);
7757 __ eor(a18, a18, tmp3);
7758 __ eor(a23, a23, tmp3); // d3(tmp3)
7759 __ rax1(tmp2, tmp1, tmp0); // d2
7760 __ eor(a2, a2, tmp2);
7761 __ eor(a7, a7, tmp2);
7762 __ eor(a12, a12, tmp2);
7763 __ rax1(tmp0, tmp0, tmp4); // d4
// tmp3/tmp4 are no longer needed: restore a4/a9 if they were spilled
// (this condition is the negation of the one used to pick them above)
7764 if (!can_use_fp || !can_use_r18) {
7765 __ ldp(tmp3, tmp4, __ post(sp, 16));
7766 }
7767 __ eor(a17, a17, tmp2);
7768 __ eor(a22, a22, tmp2); // d2(tmp2)
7769 __ eor(a4, a4, tmp0);
7770 __ eor(a9, a9, tmp0);
7771 __ eor(a14, a14, tmp0);
7772 __ eor(a19, a19, tmp0);
7773 __ eor(a24, a24, tmp0); // d4(tmp0)
7774 }
7775
// Rotate each lane and move it to its new position. The moves form a single
// chain: each destination was consumed as a source by the previous rol.
// The rotated a10 is parked in tmp0 and lands in a7 at the end of the chain;
// a0 is not touched here.
7776 __ rol(tmp0, a10, 3);
7777 __ rol(a10, a1, 1);
7778 __ rol(a1, a6, 44);
7779 __ rol(a6, a9, 20);
7780 __ rol(a9, a22, 61);
7781 __ rol(a22, a14, 39);
7782 __ rol(a14, a20, 18);
7783 __ rol(a20, a2, 62);
7784 __ rol(a2, a12, 43);
7785 __ rol(a12, a13, 25);
7786 __ rol(a13, a19, 8) ;
7787 __ rol(a19, a23, 56);
7788 __ rol(a23, a15, 41);
7789 __ rol(a15, a4, 27);
7790 __ rol(a4, a24, 14);
7791 __ rol(a24, a21, 2);
7792 __ rol(a21, a8, 55);
7793 __ rol(a8, a16, 45);
7794 __ rol(a16, a5, 36);
7795 __ rol(a5, a3, 28);
7796 __ rol(a3, a18, 21);
7797 __ rol(a18, a17, 15);
7798 __ rol(a17, a11, 10);
7799 __ rol(a11, a7, 6);
7800 __ mov(a7, tmp0);
7801
// a[i] ^= a[i+2] & ~a[i+1] within each group of five lanes
7802 bcax5(a0, a1, a2, a3, a4, tmp0, tmp1, tmp2);
7803 bcax5(a5, a6, a7, a8, a9, tmp0, tmp1, tmp2);
7804 bcax5(a10, a11, a12, a13, a14, tmp0, tmp1, tmp2);
7805 bcax5(a15, a16, a17, a18, a19, tmp0, tmp1, tmp2);
7806 bcax5(a20, a21, a22, a23, a24, tmp0, tmp1, tmp2);
7807
// xor the next 64-bit round constant into a0 and advance rc by 8 bytes
7808 __ ldr(tmp1, __ post(rc, 8));
7809 __ eor(a0, a0, tmp1);
7810
7811 }
7812
7813 // Arguments:
7814 //
7815 // Inputs:
7816 // c_rarg0 - byte[] source+offset
7817 // c_rarg1 - byte[] SHA.state
7818 // c_rarg2 - int block_size
7819 // c_rarg3 - int offset
7820 // c_rarg4 - int limit
7821 //
// Generates the SHA3 block-compression stub that keeps all 25 state lanes
// (a0..a24) in general-purpose registers.
// stub_id selects the variant:
//   stubgen_sha3_implCompress_id   - absorbs one block; sets no return value,
//   stubgen_sha3_implCompressMB_id - after each block adds block_size to the
//     offset and repeats while offset <= limit; the final offset is returned
//     in c_rarg0.
// Returns the stub entry address. If load_archive_data() yields a non-null
// address that address is returned and no code is generated.
7822 address generate_sha3_implCompress_gpr(StubId stub_id) {
7823 bool multi_block;
7824 switch (stub_id) {
7825 case StubId::stubgen_sha3_implCompress_id:
7826 multi_block = false;
7827 break;
7828 case StubId::stubgen_sha3_implCompressMB_id:
7829 multi_block = true;
7830 break;
7831 default:
7832 ShouldNotReachHere();
7833 }
7834 int entry_count = StubInfo::entry_count(stub_id);
7835 assert(entry_count == 1, "sanity check");
7836 address start = load_archive_data(stub_id);
7837 if (start != nullptr) {
7838 return start;
7839 }
7840 __ align(CodeEntryAlignment);
7841 StubCodeMark mark(this, stub_id);
7842 start = __ pc();
7843
7844 Register buf = c_rarg0;
7845 Register state = c_rarg1;
7846 Register block_size = c_rarg2;
7847 Register ofs = c_rarg3;
7848 Register limit = c_rarg4;
7849
7850 // use r3..r17, r19..r28 to keep a0..a24.
7851 // a0..a24 are respective locals from SHA3.java
7852 Register a0 = r25,
7853 a1 = r26,
7854 a2 = r27,
7855 a3 = r3,
7856 a4 = r4,
7857 a5 = r5,
7858 a6 = r6,
7859 a7 = r7,
7860 a8 = rscratch1, // r8
7861 a9 = rscratch2, // r9
7862 a10 = r10,
7863 a11 = r11,
7864 a12 = r12,
7865 a13 = r13,
7866 a14 = r14,
7867 a15 = r15,
7868 a16 = r16,
7869 a17 = r17,
7870 a18 = r28,
7871 a19 = r19,
7872 a20 = r20,
7873 a21 = r21,
7874 a22 = r22,
7875 a23 = r23,
7876 a24 = r24;
7877
// The temporaries alias the argument registers: whenever tmp0/tmp1/tmp2 are
// written, block_size/buf/state are lost and must be reloaded from the stack.
// tmp3 is r30 (the link register).
// NOTE(review): using r30 as a temp relies on enter()/leave() saving and
// restoring lr - confirm.
7878 Register tmp0 = block_size, tmp1 = buf, tmp2 = state, tmp3 = r30;
7879
7880 Label sha3_loop, rounds24_preloop, loop_body;
7881 Label sha3_512_or_sha3_384, shake128;
7882
// rfp and r18 serve as extra temporaries inside keccak_round_gpr only when
// both are available; otherwise the round spills a4/a9 instead.
7883 bool can_use_r18 = false;
7884 #ifndef R18_RESERVED
7885 can_use_r18 = true;
7886 #endif
7887 bool can_use_fp = !PreserveFramePointer;
7888
7889 __ enter();
7890
// 128-byte save area below the frame:
//   sp+0   block_size
//   sp+8   ofs (32 bit), sp+12 limit (32 bit)   - multi-block only
//   sp+16  buf
//   sp+32 .. sp+111  r19..r28
//   sp+112 state, or r18_tls followed by state at sp+120
7891 // save almost all yet unsaved gpr registers on stack
7892 __ str(block_size, __ pre(sp, -128));
7893 if (multi_block) {
7894 __ stpw(ofs, limit, Address(sp, 8));
7895 }
7896 // 8 bytes at sp+16 will be used to keep buf
7897 __ stp(r19, r20, Address(sp, 32));
7898 __ stp(r21, r22, Address(sp, 48));
7899 __ stp(r23, r24, Address(sp, 64));
7900 __ stp(r25, r26, Address(sp, 80));
7901 __ stp(r27, r28, Address(sp, 96));
7902 if (can_use_r18 && can_use_fp) {
7903 __ stp(r18_tls, state, Address(sp, 112));
7904 } else {
7905 __ str(state, Address(sp, 112));
7906 }
7907
7908 // begin sha3 calculations: loading a0..a24 from state array
7909 __ ldp(a0, a1, state);
7910 __ ldp(a2, a3, Address(state, 16));
7911 __ ldp(a4, a5, Address(state, 32));
7912 __ ldp(a6, a7, Address(state, 48));
7913 __ ldp(a8, a9, Address(state, 64));
7914 __ ldp(a10, a11, Address(state, 80));
7915 __ ldp(a12, a13, Address(state, 96));
7916 __ ldp(a14, a15, Address(state, 112));
7917 __ ldp(a16, a17, Address(state, 128));
7918 __ ldp(a18, a19, Address(state, 144));
7919 __ ldp(a20, a21, Address(state, 160));
7920 __ ldp(a22, a23, Address(state, 176));
7921 __ ldr(a24, Address(state, 192));
7922
7923 __ BIND(sha3_loop);
7924
// Absorb one block: xor block_size bytes of input into the leading lanes.
// The first 7 lanes (56 bytes) are common to every supported block size;
// state (tmp2) is overwritten from here on.
7925 // load input
7926 __ ldp(tmp3, tmp2, __ post(buf, 16));
7927 __ eor(a0, a0, tmp3);
7928 __ eor(a1, a1, tmp2);
7929 __ ldp(tmp3, tmp2, __ post(buf, 16));
7930 __ eor(a2, a2, tmp3);
7931 __ eor(a3, a3, tmp2);
7932 __ ldp(tmp3, tmp2, __ post(buf, 16));
7933 __ eor(a4, a4, tmp3);
7934 __ eor(a5, a5, tmp2);
7935 __ ldr(tmp3, __ post(buf, 8));
7936 __ eor(a6, a6, tmp3);
7937
// bit 7 of block_size is clear only for the two smallest sizes (72, 104)
7938 // block_size == 72, SHA3-512; block_size == 104, SHA3-384
7939 __ tbz(block_size, 7, sha3_512_or_sha3_384);
7940
// block_size >= 136: lanes a7..a16 (136 bytes in total so far)
7941 __ ldp(tmp3, tmp2, __ post(buf, 16));
7942 __ eor(a7, a7, tmp3);
7943 __ eor(a8, a8, tmp2);
7944 __ ldp(tmp3, tmp2, __ post(buf, 16));
7945 __ eor(a9, a9, tmp3);
7946 __ eor(a10, a10, tmp2);
7947 __ ldp(tmp3, tmp2, __ post(buf, 16));
7948 __ eor(a11, a11, tmp3);
7949 __ eor(a12, a12, tmp2);
7950 __ ldp(tmp3, tmp2, __ post(buf, 16));
7951 __ eor(a13, a13, tmp3);
7952 __ eor(a14, a14, tmp2);
7953 __ ldp(tmp3, tmp2, __ post(buf, 16));
7954 __ eor(a15, a15, tmp3);
7955 __ eor(a16, a16, tmp2);
7956
7957 // block_size == 136, bit4 == 0 and bit5 == 0, SHA3-256 or SHAKE256
7958 __ andw(tmp2, block_size, 48);
7959 __ cbzw(tmp2, rounds24_preloop);
7960 __ tbnz(block_size, 5, shake128);
7961 // block_size == 144, bit5 == 0, SHA3-224
7962 __ ldr(tmp3, __ post(buf, 8));
7963 __ eor(a17, a17, tmp3);
7964 __ b(rounds24_preloop);
7965
7966 __ BIND(shake128);
7967 __ ldp(tmp3, tmp2, __ post(buf, 16));
7968 __ eor(a17, a17, tmp3);
7969 __ eor(a18, a18, tmp2);
7970 __ ldp(tmp3, tmp2, __ post(buf, 16));
7971 __ eor(a19, a19, tmp3);
7972 __ eor(a20, a20, tmp2);
7973 __ b(rounds24_preloop); // block_size == 168, SHAKE128
7974
7975 __ BIND(sha3_512_or_sha3_384);
7976 __ ldp(tmp3, tmp2, __ post(buf, 16));
7977 __ eor(a7, a7, tmp3);
7978 __ eor(a8, a8, tmp2);
// bit 5 distinguishes 72 (clear) from 104 (set)
7979 __ tbz(block_size, 5, rounds24_preloop); // SHA3-512
7980
7981 // SHA3-384
7982 __ ldp(tmp3, tmp2, __ post(buf, 16));
7983 __ eor(a9, a9, tmp3);
7984 __ eor(a10, a10, tmp2);
7985 __ ldp(tmp3, tmp2, __ post(buf, 16));
7986 __ eor(a11, a11, tmp3);
7987 __ eor(a12, a12, tmp2);
7988
7989 __ BIND(rounds24_preloop);
// The 24-round counter lives in FP registers v0/v1.
// NOTE(review): presumably because no general-purpose register is free
// while the rounds run - confirm.
7990 __ fmovs(v0, 24.0); // float loop counter,
7991 __ fmovs(v1, 1.0); // exact representation
7992
// buf (tmp1) is clobbered by the rounds, so park the advanced pointer
7993 __ str(buf, Address(sp, 16));
// tmp3 walks the round-constant table, 8 bytes per round
7994 __ lea(tmp3, ExternalAddress((address) _sha3_round_consts));
7995
7996 __ BIND(loop_body);
7997 keccak_round_gpr(can_use_fp, can_use_r18, tmp3,
7998 a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12,
7999 a13, a14, a15, a16, a17, a18, a19, a20, a21, a22, a23, a24,
8000 tmp0, tmp1, tmp2);
8001 __ fsubs(v0, v0, v1);
8002 __ fcmps(v0, 0.0);
8003 __ br(__ NE, loop_body);
8004
8005 if (multi_block) {
// reload the values clobbered by the rounds, advance the offset by one
// block and continue with the next block while offset <= limit
8006 __ ldrw(block_size, sp); // block_size
8007 __ ldpw(tmp2, tmp1, Address(sp, 8)); // offset, limit
8008 __ addw(tmp2, tmp2, block_size);
8009 __ cmpw(tmp2, tmp1);
8010 __ strw(tmp2, Address(sp, 8)); // store offset in case we're jumping
8011 __ ldr(buf, Address(sp, 16)); // restore buf in case we're jumping
8012 __ br(Assembler::LE, sha3_loop);
8013 __ movw(c_rarg0, tmp2); // return offset
8014 }
// reload the state pointer (and r18_tls if it was used as a temporary)
8015 if (can_use_fp && can_use_r18) {
8016 __ ldp(r18_tls, state, Address(sp, 112));
8017 } else {
8018 __ ldr(state, Address(sp, 112));
8019 }
8020 // save calculated sha3 state
8021 __ stp(a0, a1, Address(state));
8022 __ stp(a2, a3, Address(state, 16));
8023 __ stp(a4, a5, Address(state, 32));
8024 __ stp(a6, a7, Address(state, 48));
8025 __ stp(a8, a9, Address(state, 64));
8026 __ stp(a10, a11, Address(state, 80));
8027 __ stp(a12, a13, Address(state, 96));
8028 __ stp(a14, a15, Address(state, 112));
8029 __ stp(a16, a17, Address(state, 128));
8030 __ stp(a18, a19, Address(state, 144));
8031 __ stp(a20, a21, Address(state, 160));
8032 __ stp(a22, a23, Address(state, 176));
8033 __ str(a24, Address(state, 192));
8034
8035 // restore required registers from stack
8036 __ ldp(r19, r20, Address(sp, 32));
8037 __ ldp(r21, r22, Address(sp, 48));
8038 __ ldp(r23, r24, Address(sp, 64));
8039 __ ldp(r25, r26, Address(sp, 80));
8040 __ ldp(r27, r28, Address(sp, 96));
// rfp was clobbered by the rounds in this configuration; rebuild it from sp
// (the save area is 128 bytes) so that leave() unwinds correctly
8041 if (can_use_fp && can_use_r18) {
8042 __ add(rfp, sp, 128); // leave() will copy rfp to sp below
8043 } // else no need to recalculate rfp, since it wasn't changed
8044
8045 __ leave();
8046
8047 __ ret(lr);
8048
8049 // record the stub entry and end
8050 store_archive_data(stub_id, start, __ pc());
8051
8052 return start;
8053 }
8054
8055 /**
8056 * Arguments:
8057 *
8058 * Inputs:
8059 * c_rarg0 - int crc
8060 * c_rarg1 - byte* buf
8061 * c_rarg2 - int length
8062 *
8063 * Output:
8064 * r0 - int crc result
8065 */
// Generates the CRC32 update stub: a frame set up with enter()/leave()
// around MacroAssembler::kernel_crc32. The result is left in crc (c_rarg0).
// Returns the stub entry address. If load_archive_data() yields a non-null
// address that address is returned and no code is generated.
8066 address generate_updateBytesCRC32() {
8067 assert(UseCRC32Intrinsics, "what are we doing here?");
8068 StubId stub_id = StubId::stubgen_updateBytesCRC32_id;
8069 int entry_count = StubInfo::entry_count(stub_id);
8070 assert(entry_count == 1, "sanity check");
8071 address start = load_archive_data(stub_id);
8072 if (start != nullptr) {
8073 return start;
8074 }
8075 __ align(CodeEntryAlignment);
8076 StubCodeMark mark(this, stub_id);
8077
8078 start = __ pc();
8079
8080 const Register crc = c_rarg0; // crc
8081 const Register buf = c_rarg1; // source java byte array address
8082 const Register len = c_rarg2; // length
// c_rarg3..c_rarg7 are not inputs of this stub; they are handed to
// kernel_crc32 together with rscratch1/rscratch2.
// NOTE(review): presumably as table pointers/temporaries - confirm in
// MacroAssembler::kernel_crc32.
8083 const Register table0 = c_rarg3; // crc_table address
8084 const Register table1 = c_rarg4;
8085 const Register table2 = c_rarg5;
8086 const Register table3 = c_rarg6;
8087 const Register tmp3 = c_rarg7;
8088
8089 BLOCK_COMMENT("Entry:");
8090 __ enter(); // required for proper stackwalking of RuntimeStub frame
8091
8092 __ kernel_crc32(crc, buf, len,
8093 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
8094
8095 __ leave(); // required for proper stackwalking of RuntimeStub frame
8096 __ ret(lr);
8097
8098 // record the stub entry and end
8099 store_archive_data(stub_id, start, __ pc());
8100
8101 return start;
8102 }
8103
8104 /**
8105 * Arguments:
8106 *
8107 * Inputs:
8108 * c_rarg0 - int crc
8109 * c_rarg1 - byte* buf
8110 * c_rarg2 - int length
8111 * c_rarg3 - int* table
8112 *
8113 * Output:
8114 * r0 - int crc result
8115 */
// Generates the CRC32C update stub: a frame set up with enter()/leave()
// around MacroAssembler::kernel_crc32c. The result is left in crc (c_rarg0).
// Returns the stub entry address. If load_archive_data() yields a non-null
// address that address is returned and no code is generated.
8116 address generate_updateBytesCRC32C() {
8117 assert(UseCRC32CIntrinsics, "what are we doing here?");
8118 StubId stub_id = StubId::stubgen_updateBytesCRC32C_id;
8119 int entry_count = StubInfo::entry_count(stub_id);
8120 assert(entry_count == 1, "sanity check");
8121 address start = load_archive_data(stub_id);
8122 if (start != nullptr) {
8123 return start;
8124 }
8125 __ align(CodeEntryAlignment);
8126 StubCodeMark mark(this, stub_id);
8127
8128 start = __ pc();
8129
8130 const Register crc = c_rarg0; // crc
8131 const Register buf = c_rarg1; // source java byte array address
8132 const Register len = c_rarg2; // length
// table0..table3 and tmp3 are handed to kernel_crc32c together with
// rscratch1/rscratch2.
// NOTE(review): how kernel_crc32c uses them is not visible here - confirm.
8133 const Register table0 = c_rarg3; // crc_table address
8134 const Register table1 = c_rarg4;
8135 const Register table2 = c_rarg5;
8136 const Register table3 = c_rarg6;
8137 const Register tmp3 = c_rarg7;
8138
8139 BLOCK_COMMENT("Entry:");
8140 __ enter(); // required for proper stackwalking of RuntimeStub frame
8141
8142 __ kernel_crc32c(crc, buf, len,
8143 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
8144
8145 __ leave(); // required for proper stackwalking of RuntimeStub frame
8146 __ ret(lr);
8147
8148 // record the stub entry and end
8149 store_archive_data(stub_id, start, __ pc());
8150
8151 return start;
8152 }
8153
8154 /***
8155 * Arguments:
8156 *
8157 * Inputs:
8158 * c_rarg0 - int adler
8159 * c_rarg1 - byte* buff
8160 * c_rarg2 - int len
8161 *
8162 * Output:
8163 * c_rarg0 - int adler result
8164 */
// Generates the Adler-32 update stub.
// The incoming checksum is split into s1 (low 16 bits) and s2 (high 16
// bits). Input is consumed in chunks of NMAX bytes (16 bytes per call to
// generate_updateBytesAdler32_accum), then in 16-byte steps, then byte by
// byte; s1 and s2 are reduced modulo BASE after each phase and recombined
// into c_rarg0 as s1 | (s2 << 16).
// The reduction folds the upper bits with hi * 15 + lo, where hi = x >> 16
// and lo = x & 0xffff (valid because 2^16 = BASE + 15), and finishes with
// one conditional subtraction of BASE.
// The generated code contains no enter()/leave() pair.
// Returns the stub entry address. If load_archive_data() yields a non-null
// address that address is returned and no code is generated.
8165 address generate_updateBytesAdler32() {
8166 StubId stub_id = StubId::stubgen_updateBytesAdler32_id;
8167 int entry_count = StubInfo::entry_count(stub_id);
8168 assert(entry_count == 1, "sanity check");
8169 address start = load_archive_data(stub_id);
8170 if (start != nullptr) {
8171 return start;
8172 }
8173 __ align(CodeEntryAlignment);
8174 StubCodeMark mark(this, stub_id);
8175 start = __ pc();
8176
8177 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
8178
8179 // Aliases
// adler and s1 share c_rarg0: s2 must be extracted before s1 is narrowed
8180 Register adler = c_rarg0;
8181 Register s1 = c_rarg0;
8182 Register s2 = c_rarg3;
8183 Register buff = c_rarg1;
8184 Register len = c_rarg2;
8185 Register nmax = r4;
8186 Register base = r5;
8187 Register count = r6;
8188 Register temp0 = rscratch1;
8189 Register temp1 = rscratch2;
8190 FloatRegister vbytes = v0;
8191 FloatRegister vs1acc = v1;
8192 FloatRegister vs2acc = v2;
8193 FloatRegister vtable = v3;
8194
8195 // Max number of bytes we can process before having to take the mod
8196 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
8197 uint64_t BASE = 0xfff1;
8198 uint64_t NMAX = 0x15B0;
8199
8200 __ mov(base, BASE);
8201 __ mov(nmax, NMAX);
8202
8203 // Load accumulation coefficients for the upper 16 bits
8204 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table));
8205 __ ld1(vtable, __ T16B, Address(temp0));
8206
8207 // s1 is initialized to the lower 16 bits of adler
8208 // s2 is initialized to the upper 16 bits of adler
8209 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
8210 __ uxth(s1, adler); // s1 = (adler & 0xffff)
8211
8212 // The pipelined loop needs at least 16 elements for 1 iteration
8213 // It does check this, but it is more effective to skip to the cleanup loop
8214 __ cmp(len, (u1)16);
8215 __ br(Assembler::HS, L_nmax);
8216 __ cbz(len, L_combine);
8217
// 1..15 bytes: plain byte loop, runs until len reaches zero
8218 __ bind(L_simple_by1_loop);
8219 __ ldrb(temp0, Address(__ post(buff, 1)));
8220 __ add(s1, s1, temp0);
8221 __ add(s2, s2, s1);
8222 __ subs(len, len, 1);
8223 __ br(Assembler::HI, L_simple_by1_loop);
8224
// a single conditional subtraction is used for s1 on this short path
8225 // s1 = s1 % BASE
8226 __ subs(temp0, s1, base);
8227 __ csel(s1, temp0, s1, Assembler::HS);
8228
// one fold (hi * 15 + lo) followed by a conditional subtraction
8229 // s2 = s2 % BASE
8230 __ lsr(temp0, s2, 16);
8231 __ lsl(temp1, temp0, 4);
8232 __ sub(temp1, temp1, temp0);
8233 __ add(s2, temp1, s2, ext::uxth);
8234
8235 __ subs(temp0, s2, base);
8236 __ csel(s2, temp0, s2, Assembler::HS);
8237
8238 __ b(L_combine);
8239
// len >= 16. len is biased by -nmax from here on; count = nmax - 16 is the
// countdown for one NMAX-sized chunk. A borrow means fewer than nmax bytes
// remain, so the chunk loop is skipped.
8240 __ bind(L_nmax);
8241 __ subs(len, len, nmax);
8242 __ sub(count, nmax, 16);
8243 __ br(Assembler::LO, L_by16);
8244
// one full NMAX chunk, 16 bytes per iteration, until count drops below zero
8245 __ bind(L_nmax_loop);
8246
8247 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
8248 vbytes, vs1acc, vs2acc, vtable);
8249
8250 __ subs(count, count, 16);
8251 __ br(Assembler::HS, L_nmax_loop);
8252
// two folds (hi * 15 + lo) and a final conditional subtraction
8253 // s1 = s1 % BASE
8254 __ lsr(temp0, s1, 16);
8255 __ lsl(temp1, temp0, 4);
8256 __ sub(temp1, temp1, temp0);
8257 __ add(temp1, temp1, s1, ext::uxth);
8258
8259 __ lsr(temp0, temp1, 16);
8260 __ lsl(s1, temp0, 4);
8261 __ sub(s1, s1, temp0);
8262 __ add(s1, s1, temp1, ext:: uxth);
8263
8264 __ subs(temp0, s1, base);
8265 __ csel(s1, temp0, s1, Assembler::HS);
8266
8267 // s2 = s2 % BASE
8268 __ lsr(temp0, s2, 16);
8269 __ lsl(temp1, temp0, 4);
8270 __ sub(temp1, temp1, temp0);
8271 __ add(temp1, temp1, s2, ext::uxth);
8272
8273 __ lsr(temp0, temp1, 16);
8274 __ lsl(s2, temp0, 4);
8275 __ sub(s2, s2, temp0);
8276 __ add(s2, s2, temp1, ext:: uxth);
8277
8278 __ subs(temp0, s2, base);
8279 __ csel(s2, temp0, s2, Assembler::HS);
8280
// another full chunk if at least nmax bytes remain
8281 __ subs(len, len, nmax);
8282 __ sub(count, nmax, 16);
8283 __ br(Assembler::HS, L_nmax_loop);
8284
// fewer than nmax bytes remain: adding count (nmax - 16) rebiases len to
// "remaining - 16"; no carry means fewer than 16 bytes are left
8285 __ bind(L_by16);
8286 __ adds(len, len, count);
8287 __ br(Assembler::LO, L_by1);
8288
8289 __ bind(L_by16_loop);
8290
8291 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1,
8292 vbytes, vs1acc, vs2acc, vtable);
8293
8294 __ subs(len, len, 16);
8295 __ br(Assembler::HS, L_by16_loop);
8296
// rebias len to "remaining - 1"; no carry means nothing is left
8297 __ bind(L_by1);
8298 __ adds(len, len, 15);
8299 __ br(Assembler::LO, L_do_mod);
8300
8301 __ bind(L_by1_loop);
8302 __ ldrb(temp0, Address(__ post(buff, 1)));
8303 __ add(s1, temp0, s1);
8304 __ add(s2, s2, s1);
8305 __ subs(len, len, 1);
8306 __ br(Assembler::HS, L_by1_loop);
8307
8308 __ bind(L_do_mod);
8309 // s1 = s1 % BASE
8310 __ lsr(temp0, s1, 16);
8311 __ lsl(temp1, temp0, 4);
8312 __ sub(temp1, temp1, temp0);
8313 __ add(temp1, temp1, s1, ext::uxth);
8314
8315 __ lsr(temp0, temp1, 16);
8316 __ lsl(s1, temp0, 4);
8317 __ sub(s1, s1, temp0);
8318 __ add(s1, s1, temp1, ext:: uxth);
8319
8320 __ subs(temp0, s1, base);
8321 __ csel(s1, temp0, s1, Assembler::HS);
8322
8323 // s2 = s2 % BASE
8324 __ lsr(temp0, s2, 16);
8325 __ lsl(temp1, temp0, 4);
8326 __ sub(temp1, temp1, temp0);
8327 __ add(temp1, temp1, s2, ext::uxth);
8328
8329 __ lsr(temp0, temp1, 16);
8330 __ lsl(s2, temp0, 4);
8331 __ sub(s2, s2, temp0);
8332 __ add(s2, s2, temp1, ext:: uxth);
8333
8334 __ subs(temp0, s2, base);
8335 __ csel(s2, temp0, s2, Assembler::HS);
8336
8337 // Combine lower bits and higher bits
8338 __ bind(L_combine);
8339 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
8340
8341 __ ret(lr);
8342
8343 // record the stub entry and end
8344 store_archive_data(stub_id, start, __ pc());
8345
8346 return start;
8347 }
8348
// Emits the code that folds the next 16 input bytes into the running
// Adler-32 sums s1 and s2 and advances buff by 16.
// vtable must already hold the 16 byte-sized weights; the comments below
// take them to be (16, 15, ..., 1).
// temp0, temp1, vbytes, vs1acc and vs2acc are clobbered.
// No modular reduction is done here; that is left to the caller.
8349 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
8350 Register temp0, Register temp1, FloatRegister vbytes,
8351 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
8352 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
8353 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
8354 // In non-vectorized code, we update s1 and s2 as:
8355 // s1 <- s1 + b1
8356 // s2 <- s2 + s1
8357 // s1 <- s1 + b2
8358 // s2 <- s2 + s1
8359 // ...
8360 // s1 <- s1 + b16
8361 // s2 <- s2 + s1
8362 // Putting above assignments together, we have:
8363 // s1_new = s1 + b1 + b2 + ... + b16
8364 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
8365 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
8366 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
8367 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
8368
// uses the old s1, so this must precede the s1 update below
8369 // s2 = s2 + s1 * 16
8370 __ add(s2, s2, s1, Assembler::LSL, 4);
8371
8372 // vs1acc = b1 + b2 + b3 + ... + b16
8373 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... + (b16 * 1)
// widening byte multiplies into 16-bit lanes, then horizontal sums.
// NOTE(review): presumably the T8B form covers the lower eight bytes and the
// T16B form accumulates the upper eight - confirm in the assembler.
8374 __ umullv(vs2acc, __ T8B, vtable, vbytes);
8375 __ umlalv(vs2acc, __ T16B, vtable, vbytes);
8376 __ uaddlv(vs1acc, __ T16B, vbytes);
8377 __ uaddlv(vs2acc, __ T8H, vs2acc);
8378
8379 // s1 = s1 + vs1acc, s2 = s2 + vs2acc
8380 __ fmovd(temp0, vs1acc);
8381 __ fmovd(temp1, vs2acc);
8382 __ add(s1, s1, temp0);
8383 __ add(s2, s2, temp1);
8384 }
8385
8386 /**
8387 * Arguments:
8388 *
8389 * Input:
8390 * c_rarg0 - x address
8391 * c_rarg1 - x length
8392 * c_rarg2 - y address
8393 * c_rarg3 - y length
8394 * c_rarg4 - z address
8395 */
// Generates the multiplyToLen stub: a frame set up with enter()/leave()
// around MacroAssembler::multiply_to_len, with x, xlen, y, ylen and z taken
// from r0..r4 and r5, r10..r16 passed as temporaries.
// Returns the stub entry address. If load_archive_data() yields a non-null
// address that address is returned and no code is generated.
8396 address generate_multiplyToLen() {
8397 StubId stub_id = StubId::stubgen_multiplyToLen_id;
8398 int entry_count = StubInfo::entry_count(stub_id);
8399 assert(entry_count == 1, "sanity check");
8400 address start = load_archive_data(stub_id);
8401 if (start != nullptr) {
8402 return start;
8403 }
8404 __ align(CodeEntryAlignment);
8405 StubCodeMark mark(this, stub_id);
8406
8407 start = __ pc();
// inputs, in argument-register order
8408 const Register x = r0;
8409 const Register xlen = r1;
8410 const Register y = r2;
8411 const Register ylen = r3;
8412 const Register z = r4;
8413
// temporaries handed to multiply_to_len
8414 const Register tmp0 = r5;
8415 const Register tmp1 = r10;
8416 const Register tmp2 = r11;
8417 const Register tmp3 = r12;
8418 const Register tmp4 = r13;
8419 const Register tmp5 = r14;
8420 const Register tmp6 = r15;
8421 const Register tmp7 = r16;
8422
8423 BLOCK_COMMENT("Entry:");
8424 __ enter(); // required for proper stackwalking of RuntimeStub frame
8425 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
8426 __ leave(); // required for proper stackwalking of RuntimeStub frame
8427 __ ret(lr);
8428
8429 // record the stub entry and end
8430 store_archive_data(stub_id, start, __ pc());
8431
8432 return start;
8433 }
8434
// Generates the squareToLen stub. Squaring is implemented as
// multiply_to_len with both operands set to x: r0 = x, r1 = xlen, r2 = z on
// entry; y (r4) and ylen (r5) are saved, loaded with copies of x and xlen,
// and restored after the multiplication.
// Returns the stub entry address. If load_archive_data() yields a non-null
// address that address is returned and no code is generated.
8435 address generate_squareToLen() {
8436 // squareToLen algorithm for sizes 1..127 described in java code works
8437 // faster than multiply_to_len on some CPUs and slower on others, but
8438 // multiply_to_len shows a bit better overall results
8439 StubId stub_id = StubId::stubgen_squareToLen_id;
8440 int entry_count = StubInfo::entry_count(stub_id);
8441 assert(entry_count == 1, "sanity check");
8442 address start = load_archive_data(stub_id);
8443 if (start != nullptr) {
8444 return start;
8445 }
8446 __ align(CodeEntryAlignment);
8447 StubCodeMark mark(this, stub_id);
8448 start = __ pc();
8449
8450 const Register x = r0;
8451 const Register xlen = r1;
8452 const Register z = r2;
8453 const Register y = r4; // == x
8454 const Register ylen = r5; // == xlen
8455
// temporaries handed to multiply_to_len
8456 const Register tmp0 = r3;
8457 const Register tmp1 = r10;
8458 const Register tmp2 = r11;
8459 const Register tmp3 = r12;
8460 const Register tmp4 = r13;
8461 const Register tmp5 = r14;
8462 const Register tmp6 = r15;
8463 const Register tmp7 = r16;
8464
// r4/r5 are not inputs of this stub; preserve them around their use as the
// second operand
8465 RegSet spilled_regs = RegSet::of(y, ylen);
8466 BLOCK_COMMENT("Entry:");
8467 __ enter();
8468 __ push(spilled_regs, sp);
8469 __ mov(y, x);
8470 __ mov(ylen, xlen);
8471 __ multiply_to_len(x, xlen, y, ylen, z, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
8472 __ pop(spilled_regs, sp);
8473 __ leave();
8474 __ ret(lr);
8475
8476 // record the stub entry and end
8477 store_archive_data(stub_id, start, __ pc());
8478
8479 return start;
8480 }
8481
// Generates the mulAdd stub: a frame set up with enter()/leave() around
// MacroAssembler::mul_add, with out, in, offset, len and k taken from
// r0..r4.
// Returns the stub entry address. If load_archive_data() yields a non-null
// address that address is returned and no code is generated.
8482 address generate_mulAdd() {
8483 StubId stub_id = StubId::stubgen_mulAdd_id;
8484 int entry_count = StubInfo::entry_count(stub_id);
8485 assert(entry_count == 1, "sanity check");
8486 address start = load_archive_data(stub_id);
8487 if (start != nullptr) {
8488 return start;
8489 }
8490 __ align(CodeEntryAlignment);
8491 StubCodeMark mark(this, stub_id);
8492
8493 start = __ pc();
8494
// inputs, in argument-register order
8495 const Register out = r0;
8496 const Register in = r1;
8497 const Register offset = r2;
8498 const Register len = r3;
8499 const Register k = r4;
8500
8501 BLOCK_COMMENT("Entry:");
8502 __ enter();
8503 __ mul_add(out, in, offset, len, k);
8504 __ leave();
8505 __ ret(lr);
8506
8507 // record the stub entry and end
8508 store_archive_data(stub_id, start, __ pc());
8509
8510 return start;
8511 }
8512
8513 // Arguments:
8514 //
8515 // Input:
8516 // c_rarg0 - newArr address
8517 // c_rarg1 - oldArr address
8518 // c_rarg2 - newIdx
8519 // c_rarg3 - shiftCount
8520 // c_rarg4 - numIter
8521 //
// Generates the BigInteger right-shift worker stub. For i in [0, numIter):
//   newArr[newIdx + i] = (oldArr[i + 1] >>> shiftCount)
//                      | (oldArr[i] << (32 - shiftCount))
// on 32-bit words. Words are processed from the highest index downwards:
// four at a time with SIMD, then two at a time, then a scalar tail.
// A numIter below 4 is handled entirely by the scalar ShiftThree/ShiftTwo/
// ShiftOne sequence. No frame is set up; r10..r14, rscratch1/2 and v0..v4
// are clobbered.
// Returns the stub entry address. If load_archive_data() yields a non-null
// address that address is returned and no code is generated.
8522 address generate_bigIntegerRightShift() {
8523 StubId stub_id = StubId::stubgen_bigIntegerRightShiftWorker_id;
8524 int entry_count = StubInfo::entry_count(stub_id);
8525 assert(entry_count == 1, "sanity check");
8526 address start = load_archive_data(stub_id);
8527 if (start != nullptr) {
8528 return start;
8529 }
8530 __ align(CodeEntryAlignment);
8531 StubCodeMark mark(this, stub_id);
8532 start = __ pc();
8533
8534 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
8535
8536 Register newArr = c_rarg0;
8537 Register oldArr = c_rarg1;
8538 Register newIdx = c_rarg2;
8539 Register shiftCount = c_rarg3;
8540 Register numIter = c_rarg4;
// idx shares numIter's register: it counts the words still to be produced
8541 Register idx = numIter;
8542
8543 Register newArrCur = rscratch1;
8544 Register shiftRevCount = rscratch2;
8545 Register oldArrCur = r13;
8546 Register oldArrNext = r14;
8547
8548 FloatRegister oldElem0 = v0;
8549 FloatRegister oldElem1 = v1;
8550 FloatRegister newElem = v2;
8551 FloatRegister shiftVCount = v3;
8552 FloatRegister shiftVRevCount = v4;
8553
8554 __ cbz(idx, Exit);
8555
// newArr now points at newArr[newIdx] (4-byte elements)
8556 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8557
8558 // left shift count
8559 __ movw(shiftRevCount, 32);
8560 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8561
8562 // numIter too small to allow a 4-words SIMD loop, rolling back
8563 __ cmp(numIter, (u1)4);
8564 __ br(Assembler::LT, ShiftThree);
8565
// ushl shifts right when the per-lane count is negative, hence the negation
// of the right-shift count
8566 __ dup(shiftVCount, __ T4S, shiftCount);
8567 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8568 __ negr(shiftVCount, __ T4S, shiftVCount);
8569
8570 __ BIND(ShiftSIMDLoop);
8571
// oldArrNext -> oldArr[idx], oldArrCur -> oldArr[idx + 1],
// newArrCur -> newArr[idx] (after idx has been lowered by 4)
8572 // Calculate the load addresses
8573 __ sub(idx, idx, 4);
8574 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
8575 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
8576 __ add(oldArrCur, oldArrNext, 4);
8577
8578 // Load 4 words and process
8579 __ ld1(oldElem0, __ T4S, Address(oldArrCur));
8580 __ ld1(oldElem1, __ T4S, Address(oldArrNext));
8581 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8582 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8583 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8584 __ st1(newElem, __ T4S, Address(newArrCur));
8585
8586 __ cmp(idx, (u1)4);
8587 __ br(Assembler::LT, ShiftTwoLoop);
8588 __ b(ShiftSIMDLoop);
8589
// 0..3 words left after the SIMD loop: 0 -> done, 1 -> scalar ShiftOne,
// otherwise handle two words with 64-bit vectors and re-check
8590 __ BIND(ShiftTwoLoop);
8591 __ cbz(idx, Exit);
8592 __ cmp(idx, (u1)1);
8593 __ br(Assembler::EQ, ShiftOne);
8594
8595 // Calculate the load addresses
8596 __ sub(idx, idx, 2);
8597 __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2);
8598 __ add(newArrCur, newArr, idx, Assembler::LSL, 2);
8599 __ add(oldArrCur, oldArrNext, 4);
8600
8601 // Load 2 words and process
8602 __ ld1(oldElem0, __ T2S, Address(oldArrCur));
8603 __ ld1(oldElem1, __ T2S, Address(oldArrNext));
8604 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8605 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8606 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8607 __ st1(newElem, __ T2S, Address(newArrCur));
8608 __ b(ShiftTwoLoop);
8609
// Scalar path for numIter in 1..3 (reached only before any SIMD work):
// bit 1 clear -> 1 word; bit 0 clear -> 2 words; otherwise 3 words.
// Each stage produces one output word and falls through to the next.
8610 __ BIND(ShiftThree);
8611 __ tbz(idx, 1, ShiftOne);
8612 __ tbz(idx, 0, ShiftTwo);
8613 __ ldrw(r10, Address(oldArr, 12));
8614 __ ldrw(r11, Address(oldArr, 8));
8615 __ lsrvw(r10, r10, shiftCount);
8616 __ lslvw(r11, r11, shiftRevCount);
8617 __ orrw(r12, r10, r11);
8618 __ strw(r12, Address(newArr, 8));
8619
8620 __ BIND(ShiftTwo);
8621 __ ldrw(r10, Address(oldArr, 8));
8622 __ ldrw(r11, Address(oldArr, 4));
8623 __ lsrvw(r10, r10, shiftCount);
8624 __ lslvw(r11, r11, shiftRevCount);
8625 __ orrw(r12, r10, r11);
8626 __ strw(r12, Address(newArr, 4));
8627
8628 __ BIND(ShiftOne);
8629 __ ldrw(r10, Address(oldArr, 4));
8630 __ ldrw(r11, Address(oldArr));
8631 __ lsrvw(r10, r10, shiftCount);
8632 __ lslvw(r11, r11, shiftRevCount);
8633 __ orrw(r12, r10, r11);
8634 __ strw(r12, Address(newArr));
8635
8636 __ BIND(Exit);
8637 __ ret(lr);
8638
8639 // record the stub entry and end
8640 store_archive_data(stub_id, start, __ pc());
8641
8642 return start;
8643 }
8644
8645 // Arguments:
8646 //
8647 // Input:
8648 // c_rarg0 - newArr address
8649 // c_rarg1 - oldArr address
8650 // c_rarg2 - newIdx
8651 // c_rarg3 - shiftCount
8652 // c_rarg4 - numIter
8653 //
// Generates the BigInteger left-shift worker stub. For i in [0, numIter):
//   newArr[newIdx + i] = (oldArr[i] << shiftCount)
//                      | (oldArr[i + 1] >>> (32 - shiftCount))
// on 32-bit words. Words are processed in ascending order with
// post-incremented pointers: four at a time with SIMD, then two at a time,
// then a scalar tail. A numIter below 4 is handled entirely by the scalar
// ShiftThree/ShiftTwo/ShiftOne sequence. No frame is set up; r10..r12,
// rscratch1/2 and v0..v4 are clobbered, and oldArr, newArr and numIter are
// modified.
// Returns the stub entry address. If load_archive_data() yields a non-null
// address that address is returned and no code is generated.
8654 address generate_bigIntegerLeftShift() {
8655 StubId stub_id = StubId::stubgen_bigIntegerLeftShiftWorker_id;
8656 int entry_count = StubInfo::entry_count(stub_id);
8657 assert(entry_count == 1, "sanity check");
8658 address start = load_archive_data(stub_id);
8659 if (start != nullptr) {
8660 return start;
8661 }
8662 __ align(CodeEntryAlignment);
8663 StubCodeMark mark(this, stub_id);
8664 start = __ pc();
8665
8666 Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
8667
8668 Register newArr = c_rarg0;
8669 Register oldArr = c_rarg1;
8670 Register newIdx = c_rarg2;
8671 Register shiftCount = c_rarg3;
8672 Register numIter = c_rarg4;
8673
8674 Register shiftRevCount = rscratch1;
8675 Register oldArrNext = rscratch2;
8676
8677 FloatRegister oldElem0 = v0;
8678 FloatRegister oldElem1 = v1;
8679 FloatRegister newElem = v2;
8680 FloatRegister shiftVCount = v3;
8681 FloatRegister shiftVRevCount = v4;
8682
8683 __ cbz(numIter, Exit);
8684
// oldArrNext trails oldArr by one word (oldArr[i + 1]); newArr is advanced
// to newArr[newIdx] (4-byte elements)
8685 __ add(oldArrNext, oldArr, 4);
8686 __ add(newArr, newArr, newIdx, Assembler::LSL, 2);
8687
8688 // right shift count
8689 __ movw(shiftRevCount, 32);
8690 __ subw(shiftRevCount, shiftRevCount, shiftCount);
8691
8692 // numIter too small to allow a 4-words SIMD loop, rolling back
8693 __ cmp(numIter, (u1)4);
8694 __ br(Assembler::LT, ShiftThree);
8695
// ushl shifts right when the per-lane count is negative, hence the negation
// of the right-shift count
8696 __ dup(shiftVCount, __ T4S, shiftCount);
8697 __ dup(shiftVRevCount, __ T4S, shiftRevCount);
8698 __ negr(shiftVRevCount, __ T4S, shiftVRevCount);
8699
8700 __ BIND(ShiftSIMDLoop);
8701
8702 // load 4 words and process
8703 __ ld1(oldElem0, __ T4S, __ post(oldArr, 16));
8704 __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16));
8705 __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount);
8706 __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount);
8707 __ orr(newElem, __ T16B, oldElem0, oldElem1);
8708 __ st1(newElem, __ T4S, __ post(newArr, 16));
8709 __ sub(numIter, numIter, 4);
8710
8711 __ cmp(numIter, (u1)4);
8712 __ br(Assembler::LT, ShiftTwoLoop);
8713 __ b(ShiftSIMDLoop);
8714
// 0..3 words left after the SIMD loop: 0 -> done, 1 -> scalar ShiftOne,
// otherwise handle two words with 64-bit vectors and re-check
8715 __ BIND(ShiftTwoLoop);
8716 __ cbz(numIter, Exit);
8717 __ cmp(numIter, (u1)1);
8718 __ br(Assembler::EQ, ShiftOne);
8719
8720 // load 2 words and process
8721 __ ld1(oldElem0, __ T2S, __ post(oldArr, 8));
8722 __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8));
8723 __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount);
8724 __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount);
8725 __ orr(newElem, __ T8B, oldElem0, oldElem1);
8726 __ st1(newElem, __ T2S, __ post(newArr, 8));
8727 __ sub(numIter, numIter, 2);
8728 __ b(ShiftTwoLoop);
8729
// Scalar path for numIter in 1..3 (reached only before any SIMD work).
// The first word is always produced here; numIter itself is not decremented,
// its low two bits select how many further words follow:
// bit 1 clear -> numIter == 1, done; bit 0 clear -> numIter == 2, one more
// word (ShiftOne); otherwise numIter == 3, two more (ShiftTwo, ShiftOne).
8730 __ BIND(ShiftThree);
8731 __ ldrw(r10, __ post(oldArr, 4));
8732 __ ldrw(r11, __ post(oldArrNext, 4));
8733 __ lslvw(r10, r10, shiftCount);
8734 __ lsrvw(r11, r11, shiftRevCount);
8735 __ orrw(r12, r10, r11);
8736 __ strw(r12, __ post(newArr, 4));
8737 __ tbz(numIter, 1, Exit);
8738 __ tbz(numIter, 0, ShiftOne);
8739
8740 __ BIND(ShiftTwo);
8741 __ ldrw(r10, __ post(oldArr, 4));
8742 __ ldrw(r11, __ post(oldArrNext, 4));
8743 __ lslvw(r10, r10, shiftCount);
8744 __ lsrvw(r11, r11, shiftRevCount);
8745 __ orrw(r12, r10, r11);
8746 __ strw(r12, __ post(newArr, 4));
8747
// last word: no pointer update needed
8748 __ BIND(ShiftOne);
8749 __ ldrw(r10, Address(oldArr));
8750 __ ldrw(r11, Address(oldArrNext));
8751 __ lslvw(r10, r10, shiftCount);
8752 __ lsrvw(r11, r11, shiftRevCount);
8753 __ orrw(r12, r10, r11);
8754 __ strw(r12, Address(newArr));
8755
8756 __ BIND(Exit);
8757 __ ret(lr);
8758
8759 // record the stub entry and end
8760 store_archive_data(stub_id, start, __ pc());
8761
8762 return start;
8763 }
8764
8765 address generate_count_positives(address &count_positives_long) {
8766 StubId stub_id = StubId::stubgen_count_positives_id;
8767 GrowableArray<address> entries;
8768 int entry_count = StubInfo::entry_count(stub_id);
8769 // We have an extra entry for count_positives_long.
8770 assert(entry_count == 2, "sanity check");
8771 address start = load_archive_data(stub_id, &entries);
8772 if (start != nullptr) {
8773 assert(entries.length() == 1,
8774 "unexpected extra entry count %d", entries.length());
8775 count_positives_long = entries.at(0);
8776 return start;
8777 }
8778 const u1 large_loop_size = 64;
8779 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
8780 int dcache_line = VM_Version::dcache_line_size();
8781
8782 Register ary1 = r1, len = r2, result = r0;
8783
8784 __ align(CodeEntryAlignment);
8785 StubCodeMark mark(this, stub_id);
8786
8787 address entry = __ pc();
8788
8789 __ enter();
8790 // precondition: a copy of len is already in result
8791 // __ mov(result, len);
8792
8793 Label RET_ADJUST, RET_ADJUST_16, RET_ADJUST_LONG, RET_NO_POP, RET_LEN, ALIGNED, LOOP16, CHECK_16,
8794 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
8795
8796 __ cmp(len, (u1)15);
8797 __ br(Assembler::GT, LEN_OVER_15);
8798 // The only case when execution falls into this code is when pointer is near
8799 // the end of memory page and we have to avoid reading next page
8800 __ add(ary1, ary1, len);
8801 __ subs(len, len, 8);
8802 __ br(Assembler::GT, LEN_OVER_8);
8803 __ ldr(rscratch2, Address(ary1, -8));
8804 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
8805 __ lsrv(rscratch2, rscratch2, rscratch1);
8806 __ tst(rscratch2, UPPER_BIT_MASK);
8807 __ csel(result, zr, result, Assembler::NE);
8808 __ leave();
8809 __ ret(lr);
8810 __ bind(LEN_OVER_8);
8811 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
8812 __ sub(len, len, 8); // no data dep., then sub can be executed while loading
8813 __ tst(rscratch2, UPPER_BIT_MASK);
8814 __ br(Assembler::NE, RET_NO_POP);
8815 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
8816 __ lsrv(rscratch1, rscratch1, rscratch2);
8817 __ tst(rscratch1, UPPER_BIT_MASK);
8818 __ bind(RET_NO_POP);
8819 __ csel(result, zr, result, Assembler::NE);
8820 __ leave();
8821 __ ret(lr);
8822
8823 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
8824 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
8825
8826 count_positives_long = __ pc(); // 2nd entry point
8827 entries.append(count_positives_long);
8828
8829 __ enter();
8830
8831 __ bind(LEN_OVER_15);
8832 __ push(spilled_regs, sp);
8833 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
8834 __ cbz(rscratch2, ALIGNED);
8835 __ ldp(tmp6, tmp1, Address(ary1));
8836 __ mov(tmp5, 16);
8837 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
8838 __ add(ary1, ary1, rscratch1);
8839 __ orr(tmp6, tmp6, tmp1);
8840 __ tst(tmp6, UPPER_BIT_MASK);
8841 __ br(Assembler::NE, RET_ADJUST);
8842 __ sub(len, len, rscratch1);
8843
8844 __ bind(ALIGNED);
8845 __ cmp(len, large_loop_size);
8846 __ br(Assembler::LT, CHECK_16);
8847 // Perform 16-byte load as early return in pre-loop to handle situation
8848 // when initially aligned large array has negative values at starting bytes,
8849 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
8850 // slower. Cases with negative bytes further ahead won't be affected that
8851 // much. In fact, it'll be faster due to early loads, less instructions and
8852 // less branches in LARGE_LOOP.
8853 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
8854 __ sub(len, len, 16);
8855 __ orr(tmp6, tmp6, tmp1);
8856 __ tst(tmp6, UPPER_BIT_MASK);
8857 __ br(Assembler::NE, RET_ADJUST_16);
8858 __ cmp(len, large_loop_size);
8859 __ br(Assembler::LT, CHECK_16);
8860
8861 if (SoftwarePrefetchHintDistance >= 0
8862 && SoftwarePrefetchHintDistance >= dcache_line) {
8863 // initial prefetch
8864 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
8865 }
8866 __ bind(LARGE_LOOP);
8867 if (SoftwarePrefetchHintDistance >= 0) {
8868 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
8869 }
8870 // Issue load instructions first, since it can save few CPU/MEM cycles, also
8871 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
8872 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
8873 // instructions per cycle and have less branches, but this approach disables
8874 // early return, thus, all 64 bytes are loaded and checked every time.
8875 __ ldp(tmp2, tmp3, Address(ary1));
8876 __ ldp(tmp4, tmp5, Address(ary1, 16));
8877 __ ldp(rscratch1, rscratch2, Address(ary1, 32));
8878 __ ldp(tmp6, tmp1, Address(ary1, 48));
8879 __ add(ary1, ary1, large_loop_size);
8880 __ sub(len, len, large_loop_size);
8881 __ orr(tmp2, tmp2, tmp3);
8882 __ orr(tmp4, tmp4, tmp5);
8883 __ orr(rscratch1, rscratch1, rscratch2);
8884 __ orr(tmp6, tmp6, tmp1);
8885 __ orr(tmp2, tmp2, tmp4);
8886 __ orr(rscratch1, rscratch1, tmp6);
8887 __ orr(tmp2, tmp2, rscratch1);
8888 __ tst(tmp2, UPPER_BIT_MASK);
8889 __ br(Assembler::NE, RET_ADJUST_LONG);
8890 __ cmp(len, large_loop_size);
8891 __ br(Assembler::GE, LARGE_LOOP);
8892
8893 __ bind(CHECK_16); // small 16-byte load pre-loop
8894 __ cmp(len, (u1)16);
8895 __ br(Assembler::LT, POST_LOOP16);
8896
8897 __ bind(LOOP16); // small 16-byte load loop
8898 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
8899 __ sub(len, len, 16);
8900 __ orr(tmp2, tmp2, tmp3);
8901 __ tst(tmp2, UPPER_BIT_MASK);
8902 __ br(Assembler::NE, RET_ADJUST_16);
8903 __ cmp(len, (u1)16);
8904 __ br(Assembler::GE, LOOP16); // 16-byte load loop end
8905
8906 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
8907 __ cmp(len, (u1)8);
8908 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
8909 __ ldr(tmp3, Address(__ post(ary1, 8)));
8910 __ tst(tmp3, UPPER_BIT_MASK);
8911 __ br(Assembler::NE, RET_ADJUST);
8912 __ sub(len, len, 8);
8913
8914 __ bind(POST_LOOP16_LOAD_TAIL);
8915 __ cbz(len, RET_LEN); // Can't shift left by 64 when len==0
8916 __ ldr(tmp1, Address(ary1));
8917 __ mov(tmp2, 64);
8918 __ sub(tmp4, tmp2, len, __ LSL, 3);
8919 __ lslv(tmp1, tmp1, tmp4);
8920 __ tst(tmp1, UPPER_BIT_MASK);
8921 __ br(Assembler::NE, RET_ADJUST);
8922 // Fallthrough
8923
8924 __ bind(RET_LEN);
8925 __ pop(spilled_regs, sp);
8926 __ leave();
8927 __ ret(lr);
8928
8929 // difference result - len is the count of guaranteed to be
8930 // positive bytes
8931
8932 __ bind(RET_ADJUST_LONG);
8933 __ add(len, len, (u1)(large_loop_size - 16));
8934 __ bind(RET_ADJUST_16);
8935 __ add(len, len, 16);
8936 __ bind(RET_ADJUST);
8937 __ pop(spilled_regs, sp);
8938 __ leave();
8939 __ sub(result, result, len);
8940 __ ret(lr);
8941
8942 // record the stub entry and end plus the extra entry
8943 store_archive_data(stub_id, entry, __ pc(), &entries);
8944
8945 return entry;
8946 }
8947
8948 void generate_large_array_equals_loop_nonsimd(int loopThreshold,
8949 bool usePrefetch, Label &NOT_EQUAL) {
8950 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
8951 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
8952 tmp7 = r12, tmp8 = r13;
8953 Label LOOP;
8954
8955 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8956 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8957 __ bind(LOOP);
8958 if (usePrefetch) {
8959 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
8960 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
8961 }
8962 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8963 __ eor(tmp1, tmp1, tmp2);
8964 __ eor(tmp3, tmp3, tmp4);
8965 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8966 __ orr(tmp1, tmp1, tmp3);
8967 __ cbnz(tmp1, NOT_EQUAL);
8968 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8969 __ eor(tmp5, tmp5, tmp6);
8970 __ eor(tmp7, tmp7, tmp8);
8971 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8972 __ orr(tmp5, tmp5, tmp7);
8973 __ cbnz(tmp5, NOT_EQUAL);
8974 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
8975 __ eor(tmp1, tmp1, tmp2);
8976 __ eor(tmp3, tmp3, tmp4);
8977 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
8978 __ orr(tmp1, tmp1, tmp3);
8979 __ cbnz(tmp1, NOT_EQUAL);
8980 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
8981 __ eor(tmp5, tmp5, tmp6);
8982 __ sub(cnt1, cnt1, 8 * wordSize);
8983 __ eor(tmp7, tmp7, tmp8);
8984 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
8985 // tmp6 is not used. MacroAssembler::subs is used here (rather than
8986 // cmp) because subs allows an unlimited range of immediate operand.
8987 __ subs(tmp6, cnt1, loopThreshold);
8988 __ orr(tmp5, tmp5, tmp7);
8989 __ cbnz(tmp5, NOT_EQUAL);
8990 __ br(__ GE, LOOP);
8991 // post-loop
8992 __ eor(tmp1, tmp1, tmp2);
8993 __ eor(tmp3, tmp3, tmp4);
8994 __ orr(tmp1, tmp1, tmp3);
8995 __ sub(cnt1, cnt1, 2 * wordSize);
8996 __ cbnz(tmp1, NOT_EQUAL);
8997 }
8998
8999 void generate_large_array_equals_loop_simd(int loopThreshold,
9000 bool usePrefetch, Label &NOT_EQUAL) {
9001 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
9002 tmp2 = rscratch2;
9003 Label LOOP;
9004
9005 __ bind(LOOP);
9006 if (usePrefetch) {
9007 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
9008 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
9009 }
9010 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
9011 __ sub(cnt1, cnt1, 8 * wordSize);
9012 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
9013 __ subs(tmp1, cnt1, loopThreshold);
9014 __ eor(v0, __ T16B, v0, v4);
9015 __ eor(v1, __ T16B, v1, v5);
9016 __ eor(v2, __ T16B, v2, v6);
9017 __ eor(v3, __ T16B, v3, v7);
9018 __ orr(v0, __ T16B, v0, v1);
9019 __ orr(v1, __ T16B, v2, v3);
9020 __ orr(v0, __ T16B, v0, v1);
9021 __ umov(tmp1, v0, __ D, 0);
9022 __ umov(tmp2, v0, __ D, 1);
9023 __ orr(tmp1, tmp1, tmp2);
9024 __ cbnz(tmp1, NOT_EQUAL);
9025 __ br(__ GE, LOOP);
9026 }
9027
9028 // a1 = r1 - array1 address
9029 // a2 = r2 - array2 address
9030 // result = r0 - return value. Already contains "false"
9031 // cnt1 = r10 - amount of elements left to check, reduced by wordSize
9032 // r3-r5 are reserved temporary registers
9033 // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2
9034 address generate_large_array_equals() {
9035 StubId stub_id = StubId::stubgen_large_array_equals_id;
9036 int entry_count = StubInfo::entry_count(stub_id);
9037 assert(entry_count == 1, "sanity check");
9038 address start = load_archive_data(stub_id);
9039 if (start != nullptr) {
9040 return start;
9041 }
9042 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
9043 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
9044 tmp7 = r12, tmp8 = r13;
9045 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
9046 SMALL_LOOP, POST_LOOP;
9047 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
9048 // calculate if at least 32 prefetched bytes are used
9049 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
9050 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
9051 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
9052 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
9053 tmp5, tmp6, tmp7, tmp8);
9054
9055 __ align(CodeEntryAlignment);
9056
9057 StubCodeMark mark(this, stub_id);
9058
9059 address entry = __ pc();
9060 __ enter();
9061 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
9062 // also advance pointers to use post-increment instead of pre-increment
9063 __ add(a1, a1, wordSize);
9064 __ add(a2, a2, wordSize);
9065 if (AvoidUnalignedAccesses) {
9066 // both implementations (SIMD/nonSIMD) are using relatively large load
9067 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time)
9068 // on some CPUs in case of address is not at least 16-byte aligned.
9069 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte
9070 // load if needed at least for 1st address and make if 16-byte aligned.
9071 Label ALIGNED16;
9072 __ tbz(a1, 3, ALIGNED16);
9073 __ ldr(tmp1, Address(__ post(a1, wordSize)));
9074 __ ldr(tmp2, Address(__ post(a2, wordSize)));
9075 __ sub(cnt1, cnt1, wordSize);
9076 __ eor(tmp1, tmp1, tmp2);
9077 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
9078 __ bind(ALIGNED16);
9079 }
9080 if (UseSIMDForArrayEquals) {
9081 if (SoftwarePrefetchHintDistance >= 0) {
9082 __ subs(tmp1, cnt1, prefetchLoopThreshold);
9083 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
9084 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
9085 /* prfm = */ true, NOT_EQUAL);
9086 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
9087 __ br(__ LT, TAIL);
9088 }
9089 __ bind(NO_PREFETCH_LARGE_LOOP);
9090 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
9091 /* prfm = */ false, NOT_EQUAL);
9092 } else {
9093 __ push(spilled_regs, sp);
9094 if (SoftwarePrefetchHintDistance >= 0) {
9095 __ subs(tmp1, cnt1, prefetchLoopThreshold);
9096 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
9097 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
9098 /* prfm = */ true, NOT_EQUAL);
9099 __ subs(zr, cnt1, nonPrefetchLoopThreshold);
9100 __ br(__ LT, TAIL);
9101 }
9102 __ bind(NO_PREFETCH_LARGE_LOOP);
9103 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
9104 /* prfm = */ false, NOT_EQUAL);
9105 }
9106 __ bind(TAIL);
9107 __ cbz(cnt1, EQUAL);
9108 __ subs(cnt1, cnt1, wordSize);
9109 __ br(__ LE, POST_LOOP);
9110 __ bind(SMALL_LOOP);
9111 __ ldr(tmp1, Address(__ post(a1, wordSize)));
9112 __ ldr(tmp2, Address(__ post(a2, wordSize)));
9113 __ subs(cnt1, cnt1, wordSize);
9114 __ eor(tmp1, tmp1, tmp2);
9115 __ cbnz(tmp1, NOT_EQUAL);
9116 __ br(__ GT, SMALL_LOOP);
9117 __ bind(POST_LOOP);
9118 __ ldr(tmp1, Address(a1, cnt1));
9119 __ ldr(tmp2, Address(a2, cnt1));
9120 __ eor(tmp1, tmp1, tmp2);
9121 __ cbnz(tmp1, NOT_EQUAL);
9122 __ bind(EQUAL);
9123 __ mov(result, true);
9124 __ bind(NOT_EQUAL);
9125 if (!UseSIMDForArrayEquals) {
9126 __ pop(spilled_regs, sp);
9127 }
9128 __ bind(NOT_EQUAL_NO_POP);
9129 __ leave();
9130 __ ret(lr);
9131
9132 // record the stub entry and end
9133 store_archive_data(stub_id, entry, __ pc());
9134
9135 return entry;
9136 }
9137
9138 // result = r0 - return value. Contains initial hashcode value on entry.
9139 // ary = r1 - array address
9140 // cnt = r2 - elements count
9141 // Clobbers: v0-v13, rscratch1, rscratch2
9142 address generate_large_arrays_hashcode(BasicType eltype) {
9143 StubId stub_id;
9144 switch (eltype) {
9145 case T_BOOLEAN:
9146 stub_id = StubId::stubgen_large_arrays_hashcode_boolean_id;
9147 break;
9148 case T_BYTE:
9149 stub_id = StubId::stubgen_large_arrays_hashcode_byte_id;
9150 break;
9151 case T_CHAR:
9152 stub_id = StubId::stubgen_large_arrays_hashcode_char_id;
9153 break;
9154 case T_SHORT:
9155 stub_id = StubId::stubgen_large_arrays_hashcode_short_id;
9156 break;
9157 case T_INT:
9158 stub_id = StubId::stubgen_large_arrays_hashcode_int_id;
9159 break;
9160 default:
9161 stub_id = StubId::NO_STUBID;
9162 ShouldNotReachHere();
9163 };
9164 int entry_count = StubInfo::entry_count(stub_id);
9165 assert(entry_count == 1, "sanity check");
9166 address start = load_archive_data(stub_id);
9167 if (start != nullptr) {
9168 return start;
9169 }
9170 const Register result = r0, ary = r1, cnt = r2;
9171 const FloatRegister vdata0 = v3, vdata1 = v2, vdata2 = v1, vdata3 = v0;
9172 const FloatRegister vmul0 = v4, vmul1 = v5, vmul2 = v6, vmul3 = v7;
9173 const FloatRegister vpow = v12; // powers of 31: <31^3, ..., 31^0>
9174 const FloatRegister vpowm = v13;
9175
9176 ARRAYS_HASHCODE_REGISTERS;
9177
9178 Label SMALL_LOOP, LARGE_LOOP_PREHEADER, LARGE_LOOP, TAIL, TAIL_SHORTCUT, BR_BASE;
9179
9180 unsigned int vf; // vectorization factor
9181 bool multiply_by_halves;
9182 Assembler::SIMD_Arrangement load_arrangement;
9183 switch (eltype) {
9184 case T_BOOLEAN:
9185 case T_BYTE:
9186 load_arrangement = Assembler::T8B;
9187 multiply_by_halves = true;
9188 vf = 8;
9189 break;
9190 case T_CHAR:
9191 case T_SHORT:
9192 load_arrangement = Assembler::T8H;
9193 multiply_by_halves = true;
9194 vf = 8;
9195 break;
9196 case T_INT:
9197 load_arrangement = Assembler::T4S;
9198 multiply_by_halves = false;
9199 vf = 4;
9200 break;
9201 default:
9202 ShouldNotReachHere();
9203 }
9204
9205 // Unroll factor
9206 const unsigned uf = 4;
9207
9208 // Effective vectorization factor
9209 const unsigned evf = vf * uf;
9210
9211 __ align(CodeEntryAlignment);
9212
9213 StubCodeMark mark(this, stub_id);
9214
9215 address entry = __ pc();
9216 __ enter();
9217
9218 // Put 0-3'th powers of 31 into a single SIMD register together. The register will be used in
9219 // the SMALL and LARGE LOOPS' epilogues. The initialization is hoisted here and the register's
9220 // value shouldn't change throughout both loops.
9221 __ movw(rscratch1, intpow(31U, 3));
9222 __ mov(vpow, Assembler::S, 0, rscratch1);
9223 __ movw(rscratch1, intpow(31U, 2));
9224 __ mov(vpow, Assembler::S, 1, rscratch1);
9225 __ movw(rscratch1, intpow(31U, 1));
9226 __ mov(vpow, Assembler::S, 2, rscratch1);
9227 __ movw(rscratch1, intpow(31U, 0));
9228 __ mov(vpow, Assembler::S, 3, rscratch1);
9229
9230 __ mov(vmul0, Assembler::T16B, 0);
9231 __ mov(vmul0, Assembler::S, 3, result);
9232
9233 __ andr(rscratch2, cnt, (uf - 1) * vf);
9234 __ cbz(rscratch2, LARGE_LOOP_PREHEADER);
9235
9236 __ movw(rscratch1, intpow(31U, multiply_by_halves ? vf / 2 : vf));
9237 __ mov(vpowm, Assembler::S, 0, rscratch1);
9238
9239 // SMALL LOOP
9240 __ bind(SMALL_LOOP);
9241
9242 __ ld1(vdata0, load_arrangement, Address(__ post(ary, vf * type2aelembytes(eltype))));
9243 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
9244 __ subsw(rscratch2, rscratch2, vf);
9245
9246 if (load_arrangement == Assembler::T8B) {
9247 // Extend 8B to 8H to be able to use vector multiply
9248 // instructions
9249 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
9250 if (is_signed_subword_type(eltype)) {
9251 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9252 } else {
9253 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9254 }
9255 }
9256
9257 switch (load_arrangement) {
9258 case Assembler::T4S:
9259 __ addv(vmul0, load_arrangement, vmul0, vdata0);
9260 break;
9261 case Assembler::T8B:
9262 case Assembler::T8H:
9263 assert(is_subword_type(eltype), "subword type expected");
9264 if (is_signed_subword_type(eltype)) {
9265 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9266 } else {
9267 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9268 }
9269 break;
9270 default:
9271 __ should_not_reach_here();
9272 }
9273
9274 // Process the upper half of a vector
9275 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
9276 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
9277 if (is_signed_subword_type(eltype)) {
9278 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9279 } else {
9280 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9281 }
9282 }
9283
9284 __ br(Assembler::HI, SMALL_LOOP);
9285
9286 // SMALL LOOP'S EPILOQUE
9287 __ lsr(rscratch2, cnt, exact_log2(evf));
9288 __ cbnz(rscratch2, LARGE_LOOP_PREHEADER);
9289
9290 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
9291 __ addv(vmul0, Assembler::T4S, vmul0);
9292 __ umov(result, vmul0, Assembler::S, 0);
9293
9294 // TAIL
9295 __ bind(TAIL);
9296
9297 // The andr performs cnt % vf. The subtract shifted by 3 offsets past vf - 1 - (cnt % vf) pairs
9298 // of load + madd insns i.e. it only executes cnt % vf load + madd pairs.
9299 assert(is_power_of_2(vf), "can't use this value to calculate the jump target PC");
9300 __ andr(rscratch2, cnt, vf - 1);
9301 __ bind(TAIL_SHORTCUT);
9302 __ adr(rscratch1, BR_BASE);
9303 // For Cortex-A53 offset is 4 because 2 nops are generated.
9304 __ sub(rscratch1, rscratch1, rscratch2, ext::uxtw, VM_Version::supports_a53mac() ? 4 : 3);
9305 __ movw(rscratch2, 0x1f);
9306 __ br(rscratch1);
9307
9308 for (size_t i = 0; i < vf - 1; ++i) {
9309 __ load(rscratch1, Address(__ post(ary, type2aelembytes(eltype))),
9310 eltype);
9311 __ maddw(result, result, rscratch2, rscratch1);
9312 // maddw generates an extra nop for Cortex-A53 (see maddw definition in macroAssembler).
9313 // Generate 2nd nop to have 4 instructions per iteration.
9314 if (VM_Version::supports_a53mac()) {
9315 __ nop();
9316 }
9317 }
9318 __ bind(BR_BASE);
9319
9320 __ leave();
9321 __ ret(lr);
9322
9323 // LARGE LOOP
9324 __ bind(LARGE_LOOP_PREHEADER);
9325
9326 __ lsr(rscratch2, cnt, exact_log2(evf));
9327
9328 if (multiply_by_halves) {
9329 // 31^4 - multiplier between lower and upper parts of a register
9330 __ movw(rscratch1, intpow(31U, vf / 2));
9331 __ mov(vpowm, Assembler::S, 1, rscratch1);
9332 // 31^28 - remainder of the iteraion multiplier, 28 = 32 - 4
9333 __ movw(rscratch1, intpow(31U, evf - vf / 2));
9334 __ mov(vpowm, Assembler::S, 0, rscratch1);
9335 } else {
9336 // 31^16
9337 __ movw(rscratch1, intpow(31U, evf));
9338 __ mov(vpowm, Assembler::S, 0, rscratch1);
9339 }
9340
9341 __ mov(vmul3, Assembler::T16B, 0);
9342 __ mov(vmul2, Assembler::T16B, 0);
9343 __ mov(vmul1, Assembler::T16B, 0);
9344
9345 __ bind(LARGE_LOOP);
9346
9347 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 0);
9348 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 0);
9349 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 0);
9350 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 0);
9351
9352 __ ld1(vdata3, vdata2, vdata1, vdata0, load_arrangement,
9353 Address(__ post(ary, evf * type2aelembytes(eltype))));
9354
9355 if (load_arrangement == Assembler::T8B) {
9356 // Extend 8B to 8H to be able to use vector multiply
9357 // instructions
9358 assert(load_arrangement == Assembler::T8B, "expected to extend 8B to 8H");
9359 if (is_signed_subword_type(eltype)) {
9360 __ sxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
9361 __ sxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
9362 __ sxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
9363 __ sxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9364 } else {
9365 __ uxtl(vdata3, Assembler::T8H, vdata3, load_arrangement);
9366 __ uxtl(vdata2, Assembler::T8H, vdata2, load_arrangement);
9367 __ uxtl(vdata1, Assembler::T8H, vdata1, load_arrangement);
9368 __ uxtl(vdata0, Assembler::T8H, vdata0, load_arrangement);
9369 }
9370 }
9371
9372 switch (load_arrangement) {
9373 case Assembler::T4S:
9374 __ addv(vmul3, load_arrangement, vmul3, vdata3);
9375 __ addv(vmul2, load_arrangement, vmul2, vdata2);
9376 __ addv(vmul1, load_arrangement, vmul1, vdata1);
9377 __ addv(vmul0, load_arrangement, vmul0, vdata0);
9378 break;
9379 case Assembler::T8B:
9380 case Assembler::T8H:
9381 assert(is_subword_type(eltype), "subword type expected");
9382 if (is_signed_subword_type(eltype)) {
9383 __ saddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
9384 __ saddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
9385 __ saddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
9386 __ saddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9387 } else {
9388 __ uaddwv(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T4H);
9389 __ uaddwv(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T4H);
9390 __ uaddwv(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T4H);
9391 __ uaddwv(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T4H);
9392 }
9393 break;
9394 default:
9395 __ should_not_reach_here();
9396 }
9397
9398 // Process the upper half of a vector
9399 if (load_arrangement == Assembler::T8B || load_arrangement == Assembler::T8H) {
9400 __ mulvs(vmul3, Assembler::T4S, vmul3, vpowm, 1);
9401 __ mulvs(vmul2, Assembler::T4S, vmul2, vpowm, 1);
9402 __ mulvs(vmul1, Assembler::T4S, vmul1, vpowm, 1);
9403 __ mulvs(vmul0, Assembler::T4S, vmul0, vpowm, 1);
9404 if (is_signed_subword_type(eltype)) {
9405 __ saddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
9406 __ saddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
9407 __ saddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
9408 __ saddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9409 } else {
9410 __ uaddwv2(vmul3, vmul3, Assembler::T4S, vdata3, Assembler::T8H);
9411 __ uaddwv2(vmul2, vmul2, Assembler::T4S, vdata2, Assembler::T8H);
9412 __ uaddwv2(vmul1, vmul1, Assembler::T4S, vdata1, Assembler::T8H);
9413 __ uaddwv2(vmul0, vmul0, Assembler::T4S, vdata0, Assembler::T8H);
9414 }
9415 }
9416
9417 __ subsw(rscratch2, rscratch2, 1);
9418 __ br(Assembler::HI, LARGE_LOOP);
9419
9420 __ mulv(vmul3, Assembler::T4S, vmul3, vpow);
9421 __ addv(vmul3, Assembler::T4S, vmul3);
9422 __ umov(result, vmul3, Assembler::S, 0);
9423
9424 __ mov(rscratch2, intpow(31U, vf));
9425
9426 __ mulv(vmul2, Assembler::T4S, vmul2, vpow);
9427 __ addv(vmul2, Assembler::T4S, vmul2);
9428 __ umov(rscratch1, vmul2, Assembler::S, 0);
9429 __ maddw(result, result, rscratch2, rscratch1);
9430
9431 __ mulv(vmul1, Assembler::T4S, vmul1, vpow);
9432 __ addv(vmul1, Assembler::T4S, vmul1);
9433 __ umov(rscratch1, vmul1, Assembler::S, 0);
9434 __ maddw(result, result, rscratch2, rscratch1);
9435
9436 __ mulv(vmul0, Assembler::T4S, vmul0, vpow);
9437 __ addv(vmul0, Assembler::T4S, vmul0);
9438 __ umov(rscratch1, vmul0, Assembler::S, 0);
9439 __ maddw(result, result, rscratch2, rscratch1);
9440
9441 __ andr(rscratch2, cnt, vf - 1);
9442 __ cbnz(rscratch2, TAIL_SHORTCUT);
9443
9444 __ leave();
9445 __ ret(lr);
9446
9447 // record the stub entry and end
9448 store_archive_data(stub_id, entry, __ pc());
9449
9450 return entry;
9451 }
9452
9453 address generate_dsin_dcos(bool isCos) {
9454 StubId stub_id = (isCos ? StubId::stubgen_dcos_id : StubId::stubgen_dsin_id);
9455 int entry_count = StubInfo::entry_count(stub_id);
9456 assert(entry_count == 1, "sanity check");
9457 address start = load_archive_data(stub_id);
9458 if (start != nullptr) {
9459 return start;
9460 }
9461 __ align(CodeEntryAlignment);
9462 StubCodeMark mark(this, stub_id);
9463 start = __ pc();
9464 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
9465 (address)StubRoutines::aarch64::_two_over_pi,
9466 (address)StubRoutines::aarch64::_pio2,
9467 (address)StubRoutines::aarch64::_dsin_coef,
9468 (address)StubRoutines::aarch64::_dcos_coef);
9469
9470 // record the stub entry and end
9471 store_archive_data(stub_id, start, __ pc());
9472
9473 return start;
9474 }
9475
9476 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding
9477 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
9478 Label &DIFF2) {
9479 Register cnt1 = r2, tmp2 = r11, tmp3 = r12;
9480 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
9481
9482 __ ldrq(vtmp, Address(__ post(tmp2, 16)));
9483 __ ldr(tmpU, Address(__ post(cnt1, 8)));
9484 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
9485 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
9486
9487 __ fmovd(tmpL, vtmp3);
9488 __ eor(rscratch2, tmp3, tmpL);
9489 __ cbnz(rscratch2, DIFF2);
9490
9491 __ ldr(tmp3, Address(__ post(cnt1, 8)));
9492 __ umov(tmpL, vtmp3, __ D, 1);
9493 __ eor(rscratch2, tmpU, tmpL);
9494 __ cbnz(rscratch2, DIFF1);
9495
9496 __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
9497 __ ldr(tmpU, Address(__ post(cnt1, 8)));
9498 __ fmovd(tmpL, vtmp);
9499 __ eor(rscratch2, tmp3, tmpL);
9500 __ cbnz(rscratch2, DIFF2);
9501
9502 __ ldr(tmp3, Address(__ post(cnt1, 8)));
9503 __ umov(tmpL, vtmp, __ D, 1);
9504 __ eor(rscratch2, tmpU, tmpL);
9505 __ cbnz(rscratch2, DIFF1);
9506 }
9507
9508 // r0 = result
9509 // r1 = str1
9510 // r2 = cnt1
9511 // r3 = str2
9512 // r4 = cnt2
9513 // r10 = tmp1
9514 // r11 = tmp2
9515 address generate_compare_long_string_different_encoding(bool isLU) {
9516 StubId stub_id = (isLU ? StubId::stubgen_compare_long_string_LU_id : StubId::stubgen_compare_long_string_UL_id);
9517 int entry_count = StubInfo::entry_count(stub_id);
9518 assert(entry_count == 1, "sanity check");
9519 address start = load_archive_data(stub_id);
9520 if (start != nullptr) {
9521 return start;
9522 }
9523 __ align(CodeEntryAlignment);
9524 StubCodeMark mark(this, stub_id);
9525 address entry = __ pc();
9526 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
9527 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH,
9528 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
9529 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9530 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
9531 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
9532 RegSet spilled_regs = RegSet::of(tmp3, tmp4);
9533
9534 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2);
9535
9536 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
9537 // cnt2 == amount of characters left to compare
9538 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL))
9539 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
9540 __ add(str1, str1, isLU ? wordSize/2 : wordSize);
9541 __ add(str2, str2, isLU ? wordSize : wordSize/2);
9542 __ fmovd(isLU ? tmp1 : tmp2, vtmp);
9543 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
9544 __ eor(rscratch2, tmp1, tmp2);
9545 __ mov(rscratch1, tmp2);
9546 __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
9547 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
9548 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
9549 __ push(spilled_regs, sp);
9550 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load
9551 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load
9552
9553 __ ldr(tmp3, Address(__ post(cnt1, 8)));
9554
9555 if (SoftwarePrefetchHintDistance >= 0) {
9556 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
9557 __ br(__ LT, NO_PREFETCH);
9558 __ bind(LARGE_LOOP_PREFETCH);
9559 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
9560 __ mov(tmp4, 2);
9561 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
9562 __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
9563 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
9564 __ subs(tmp4, tmp4, 1);
9565 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
9566 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
9567 __ mov(tmp4, 2);
9568 __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
9569 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
9570 __ subs(tmp4, tmp4, 1);
9571 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
9572 __ sub(cnt2, cnt2, 64);
9573 __ subs(rscratch2, cnt2, prefetchLoopExitCondition);
9574 __ br(__ GE, LARGE_LOOP_PREFETCH);
9575 }
9576 __ cbz(cnt2, LOAD_LAST); // no characters left except last load
9577 __ bind(NO_PREFETCH);
9578 __ subs(cnt2, cnt2, 16);
9579 __ br(__ LT, TAIL);
9580 __ align(OptoLoopAlignment);
9581 __ bind(SMALL_LOOP); // smaller loop
9582 __ subs(cnt2, cnt2, 16);
9583 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
9584 __ br(__ GE, SMALL_LOOP);
9585 __ cmn(cnt2, (u1)16);
9586 __ br(__ EQ, LOAD_LAST);
9587 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters)
9588 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string
9589 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string
9590 __ ldr(tmp3, Address(cnt1, -8));
9591 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load
9592 __ b(LOAD_LAST);
9593 __ bind(DIFF2);
9594 __ mov(tmpU, tmp3);
9595 __ bind(DIFF1);
9596 __ pop(spilled_regs, sp);
9597 __ b(CALCULATE_DIFFERENCE);
9598 __ bind(LOAD_LAST);
9599 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU.
9600 // No need to load it again
9601 __ mov(tmpU, tmp3);
9602 __ pop(spilled_regs, sp);
9603
9604 // tmp2 points to the address of the last 4 Latin1 characters right now
9605 __ ldrs(vtmp, Address(tmp2));
9606 __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
9607 __ fmovd(tmpL, vtmp);
9608
9609 __ eor(rscratch2, tmpU, tmpL);
9610 __ cbz(rscratch2, DONE);
9611
9612 // Find the first different characters in the longwords and
9613 // compute their difference.
9614 __ bind(CALCULATE_DIFFERENCE);
9615 __ rev(rscratch2, rscratch2);
9616 __ clz(rscratch2, rscratch2);
9617 __ andr(rscratch2, rscratch2, -16);
9618 __ lsrv(tmp1, tmp1, rscratch2);
9619 __ uxthw(tmp1, tmp1);
9620 __ lsrv(rscratch1, rscratch1, rscratch2);
9621 __ uxthw(rscratch1, rscratch1);
9622 __ subw(result, tmp1, rscratch1);
9623 __ bind(DONE);
9624 __ ret(lr);
9625
9626 // record the stub entry and end
9627 store_archive_data(stub_id, entry, __ pc());
9628
9629 return entry;
9630 }
9631
9632 // r0 = input (float16)
9633 // v0 = result (float)
9634 // v1 = temporary float register
9635 address generate_float16ToFloat() {
9636 StubId stub_id = StubId::stubgen_hf2f_id;
9637 int entry_count = StubInfo::entry_count(stub_id);
9638 assert(entry_count == 1, "sanity check");
9639 address start = load_archive_data(stub_id);
9640 if (start != nullptr) {
9641 return start;
9642 }
9643 __ align(CodeEntryAlignment);
9644 StubCodeMark mark(this, stub_id);
9645 address entry = __ pc();
9646 BLOCK_COMMENT("Entry:");
9647 __ flt16_to_flt(v0, r0, v1);
9648 __ ret(lr);
9649
9650 // record the stub entry and end
9651 store_archive_data(stub_id, entry, __ pc());
9652
9653 return entry;
9654 }
9655
9656 // v0 = input (float)
9657 // r0 = result (float16)
9658 // v1 = temporary float register
9659 address generate_floatToFloat16() {
9660 StubId stub_id = StubId::stubgen_f2hf_id;
9661 int entry_count = StubInfo::entry_count(stub_id);
9662 assert(entry_count == 1, "sanity check");
9663 address start = load_archive_data(stub_id);
9664 if (start != nullptr) {
9665 return start;
9666 }
9667 __ align(CodeEntryAlignment);
9668 StubCodeMark mark(this, stub_id);
9669 address entry = __ pc();
9670 BLOCK_COMMENT("Entry:");
9671 __ flt_to_flt16(r0, v0, v1);
9672 __ ret(lr);
9673
9674 // record the stub entry and end
9675 store_archive_data(stub_id, entry, __ pc());
9676
9677 return entry;
9678 }
9679
9680 address generate_method_entry_barrier() {
9681 StubId stub_id = StubId::stubgen_method_entry_barrier_id;
9682 int entry_count = StubInfo::entry_count(stub_id);
9683 assert(entry_count == 1, "sanity check");
9684 address start = load_archive_data(stub_id);
9685 if (start != nullptr) {
9686 return start;
9687 }
9688 __ align(CodeEntryAlignment);
9689 StubCodeMark mark(this, stub_id);
9690
9691 Label deoptimize_label;
9692
9693 start = __ pc();
9694
9695 BarrierSetAssembler* bs_asm = BarrierSet::barrier_set()->barrier_set_assembler();
9696
9697 if (bs_asm->nmethod_patching_type() == NMethodPatchingType::conc_instruction_and_data_patch) {
9698 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
9699 // We can get here despite the nmethod being good, if we have not
9700 // yet applied our cross modification fence (or data fence).
9701 Address thread_epoch_addr(rthread, in_bytes(bs_nm->thread_disarmed_guard_value_offset()) + 4);
9702 __ lea(rscratch2, ExternalAddress(bs_asm->patching_epoch_addr()));
9703 __ ldrw(rscratch2, rscratch2);
9704 __ strw(rscratch2, thread_epoch_addr);
9705 __ isb();
9706 __ membar(__ LoadLoad);
9707 }
9708
9709 __ set_last_Java_frame(sp, rfp, lr, rscratch1);
9710
9711 __ enter();
9712 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr
9713
9714 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc}
9715
9716 __ push_call_clobbered_registers();
9717
9718 __ mov(c_rarg0, rscratch2);
9719 __ call_VM_leaf
9720 (CAST_FROM_FN_PTR
9721 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1);
9722
9723 __ reset_last_Java_frame(true);
9724
9725 __ mov(rscratch1, r0);
9726
9727 __ pop_call_clobbered_registers();
9728
9729 __ cbnz(rscratch1, deoptimize_label);
9730
9731 __ leave();
9732 __ ret(lr);
9733
9734 __ BIND(deoptimize_label);
9735
9736 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize));
9737 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize));
9738
9739 __ mov(sp, rscratch1);
9740 __ br(rscratch2);
9741
9742 // record the stub entry and end
9743 store_archive_data(stub_id, start, __ pc());
9744
9745 return start;
9746 }
9747
9748 // r0 = result
9749 // r1 = str1
9750 // r2 = cnt1
9751 // r3 = str2
9752 // r4 = cnt2
9753 // r10 = tmp1
9754 // r11 = tmp2
9755 address generate_compare_long_string_same_encoding(bool isLL) {
9756 StubId stub_id = (isLL ? StubId::stubgen_compare_long_string_LL_id : StubId::stubgen_compare_long_string_UU_id);
9757 int entry_count = StubInfo::entry_count(stub_id);
9758 assert(entry_count == 1, "sanity check");
9759 address start = load_archive_data(stub_id);
9760 if (start != nullptr) {
9761 return start;
9762 }
9763 __ align(CodeEntryAlignment);
9764 StubCodeMark mark(this, stub_id);
9765 address entry = __ pc();
9766 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9767 tmp1 = r10, tmp2 = r11, tmp1h = rscratch1, tmp2h = rscratch2;
9768
9769 Label LARGE_LOOP_PREFETCH, LOOP_COMPARE16, DIFF, LESS16, LESS8, CAL_DIFFERENCE, LENGTH_DIFF;
9770
9771 // exit from large loop when less than 64 bytes left to read or we're about
9772 // to prefetch memory behind array border
9773 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
9774
9775 // before jumping to stub, pre-load 8 bytes already, so do comparison directly
9776 __ eor(rscratch2, tmp1, tmp2);
9777 __ cbnz(rscratch2, CAL_DIFFERENCE);
9778
9779 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
9780 // update pointers, because of previous read
9781 __ add(str1, str1, wordSize);
9782 __ add(str2, str2, wordSize);
9783 if (SoftwarePrefetchHintDistance >= 0) {
9784 __ align(OptoLoopAlignment);
9785 __ bind(LARGE_LOOP_PREFETCH);
9786 __ prfm(Address(str1, SoftwarePrefetchHintDistance));
9787 __ prfm(Address(str2, SoftwarePrefetchHintDistance));
9788
9789 for (int i = 0; i < 4; i++) {
9790 __ ldp(tmp1, tmp1h, Address(str1, i * 16));
9791 __ ldp(tmp2, tmp2h, Address(str2, i * 16));
9792 __ cmp(tmp1, tmp2);
9793 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9794 __ br(Assembler::NE, DIFF);
9795 }
9796 __ sub(cnt2, cnt2, isLL ? 64 : 32);
9797 __ add(str1, str1, 64);
9798 __ add(str2, str2, 64);
9799 __ subs(rscratch2, cnt2, largeLoopExitCondition);
9800 __ br(Assembler::GE, LARGE_LOOP_PREFETCH);
9801 __ cbz(cnt2, LENGTH_DIFF); // no more chars left?
9802 }
9803
9804 __ subs(rscratch1, cnt2, isLL ? 16 : 8);
9805 __ br(Assembler::LE, LESS16);
9806 __ align(OptoLoopAlignment);
9807 __ bind(LOOP_COMPARE16);
9808 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9809 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9810 __ cmp(tmp1, tmp2);
9811 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9812 __ br(Assembler::NE, DIFF);
9813 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9814 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9815 __ br(Assembler::LT, LESS16);
9816
9817 __ ldp(tmp1, tmp1h, Address(__ post(str1, 16)));
9818 __ ldp(tmp2, tmp2h, Address(__ post(str2, 16)));
9819 __ cmp(tmp1, tmp2);
9820 __ ccmp(tmp1h, tmp2h, 0, Assembler::EQ);
9821 __ br(Assembler::NE, DIFF);
9822 __ sub(cnt2, cnt2, isLL ? 16 : 8);
9823 __ subs(rscratch2, cnt2, isLL ? 16 : 8);
9824 __ br(Assembler::GE, LOOP_COMPARE16);
9825 __ cbz(cnt2, LENGTH_DIFF);
9826
9827 __ bind(LESS16);
9828 // each 8 compare
9829 __ subs(cnt2, cnt2, isLL ? 8 : 4);
9830 __ br(Assembler::LE, LESS8);
9831 __ ldr(tmp1, Address(__ post(str1, 8)));
9832 __ ldr(tmp2, Address(__ post(str2, 8)));
9833 __ eor(rscratch2, tmp1, tmp2);
9834 __ cbnz(rscratch2, CAL_DIFFERENCE);
9835 __ sub(cnt2, cnt2, isLL ? 8 : 4);
9836
9837 __ bind(LESS8); // directly load last 8 bytes
9838 if (!isLL) {
9839 __ add(cnt2, cnt2, cnt2);
9840 }
9841 __ ldr(tmp1, Address(str1, cnt2));
9842 __ ldr(tmp2, Address(str2, cnt2));
9843 __ eor(rscratch2, tmp1, tmp2);
9844 __ cbz(rscratch2, LENGTH_DIFF);
9845 __ b(CAL_DIFFERENCE);
9846
9847 __ bind(DIFF);
9848 __ cmp(tmp1, tmp2);
9849 __ csel(tmp1, tmp1, tmp1h, Assembler::NE);
9850 __ csel(tmp2, tmp2, tmp2h, Assembler::NE);
9851 // reuse rscratch2 register for the result of eor instruction
9852 __ eor(rscratch2, tmp1, tmp2);
9853
9854 __ bind(CAL_DIFFERENCE);
9855 __ rev(rscratch2, rscratch2);
9856 __ clz(rscratch2, rscratch2);
9857 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
9858 __ lsrv(tmp1, tmp1, rscratch2);
9859 __ lsrv(tmp2, tmp2, rscratch2);
9860 if (isLL) {
9861 __ uxtbw(tmp1, tmp1);
9862 __ uxtbw(tmp2, tmp2);
9863 } else {
9864 __ uxthw(tmp1, tmp1);
9865 __ uxthw(tmp2, tmp2);
9866 }
9867 __ subw(result, tmp1, tmp2);
9868
9869 __ bind(LENGTH_DIFF);
9870 __ ret(lr);
9871
9872 // record the stub entry and end
9873 store_archive_data(stub_id, entry, __ pc());
9874
9875 return entry;
9876 }
9877
// Encoding pair selected for a long-string compare stub. In the SVE stub the
// first letter governs how str1 is loaded and the second how str2 is loaded:
// L operands are loaded as bytes, U operands as halfwords.
enum string_compare_mode { LL, LU, UL, UU };
9884
9885 // The following registers are declared in aarch64.ad
9886 // r0 = result
9887 // r1 = str1
9888 // r2 = cnt1
9889 // r3 = str2
9890 // r4 = cnt2
9891 // r10 = tmp1
9892 // r11 = tmp2
9893 // z0 = ztmp1
9894 // z1 = ztmp2
9895 // p0 = pgtmp1
9896 // p1 = pgtmp2
9897 address generate_compare_long_string_sve(string_compare_mode mode) {
9898 StubId stub_id;
9899 switch (mode) {
9900 case LL: stub_id = StubId::stubgen_compare_long_string_LL_id; break;
9901 case LU: stub_id = StubId::stubgen_compare_long_string_LU_id; break;
9902 case UL: stub_id = StubId::stubgen_compare_long_string_UL_id; break;
9903 case UU: stub_id = StubId::stubgen_compare_long_string_UU_id; break;
9904 default: ShouldNotReachHere();
9905 }
9906 int entry_count = StubInfo::entry_count(stub_id);
9907 assert(entry_count == 1, "sanity check");
9908 address start = load_archive_data(stub_id);
9909 if (start != nullptr) {
9910 return start;
9911 }
9912 __ align(CodeEntryAlignment);
9913 StubCodeMark mark(this, stub_id);
9914 address entry = __ pc();
9915 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
9916 tmp1 = r10, tmp2 = r11;
9917
9918 Label LOOP, DONE, MISMATCH;
9919 Register vec_len = tmp1;
9920 Register idx = tmp2;
9921 // The minimum of the string lengths has been stored in cnt2.
9922 Register cnt = cnt2;
9923 FloatRegister ztmp1 = z0, ztmp2 = z1;
9924 PRegister pgtmp1 = p0, pgtmp2 = p1;
9925
9926 #define LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx) \
9927 switch (mode) { \
9928 case LL: \
9929 __ sve_ld1b(ztmp1, __ B, pgtmp1, Address(str1, idx)); \
9930 __ sve_ld1b(ztmp2, __ B, pgtmp1, Address(str2, idx)); \
9931 break; \
9932 case LU: \
9933 __ sve_ld1b(ztmp1, __ H, pgtmp1, Address(str1, idx)); \
9934 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9935 break; \
9936 case UL: \
9937 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9938 __ sve_ld1b(ztmp2, __ H, pgtmp1, Address(str2, idx)); \
9939 break; \
9940 case UU: \
9941 __ sve_ld1h(ztmp1, __ H, pgtmp1, Address(str1, idx, Address::lsl(1))); \
9942 __ sve_ld1h(ztmp2, __ H, pgtmp1, Address(str2, idx, Address::lsl(1))); \
9943 break; \
9944 default: \
9945 ShouldNotReachHere(); \
9946 }
9947
9948 __ mov(idx, 0);
9949 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9950
9951 if (mode == LL) {
9952 __ sve_cntb(vec_len);
9953 } else {
9954 __ sve_cnth(vec_len);
9955 }
9956
9957 __ sub(rscratch1, cnt, vec_len);
9958
9959 __ bind(LOOP);
9960
9961 // main loop
9962 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9963 __ add(idx, idx, vec_len);
9964 // Compare strings.
9965 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9966 __ br(__ NE, MISMATCH);
9967 __ cmp(idx, rscratch1);
9968 __ br(__ LT, LOOP);
9969
9970 // post loop, last iteration
9971 __ sve_whilelt(pgtmp1, mode == LL ? __ B : __ H, idx, cnt);
9972
9973 LOAD_PAIR(ztmp1, ztmp2, pgtmp1, src1, src2, idx);
9974 __ sve_cmp(Assembler::NE, pgtmp2, mode == LL ? __ B : __ H, pgtmp1, ztmp1, ztmp2);
9975 __ br(__ EQ, DONE);
9976
9977 __ bind(MISMATCH);
9978
9979 // Crop the vector to find its location.
9980 __ sve_brkb(pgtmp2, pgtmp1, pgtmp2, false /* isMerge */);
9981 // Extract the first different characters of each string.
9982 __ sve_lasta(rscratch1, mode == LL ? __ B : __ H, pgtmp2, ztmp1);
9983 __ sve_lasta(rscratch2, mode == LL ? __ B : __ H, pgtmp2, ztmp2);
9984
9985 // Compute the difference of the first different characters.
9986 __ sub(result, rscratch1, rscratch2);
9987
9988 __ bind(DONE);
9989 __ ret(lr);
9990 #undef LOAD_PAIR
9991
9992 // record the stub entry and end
9993 store_archive_data(stub_id, entry, __ pc());
9994
9995 return entry;
9996 }
9997
9998 void generate_compare_long_strings() {
9999 if (UseSVE == 0) {
10000 StubRoutines::aarch64::_compare_long_string_LL
10001 = generate_compare_long_string_same_encoding(true);
10002 StubRoutines::aarch64::_compare_long_string_UU
10003 = generate_compare_long_string_same_encoding(false);
10004 StubRoutines::aarch64::_compare_long_string_LU
10005 = generate_compare_long_string_different_encoding(true);
10006 StubRoutines::aarch64::_compare_long_string_UL
10007 = generate_compare_long_string_different_encoding(false);
10008 } else {
10009 StubRoutines::aarch64::_compare_long_string_LL
10010 = generate_compare_long_string_sve(LL);
10011 StubRoutines::aarch64::_compare_long_string_UU
10012 = generate_compare_long_string_sve(UU);
10013 StubRoutines::aarch64::_compare_long_string_LU
10014 = generate_compare_long_string_sve(LU);
10015 StubRoutines::aarch64::_compare_long_string_UL
10016 = generate_compare_long_string_sve(UL);
10017 }
10018 }
10019
10020 // R0 = result
10021 // R1 = str2
10022 // R2 = cnt1
10023 // R3 = str1
10024 // R4 = cnt2
10025 // Clobbers: rscratch1, rscratch2, v0, v1, rflags
10026 //
10027 // This generic linear code use few additional ideas, which makes it faster:
10028 // 1) we can safely keep at least 1st register of pattern(since length >= 8)
10029 // in order to skip initial loading(help in systems with 1 ld pipeline)
10030 // 2) we can use "fast" algorithm of finding single character to search for
10031 // first symbol with less branches(1 branch per each loaded register instead
10032 // of branch for each symbol), so, this is where constants like
10033 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from
10034 // 3) after loading and analyzing 1st register of source string, it can be
10035 // used to search for every 1st character entry, saving few loads in
10036 // comparison with "simplier-but-slower" implementation
10037 // 4) in order to avoid lots of push/pop operations, code below is heavily
10038 // re-using/re-initializing/compressing register values, which makes code
10039 // larger and a bit less readable, however, most of extra operations are
10040 // issued during loads or branches, so, penalty is minimal
10041 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
10042 StubId stub_id;
10043 if (str1_isL) {
10044 if (str2_isL) {
10045 stub_id = StubId::stubgen_string_indexof_linear_ll_id;
10046 } else {
10047 stub_id = StubId::stubgen_string_indexof_linear_ul_id;
10048 }
10049 } else {
10050 if (str2_isL) {
10051 ShouldNotReachHere();
10052 } else {
10053 stub_id = StubId::stubgen_string_indexof_linear_uu_id;
10054 }
10055 }
10056 int entry_count = StubInfo::entry_count(stub_id);
10057 assert(entry_count == 1, "sanity check");
10058 address start = load_archive_data(stub_id);
10059 if (start != nullptr) {
10060 return start;
10061 }
10062 __ align(CodeEntryAlignment);
10063 StubCodeMark mark(this, stub_id);
10064 address entry = __ pc();
10065
10066 int str1_chr_size = str1_isL ? 1 : 2;
10067 int str2_chr_size = str2_isL ? 1 : 2;
10068 int str1_chr_shift = str1_isL ? 0 : 1;
10069 int str2_chr_shift = str2_isL ? 0 : 1;
10070 bool isL = str1_isL && str2_isL;
10071 // parameters
10072 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
10073 // temporary registers
10074 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
10075 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
10076 // redefinitions
10077 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
10078
10079 __ push(spilled_regs, sp);
10080 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
10081 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
10082 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
10083 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
10084 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
10085 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
10086 // Read whole register from str1. It is safe, because length >=8 here
10087 __ ldr(ch1, Address(str1));
10088 // Read whole register from str2. It is safe, because length >=8 here
10089 __ ldr(ch2, Address(str2));
10090 __ sub(cnt2, cnt2, cnt1);
10091 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
10092 if (str1_isL != str2_isL) {
10093 __ eor(v0, __ T16B, v0, v0);
10094 }
10095 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
10096 __ mul(first, first, tmp1);
10097 // check if we have less than 1 register to check
10098 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
10099 if (str1_isL != str2_isL) {
10100 __ fmovd(v1, ch1);
10101 }
10102 __ br(__ LE, L_SMALL);
10103 __ eor(ch2, first, ch2);
10104 if (str1_isL != str2_isL) {
10105 __ zip1(v1, __ T16B, v1, v0);
10106 }
10107 __ sub(tmp2, ch2, tmp1);
10108 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10109 __ bics(tmp2, tmp2, ch2);
10110 if (str1_isL != str2_isL) {
10111 __ fmovd(ch1, v1);
10112 }
10113 __ br(__ NE, L_HAS_ZERO);
10114 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10115 __ add(result, result, wordSize/str2_chr_size);
10116 __ add(str2, str2, wordSize);
10117 __ br(__ LT, L_POST_LOOP);
10118 __ BIND(L_LOOP);
10119 __ ldr(ch2, Address(str2));
10120 __ eor(ch2, first, ch2);
10121 __ sub(tmp2, ch2, tmp1);
10122 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10123 __ bics(tmp2, tmp2, ch2);
10124 __ br(__ NE, L_HAS_ZERO);
10125 __ BIND(L_LOOP_PROCEED);
10126 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
10127 __ add(str2, str2, wordSize);
10128 __ add(result, result, wordSize/str2_chr_size);
10129 __ br(__ GE, L_LOOP);
10130 __ BIND(L_POST_LOOP);
10131 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
10132 __ br(__ LE, NOMATCH);
10133 __ ldr(ch2, Address(str2));
10134 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10135 __ eor(ch2, first, ch2);
10136 __ sub(tmp2, ch2, tmp1);
10137 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10138 __ mov(tmp4, -1); // all bits set
10139 __ b(L_SMALL_PROCEED);
10140 __ align(OptoLoopAlignment);
10141 __ BIND(L_SMALL);
10142 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
10143 __ eor(ch2, first, ch2);
10144 if (str1_isL != str2_isL) {
10145 __ zip1(v1, __ T16B, v1, v0);
10146 }
10147 __ sub(tmp2, ch2, tmp1);
10148 __ mov(tmp4, -1); // all bits set
10149 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
10150 if (str1_isL != str2_isL) {
10151 __ fmovd(ch1, v1); // move converted 4 symbols
10152 }
10153 __ BIND(L_SMALL_PROCEED);
10154 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits.
10155 __ bic(tmp2, tmp2, ch2);
10156 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
10157 __ rbit(tmp2, tmp2);
10158 __ br(__ EQ, NOMATCH);
10159 __ BIND(L_SMALL_HAS_ZERO_LOOP);
10160 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's
10161 __ cmp(cnt1, u1(wordSize/str2_chr_size));
10162 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
10163 if (str2_isL) { // LL
10164 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10165 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10166 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10167 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10168 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10169 } else {
10170 __ mov(ch2, 0xE); // all bits in byte set except last one
10171 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10172 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10173 __ lslv(tmp2, tmp2, tmp4);
10174 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10175 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10176 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10177 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10178 }
10179 __ cmp(ch1, ch2);
10180 __ mov(tmp4, wordSize/str2_chr_size);
10181 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10182 __ BIND(L_SMALL_CMP_LOOP);
10183 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10184 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10185 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10186 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10187 __ add(tmp4, tmp4, 1);
10188 __ cmp(tmp4, cnt1);
10189 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
10190 __ cmp(first, ch2);
10191 __ br(__ EQ, L_SMALL_CMP_LOOP);
10192 __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
10193 __ cbz(tmp2, NOMATCH); // no more matches. exit
10194 __ clz(tmp4, tmp2);
10195 __ add(result, result, 1); // advance index
10196 __ add(str2, str2, str2_chr_size); // advance pointer
10197 __ b(L_SMALL_HAS_ZERO_LOOP);
10198 __ align(OptoLoopAlignment);
10199 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
10200 __ cmp(first, ch2);
10201 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10202 __ b(DONE);
10203 __ align(OptoLoopAlignment);
10204 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
10205 if (str2_isL) { // LL
10206 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
10207 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
10208 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
10209 __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
10210 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10211 } else {
10212 __ mov(ch2, 0xE); // all bits in byte set except last one
10213 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10214 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10215 __ lslv(tmp2, tmp2, tmp4);
10216 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10217 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10218 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
10219 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10220 }
10221 __ cmp(ch1, ch2);
10222 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
10223 __ b(DONE);
10224 __ align(OptoLoopAlignment);
10225 __ BIND(L_HAS_ZERO);
10226 __ rbit(tmp2, tmp2);
10227 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's
10228 // Now, perform compression of counters(cnt2 and cnt1) into one register.
10229 // It's fine because both counters are 32bit and are not changed in this
10230 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop.
10231 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
10232 __ sub(result, result, 1);
10233 __ BIND(L_HAS_ZERO_LOOP);
10234 __ mov(cnt1, wordSize/str2_chr_size);
10235 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10236 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
10237 if (str2_isL) {
10238 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10239 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10240 __ lslv(tmp2, tmp2, tmp4);
10241 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10242 __ add(tmp4, tmp4, 1);
10243 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10244 __ lsl(tmp2, tmp2, 1);
10245 __ mov(tmp4, wordSize/str2_chr_size);
10246 } else {
10247 __ mov(ch2, 0xE);
10248 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10249 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10250 __ lslv(tmp2, tmp2, tmp4);
10251 __ add(tmp4, tmp4, 1);
10252 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10253 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10254 __ lsl(tmp2, tmp2, 1);
10255 __ mov(tmp4, wordSize/str2_chr_size);
10256 __ sub(str2, str2, str2_chr_size);
10257 }
10258 __ cmp(ch1, ch2);
10259 __ mov(tmp4, wordSize/str2_chr_size);
10260 __ br(__ NE, L_CMP_LOOP_NOMATCH);
10261 __ BIND(L_CMP_LOOP);
10262 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
10263 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
10264 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
10265 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
10266 __ add(tmp4, tmp4, 1);
10267 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
10268 __ br(__ GE, L_CMP_LOOP_LAST_CMP);
10269 __ cmp(cnt1, ch2);
10270 __ br(__ EQ, L_CMP_LOOP);
10271 __ BIND(L_CMP_LOOP_NOMATCH);
10272 // here we're not matched
10273 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
10274 __ clz(tmp4, tmp2);
10275 __ add(str2, str2, str2_chr_size); // advance pointer
10276 __ b(L_HAS_ZERO_LOOP);
10277 __ align(OptoLoopAlignment);
10278 __ BIND(L_CMP_LOOP_LAST_CMP);
10279 __ cmp(cnt1, ch2);
10280 __ br(__ NE, L_CMP_LOOP_NOMATCH);
10281 __ b(DONE);
10282 __ align(OptoLoopAlignment);
10283 __ BIND(L_CMP_LOOP_LAST_CMP2);
10284 if (str2_isL) {
10285 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
10286 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10287 __ lslv(tmp2, tmp2, tmp4);
10288 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10289 __ add(tmp4, tmp4, 1);
10290 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10291 __ lsl(tmp2, tmp2, 1);
10292 } else {
10293 __ mov(ch2, 0xE);
10294 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
10295 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
10296 __ lslv(tmp2, tmp2, tmp4);
10297 __ add(tmp4, tmp4, 1);
10298 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
10299 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
10300 __ lsl(tmp2, tmp2, 1);
10301 __ sub(str2, str2, str2_chr_size);
10302 }
10303 __ cmp(ch1, ch2);
10304 __ br(__ NE, L_CMP_LOOP_NOMATCH);
10305 __ b(DONE);
10306 __ align(OptoLoopAlignment);
10307 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
10308 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until
10309 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP,
10310 // so, result was increased at max by wordSize/str2_chr_size - 1, so,
10311 // respective high bit wasn't changed. L_LOOP_PROCEED will increase
10312 // result by analyzed characters value, so, we can just reset lower bits
10313 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL
10314 // 2) restore cnt1 and cnt2 values from "compressed" cnt2
10315 // 3) advance str2 value to represent next str2 octet. result & 7/3 is
10316 // index of last analyzed substring inside current octet. So, str2 in at
10317 // respective start address. We need to advance it to next octet
10318 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
10319 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
10320 __ bfm(result, zr, 0, 2 - str2_chr_shift);
10321 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
10322 __ movw(cnt2, cnt2);
10323 __ b(L_LOOP_PROCEED);
10324 __ align(OptoLoopAlignment);
10325 __ BIND(NOMATCH);
10326 __ mov(result, -1);
10327 __ BIND(DONE);
10328 __ pop(spilled_regs, sp);
10329 __ ret(lr);
10330
10331 // record the stub entry and end
10332 store_archive_data(stub_id, entry, __ pc());
10333
10334 return entry;
10335 }
10336
10337 void generate_string_indexof_stubs() {
10338 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
10339 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
10340 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
10341 }
10342
10343 void inflate_and_store_2_fp_registers(bool generatePrfm,
10344 FloatRegister src1, FloatRegister src2) {
10345 Register dst = r1;
10346 __ zip1(v1, __ T16B, src1, v0);
10347 __ zip2(v2, __ T16B, src1, v0);
10348 if (generatePrfm) {
10349 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
10350 }
10351 __ zip1(v3, __ T16B, src2, v0);
10352 __ zip2(v4, __ T16B, src2, v0);
10353 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
10354 }
10355
10356 // R0 = src
10357 // R1 = dst
10358 // R2 = len
10359 // R3 = len >> 3
10360 // V0 = 0
10361 // v1 = loaded 8 bytes
10362 // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6
10363 address generate_large_byte_array_inflate() {
10364 StubId stub_id = StubId::stubgen_large_byte_array_inflate_id;
10365 int entry_count = StubInfo::entry_count(stub_id);
10366 assert(entry_count == 1, "sanity check");
10367 address start = load_archive_data(stub_id);
10368 if (start != nullptr) {
10369 return start;
10370 }
10371 __ align(CodeEntryAlignment);
10372 StubCodeMark mark(this, stub_id);
10373 address entry = __ pc();
10374 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
10375 Register src = r0, dst = r1, len = r2, octetCounter = r3;
10376 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4;
10377
10378 // do one more 8-byte read to have address 16-byte aligned in most cases
10379 // also use single store instruction
10380 __ ldrd(v2, __ post(src, 8));
10381 __ sub(octetCounter, octetCounter, 2);
10382 __ zip1(v1, __ T16B, v1, v0);
10383 __ zip1(v2, __ T16B, v2, v0);
10384 __ st1(v1, v2, __ T16B, __ post(dst, 32));
10385 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10386 __ subs(rscratch1, octetCounter, large_loop_threshold);
10387 __ br(__ LE, LOOP_START);
10388 __ b(LOOP_PRFM_START);
10389 __ bind(LOOP_PRFM);
10390 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10391 __ bind(LOOP_PRFM_START);
10392 __ prfm(Address(src, SoftwarePrefetchHintDistance));
10393 __ sub(octetCounter, octetCounter, 8);
10394 __ subs(rscratch1, octetCounter, large_loop_threshold);
10395 inflate_and_store_2_fp_registers(true, v3, v4);
10396 inflate_and_store_2_fp_registers(true, v5, v6);
10397 __ br(__ GT, LOOP_PRFM);
10398 __ cmp(octetCounter, (u1)8);
10399 __ br(__ LT, DONE);
10400 __ bind(LOOP);
10401 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
10402 __ bind(LOOP_START);
10403 __ sub(octetCounter, octetCounter, 8);
10404 __ cmp(octetCounter, (u1)8);
10405 inflate_and_store_2_fp_registers(false, v3, v4);
10406 inflate_and_store_2_fp_registers(false, v5, v6);
10407 __ br(__ GE, LOOP);
10408 __ bind(DONE);
10409 __ ret(lr);
10410
10411 // record the stub entry and end
10412 store_archive_data(stub_id, entry, __ pc());
10413
10414 return entry;
10415 }
10416
10417 /**
10418 * Arguments:
10419 *
10420 * Input:
10421 * c_rarg0 - current state address
10422 * c_rarg1 - H key address
10423 * c_rarg2 - data address
10424 * c_rarg3 - number of blocks
10425 *
10426 * Output:
10427 * Updated state at c_rarg0
10428 */
10429 address generate_ghash_processBlocks_small() {
10430 // Bafflingly, GCM uses little-endian for the byte order, but
10431 // big-endian for the bit order. For example, the polynomial 1 is
10432 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
10433 //
10434 // So, we must either reverse the bytes in each word and do
10435 // everything big-endian or reverse the bits in each byte and do
10436 // it little-endian. On AArch64 it's more idiomatic to reverse
10437 // the bits in each byte (we have an instruction, RBIT, to do
10438 // that) and keep the data in little-endian bit order through the
10439 // calculation, bit-reversing the inputs and outputs.
10440
10441 StubId stub_id = StubId::stubgen_ghash_processBlocks_small_id;
10442 int entry_count = StubInfo::entry_count(stub_id);
10443 assert(entry_count == 1, "sanity check");
10444 address start = load_archive_data(stub_id);
10445 if (start != nullptr) {
10446 return start;
10447 }
10448 __ align(CodeEntryAlignment);
10449 StubCodeMark mark(this, stub_id);
10450 Label polynomial; // local data generated at end of stub
10451 start = __ pc();
10452
10453 Register state = c_rarg0;
10454 Register subkeyH = c_rarg1;
10455 Register data = c_rarg2;
10456 Register blocks = c_rarg3;
10457
10458 FloatRegister vzr = v30;
10459 __ eor(vzr, __ T16B, vzr, vzr); // zero register
10460
10461 __ adr(rscratch1, polynomial);
10462 __ ldrq(v24, rscratch1); // The field polynomial
10463
10464 __ ldrq(v0, Address(state));
10465 __ ldrq(v1, Address(subkeyH));
10466
10467 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
10468 __ rbit(v0, __ T16B, v0);
10469 __ rev64(v1, __ T16B, v1);
10470 __ rbit(v1, __ T16B, v1);
10471
10472 __ ext(v4, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
10473 __ eor(v4, __ T16B, v4, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
10474
10475 {
10476 Label L_ghash_loop;
10477 __ bind(L_ghash_loop);
10478
10479 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
10480 // reversing each byte
10481 __ rbit(v2, __ T16B, v2);
10482 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
10483
10484 // Multiply state in v2 by subkey in v1
10485 __ ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
10486 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v4,
10487 /*temps*/v6, v3, /*reuse/clobber b*/v2);
10488 // Reduce v7:v5 by the field polynomial
10489 __ ghash_reduce(/*result*/v0, /*lo*/v5, /*hi*/v7, /*p*/v24, vzr, /*temp*/v3);
10490
10491 __ sub(blocks, blocks, 1);
10492 __ cbnz(blocks, L_ghash_loop);
10493 }
10494
10495 // The bit-reversed result is at this point in v0
10496 __ rev64(v0, __ T16B, v0);
10497 __ rbit(v0, __ T16B, v0);
10498
10499 __ st1(v0, __ T16B, state);
10500 __ ret(lr);
10501
10502 // bind label and generate local polynomial data
10503 __ align(wordSize * 2);
10504 __ bind(polynomial);
10505 __ emit_int64(0x87); // The low-order bits of the field
10506 // polynomial (i.e. p = z^7+z^2+z+1)
10507 // repeated in the low and high parts of a
10508 // 128-bit vector
10509 __ emit_int64(0x87);
10510
10511 // record the stub entry and end
10512 store_archive_data(stub_id, start, __ pc());
10513
10514 return start;
10515 }
10516
10517 address generate_ghash_processBlocks(address small) {
10518 StubId stub_id = StubId::stubgen_ghash_processBlocks_id;
10519 int entry_count = StubInfo::entry_count(stub_id);
10520 assert(entry_count == 1, "sanity check");
10521 address start = load_archive_data(stub_id);
10522 if (start != nullptr) {
10523 return start;
10524 }
10525 Label polynomial; // local data generated after stub
10526 __ align(CodeEntryAlignment);
10527 StubCodeMark mark(this, stub_id);
10528 start = __ pc();
10529
10530 Register state = c_rarg0;
10531 Register subkeyH = c_rarg1;
10532 Register data = c_rarg2;
10533 Register blocks = c_rarg3;
10534
10535 const int unroll = 4;
10536
10537 __ cmp(blocks, (unsigned char)(unroll * 2));
10538 __ br(__ LT, small);
10539
10540 if (unroll > 1) {
10541 // Save state before entering routine
10542 __ sub(sp, sp, 4 * 16);
10543 __ st1(v12, v13, v14, v15, __ T16B, Address(sp));
10544 __ sub(sp, sp, 4 * 16);
10545 __ st1(v8, v9, v10, v11, __ T16B, Address(sp));
10546 }
10547
10548 __ ghash_processBlocks_wide(polynomial, state, subkeyH, data, blocks, unroll);
10549
10550 if (unroll > 1) {
10551 // And restore state
10552 __ ld1(v8, v9, v10, v11, __ T16B, __ post(sp, 4 * 16));
10553 __ ld1(v12, v13, v14, v15, __ T16B, __ post(sp, 4 * 16));
10554 }
10555
10556 __ cmp(blocks, (unsigned char)0);
10557 __ br(__ GT, small);
10558
10559 __ ret(lr);
10560
10561 // bind label and generate polynomial data
10562 __ align(wordSize * 2);
10563 __ bind(polynomial);
10564 __ emit_int64(0x87); // The low-order bits of the field
10565 // polynomial (i.e. p = z^7+z^2+z+1)
10566 // repeated in the low and high parts of a
10567 // 128-bit vector
10568 __ emit_int64(0x87);
10569
10570 // record the stub entry and end
10571 store_archive_data(stub_id, start, __ pc());
10572
10573 return start;
10574 }
10575
10576 void generate_base64_encode_simdround(Register src, Register dst,
10577 FloatRegister codec, u8 size) {
10578
10579 FloatRegister in0 = v4, in1 = v5, in2 = v6;
10580 FloatRegister out0 = v16, out1 = v17, out2 = v18, out3 = v19;
10581 FloatRegister ind0 = v20, ind1 = v21, ind2 = v22, ind3 = v23;
10582
10583 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
10584
10585 __ ld3(in0, in1, in2, arrangement, __ post(src, 3 * size));
10586
10587 __ ushr(ind0, arrangement, in0, 2);
10588
10589 __ ushr(ind1, arrangement, in1, 2);
10590 __ shl(in0, arrangement, in0, 6);
10591 __ orr(ind1, arrangement, ind1, in0);
10592 __ ushr(ind1, arrangement, ind1, 2);
10593
10594 __ ushr(ind2, arrangement, in2, 4);
10595 __ shl(in1, arrangement, in1, 4);
10596 __ orr(ind2, arrangement, in1, ind2);
10597 __ ushr(ind2, arrangement, ind2, 2);
10598
10599 __ shl(ind3, arrangement, in2, 2);
10600 __ ushr(ind3, arrangement, ind3, 2);
10601
10602 __ tbl(out0, arrangement, codec, 4, ind0);
10603 __ tbl(out1, arrangement, codec, 4, ind1);
10604 __ tbl(out2, arrangement, codec, 4, ind2);
10605 __ tbl(out3, arrangement, codec, 4, ind3);
10606
10607 __ st4(out0, out1, out2, out3, arrangement, __ post(dst, 4 * size));
10608 }
10609
10610 /**
10611 * Arguments:
10612 *
10613 * Input:
10614 * c_rarg0 - src_start
10615 * c_rarg1 - src_offset
10616 * c_rarg2 - src_length
10617 * c_rarg3 - dest_start
10618 * c_rarg4 - dest_offset
10619 * c_rarg5 - isURL
10620 *
10621 */
10622 address generate_base64_encodeBlock() {
10623
10624 StubId stub_id = StubId::stubgen_base64_encodeBlock_id;
10625 int entry_count = StubInfo::entry_count(stub_id);
10626 assert(entry_count == 1, "sanity check");
10627 address start = load_archive_data(stub_id);
10628 if (start != nullptr) {
10629 return start;
10630 }
10631 __ align(CodeEntryAlignment);
10632 StubCodeMark mark(this, stub_id);
10633 start = __ pc();
10634
10635 Register src = c_rarg0; // source array
10636 Register soff = c_rarg1; // source start offset
10637 Register send = c_rarg2; // source end offset
10638 Register dst = c_rarg3; // dest array
10639 Register doff = c_rarg4; // position for writing to dest array
10640 Register isURL = c_rarg5; // Base64 or URL character set
10641
10642 // c_rarg6 and c_rarg7 are free to use as temps
10643 Register codec = c_rarg6;
10644 Register length = c_rarg7;
10645
10646 Label ProcessData, Process48B, Process24B, Process3B, SIMDExit, Exit;
10647
10648 __ add(src, src, soff);
10649 __ add(dst, dst, doff);
10650 __ sub(length, send, soff);
10651
10652 // load the codec base address
10653 __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64));
10654 __ cbz(isURL, ProcessData);
10655 __ lea(codec, ExternalAddress((address) _encodeBlock_toBase64URL));
10656
10657 __ BIND(ProcessData);
10658
10659 // too short to formup a SIMD loop, roll back
10660 __ cmp(length, (u1)24);
10661 __ br(Assembler::LT, Process3B);
10662
10663 __ ld1(v0, v1, v2, v3, __ T16B, Address(codec));
10664
10665 __ BIND(Process48B);
10666 __ cmp(length, (u1)48);
10667 __ br(Assembler::LT, Process24B);
10668 generate_base64_encode_simdround(src, dst, v0, 16);
10669 __ sub(length, length, 48);
10670 __ b(Process48B);
10671
10672 __ BIND(Process24B);
10673 __ cmp(length, (u1)24);
10674 __ br(Assembler::LT, SIMDExit);
10675 generate_base64_encode_simdround(src, dst, v0, 8);
10676 __ sub(length, length, 24);
10677
10678 __ BIND(SIMDExit);
10679 __ cbz(length, Exit);
10680
10681 __ BIND(Process3B);
10682 // 3 src bytes, 24 bits
10683 __ ldrb(r10, __ post(src, 1));
10684 __ ldrb(r11, __ post(src, 1));
10685 __ ldrb(r12, __ post(src, 1));
10686 __ orrw(r11, r11, r10, Assembler::LSL, 8);
10687 __ orrw(r12, r12, r11, Assembler::LSL, 8);
10688 // codec index
10689 __ ubfmw(r15, r12, 18, 23);
10690 __ ubfmw(r14, r12, 12, 17);
10691 __ ubfmw(r13, r12, 6, 11);
10692 __ andw(r12, r12, 63);
10693 // get the code based on the codec
10694 __ ldrb(r15, Address(codec, r15, Address::uxtw(0)));
10695 __ ldrb(r14, Address(codec, r14, Address::uxtw(0)));
10696 __ ldrb(r13, Address(codec, r13, Address::uxtw(0)));
10697 __ ldrb(r12, Address(codec, r12, Address::uxtw(0)));
10698 __ strb(r15, __ post(dst, 1));
10699 __ strb(r14, __ post(dst, 1));
10700 __ strb(r13, __ post(dst, 1));
10701 __ strb(r12, __ post(dst, 1));
10702 __ sub(length, length, 3);
10703 __ cbnz(length, Process3B);
10704
10705 __ BIND(Exit);
10706 __ ret(lr);
10707
10708 // record the stub entry and end
10709 store_archive_data(stub_id, start, __ pc());
10710
10711 return start;
10712 }
10713
10714 void generate_base64_decode_simdround(Register src, Register dst,
10715 FloatRegister codecL, FloatRegister codecH, int size, Label& Exit) {
10716
10717 FloatRegister in0 = v16, in1 = v17, in2 = v18, in3 = v19;
10718 FloatRegister out0 = v20, out1 = v21, out2 = v22;
10719
10720 FloatRegister decL0 = v23, decL1 = v24, decL2 = v25, decL3 = v26;
10721 FloatRegister decH0 = v28, decH1 = v29, decH2 = v30, decH3 = v31;
10722
10723 Label NoIllegalData, ErrorInLowerHalf, StoreLegalData;
10724
10725 Assembler::SIMD_Arrangement arrangement = size == 16 ? __ T16B : __ T8B;
10726
10727 __ ld4(in0, in1, in2, in3, arrangement, __ post(src, 4 * size));
10728
10729 // we need unsigned saturating subtract, to make sure all input values
10730 // in range [0, 63] will have 0U value in the higher half lookup
10731 __ uqsubv(decH0, __ T16B, in0, v27);
10732 __ uqsubv(decH1, __ T16B, in1, v27);
10733 __ uqsubv(decH2, __ T16B, in2, v27);
10734 __ uqsubv(decH3, __ T16B, in3, v27);
10735
10736 // lower half lookup
10737 __ tbl(decL0, arrangement, codecL, 4, in0);
10738 __ tbl(decL1, arrangement, codecL, 4, in1);
10739 __ tbl(decL2, arrangement, codecL, 4, in2);
10740 __ tbl(decL3, arrangement, codecL, 4, in3);
10741
10742 // higher half lookup
10743 __ tbx(decH0, arrangement, codecH, 4, decH0);
10744 __ tbx(decH1, arrangement, codecH, 4, decH1);
10745 __ tbx(decH2, arrangement, codecH, 4, decH2);
10746 __ tbx(decH3, arrangement, codecH, 4, decH3);
10747
10748 // combine lower and higher
10749 __ orr(decL0, arrangement, decL0, decH0);
10750 __ orr(decL1, arrangement, decL1, decH1);
10751 __ orr(decL2, arrangement, decL2, decH2);
10752 __ orr(decL3, arrangement, decL3, decH3);
10753
10754 // check illegal inputs, value larger than 63 (maximum of 6 bits)
10755 __ cm(Assembler::HI, decH0, arrangement, decL0, v27);
10756 __ cm(Assembler::HI, decH1, arrangement, decL1, v27);
10757 __ cm(Assembler::HI, decH2, arrangement, decL2, v27);
10758 __ cm(Assembler::HI, decH3, arrangement, decL3, v27);
10759 __ orr(in0, arrangement, decH0, decH1);
10760 __ orr(in1, arrangement, decH2, decH3);
10761 __ orr(in2, arrangement, in0, in1);
10762 __ umaxv(in3, arrangement, in2);
10763 __ umov(rscratch2, in3, __ B, 0);
10764
10765 // get the data to output
10766 __ shl(out0, arrangement, decL0, 2);
10767 __ ushr(out1, arrangement, decL1, 4);
10768 __ orr(out0, arrangement, out0, out1);
10769 __ shl(out1, arrangement, decL1, 4);
10770 __ ushr(out2, arrangement, decL2, 2);
10771 __ orr(out1, arrangement, out1, out2);
10772 __ shl(out2, arrangement, decL2, 6);
10773 __ orr(out2, arrangement, out2, decL3);
10774
10775 __ cbz(rscratch2, NoIllegalData);
10776
10777 // handle illegal input
10778 __ umov(r10, in2, __ D, 0);
10779 if (size == 16) {
10780 __ cbnz(r10, ErrorInLowerHalf);
10781
10782 // illegal input is in higher half, store the lower half now.
10783 __ st3(out0, out1, out2, __ T8B, __ post(dst, 24));
10784
10785 __ umov(r10, in2, __ D, 1);
10786 __ umov(r11, out0, __ D, 1);
10787 __ umov(r12, out1, __ D, 1);
10788 __ umov(r13, out2, __ D, 1);
10789 __ b(StoreLegalData);
10790
10791 __ BIND(ErrorInLowerHalf);
10792 }
10793 __ umov(r11, out0, __ D, 0);
10794 __ umov(r12, out1, __ D, 0);
10795 __ umov(r13, out2, __ D, 0);
10796
10797 __ BIND(StoreLegalData);
10798 __ tbnz(r10, 5, Exit); // 0xff indicates illegal input
10799 __ strb(r11, __ post(dst, 1));
10800 __ strb(r12, __ post(dst, 1));
10801 __ strb(r13, __ post(dst, 1));
10802 __ lsr(r10, r10, 8);
10803 __ lsr(r11, r11, 8);
10804 __ lsr(r12, r12, 8);
10805 __ lsr(r13, r13, 8);
10806 __ b(StoreLegalData);
10807
10808 __ BIND(NoIllegalData);
10809 __ st3(out0, out1, out2, arrangement, __ post(dst, 3 * size));
10810 }
10811
10812
10813 /**
10814 * Arguments:
10815 *
10816 * Input:
10817 * c_rarg0 - src_start
10818 * c_rarg1 - src_offset
10819 * c_rarg2 - src_length
10820 * c_rarg3 - dest_start
10821 * c_rarg4 - dest_offset
10822 * c_rarg5 - isURL
10823 * c_rarg6 - isMIME
10824 *
10825 */
10826 address generate_base64_decodeBlock() {
10827
10828 // The SIMD part of this Base64 decode intrinsic is based on the algorithm outlined
10829 // on http://0x80.pl/articles/base64-simd-neon.html#encoding-quadwords, in section
10830 // titled "Base64 decoding".
10831
10832 StubId stub_id = StubId::stubgen_base64_decodeBlock_id;
10833 int entry_count = StubInfo::entry_count(stub_id);
10834 assert(entry_count == 1, "sanity check");
10835 address start = load_archive_data(stub_id);
10836 if (start != nullptr) {
10837 return start;
10838 }
10839 __ align(CodeEntryAlignment);
10840 StubCodeMark mark(this, stub_id);
10841 start = __ pc();
10842
10843 Register src = c_rarg0; // source array
10844 Register soff = c_rarg1; // source start offset
10845 Register send = c_rarg2; // source end offset
10846 Register dst = c_rarg3; // dest array
10847 Register doff = c_rarg4; // position for writing to dest array
10848 Register isURL = c_rarg5; // Base64 or URL character set
10849 Register isMIME = c_rarg6; // Decoding MIME block - unused in this implementation
10850
10851 Register length = send; // reuse send as length of source data to process
10852
10853 Register simd_codec = c_rarg6;
10854 Register nosimd_codec = c_rarg7;
10855
10856 Label ProcessData, Process64B, Process32B, Process4B, SIMDEnter, SIMDExit, Exit;
10857
10858 __ enter();
10859
10860 __ add(src, src, soff);
10861 __ add(dst, dst, doff);
10862
10863 __ mov(doff, dst);
10864
10865 __ sub(length, send, soff);
10866 __ bfm(length, zr, 0, 1);
10867
10868 __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForNoSIMD));
10869 __ cbz(isURL, ProcessData);
10870 __ lea(nosimd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForNoSIMD));
10871
10872 __ BIND(ProcessData);
10873 __ mov(rscratch1, length);
10874 __ cmp(length, (u1)144); // 144 = 80 + 64
10875 __ br(Assembler::LT, Process4B);
10876
10877 // In the MIME case, the line length cannot be more than 76
10878 // bytes (see RFC 2045). This is too short a block for SIMD
10879 // to be worthwhile, so we use non-SIMD here.
10880 __ movw(rscratch1, 79);
10881
10882 __ BIND(Process4B);
10883 __ ldrw(r14, __ post(src, 4));
10884 __ ubfxw(r10, r14, 0, 8);
10885 __ ubfxw(r11, r14, 8, 8);
10886 __ ubfxw(r12, r14, 16, 8);
10887 __ ubfxw(r13, r14, 24, 8);
10888 // get the de-code
10889 __ ldrb(r10, Address(nosimd_codec, r10, Address::uxtw(0)));
10890 __ ldrb(r11, Address(nosimd_codec, r11, Address::uxtw(0)));
10891 __ ldrb(r12, Address(nosimd_codec, r12, Address::uxtw(0)));
10892 __ ldrb(r13, Address(nosimd_codec, r13, Address::uxtw(0)));
10893 // error detection, 255u indicates an illegal input
10894 __ orrw(r14, r10, r11);
10895 __ orrw(r15, r12, r13);
10896 __ orrw(r14, r14, r15);
10897 __ tbnz(r14, 7, Exit);
10898 // recover the data
10899 __ lslw(r14, r10, 10);
10900 __ bfiw(r14, r11, 4, 6);
10901 __ bfmw(r14, r12, 2, 5);
10902 __ rev16w(r14, r14);
10903 __ bfiw(r13, r12, 6, 2);
10904 __ strh(r14, __ post(dst, 2));
10905 __ strb(r13, __ post(dst, 1));
10906 // non-simd loop
10907 __ subsw(rscratch1, rscratch1, 4);
10908 __ br(Assembler::GT, Process4B);
10909
10910 // if exiting from PreProcess80B, rscratch1 == -1;
10911 // otherwise, rscratch1 == 0.
10912 __ cbzw(rscratch1, Exit);
10913 __ sub(length, length, 80);
10914
10915 __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64ForSIMD));
10916 __ cbz(isURL, SIMDEnter);
10917 __ lea(simd_codec, ExternalAddress((address) _decodeBlock_fromBase64URLForSIMD));
10918
10919 __ BIND(SIMDEnter);
10920 __ ld1(v0, v1, v2, v3, __ T16B, __ post(simd_codec, 64));
10921 __ ld1(v4, v5, v6, v7, __ T16B, Address(simd_codec));
10922 __ mov(rscratch1, 63);
10923 __ dup(v27, __ T16B, rscratch1);
10924
10925 __ BIND(Process64B);
10926 __ cmp(length, (u1)64);
10927 __ br(Assembler::LT, Process32B);
10928 generate_base64_decode_simdround(src, dst, v0, v4, 16, Exit);
10929 __ sub(length, length, 64);
10930 __ b(Process64B);
10931
10932 __ BIND(Process32B);
10933 __ cmp(length, (u1)32);
10934 __ br(Assembler::LT, SIMDExit);
10935 generate_base64_decode_simdround(src, dst, v0, v4, 8, Exit);
10936 __ sub(length, length, 32);
10937 __ b(Process32B);
10938
10939 __ BIND(SIMDExit);
10940 __ cbz(length, Exit);
10941 __ movw(rscratch1, length);
10942 __ b(Process4B);
10943
10944 __ BIND(Exit);
10945 __ sub(c_rarg0, dst, doff);
10946
10947 __ leave();
10948 __ ret(lr);
10949
10950 // record the stub entry and end
10951 store_archive_data(stub_id, start, __ pc());
10952
10953 return start;
10954 }
10955
10956 // Support for spin waits.
10957 address generate_spin_wait() {
10958 StubId stub_id = StubId::stubgen_spin_wait_id;
10959 int entry_count = StubInfo::entry_count(stub_id);
10960 assert(entry_count == 1, "sanity check");
10961 address start = load_archive_data(stub_id);
10962 if (start != nullptr) {
10963 return start;
10964 }
10965 __ align(CodeEntryAlignment);
10966 StubCodeMark mark(this, stub_id);
10967 start = __ pc();
10968
10969 __ spin_wait();
10970 __ ret(lr);
10971
10972 // record the stub entry and end
10973 store_archive_data(stub_id, start, __ pc());
10974
10975 return start;
10976 }
10977
10978 void generate_lookup_secondary_supers_table_stub() {
10979 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_id;
10980 GrowableArray<address> entries;
10981 int entry_count = StubInfo::entry_count(stub_id);
10982 assert(entry_count == Klass::SECONDARY_SUPERS_TABLE_SIZE, "sanity check");
10983 address start = load_archive_data(stub_id, &entries);
10984 if (start != nullptr) {
10985 assert(entries.length() == Klass::SECONDARY_SUPERS_TABLE_SIZE - 1,
10986 "unexpected extra entry count %d", entries.length());
10987 StubRoutines::_lookup_secondary_supers_table_stubs[0] = start;
10988 for (int slot = 1; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
10989 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = entries.at(slot - 1);
10990 }
10991 return;
10992 }
10993
10994 StubCodeMark mark(this, stub_id);
10995
10996 const Register
10997 r_super_klass = r0,
10998 r_array_base = r1,
10999 r_array_length = r2,
11000 r_array_index = r3,
11001 r_sub_klass = r4,
11002 r_bitmap = rscratch2,
11003 result = r5;
11004 const FloatRegister
11005 vtemp = v0;
11006
11007 for (int slot = 0; slot < Klass::SECONDARY_SUPERS_TABLE_SIZE; slot++) {
11008 address next_entry = __ pc();
11009 StubRoutines::_lookup_secondary_supers_table_stubs[slot] = next_entry;
11010 if (slot == 0) {
11011 start = next_entry;
11012 } else {
11013 entries.append(next_entry);
11014 }
11015 Label L_success;
11016 __ enter();
11017 __ lookup_secondary_supers_table_const(r_sub_klass, r_super_klass,
11018 r_array_base, r_array_length, r_array_index,
11019 vtemp, result, slot,
11020 /*stub_is_near*/true);
11021 __ leave();
11022 __ ret(lr);
11023 }
11024 // record the stub entry and end plus all the auxiliary entries
11025 store_archive_data(stub_id, start, __ pc(), &entries);
11026 }
11027
11028 // Slow path implementation for UseSecondarySupersTable.
11029 address generate_lookup_secondary_supers_table_slow_path_stub() {
11030 StubId stub_id = StubId::stubgen_lookup_secondary_supers_table_slow_path_id;
11031 int entry_count = StubInfo::entry_count(stub_id);
11032 assert(entry_count == 1, "sanity check");
11033 address start = load_archive_data(stub_id);
11034 if (start != nullptr) {
11035 return start;
11036 }
11037 StubCodeMark mark(this, stub_id);
11038 start = __ pc();
11039 const Register
11040 r_super_klass = r0, // argument
11041 r_array_base = r1, // argument
11042 temp1 = r2, // temp
11043 r_array_index = r3, // argument
11044 r_bitmap = rscratch2, // argument
11045 result = r5; // argument
11046
11047 __ lookup_secondary_supers_table_slow_path(r_super_klass, r_array_base, r_array_index, r_bitmap, temp1, result);
11048 __ ret(lr);
11049
11050 // record the stub entry and end
11051 store_archive_data(stub_id, start, __ pc());
11052
11053 return start;
11054 }
11055
11056 #if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)
11057
11058 // ARMv8.1 LSE versions of the atomic stubs used by AtomicAccess::PlatformXX.
11059 //
11060 // If LSE is in use, generate LSE versions of all the stubs. The
11061 // non-LSE versions are in atomic_aarch64.S.
11062
11063 // class AtomicStubMark records the entry point of a stub and the
11064 // stub pointer which will point to it. The stub pointer is set to
11065 // the entry point when ~AtomicStubMark() is called, which must be
11066 // after ICache::invalidate_range. This ensures safe publication of
11067 // the generated code.
11068 class AtomicStubMark {
11069 address _entry_point;
11070 aarch64_atomic_stub_t *_stub;
11071 MacroAssembler *_masm;
11072 public:
11073 AtomicStubMark(MacroAssembler *masm, aarch64_atomic_stub_t *stub) {
11074 _masm = masm;
11075 __ align(32);
11076 _entry_point = __ pc();
11077 _stub = stub;
11078 }
11079 ~AtomicStubMark() {
11080 *_stub = (aarch64_atomic_stub_t)_entry_point;
11081 }
11082 };
11083
11084 // NB: For memory_order_conservative we need a trailing membar after
11085 // LSE atomic operations but not a leading membar.
11086 //
11087 // We don't need a leading membar because a clause in the Arm ARM
11088 // says:
11089 //
11090 // Barrier-ordered-before
11091 //
11092 // Barrier instructions order prior Memory effects before subsequent
11093 // Memory effects generated by the same Observer. A read or a write
11094 // RW1 is Barrier-ordered-before a read or a write RW 2 from the same
11095 // Observer if and only if RW1 appears in program order before RW 2
11096 // and [ ... ] at least one of RW 1 and RW 2 is generated by an atomic
11097 // instruction with both Acquire and Release semantics.
11098 //
11099 // All the atomic instructions {ldaddal, swapal, casal} have Acquire
11100 // and Release semantics, therefore we don't need a leading
11101 // barrier. However, there is no corresponding Barrier-ordered-after
11102 // relationship, therefore we need a trailing membar to prevent a
11103 // later store or load from being reordered with the store in an
11104 // atomic instruction.
11105 //
11106 // This was checked by using the herd7 consistency model simulator
11107 // (http://diy.inria.fr/) with this test case:
11108 //
11109 // AArch64 LseCas
11110 // { 0:X1=x; 0:X2=y; 1:X1=x; 1:X2=y; }
11111 // P0 | P1;
11112 // LDR W4, [X2] | MOV W3, #0;
11113 // DMB LD | MOV W4, #1;
11114 // LDR W3, [X1] | CASAL W3, W4, [X1];
11115 // | DMB ISH;
11116 // | STR W4, [X2];
11117 // exists
11118 // (0:X3=0 /\ 0:X4=1)
11119 //
11120 // If X3 == 0 && X4 == 1, the store to y in P1 has been reordered
11121 // with the store to x in P1. Without the DMB in P1 this may happen.
11122 //
11123 // At the time of writing we don't know of any AArch64 hardware that
11124 // reorders stores in this way, but the Reference Manual permits it.
11125
11126 void gen_cas_entry(Assembler::operand_size size,
11127 atomic_memory_order order) {
11128 Register prev = r3, ptr = c_rarg0, compare_val = c_rarg1,
11129 exchange_val = c_rarg2;
11130 bool acquire, release;
11131 switch (order) {
11132 case memory_order_relaxed:
11133 acquire = false;
11134 release = false;
11135 break;
11136 case memory_order_release:
11137 acquire = false;
11138 release = true;
11139 break;
11140 default:
11141 acquire = true;
11142 release = true;
11143 break;
11144 }
11145 __ mov(prev, compare_val);
11146 __ lse_cas(prev, exchange_val, ptr, size, acquire, release, /*not_pair*/true);
11147 if (order == memory_order_conservative) {
11148 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11149 }
11150 if (size == Assembler::xword) {
11151 __ mov(r0, prev);
11152 } else {
11153 __ movw(r0, prev);
11154 }
11155 __ ret(lr);
11156 }
11157
11158 void gen_ldadd_entry(Assembler::operand_size size, atomic_memory_order order) {
11159 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11160 // If not relaxed, then default to conservative. Relaxed is the only
11161 // case we use enough to be worth specializing.
11162 if (order == memory_order_relaxed) {
11163 __ ldadd(size, incr, prev, addr);
11164 } else {
11165 __ ldaddal(size, incr, prev, addr);
11166 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11167 }
11168 if (size == Assembler::xword) {
11169 __ mov(r0, prev);
11170 } else {
11171 __ movw(r0, prev);
11172 }
11173 __ ret(lr);
11174 }
11175
11176 void gen_swpal_entry(Assembler::operand_size size) {
11177 Register prev = r2, addr = c_rarg0, incr = c_rarg1;
11178 __ swpal(size, incr, prev, addr);
11179 __ membar(Assembler::StoreStore|Assembler::StoreLoad);
11180 if (size == Assembler::xword) {
11181 __ mov(r0, prev);
11182 } else {
11183 __ movw(r0, prev);
11184 }
11185 __ ret(lr);
11186 }
11187
11188 void generate_atomic_entry_points() {
11189 if (! UseLSE) {
11190 return;
11191 }
11192 StubId stub_id = StubId::stubgen_atomic_entry_points_id;
11193 GrowableArray<address> entries;
11194 int entry_count = StubInfo::entry_count(stub_id);
11195 address start = load_archive_data(stub_id, &entries);
11196 if (start != nullptr) {
11197 assert(entries.length() == entry_count - 1,
11198 "unexpected extra entry count %d", entries.length());
11199 aarch64_atomic_fetch_add_4_impl = (aarch64_atomic_stub_t)start;
11200 int idx = 0;
11201 aarch64_atomic_fetch_add_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11202 aarch64_atomic_fetch_add_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11203 aarch64_atomic_fetch_add_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11204 aarch64_atomic_xchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11205 aarch64_atomic_xchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11206 aarch64_atomic_cmpxchg_1_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11207 aarch64_atomic_cmpxchg_4_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11208 aarch64_atomic_cmpxchg_8_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11209 aarch64_atomic_cmpxchg_1_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11210 aarch64_atomic_cmpxchg_4_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11211 aarch64_atomic_cmpxchg_8_relaxed_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11212 aarch64_atomic_cmpxchg_4_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11213 aarch64_atomic_cmpxchg_8_release_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11214 aarch64_atomic_cmpxchg_4_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11215 aarch64_atomic_cmpxchg_8_seq_cst_impl = (aarch64_atomic_stub_t)entries.at(idx++);
11216 assert(idx == entries.length(), "sanity!");
11217 return;
11218 }
11219
11220 __ align(CodeEntryAlignment);
11221 StubCodeMark mark(this, stub_id);
11222 start = __ pc();
11223 address end;
11224 {
11225 // ADD, memory_order_conservative
11226 AtomicStubMark mark_fetch_add_4(_masm, &aarch64_atomic_fetch_add_4_impl);
11227 gen_ldadd_entry(Assembler::word, memory_order_conservative);
11228
11229 AtomicStubMark mark_fetch_add_8(_masm, &aarch64_atomic_fetch_add_8_impl);
11230 gen_ldadd_entry(Assembler::xword, memory_order_conservative);
11231
11232 // ADD, memory_order_relaxed
11233 AtomicStubMark mark_fetch_add_4_relaxed
11234 (_masm, &aarch64_atomic_fetch_add_4_relaxed_impl);
11235 gen_ldadd_entry(MacroAssembler::word, memory_order_relaxed);
11236
11237 AtomicStubMark mark_fetch_add_8_relaxed
11238 (_masm, &aarch64_atomic_fetch_add_8_relaxed_impl);
11239 gen_ldadd_entry(MacroAssembler::xword, memory_order_relaxed);
11240
11241 // XCHG, memory_order_conservative
11242 AtomicStubMark mark_xchg_4(_masm, &aarch64_atomic_xchg_4_impl);
11243 gen_swpal_entry(Assembler::word);
11244
11245 AtomicStubMark mark_xchg_8(_masm, &aarch64_atomic_xchg_8_impl);
11246 gen_swpal_entry(Assembler::xword);
11247
11248 // CAS, memory_order_conservative
11249 AtomicStubMark mark_cmpxchg_1(_masm, &aarch64_atomic_cmpxchg_1_impl);
11250 gen_cas_entry(MacroAssembler::byte, memory_order_conservative);
11251
11252 AtomicStubMark mark_cmpxchg_4(_masm, &aarch64_atomic_cmpxchg_4_impl);
11253 gen_cas_entry(MacroAssembler::word, memory_order_conservative);
11254
11255 AtomicStubMark mark_cmpxchg_8(_masm, &aarch64_atomic_cmpxchg_8_impl);
11256 gen_cas_entry(MacroAssembler::xword, memory_order_conservative);
11257
11258 // CAS, memory_order_relaxed
11259 AtomicStubMark mark_cmpxchg_1_relaxed
11260 (_masm, &aarch64_atomic_cmpxchg_1_relaxed_impl);
11261 gen_cas_entry(MacroAssembler::byte, memory_order_relaxed);
11262
11263 AtomicStubMark mark_cmpxchg_4_relaxed
11264 (_masm, &aarch64_atomic_cmpxchg_4_relaxed_impl);
11265 gen_cas_entry(MacroAssembler::word, memory_order_relaxed);
11266
11267 AtomicStubMark mark_cmpxchg_8_relaxed
11268 (_masm, &aarch64_atomic_cmpxchg_8_relaxed_impl);
11269 gen_cas_entry(MacroAssembler::xword, memory_order_relaxed);
11270
11271 AtomicStubMark mark_cmpxchg_4_release
11272 (_masm, &aarch64_atomic_cmpxchg_4_release_impl);
11273 gen_cas_entry(MacroAssembler::word, memory_order_release);
11274
11275 AtomicStubMark mark_cmpxchg_8_release
11276 (_masm, &aarch64_atomic_cmpxchg_8_release_impl);
11277 gen_cas_entry(MacroAssembler::xword, memory_order_release);
11278
11279 AtomicStubMark mark_cmpxchg_4_seq_cst
11280 (_masm, &aarch64_atomic_cmpxchg_4_seq_cst_impl);
11281 gen_cas_entry(MacroAssembler::word, memory_order_seq_cst);
11282
11283 AtomicStubMark mark_cmpxchg_8_seq_cst
11284 (_masm, &aarch64_atomic_cmpxchg_8_seq_cst_impl);
11285 gen_cas_entry(MacroAssembler::xword, memory_order_seq_cst);
11286
11287 end = __ pc();
11288
11289 ICache::invalidate_range(start, end - start);
11290 // exit block to force update of AtomicStubMark targets
11291 }
11292
11293 assert(start == (address)aarch64_atomic_fetch_add_4_impl,
11294 "atomic stub should be at start of buffer");
11295 // record the stub start and end plus all the entries saved by the
11296 // AtomicStubMark destructor
11297 entries.append((address)aarch64_atomic_fetch_add_8_impl);
11298 entries.append((address)aarch64_atomic_fetch_add_4_relaxed_impl);
11299 entries.append((address)aarch64_atomic_fetch_add_8_relaxed_impl);
11300 entries.append((address)aarch64_atomic_xchg_4_impl);
11301 entries.append((address)aarch64_atomic_xchg_8_impl);
11302 entries.append((address)aarch64_atomic_cmpxchg_1_impl);
11303 entries.append((address)aarch64_atomic_cmpxchg_4_impl);
11304 entries.append((address)aarch64_atomic_cmpxchg_8_impl);
11305 entries.append((address)aarch64_atomic_cmpxchg_1_relaxed_impl);
11306 entries.append((address)aarch64_atomic_cmpxchg_4_relaxed_impl);
11307 entries.append((address)aarch64_atomic_cmpxchg_8_relaxed_impl);
11308 entries.append((address)aarch64_atomic_cmpxchg_4_release_impl);
11309 entries.append((address)aarch64_atomic_cmpxchg_8_release_impl);
11310 entries.append((address)aarch64_atomic_cmpxchg_4_seq_cst_impl);
11311 entries.append((address)aarch64_atomic_cmpxchg_8_seq_cst_impl);
11312
11313 assert(entries.length() == entry_count - 1,
11314 "unexpected extra entry count %d", entries.length());
11315
11316 store_archive_data(stub_id, start, end, &entries);
11317 }
11318 #endif // LINUX
11319
11320 static void save_return_registers(MacroAssembler* masm) {
11321 if (InlineTypeReturnedAsFields) {
11322 masm->push(RegSet::range(r0, r7), sp);
11323 masm->sub(sp, sp, 4 * wordSize);
11324 masm->st1(v0, v1, v2, v3, masm->T1D, Address(sp));
11325 masm->sub(sp, sp, 4 * wordSize);
11326 masm->st1(v4, v5, v6, v7, masm->T1D, Address(sp));
11327 } else {
11328 masm->fmovd(rscratch1, v0);
11329 masm->stp(rscratch1, r0, Address(masm->pre(sp, -2 * wordSize)));
11330 }
11331 }
11332
11333 static void restore_return_registers(MacroAssembler* masm) {
11334 if (InlineTypeReturnedAsFields) {
11335 masm->ld1(v4, v5, v6, v7, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
11336 masm->ld1(v0, v1, v2, v3, masm->T1D, Address(masm->post(sp, 4 * wordSize)));
11337 masm->pop(RegSet::range(r0, r7), sp);
11338 } else {
11339 masm->ldp(rscratch1, r0, Address(masm->post(sp, 2 * wordSize)));
11340 masm->fmovd(v0, rscratch1);
11341 }
11342 }
11343
11344 address generate_cont_thaw(Continuation::thaw_kind kind) {
11345 bool return_barrier = Continuation::is_thaw_return_barrier(kind);
11346 bool return_barrier_exception = Continuation::is_thaw_return_barrier_exception(kind);
11347
11348 address start = __ pc();
11349
11350 if (return_barrier) {
11351 __ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset()));
11352 __ mov(sp, rscratch1);
11353 }
11354 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11355
11356 if (return_barrier) {
11357 // preserve possible return value from a method returning to the return barrier
11358 save_return_registers(_masm);
11359 }
11360
11361 __ movw(c_rarg1, (return_barrier ? 1 : 0));
11362 __ call_VM_leaf(CAST_FROM_FN_PTR(address, Continuation::prepare_thaw), rthread, c_rarg1);
11363 __ mov(rscratch2, r0); // r0 contains the size of the frames to thaw, 0 if overflow or no more frames
11364
11365 if (return_barrier) {
11366 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11367 restore_return_registers(_masm);
11368 }
11369 assert_asm(_masm, (__ ldr(rscratch1, Address(rthread, JavaThread::cont_entry_offset())), __ cmp(sp, rscratch1)), Assembler::EQ, "incorrect sp");
11370
11371
11372 Label thaw_success;
11373 // rscratch2 contains the size of the frames to thaw, 0 if overflow or no more frames
11374 __ cbnz(rscratch2, thaw_success);
11375 __ lea(rscratch1, RuntimeAddress(SharedRuntime::throw_StackOverflowError_entry()));
11376 __ br(rscratch1);
11377 __ bind(thaw_success);
11378
11379 // make room for the thawed frames
11380 __ sub(rscratch1, sp, rscratch2);
11381 __ andr(rscratch1, rscratch1, -16); // align
11382 __ mov(sp, rscratch1);
11383
11384 if (return_barrier) {
11385 // save original return value -- again
11386 save_return_registers(_masm);
11387 }
11388
11389 // If we want, we can templatize thaw by kind, and have three different entries
11390 __ movw(c_rarg1, (uint32_t)kind);
11391
11392 __ call_VM_leaf(Continuation::thaw_entry(), rthread, c_rarg1);
11393 __ mov(rscratch2, r0); // r0 is the sp of the yielding frame
11394
11395 if (return_barrier) {
11396 // restore return value (no safepoint in the call to thaw, so even an oop return value should be OK)
11397 restore_return_registers(_masm);
11398 } else {
11399 __ mov(r0, zr); // return 0 (success) from doYield
11400 }
11401
11402 // we're now on the yield frame (which is in an address above us b/c rsp has been pushed down)
11403 __ sub(sp, rscratch2, 2*wordSize); // now pointing to rfp spill
11404 __ mov(rfp, sp);
11405
11406 if (return_barrier_exception) {
11407 __ ldr(c_rarg1, Address(rfp, wordSize)); // return address
11408 __ authenticate_return_address(c_rarg1);
11409 __ verify_oop(r0);
11410 // save return value containing the exception oop in callee-saved R19
11411 __ mov(r19, r0);
11412
11413 __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, c_rarg1);
11414
11415 // Reinitialize the ptrue predicate register, in case the external runtime call clobbers ptrue reg, as we may return to SVE compiled code.
11416 // __ reinitialize_ptrue();
11417
11418 // see OptoRuntime::generate_exception_blob: r0 -- exception oop, r3 -- exception pc
11419
11420 __ mov(r1, r0); // the exception handler
11421 __ mov(r0, r19); // restore return value containing the exception oop
11422 __ verify_oop(r0);
11423
11424 __ leave();
11425 __ mov(r3, lr);
11426 __ br(r1); // the exception handler
11427 } else {
11428 // We're "returning" into the topmost thawed frame; see Thaw::push_return_frame
11429 __ leave();
11430 __ ret(lr);
11431 }
11432
11433 return start;
11434 }
11435
11436 address generate_cont_thaw() {
11437 if (!Continuations::enabled()) return nullptr;
11438
11439 StubId stub_id = StubId::stubgen_cont_thaw_id;
11440 int entry_count = StubInfo::entry_count(stub_id);
11441 assert(entry_count == 1, "sanity check");
11442 address start = load_archive_data(stub_id);
11443 if (start != nullptr) {
11444 return start;
11445 }
11446 StubCodeMark mark(this, stub_id);
11447 start = __ pc();
11448 generate_cont_thaw(Continuation::thaw_top);
11449
11450 // record the stub start and end
11451 store_archive_data(stub_id, start, __ pc());
11452
11453 return start;
11454 }
11455
11456 address generate_cont_returnBarrier() {
11457 if (!Continuations::enabled()) return nullptr;
11458
11459 // TODO: will probably need multiple return barriers depending on return type
11460 StubId stub_id = StubId::stubgen_cont_returnBarrier_id;
11461 int entry_count = StubInfo::entry_count(stub_id);
11462 assert(entry_count == 1, "sanity check");
11463 address start = load_archive_data(stub_id);
11464 if (start != nullptr) {
11465 return start;
11466 }
11467 StubCodeMark mark(this, stub_id);
11468 start = __ pc();
11469
11470 generate_cont_thaw(Continuation::thaw_return_barrier);
11471
11472 // record the stub start and end
11473 store_archive_data(stub_id, start, __ pc());
11474
11475 return start;
11476 }
11477
11478 address generate_cont_returnBarrier_exception() {
11479 if (!Continuations::enabled()) return nullptr;
11480
11481 StubId stub_id = StubId::stubgen_cont_returnBarrierExc_id;
11482 int entry_count = StubInfo::entry_count(stub_id);
11483 assert(entry_count == 1, "sanity check");
11484 address start = load_archive_data(stub_id);
11485 if (start != nullptr) {
11486 return start;
11487 }
11488 StubCodeMark mark(this, stub_id);
11489 start = __ pc();
11490
11491 generate_cont_thaw(Continuation::thaw_return_barrier_exception);
11492
11493 // record the stub start and end
11494 store_archive_data(stub_id, start, __ pc());
11495
11496 return start;
11497 }
11498
11499 address generate_cont_preempt_stub() {
11500 if (!Continuations::enabled()) return nullptr;
11501 StubId stub_id = StubId::stubgen_cont_preempt_id;
11502 int entry_count = StubInfo::entry_count(stub_id);
11503 assert(entry_count == 1, "sanity check");
11504 address start = load_archive_data(stub_id);
11505 if (start != nullptr) {
11506 return start;
11507 }
11508 StubCodeMark mark(this, stub_id);
11509 start = __ pc();
11510
11511 __ reset_last_Java_frame(true);
11512
11513 // Set sp to enterSpecial frame, i.e. remove all frames copied into the heap.
11514 __ ldr(rscratch2, Address(rthread, JavaThread::cont_entry_offset()));
11515 __ mov(sp, rscratch2);
11516
11517 Label preemption_cancelled;
11518 __ ldrb(rscratch1, Address(rthread, JavaThread::preemption_cancelled_offset()));
11519 __ cbnz(rscratch1, preemption_cancelled);
11520
11521 // Remove enterSpecial frame from the stack and return to Continuation.run() to unmount.
11522 SharedRuntime::continuation_enter_cleanup(_masm);
11523 __ leave();
11524 __ ret(lr);
11525
11526 // We acquired the monitor after freezing the frames so call thaw to continue execution.
11527 __ bind(preemption_cancelled);
11528 __ strb(zr, Address(rthread, JavaThread::preemption_cancelled_offset()));
11529 __ lea(rfp, Address(sp, checked_cast<int32_t>(ContinuationEntry::size())));
11530 __ lea(rscratch1, ExternalAddress(ContinuationEntry::thaw_call_pc_address()));
11531 __ ldr(rscratch1, Address(rscratch1));
11532 __ br(rscratch1);
11533
11534 // record the stub start and end
11535 store_archive_data(stub_id, start, __ pc());
11536
11537 return start;
11538 }
11539
11540 // In sun.security.util.math.intpoly.IntegerPolynomial1305, integers
11541 // are represented as long[5], with BITS_PER_LIMB = 26.
11542 // Pack five 26-bit limbs into three 64-bit registers.
11543 void pack_26(Register dest0, Register dest1, Register dest2, Register src) {
11544 __ ldp(dest0, rscratch1, Address(src, 0)); // 26 bits
11545 __ add(dest0, dest0, rscratch1, Assembler::LSL, 26); // 26 bits
11546 __ ldp(rscratch1, rscratch2, Address(src, 2 * sizeof (jlong)));
11547 __ add(dest0, dest0, rscratch1, Assembler::LSL, 52); // 12 bits
11548
11549 __ add(dest1, zr, rscratch1, Assembler::LSR, 12); // 14 bits
11550 __ add(dest1, dest1, rscratch2, Assembler::LSL, 14); // 26 bits
11551 __ ldr(rscratch1, Address(src, 4 * sizeof (jlong)));
11552 __ add(dest1, dest1, rscratch1, Assembler::LSL, 40); // 24 bits
11553
11554 if (dest2->is_valid()) {
11555 __ add(dest2, zr, rscratch1, Assembler::LSR, 24); // 2 bits
11556 } else {
11557 #ifdef ASSERT
11558 Label OK;
11559 __ cmp(zr, rscratch1, Assembler::LSR, 24); // 2 bits
11560 __ br(__ EQ, OK);
11561 __ stop("high bits of Poly1305 integer should be zero");
11562 __ should_not_reach_here();
11563 __ bind(OK);
11564 #endif
11565 }
11566 }
11567
11568 // As above, but return only a 128-bit integer, packed into two
11569 // 64-bit registers.
11570 void pack_26(Register dest0, Register dest1, Register src) {
11571 pack_26(dest0, dest1, noreg, src);
11572 }
11573
11574 // Multiply and multiply-accumulate unsigned 64-bit registers.
11575 void wide_mul(Register prod_lo, Register prod_hi, Register n, Register m) {
11576 __ mul(prod_lo, n, m);
11577 __ umulh(prod_hi, n, m);
11578 }
11579 void wide_madd(Register sum_lo, Register sum_hi, Register n, Register m) {
11580 wide_mul(rscratch1, rscratch2, n, m);
11581 __ adds(sum_lo, sum_lo, rscratch1);
11582 __ adc(sum_hi, sum_hi, rscratch2);
11583 }
11584
11585 // Poly1305, RFC 7539
11586
11587 // See https://loup-vaillant.fr/tutorials/poly1305-design for a
11588 // description of the tricks used to simplify and accelerate this
11589 // computation.
11590
11591 address generate_poly1305_processBlocks() {
11592 StubId stub_id = StubId::stubgen_poly1305_processBlocks_id;
11593 int entry_count = StubInfo::entry_count(stub_id);
11594 assert(entry_count == 1, "sanity check");
11595 address start = load_archive_data(stub_id);
11596 if (start != nullptr) {
11597 return start;
11598 }
11599 __ align(CodeEntryAlignment);
11600 StubCodeMark mark(this, stub_id);
11601 start = __ pc();
11602 Label here;
11603 __ enter();
11604 RegSet callee_saved = RegSet::range(r19, r28);
11605 __ push(callee_saved, sp);
11606
11607 RegSetIterator<Register> regs = (RegSet::range(c_rarg0, r28) - r18_tls - rscratch1 - rscratch2).begin();
11608
11609 // Arguments
11610 const Register input_start = *regs, length = *++regs, acc_start = *++regs, r_start = *++regs;
11611
11612 // R_n is the 128-bit randomly-generated key, packed into two
11613 // registers. The caller passes this key to us as long[5], with
11614 // BITS_PER_LIMB = 26.
11615 const Register R_0 = *++regs, R_1 = *++regs;
11616 pack_26(R_0, R_1, r_start);
11617
11618 // RR_n is (R_n >> 2) * 5
11619 const Register RR_0 = *++regs, RR_1 = *++regs;
11620 __ lsr(RR_0, R_0, 2);
11621 __ add(RR_0, RR_0, RR_0, Assembler::LSL, 2);
11622 __ lsr(RR_1, R_1, 2);
11623 __ add(RR_1, RR_1, RR_1, Assembler::LSL, 2);
11624
11625 // U_n is the current checksum
11626 const Register U_0 = *++regs, U_1 = *++regs, U_2 = *++regs;
11627 pack_26(U_0, U_1, U_2, acc_start);
11628
11629 static constexpr int BLOCK_LENGTH = 16;
11630 Label DONE, LOOP;
11631
11632 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
11633 __ br(Assembler::LT, DONE); {
11634 __ bind(LOOP);
11635
11636 // S_n is to be the sum of U_n and the next block of data
11637 const Register S_0 = *++regs, S_1 = *++regs, S_2 = *++regs;
11638 __ ldp(S_0, S_1, __ post(input_start, 2 * wordSize));
11639 __ adds(S_0, U_0, S_0);
11640 __ adcs(S_1, U_1, S_1);
11641 __ adc(S_2, U_2, zr);
11642 __ add(S_2, S_2, 1);
11643
11644 const Register U_0HI = *++regs, U_1HI = *++regs;
11645
11646 // NB: this logic depends on some of the special properties of
11647 // Poly1305 keys. In particular, because we know that the top
11648 // four bits of R_0 and R_1 are zero, we can add together
11649 // partial products without any risk of needing to propagate a
11650 // carry out.
11651 wide_mul(U_0, U_0HI, S_0, R_0); wide_madd(U_0, U_0HI, S_1, RR_1); wide_madd(U_0, U_0HI, S_2, RR_0);
11652 wide_mul(U_1, U_1HI, S_0, R_1); wide_madd(U_1, U_1HI, S_1, R_0); wide_madd(U_1, U_1HI, S_2, RR_1);
11653 __ andr(U_2, R_0, 3);
11654 __ mul(U_2, S_2, U_2);
11655
11656 // Recycle registers S_0, S_1, S_2
11657 regs = (regs.remaining() + S_0 + S_1 + S_2).begin();
11658
11659 // Partial reduction mod 2**130 - 5
11660 __ adds(U_1, U_0HI, U_1);
11661 __ adc(U_2, U_1HI, U_2);
11662 // Sum now in U_2:U_1:U_0.
11663 // Dead: U_0HI, U_1HI.
11664 regs = (regs.remaining() + U_0HI + U_1HI).begin();
11665
11666 // U_2:U_1:U_0 += (U_2 >> 2) * 5 in two steps
11667
11668 // First, U_2:U_1:U_0 += (U_2 >> 2)
11669 __ lsr(rscratch1, U_2, 2);
11670 __ andr(U_2, U_2, (u8)3);
11671 __ adds(U_0, U_0, rscratch1);
11672 __ adcs(U_1, U_1, zr);
11673 __ adc(U_2, U_2, zr);
11674 // Second, U_2:U_1:U_0 += (U_2 >> 2) << 2
11675 __ adds(U_0, U_0, rscratch1, Assembler::LSL, 2);
11676 __ adcs(U_1, U_1, zr);
11677 __ adc(U_2, U_2, zr);
11678
11679 __ sub(length, length, checked_cast<u1>(BLOCK_LENGTH));
11680 __ cmp(length, checked_cast<u1>(BLOCK_LENGTH));
11681 __ br(~ Assembler::LT, LOOP);
11682 }
11683
11684 // Further reduce modulo 2^130 - 5
11685 __ lsr(rscratch1, U_2, 2);
11686 __ add(rscratch1, rscratch1, rscratch1, Assembler::LSL, 2); // rscratch1 = U_2 * 5
11687 __ adds(U_0, U_0, rscratch1); // U_0 += U_2 * 5
11688 __ adcs(U_1, U_1, zr);
11689 __ andr(U_2, U_2, (u1)3);
11690 __ adc(U_2, U_2, zr);
11691
11692 // Unpack the sum into five 26-bit limbs and write to memory.
11693 __ ubfiz(rscratch1, U_0, 0, 26);
11694 __ ubfx(rscratch2, U_0, 26, 26);
11695 __ stp(rscratch1, rscratch2, Address(acc_start));
11696 __ ubfx(rscratch1, U_0, 52, 12);
11697 __ bfi(rscratch1, U_1, 12, 14);
11698 __ ubfx(rscratch2, U_1, 14, 26);
11699 __ stp(rscratch1, rscratch2, Address(acc_start, 2 * sizeof (jlong)));
11700 __ ubfx(rscratch1, U_1, 40, 24);
11701 __ bfi(rscratch1, U_2, 24, 3);
11702 __ str(rscratch1, Address(acc_start, 4 * sizeof (jlong)));
11703
11704 __ bind(DONE);
11705 __ pop(callee_saved, sp);
11706 __ leave();
11707 __ ret(lr);
11708
11709 // record the stub start and end
11710 store_archive_data(stub_id, start, __ pc());
11711
11712 return start;
11713 }
11714
11715 // exception handler for upcall stubs
11716 address generate_upcall_stub_exception_handler() {
11717 StubId stub_id = StubId::stubgen_upcall_stub_exception_handler_id;
11718 int entry_count = StubInfo::entry_count(stub_id);
11719 assert(entry_count == 1, "sanity check");
11720 address start = load_archive_data(stub_id);
11721 if (start != nullptr) {
11722 return start;
11723 }
11724 StubCodeMark mark(this, stub_id);
11725 start = __ pc();
11726
11727 // Native caller has no idea how to handle exceptions,
11728 // so we just crash here. Up to callee to catch exceptions.
11729 __ verify_oop(r0);
11730 __ movptr(rscratch1, CAST_FROM_FN_PTR(uint64_t, UpcallLinker::handle_uncaught_exception));
11731 __ blr(rscratch1);
11732 __ should_not_reach_here();
11733
11734 // record the stub start and end
11735 store_archive_data(stub_id, start, __ pc());
11736
11737 return start;
11738 }
11739
11740 // load Method* target of MethodHandle
11741 // j_rarg0 = jobject receiver
11742 // rmethod = result
11743 address generate_upcall_stub_load_target() {
11744 StubId stub_id = StubId::stubgen_upcall_stub_load_target_id;
11745 int entry_count = StubInfo::entry_count(stub_id);
11746 assert(entry_count == 1, "sanity check");
11747 address start = load_archive_data(stub_id);
11748 if (start != nullptr) {
11749 return start;
11750 }
11751 StubCodeMark mark(this, stub_id);
11752 start = __ pc();
11753
11754 __ resolve_global_jobject(j_rarg0, rscratch1, rscratch2);
11755 // Load target method from receiver
11756 __ load_heap_oop(rmethod, Address(j_rarg0, java_lang_invoke_MethodHandle::form_offset()), rscratch1, rscratch2);
11757 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_LambdaForm::vmentry_offset()), rscratch1, rscratch2);
11758 __ load_heap_oop(rmethod, Address(rmethod, java_lang_invoke_MemberName::method_offset()), rscratch1, rscratch2);
11759 __ access_load_at(T_ADDRESS, IN_HEAP, rmethod,
11760 Address(rmethod, java_lang_invoke_ResolvedMethodName::vmtarget_offset()),
11761 noreg, noreg);
11762 __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); // just in case callee is deoptimized
11763
11764 __ ret(lr);
11765
11766 // record the stub start and end
11767 store_archive_data(stub_id, start, __ pc());
11768
11769 return start;
11770 }
11771
11772 #undef __
11773 #define __ masm->
11774
11775 class MontgomeryMultiplyGenerator : public MacroAssembler {
11776
11777 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
11778 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
11779
11780 RegSet _toSave;
11781 bool _squaring;
11782
11783 public:
11784 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
11785 : MacroAssembler(as->code()), _squaring(squaring) {
11786
11787 // Register allocation
11788
11789 RegSetIterator<Register> regs = (RegSet::range(r0, r26) - r18_tls).begin();
11790 Pa_base = *regs; // Argument registers
11791 if (squaring)
11792 Pb_base = Pa_base;
11793 else
11794 Pb_base = *++regs;
11795 Pn_base = *++regs;
11796 Rlen= *++regs;
11797 inv = *++regs;
11798 Pm_base = *++regs;
11799
11800 // Working registers:
11801 Ra = *++regs; // The current digit of a, b, n, and m.
11802 Rb = *++regs;
11803 Rm = *++regs;
11804 Rn = *++regs;
11805
11806 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m.
11807 Pb = *++regs;
11808 Pm = *++regs;
11809 Pn = *++regs;
11810
11811 t0 = *++regs; // Three registers which form a
11812 t1 = *++regs; // triple-precision accumuator.
11813 t2 = *++regs;
11814
11815 Ri = *++regs; // Inner and outer loop indexes.
11816 Rj = *++regs;
11817
11818 Rhi_ab = *++regs; // Product registers: low and high parts
11819 Rlo_ab = *++regs; // of a*b and m*n.
11820 Rhi_mn = *++regs;
11821 Rlo_mn = *++regs;
11822
11823 // r19 and up are callee-saved.
11824 _toSave = RegSet::range(r19, *regs) + Pm_base;
11825 }
11826
11827 private:
11828 void save_regs() {
11829 push(_toSave, sp);
11830 }
11831
11832 void restore_regs() {
11833 pop(_toSave, sp);
11834 }
11835
11836 template <typename T>
11837 void unroll_2(Register count, T block) {
11838 Label loop, end, odd;
11839 tbnz(count, 0, odd);
11840 cbz(count, end);
11841 align(16);
11842 bind(loop);
11843 (this->*block)();
11844 bind(odd);
11845 (this->*block)();
11846 subs(count, count, 2);
11847 br(Assembler::GT, loop);
11848 bind(end);
11849 }
11850
11851 template <typename T>
11852 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
11853 Label loop, end, odd;
11854 tbnz(count, 0, odd);
11855 cbz(count, end);
11856 align(16);
11857 bind(loop);
11858 (this->*block)(d, s, tmp);
11859 bind(odd);
11860 (this->*block)(d, s, tmp);
11861 subs(count, count, 2);
11862 br(Assembler::GT, loop);
11863 bind(end);
11864 }
11865
11866 void pre1(RegisterOrConstant i) {
11867 block_comment("pre1");
11868 // Pa = Pa_base;
11869 // Pb = Pb_base + i;
11870 // Pm = Pm_base;
11871 // Pn = Pn_base + i;
11872 // Ra = *Pa;
11873 // Rb = *Pb;
11874 // Rm = *Pm;
11875 // Rn = *Pn;
11876 ldr(Ra, Address(Pa_base));
11877 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11878 ldr(Rm, Address(Pm_base));
11879 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11880 lea(Pa, Address(Pa_base));
11881 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
11882 lea(Pm, Address(Pm_base));
11883 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
11884
11885 // Zero the m*n result.
11886 mov(Rhi_mn, zr);
11887 mov(Rlo_mn, zr);
11888 }
11889
11890 // The core multiply-accumulate step of a Montgomery
11891 // multiplication. The idea is to schedule operations as a
11892 // pipeline so that instructions with long latencies (loads and
11893 // multiplies) have time to complete before their results are
11894 // used. This most benefits in-order implementations of the
11895 // architecture but out-of-order ones also benefit.
11896 void step() {
11897 block_comment("step");
11898 // MACC(Ra, Rb, t0, t1, t2);
11899 // Ra = *++Pa;
11900 // Rb = *--Pb;
11901 umulh(Rhi_ab, Ra, Rb);
11902 mul(Rlo_ab, Ra, Rb);
11903 ldr(Ra, pre(Pa, wordSize));
11904 ldr(Rb, pre(Pb, -wordSize));
11905 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
11906 // previous iteration.
11907 // MACC(Rm, Rn, t0, t1, t2);
11908 // Rm = *++Pm;
11909 // Rn = *--Pn;
11910 umulh(Rhi_mn, Rm, Rn);
11911 mul(Rlo_mn, Rm, Rn);
11912 ldr(Rm, pre(Pm, wordSize));
11913 ldr(Rn, pre(Pn, -wordSize));
11914 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11915 }
11916
11917 void post1() {
11918 block_comment("post1");
11919
11920 // MACC(Ra, Rb, t0, t1, t2);
11921 // Ra = *++Pa;
11922 // Rb = *--Pb;
11923 umulh(Rhi_ab, Ra, Rb);
11924 mul(Rlo_ab, Ra, Rb);
11925 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
11926 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
11927
11928 // *Pm = Rm = t0 * inv;
11929 mul(Rm, t0, inv);
11930 str(Rm, Address(Pm));
11931
11932 // MACC(Rm, Rn, t0, t1, t2);
11933 // t0 = t1; t1 = t2; t2 = 0;
11934 umulh(Rhi_mn, Rm, Rn);
11935
11936 #ifndef PRODUCT
11937 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
11938 {
11939 mul(Rlo_mn, Rm, Rn);
11940 add(Rlo_mn, t0, Rlo_mn);
11941 Label ok;
11942 cbz(Rlo_mn, ok); {
11943 stop("broken Montgomery multiply");
11944 } bind(ok);
11945 }
11946 #endif
11947 // We have very carefully set things up so that
11948 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
11949 // the lower half of Rm * Rn because we know the result already:
11950 // it must be -t0. t0 + (-t0) must generate a carry iff
11951 // t0 != 0. So, rather than do a mul and an adds we just set
11952 // the carry flag iff t0 is nonzero.
11953 //
11954 // mul(Rlo_mn, Rm, Rn);
11955 // adds(zr, t0, Rlo_mn);
11956 subs(zr, t0, 1); // Set carry iff t0 is nonzero
11957 adcs(t0, t1, Rhi_mn);
11958 adc(t1, t2, zr);
11959 mov(t2, zr);
11960 }
11961
11962 void pre2(RegisterOrConstant i, RegisterOrConstant len) {
11963 block_comment("pre2");
11964 // Pa = Pa_base + i-len;
11965 // Pb = Pb_base + len;
11966 // Pm = Pm_base + i-len;
11967 // Pn = Pn_base + len;
11968
11969 if (i.is_register()) {
11970 sub(Rj, i.as_register(), len);
11971 } else {
11972 mov(Rj, i.as_constant());
11973 sub(Rj, Rj, len);
11974 }
11975 // Rj == i-len
11976
11977 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
11978 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
11979 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
11980 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
11981
11982 // Ra = *++Pa;
11983 // Rb = *--Pb;
11984 // Rm = *++Pm;
11985 // Rn = *--Pn;
11986 ldr(Ra, pre(Pa, wordSize));
11987 ldr(Rb, pre(Pb, -wordSize));
11988 ldr(Rm, pre(Pm, wordSize));
11989 ldr(Rn, pre(Pn, -wordSize));
11990
11991 mov(Rhi_mn, zr);
11992 mov(Rlo_mn, zr);
11993 }
11994
11995 void post2(RegisterOrConstant i, RegisterOrConstant len) {
11996 block_comment("post2");
11997 if (i.is_constant()) {
11998 mov(Rj, i.as_constant()-len.as_constant());
11999 } else {
12000 sub(Rj, i.as_register(), len);
12001 }
12002
12003 adds(t0, t0, Rlo_mn); // The pending m*n, low part
12004
12005 // As soon as we know the least significant digit of our result,
12006 // store it.
12007 // Pm_base[i-len] = t0;
12008 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
12009
12010 // t0 = t1; t1 = t2; t2 = 0;
12011 adcs(t0, t1, Rhi_mn); // The pending m*n, high part
12012 adc(t1, t2, zr);
12013 mov(t2, zr);
12014 }
12015
12016 // A carry in t0 after Montgomery multiplication means that we
12017 // should subtract multiples of n from our result in m. We'll
12018 // keep doing that until there is no carry.
12019 void normalize(RegisterOrConstant len) {
12020 block_comment("normalize");
12021 // while (t0)
12022 // t0 = sub(Pm_base, Pn_base, t0, len);
12023 Label loop, post, again;
12024 Register cnt = t1, i = t2; // Re-use registers; we're done with them now
12025 cbz(t0, post); {
12026 bind(again); {
12027 mov(i, zr);
12028 mov(cnt, len);
12029 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
12030 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
12031 subs(zr, zr, zr); // set carry flag, i.e. no borrow
12032 align(16);
12033 bind(loop); {
12034 sbcs(Rm, Rm, Rn);
12035 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
12036 add(i, i, 1);
12037 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
12038 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
12039 sub(cnt, cnt, 1);
12040 } cbnz(cnt, loop);
12041 sbc(t0, t0, zr);
12042 } cbnz(t0, again);
12043 } bind(post);
12044 }
12045
12046 // Move memory at s to d, reversing words.
12047 // Increments d to end of copied memory
12048 // Destroys tmp1, tmp2
12049 // Preserves len
12050 // Leaves s pointing to the address which was in d at start
12051 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
12052 assert(tmp1->encoding() < r19->encoding(), "register corruption");
12053 assert(tmp2->encoding() < r19->encoding(), "register corruption");
12054
12055 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
12056 mov(tmp1, len);
12057 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
12058 sub(s, d, len, ext::uxtw, LogBytesPerWord);
12059 }
12060 // where
12061 void reverse1(Register d, Register s, Register tmp) {
12062 ldr(tmp, pre(s, -wordSize));
12063 ror(tmp, tmp, 32);
12064 str(tmp, post(d, wordSize));
12065 }
12066
12067 void step_squaring() {
12068 // An extra ACC
12069 step();
12070 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12071 }
12072
12073 void last_squaring(RegisterOrConstant i) {
12074 Label dont;
12075 // if ((i & 1) == 0) {
12076 tbnz(i.as_register(), 0, dont); {
12077 // MACC(Ra, Rb, t0, t1, t2);
12078 // Ra = *++Pa;
12079 // Rb = *--Pb;
12080 umulh(Rhi_ab, Ra, Rb);
12081 mul(Rlo_ab, Ra, Rb);
12082 acc(Rhi_ab, Rlo_ab, t0, t1, t2);
12083 } bind(dont);
12084 }
12085
12086 void extra_step_squaring() {
12087 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
12088
12089 // MACC(Rm, Rn, t0, t1, t2);
12090 // Rm = *++Pm;
12091 // Rn = *--Pn;
12092 umulh(Rhi_mn, Rm, Rn);
12093 mul(Rlo_mn, Rm, Rn);
12094 ldr(Rm, pre(Pm, wordSize));
12095 ldr(Rn, pre(Pn, -wordSize));
12096 }
12097
12098 void post1_squaring() {
12099 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
12100
12101 // *Pm = Rm = t0 * inv;
12102 mul(Rm, t0, inv);
12103 str(Rm, Address(Pm));
12104
12105 // MACC(Rm, Rn, t0, t1, t2);
12106 // t0 = t1; t1 = t2; t2 = 0;
12107 umulh(Rhi_mn, Rm, Rn);
12108
12109 #ifndef PRODUCT
12110 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
12111 {
12112 mul(Rlo_mn, Rm, Rn);
12113 add(Rlo_mn, t0, Rlo_mn);
12114 Label ok;
12115 cbz(Rlo_mn, ok); {
12116 stop("broken Montgomery multiply");
12117 } bind(ok);
12118 }
12119 #endif
12120 // We have very carefully set things up so that
12121 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
12122 // the lower half of Rm * Rn because we know the result already:
12123 // it must be -t0. t0 + (-t0) must generate a carry iff
12124 // t0 != 0. So, rather than do a mul and an adds we just set
12125 // the carry flag iff t0 is nonzero.
12126 //
12127 // mul(Rlo_mn, Rm, Rn);
12128 // adds(zr, t0, Rlo_mn);
12129 subs(zr, t0, 1); // Set carry iff t0 is nonzero
12130 adcs(t0, t1, Rhi_mn);
12131 adc(t1, t2, zr);
12132 mov(t2, zr);
12133 }
12134
12135 void acc(Register Rhi, Register Rlo,
12136 Register t0, Register t1, Register t2) {
12137 adds(t0, t0, Rlo);
12138 adcs(t1, t1, Rhi);
12139 adc(t2, t2, zr);
12140 }
12141
12142 public:
12143 /**
12144 * Fast Montgomery multiplication. The derivation of the
12145 * algorithm is in A Cryptographic Library for the Motorola
12146 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
12147 *
12148 * Arguments:
12149 *
12150 * Inputs for multiplication:
12151 * c_rarg0 - int array elements a
12152 * c_rarg1 - int array elements b
12153 * c_rarg2 - int array elements n (the modulus)
12154 * c_rarg3 - int length
12155 * c_rarg4 - int inv
12156 * c_rarg5 - int array elements m (the result)
12157 *
12158 * Inputs for squaring:
12159 * c_rarg0 - int array elements a
12160 * c_rarg1 - int array elements n (the modulus)
12161 * c_rarg2 - int length
12162 * c_rarg3 - int inv
12163 * c_rarg4 - int array elements m (the result)
12164 *
12165 */
12166 address generate_multiply() {
12167 Label argh, nothing;
12168
12169 align(CodeEntryAlignment);
12170 address entry = pc();
12171
12172 cbzw(Rlen, nothing);
12173
12174 enter();
12175
12176 // Make room.
12177 cmpw(Rlen, 512);
12178 br(Assembler::HI, argh);
12179 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
12180 andr(sp, Ra, -2 * wordSize);
12181
12182 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
12183
12184 {
12185 // Copy input args, reversing as we go. We use Ra as a
12186 // temporary variable.
12187 reverse(Ra, Pa_base, Rlen, t0, t1);
12188 if (!_squaring)
12189 reverse(Ra, Pb_base, Rlen, t0, t1);
12190 reverse(Ra, Pn_base, Rlen, t0, t1);
12191 }
12192
12193 // Push all call-saved registers and also Pm_base which we'll need
12194 // at the end.
12195 save_regs();
12196
12197 #ifndef PRODUCT
12198 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
12199 {
12200 ldr(Rn, Address(Pn_base, 0));
12201 mul(Rlo_mn, Rn, inv);
12202 subs(zr, Rlo_mn, -1);
12203 Label ok;
12204 br(EQ, ok); {
12205 stop("broken inverse in Montgomery multiply");
12206 } bind(ok);
12207 }
12208 #endif
12209
12210 mov(Pm_base, Ra);
12211
12212 mov(t0, zr);
12213 mov(t1, zr);
12214 mov(t2, zr);
12215
12216 block_comment("for (int i = 0; i < len; i++) {");
12217 mov(Ri, zr); {
12218 Label loop, end;
12219 cmpw(Ri, Rlen);
12220 br(Assembler::GE, end);
12221
12222 bind(loop);
12223 pre1(Ri);
12224
12225 block_comment(" for (j = i; j; j--) {"); {
12226 movw(Rj, Ri);
12227 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12228 } block_comment(" } // j");
12229
12230 post1();
12231 addw(Ri, Ri, 1);
12232 cmpw(Ri, Rlen);
12233 br(Assembler::LT, loop);
12234 bind(end);
12235 block_comment("} // i");
12236 }
12237
12238 block_comment("for (int i = len; i < 2*len; i++) {");
12239 mov(Ri, Rlen); {
12240 Label loop, end;
12241 cmpw(Ri, Rlen, Assembler::LSL, 1);
12242 br(Assembler::GE, end);
12243
12244 bind(loop);
12245 pre2(Ri, Rlen);
12246
12247 block_comment(" for (j = len*2-i-1; j; j--) {"); {
12248 lslw(Rj, Rlen, 1);
12249 subw(Rj, Rj, Ri);
12250 subw(Rj, Rj, 1);
12251 unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
12252 } block_comment(" } // j");
12253
12254 post2(Ri, Rlen);
12255 addw(Ri, Ri, 1);
12256 cmpw(Ri, Rlen, Assembler::LSL, 1);
12257 br(Assembler::LT, loop);
12258 bind(end);
12259 }
12260 block_comment("} // i");
12261
12262 normalize(Rlen);
12263
12264 mov(Ra, Pm_base); // Save Pm_base in Ra
12265 restore_regs(); // Restore caller's Pm_base
12266
12267 // Copy our result into caller's Pm_base
12268 reverse(Pm_base, Ra, Rlen, t0, t1);
12269
12270 leave();
12271 bind(nothing);
12272 ret(lr);
12273
12274 // handler for error case
12275 bind(argh);
12276 stop("MontgomeryMultiply total_allocation must be <= 8192");
12277
12278 return entry;
12279 }
12280 // In C, approximately:
12281
12282 // void
12283 // montgomery_multiply(julong Pa_base[], julong Pb_base[],
12284 // julong Pn_base[], julong Pm_base[],
12285 // julong inv, int len) {
12286 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12287 // julong *Pa, *Pb, *Pn, *Pm;
12288 // julong Ra, Rb, Rn, Rm;
12289
12290 // int i;
12291
12292 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12293
12294 // for (i = 0; i < len; i++) {
12295 // int j;
12296
12297 // Pa = Pa_base;
12298 // Pb = Pb_base + i;
12299 // Pm = Pm_base;
12300 // Pn = Pn_base + i;
12301
12302 // Ra = *Pa;
12303 // Rb = *Pb;
12304 // Rm = *Pm;
12305 // Rn = *Pn;
12306
12307 // int iters = i;
12308 // for (j = 0; iters--; j++) {
12309 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12310 // MACC(Ra, Rb, t0, t1, t2);
12311 // Ra = *++Pa;
12312 // Rb = *--Pb;
12313 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12314 // MACC(Rm, Rn, t0, t1, t2);
12315 // Rm = *++Pm;
12316 // Rn = *--Pn;
12317 // }
12318
12319 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
12320 // MACC(Ra, Rb, t0, t1, t2);
12321 // *Pm = Rm = t0 * inv;
12322 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12323 // MACC(Rm, Rn, t0, t1, t2);
12324
12325 // assert(t0 == 0, "broken Montgomery multiply");
12326
12327 // t0 = t1; t1 = t2; t2 = 0;
12328 // }
12329
12330 // for (i = len; i < 2*len; i++) {
12331 // int j;
12332
12333 // Pa = Pa_base + i-len;
12334 // Pb = Pb_base + len;
12335 // Pm = Pm_base + i-len;
12336 // Pn = Pn_base + len;
12337
12338 // Ra = *++Pa;
12339 // Rb = *--Pb;
12340 // Rm = *++Pm;
12341 // Rn = *--Pn;
12342
12343 // int iters = len*2-i-1;
12344 // for (j = i-len+1; iters--; j++) {
12345 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
12346 // MACC(Ra, Rb, t0, t1, t2);
12347 // Ra = *++Pa;
12348 // Rb = *--Pb;
12349 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12350 // MACC(Rm, Rn, t0, t1, t2);
12351 // Rm = *++Pm;
12352 // Rn = *--Pn;
12353 // }
12354
12355 // Pm_base[i-len] = t0;
12356 // t0 = t1; t1 = t2; t2 = 0;
12357 // }
12358
12359 // while (t0)
12360 // t0 = sub(Pm_base, Pn_base, t0, len);
12361 // }
12362
/**
 * Fast Montgomery squaring.  This uses asymptotically 25% fewer
 * multiplies than Montgomery multiplication so it should be up to
 * 25% faster.  However, its loop control is more complex and it
 * may actually run slower on some machines.
 *
 * Arguments:
 *
 * Inputs:
 *   c_rarg0   - int array elements a
 *   c_rarg1   - int array elements n (the modulus)
 *   c_rarg2   - int length
 *   c_rarg3   - int inv
 *   c_rarg4   - int array elements m (the result)
 *
 * NOTE(review): the C sketch that follows this function declares inv as
 * julong, not int -- confirm which width the callers actually pass.
 *
 * Emits the code at the current assembler position (aligned to
 * CodeEntryAlignment) and returns the address of its first instruction.
 *
 * Lengths above 512 ints branch to a stop() handler emitted after the
 * normal return path.
 */
address generate_square() {
  // Target of the length check below; bound after the normal return path.
  Label argh;

  align(CodeEntryAlignment);
  address entry = pc();

  enter();

  // Make room.
  // The length is compared unsigned against 512.  The scratch area is
  // Rlen << exact_log2(4 * sizeof (jint)) bytes below the current sp, which
  // for the largest accepted length gives the 8192 quoted in the stop()
  // message at the end of this function.
  cmpw(Rlen, 512);
  br(Assembler::HI, argh);
  sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
  // Clear the low bits so the new sp is a multiple of 2 * wordSize.
  andr(sp, Ra, -2 * wordSize);

  lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

  {
    // Copy input args, reversing as we go.  We use Ra as a
    // temporary variable.
    // NOTE(review): reverse() is defined elsewhere; the mov(Pm_base, Ra)
    // below relies on the value it leaves in Ra -- confirm against the helper.
    reverse(Ra, Pa_base, Rlen, t0, t1);
    reverse(Ra, Pn_base, Rlen, t0, t1);
  }

  // Push all call-saved registers and also Pm_base which we'll need
  // at the end.
  save_regs();

  // From here on Pm_base names the stack scratch area, not the caller's
  // result array; the caller's value comes back via restore_regs() below.
  mov(Pm_base, Ra);

  // Clear the triple-precision accumulator t0:t1:t2.
  mov(t0, zr);
  mov(t1, zr);
  mov(t2, zr);

  block_comment("for (int i = 0; i < len; i++) {");
  mov(Ri, zr); {
    Label loop, end;
    // The bound is tested here at the top and again before the backward
    // branch at the bottom; the backward branch lands on this label, so
    // the top test is re-executed on every iteration.
    bind(loop);
    cmp(Ri, Rlen);
    br(Assembler::GE, end);

    pre1(Ri);

    block_comment("for (j = (i+1)/2; j; j--) {"); {
      // Rj = (i + 1) / 2
      add(Rj, Ri, 1);
      lsr(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
    } block_comment(" } // j");

    last_squaring(Ri);

    block_comment(" for (j = i/2; j; j--) {"); {
      // Rj = i / 2
      lsr(Rj, Ri, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
    } block_comment(" } // j");

    post1_squaring();
    add(Ri, Ri, 1);
    cmp(Ri, Rlen);
    br(Assembler::LT, loop);

    bind(end);
    block_comment("} // i");
  }

  block_comment("for (int i = len; i < 2*len; i++) {");
  mov(Ri, Rlen); {
    Label loop, end;
    bind(loop);
    // Compare i against Rlen shifted left by one, i.e. 2*len.
    cmp(Ri, Rlen, Assembler::LSL, 1);
    br(Assembler::GE, end);

    pre2(Ri, Rlen);

    block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
      // Rj = (2*len - i - 1) / 2
      lsl(Rj, Rlen, 1);
      sub(Rj, Rj, Ri);
      sub(Rj, Rj, 1);
      lsr(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
    } block_comment(" } // j");

    last_squaring(Ri);

    block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
      // Rj = (2*len - i) / 2
      lsl(Rj, Rlen, 1);
      sub(Rj, Rj, Ri);
      lsr(Rj, Rj, 1);
      unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
    } block_comment(" } // j");

    post2(Ri, Rlen);
    add(Ri, Ri, 1);
    cmp(Ri, Rlen, Assembler::LSL, 1);

    br(Assembler::LT, loop);
    bind(end);
    block_comment("} // i");
  }

  normalize(Rlen);

  mov(Ra, Pm_base);  // Save Pm_base in Ra
  restore_regs();  // Restore caller's Pm_base

  // Copy our result into caller's Pm_base
  reverse(Pm_base, Ra, Rlen, t0, t1);

  leave();
  ret(lr);

  // handler for error case
  // Placed after the ret so that it is only reached through the length
  // check at the top of the function.
  bind(argh);
  stop("MontgomeryMultiply total_allocation must be <= 8192");

  return entry;
}
12495 // In C, approximately:
12496
12497 // void
12498 // montgomery_square(julong Pa_base[], julong Pn_base[],
12499 // julong Pm_base[], julong inv, int len) {
12500 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
12501 // julong *Pa, *Pb, *Pn, *Pm;
12502 // julong Ra, Rb, Rn, Rm;
12503
12504 // int i;
12505
12506 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
12507
12508 // for (i = 0; i < len; i++) {
12509 // int j;
12510
12511 // Pa = Pa_base;
12512 // Pb = Pa_base + i;
12513 // Pm = Pm_base;
12514 // Pn = Pn_base + i;
12515
12516 // Ra = *Pa;
12517 // Rb = *Pb;
12518 // Rm = *Pm;
12519 // Rn = *Pn;
12520
12521 // int iters = (i+1)/2;
12522 // for (j = 0; iters--; j++) {
12523 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12524 // MACC2(Ra, Rb, t0, t1, t2);
12525 // Ra = *++Pa;
12526 // Rb = *--Pb;
12527 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12528 // MACC(Rm, Rn, t0, t1, t2);
12529 // Rm = *++Pm;
12530 // Rn = *--Pn;
12531 // }
12532 // if ((i & 1) == 0) {
12533 // assert(Ra == Pa_base[j], "must be");
12534 // MACC(Ra, Ra, t0, t1, t2);
12535 // }
12536 // iters = i/2;
12537 // assert(iters == i-j, "must be");
12538 // for (; iters--; j++) {
12539 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12540 // MACC(Rm, Rn, t0, t1, t2);
12541 // Rm = *++Pm;
12542 // Rn = *--Pn;
12543 // }
12544
12545 // *Pm = Rm = t0 * inv;
12546 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
12547 // MACC(Rm, Rn, t0, t1, t2);
12548
12549 // assert(t0 == 0, "broken Montgomery multiply");
12550
12551 // t0 = t1; t1 = t2; t2 = 0;
12552 // }
12553
12554 // for (i = len; i < 2*len; i++) {
12555 // int start = i-len+1;
12556 // int end = start + (len - start)/2;
12557 // int j;
12558
12559 // Pa = Pa_base + i-len;
12560 // Pb = Pa_base + len;
12561 // Pm = Pm_base + i-len;
12562 // Pn = Pn_base + len;
12563
12564 // Ra = *++Pa;
12565 // Rb = *--Pb;
12566 // Rm = *++Pm;
12567 // Rn = *--Pn;
12568
12569 // int iters = (2*len-i-1)/2;
12570 // assert(iters == end-start, "must be");
12571 // for (j = start; iters--; j++) {
12572 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
12573 // MACC2(Ra, Rb, t0, t1, t2);
12574 // Ra = *++Pa;
12575 // Rb = *--Pb;
12576 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12577 // MACC(Rm, Rn, t0, t1, t2);
12578 // Rm = *++Pm;
12579 // Rn = *--Pn;
12580 // }
12581 // if ((i & 1) == 0) {
12582 // assert(Ra == Pa_base[j], "must be");
12583 // MACC(Ra, Ra, t0, t1, t2);
12584 // }
12585 // iters = (2*len-i)/2;
12586 // assert(iters == len-j, "must be");
12587 // for (; iters--; j++) {
12588 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
12589 // MACC(Rm, Rn, t0, t1, t2);
12590 // Rm = *++Pm;
12591 // Rn = *--Pn;
12592 // }
12593 // Pm_base[i-len] = t0;
12594 // t0 = t1; t1 = t2; t2 = 0;
12595 // }
12596
12597 // while (t0)
12598 // t0 = sub(Pm_base, Pn_base, t0, len);
12599 // }
12600 };
12601
// Call here from the interpreter or compiled code to either load
// multiple returned values from the inline type instance being
// returned to registers or to store returned values to a newly
// allocated inline type instance.
//
// Parameters:
//   destination - address of the runtime routine the stub calls; it is
//                 invoked with (rthread, r0) in (c_rarg0, c_rarg1)
//   name        - name given to both the CodeBuffer and the RuntimeStub
//   has_res     - when true, the VM result oop is fetched into r0 after the
//                 call and, if VM result metadata is also set, the pack
//                 handler reached through that metadata is called
//
// Returns the entry point of the newly created RuntimeStub.
//
// NOTE(review): the `__` shorthand is assumed to resolve to the local
// `masm` created below -- confirm against the macro definition in effect
// at this point in the file.
address generate_return_value_stub(address destination, const char* name, bool has_res) {
  // We need to save all registers the calling convention may use so
  // the runtime calls read or update those registers. This needs to
  // be in sync with SharedRuntime::java_return_convention().
  // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
  //
  // Offsets are in BytesPerInt-sized slots counted up from sp once all the
  // pushes below have been emitted; each register occupies two slots
  // (the *_off entry and the *_2 entry).  The order here must mirror the
  // push sequence: the last pair pushed ends up at offset 0.
  enum layout {
    j_rarg7_off = 0, j_rarg7_2,    // j_rarg7 is r0
    j_rarg6_off, j_rarg6_2,
    j_rarg5_off, j_rarg5_2,
    j_rarg4_off, j_rarg4_2,
    j_rarg3_off, j_rarg3_2,
    j_rarg2_off, j_rarg2_2,
    j_rarg1_off, j_rarg1_2,
    j_rarg0_off, j_rarg0_2,

    j_farg7_off, j_farg7_2,
    j_farg6_off, j_farg6_2,
    j_farg5_off, j_farg5_2,
    j_farg4_off, j_farg4_2,
    j_farg3_off, j_farg3_2,
    j_farg2_off, j_farg2_2,
    j_farg1_off, j_farg1_2,
    j_farg0_off, j_farg0_2,

    rfp_off, rfp_off2,
    return_off, return_off2,

    framesize // inclusive of return address
  };

  CodeBuffer code(name, 512, 64);
  MacroAssembler* masm = new MacroAssembler(&code);

  // The frame must already be a multiple of 16 bytes; align_up is only
  // there so the assert can detect a layout that is not.
  int frame_size_in_bytes = align_up(framesize*BytesPerInt, 16);
  assert(frame_size_in_bytes == framesize*BytesPerInt, "misaligned");
  int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
  int frame_size_in_words = frame_size_in_bytes / wordSize;

  OopMapSet* oop_maps = new OopMapSet();
  OopMap* map = new OopMap(frame_size_in_slots, 0);

  // Record the stack slot in which each saved argument register lives.
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg());

  map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), j_farg5->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg());
  map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg());

  address start = __ pc();

  __ enter(); // Save FP and LR before call

  // Push the FP argument registers, then the integer ones, one pair per
  // 2-word pre-decrement of sp.  The first register of each pair is stored
  // at the lower address, so the final push leaves j_rarg7 at sp + 0 as the
  // layout enum requires.
  __ stpd(j_farg1, j_farg0, Address(__ pre(sp, -2 * wordSize)));
  __ stpd(j_farg3, j_farg2, Address(__ pre(sp, -2 * wordSize)));
  __ stpd(j_farg5, j_farg4, Address(__ pre(sp, -2 * wordSize)));
  __ stpd(j_farg7, j_farg6, Address(__ pre(sp, -2 * wordSize)));

  __ stp(j_rarg1, j_rarg0, Address(__ pre(sp, -2 * wordSize)));
  __ stp(j_rarg3, j_rarg2, Address(__ pre(sp, -2 * wordSize)));
  __ stp(j_rarg5, j_rarg4, Address(__ pre(sp, -2 * wordSize)));
  __ stp(j_rarg7, j_rarg6, Address(__ pre(sp, -2 * wordSize)));

  int frame_complete = __ offset();

  // Set up last_Java_sp and last_Java_fp
  // the_pc is also the pc the oop map is registered under further down.
  address the_pc = __ pc();
  __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);

  // Call runtime
  // c_rarg1 must be loaded from r0 before c_rarg0 is overwritten with the
  // thread (the layout enum above notes that j_rarg7 is r0).
  __ mov(c_rarg1, r0);
  __ mov(c_rarg0, rthread);

  __ mov(rscratch1, destination);
  __ blr(rscratch1);

  oop_maps->add_gc_map(the_pc - start, map);

  __ reset_last_Java_frame(false);

  // Pop in exactly the reverse order of the pushes above so that values the
  // runtime call wrote into the save area end up in the registers.
  __ ldp(j_rarg7, j_rarg6, Address(__ post(sp, 2 * wordSize)));
  __ ldp(j_rarg5, j_rarg4, Address(__ post(sp, 2 * wordSize)));
  __ ldp(j_rarg3, j_rarg2, Address(__ post(sp, 2 * wordSize)));
  __ ldp(j_rarg1, j_rarg0, Address(__ post(sp, 2 * wordSize)));

  __ ldpd(j_farg7, j_farg6, Address(__ post(sp, 2 * wordSize)));
  __ ldpd(j_farg5, j_farg4, Address(__ post(sp, 2 * wordSize)));
  __ ldpd(j_farg3, j_farg2, Address(__ post(sp, 2 * wordSize)));
  __ ldpd(j_farg1, j_farg0, Address(__ post(sp, 2 * wordSize)));

  // check for pending exceptions
  Label pending;
  __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
  __ cbnz(rscratch1, pending);

  if (has_res) {
    // We just called SharedRuntime::store_inline_type_fields_to_buf. Check if we still
    // need to initialize the buffer and if so, call the inline class specific pack handler.
    Label skip_pack;
    __ get_vm_result_oop(r0, rthread);
    __ get_vm_result_metadata(rscratch1, rthread);
    // Zero metadata means there is nothing left to pack.
    __ cbz(rscratch1, skip_pack);
    // Two dependent loads: metadata -> members -> pack handler address.
    __ ldr(rscratch1, Address(rscratch1, InlineKlass::adr_members_offset()));
    __ ldr(rscratch1, Address(rscratch1, InlineKlass::pack_handler_offset()));
    __ blr(rscratch1);
    // NOTE(review): presumably orders the pack handler's stores to the
    // buffer ahead of any later store that publishes it -- confirm.
    __ membar(Assembler::StoreStore);
    __ bind(skip_pack);
  }

  __ leave();
  __ ret(lr);

  // Exception path: drop our frame, then jump (not call) to the shared
  // forward-exception stub.
  __ bind(pending);
  __ leave();
  __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

  // -------------
  // make sure all code is generated
  masm->flush();

  RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false);
  return stub->entry_point();
}
12739
12740 // Initialization
// Handler for the preuniverse blob (BlobId::stubgen_preuniverse_id in the
// StubGenerator constructor).  Intentionally empty on this platform.
void generate_preuniverse_stubs() {
  // preuniverse stubs are not needed for aarch64
}
12744
// Handler for the initial blob (BlobId::stubgen_initial_id in the
// StubGenerator constructor).  Generates each stub in turn and stores its
// entry point in the corresponding StubRoutines field.  Three stubs are
// generated unconditionally; the remainder are guarded by a flag or an
// intrinsic-availability check.
void generate_initial_stubs() {
  // Generate initial stubs and initializes the entry points

  // entry points that exist in all platforms Note: This is code
  // that could be shared among different platforms - however the
  // benefit seems to be smaller than the disadvantage of having a
  // much more complicated generator structure. See also comment in
  // stubRoutines.hpp.

  StubRoutines::_forward_exception_entry = generate_forward_exception();

  StubRoutines::_call_stub_entry =
    generate_call_stub(StubRoutines::_call_stub_return_address);

  // is referenced by megamorphic call
  StubRoutines::_catch_exception_entry = generate_catch_exception();

  // Initialize table for copy memory (arraycopy) check.
  // The table is only created if one does not already exist.
  if (UnsafeMemoryAccess::_table == nullptr) {
    UnsafeMemoryAccess::create_table(8 + 4); // 8 for copyMemory; 4 for setMemory
  }

  if (UseCRC32Intrinsics) {
    StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
  }

  if (UseCRC32CIntrinsics) {
    StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
  }

  // sin and cos share one generator, selected by the isCos argument.
  if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
    StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
  }

  if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
    StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
  }

  // The two float16 conversion stubs are only generated as a pair.
  if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_float16ToFloat) &&
      vmIntrinsics::is_intrinsic_available(vmIntrinsics::_floatToFloat16)) {
    StubRoutines::_hf2f = generate_float16ToFloat();
    StubRoutines::_f2hf = generate_floatToFloat16();
  }

  // Both inline-type return stubs come from generate_return_value_stub();
  // only the store variant passes has_res == true.
  if (InlineTypeReturnedAsFields) {
    StubRoutines::_load_inline_type_fields_in_regs =
      generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_inline_type_fields_in_regs), "load_inline_type_fields_in_regs", false);
    StubRoutines::_store_inline_type_fields_to_buf =
      generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_inline_type_fields_to_buf), "store_inline_type_fields_to_buf", true);
  }

}
12797
// Handler for the continuation blob (BlobId::stubgen_continuation_id in the
// StubGenerator constructor).  Generates the four continuation stubs
// unconditionally and stores their entry points in StubRoutines.
void generate_continuation_stubs() {
  // Continuation stubs:
  StubRoutines::_cont_thaw          = generate_cont_thaw();
  StubRoutines::_cont_returnBarrier = generate_cont_returnBarrier();
  StubRoutines::_cont_returnBarrierExc = generate_cont_returnBarrier_exception();
  StubRoutines::_cont_preempt_stub = generate_cont_preempt_stub();
}
12805
// Handler for the final blob (BlobId::stubgen_final_id in the StubGenerator
// constructor).  Generates the remaining shared stubs and finishes by
// calling StubRoutines::aarch64::set_completed().
void generate_final_stubs() {
  // support for verify_oop (must happen after universe_init)
  if (VerifyOops) {
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
  }

  // arraycopy stubs used by compilers
  generate_arraycopy_stubs();

  StubRoutines::_method_entry_barrier = generate_method_entry_barrier();

  StubRoutines::aarch64::_spin_wait = generate_spin_wait();

  StubRoutines::_upcall_stub_exception_handler = generate_upcall_stub_exception_handler();
  StubRoutines::_upcall_stub_load_target = generate_upcall_stub_load_target();

  // Only on Linux builds compiled without __ARM_FEATURE_ATOMICS.
#if defined (LINUX) && !defined (__ARM_FEATURE_ATOMICS)

  generate_atomic_entry_points();

#endif // LINUX && !__ARM_FEATURE_ATOMICS

#ifdef COMPILER2
  if (UseSecondarySupersTable) {
    StubRoutines::_lookup_secondary_supers_table_slow_path_stub = generate_lookup_secondary_supers_table_slow_path_stub();
    // The table stub is only generated when the test is not inlined; its
    // return value is not stored here.
    if (! InlineSecondarySupersTest) {
      generate_lookup_secondary_supers_table_stub();
    }
  }
#endif

  if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_setMemory)) {
    StubRoutines::_unsafe_setmemory = generate_unsafe_setmemory();
  }

  StubRoutines::aarch64::set_completed(); // Indicate that arraycopy and zero_blocks stubs are generated
}
12843
// Handler for the compiler blob (BlobId::stubgen_compiler_id in the
// StubGenerator constructor).  The whole body is compiled only when
// COMPILER2 is defined; otherwise the function is empty.  Most stubs are
// guarded by the corresponding Use*Intrinsic(s) flag and their entry points
// are stored in StubRoutines or StubRoutines::aarch64.
void generate_compiler_stubs() {
#ifdef COMPILER2

  if (UseSVE == 0) {
    generate_iota_indices(StubId::stubgen_vector_iota_indices_id);
  }

  // array equals stub for large arrays.
  if (!UseSimpleArrayEquals) {
    StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
  }

  // arrays_hashcode stub for large arrays.
  // One stub per element type, all produced by the same generator.
  StubRoutines::aarch64::_large_arrays_hashcode_boolean = generate_large_arrays_hashcode(T_BOOLEAN);
  StubRoutines::aarch64::_large_arrays_hashcode_byte = generate_large_arrays_hashcode(T_BYTE);
  StubRoutines::aarch64::_large_arrays_hashcode_char = generate_large_arrays_hashcode(T_CHAR);
  StubRoutines::aarch64::_large_arrays_hashcode_int = generate_large_arrays_hashcode(T_INT);
  StubRoutines::aarch64::_large_arrays_hashcode_short = generate_large_arrays_hashcode(T_SHORT);

  // byte_array_inflate stub for large arrays.
  StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

  // countPositives stub for large arrays.
  // _count_positives_long is passed to the generator as an argument in
  // addition to the returned entry point being stored.
  StubRoutines::aarch64::_count_positives = generate_count_positives(StubRoutines::aarch64::_count_positives_long);

  generate_compare_long_strings();

  generate_string_indexof_stubs();

  if (UseMultiplyToLenIntrinsic) {
    StubRoutines::_multiplyToLen = generate_multiplyToLen();
  }

  if (UseSquareToLenIntrinsic) {
    StubRoutines::_squareToLen = generate_squareToLen();
  }

  if (UseMulAddIntrinsic) {
    StubRoutines::_mulAdd = generate_mulAdd();
  }

  if (UseSIMDForBigIntegerShiftIntrinsics) {
    StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift();
    StubRoutines::_bigIntegerLeftShiftWorker  = generate_bigIntegerLeftShift();
  }

  // The two Montgomery stubs first try load_archive_data(); code is only
  // generated, and then recorded with store_archive_data(), when that
  // returns nullptr.
  if (UseMontgomeryMultiplyIntrinsic) {
    StubId stub_id = StubId::stubgen_montgomeryMultiply_id;
    address start = load_archive_data(stub_id);
    if (start == nullptr) {
      // we have to generate it
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      start = g.generate_multiply();
      // record the stub start and end
      store_archive_data(stub_id, start, _masm->pc());
    }
    StubRoutines::_montgomeryMultiply = start;
  }

  if (UseMontgomerySquareIntrinsic) {
    StubId stub_id = StubId::stubgen_montgomerySquare_id;
    address start = load_archive_data(stub_id);
    if (start == nullptr) {
      // we have to generate it
      StubCodeMark mark(this, stub_id);
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      start = g.generate_multiply();
      // record the stub start and end
      store_archive_data(stub_id, start, _masm->pc());
    }
    StubRoutines::_montgomerySquare = start;
  }

  if (UseChaCha20Intrinsics) {
    StubRoutines::_chacha20Block = generate_chacha20Block_blockpar();
  }

  if (UseKyberIntrinsics) {
    StubRoutines::_kyberNtt = generate_kyberNtt();
    StubRoutines::_kyberInverseNtt = generate_kyberInverseNtt();
    StubRoutines::_kyberNttMult = generate_kyberNttMult();
    StubRoutines::_kyberAddPoly_2 = generate_kyberAddPoly_2();
    StubRoutines::_kyberAddPoly_3 = generate_kyberAddPoly_3();
    StubRoutines::_kyber12To16 = generate_kyber12To16();
    StubRoutines::_kyberBarrettReduce = generate_kyberBarrettReduce();
  }

  if (UseDilithiumIntrinsics) {
    StubRoutines::_dilithiumAlmostNtt = generate_dilithiumAlmostNtt();
    StubRoutines::_dilithiumAlmostInverseNtt = generate_dilithiumAlmostInverseNtt();
    StubRoutines::_dilithiumNttMult = generate_dilithiumNttMult();
    StubRoutines::_dilithiumMontMulByConstant = generate_dilithiumMontMulByConstant();
    StubRoutines::_dilithiumDecomposePoly = generate_dilithiumDecomposePoly();
  }

  if (UseBASE64Intrinsics) {
    StubRoutines::_base64_encodeBlock = generate_base64_encodeBlock();
    StubRoutines::_base64_decodeBlock = generate_base64_decodeBlock();
  }

  // data cache line writeback
  StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
  StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

  if (UseAESIntrinsics) {
    StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
    StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
    StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
    StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    StubRoutines::_counterMode_AESCrypt = generate_counterMode_AESCrypt();
  }
  if (UseGHASHIntrinsics) {
    // StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    // The small-block stub must be generated first: its entry point is an
    // argument to the main generator on the next line.
    StubRoutines::aarch64::_ghash_processBlocks_small = generate_ghash_processBlocks_small();
    StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(StubRoutines::aarch64::_ghash_processBlocks_small);
  }
  if (UseAESIntrinsics && UseGHASHIntrinsics) {
    StubRoutines::_galoisCounterMode_AESCrypt = generate_galoisCounterMode_AESCrypt();
  }

  // For each digest below, one generator produces both the single-block
  // and the multi-block (MB) stub, selected by the StubId argument.
  if (UseMD5Intrinsics) {
    StubRoutines::_md5_implCompress      = generate_md5_implCompress(StubId::stubgen_md5_implCompress_id);
    StubRoutines::_md5_implCompressMB    = generate_md5_implCompress(StubId::stubgen_md5_implCompressMB_id);
  }
  if (UseSHA1Intrinsics) {
    StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(StubId::stubgen_sha1_implCompress_id);
    StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(StubId::stubgen_sha1_implCompressMB_id);
  }
  if (UseSHA256Intrinsics) {
    StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(StubId::stubgen_sha256_implCompress_id);
    StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(StubId::stubgen_sha256_implCompressMB_id);
  }
  if (UseSHA512Intrinsics) {
    StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(StubId::stubgen_sha512_implCompress_id);
    StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(StubId::stubgen_sha512_implCompressMB_id);
  }
  // SHA3: the first branch uses generate_sha3_implCompress and also
  // generates _double_keccak; the second uses the *_gpr generator and
  // leaves _double_keccak unset.
  if (UseSHA3Intrinsics && UseSIMDForSHA3Intrinsic) {
    StubRoutines::_double_keccak         = generate_double_keccak();
    StubRoutines::_sha3_implCompress     = generate_sha3_implCompress(StubId::stubgen_sha3_implCompress_id);
    StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress(StubId::stubgen_sha3_implCompressMB_id);
  } else if (UseSHA3Intrinsics) {
    StubRoutines::_sha3_implCompress     = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompress_id);
    StubRoutines::_sha3_implCompressMB   = generate_sha3_implCompress_gpr(StubId::stubgen_sha3_implCompressMB_id);
  }

  if (UsePoly1305Intrinsics) {
    StubRoutines::_poly1305_processBlocks = generate_poly1305_processBlocks();
  }

  // generate Adler32 intrinsics code
  if (UseAdler32Intrinsics) {
    StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
  }

#endif // COMPILER2
}
13003
13004 public:
13005 StubGenerator(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) : StubCodeGenerator(code, blob_id, stub_data) {
13006 switch(blob_id) {
13007 case BlobId::stubgen_preuniverse_id:
13008 generate_preuniverse_stubs();
13009 break;
13010 case BlobId::stubgen_initial_id:
13011 generate_initial_stubs();
13012 break;
13013 case BlobId::stubgen_continuation_id:
13014 generate_continuation_stubs();
13015 break;
13016 case BlobId::stubgen_compiler_id:
13017 generate_compiler_stubs();
13018 break;
13019 case BlobId::stubgen_final_id:
13020 generate_final_stubs();
13021 break;
13022 default:
13023 fatal("unexpected blob id: %s", StubInfo::name(blob_id));
13024 break;
13025 };
13026 }
13027
13028 #if INCLUDE_CDS
13029 static void init_AOTAddressTable(GrowableArray<address>& external_addresses) {
13030 // external data defined in this file
13031 #define ADD(addr) external_addresses.append((address)(addr));
13032 ADD(_sha256_round_consts);
13033 ADD(_sha512_round_consts);
13034 ADD(_sha3_round_consts);
13035 ADD(_double_keccak_round_consts);
13036 ADD(_encodeBlock_toBase64);
13037 ADD(_encodeBlock_toBase64URL);
13038 ADD(_decodeBlock_fromBase64ForNoSIMD);
13039 ADD(_decodeBlock_fromBase64URLForNoSIMD);
13040 ADD(_decodeBlock_fromBase64ForSIMD);
13041 ADD(_decodeBlock_fromBase64URLForSIMD);
13042 #undef ADD
13043 }
13044 #endif // INCLUDE_CDS
13045 }; // end class declaration
13046
13047 void StubGenerator_generate(CodeBuffer* code, BlobId blob_id, AOTStubData* stub_data) {
13048 StubGenerator g(code, blob_id, stub_data);
13049 }
13050
13051 #if INCLUDE_CDS
// Free-function wrapper (only compiled when INCLUDE_CDS is set) that
// forwards to StubGenerator::init_AOTAddressTable, which appends this
// file's external data addresses to `addresses`.
void StubGenerator_init_AOTAddressTable(GrowableArray<address>& addresses) {
  StubGenerator::init_AOTAddressTable(addresses);
}
13055 #endif // INCLUDE_CDS
13056
#if defined (LINUX)

// Define pointers to atomic stubs and initialize them to point to the
// code in atomic_aarch64.S.
//
// For each (OPNAME, SIZE, RELAXED) triple the macro below
//   - declares the extern "C" function
//       aarch64_atomic_<OPNAME>_<SIZE><RELAXED>_default_impl
//   - defines the aarch64_atomic_stub_t pointer
//       aarch64_atomic_<OPNAME>_<SIZE><RELAXED>_impl
//     initialized to that function.
// RELAXED is pasted verbatim as a name suffix and may be empty.
//
// NOTE(review): generate_atomic_entry_points() is presumably what later
// redirects these pointers to generated code -- confirm there.

#define DEFAULT_ATOMIC_OP(OPNAME, SIZE, RELAXED)                                \
  extern "C" uint64_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl \
    (volatile void *ptr, uint64_t arg1, uint64_t arg2);                 \
  aarch64_atomic_stub_t aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _impl \
    = aarch64_atomic_ ## OPNAME ## _ ## SIZE ## RELAXED ## _default_impl;

// fetch_add and xchg: 4- and 8-byte operands.
DEFAULT_ATOMIC_OP(fetch_add, 4, )
DEFAULT_ATOMIC_OP(fetch_add, 8, )
DEFAULT_ATOMIC_OP(fetch_add, 4, _relaxed)
DEFAULT_ATOMIC_OP(fetch_add, 8, _relaxed)
DEFAULT_ATOMIC_OP(xchg, 4, )
DEFAULT_ATOMIC_OP(xchg, 8, )
// cmpxchg: 1-byte operands exist only for the unsuffixed and _relaxed
// variants; _release and _seq_cst cover 4 and 8 bytes.
DEFAULT_ATOMIC_OP(cmpxchg, 1, )
DEFAULT_ATOMIC_OP(cmpxchg, 4, )
DEFAULT_ATOMIC_OP(cmpxchg, 8, )
DEFAULT_ATOMIC_OP(cmpxchg, 1, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _relaxed)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _release)
DEFAULT_ATOMIC_OP(cmpxchg, 4, _seq_cst)
DEFAULT_ATOMIC_OP(cmpxchg, 8, _seq_cst)

#undef DEFAULT_ATOMIC_OP

#endif // LINUX